From 81f88f6ab674973d361b6d176aa4d3ebd32253ab Mon Sep 17 00:00:00 2001
From: Alan Maguire <alan.maguire@oracle.com>
Date: Wed, 3 Dec 2025 19:15:07 +0000
Subject: libbpf: Add debug messaging in dedup equivalence/identity matching

We have seen a number of issues like [1]; failures to deduplicate
key kernel data structures like task_struct.  These are often hard
to debug from pahole even with verbose output, especially when
identity/equivalence checks fail deep in a nested struct comparison.

Here we add debug messages of the form

libbpf: STRUCT 'task_struct' size=2560 vlen=194 cand_id[54222] canon_id[102820] shallow-equal but not equiv for field#23 'sched_class': 0

These will be emitted during dedup from pahole when --verbose/-V
is specified.  This greatly helps identify exactly where dedup
failures are experienced.

[1] https://lore.kernel.org/bpf/b8e8b560-bce5-414b-846d-0da6d22a9983@oracle.com/

Changes since v1:

- updated debug messages to refer to shallow-equal, added ids (Andrii)

Signed-off-by: Alan Maguire <alan.maguire@oracle.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20251203191507.55565-1-alan.maguire@oracle.com
---
 tools/lib/bpf/btf.c | 29 ++++++++++++++++++++++++-----
 1 file changed, 24 insertions(+), 5 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c
index 84a4b0abc8be..b136572e889a 100644
--- a/tools/lib/bpf/btf.c
+++ b/tools/lib/bpf/btf.c
@@ -4431,11 +4431,14 @@ static bool btf_dedup_identical_types(struct btf_dedup *d, __u32 id1, __u32 id2,
 	struct btf_type *t1, *t2;
 	int k1, k2;
 recur:
-	if (depth <= 0)
-		return false;
-
 	t1 = btf_type_by_id(d->btf, id1);
 	t2 = btf_type_by_id(d->btf, id2);
+	if (depth <= 0) {
+		pr_debug("Reached depth limit for identical type comparison for '%s'/'%s'\n",
+			 btf__name_by_offset(d->btf, t1->name_off),
+			 btf__name_by_offset(d->btf, t2->name_off));
+		return false;
+	}
 
 	k1 = btf_kind(t1);
 	k2 = btf_kind(t2);
@@ -4497,8 +4500,16 @@ recur:
 		for (i = 0, n = btf_vlen(t1); i < n; i++, m1++, m2++) {
 			if (m1->type == m2->type)
 				continue;
-			if (!btf_dedup_identical_types(d, m1->type, m2->type, depth - 1))
+			if (!btf_dedup_identical_types(d, m1->type, m2->type, depth - 1)) {
+				if (t1->name_off) {
+					pr_debug("%s '%s' size=%d vlen=%d id1[%u] id2[%u] shallow-equal but not identical for field#%d '%s'\n",
+						 k1 == BTF_KIND_STRUCT ? "STRUCT" : "UNION",
+						 btf__name_by_offset(d->btf, t1->name_off),
+						 t1->size, btf_vlen(t1), id1, id2, i,
+						 btf__name_by_offset(d->btf, m1->name_off));
+				}
 				return false;
+			}
 		}
 		return true;
 	}
@@ -4739,8 +4750,16 @@ static int btf_dedup_is_equiv(struct btf_dedup *d, __u32 cand_id,
 		canon_m = btf_members(canon_type);
 		for (i = 0; i < vlen; i++) {
 			eq = btf_dedup_is_equiv(d, cand_m->type, canon_m->type);
-			if (eq <= 0)
+			if (eq <= 0) {
+				if (cand_type->name_off) {
+					pr_debug("%s '%s' size=%d vlen=%d cand_id[%u] canon_id[%u] shallow-equal but not equiv for field#%d '%s': %d\n",
+						 cand_kind == BTF_KIND_STRUCT ? "STRUCT" : "UNION",
+						 btf__name_by_offset(d->btf, cand_type->name_off),
+						 cand_type->size, vlen, cand_id, canon_id, i,
+						 btf__name_by_offset(d->btf, cand_m->name_off), eq);
+				}
 				return eq;
+			}
 			cand_m++;
 			canon_m++;
 		}
-- 
cgit v1.2.3


From b5709f6d26d65f6bb9711f4b5f98469fd507cb5b Mon Sep 17 00:00:00 2001
From: Amery Hung <ameryhung@gmail.com>
Date: Wed, 3 Dec 2025 15:37:44 -0800
Subject: bpf: Support associating BPF program with struct_ops

Add a new BPF command BPF_PROG_ASSOC_STRUCT_OPS to allow associating
a BPF program with a struct_ops map. This command takes a file
descriptor of a struct_ops map and a BPF program and set
prog->aux->st_ops_assoc to the kdata of the struct_ops map.

The command does not accept a struct_ops program nor a non-struct_ops
map. Programs of a struct_ops map is automatically associated with the
map during map update. If a program is shared between two struct_ops
maps, prog->aux->st_ops_assoc will be poisoned to indicate that the
associated struct_ops is ambiguous. The pointer, once poisoned, cannot
be reset since we have lost track of associated struct_ops. For other
program types, the associated struct_ops map, once set, cannot be
changed later. This restriction may be lifted in the future if there is
a use case.

A kernel helper bpf_prog_get_assoc_struct_ops() can be used to retrieve
the associated struct_ops pointer. The returned pointer, if not NULL, is
guaranteed to be valid and point to a fully updated struct_ops struct.
For struct_ops program reused in multiple struct_ops map, the return
will be NULL.

prog->aux->st_ops_assoc is protected by bumping the refcount for
non-struct_ops programs and RCU for struct_ops programs. Since it would
be inefficient to track programs associated with a struct_ops map, every
non-struct_ops program will bump the refcount of the map to make sure
st_ops_assoc stays valid. For a struct_ops program, it is protected by
RCU as map_free will wait for an RCU grace period before disassociating
the program with the map. The helper must be called in BPF program
context or RCU read-side critical section.

struct_ops implementers should note that the struct_ops returned may not
be initialized nor attached yet. The struct_ops implementer will be
responsible for tracking and checking the state of the associated
struct_ops map if the use case expects an initialized or attached
struct_ops.

Signed-off-by: Amery Hung <ameryhung@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://lore.kernel.org/bpf/20251203233748.668365-3-ameryhung@gmail.com
---
 include/linux/bpf.h            | 16 ++++++++
 include/uapi/linux/bpf.h       | 17 ++++++++
 kernel/bpf/bpf_struct_ops.c    | 88 ++++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/core.c              |  3 ++
 kernel/bpf/syscall.c           | 46 ++++++++++++++++++++++
 tools/include/uapi/linux/bpf.h | 17 ++++++++
 6 files changed, 187 insertions(+)

(limited to 'tools')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 6498be4c44f8..28d8d6b7bb1e 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1739,6 +1739,8 @@ struct bpf_prog_aux {
 		struct rcu_head	rcu;
 	};
 	struct bpf_stream stream[2];
+	struct mutex st_ops_assoc_mutex;
+	struct bpf_map __rcu *st_ops_assoc;
 };
 
 struct bpf_prog {
@@ -2041,6 +2043,9 @@ static inline void bpf_module_put(const void *data, struct module *owner)
 		module_put(owner);
 }
 int bpf_struct_ops_link_create(union bpf_attr *attr);
+int bpf_prog_assoc_struct_ops(struct bpf_prog *prog, struct bpf_map *map);
+void bpf_prog_disassoc_struct_ops(struct bpf_prog *prog);
+void *bpf_prog_get_assoc_struct_ops(const struct bpf_prog_aux *aux);
 u32 bpf_struct_ops_id(const void *kdata);
 
 #ifdef CONFIG_NET
@@ -2088,6 +2093,17 @@ static inline int bpf_struct_ops_link_create(union bpf_attr *attr)
 {
 	return -EOPNOTSUPP;
 }
+static inline int bpf_prog_assoc_struct_ops(struct bpf_prog *prog, struct bpf_map *map)
+{
+	return -EOPNOTSUPP;
+}
+static inline void bpf_prog_disassoc_struct_ops(struct bpf_prog *prog)
+{
+}
+static inline void *bpf_prog_get_assoc_struct_ops(const struct bpf_prog_aux *aux)
+{
+	return NULL;
+}
 static inline void bpf_map_struct_ops_info_fill(struct bpf_map_info *info, struct bpf_map *map)
 {
 }
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index f8d8513eda27..84ced3ed2d21 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -918,6 +918,16 @@ union bpf_iter_link_info {
  *		Number of bytes read from the stream on success, or -1 if an
  *		error occurred (in which case, *errno* is set appropriately).
  *
+ * BPF_PROG_ASSOC_STRUCT_OPS
+ * 	Description
+ * 		Associate a BPF program with a struct_ops map. The struct_ops
+ * 		map is identified by *map_fd* and the BPF program is
+ * 		identified by *prog_fd*.
+ *
+ * 	Return
+ * 		0 on success or -1 if an error occurred (in which case,
+ * 		*errno* is set appropriately).
+ *
  * NOTES
  *	eBPF objects (maps and programs) can be shared between processes.
  *
@@ -974,6 +984,7 @@ enum bpf_cmd {
 	BPF_PROG_BIND_MAP,
 	BPF_TOKEN_CREATE,
 	BPF_PROG_STREAM_READ_BY_FD,
+	BPF_PROG_ASSOC_STRUCT_OPS,
 	__MAX_BPF_CMD,
 };
 
@@ -1894,6 +1905,12 @@ union bpf_attr {
 		__u32		prog_fd;
 	} prog_stream_read;
 
+	struct {
+		__u32		map_fd;
+		__u32		prog_fd;
+		__u32		flags;
+	} prog_assoc_struct_ops;
+
 } __attribute__((aligned(8)));
 
 /* The description below is an attempt at providing documentation to eBPF
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index 278490683d28..c43346cb3d76 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -533,6 +533,17 @@ static void bpf_struct_ops_map_put_progs(struct bpf_struct_ops_map *st_map)
 	}
 }
 
+static void bpf_struct_ops_map_dissoc_progs(struct bpf_struct_ops_map *st_map)
+{
+	u32 i;
+
+	for (i = 0; i < st_map->funcs_cnt; i++) {
+		if (!st_map->links[i])
+			break;
+		bpf_prog_disassoc_struct_ops(st_map->links[i]->prog);
+	}
+}
+
 static void bpf_struct_ops_map_free_image(struct bpf_struct_ops_map *st_map)
 {
 	int i;
@@ -801,6 +812,9 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
 			goto reset_unlock;
 		}
 
+		/* Poison pointer on error instead of return for backward compatibility */
+		bpf_prog_assoc_struct_ops(prog, &st_map->map);
+
 		link = kzalloc(sizeof(*link), GFP_USER);
 		if (!link) {
 			bpf_prog_put(prog);
@@ -980,6 +994,8 @@ static void bpf_struct_ops_map_free(struct bpf_map *map)
 	if (btf_is_module(st_map->btf))
 		module_put(st_map->st_ops_desc->st_ops->owner);
 
+	bpf_struct_ops_map_dissoc_progs(st_map);
+
 	bpf_struct_ops_map_del_ksyms(st_map);
 
 	/* The struct_ops's function may switch to another struct_ops.
@@ -1396,6 +1412,78 @@ err_out:
 	return err;
 }
 
+int bpf_prog_assoc_struct_ops(struct bpf_prog *prog, struct bpf_map *map)
+{
+	struct bpf_map *st_ops_assoc;
+
+	guard(mutex)(&prog->aux->st_ops_assoc_mutex);
+
+	st_ops_assoc = rcu_dereference_protected(prog->aux->st_ops_assoc,
+						 lockdep_is_held(&prog->aux->st_ops_assoc_mutex));
+	if (st_ops_assoc && st_ops_assoc == map)
+		return 0;
+
+	if (st_ops_assoc) {
+		if (prog->type != BPF_PROG_TYPE_STRUCT_OPS)
+			return -EBUSY;
+
+		rcu_assign_pointer(prog->aux->st_ops_assoc, BPF_PTR_POISON);
+	} else {
+		/*
+		 * struct_ops map does not track associated non-struct_ops programs.
+		 * Bump the refcount to make sure st_ops_assoc is always valid.
+		 */
+		if (prog->type != BPF_PROG_TYPE_STRUCT_OPS)
+			bpf_map_inc(map);
+
+		rcu_assign_pointer(prog->aux->st_ops_assoc, map);
+	}
+
+	return 0;
+}
+
+void bpf_prog_disassoc_struct_ops(struct bpf_prog *prog)
+{
+	struct bpf_map *st_ops_assoc;
+
+	guard(mutex)(&prog->aux->st_ops_assoc_mutex);
+
+	st_ops_assoc = rcu_dereference_protected(prog->aux->st_ops_assoc,
+						 lockdep_is_held(&prog->aux->st_ops_assoc_mutex));
+	if (!st_ops_assoc || st_ops_assoc == BPF_PTR_POISON)
+		return;
+
+	if (prog->type != BPF_PROG_TYPE_STRUCT_OPS)
+		bpf_map_put(st_ops_assoc);
+
+	RCU_INIT_POINTER(prog->aux->st_ops_assoc, NULL);
+}
+
+/*
+ * Get a reference to the struct_ops struct (i.e., kdata) associated with a
+ * program. Should only be called in BPF program context (e.g., in a kfunc).
+ *
+ * If the returned pointer is not NULL, it must points to a valid struct_ops.
+ * The struct_ops map is not guaranteed to be initialized nor attached.
+ * Kernel struct_ops implementers are responsible for tracking and checking
+ * the state of the struct_ops if the use case requires an initialized or
+ * attached struct_ops.
+ */
+void *bpf_prog_get_assoc_struct_ops(const struct bpf_prog_aux *aux)
+{
+	struct bpf_struct_ops_map *st_map;
+	struct bpf_map *st_ops_assoc;
+
+	st_ops_assoc = rcu_dereference_check(aux->st_ops_assoc, bpf_rcu_lock_held());
+	if (!st_ops_assoc || st_ops_assoc == BPF_PTR_POISON)
+		return NULL;
+
+	st_map = (struct bpf_struct_ops_map *)st_ops_assoc;
+
+	return &st_map->kvalue.data;
+}
+EXPORT_SYMBOL_GPL(bpf_prog_get_assoc_struct_ops);
+
 void bpf_map_struct_ops_info_fill(struct bpf_map_info *info, struct bpf_map *map)
 {
 	struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index c8ae6ab31651..67226145a4db 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -136,6 +136,7 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag
 	mutex_init(&fp->aux->used_maps_mutex);
 	mutex_init(&fp->aux->ext_mutex);
 	mutex_init(&fp->aux->dst_mutex);
+	mutex_init(&fp->aux->st_ops_assoc_mutex);
 
 #ifdef CONFIG_BPF_SYSCALL
 	bpf_prog_stream_init(fp);
@@ -286,6 +287,7 @@ void __bpf_prog_free(struct bpf_prog *fp)
 	if (fp->aux) {
 		mutex_destroy(&fp->aux->used_maps_mutex);
 		mutex_destroy(&fp->aux->dst_mutex);
+		mutex_destroy(&fp->aux->st_ops_assoc_mutex);
 		kfree(fp->aux->poke_tab);
 		kfree(fp->aux);
 	}
@@ -2896,6 +2898,7 @@ static void bpf_prog_free_deferred(struct work_struct *work)
 #endif
 	bpf_free_used_maps(aux);
 	bpf_free_used_btfs(aux);
+	bpf_prog_disassoc_struct_ops(aux->prog);
 	if (bpf_prog_is_dev_bound(aux))
 		bpf_prog_dev_bound_destroy(aux->prog);
 #ifdef CONFIG_PERF_EVENTS
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 6589acc89ef8..3080cc48bfc3 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -6122,6 +6122,49 @@ static int prog_stream_read(union bpf_attr *attr)
 	return ret;
 }
 
+#define BPF_PROG_ASSOC_STRUCT_OPS_LAST_FIELD prog_assoc_struct_ops.prog_fd
+
+static int prog_assoc_struct_ops(union bpf_attr *attr)
+{
+	struct bpf_prog *prog;
+	struct bpf_map *map;
+	int ret;
+
+	if (CHECK_ATTR(BPF_PROG_ASSOC_STRUCT_OPS))
+		return -EINVAL;
+
+	if (attr->prog_assoc_struct_ops.flags)
+		return -EINVAL;
+
+	prog = bpf_prog_get(attr->prog_assoc_struct_ops.prog_fd);
+	if (IS_ERR(prog))
+		return PTR_ERR(prog);
+
+	if (prog->type == BPF_PROG_TYPE_STRUCT_OPS) {
+		ret = -EINVAL;
+		goto put_prog;
+	}
+
+	map = bpf_map_get(attr->prog_assoc_struct_ops.map_fd);
+	if (IS_ERR(map)) {
+		ret = PTR_ERR(map);
+		goto put_prog;
+	}
+
+	if (map->map_type != BPF_MAP_TYPE_STRUCT_OPS) {
+		ret = -EINVAL;
+		goto put_map;
+	}
+
+	ret = bpf_prog_assoc_struct_ops(prog, map);
+
+put_map:
+	bpf_map_put(map);
+put_prog:
+	bpf_prog_put(prog);
+	return ret;
+}
+
 static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size)
 {
 	union bpf_attr attr;
@@ -6261,6 +6304,9 @@ static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size)
 	case BPF_PROG_STREAM_READ_BY_FD:
 		err = prog_stream_read(&attr);
 		break;
+	case BPF_PROG_ASSOC_STRUCT_OPS:
+		err = prog_assoc_struct_ops(&attr);
+		break;
 	default:
 		err = -EINVAL;
 		break;
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index be7d8e060e10..6b92b0847ec2 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -918,6 +918,16 @@ union bpf_iter_link_info {
  *		Number of bytes read from the stream on success, or -1 if an
  *		error occurred (in which case, *errno* is set appropriately).
  *
+ * BPF_PROG_ASSOC_STRUCT_OPS
+ * 	Description
+ * 		Associate a BPF program with a struct_ops map. The struct_ops
+ * 		map is identified by *map_fd* and the BPF program is
+ * 		identified by *prog_fd*.
+ *
+ * 	Return
+ * 		0 on success or -1 if an error occurred (in which case,
+ * 		*errno* is set appropriately).
+ *
  * NOTES
  *	eBPF objects (maps and programs) can be shared between processes.
  *
@@ -974,6 +984,7 @@ enum bpf_cmd {
 	BPF_PROG_BIND_MAP,
 	BPF_TOKEN_CREATE,
 	BPF_PROG_STREAM_READ_BY_FD,
+	BPF_PROG_ASSOC_STRUCT_OPS,
 	__MAX_BPF_CMD,
 };
 
@@ -1894,6 +1905,12 @@ union bpf_attr {
 		__u32		prog_fd;
 	} prog_stream_read;
 
+	struct {
+		__u32		map_fd;
+		__u32		prog_fd;
+		__u32		flags;
+	} prog_assoc_struct_ops;
+
 } __attribute__((aligned(8)));
 
 /* The description below is an attempt at providing documentation to eBPF
-- 
cgit v1.2.3


From 87cd177b149a5d86103736994307c25999e0be4b Mon Sep 17 00:00:00 2001
From: Amery Hung <ameryhung@gmail.com>
Date: Wed, 3 Dec 2025 15:37:45 -0800
Subject: libbpf: Add support for associating BPF program with struct_ops

Add low-level wrapper and libbpf API for BPF_PROG_ASSOC_STRUCT_OPS
command in the bpf() syscall.

Signed-off-by: Amery Hung <ameryhung@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20251203233748.668365-4-ameryhung@gmail.com
---
 tools/lib/bpf/bpf.c      | 19 +++++++++++++++++++
 tools/lib/bpf/bpf.h      | 21 +++++++++++++++++++++
 tools/lib/bpf/libbpf.c   | 31 +++++++++++++++++++++++++++++++
 tools/lib/bpf/libbpf.h   | 16 ++++++++++++++++
 tools/lib/bpf/libbpf.map |  2 ++
 5 files changed, 89 insertions(+)

(limited to 'tools')

diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index b66f5fbfbbb2..21b57a629916 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -1397,3 +1397,22 @@ int bpf_prog_stream_read(int prog_fd, __u32 stream_id, void *buf, __u32 buf_len,
 	err = sys_bpf(BPF_PROG_STREAM_READ_BY_FD, &attr, attr_sz);
 	return libbpf_err_errno(err);
 }
+
+int bpf_prog_assoc_struct_ops(int prog_fd, int map_fd,
+			      struct bpf_prog_assoc_struct_ops_opts *opts)
+{
+	const size_t attr_sz = offsetofend(union bpf_attr, prog_assoc_struct_ops);
+	union bpf_attr attr;
+	int err;
+
+	if (!OPTS_VALID(opts, bpf_prog_assoc_struct_ops_opts))
+		return libbpf_err(-EINVAL);
+
+	memset(&attr, 0, attr_sz);
+	attr.prog_assoc_struct_ops.map_fd = map_fd;
+	attr.prog_assoc_struct_ops.prog_fd = prog_fd;
+	attr.prog_assoc_struct_ops.flags = OPTS_GET(opts, flags, 0);
+
+	err = sys_bpf(BPF_PROG_ASSOC_STRUCT_OPS, &attr, attr_sz);
+	return libbpf_err_errno(err);
+}
diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h
index e983a3e40d61..1f9c28d27795 100644
--- a/tools/lib/bpf/bpf.h
+++ b/tools/lib/bpf/bpf.h
@@ -733,6 +733,27 @@ struct bpf_prog_stream_read_opts {
 LIBBPF_API int bpf_prog_stream_read(int prog_fd, __u32 stream_id, void *buf, __u32 buf_len,
 				    struct bpf_prog_stream_read_opts *opts);
 
+struct bpf_prog_assoc_struct_ops_opts {
+	size_t sz;
+	__u32 flags;
+	size_t :0;
+};
+#define bpf_prog_assoc_struct_ops_opts__last_field flags
+
+/**
+ * @brief **bpf_prog_assoc_struct_ops** associates a BPF program with a
+ * struct_ops map.
+ *
+ * @param prog_fd FD for the BPF program
+ * @param map_fd FD for the struct_ops map to be associated with the BPF program
+ * @param opts optional options, can be NULL
+ *
+ * @return 0 on success; negative error code, otherwise (errno is also set to
+ * the error code)
+ */
+LIBBPF_API int bpf_prog_assoc_struct_ops(int prog_fd, int map_fd,
+					 struct bpf_prog_assoc_struct_ops_opts *opts);
+
 #ifdef __cplusplus
 } /* extern "C" */
 #endif
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 3dc8a8078815..c7c79014d46c 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -14133,6 +14133,37 @@ int bpf_program__set_attach_target(struct bpf_program *prog,
 	return 0;
 }
 
+int bpf_program__assoc_struct_ops(struct bpf_program *prog, struct bpf_map *map,
+				  struct bpf_prog_assoc_struct_ops_opts *opts)
+{
+	int prog_fd, map_fd;
+
+	prog_fd = bpf_program__fd(prog);
+	if (prog_fd < 0) {
+		pr_warn("prog '%s': can't associate BPF program without FD (was it loaded?)\n",
+			prog->name);
+		return libbpf_err(-EINVAL);
+	}
+
+	if (prog->type == BPF_PROG_TYPE_STRUCT_OPS) {
+		pr_warn("prog '%s': can't associate struct_ops program\n", prog->name);
+		return libbpf_err(-EINVAL);
+	}
+
+	map_fd = bpf_map__fd(map);
+	if (map_fd < 0) {
+		pr_warn("map '%s': can't associate BPF map without FD (was it created?)\n", map->name);
+		return libbpf_err(-EINVAL);
+	}
+
+	if (!bpf_map__is_struct_ops(map)) {
+		pr_warn("map '%s': can't associate non-struct_ops map\n", map->name);
+		return libbpf_err(-EINVAL);
+	}
+
+	return bpf_prog_assoc_struct_ops(prog_fd, map_fd, opts);
+}
+
 int parse_cpu_mask_str(const char *s, bool **mask, int *mask_sz)
 {
 	int err = 0, n, len, start, end = -1;
diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index 65e68e964b89..e14d9e349f9c 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -1006,6 +1006,22 @@ LIBBPF_API int
 bpf_program__set_attach_target(struct bpf_program *prog, int attach_prog_fd,
 			       const char *attach_func_name);
 
+struct bpf_prog_assoc_struct_ops_opts; /* defined in bpf.h */
+
+/**
+ * @brief **bpf_program__assoc_struct_ops()** associates a BPF program with a
+ * struct_ops map.
+ *
+ * @param prog BPF program
+ * @param map struct_ops map to be associated with the BPF program
+ * @param opts optional options, can be NULL
+ *
+ * @return 0, on success; negative error code, otherwise
+ */
+LIBBPF_API int
+bpf_program__assoc_struct_ops(struct bpf_program *prog, struct bpf_map *map,
+			      struct bpf_prog_assoc_struct_ops_opts *opts);
+
 /**
  * @brief **bpf_object__find_map_by_name()** returns BPF map of
  * the given name, if it exists within the passed BPF object
diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map
index 8ed8749907d4..84fb90a016c9 100644
--- a/tools/lib/bpf/libbpf.map
+++ b/tools/lib/bpf/libbpf.map
@@ -451,4 +451,6 @@ LIBBPF_1.7.0 {
 	global:
 		bpf_map__set_exclusive_program;
 		bpf_map__exclusive_program;
+		bpf_prog_assoc_struct_ops;
+		bpf_program__assoc_struct_ops;
 } LIBBPF_1.6.0;
-- 
cgit v1.2.3


From 33a165f9c2c1b9ddceaaccc356ce841baf1a08a2 Mon Sep 17 00:00:00 2001
From: Amery Hung <ameryhung@gmail.com>
Date: Wed, 3 Dec 2025 15:37:46 -0800
Subject: selftests/bpf: Test BPF_PROG_ASSOC_STRUCT_OPS command

Test BPF_PROG_ASSOC_STRUCT_OPS command that associates a BPF program
with a struct_ops. The test follows the same logic in commit
ba7000f1c360 ("selftests/bpf: Test multi_st_ops and calling kfuncs from
different programs"), but instead of using map id to identify a specific
struct_ops, this test uses the new BPF command to associate a struct_ops
with a program.

The test consists of two sets of almost identical struct_ops maps and BPF
programs associated with the map. Their only difference is the unique
value returned by bpf_testmod_multi_st_ops::test_1().

The test first loads the programs and associates them with struct_ops
maps. Then, it exercises the BPF programs. They will in turn call kfunc
bpf_kfunc_multi_st_ops_test_1_prog_arg() to trigger test_1() of the
associated struct_ops map, and then check if the right unique value is
returned.

Signed-off-by: Amery Hung <ameryhung@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20251203233748.668365-5-ameryhung@gmail.com
---
 .../bpf/prog_tests/test_struct_ops_assoc.c         |  72 ++++++++++++++
 .../testing/selftests/bpf/progs/struct_ops_assoc.c | 105 +++++++++++++++++++++
 .../testing/selftests/bpf/test_kmods/bpf_testmod.c |  17 ++++
 .../selftests/bpf/test_kmods/bpf_testmod_kfunc.h   |   1 +
 4 files changed, 195 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/test_struct_ops_assoc.c
 create mode 100644 tools/testing/selftests/bpf/progs/struct_ops_assoc.c

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/test_struct_ops_assoc.c b/tools/testing/selftests/bpf/prog_tests/test_struct_ops_assoc.c
new file mode 100644
index 000000000000..1e24a4915524
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/test_struct_ops_assoc.c
@@ -0,0 +1,72 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <test_progs.h>
+#include "struct_ops_assoc.skel.h"
+
+static void test_st_ops_assoc(void)
+{
+	struct struct_ops_assoc *skel = NULL;
+	int err, pid;
+
+	skel = struct_ops_assoc__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "struct_ops_assoc__open"))
+		goto out;
+
+	/* cannot explicitly associate struct_ops program */
+	err = bpf_program__assoc_struct_ops(skel->progs.test_1_a,
+					    skel->maps.st_ops_map_a, NULL);
+	ASSERT_ERR(err, "bpf_program__assoc_struct_ops(test_1_a, st_ops_map_a)");
+
+	err = bpf_program__assoc_struct_ops(skel->progs.syscall_prog_a,
+					    skel->maps.st_ops_map_a, NULL);
+	ASSERT_OK(err, "bpf_program__assoc_struct_ops(syscall_prog_a, st_ops_map_a)");
+
+	err = bpf_program__assoc_struct_ops(skel->progs.sys_enter_prog_a,
+					    skel->maps.st_ops_map_a, NULL);
+	ASSERT_OK(err, "bpf_program__assoc_struct_ops(sys_enter_prog_a, st_ops_map_a)");
+
+	err = bpf_program__assoc_struct_ops(skel->progs.syscall_prog_b,
+					    skel->maps.st_ops_map_b, NULL);
+	ASSERT_OK(err, "bpf_program__assoc_struct_ops(syscall_prog_b, st_ops_map_b)");
+
+	err = bpf_program__assoc_struct_ops(skel->progs.sys_enter_prog_b,
+					    skel->maps.st_ops_map_b, NULL);
+	ASSERT_OK(err, "bpf_program__assoc_struct_ops(sys_enter_prog_b, st_ops_map_b)");
+
+	/* sys_enter_prog_a already associated with map_a */
+	err = bpf_program__assoc_struct_ops(skel->progs.sys_enter_prog_a,
+					    skel->maps.st_ops_map_b, NULL);
+	ASSERT_ERR(err, "bpf_program__assoc_struct_ops(sys_enter_prog_a, st_ops_map_b)");
+
+	err = struct_ops_assoc__attach(skel);
+	if (!ASSERT_OK(err, "struct_ops_assoc__attach"))
+		goto out;
+
+	/* run tracing prog that calls .test_1 and checks return */
+	pid = getpid();
+	skel->bss->test_pid = pid;
+	sys_gettid();
+	skel->bss->test_pid = 0;
+
+	ASSERT_EQ(skel->bss->test_err_a, 0, "skel->bss->test_err_a");
+	ASSERT_EQ(skel->bss->test_err_b, 0, "skel->bss->test_err_b");
+
+	/* run syscall_prog that calls .test_1 and checks return */
+	err = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.syscall_prog_a), NULL);
+	ASSERT_OK(err, "bpf_prog_test_run_opts");
+
+	err = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.syscall_prog_b), NULL);
+	ASSERT_OK(err, "bpf_prog_test_run_opts");
+
+	ASSERT_EQ(skel->bss->test_err_a, 0, "skel->bss->test_err_a");
+	ASSERT_EQ(skel->bss->test_err_b, 0, "skel->bss->test_err_b");
+
+out:
+	struct_ops_assoc__destroy(skel);
+}
+
+void test_struct_ops_assoc(void)
+{
+	if (test__start_subtest("st_ops_assoc"))
+		test_st_ops_assoc();
+}
diff --git a/tools/testing/selftests/bpf/progs/struct_ops_assoc.c b/tools/testing/selftests/bpf/progs/struct_ops_assoc.c
new file mode 100644
index 000000000000..8f1097903e22
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/struct_ops_assoc.c
@@ -0,0 +1,105 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <vmlinux.h>
+#include <bpf/bpf_tracing.h>
+#include "bpf_misc.h"
+#include "../test_kmods/bpf_testmod.h"
+#include "../test_kmods/bpf_testmod_kfunc.h"
+
+char _license[] SEC("license") = "GPL";
+
+int test_pid;
+
+/* Programs associated with st_ops_map_a */
+
+#define MAP_A_MAGIC 1234
+int test_err_a;
+
+SEC("struct_ops")
+int BPF_PROG(test_1_a, struct st_ops_args *args)
+{
+	return MAP_A_MAGIC;
+}
+
+SEC("tp_btf/sys_enter")
+int BPF_PROG(sys_enter_prog_a, struct pt_regs *regs, long id)
+{
+	struct st_ops_args args = {};
+	struct task_struct *task;
+	int ret;
+
+	task = bpf_get_current_task_btf();
+	if (!test_pid || task->pid != test_pid)
+		return 0;
+
+	ret = bpf_kfunc_multi_st_ops_test_1_impl(&args, NULL);
+	if (ret != MAP_A_MAGIC)
+		test_err_a++;
+
+	return 0;
+}
+
+SEC("syscall")
+int syscall_prog_a(void *ctx)
+{
+	struct st_ops_args args = {};
+	int ret;
+
+	ret = bpf_kfunc_multi_st_ops_test_1_impl(&args, NULL);
+	if (ret != MAP_A_MAGIC)
+		test_err_a++;
+
+	return 0;
+}
+
+SEC(".struct_ops.link")
+struct bpf_testmod_multi_st_ops st_ops_map_a = {
+	.test_1 = (void *)test_1_a,
+};
+
+/* Programs associated with st_ops_map_b */
+
+#define MAP_B_MAGIC 5678
+int test_err_b;
+
+SEC("struct_ops")
+int BPF_PROG(test_1_b, struct st_ops_args *args)
+{
+	return MAP_B_MAGIC;
+}
+
+SEC("tp_btf/sys_enter")
+int BPF_PROG(sys_enter_prog_b, struct pt_regs *regs, long id)
+{
+	struct st_ops_args args = {};
+	struct task_struct *task;
+	int ret;
+
+	task = bpf_get_current_task_btf();
+	if (!test_pid || task->pid != test_pid)
+		return 0;
+
+	ret = bpf_kfunc_multi_st_ops_test_1_impl(&args, NULL);
+	if (ret != MAP_B_MAGIC)
+		test_err_b++;
+
+	return 0;
+}
+
+SEC("syscall")
+int syscall_prog_b(void *ctx)
+{
+	struct st_ops_args args = {};
+	int ret;
+
+	ret = bpf_kfunc_multi_st_ops_test_1_impl(&args, NULL);
+	if (ret != MAP_B_MAGIC)
+		test_err_b++;
+
+	return 0;
+}
+
+SEC(".struct_ops.link")
+struct bpf_testmod_multi_st_ops st_ops_map_b = {
+	.test_1 = (void *)test_1_b,
+};
diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c
index 1669a7eeda26..90c4b1a51de6 100644
--- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c
+++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c
@@ -1134,6 +1134,7 @@ __bpf_kfunc int bpf_kfunc_st_ops_inc10(struct st_ops_args *args)
 }
 
 __bpf_kfunc int bpf_kfunc_multi_st_ops_test_1(struct st_ops_args *args, u32 id);
+__bpf_kfunc int bpf_kfunc_multi_st_ops_test_1_impl(struct st_ops_args *args, void *aux_prog);
 
 BTF_KFUNCS_START(bpf_testmod_check_kfunc_ids)
 BTF_ID_FLAGS(func, bpf_testmod_test_mod_kfunc)
@@ -1176,6 +1177,7 @@ BTF_ID_FLAGS(func, bpf_kfunc_st_ops_test_epilogue, KF_TRUSTED_ARGS | KF_SLEEPABL
 BTF_ID_FLAGS(func, bpf_kfunc_st_ops_test_pro_epilogue, KF_TRUSTED_ARGS | KF_SLEEPABLE)
 BTF_ID_FLAGS(func, bpf_kfunc_st_ops_inc10, KF_TRUSTED_ARGS)
 BTF_ID_FLAGS(func, bpf_kfunc_multi_st_ops_test_1, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_kfunc_multi_st_ops_test_1_impl, KF_TRUSTED_ARGS)
 BTF_KFUNCS_END(bpf_testmod_check_kfunc_ids)
 
 static int bpf_testmod_ops_init(struct btf *btf)
@@ -1637,6 +1639,7 @@ static struct bpf_testmod_multi_st_ops *multi_st_ops_find_nolock(u32 id)
 	return NULL;
 }
 
+/* Call test_1() of the struct_ops map identified by the id */
 int bpf_kfunc_multi_st_ops_test_1(struct st_ops_args *args, u32 id)
 {
 	struct bpf_testmod_multi_st_ops *st_ops;
@@ -1652,6 +1655,20 @@ int bpf_kfunc_multi_st_ops_test_1(struct st_ops_args *args, u32 id)
 	return ret;
 }
 
+/* Call test_1() of the associated struct_ops map */
+int bpf_kfunc_multi_st_ops_test_1_impl(struct st_ops_args *args, void *aux__prog)
+{
+	struct bpf_prog_aux *prog_aux = (struct bpf_prog_aux *)aux__prog;
+	struct bpf_testmod_multi_st_ops *st_ops;
+	int ret = -1;
+
+	st_ops = (struct bpf_testmod_multi_st_ops *)bpf_prog_get_assoc_struct_ops(prog_aux);
+	if (st_ops)
+		ret = st_ops->test_1(args);
+
+	return ret;
+}
+
 static int multi_st_ops_reg(void *kdata, struct bpf_link *link)
 {
 	struct bpf_testmod_multi_st_ops *st_ops =
diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h b/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h
index 4df6fa6a92cb..2357a0340ffe 100644
--- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h
+++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h
@@ -162,5 +162,6 @@ struct task_struct *bpf_kfunc_ret_rcu_test(void) __ksym;
 int *bpf_kfunc_ret_rcu_test_nostruct(int rdonly_buf_size) __ksym;
 
 int bpf_kfunc_multi_st_ops_test_1(struct st_ops_args *args, u32 id) __ksym;
+int bpf_kfunc_multi_st_ops_test_1_impl(struct st_ops_args *args, void *aux__prog) __ksym;
 
 #endif /* _BPF_TESTMOD_KFUNC_H */
-- 
cgit v1.2.3


From 04fd12df4e05dd6fd3017b637f6fbc9da10b4e65 Mon Sep 17 00:00:00 2001
From: Amery Hung <ameryhung@gmail.com>
Date: Wed, 3 Dec 2025 15:37:47 -0800
Subject: selftests/bpf: Test ambiguous associated struct_ops

Add a test to make sure implicit struct_ops association does not
break backward compatibility nor return incorrect struct_ops.
struct_ops programs should still be allowed to be reused in
different struct_ops map. The associated struct_ops map set implicitly
however will be poisoned. Trying to read it through the helper
bpf_prog_get_assoc_struct_ops() should result in a NULL pointer.

While recursion of test_1() cannot happen due to the associated
struct_ops being ambiguois, explicitly check for it to prevent stack
overflow if the test regresses.

Signed-off-by: Amery Hung <ameryhung@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20251203233748.668365-6-ameryhung@gmail.com
---
 .../bpf/prog_tests/test_struct_ops_assoc.c         | 38 +++++++++++
 .../selftests/bpf/progs/struct_ops_assoc_reuse.c   | 75 ++++++++++++++++++++++
 2 files changed, 113 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/progs/struct_ops_assoc_reuse.c

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/test_struct_ops_assoc.c b/tools/testing/selftests/bpf/prog_tests/test_struct_ops_assoc.c
index 1e24a4915524..02173504f675 100644
--- a/tools/testing/selftests/bpf/prog_tests/test_struct_ops_assoc.c
+++ b/tools/testing/selftests/bpf/prog_tests/test_struct_ops_assoc.c
@@ -2,6 +2,7 @@
 
 #include <test_progs.h>
 #include "struct_ops_assoc.skel.h"
+#include "struct_ops_assoc_reuse.skel.h"
 
 static void test_st_ops_assoc(void)
 {
@@ -65,8 +66,45 @@ out:
 	struct_ops_assoc__destroy(skel);
 }
 
+static void test_st_ops_assoc_reuse(void)
+{
+	struct struct_ops_assoc_reuse *skel = NULL;
+	int err;
+
+	skel = struct_ops_assoc_reuse__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "struct_ops_assoc_reuse__open"))
+		goto out;
+
+	err = bpf_program__assoc_struct_ops(skel->progs.syscall_prog_a,
+					    skel->maps.st_ops_map_a, NULL);
+	ASSERT_OK(err, "bpf_program__assoc_struct_ops(syscall_prog_a, st_ops_map_a)");
+
+	err = bpf_program__assoc_struct_ops(skel->progs.syscall_prog_b,
+					    skel->maps.st_ops_map_b, NULL);
+	ASSERT_OK(err, "bpf_program__assoc_struct_ops(syscall_prog_b, st_ops_map_b)");
+
+	err = struct_ops_assoc_reuse__attach(skel);
+	if (!ASSERT_OK(err, "struct_ops_assoc__attach"))
+		goto out;
+
+	/* run syscall_prog that calls .test_1 and checks return */
+	err = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.syscall_prog_a), NULL);
+	ASSERT_OK(err, "bpf_prog_test_run_opts");
+
+	err = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.syscall_prog_b), NULL);
+	ASSERT_OK(err, "bpf_prog_test_run_opts");
+
+	ASSERT_EQ(skel->bss->test_err_a, 0, "skel->bss->test_err_a");
+	ASSERT_EQ(skel->bss->test_err_b, 0, "skel->bss->test_err_b");
+
+out:
+	struct_ops_assoc_reuse__destroy(skel);
+}
+
 void test_struct_ops_assoc(void)
 {
 	if (test__start_subtest("st_ops_assoc"))
 		test_st_ops_assoc();
+	if (test__start_subtest("st_ops_assoc_reuse"))
+		test_st_ops_assoc_reuse();
 }
diff --git a/tools/testing/selftests/bpf/progs/struct_ops_assoc_reuse.c b/tools/testing/selftests/bpf/progs/struct_ops_assoc_reuse.c
new file mode 100644
index 000000000000..5bb6ebf5eed4
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/struct_ops_assoc_reuse.c
@@ -0,0 +1,75 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <vmlinux.h>
+#include <bpf/bpf_tracing.h>
+#include "bpf_misc.h"
+#include "../test_kmods/bpf_testmod.h"
+#include "../test_kmods/bpf_testmod_kfunc.h"
+
+char _license[] SEC("license") = "GPL";
+
+#define MAP_A_MAGIC 1234
+int test_err_a;
+int recur;
+
+/*
+ * test_1_a is reused. The kfunc should not be able to get the associated
+ * struct_ops and call test_1 recursively as it is ambiguous.
+ */
+SEC("struct_ops")
+int BPF_PROG(test_1_a, struct st_ops_args *args)
+{
+	int ret;
+
+	if (!recur) {
+		recur++;
+		ret = bpf_kfunc_multi_st_ops_test_1_impl(args, NULL);
+		if (ret != -1)
+			test_err_a++;
+		recur--;
+	}
+
+	return MAP_A_MAGIC;
+}
+
+/* Programs associated with st_ops_map_a */
+
+SEC("syscall")
+int syscall_prog_a(void *ctx)
+{
+	struct st_ops_args args = {};
+	int ret;
+
+	ret = bpf_kfunc_multi_st_ops_test_1_impl(&args, NULL);
+	if (ret != MAP_A_MAGIC)
+		test_err_a++;
+
+	return 0;
+}
+
+SEC(".struct_ops.link")
+struct bpf_testmod_multi_st_ops st_ops_map_a = {
+	.test_1 = (void *)test_1_a,
+};
+
+/* Programs associated with st_ops_map_b */
+
+int test_err_b;
+
+SEC("syscall")
+int syscall_prog_b(void *ctx)
+{
+	struct st_ops_args args = {};
+	int ret;
+
+	ret = bpf_kfunc_multi_st_ops_test_1_impl(&args, NULL);
+	if (ret != MAP_A_MAGIC)
+		test_err_b++;
+
+	return 0;
+}
+
+SEC(".struct_ops.link")
+struct bpf_testmod_multi_st_ops st_ops_map_b = {
+	.test_1 = (void *)test_1_a,
+};
-- 
cgit v1.2.3


From 0e841d19263ab6e1ca2b280109832f57624e48d1 Mon Sep 17 00:00:00 2001
From: Amery Hung <ameryhung@gmail.com>
Date: Wed, 3 Dec 2025 15:37:48 -0800
Subject: selftests/bpf: Test getting associated struct_ops in timer callback

Make sure 1) a timer callback can also reference the associated
struct_ops, and then make sure 2) the timer callback cannot get a
dangled pointer to the struct_ops when the map is freed.

The test schedules a timer callback from a struct_ops program since
struct_ops programs do not pin the map. It is possible for the timer
callback to run after the map is freed. The timer callback calls a
kfunc that runs .test_1() of the associated struct_ops, which should
return MAP_MAGIC when the map is still alive or -1 when the map is
gone.

The first subtest added in this patch schedules the timer callback to
run immediately, while the map is still alive. The second subtest added
schedules the callback to run 500ms after syscall_prog runs and then
frees the map right after syscall_prog runs. Both subtests then wait
until the callback runs to check the return of the kfunc.

Signed-off-by: Amery Hung <ameryhung@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20251203233748.668365-7-ameryhung@gmail.com
---
 .../bpf/prog_tests/test_struct_ops_assoc.c         | 81 ++++++++++++++++++++++
 .../bpf/progs/struct_ops_assoc_in_timer.c          | 77 ++++++++++++++++++++
 2 files changed, 158 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/progs/struct_ops_assoc_in_timer.c

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/test_struct_ops_assoc.c b/tools/testing/selftests/bpf/prog_tests/test_struct_ops_assoc.c
index 02173504f675..461ded722351 100644
--- a/tools/testing/selftests/bpf/prog_tests/test_struct_ops_assoc.c
+++ b/tools/testing/selftests/bpf/prog_tests/test_struct_ops_assoc.c
@@ -3,6 +3,7 @@
 #include <test_progs.h>
 #include "struct_ops_assoc.skel.h"
 #include "struct_ops_assoc_reuse.skel.h"
+#include "struct_ops_assoc_in_timer.skel.h"
 
 static void test_st_ops_assoc(void)
 {
@@ -101,10 +102,90 @@ out:
 	struct_ops_assoc_reuse__destroy(skel);
 }
 
+static void test_st_ops_assoc_in_timer(void)
+{
+	struct struct_ops_assoc_in_timer *skel = NULL;
+	int err;
+
+	skel = struct_ops_assoc_in_timer__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "struct_ops_assoc_in_timer__open"))
+		goto out;
+
+	err = bpf_program__assoc_struct_ops(skel->progs.syscall_prog,
+					    skel->maps.st_ops_map, NULL);
+	ASSERT_OK(err, "bpf_program__assoc_struct_ops");
+
+	err = struct_ops_assoc_in_timer__attach(skel);
+	if (!ASSERT_OK(err, "struct_ops_assoc__attach"))
+		goto out;
+
+	/*
+	 * Run .test_1 by calling kfunc bpf_kfunc_multi_st_ops_test_1_prog_arg() and checks
+	 * the return value. .test_1 will also schedule timer_cb that runs .test_1 again
+	 * immediately.
+	 */
+	err = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.syscall_prog), NULL);
+	ASSERT_OK(err, "bpf_prog_test_run_opts");
+
+	/* Check the return of the kfunc after timer_cb runs */
+	while (!READ_ONCE(skel->bss->timer_cb_run))
+		sched_yield();
+	ASSERT_EQ(skel->bss->timer_test_1_ret, 1234, "skel->bss->timer_test_1_ret");
+	ASSERT_EQ(skel->bss->test_err, 0, "skel->bss->test_err_a");
+out:
+	struct_ops_assoc_in_timer__destroy(skel);
+}
+
+static void test_st_ops_assoc_in_timer_no_uref(void)
+{
+	struct struct_ops_assoc_in_timer *skel = NULL;
+	struct bpf_link *link;
+	int err;
+
+	skel = struct_ops_assoc_in_timer__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "struct_ops_assoc_in_timer__open"))
+		goto out;
+
+	err = bpf_program__assoc_struct_ops(skel->progs.syscall_prog,
+					    skel->maps.st_ops_map, NULL);
+	ASSERT_OK(err, "bpf_program__assoc_struct_ops");
+
+	link = bpf_map__attach_struct_ops(skel->maps.st_ops_map);
+	if (!ASSERT_OK_PTR(link, "bpf_map__attach_struct_ops"))
+		goto out;
+
+	/*
+	 * Run .test_1 by calling kfunc bpf_kfunc_multi_st_ops_test_1_prog_arg() and checks
+	 * the return value. .test_1 will also schedule timer_cb that runs .test_1 again.
+	 * timer_cb will run 500ms after syscall_prog runs, when the user space no longer
+	 * holds a reference to st_ops_map.
+	 */
+	skel->bss->timer_ns = 500000000;
+	err = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.syscall_prog), NULL);
+	ASSERT_OK(err, "bpf_prog_test_run_opts");
+
+	/* Detach and close struct_ops map to cause it to be freed */
+	bpf_link__destroy(link);
+	close(bpf_program__fd(skel->progs.syscall_prog));
+	close(bpf_map__fd(skel->maps.st_ops_map));
+
+	/* Check the return of the kfunc after timer_cb runs */
+	while (!READ_ONCE(skel->bss->timer_cb_run))
+		sched_yield();
+	ASSERT_EQ(skel->bss->timer_test_1_ret, -1, "skel->bss->timer_test_1_ret");
+	ASSERT_EQ(skel->bss->test_err, 0, "skel->bss->test_err_a");
+out:
+	struct_ops_assoc_in_timer__destroy(skel);
+}
+
 void test_struct_ops_assoc(void)
 {
 	if (test__start_subtest("st_ops_assoc"))
 		test_st_ops_assoc();
 	if (test__start_subtest("st_ops_assoc_reuse"))
 		test_st_ops_assoc_reuse();
+	if (test__start_subtest("st_ops_assoc_in_timer"))
+		test_st_ops_assoc_in_timer();
+	if (test__start_subtest("st_ops_assoc_in_timer_no_uref"))
+		test_st_ops_assoc_in_timer_no_uref();
 }
diff --git a/tools/testing/selftests/bpf/progs/struct_ops_assoc_in_timer.c b/tools/testing/selftests/bpf/progs/struct_ops_assoc_in_timer.c
new file mode 100644
index 000000000000..d5a2ea934284
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/struct_ops_assoc_in_timer.c
@@ -0,0 +1,77 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <vmlinux.h>
+#include <bpf/bpf_tracing.h>
+#include "bpf_misc.h"
+#include "../test_kmods/bpf_testmod.h"
+#include "../test_kmods/bpf_testmod_kfunc.h"
+
+char _license[] SEC("license") = "GPL";
+
+struct elem {
+	struct bpf_timer timer;
+};
+
+struct {
+	__uint(type, BPF_MAP_TYPE_ARRAY);
+	__uint(max_entries, 1);
+	__type(key, int);
+	__type(value, struct elem);
+} array_map SEC(".maps");
+
+#define MAP_MAGIC 1234
+int recur;
+int test_err;
+int timer_ns;
+int timer_test_1_ret;
+int timer_cb_run;
+
+__noinline static int timer_cb(void *map, int *key, struct bpf_timer *timer)
+{
+	struct st_ops_args args = {};
+
+	recur++;
+	timer_test_1_ret = bpf_kfunc_multi_st_ops_test_1_impl(&args, NULL);
+	recur--;
+
+	timer_cb_run++;
+
+	return 0;
+}
+
+SEC("struct_ops")
+int BPF_PROG(test_1, struct st_ops_args *args)
+{
+	struct bpf_timer *timer;
+	int key = 0;
+
+	if (!recur) {
+		timer = bpf_map_lookup_elem(&array_map, &key);
+		if (!timer)
+			return 0;
+
+		bpf_timer_init(timer, &array_map, 1);
+		bpf_timer_set_callback(timer, timer_cb);
+		bpf_timer_start(timer, timer_ns, 0);
+	}
+
+	return MAP_MAGIC;
+}
+
+SEC("syscall")
+int syscall_prog(void *ctx)
+{
+	struct st_ops_args args = {};
+	int ret;
+
+	ret = bpf_kfunc_multi_st_ops_test_1_impl(&args, NULL);
+	if (ret != MAP_MAGIC)
+		test_err++;
+
+	return 0;
+}
+
+SEC(".struct_ops.link")
+struct bpf_testmod_multi_st_ops st_ops_map = {
+	.test_1 = (void *)test_1,
+};
-- 
cgit v1.2.3


From 311ead1be05d2348e89f873337c8375e856e1abb Mon Sep 17 00:00:00 2001
From: Guopeng Zhang <zhangguopeng@kylinos.cn>
Date: Wed, 3 Dec 2025 19:56:29 +0800
Subject: selftests: cgroup: Add cg_read_key_long_poll() to poll a cgroup key
 with retries
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Introduce a new helper function `cg_read_key_long_poll()` in
cgroup_util.h. This function polls the specified key in a cgroup file
until it matches the expected value or the retry limit is reached,
with configurable wait intervals between retries.

This helper is particularly useful for handling asynchronously updated
cgroup statistics (e.g., memory.stat), where immediate reads may
observe stale values, especially on busy systems. It allows tests and
other utilities to handle such cases more flexibly.

Signed-off-by: Guopeng Zhang <zhangguopeng@kylinos.cn>
Suggested-by: Michal Koutný <mkoutny@suse.com>
Reviewed-by: Shakeel Butt <shakeel.butt@linux.dev>
Acked-by: Michal Koutný <mkoutny@suse.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 tools/testing/selftests/cgroup/lib/cgroup_util.c    | 21 +++++++++++++++++++++
 .../selftests/cgroup/lib/include/cgroup_util.h      |  5 +++++
 2 files changed, 26 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/cgroup/lib/cgroup_util.c b/tools/testing/selftests/cgroup/lib/cgroup_util.c
index 44c52f620fda..ce6c2642fd9b 100644
--- a/tools/testing/selftests/cgroup/lib/cgroup_util.c
+++ b/tools/testing/selftests/cgroup/lib/cgroup_util.c
@@ -168,6 +168,27 @@ long cg_read_key_long(const char *cgroup, const char *control, const char *key)
 	return atol(ptr + strlen(key));
 }
 
+long cg_read_key_long_poll(const char *cgroup, const char *control,
+			   const char *key, long expected, int retries,
+			   useconds_t wait_interval_us)
+{
+	long val = -1;
+	int i;
+
+	for (i = 0; i < retries; i++) {
+		val = cg_read_key_long(cgroup, control, key);
+		if (val < 0)
+			return val;
+
+		if (val == expected)
+			break;
+
+		usleep(wait_interval_us);
+	}
+
+	return val;
+}
+
 long cg_read_lc(const char *cgroup, const char *control)
 {
 	char buf[PAGE_SIZE];
diff --git a/tools/testing/selftests/cgroup/lib/include/cgroup_util.h b/tools/testing/selftests/cgroup/lib/include/cgroup_util.h
index 7ab2824ed7b5..77f386dab5e8 100644
--- a/tools/testing/selftests/cgroup/lib/include/cgroup_util.h
+++ b/tools/testing/selftests/cgroup/lib/include/cgroup_util.h
@@ -17,6 +17,8 @@
 #define CG_NAMED_NAME "selftest"
 #define CG_PATH_FORMAT (!cg_test_v1_named ? "0::%s" : (":name=" CG_NAMED_NAME ":%s"))
 
+#define DEFAULT_WAIT_INTERVAL_US (100 * 1000) /* 100 ms */
+
 /*
  * Checks if two given values differ by less than err% of their sum.
  */
@@ -64,6 +66,9 @@ extern int cg_read_strstr(const char *cgroup, const char *control,
 extern long cg_read_long(const char *cgroup, const char *control);
 extern long cg_read_long_fd(int fd);
 long cg_read_key_long(const char *cgroup, const char *control, const char *key);
+long cg_read_key_long_poll(const char *cgroup, const char *control,
+			   const char *key, long expected, int retries,
+			   useconds_t wait_interval_us);
 extern long cg_read_lc(const char *cgroup, const char *control);
 extern int cg_write(const char *cgroup, const char *control, char *buf);
 extern int cg_open(const char *cgroup, const char *control, int flags);
-- 
cgit v1.2.3


From 6360d444ae32871c6a048ac880ef3b871a439bad Mon Sep 17 00:00:00 2001
From: Guopeng Zhang <zhangguopeng@kylinos.cn>
Date: Wed, 3 Dec 2025 19:56:30 +0800
Subject: selftests: cgroup: make test_memcg_sock robust against delayed sock
 stats
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

test_memcg_sock() currently requires that memory.stat's "sock " counter
is exactly zero immediately after the TCP server exits. On a busy system
this assumption is too strict:

  - Socket memory may be freed with a small delay (e.g. RCU callbacks).
  - memcg statistics are updated asynchronously via the rstat flushing
    worker, so the "sock " value in memory.stat can stay non-zero for a
    short period of time even after all socket memory has been uncharged.

As a result, test_memcg_sock() can intermittently fail even though socket
memory accounting is working correctly.

Make the test more robust by polling memory.stat for the "sock "
counter and allowing it some time to drop to zero instead of checking
it only once. The timeout is set to 3 seconds to cover the periodic
rstat flush interval (FLUSH_TIME = 2*HZ by default) plus some
scheduling slack. If the counter does not become zero within the
timeout, the test still fails as before.

On my test system, running test_memcontrol 50 times produced:

  - Before this patch:  6/50 runs passed.
  - After this patch:  50/50 runs passed.

Signed-off-by: Guopeng Zhang <zhangguopeng@kylinos.cn>
Suggested-by: Lance Yang <lance.yang@linux.dev>
Reviewed-by: Shakeel Butt <shakeel.butt@linux.dev>
Acked-by: Michal Koutný <mkoutny@suse.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 tools/testing/selftests/cgroup/test_memcontrol.c | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/cgroup/test_memcontrol.c b/tools/testing/selftests/cgroup/test_memcontrol.c
index 4e1647568c5b..2fb096a2a9f9 100644
--- a/tools/testing/selftests/cgroup/test_memcontrol.c
+++ b/tools/testing/selftests/cgroup/test_memcontrol.c
@@ -21,6 +21,8 @@
 #include "kselftest.h"
 #include "cgroup_util.h"
 
+#define MEMCG_SOCKSTAT_WAIT_RETRIES        30
+
 static bool has_localevents;
 static bool has_recursiveprot;
 
@@ -1384,6 +1386,7 @@ static int test_memcg_sock(const char *root)
 	int bind_retries = 5, ret = KSFT_FAIL, pid, err;
 	unsigned short port;
 	char *memcg;
+	long sock_post = -1;
 
 	memcg = cg_name(root, "memcg_test");
 	if (!memcg)
@@ -1432,7 +1435,22 @@ static int test_memcg_sock(const char *root)
 	if (cg_read_long(memcg, "memory.current") < 0)
 		goto cleanup;
 
-	if (cg_read_key_long(memcg, "memory.stat", "sock "))
+	/*
+	 * memory.stat is updated asynchronously via the memcg rstat
+	 * flushing worker, which runs periodically (every 2 seconds,
+	 * see FLUSH_TIME). On a busy system, the "sock " counter may
+	 * stay non-zero for a short period of time after the TCP
+	 * connection is closed and all socket memory has been
+	 * uncharged.
+	 *
+	 * Poll memory.stat for up to 3 seconds (~FLUSH_TIME plus some
+	 * scheduling slack) and require that the "sock " counter
+	 * eventually drops to zero.
+	 */
+	sock_post = cg_read_key_long_poll(memcg, "memory.stat", "sock ", 0,
+					 MEMCG_SOCKSTAT_WAIT_RETRIES,
+					 DEFAULT_WAIT_INTERVAL_US);
+	if (sock_post)
 		goto cleanup;
 
 	ret = KSFT_PASS;
-- 
cgit v1.2.3


From 50133c09d189a26f4cc6e78e382864fd599a1dc4 Mon Sep 17 00:00:00 2001
From: Guopeng Zhang <zhangguopeng@kylinos.cn>
Date: Wed, 3 Dec 2025 19:56:31 +0800
Subject: selftests: cgroup: Replace sleep with cg_read_key_long_poll() for
 waiting on nr_dying_descendants
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace the manual sleep-and-retry logic in test_kmem_dead_cgroups()
with the new helper `cg_read_key_long_poll()`. This change improves
the robustness of the test by polling the "nr_dying_descendants"
counter in `cgroup.stat` until it reaches 0 or the timeout is exceeded.

Additionally, increase the retry timeout to 8 seconds (from 5 seconds)
based on testing results:
  - With 5-second timeout: 4/20 runs passed.
  - With 8-second timeout: 20/20 runs passed.

The 8 second timeout is based on stress testing of test_kmem_dead_cgroups()
under load: 5 seconds was occasionally not enough for reclaim of dying
descendants to complete, whereas 8 seconds consistently covered the observed
latencies. This value is intended as a generous upper bound for the
asynchronous reclaim and is not tied to any specific kernel constant, so it
can be adjusted in the future if reclaim behavior changes.

Signed-off-by: Guopeng Zhang <zhangguopeng@kylinos.cn>
Reviewed-by: Shakeel Butt <shakeel.butt@linux.dev>
Acked-by: Michal Koutný <mkoutny@suse.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 tools/testing/selftests/cgroup/test_kmem.c | 33 ++++++++++++++----------------
 1 file changed, 15 insertions(+), 18 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/cgroup/test_kmem.c b/tools/testing/selftests/cgroup/test_kmem.c
index ca38525484e3..eeabd34bf083 100644
--- a/tools/testing/selftests/cgroup/test_kmem.c
+++ b/tools/testing/selftests/cgroup/test_kmem.c
@@ -26,6 +26,7 @@
  */
 #define MAX_VMSTAT_ERROR (4096 * 64 * get_nprocs())
 
+#define KMEM_DEAD_WAIT_RETRIES        80
 
 static int alloc_dcache(const char *cgroup, void *arg)
 {
@@ -306,9 +307,7 @@ static int test_kmem_dead_cgroups(const char *root)
 {
 	int ret = KSFT_FAIL;
 	char *parent;
-	long dead;
-	int i;
-	int max_time = 20;
+	long dead = -1;
 
 	parent = cg_name(root, "kmem_dead_cgroups_test");
 	if (!parent)
@@ -323,21 +322,19 @@ static int test_kmem_dead_cgroups(const char *root)
 	if (cg_run_in_subcgroups(parent, alloc_dcache, (void *)100, 30))
 		goto cleanup;
 
-	for (i = 0; i < max_time; i++) {
-		dead = cg_read_key_long(parent, "cgroup.stat",
-					"nr_dying_descendants ");
-		if (dead == 0) {
-			ret = KSFT_PASS;
-			break;
-		}
-		/*
-		 * Reclaiming cgroups might take some time,
-		 * let's wait a bit and repeat.
-		 */
-		sleep(1);
-		if (i > 5)
-			printf("Waiting time longer than 5s; wait: %ds (dead: %ld)\n", i, dead);
-	}
+	/*
+	 * Allow up to ~8s for reclaim of dying descendants to complete.
+	 * This is a generous upper bound derived from stress testing, not
+	 * from a specific kernel constant, and can be adjusted if reclaim
+	 * behavior changes in the future.
+	 */
+	dead = cg_read_key_long_poll(parent, "cgroup.stat",
+					"nr_dying_descendants ", 0, KMEM_DEAD_WAIT_RETRIES,
+					DEFAULT_WAIT_INTERVAL_US);
+	if (dead)
+		goto cleanup;
+
+	ret = KSFT_PASS;
 
 cleanup:
 	cg_destroy(parent);
-- 
cgit v1.2.3


From 18352f8fae91d23bbd7165a7b2a1f15c4f5beff8 Mon Sep 17 00:00:00 2001
From: Kohei Enju <enjuk@amazon.com>
Date: Mon, 8 Dec 2025 22:14:32 +0900
Subject: selftests/bpf: add tests for attaching invalid fd
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add test cases for situations where adding the following types of file
descriptors to a cpumap entry should fail:
- Non-BPF file descriptor (expect -EINVAL)
- Nonexistent file descriptor (expect -EBADF)

Also tighten the assertion for the expected error when adding a
non-BPF_XDP_CPUMAP program to a cpumap entry.

Signed-off-by: Kohei Enju <enjuk@amazon.com>
Reviewed-by: Toke Høiland-Jørgensen <toke@redhat.com>
Link: https://lore.kernel.org/r/20251208131449.73036-3-enjuk@amazon.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 .../selftests/bpf/prog_tests/xdp_cpumap_attach.c      | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_cpumap_attach.c b/tools/testing/selftests/bpf/prog_tests/xdp_cpumap_attach.c
index df27535995af..ad56e4370ce3 100644
--- a/tools/testing/selftests/bpf/prog_tests/xdp_cpumap_attach.c
+++ b/tools/testing/selftests/bpf/prog_tests/xdp_cpumap_attach.c
@@ -18,7 +18,7 @@ static void test_xdp_with_cpumap_helpers(void)
 	struct bpf_cpumap_val val = {
 		.qsize = 192,
 	};
-	int err, prog_fd, prog_redir_fd, map_fd;
+	int err, prog_fd, prog_redir_fd, map_fd, bad_fd;
 	struct nstoken *nstoken = NULL;
 	__u32 idx = 0;
 
@@ -79,7 +79,22 @@ static void test_xdp_with_cpumap_helpers(void)
 	val.qsize = 192;
 	val.bpf_prog.fd = bpf_program__fd(skel->progs.xdp_dummy_prog);
 	err = bpf_map_update_elem(map_fd, &idx, &val, 0);
-	ASSERT_NEQ(err, 0, "Add non-BPF_XDP_CPUMAP program to cpumap entry");
+	ASSERT_EQ(err, -EINVAL, "Add non-BPF_XDP_CPUMAP program to cpumap entry");
+
+	/* Try to attach non-BPF file descriptor */
+	bad_fd = open("/dev/null", O_RDONLY);
+	ASSERT_GE(bad_fd, 0, "Open /dev/null for non-BPF fd");
+
+	val.bpf_prog.fd = bad_fd;
+	err = bpf_map_update_elem(map_fd, &idx, &val, 0);
+	ASSERT_EQ(err, -EINVAL, "Add non-BPF fd to cpumap entry");
+
+	/* Try to attach nonexistent file descriptor */
+	err = close(bad_fd);
+	ASSERT_EQ(err, 0, "Close non-BPF fd for nonexistent fd");
+
+	err = bpf_map_update_elem(map_fd, &idx, &val, 0);
+	ASSERT_EQ(err, -EBADF, "Add nonexistent fd to cpumap entry");
 
 	/* Try to attach BPF_XDP program with frags to cpumap when we have
 	 * already loaded a BPF_XDP program on the map
-- 
cgit v1.2.3


From a5b4867fad18e72fd5fc442c16be83723776283b Mon Sep 17 00:00:00 2001
From: Cupertino Miranda <cupertino.miranda@oracle.com>
Date: Tue, 2 Dec 2025 18:02:20 +0000
Subject: selftests/bpf: add verifier sign extension bound computation tests.

This commit adds 3 tests to verify a common compiler generated
pattern for sign extension (r1 <<= 32; r1 s>>= 32).
The tests make sure the register bounds are correctly computed both for
positive and negative register values.

Signed-off-by: Cupertino Miranda  <cupertino.miranda@oracle.com>
Signed-off-by: Andrew Pinski  <andrew.pinski@oss.qualcomm.com>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Cc: David Faust  <david.faust@oracle.com>
Cc: Jose Marchesi  <jose.marchesi@oracle.com>
Cc: Elena Zannoni  <elena.zannoni@oracle.com>
Link: https://lore.kernel.org/r/20251202180220.11128-3-cupertino.miranda@oracle.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 .../testing/selftests/bpf/progs/verifier_subreg.c  | 68 ++++++++++++++++++++++
 1 file changed, 68 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/progs/verifier_subreg.c b/tools/testing/selftests/bpf/progs/verifier_subreg.c
index 8613ea160dcd..b3e1c3eef9ae 100644
--- a/tools/testing/selftests/bpf/progs/verifier_subreg.c
+++ b/tools/testing/selftests/bpf/progs/verifier_subreg.c
@@ -531,6 +531,74 @@ __naked void arsh32_imm_zero_extend_check(void)
 	: __clobber_all);
 }
 
+SEC("socket")
+__description("arsh32 imm sign positive extend check")
+__success __retval(0)
+__log_level(2)
+__msg("2: (57) r6 &= 4095                    ; R6=scalar(smin=smin32=0,smax=umax=smax32=umax32=4095,var_off=(0x0; 0xfff))")
+__msg("3: (67) r6 <<= 32                     ; R6=scalar(smin=smin32=0,smax=umax=0xfff00000000,smax32=umax32=0,var_off=(0x0; 0xfff00000000))")
+__msg("4: (c7) r6 s>>= 32                    ; R6=scalar(smin=smin32=0,smax=umax=smax32=umax32=4095,var_off=(0x0; 0xfff))")
+__naked void arsh32_imm_sign_extend_positive_check(void)
+{
+	asm volatile ("					\
+	call %[bpf_get_prandom_u32];			\
+	r6 = r0;					\
+	r6 &= 4095;					\
+	r6 <<= 32;					\
+	r6 s>>= 32;					\
+	r0 = 0;						\
+	exit;						\
+"	:
+	: __imm(bpf_get_prandom_u32)
+	: __clobber_all);
+}
+
+SEC("socket")
+__description("arsh32 imm sign negative extend check")
+__success __retval(0)
+__log_level(2)
+__msg("3: (17) r6 -= 4095                    ; R6=scalar(smin=smin32=-4095,smax=smax32=0)")
+__msg("4: (67) r6 <<= 32                     ; R6=scalar(smin=0xfffff00100000000,smax=smax32=umax32=0,umax=0xffffffff00000000,smin32=0,var_off=(0x0; 0xffffffff00000000))")
+__msg("5: (c7) r6 s>>= 32                    ; R6=scalar(smin=smin32=-4095,smax=smax32=0)")
+__naked void arsh32_imm_sign_extend_negative_check(void)
+{
+	asm volatile ("					\
+	call %[bpf_get_prandom_u32];			\
+	r6 = r0;					\
+	r6 &= 4095;					\
+	r6 -= 4095;					\
+	r6 <<= 32;					\
+	r6 s>>= 32;					\
+	r0 = 0;						\
+	exit;						\
+"	:
+	: __imm(bpf_get_prandom_u32)
+	: __clobber_all);
+}
+
+SEC("socket")
+__description("arsh32 imm sign extend check")
+__success __retval(0)
+__log_level(2)
+__msg("3: (17) r6 -= 2047                    ; R6=scalar(smin=smin32=-2047,smax=smax32=2048)")
+__msg("4: (67) r6 <<= 32                     ; R6=scalar(smin=0xfffff80100000000,smax=0x80000000000,umax=0xffffffff00000000,smin32=0,smax32=umax32=0,var_off=(0x0; 0xffffffff00000000))")
+__msg("5: (c7) r6 s>>= 32                    ; R6=scalar(smin=smin32=-2047,smax=smax32=2048)")
+__naked void arsh32_imm_sign_extend_check(void)
+{
+	asm volatile ("					\
+	call %[bpf_get_prandom_u32];			\
+	r6 = r0;					\
+	r6 &= 4095;					\
+	r6 -= 2047;					\
+	r6 <<= 32;					\
+	r6 s>>= 32;					\
+	r0 = 0;						\
+	exit;						\
+"	:
+	: __imm(bpf_get_prandom_u32)
+	: __clobber_all);
+}
+
 SEC("socket")
 __description("end16 (to_le) reg zero extend check")
 __success __success_unpriv __retval(0)
-- 
cgit v1.2.3


From 0c82fdbbbfbee878a0a5e884ea5f31dd16138f0d Mon Sep 17 00:00:00 2001
From: Bhavik Sachdev <b.sachdev1904@gmail.com>
Date: Sat, 29 Nov 2025 14:41:22 +0530
Subject: selftests: statmount: tests for STATMOUNT_BY_FD

Add tests for STATMOUNT_BY_FD flag, which adds support for passing a
file descriptors to statmount(). The fd can also be on a "unmounted"
mount (mount unmounted with MNT_DETACH), we also include tests for that.

Co-developed-by: Andrei Vagin <avagin@gmail.com>
Signed-off-by: Andrei Vagin <avagin@gmail.com>
Signed-off-by: Bhavik Sachdev <b.sachdev1904@gmail.com>
Link: https://patch.msgid.link/20251129091455.757724-4-b.sachdev1904@gmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 .../selftests/filesystems/statmount/statmount.h    |  15 +-
 .../filesystems/statmount/statmount_test.c         | 261 +++++++++++++++++++--
 .../filesystems/statmount/statmount_test_ns.c      | 101 +++++++-
 3 files changed, 354 insertions(+), 23 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/filesystems/statmount/statmount.h b/tools/testing/selftests/filesystems/statmount/statmount.h
index 99e5ad082fb1..e1cba4bfd8d9 100644
--- a/tools/testing/selftests/filesystems/statmount/statmount.h
+++ b/tools/testing/selftests/filesystems/statmount/statmount.h
@@ -43,19 +43,24 @@
 	#endif
 #endif
 
-static inline int statmount(uint64_t mnt_id, uint64_t mnt_ns_id, uint64_t mask,
-			    struct statmount *buf, size_t bufsize,
+static inline int statmount(uint64_t mnt_id, uint64_t mnt_ns_id, uint32_t fd,
+			    uint64_t mask, struct statmount *buf, size_t bufsize,
 			    unsigned int flags)
 {
 	struct mnt_id_req req = {
 		.size = MNT_ID_REQ_SIZE_VER0,
-		.mnt_id = mnt_id,
 		.param = mask,
 	};
 
-	if (mnt_ns_id) {
+	if (flags & STATMOUNT_BY_FD) {
 		req.size = MNT_ID_REQ_SIZE_VER1;
-		req.mnt_ns_id = mnt_ns_id;
+		req.mnt_fd = fd;
+	} else {
+		req.mnt_id = mnt_id;
+		if (mnt_ns_id) {
+			req.size = MNT_ID_REQ_SIZE_VER1;
+			req.mnt_ns_id = mnt_ns_id;
+		}
 	}
 
 	return syscall(__NR_statmount, &req, buf, bufsize, flags);
diff --git a/tools/testing/selftests/filesystems/statmount/statmount_test.c b/tools/testing/selftests/filesystems/statmount/statmount_test.c
index 6e53430423d2..a04bcaace126 100644
--- a/tools/testing/selftests/filesystems/statmount/statmount_test.c
+++ b/tools/testing/selftests/filesystems/statmount/statmount_test.c
@@ -33,15 +33,24 @@ static const char *const known_fs[] = {
 	"sysv", "tmpfs", "tracefs", "ubifs", "udf", "ufs", "v7", "vboxsf",
 	"vfat", "virtiofs", "vxfs", "xenfs", "xfs", "zonefs", NULL };
 
-static struct statmount *statmount_alloc(uint64_t mnt_id, uint64_t mask, unsigned int flags)
+static struct statmount *statmount_alloc(uint64_t mnt_id, int fd, uint64_t mask, unsigned int flags)
 {
 	size_t bufsize = 1 << 15;
-	struct statmount *buf = NULL, *tmp = alloca(bufsize);
+	struct statmount *buf = NULL, *tmp = NULL;
 	int tofree = 0;
 	int ret;
 
+	if (flags & STATMOUNT_BY_FD && fd < 0)
+		return NULL;
+
+	tmp = alloca(bufsize);
+
 	for (;;) {
-		ret = statmount(mnt_id, 0, mask, tmp, bufsize, flags);
+		if (flags & STATMOUNT_BY_FD)
+			ret = statmount(0, 0, (uint32_t) fd, mask, tmp, bufsize, flags);
+		else
+			ret = statmount(mnt_id, 0, 0, mask, tmp, bufsize, flags);
+
 		if (ret != -1)
 			break;
 		if (tofree)
@@ -237,7 +246,7 @@ static void test_statmount_zero_mask(void)
 	struct statmount sm;
 	int ret;
 
-	ret = statmount(root_id, 0, 0, &sm, sizeof(sm), 0);
+	ret = statmount(root_id, 0, 0, 0, &sm, sizeof(sm), 0);
 	if (ret == -1) {
 		ksft_test_result_fail("statmount zero mask: %s\n",
 				      strerror(errno));
@@ -263,7 +272,7 @@ static void test_statmount_mnt_basic(void)
 	int ret;
 	uint64_t mask = STATMOUNT_MNT_BASIC;
 
-	ret = statmount(root_id, 0, mask, &sm, sizeof(sm), 0);
+	ret = statmount(root_id, 0, 0, mask, &sm, sizeof(sm), 0);
 	if (ret == -1) {
 		ksft_test_result_fail("statmount mnt basic: %s\n",
 				      strerror(errno));
@@ -323,7 +332,7 @@ static void test_statmount_sb_basic(void)
 	struct statx sx;
 	struct statfs sf;
 
-	ret = statmount(root_id, 0, mask, &sm, sizeof(sm), 0);
+	ret = statmount(root_id, 0, 0, mask, &sm, sizeof(sm), 0);
 	if (ret == -1) {
 		ksft_test_result_fail("statmount sb basic: %s\n",
 				      strerror(errno));
@@ -375,7 +384,7 @@ static void test_statmount_mnt_point(void)
 {
 	struct statmount *sm;
 
-	sm = statmount_alloc(root_id, STATMOUNT_MNT_POINT, 0);
+	sm = statmount_alloc(root_id, 0, STATMOUNT_MNT_POINT, 0);
 	if (!sm) {
 		ksft_test_result_fail("statmount mount point: %s\n",
 				      strerror(errno));
@@ -405,7 +414,7 @@ static void test_statmount_mnt_root(void)
 	assert(last_dir);
 	last_dir++;
 
-	sm = statmount_alloc(root_id, STATMOUNT_MNT_ROOT, 0);
+	sm = statmount_alloc(root_id, 0, STATMOUNT_MNT_ROOT, 0);
 	if (!sm) {
 		ksft_test_result_fail("statmount mount root: %s\n",
 				      strerror(errno));
@@ -438,7 +447,7 @@ static void test_statmount_fs_type(void)
 	const char *fs_type;
 	const char *const *s;
 
-	sm = statmount_alloc(root_id, STATMOUNT_FS_TYPE, 0);
+	sm = statmount_alloc(root_id, 0, STATMOUNT_FS_TYPE, 0);
 	if (!sm) {
 		ksft_test_result_fail("statmount fs type: %s\n",
 				      strerror(errno));
@@ -467,7 +476,7 @@ static void test_statmount_mnt_opts(void)
 	char *line = NULL;
 	size_t len = 0;
 
-	sm = statmount_alloc(root_id, STATMOUNT_MNT_BASIC | STATMOUNT_MNT_OPTS,
+	sm = statmount_alloc(root_id, 0, STATMOUNT_MNT_BASIC | STATMOUNT_MNT_OPTS,
 			     0);
 	if (!sm) {
 		ksft_test_result_fail("statmount mnt opts: %s\n",
@@ -557,7 +566,7 @@ static void test_statmount_string(uint64_t mask, size_t off, const char *name)
 	uint32_t start, i;
 	int ret;
 
-	sm = statmount_alloc(root_id, mask, 0);
+	sm = statmount_alloc(root_id, 0, mask, 0);
 	if (!sm) {
 		ksft_test_result_fail("statmount %s: %s\n", name,
 				      strerror(errno));
@@ -586,14 +595,14 @@ static void test_statmount_string(uint64_t mask, size_t off, const char *name)
 	exactsize = sm->size;
 	shortsize = sizeof(*sm) + i;
 
-	ret = statmount(root_id, 0, mask, sm, exactsize, 0);
+	ret = statmount(root_id, 0, 0, mask, sm, exactsize, 0);
 	if (ret == -1) {
 		ksft_test_result_fail("statmount exact size: %s\n",
 				      strerror(errno));
 		goto out;
 	}
 	errno = 0;
-	ret = statmount(root_id, 0, mask, sm, shortsize, 0);
+	ret = statmount(root_id, 0, 0, mask, sm, shortsize, 0);
 	if (ret != -1 || errno != EOVERFLOW) {
 		ksft_test_result_fail("should have failed with EOVERFLOW: %s\n",
 				      strerror(errno));
@@ -658,6 +667,226 @@ static void test_listmount_tree(void)
 	ksft_test_result_pass("listmount tree\n");
 }
 
+static void test_statmount_by_fd(void)
+{
+	struct statmount *sm = NULL;
+	char tmpdir[] = "/statmount.fd.XXXXXX";
+	const char root[] = "/test";
+	char subdir[PATH_MAX], tmproot[PATH_MAX];
+	int fd;
+
+	if (!mkdtemp(tmpdir)) {
+		ksft_perror("mkdtemp");
+		return;
+	}
+
+	if (mount("statmount.test", tmpdir, "tmpfs", 0, NULL)) {
+		ksft_perror("mount");
+		rmdir(tmpdir);
+		return;
+	}
+
+	snprintf(subdir, PATH_MAX, "%s%s", tmpdir, root);
+	snprintf(tmproot, PATH_MAX, "%s/%s", tmpdir, "chroot");
+
+	if (mkdir(subdir, 0755)) {
+		ksft_perror("mkdir");
+		goto err_tmpdir;
+	}
+
+	if (mount(subdir, subdir, NULL, MS_BIND, 0)) {
+		ksft_perror("mount");
+		goto err_subdir;
+	}
+
+	if (mkdir(tmproot, 0755)) {
+		ksft_perror("mkdir");
+		goto err_subdir;
+	}
+
+	fd = open(subdir, O_PATH);
+	if (fd < 0) {
+		ksft_perror("open");
+		goto err_tmproot;
+	}
+
+	if (chroot(tmproot)) {
+		ksft_perror("chroot");
+		goto err_fd;
+	}
+
+	sm = statmount_alloc(0, fd, STATMOUNT_MNT_ROOT | STATMOUNT_MNT_POINT, STATMOUNT_BY_FD);
+	if (!sm) {
+		ksft_test_result_fail("statmount by fd failed: %s\n", strerror(errno));
+		goto err_chroot;
+	}
+
+	if (sm->size < sizeof(*sm)) {
+		ksft_test_result_fail("unexpected size: %u < %u\n",
+				      sm->size, (uint32_t) sizeof(*sm));
+		goto err_chroot;
+	}
+
+	if (sm->mask & STATMOUNT_MNT_POINT) {
+		ksft_test_result_fail("STATMOUNT_MNT_POINT unexpectedly set in statmount\n");
+		goto err_chroot;
+	}
+
+	if (!(sm->mask & STATMOUNT_MNT_ROOT)) {
+		ksft_test_result_fail("STATMOUNT_MNT_ROOT not set in statmount\n");
+		goto err_chroot;
+	}
+
+	if (strcmp(root, sm->str + sm->mnt_root) != 0) {
+		ksft_test_result_fail("statmount returned incorrect mnt_root,"
+			"statmount mnt_root: %s != %s\n",
+			sm->str + sm->mnt_root, root);
+		goto err_chroot;
+	}
+
+	if (chroot(".")) {
+		ksft_perror("chroot");
+		goto out;
+	}
+
+	free(sm);
+	sm = statmount_alloc(0, fd, STATMOUNT_MNT_ROOT | STATMOUNT_MNT_POINT, STATMOUNT_BY_FD);
+	if (!sm) {
+		ksft_test_result_fail("statmount by fd failed: %s\n", strerror(errno));
+		goto err_fd;
+	}
+
+	if (sm->size < sizeof(*sm)) {
+		ksft_test_result_fail("unexpected size: %u < %u\n",
+				      sm->size, (uint32_t) sizeof(*sm));
+		goto out;
+	}
+
+	if (!(sm->mask & STATMOUNT_MNT_POINT)) {
+		ksft_test_result_fail("STATMOUNT_MNT_POINT not set in statmount\n");
+		goto out;
+	}
+
+	if (!(sm->mask & STATMOUNT_MNT_ROOT)) {
+		ksft_test_result_fail("STATMOUNT_MNT_ROOT not set in statmount\n");
+		goto out;
+	}
+
+	if (strcmp(subdir, sm->str + sm->mnt_point) != 0) {
+		ksft_test_result_fail("statmount returned incorrect mnt_point,"
+			"statmount mnt_point: %s != %s\n", sm->str + sm->mnt_point, subdir);
+		goto out;
+	}
+
+	if (strcmp(root, sm->str + sm->mnt_root) != 0) {
+		ksft_test_result_fail("statmount returned incorrect mnt_root,"
+			"statmount mnt_root: %s != %s\n", sm->str + sm->mnt_root, root);
+		goto out;
+	}
+
+	ksft_test_result_pass("statmount by fd\n");
+	goto out;
+err_chroot:
+	chroot(".");
+out:
+	free(sm);
+err_fd:
+	close(fd);
+err_tmproot:
+	rmdir(tmproot);
+err_subdir:
+	umount2(subdir, MNT_DETACH);
+	rmdir(subdir);
+err_tmpdir:
+	umount2(tmpdir, MNT_DETACH);
+	rmdir(tmpdir);
+}
+
+static void test_statmount_by_fd_unmounted(void)
+{
+	const char root[] = "/test.unmounted";
+	char tmpdir[] = "/statmount.fd.XXXXXX";
+	char subdir[PATH_MAX];
+	int fd;
+	struct statmount *sm = NULL;
+
+	if (!mkdtemp(tmpdir)) {
+		ksft_perror("mkdtemp");
+		return;
+	}
+
+	if (mount("statmount.test", tmpdir, "tmpfs", 0, NULL)) {
+		ksft_perror("mount");
+		rmdir(tmpdir);
+		return;
+	}
+
+	snprintf(subdir, PATH_MAX, "%s%s", tmpdir, root);
+
+	if (mkdir(subdir, 0755)) {
+		ksft_perror("mkdir");
+		goto err_tmpdir;
+	}
+
+	if (mount(subdir, subdir, 0, MS_BIND, NULL)) {
+		ksft_perror("mount");
+		goto err_subdir;
+	}
+
+	fd = open(subdir, O_PATH);
+	if (fd < 0) {
+		ksft_perror("open");
+		goto err_subdir;
+	}
+
+	if (umount2(tmpdir, MNT_DETACH)) {
+		ksft_perror("umount2");
+		goto err_fd;
+	}
+
+	sm = statmount_alloc(0, fd, STATMOUNT_MNT_POINT | STATMOUNT_MNT_ROOT, STATMOUNT_BY_FD);
+	if (!sm) {
+		ksft_test_result_fail("statmount by fd unmounted: %s\n",
+				      strerror(errno));
+		goto err_sm;
+	}
+
+	if (sm->size < sizeof(*sm)) {
+		ksft_test_result_fail("unexpected size: %u < %u\n",
+				      sm->size, (uint32_t) sizeof(*sm));
+		goto err_sm;
+	}
+
+	if (sm->mask & STATMOUNT_MNT_POINT) {
+		ksft_test_result_fail("STATMOUNT_MNT_POINT unexpectedly set in mask\n");
+		goto err_sm;
+	}
+
+	if (!(sm->mask & STATMOUNT_MNT_ROOT)) {
+		ksft_test_result_fail("STATMOUNT_MNT_ROOT not set in mask\n");
+		goto err_sm;
+	}
+
+	if (strcmp(sm->str + sm->mnt_root, root) != 0) {
+		ksft_test_result_fail("statmount returned incorrect mnt_root,"
+			"statmount mnt_root: %s != %s\n",
+			sm->str + sm->mnt_root, root);
+		goto err_sm;
+	}
+
+	ksft_test_result_pass("statmount by fd on unmounted mount\n");
+err_sm:
+	free(sm);
+err_fd:
+	close(fd);
+err_subdir:
+	umount2(subdir, MNT_DETACH);
+	rmdir(subdir);
+err_tmpdir:
+	umount2(tmpdir, MNT_DETACH);
+	rmdir(tmpdir);
+}
+
 #define str_off(memb) (offsetof(struct statmount, memb) / sizeof(uint32_t))
 
 int main(void)
@@ -669,14 +898,14 @@ int main(void)
 
 	ksft_print_header();
 
-	ret = statmount(0, 0, 0, NULL, 0, 0);
+	ret = statmount(0, 0, 0, 0, NULL, 0, 0);
 	assert(ret == -1);
 	if (errno == ENOSYS)
 		ksft_exit_skip("statmount() syscall not supported\n");
 
 	setup_namespace();
 
-	ksft_set_plan(15);
+	ksft_set_plan(17);
 	test_listmount_empty_root();
 	test_statmount_zero_mask();
 	test_statmount_mnt_basic();
@@ -693,6 +922,8 @@ int main(void)
 	test_statmount_string(all_mask, str_off(fs_type), "fs type & all");
 
 	test_listmount_tree();
+	test_statmount_by_fd_unmounted();
+	test_statmount_by_fd();
 
 
 	if (ksft_get_fail_cnt() + ksft_get_error_cnt() > 0)
diff --git a/tools/testing/selftests/filesystems/statmount/statmount_test_ns.c b/tools/testing/selftests/filesystems/statmount/statmount_test_ns.c
index d56d4103182f..063d9de46431 100644
--- a/tools/testing/selftests/filesystems/statmount/statmount_test_ns.c
+++ b/tools/testing/selftests/filesystems/statmount/statmount_test_ns.c
@@ -102,7 +102,7 @@ static int _test_statmount_mnt_ns_id(void)
 	if (!root_id)
 		return NSID_ERROR;
 
-	ret = statmount(root_id, 0, STATMOUNT_MNT_NS_ID, &sm, sizeof(sm), 0);
+	ret = statmount(root_id, 0, 0, STATMOUNT_MNT_NS_ID, &sm, sizeof(sm), 0);
 	if (ret == -1) {
 		ksft_print_msg("statmount mnt ns id: %s\n", strerror(errno));
 		return NSID_ERROR;
@@ -128,6 +128,98 @@ static int _test_statmount_mnt_ns_id(void)
 	return NSID_PASS;
 }
 
+static int _test_statmount_mnt_ns_id_by_fd(void)
+{
+	struct statmount sm;
+	uint64_t mnt_ns_id;
+	int ret, fd, mounted = 1, status = NSID_ERROR;
+	char mnt[] = "/statmount.fd.XXXXXX";
+
+	ret = get_mnt_ns_id("/proc/self/ns/mnt", &mnt_ns_id);
+	if (ret != NSID_PASS)
+		return ret;
+
+	if (!mkdtemp(mnt)) {
+		ksft_print_msg("statmount by fd mnt ns id mkdtemp: %s\n", strerror(errno));
+		return NSID_ERROR;
+	}
+
+	if (mount(mnt, mnt, NULL, MS_BIND, 0)) {
+		ksft_print_msg("statmount by fd mnt ns id mount: %s\n", strerror(errno));
+		status = NSID_ERROR;
+		goto err;
+	}
+
+	fd = open(mnt, O_PATH);
+	if (fd < 0) {
+		ksft_print_msg("statmount by fd mnt ns id open: %s\n", strerror(errno));
+		goto err;
+	}
+
+	ret = statmount(0, 0, fd, STATMOUNT_MNT_NS_ID, &sm, sizeof(sm), STATMOUNT_BY_FD);
+	if (ret == -1) {
+		ksft_print_msg("statmount mnt ns id statmount: %s\n", strerror(errno));
+		status = NSID_ERROR;
+		goto out;
+	}
+
+	if (sm.size != sizeof(sm)) {
+		ksft_print_msg("unexpected size: %u != %u\n", sm.size,
+			       (uint32_t)sizeof(sm));
+		status = NSID_FAIL;
+		goto out;
+	}
+	if (sm.mask != STATMOUNT_MNT_NS_ID) {
+		ksft_print_msg("statmount mnt ns id unavailable\n");
+		status = NSID_SKIP;
+		goto out;
+	}
+
+	if (sm.mnt_ns_id != mnt_ns_id) {
+		ksft_print_msg("unexpected mnt ns ID: 0x%llx != 0x%llx\n",
+			       (unsigned long long)sm.mnt_ns_id,
+			       (unsigned long long)mnt_ns_id);
+		status = NSID_FAIL;
+		goto out;
+	}
+
+	mounted = 0;
+	if (umount2(mnt, MNT_DETACH)) {
+		ksft_print_msg("statmount by fd mnt ns id umount2: %s\n", strerror(errno));
+		goto out;
+	}
+
+	ret = statmount(0, 0, fd, STATMOUNT_MNT_NS_ID, &sm, sizeof(sm), STATMOUNT_BY_FD);
+	if (ret == -1) {
+		ksft_print_msg("statmount mnt ns id statmount: %s\n", strerror(errno));
+		status = NSID_ERROR;
+		goto out;
+	}
+
+	if (sm.size != sizeof(sm)) {
+		ksft_print_msg("unexpected size: %u != %u\n", sm.size,
+			       (uint32_t)sizeof(sm));
+		status = NSID_FAIL;
+		goto out;
+	}
+
+	if (sm.mask == STATMOUNT_MNT_NS_ID) {
+		ksft_print_msg("unexpected STATMOUNT_MNT_NS_ID in mask\n");
+		status = NSID_FAIL;
+		goto out;
+	}
+
+	status = NSID_PASS;
+out:
+	close(fd);
+	if (mounted)
+		umount2(mnt, MNT_DETACH);
+err:
+	rmdir(mnt);
+	return status;
+}
+
+
 static void test_statmount_mnt_ns_id(void)
 {
 	pid_t pid;
@@ -148,6 +240,9 @@ static void test_statmount_mnt_ns_id(void)
 	if (ret != NSID_PASS)
 		exit(ret);
 	ret = _test_statmount_mnt_ns_id();
+	if (ret != NSID_PASS)
+		exit(ret);
+	ret = _test_statmount_mnt_ns_id_by_fd();
 	exit(ret);
 }
 
@@ -179,7 +274,7 @@ static int validate_external_listmount(pid_t pid, uint64_t child_nr_mounts)
 	for (int i = 0; i < nr_mounts; i++) {
 		struct statmount sm;
 
-		ret = statmount(list[i], mnt_ns_id, STATMOUNT_MNT_NS_ID, &sm,
+		ret = statmount(list[i], mnt_ns_id, 0, STATMOUNT_MNT_NS_ID, &sm,
 				sizeof(sm), 0);
 		if (ret < 0) {
 			ksft_print_msg("statmount mnt ns id: %s\n", strerror(errno));
@@ -275,7 +370,7 @@ int main(void)
 	int ret;
 
 	ksft_print_header();
-	ret = statmount(0, 0, 0, NULL, 0, 0);
+	ret = statmount(0, 0, 0, 0, NULL, 0, 0);
 	assert(ret == -1);
 	if (errno == ENOSYS)
 		ksft_exit_skip("statmount() syscall not supported\n");
-- 
cgit v1.2.3


From 6b401a5b2d2acf56ec902f96f6381982457ab339 Mon Sep 17 00:00:00 2001
From: Kaushlendra Kumar <kaushlendra.kumar@intel.com>
Date: Tue, 2 Dec 2025 10:10:12 +0530
Subject: cpupower: idle_monitor: fix incorrect value logged after stop
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The cpuidle sysfs monitor printed the previous sample’s counter
value in cpuidle_stop() instead of the freshly read one. The dprint
line used previous_count[cpu][state] while current_count[cpu][state]
had just been populated. This caused misleading debug output.

Switch the logging to current_count so the post-interval snapshot
matches the displayed value.

Link: https://lore.kernel.org/r/20251202044012.3844790-1-kaushlendra.kumar@intel.com
Signed-off-by: Kaushlendra Kumar <kaushlendra.kumar@intel.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/power/cpupower/utils/idle_monitor/cpuidle_sysfs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/power/cpupower/utils/idle_monitor/cpuidle_sysfs.c b/tools/power/cpupower/utils/idle_monitor/cpuidle_sysfs.c
index 8b42c2f0a5b0..4225eff9833d 100644
--- a/tools/power/cpupower/utils/idle_monitor/cpuidle_sysfs.c
+++ b/tools/power/cpupower/utils/idle_monitor/cpuidle_sysfs.c
@@ -70,7 +70,7 @@ static int cpuidle_stop(void)
 			current_count[cpu][state] =
 				cpuidle_state_time(cpu, state);
 			dprint("CPU %d - State: %d - Val: %llu\n",
-			       cpu, state, previous_count[cpu][state]);
+			       cpu, state, current_count[cpu][state]);
 		}
 	}
 	return 0;
-- 
cgit v1.2.3


From 24858a84163c8d04827166b3bcaed80612bb62fc Mon Sep 17 00:00:00 2001
From: Kaushlendra Kumar <kaushlendra.kumar@intel.com>
Date: Wed, 26 Nov 2025 14:46:13 +0530
Subject: tools/cpupower: Fix inverted APERF capability check

The capability check was inverted, causing the function to return
error when APERF support is available and proceed when it is not.

Negate the condition to return error only when APERF capability
is absent.

Link: https://lore.kernel.org/r/20251126091613.567480-1-kaushlendra.kumar@intel.com
Signed-off-by: Kaushlendra Kumar <kaushlendra.kumar@intel.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/power/cpupower/utils/cpufreq-info.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/power/cpupower/utils/cpufreq-info.c b/tools/power/cpupower/utils/cpufreq-info.c
index 7d3732f5f2f6..5fe01e516817 100644
--- a/tools/power/cpupower/utils/cpufreq-info.c
+++ b/tools/power/cpupower/utils/cpufreq-info.c
@@ -270,7 +270,7 @@ static int get_freq_hardware(unsigned int cpu, unsigned int human)
 {
 	unsigned long freq;
 
-	if (cpupower_cpu_info.caps & CPUPOWER_CAP_APERF)
+	if (!(cpupower_cpu_info.caps & CPUPOWER_CAP_APERF))
 		return -EINVAL;
 
 	freq = cpufreq_get_freq_hardware(cpu);
-- 
cgit v1.2.3


From 1b9aaf36b7b40235e5a529c15848c3d866362207 Mon Sep 17 00:00:00 2001
From: Kaushlendra Kumar <kaushlendra.kumar@intel.com>
Date: Thu, 27 Nov 2025 10:15:36 +0530
Subject: tools/cpupower: Use strcspn() to strip trailing newline

Replace manual newline removal with strcspn() which is safer and
cleaner. This avoids potential out-of-bounds access on empty strings
and handles the case where no newline exists.

Link: https://lore.kernel.org/r/20251127044536.715722-1-kaushlendra.kumar@intel.com
Signed-off-by: Kaushlendra Kumar <kaushlendra.kumar@intel.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/power/cpupower/lib/cpuidle.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/power/cpupower/lib/cpuidle.c b/tools/power/cpupower/lib/cpuidle.c
index f2c1139adf71..6a881d93d2e9 100644
--- a/tools/power/cpupower/lib/cpuidle.c
+++ b/tools/power/cpupower/lib/cpuidle.c
@@ -193,8 +193,7 @@ static char *cpuidle_state_get_one_string(unsigned int cpu,
 	if (result == NULL)
 		return NULL;
 
-	if (result[strlen(result) - 1] == '\n')
-		result[strlen(result) - 1] = '\0';
+	result[strcspn(result, "\n")] = '\0';
 
 	return result;
 }
@@ -366,8 +365,7 @@ static char *sysfs_cpuidle_get_one_string(enum cpuidle_string which)
 	if (result == NULL)
 		return NULL;
 
-	if (result[strlen(result) - 1] == '\n')
-		result[strlen(result) - 1] = '\0';
+	result[strcspn(result, "\n")] = '\0';
 
 	return result;
 }
-- 
cgit v1.2.3


From f9bd3762cf1bd0c2465f2e6121b340883471d1bf Mon Sep 17 00:00:00 2001
From: Kaushlendra Kumar <kaushlendra.kumar@intel.com>
Date: Mon, 1 Dec 2025 17:47:45 +0530
Subject: tools/power cpupower: Reset errno before strtoull()

cpuidle_state_get_one_value() never cleared errno before calling
strtoull(), so a prior ERANGE caused every cpuidle counter read to
return zero. Reset errno to 0 before the conversion so each sysfs read
is evaluated independently.

Link: https://lore.kernel.org/r/20251201121745.3776703-1-kaushlendra.kumar@intel.com
Signed-off-by: Kaushlendra Kumar <kaushlendra.kumar@intel.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/power/cpupower/lib/cpuidle.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'tools')

diff --git a/tools/power/cpupower/lib/cpuidle.c b/tools/power/cpupower/lib/cpuidle.c
index 6a881d93d2e9..2fcb343d8e75 100644
--- a/tools/power/cpupower/lib/cpuidle.c
+++ b/tools/power/cpupower/lib/cpuidle.c
@@ -150,6 +150,7 @@ unsigned long long cpuidle_state_get_one_value(unsigned int cpu,
 	if (len == 0)
 		return 0;
 
+	errno = 0;
 	value = strtoull(linebuf, &endp, 0);
 
 	if (endp == linebuf || errno == ERANGE)
-- 
cgit v1.2.3


From ff72619e11348ab189e232c59515dd5c33780d7c Mon Sep 17 00:00:00 2001
From: Kaushlendra Kumar <kaushlendra.kumar@intel.com>
Date: Tue, 2 Dec 2025 12:24:03 +0530
Subject: tools/power cpupower: Show C0 in idle-info dump

`cpupower idle-info -o` skipped C0 because the loop began at 1:

  before:
    states:
      C1 ... latency[002] residency[00002]
      C2 ... latency[010] residency[00020]
      C3 ... latency[133] residency[00600]

  after:
    states:
      C0 ... latency[000] residency[00000]
      C1 ... latency[002] residency[00002]
      C2 ... latency[010] residency[00020]
      C3 ... latency[133] residency[00600]

Start iterating at index 0 so the idle report mirrors sysfs and
includes C0 stats.

Link: https://lore.kernel.org/r/20251202065403.1492807-1-kaushlendra.kumar@intel.com
Signed-off-by: Kaushlendra Kumar <kaushlendra.kumar@intel.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/power/cpupower/utils/cpuidle-info.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/power/cpupower/utils/cpuidle-info.c b/tools/power/cpupower/utils/cpuidle-info.c
index e0d17f0de3fe..81b4763a97d6 100644
--- a/tools/power/cpupower/utils/cpuidle-info.c
+++ b/tools/power/cpupower/utils/cpuidle-info.c
@@ -111,7 +111,7 @@ static void proc_cpuidle_cpu_output(unsigned int cpu)
 	printf(_("max_cstate:              C%u\n"), cstates-1);
 	printf(_("maximum allowed latency: %lu usec\n"), max_allowed_cstate);
 	printf(_("states:\t\n"));
-	for (cstate = 1; cstate < cstates; cstate++) {
+	for (cstate = 0; cstate < cstates; cstate++) {
 		printf(_("    C%d:                  "
 			 "type[C%d] "), cstate, cstate);
 		printf(_("promotion[--] demotion[--] "));
-- 
cgit v1.2.3


From 0355911ac021a424b18d2d746536d70b879cdeab Mon Sep 17 00:00:00 2001
From: Emil Tsalapatis <emil@etsalapatis.com>
Date: Tue, 16 Dec 2025 12:33:21 -0500
Subject: selftests/bpf: Explicitly account for globals in verifier_arena_large

The big_alloc1 test in verifier_arena_large assumes that the arena base
and the first page allocated by bpf_arena_alloc_pages are identical.
This is not the case, because the first page in the arena is populated
by global arena data. The test still passes because the code makes the
tacit assumption that the first page is on offset PAGE_SIZE instead of
0.

Make this distinction explicit in the code, and adjust the page offsets
requested during the test to count from the beginning of the arena
instead of using the address of the first allocated page.

Signed-off-by: Emil Tsalapatis <emil@etsalapatis.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Reviewed-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/bpf/20251216173325.98465-2-emil@etsalapatis.com
---
 tools/testing/selftests/bpf/progs/verifier_arena_large.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/progs/verifier_arena_large.c b/tools/testing/selftests/bpf/progs/verifier_arena_large.c
index f19e15400b3e..bd430a34c3ab 100644
--- a/tools/testing/selftests/bpf/progs/verifier_arena_large.c
+++ b/tools/testing/selftests/bpf/progs/verifier_arena_large.c
@@ -23,18 +23,25 @@ int big_alloc1(void *ctx)
 {
 #if defined(__BPF_FEATURE_ADDR_SPACE_CAST)
 	volatile char __arena *page1, *page2, *no_page, *page3;
-	void __arena *base;
+	u64 base;
 
-	page1 = base = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0);
+	base = (u64)arena_base(&arena);
+
+	page1 = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0);
 	if (!page1)
 		return 1;
+
+	/* Account for global arena data. */
+	if ((u64)page1 != base + PAGE_SIZE)
+		return 15;
+
 	*page1 = 1;
-	page2 = bpf_arena_alloc_pages(&arena, base + ARENA_SIZE - PAGE_SIZE * 2,
+	page2 = bpf_arena_alloc_pages(&arena, (void __arena *)(ARENA_SIZE - PAGE_SIZE),
 				      1, NUMA_NO_NODE, 0);
 	if (!page2)
 		return 2;
 	*page2 = 2;
-	no_page = bpf_arena_alloc_pages(&arena, base + ARENA_SIZE - PAGE_SIZE,
+	no_page = bpf_arena_alloc_pages(&arena, (void __arena *)ARENA_SIZE,
 					1, NUMA_NO_NODE, 0);
 	if (no_page)
 		return 3;
-- 
cgit v1.2.3


From 12a1fe6e12dbad39f2f0dad1a385625f0298eff4 Mon Sep 17 00:00:00 2001
From: Emil Tsalapatis <emil@etsalapatis.com>
Date: Tue, 16 Dec 2025 12:33:22 -0500
Subject: bpf/verifier: Do not limit maximum direct offset into arena map

The verifier currently limits direct offsets into a map to 512MiB
to avoid overflow during pointer arithmetic. However, this prevents
arena maps from using direct addressing instructions to access data
at the end of > 512MiB arena maps. This is necessary when moving
arena globals to the end of the arena instead of the front.

Refactor the verifier code to remove the offset calculation during
direct value access calculations. This is possible because the only
two map types that implement .map_direct_value_addr() are arrays and
arenas, and they both do their own internal checks to ensure the
offset is within bounds.

Adjust selftests that expect the old error. These tests still fail
because the verifier identifies the access as out of bounds for the
map, so change them to expect an "invalid access to map value pointer"
error instead.

Signed-off-by: Emil Tsalapatis <emil@etsalapatis.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20251216173325.98465-3-emil@etsalapatis.com
---
 kernel/bpf/verifier.c                                      | 5 -----
 tools/testing/selftests/bpf/verifier/direct_value_access.c | 4 ++--
 2 files changed, 2 insertions(+), 7 deletions(-)

(limited to 'tools')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index a31c032b2dd6..d6b8a77fbe3b 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -21132,11 +21132,6 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env)
 			} else {
 				u32 off = insn[1].imm;
 
-				if (off >= BPF_MAX_VAR_OFF) {
-					verbose(env, "direct value offset of %u is not allowed\n", off);
-					return -EINVAL;
-				}
-
 				if (!map->ops->map_direct_value_addr) {
 					verbose(env, "no direct value access support for this map type\n");
 					return -EINVAL;
diff --git a/tools/testing/selftests/bpf/verifier/direct_value_access.c b/tools/testing/selftests/bpf/verifier/direct_value_access.c
index c0648dc009b5..e569d119fb60 100644
--- a/tools/testing/selftests/bpf/verifier/direct_value_access.c
+++ b/tools/testing/selftests/bpf/verifier/direct_value_access.c
@@ -81,7 +81,7 @@
 	},
 	.fixup_map_array_48b = { 1 },
 	.result = REJECT,
-	.errstr = "direct value offset of 4294967295 is not allowed",
+	.errstr = "invalid access to map value pointer, value_size=48 off=4294967295",
 },
 {
 	"direct map access, write test 8",
@@ -141,7 +141,7 @@
 	},
 	.fixup_map_array_48b = { 1 },
 	.result = REJECT,
-	.errstr = "direct value offset of 536870912 is not allowed",
+	.errstr = "invalid access to map value pointer, value_size=48 off=536870912",
 },
 {
 	"direct map access, write test 13",
-- 
cgit v1.2.3


From 0aa721437e4b74d737f58582f1bbf2eea3e038c7 Mon Sep 17 00:00:00 2001
From: Emil Tsalapatis <emil@etsalapatis.com>
Date: Tue, 16 Dec 2025 12:33:23 -0500
Subject: libbpf: Turn relo_core->sym_off unsigned

The symbols' relocation offsets in BPF are stored in an int field,
but cannot actually be negative. When in the next patch libbpf relocates
globals to the end of the arena, it is also possible to have valid
offsets > 2GiB that are used to calculate the final relo offsets.
Avoid accidentally interpreting large offsets as negative by turning
the sym_off field unsigned.

Signed-off-by: Emil Tsalapatis <emil@etsalapatis.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/bpf/20251216173325.98465-4-emil@etsalapatis.com
---
 tools/lib/bpf/libbpf.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index c7c79014d46c..4d4badb64824 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -380,7 +380,7 @@ struct reloc_desc {
 		const struct bpf_core_relo *core_relo; /* used when type == RELO_CORE */
 		struct {
 			int map_idx;
-			int sym_off;
+			unsigned int sym_off;
 			/*
 			 * The following two fields can be unionized, as the
 			 * ext_idx field is used for extern symbols, and the
@@ -763,7 +763,7 @@ struct bpf_object {
 
 	struct {
 		struct bpf_program *prog;
-		int sym_off;
+		unsigned int sym_off;
 		int fd;
 	} *jumptable_maps;
 	size_t jumptable_map_cnt;
@@ -6192,7 +6192,7 @@ static void poison_kfunc_call(struct bpf_program *prog, int relo_idx,
 	insn->imm = POISON_CALL_KFUNC_BASE + ext_idx;
 }
 
-static int find_jt_map(struct bpf_object *obj, struct bpf_program *prog, int sym_off)
+static int find_jt_map(struct bpf_object *obj, struct bpf_program *prog, unsigned int sym_off)
 {
 	size_t i;
 
@@ -6210,7 +6210,7 @@ static int find_jt_map(struct bpf_object *obj, struct bpf_program *prog, int sym
 	return -ENOENT;
 }
 
-static int add_jt_map(struct bpf_object *obj, struct bpf_program *prog, int sym_off, int map_fd)
+static int add_jt_map(struct bpf_object *obj, struct bpf_program *prog, unsigned int sym_off, int map_fd)
 {
 	size_t cnt = obj->jumptable_map_cnt;
 	size_t size = sizeof(obj->jumptable_maps[0]);
@@ -6244,7 +6244,7 @@ static int find_subprog_idx(struct bpf_program *prog, int insn_idx)
 static int create_jt_map(struct bpf_object *obj, struct bpf_program *prog, struct reloc_desc *relo)
 {
 	const __u32 jt_entry_size = 8;
-	int sym_off = relo->sym_off;
+	unsigned int sym_off = relo->sym_off;
 	int jt_size = relo->sym_size;
 	__u32 max_entries = jt_size / jt_entry_size;
 	__u32 value_size = sizeof(struct bpf_insn_array_value);
@@ -6260,7 +6260,7 @@ static int create_jt_map(struct bpf_object *obj, struct bpf_program *prog, struc
 		return map_fd;
 
 	if (sym_off % jt_entry_size) {
-		pr_warn("map '.jumptables': jumptable start %d should be multiple of %u\n",
+		pr_warn("map '.jumptables': jumptable start %u should be multiple of %u\n",
 			sym_off, jt_entry_size);
 		return -EINVAL;
 	}
@@ -6316,7 +6316,7 @@ static int create_jt_map(struct bpf_object *obj, struct bpf_program *prog, struc
 		 * should contain values that fit in u32.
 		 */
 		if (insn_off > UINT32_MAX) {
-			pr_warn("map '.jumptables': invalid jump table value 0x%llx at offset %d\n",
+			pr_warn("map '.jumptables': invalid jump table value 0x%llx at offset %u\n",
 				(long long)jt[i], sym_off + i * jt_entry_size);
 			err = -EINVAL;
 			goto err_close;
-- 
cgit v1.2.3


From c1f61171d44b19834cf24def2cf832f2688e83df Mon Sep 17 00:00:00 2001
From: Emil Tsalapatis <emil@etsalapatis.com>
Date: Tue, 16 Dec 2025 12:33:24 -0500
Subject: libbpf: Move arena globals to the end of the arena

Arena globals are currently placed at the beginning of the arena
by libbpf. This is convenient, but prevents users from reserving
guard pages in the beginning of the arena to identify NULL pointer
dereferences. Adjust the load logic to place the globals at the
end of the arena instead.

Also modify bpftool to set the arena pointer in the program's BPF
skeleton to point to the globals. Users now call bpf_map__initial_value()
to find the beginning of the arena mapping and use the arena pointer
in the skeleton to determine which part of the mapping holds the
arena globals and which part is free.

Suggested-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Emil Tsalapatis <emil@etsalapatis.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/bpf/20251216173325.98465-5-emil@etsalapatis.com
---
 tools/lib/bpf/libbpf.c                                  | 17 +++++++++++++----
 .../testing/selftests/bpf/progs/verifier_arena_large.c  | 12 +++++++++---
 2 files changed, 22 insertions(+), 7 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 4d4badb64824..6fba879492a8 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -757,6 +757,7 @@ struct bpf_object {
 	int arena_map_idx;
 	void *arena_data;
 	size_t arena_data_sz;
+	size_t arena_data_off;
 
 	void *jumptables_data;
 	size_t jumptables_data_sz;
@@ -2991,10 +2992,11 @@ static int init_arena_map_data(struct bpf_object *obj, struct bpf_map *map,
 			       void *data, size_t data_sz)
 {
 	const long page_sz = sysconf(_SC_PAGE_SIZE);
+	const size_t data_alloc_sz = roundup(data_sz, page_sz);
 	size_t mmap_sz;
 
 	mmap_sz = bpf_map_mmap_sz(map);
-	if (roundup(data_sz, page_sz) > mmap_sz) {
+	if (data_alloc_sz > mmap_sz) {
 		pr_warn("elf: sec '%s': declared ARENA map size (%zu) is too small to hold global __arena variables of size %zu\n",
 			sec_name, mmap_sz, data_sz);
 		return -E2BIG;
@@ -3006,6 +3008,9 @@ static int init_arena_map_data(struct bpf_object *obj, struct bpf_map *map,
 	memcpy(obj->arena_data, data, data_sz);
 	obj->arena_data_sz = data_sz;
 
+	/* place globals at the end of the arena */
+	obj->arena_data_off = mmap_sz - data_alloc_sz;
+
 	/* make bpf_map__init_value() work for ARENA maps */
 	map->mmaped = obj->arena_data;
 
@@ -4663,7 +4668,7 @@ static int bpf_program__record_reloc(struct bpf_program *prog,
 		reloc_desc->type = RELO_DATA;
 		reloc_desc->insn_idx = insn_idx;
 		reloc_desc->map_idx = obj->arena_map_idx;
-		reloc_desc->sym_off = sym->st_value;
+		reloc_desc->sym_off = sym->st_value + obj->arena_data_off;
 
 		map = &obj->maps[obj->arena_map_idx];
 		pr_debug("prog '%s': found arena map %d (%s, sec %d, off %zu) for insn %u\n",
@@ -5624,7 +5629,8 @@ retry:
 					return err;
 				}
 				if (obj->arena_data) {
-					memcpy(map->mmaped, obj->arena_data, obj->arena_data_sz);
+					memcpy(map->mmaped + obj->arena_data_off, obj->arena_data,
+						obj->arena_data_sz);
 					zfree(&obj->arena_data);
 				}
 			}
@@ -14429,7 +14435,10 @@ int bpf_object__load_skeleton(struct bpf_object_skeleton *s)
 		if (!map_skel->mmaped)
 			continue;
 
-		*map_skel->mmaped = map->mmaped;
+		if (map->def.type == BPF_MAP_TYPE_ARENA)
+			*map_skel->mmaped = map->mmaped + map->obj->arena_data_off;
+		else
+			*map_skel->mmaped = map->mmaped;
 	}
 
 	return 0;
diff --git a/tools/testing/selftests/bpf/progs/verifier_arena_large.c b/tools/testing/selftests/bpf/progs/verifier_arena_large.c
index bd430a34c3ab..2b8cf2a4d880 100644
--- a/tools/testing/selftests/bpf/progs/verifier_arena_large.c
+++ b/tools/testing/selftests/bpf/progs/verifier_arena_large.c
@@ -31,16 +31,22 @@ int big_alloc1(void *ctx)
 	if (!page1)
 		return 1;
 
-	/* Account for global arena data. */
-	if ((u64)page1 != base + PAGE_SIZE)
+	if ((u64)page1 != base)
 		return 15;
 
 	*page1 = 1;
-	page2 = bpf_arena_alloc_pages(&arena, (void __arena *)(ARENA_SIZE - PAGE_SIZE),
+	page2 = bpf_arena_alloc_pages(&arena, (void __arena *)(ARENA_SIZE - 2 * PAGE_SIZE),
 				      1, NUMA_NO_NODE, 0);
 	if (!page2)
 		return 2;
 	*page2 = 2;
+
+	/* Test for the guard region at the end of the arena. */
+	no_page = bpf_arena_alloc_pages(&arena, (void __arena *)ARENA_SIZE - PAGE_SIZE,
+					1, NUMA_NO_NODE, 0);
+	if (no_page)
+		return 16;
+
 	no_page = bpf_arena_alloc_pages(&arena, (void __arena *)ARENA_SIZE,
 					1, NUMA_NO_NODE, 0);
 	if (no_page)
-- 
cgit v1.2.3


From 19f12431b6c339416e656c794a26ff0ebb2dba56 Mon Sep 17 00:00:00 2001
From: Emil Tsalapatis <emil@etsalapatis.com>
Date: Tue, 16 Dec 2025 12:33:25 -0500
Subject: selftests/bpf: Add tests for the arena offset of globals

Add tests for the new libbpf globals arena offset logic. The
tests cover the case of globals being as large as the arena
itself, and being smaller than the arena. In that case, the
data is placed at the end of the arena, and the beginning
of the arena is free.

Signed-off-by: Emil Tsalapatis <emil@etsalapatis.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20251216173325.98465-6-emil@etsalapatis.com
---
 tools/testing/selftests/bpf/prog_tests/verifier.c  |  4 +
 .../selftests/bpf/progs/verifier_arena_globals1.c  | 87 ++++++++++++++++++++++
 .../selftests/bpf/progs/verifier_arena_globals2.c  | 49 ++++++++++++
 3 files changed, 140 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/progs/verifier_arena_globals1.c
 create mode 100644 tools/testing/selftests/bpf/progs/verifier_arena_globals2.c

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/verifier.c b/tools/testing/selftests/bpf/prog_tests/verifier.c
index 4b4b081b46cc..5829ffd70f8f 100644
--- a/tools/testing/selftests/bpf/prog_tests/verifier.c
+++ b/tools/testing/selftests/bpf/prog_tests/verifier.c
@@ -6,6 +6,8 @@
 #include "verifier_and.skel.h"
 #include "verifier_arena.skel.h"
 #include "verifier_arena_large.skel.h"
+#include "verifier_arena_globals1.skel.h"
+#include "verifier_arena_globals2.skel.h"
 #include "verifier_array_access.skel.h"
 #include "verifier_async_cb_context.skel.h"
 #include "verifier_basic_stack.skel.h"
@@ -147,6 +149,8 @@ static void run_tests_aux(const char *skel_name,
 void test_verifier_and(void)                  { RUN(verifier_and); }
 void test_verifier_arena(void)                { RUN(verifier_arena); }
 void test_verifier_arena_large(void)          { RUN(verifier_arena_large); }
+void test_verifier_arena_globals1(void)       { RUN(verifier_arena_globals1); }
+void test_verifier_arena_globals2(void)       { RUN(verifier_arena_globals2); }
 void test_verifier_basic_stack(void)          { RUN(verifier_basic_stack); }
 void test_verifier_bitfield_write(void)       { RUN(verifier_bitfield_write); }
 void test_verifier_bounds(void)               { RUN(verifier_bounds); }
diff --git a/tools/testing/selftests/bpf/progs/verifier_arena_globals1.c b/tools/testing/selftests/bpf/progs/verifier_arena_globals1.c
new file mode 100644
index 000000000000..14afef3d6442
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/verifier_arena_globals1.c
@@ -0,0 +1,87 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */
+
+#define BPF_NO_KFUNC_PROTOTYPES
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include "bpf_experimental.h"
+#include "bpf_arena_common.h"
+#include "bpf_misc.h"
+
+#define ARENA_PAGES (1UL<< (32 - 12))
+#define GLOBAL_PAGES (16)
+
+struct {
+	__uint(type, BPF_MAP_TYPE_ARENA);
+	__uint(map_flags, BPF_F_MMAPABLE);
+	__uint(max_entries, ARENA_PAGES);
+#ifdef __TARGET_ARCH_arm64
+	__ulong(map_extra, (1ull << 32) | (~0u - __PAGE_SIZE * ARENA_PAGES + 1));
+#else
+	__ulong(map_extra, (1ull << 44) | (~0u - __PAGE_SIZE * ARENA_PAGES + 1));
+#endif
+} arena SEC(".maps");
+
+/*
+ * Global data, to be placed at the end of the arena.
+ */
+volatile char __arena global_data[GLOBAL_PAGES][PAGE_SIZE];
+
+SEC("syscall")
+__success __retval(0)
+int check_reserve1(void *ctx)
+{
+#if defined(__BPF_FEATURE_ADDR_SPACE_CAST)
+	const u8 magic = 0x5a;
+	__u8 __arena *guard, *globals;
+	volatile char __arena *ptr;
+	int i;
+	int ret;
+
+	guard = (void __arena *)arena_base(&arena);
+	globals = (void __arena *)(arena_base(&arena) + (ARENA_PAGES - GLOBAL_PAGES) * PAGE_SIZE);
+
+	/* Reserve the region we've offset the globals by. */
+	ret = bpf_arena_reserve_pages(&arena, guard, ARENA_PAGES - GLOBAL_PAGES);
+	if (ret)
+		return 1;
+
+	/* Make sure the globals are in the expected offset. */
+	ret = bpf_arena_reserve_pages(&arena, globals, 1);
+	if (!ret)
+		return 2;
+
+	/* Verify globals are properly mapped in by libbpf. */
+	for (i = 0; i < GLOBAL_PAGES; i++) {
+		ptr = &global_data[i][PAGE_SIZE / 2];
+
+		*ptr = magic;
+		if (*ptr != magic)
+			return i + 3;
+	}
+#endif
+	return 0;
+}
+
+/*
+ * Relocation check by reading directly into the global data w/o using symbols.
+ */
+SEC("syscall")
+__success __retval(0)
+int check_relocation(void *ctx)
+{
+#if defined(__BPF_FEATURE_ADDR_SPACE_CAST)
+	const u8 magic = 0xfa;
+	u8 __arena *ptr;
+
+	global_data[GLOBAL_PAGES - 1][PAGE_SIZE / 2] = magic;
+	ptr = (u8 __arena *)((u64)(ARENA_PAGES * PAGE_SIZE - PAGE_SIZE / 2));
+	if (*ptr != magic)
+		return 1;
+
+#endif
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/verifier_arena_globals2.c b/tools/testing/selftests/bpf/progs/verifier_arena_globals2.c
new file mode 100644
index 000000000000..e6bd7b61f9f1
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/verifier_arena_globals2.c
@@ -0,0 +1,49 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */
+
+#define BPF_NO_KFUNC_PROTOTYPES
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include "bpf_misc.h"
+#include "bpf_experimental.h"
+#include "bpf_arena_common.h"
+
+#define ARENA_PAGES (32)
+
+struct {
+	__uint(type, BPF_MAP_TYPE_ARENA);
+	__uint(map_flags, BPF_F_MMAPABLE);
+	__uint(max_entries, ARENA_PAGES);
+#ifdef __TARGET_ARCH_arm64
+	__ulong(map_extra, (1ull << 32) | (~0u - __PAGE_SIZE * ARENA_PAGES + 1));
+#else
+	__ulong(map_extra, (1ull << 44) | (~0u - __PAGE_SIZE * ARENA_PAGES + 1));
+#endif
+} arena SEC(".maps");
+
+/*
+ * Fill the entire arena with global data.
+ * The offset into the arena should be 0.
+ */
+char __arena global_data[ARENA_PAGES][PAGE_SIZE];
+
+SEC("syscall")
+__success __retval(0)
+int check_reserve2(void *ctx)
+{
+#if defined(__BPF_FEATURE_ADDR_SPACE_CAST)
+	void __arena *guard;
+	int ret;
+
+	guard = (void __arena *)arena_base(&arena);
+
+	/* Make sure the data at offset 0 case is properly handled. */
+	ret = bpf_arena_reserve_pages(&arena, guard, 1);
+	if (!ret)
+		return 1;
+#endif
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
-- 
cgit v1.2.3


From d311783bc68b011c77a4ef81321de2c94d7deffc Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@kernel.org>
Date: Thu, 11 Dec 2025 19:17:53 -0300
Subject: perf list: Remove unused 'sep' variable

It is just being set to the return of strchr() but never used, just
ditch it and with it get rid of a warning about it not being const on
fedora 44.

Reviewed-by: Ian Rogers <irogers@google.com>
Link: https://lore.kernel.org/r/20251211221756.96294-2-acme@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-list.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/builtin-list.c b/tools/perf/builtin-list.c
index 5cbca0bacd35..ac7bd0e41aa1 100644
--- a/tools/perf/builtin-list.c
+++ b/tools/perf/builtin-list.c
@@ -648,7 +648,7 @@ int cmd_list(int argc, const char **argv)
 	}
 
 	for (i = 0; i < argc; ++i) {
-		char *sep, *s;
+		char *s;
 
 		if (strcmp(argv[i], "tracepoint") == 0) {
 			char *old_pmu_glob = default_ps.pmu_glob;
@@ -720,7 +720,7 @@ int cmd_list(int argc, const char **argv)
 		else if (strcmp(argv[i], "pfm") == 0)
 			print_libpfm_events(&print_cb, ps);
 #endif
-		else if ((sep = strchr(argv[i], ':')) != NULL) {
+		else if (strchr(argv[i], ':') != NULL) {
 			char *old_pmu_glob = ps->pmu_glob;
 			char *old_event_glob = ps->event_glob;
 
-- 
cgit v1.2.3


From f6f41aef53761517391b6192fe5b4bc30b2d717a Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@kernel.org>
Date: Thu, 11 Dec 2025 19:17:54 -0300
Subject: perf diff: Constify strchr() return variables

Newer glibc versions return const char for strchr() when the 's' arg is
const, change the return variable to const to match that.

Also we don't need to turn that ',' into a '\0', as strtol() will stop
in the first invalid char. No need to touch read only memory.

First noticed with fedora 44.

Reviewed-by: Ian Rogers <irogers@google.com>
Link: https://lore.kernel.org/r/20251211221756.96294-3-acme@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-diff.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/builtin-diff.c b/tools/perf/builtin-diff.c
index 53d5ea4a6a4f..59bf1f72d12e 100644
--- a/tools/perf/builtin-diff.c
+++ b/tools/perf/builtin-diff.c
@@ -178,10 +178,9 @@ static struct header_column {
 	}
 };
 
-static int setup_compute_opt_wdiff(char *opt)
+static int setup_compute_opt_wdiff(const char *opt)
 {
-	char *w1_str = opt;
-	char *w2_str;
+	const char *w1_str = opt, *w2_str;
 
 	int ret = -EINVAL;
 
@@ -192,8 +191,7 @@ static int setup_compute_opt_wdiff(char *opt)
 	if (!w2_str)
 		goto out;
 
-	*w2_str++ = 0x0;
-	if (!*w2_str)
+	if (!*++w2_str)
 		goto out;
 
 	compute_wdiff_w1 = strtol(w1_str, NULL, 10);
@@ -214,7 +212,7 @@ static int setup_compute_opt_wdiff(char *opt)
 	return ret;
 }
 
-static int setup_compute_opt(char *opt)
+static int setup_compute_opt(const char *opt)
 {
 	if (compute == COMPUTE_WEIGHTED_DIFF)
 		return setup_compute_opt_wdiff(opt);
@@ -234,7 +232,7 @@ static int setup_compute(const struct option *opt, const char *str,
 	char *cstr = (char *) str;
 	char buf[50];
 	unsigned i;
-	char *option;
+	const char *option;
 
 	if (!str) {
 		*cp = COMPUTE_DELTA;
-- 
cgit v1.2.3


From 45718bce7daf39c618188b70a52644bb5a2f968a Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@kernel.org>
Date: Thu, 11 Dec 2025 19:17:55 -0300
Subject: perf tools: Use const for variables receiving str{str,r?chr}()
 returns

Newer glibc versions return const char for str{str,chr}() where the
haystack/s is const so to avoid warnings like these on fedora 44 change
some variables to const:

  36     8.17 fedora:44                     : FAIL gcc version 15.2.1 20251111 (Red Hat 15.2.1-4) (GCC)
    libbpf.c: In function 'kallsyms_cb':
    libbpf.c:8489:13: error: assignment discards 'const' qualifier from pointer target type [-Werror=discarded-qualifiers]
     8489 |         res = strstr(sym_name, ".llvm.");

Reviewed-by: Ian Rogers <irogers@google.com>
Link: https://lore.kernel.org/r/20251211221756.96294-4-acme@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/jvmti/libjvmti.c     | 2 +-
 tools/perf/tests/parse-events.c | 4 ++--
 tools/perf/util/evlist.c        | 3 ++-
 3 files changed, 5 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/jvmti/libjvmti.c b/tools/perf/jvmti/libjvmti.c
index 82514e6532b8..87bfd4781003 100644
--- a/tools/perf/jvmti/libjvmti.c
+++ b/tools/perf/jvmti/libjvmti.c
@@ -142,7 +142,7 @@ copy_class_filename(const char * class_sign, const char * file_name, char * resu
 	*/
 	if (*class_sign == 'L') {
 		size_t j, i = 0;
-		char *p = strrchr(class_sign, '/');
+		const char *p = strrchr(class_sign, '/');
 		if (p) {
 			/* drop the 'L' prefix and copy up to the final '/' */
 			for (i = 0; i < (size_t)(p - class_sign); i++)
diff --git a/tools/perf/tests/parse-events.c b/tools/perf/tests/parse-events.c
index 128d21dc389f..2bd622972114 100644
--- a/tools/perf/tests/parse-events.c
+++ b/tools/perf/tests/parse-events.c
@@ -2609,8 +2609,8 @@ static int test_events(const struct evlist_test *events, int cnt)
 	for (int i = 0; i < cnt; i++) {
 		struct evlist_test e = events[i];
 		int test_ret;
-		const char *pos = e.name;
-		char buf[1024], *buf_pos = buf, *end;
+		const char *pos = e.name, *end;
+		char buf[1024], *buf_pos = buf;
 
 		while ((end = strstr(pos, "default_core"))) {
 			size_t len = end - pos;
diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
index 03674d2cbd01..649519628541 100644
--- a/tools/perf/util/evlist.c
+++ b/tools/perf/util/evlist.c
@@ -1945,7 +1945,8 @@ out_free:
 
 int evlist__parse_control(const char *str, int *ctl_fd, int *ctl_fd_ack, bool *ctl_fd_close)
 {
-	char *comma = NULL, *endptr = NULL;
+	const char *comma = NULL;
+	char *endptr = NULL;
 
 	*ctl_fd_close = false;
 
-- 
cgit v1.2.3


From c85eff00cf296c146a2b189166eaf85188cd1487 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@kernel.org>
Date: Thu, 11 Dec 2025 19:17:56 -0300
Subject: perf trace: Don't change const char strings

We got away with this so far but now with fedora 44 complaining about
the return value of strchr et all, lets use strdup for good measure.

Reviewed-by: Ian Rogers <irogers@google.com>
Link: https://lore.kernel.org/r/20251211221756.96294-5-acme@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-trace.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
index baee1f695600..d49c1ae409d7 100644
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -5173,8 +5173,8 @@ static int trace__parse_events_option(const struct option *opt, const char *str,
 				      int unset __maybe_unused)
 {
 	struct trace *trace = (struct trace *)opt->value;
-	const char *s = str;
-	char *sep = NULL, *lists[2] = { NULL, NULL, };
+	const char *s;
+	char *strd, *sep = NULL, *lists[2] = { NULL, NULL, };
 	int len = strlen(str) + 1, err = -1, list, idx;
 	char *strace_groups_dir = system_path(STRACE_GROUPS_DIR);
 	char group_name[PATH_MAX];
@@ -5183,6 +5183,10 @@ static int trace__parse_events_option(const struct option *opt, const char *str,
 	if (strace_groups_dir == NULL)
 		return -1;
 
+	s = strd = strdup(str);
+	if (strd == NULL)
+		return -1;
+
 	if (*s == '!') {
 		++s;
 		trace->not_ev_qualifier = true;
@@ -5257,8 +5261,7 @@ out:
 	free(strace_groups_dir);
 	free(lists[0]);
 	free(lists[1]);
-	if (sep)
-		*sep = ',';
+	free(strd);
 
 	return err;
 }
-- 
cgit v1.2.3


From cddfb3611275697b88031c473d656dc27da34480 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Fri, 31 Oct 2025 09:26:37 -0700
Subject: perf vendor power9 nest metrics: Correct unit from MB to MiB

6.1e-5 is very close to 1/16384, where 16384 is 2^14, i.e. a power of
2. When units are in powers of 2 the IEC unit is MiB (mebibytes)
rather than MB (megabytes) where the values are powers of 10.

This patch corrects the unit for uniformity and because such units may
be pattern matched against.

Reviewed-by: Madhavan Srinivasan <maddy@linux.ibm.com>
Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Athira Rajeev <atrajeev@linux.vnet.ibm.com>
Cc: Kajol Jain <kjain@linux.ibm.com>
Link: https://lore.kernel.org/r/20251031162637.1456191-1-irogers@google.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/pmu-events/arch/powerpc/power9/nest_metrics.json | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/pmu-events/arch/powerpc/power9/nest_metrics.json b/tools/perf/pmu-events/arch/powerpc/power9/nest_metrics.json
index 7a5d1bf543f8..8d028a7c2777 100644
--- a/tools/perf/pmu-events/arch/powerpc/power9/nest_metrics.json
+++ b/tools/perf/pmu-events/arch/powerpc/power9/nest_metrics.json
@@ -29,25 +29,25 @@
 	"MetricExpr" : "nest_mcs01_imc@PM_MCS01_128B_RD_DISP_PORT01@ + nest_mcs01_imc@PM_MCS01_128B_RD_DISP_PORT23@",
 	"MetricName" : "mcs01-read",
 	"MetricGroup" : "memory-bandwidth",
-	"ScaleUnit": "6.1e-5MB"
+	"ScaleUnit": "6.1e-5MiB"
     },
     {
 	"MetricExpr" : "nest_mcs23_imc@PM_MCS23_128B_RD_DISP_PORT01@ + nest_mcs23_imc@PM_MCS23_128B_RD_DISP_PORT23@",
 	"MetricName" : "mcs23-read",
 	"MetricGroup" : "memory-bandwidth",
-	"ScaleUnit": "6.1e-5MB"
+	"ScaleUnit": "6.1e-5MiB"
     },
     {
 	"MetricExpr" : "nest_mcs01_imc@PM_MCS01_128B_WR_DISP_PORT01@ + nest_mcs01_imc@PM_MCS01_128B_WR_DISP_PORT23@",
 	"MetricName" : "mcs01-write",
 	"MetricGroup" : "memory-bandwidth",
-	"ScaleUnit": "6.1e-5MB"
+	"ScaleUnit": "6.1e-5MiB"
     },
     {
 	"MetricExpr" : "nest_mcs23_imc@PM_MCS23_128B_WR_DISP_PORT01@ + nest_mcs23_imc@PM_MCS23_128B_WR_DISP_PORT23@",
 	"MetricName" : "mcs23-write",
 	"MetricGroup" : "memory-bandwidth",
-	"ScaleUnit": "6.1e-5MB"
+	"ScaleUnit": "6.1e-5MiB"
     },
     {
 	"MetricExpr" : "nest_powerbus0_imc@PM_PB_CYC@",
-- 
cgit v1.2.3


From bdd051e249141c793dec28544e7f5d5bc7690bf3 Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@kernel.org>
Date: Tue, 9 Dec 2025 18:33:24 -0800
Subject: perf record: Split --data-mmap option

Currently -d/--data option controls both PERF_SAMPLE_ADDR bit and
perf_event_attr.mmap_data flag.  Separate them using new --data-mmap
option to support recording only one of them.

For data-type profiling, data MMAP is unnecessary but it wastes a lot
of space in the ring buffer and data file.

Committer testing:

On an idle system:

  root@x1:~# perf record -d -a sleep 1
  [ perf record: Woken up 1 times to write data ]
  [ perf record: Captured and wrote 5.672 MB perf.data (1075 samples) ]
  root@x1:~# ls -la perf.data
  -rw-------. 1 root root 5982480 Dec 16 15:34 perf.data
  root@x1:~# perf evlist -v
  cpu_atom/cycles/P: type: 0 (PERF_TYPE_HARDWARE), size: 144, config: 0xa00000000 (cpu_atom/PERF_COUNT_HW_CPU_CYCLES/), { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|ADDR|CPU|PERIOD|IDENTIFIER|DATA_SRC, read_format: ID|LOST, disabled: 1, freq: 1, precise_ip: 3, sample_id_all: 1
  cpu_core/cycles/P: type: 0 (PERF_TYPE_HARDWARE), size: 144, config: 0x400000000 (cpu_core/PERF_COUNT_HW_CPU_CYCLES/), { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|ADDR|CPU|PERIOD|IDENTIFIER|DATA_SRC, read_format: ID|LOST, disabled: 1, freq: 1, precise_ip: 3, sample_id_all: 1
  dummy:u: type: 1 (PERF_TYPE_SOFTWARE), size: 144, config: 0x9 (PERF_COUNT_SW_DUMMY), { sample_period, sample_freq }: 1, sample_type: IP|TID|TIME|ADDR|CPU|IDENTIFIER|DATA_SRC, read_format: ID|LOST, exclude_kernel: 1, exclude_hv: 1, mmap: 1, comm: 1, task: 1, mmap_data: 1, sample_id_all: 1, exclude_guest: 1, mmap2: 1, comm_exec: 1, ksymbol: 1, bpf_event: 1, build_id: 1
  root@x1:~#

Now with just --data-mmap we will not save that much, as only DATA_SRC
will not be enabled in sample_type:

  root@x1:~# perf record --data-mmap -a sleep 1
  [ perf record: Woken up 1 times to write data ]
  [ perf record: Captured and wrote 5.576 MB perf.data (716 samples) ]
  root@x1:~# ls -la perf.data
  -rw-------. 1 root root 5880112 Dec 16 15:37 perf.data
  root@x1:~# perf evlist -v
  cpu_atom/cycles/P: type: 0 (PERF_TYPE_HARDWARE), size: 144, config: 0xa00000000 (cpu_atom/PERF_COUNT_HW_CPU_CYCLES/), { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|CPU|PERIOD|IDENTIFIER, read_format: ID|LOST, disabled: 1, freq: 1, precise_ip: 3, sample_id_all: 1
  cpu_core/cycles/P: type: 0 (PERF_TYPE_HARDWARE), size: 144, config: 0x400000000 (cpu_core/PERF_COUNT_HW_CPU_CYCLES/), { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|CPU|PERIOD|IDENTIFIER, read_format: ID|LOST, disabled: 1, freq: 1, precise_ip: 3, sample_id_all: 1
  dummy:u: type: 1 (PERF_TYPE_SOFTWARE), size: 144, config: 0x9 (PERF_COUNT_SW_DUMMY), { sample_period, sample_freq }: 1, sample_type: IP|TID|TIME|CPU|IDENTIFIER, read_format: ID|LOST, exclude_kernel: 1, exclude_hv: 1, mmap: 1, comm: 1, task: 1, mmap_data: 1, sample_id_all: 1, exclude_guest: 1, mmap2: 1, comm_exec: 1, ksymbol: 1, bpf_event: 1, build_id: 1
  root@x1:~#

To complete, just with DATA_SRC, no mmap_data:

  root@x1:~# perf record --sample-mem-info -a sleep 1
  [ perf record: Woken up 1 times to write data ]
  [ perf record: Captured and wrote 1.407 MB perf.data (1311 samples) ]
  root@x1:~# ls -la perf.data
  -rw-------. 1 root root 1509224 Dec 16 15:40 perf.data
  root@x1:~# perf evlist -v
  cpu_atom/cycles/P: type: 0 (PERF_TYPE_HARDWARE), size: 144, config: 0xa00000000 (cpu_atom/PERF_COUNT_HW_CPU_CYCLES/), { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|CPU|PERIOD|IDENTIFIER|DATA_SRC, read_format: ID|LOST, disabled: 1, freq: 1, precise_ip: 3, sample_id_all: 1
  cpu_core/cycles/P: type: 0 (PERF_TYPE_HARDWARE), size: 144, config: 0x400000000 (cpu_core/PERF_COUNT_HW_CPU_CYCLES/), { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|CPU|PERIOD|IDENTIFIER|DATA_SRC, read_format: ID|LOST, disabled: 1, freq: 1, precise_ip: 3, sample_id_all: 1
  dummy:u: type: 1 (PERF_TYPE_SOFTWARE), size: 144, config: 0x9 (PERF_COUNT_SW_DUMMY), { sample_period, sample_freq }: 1, sample_type: IP|TID|TIME|CPU|IDENTIFIER|DATA_SRC, read_format: ID|LOST, exclude_kernel: 1, exclude_hv: 1, mmap: 1, comm: 1, task: 1, sample_id_all: 1, exclude_guest: 1, mmap2: 1, comm_exec: 1, ksymbol: 1, bpf_event: 1, build_id: 1
  root@x1:~#

Reviewed-by: Ian Rogers <irogers@google.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/Documentation/perf-record.txt |  8 +++++++-
 tools/perf/builtin-record.c              | 19 +++++++++++++------
 tools/perf/util/evsel.c                  |  5 +++--
 tools/perf/util/record.h                 |  2 ++
 4 files changed, 25 insertions(+), 9 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt
index e8b9aadbbfa5..c402e74172f6 100644
--- a/tools/perf/Documentation/perf-record.txt
+++ b/tools/perf/Documentation/perf-record.txt
@@ -344,7 +344,8 @@ OPTIONS
 
 -d::
 --data::
-	Record the sample virtual addresses.  Implies --sample-mem-info.
+	Record the sample virtual addresses.  Implies --sample-mem-info and
+	--data-mmap.
 
 --phys-data::
 	Record the sample physical addresses.
@@ -861,6 +862,11 @@ filtered through the mask provided by -C option.
 	Prepare BPF filter to be used by regular users.  The action should be
 	either "pin" or "unpin".  The filter can be used after it's pinned.
 
+--data-mmap::
+	Enable recording MMAP events for non-executable mappings.  Basically
+	perf only records executable mappings but data mmaping can be useful
+	when you analyze data access with sample addresses.  So using -d option
+	would enable this unless you specify --no-data-mmap manually.
 
 include::intel-hybrid.txt[]
 
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 2584d0d8bc82..cbfbd9bb1063 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -1881,7 +1881,7 @@ static int record__synthesize_workload(struct record *rec, bool tail)
 						 process_synthesized_event,
 						 &rec->session->machines.host,
 						 needs_mmap,
-						 rec->opts.sample_address);
+						 rec->opts.record_data_mmap);
 	perf_thread_map__put(thread_map);
 	return err;
 }
@@ -2191,7 +2191,7 @@ static int record__synthesize(struct record *rec, bool tail)
 
 		err = __machine__synthesize_threads(machine, tool, &opts->target,
 						    rec->evlist->core.threads,
-						    f, needs_mmap, opts->sample_address,
+						    f, needs_mmap, opts->record_data_mmap,
 						    rec->opts.nr_threads_synthesize);
 	}
 
@@ -3006,8 +3006,9 @@ int record_opts__parse_callchain(struct record_opts *record,
 	ret = parse_callchain_record_opt(arg, callchain);
 	if (!ret) {
 		/* Enable data address sampling for DWARF unwind. */
-		if (callchain->record_mode == CALLCHAIN_DWARF)
-			record->sample_address = true;
+		if (callchain->record_mode == CALLCHAIN_DWARF &&
+		    !record->record_data_mmap_set)
+			record->record_data_mmap = true;
 		callchain_debug(callchain);
 	}
 
@@ -3686,6 +3687,9 @@ static struct option __record_options[] = {
 	OPT_CALLBACK(0, "off-cpu-thresh", &record.opts, "ms",
 		     "Dump off-cpu samples if off-cpu time exceeds this threshold (in milliseconds). (Default: 500ms)",
 		     record__parse_off_cpu_thresh),
+	OPT_BOOLEAN_SET(0, "data-mmap", &record.opts.record_data_mmap,
+			&record.opts.record_data_mmap_set,
+			"Record mmap events for non-executable mappings"),
 	OPT_END()
 };
 
@@ -4249,9 +4253,12 @@ int cmd_record(int argc, const char **argv)
 		goto out_opts;
 	}
 
-	/* For backward compatibility, -d implies --mem-info */
-	if (rec->opts.sample_address)
+	/* For backward compatibility, -d implies --mem-info and --data-mmap */
+	if (rec->opts.sample_address) {
 		rec->opts.sample_data_src = true;
+		if (!rec->opts.record_data_mmap_set)
+			rec->opts.record_data_mmap = true;
+	}
 
 	/*
 	 * Allow aliases to facilitate the lookup of symbols for address
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index 9cd706f62793..ec6552a6f667 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -1445,10 +1445,11 @@ void evsel__config(struct evsel *evsel, struct record_opts *opts,
 		attr->inherit_stat = 1;
 	}
 
-	if (opts->sample_address) {
+	if (opts->sample_address)
 		evsel__set_sample_bit(evsel, ADDR);
+
+	if (opts->record_data_mmap)
 		attr->mmap_data = track;
-	}
 
 	/*
 	 * We don't allow user space callchains for  function trace
diff --git a/tools/perf/util/record.h b/tools/perf/util/record.h
index ea3a6c4657ee..93627c9a7338 100644
--- a/tools/perf/util/record.h
+++ b/tools/perf/util/record.h
@@ -40,6 +40,8 @@ struct record_opts {
 	bool	      record_cgroup;
 	bool	      record_switch_events;
 	bool	      record_switch_events_set;
+	bool	      record_data_mmap;
+	bool	      record_data_mmap_set;
 	bool	      all_kernel;
 	bool	      all_user;
 	bool	      kernel_callchains;
-- 
cgit v1.2.3


From 9cdc9738d169f82bd4abb638b2ac8690bdee5522 Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@kernel.org>
Date: Tue, 9 Dec 2025 18:33:25 -0800
Subject: perf report: Enable data-type profiling with -F option too

It checked -s/--sort options only.  As the sort keys can be setup using
the -F/--fields option as well, it should enable data-type profiling
with it too.

The following two commands should have the same output.

  $ perf report -s type

  $ perf report -F overhead,type

But there's another problem on this.  I'll handle it in the next commit.

Reviewed-by: Ian Rogers <irogers@google.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-report.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index add6b1c2aaf0..6c2b4f93ec78 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -1727,7 +1727,8 @@ repeat:
 			sort_order = NULL;
 	}
 
-	if (sort_order && strstr(sort_order, "type")) {
+	if ((sort_order && strstr(sort_order, "type")) ||
+	    (field_order && strstr(field_order, "type"))) {
 		report.data_type = true;
 		annotate_opts.annotate_src = false;
 
-- 
cgit v1.2.3


From 5d35d829bb0b19ee51be9732e3b5f81abc7ef3bb Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@kernel.org>
Date: Tue, 9 Dec 2025 18:33:26 -0800
Subject: perf report: Fix histogram entry collapsing for -F option

Users can use -F/--fields option to set output fields and sort keys
together.

But it missed to set perf_hpp_list->need_collapse for sort entries that
have se_collapse callbacks.

So it ends up with having duplicated entries separately.

For example, let's run this command first.

  $ perf mem record -t load -U -- perf test -w datasym

This will record samples for memory access (load) to struct 'buf' and a
loop condition ('sig_atomic_t') types.

So the following two commands should have identical output.

  $ perf report -s type --stdio --percent-limit=1 -q
      87.80%  perf                  buf
      12.17%  perf                  sig_atomic_t

But using -F option didn't collapse the entries based on types so the
result looked like below:

  $ perf report -F overhead,type --stdio --percent-limit=1 -q
      23.31%  perf                  buf
      22.84%  perf                  buf
      21.26%  perf                  buf
      20.39%  perf                  buf
      12.17%  perf                  sig_atomic_t

Reviewed-by: Ian Rogers <irogers@google.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/sort.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'tools')

diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c
index f3a565b0e230..3d4b68fd6e44 100644
--- a/tools/perf/util/sort.c
+++ b/tools/perf/util/sort.c
@@ -3585,6 +3585,9 @@ static int __sort_dimension__add_output(struct perf_hpp_list *list,
 	if (__sort_dimension__add_hpp_output(sd, list, level) < 0)
 		return -1;
 
+	if (sd->entry->se_collapse)
+		list->need_collapse = 1;
+
 	sd->taken = 1;
 	return 0;
 }
-- 
cgit v1.2.3


From cbd41c6d4c26c161a2b0e70ad411d3885ff13507 Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@kernel.org>
Date: Tue, 9 Dec 2025 18:33:27 -0800
Subject: perf report: Update sort key state from -F option

Factor out __sort_dimension__update() so that it can be called from -s
and -F option parsing logics.  Otherwise the following command cannot go
into the annotation mode.

  $ perf report -F overhead,type,sym

  Warning: Annotation is only available for symbolic views, include "sym*" in --sort to use it.

Reviewed-by: Ian Rogers <irogers@google.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/sort.c | 100 ++++++++++++++++++++++++++-----------------------
 1 file changed, 54 insertions(+), 46 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c
index 3d4b68fd6e44..f963d61ac166 100644
--- a/tools/perf/util/sort.c
+++ b/tools/perf/util/sort.c
@@ -3538,6 +3538,56 @@ out:
 	return ret;
 }
 
+static int __sort_dimension__update(struct sort_dimension *sd,
+				    struct perf_hpp_list *list)
+{
+	if (sd->entry == &sort_parent && parent_pattern) {
+		int ret = regcomp(&parent_regex, parent_pattern, REG_EXTENDED);
+		if (ret) {
+			char err[BUFSIZ];
+
+			regerror(ret, &parent_regex, err, sizeof(err));
+			pr_err("Invalid regex: %s\n%s", parent_pattern, err);
+			return -EINVAL;
+		}
+		list->parent = 1;
+	} else if (sd->entry == &sort_sym) {
+		list->sym = 1;
+		/*
+		 * perf diff displays the performance difference amongst
+		 * two or more perf.data files. Those files could come
+		 * from different binaries. So we should not compare
+		 * their ips, but the name of symbol.
+		 */
+		if (sort__mode == SORT_MODE__DIFF)
+			sd->entry->se_collapse = sort__sym_sort;
+
+	} else if (sd->entry == &sort_sym_offset) {
+		list->sym = 1;
+	} else if (sd->entry == &sort_dso) {
+		list->dso = 1;
+	} else if (sd->entry == &sort_socket) {
+		list->socket = 1;
+	} else if (sd->entry == &sort_thread) {
+		list->thread = 1;
+	} else if (sd->entry == &sort_comm) {
+		list->comm = 1;
+	} else if (sd->entry == &sort_type_offset) {
+		symbol_conf.annotate_data_member = true;
+	} else if (sd->entry == &sort_sym_from || sd->entry == &sort_sym_to) {
+		list->sym = 1;
+	} else if (sd->entry == &sort_mem_dcacheline && cacheline_size() == 0) {
+		return -EINVAL;
+	} else if (sd->entry == &sort_mem_daddr_sym) {
+		list->sym = 1;
+	}
+
+	if (sd->entry->se_collapse)
+		list->need_collapse = 1;
+
+	return 0;
+}
+
 static int __sort_dimension__add(struct sort_dimension *sd,
 				 struct perf_hpp_list *list,
 				 int level)
@@ -3548,8 +3598,8 @@ static int __sort_dimension__add(struct sort_dimension *sd,
 	if (__sort_dimension__add_hpp_sort(sd, list, level) < 0)
 		return -1;
 
-	if (sd->entry->se_collapse)
-		list->need_collapse = 1;
+	if (__sort_dimension__update(sd, list) < 0)
+		return -1;
 
 	sd->taken = 1;
 
@@ -3585,8 +3635,8 @@ static int __sort_dimension__add_output(struct perf_hpp_list *list,
 	if (__sort_dimension__add_hpp_output(sd, list, level) < 0)
 		return -1;
 
-	if (sd->entry->se_collapse)
-		list->need_collapse = 1;
+	if (__sort_dimension__update(sd, list) < 0)
+		return -1;
 
 	sd->taken = 1;
 	return 0;
@@ -3651,39 +3701,6 @@ int sort_dimension__add(struct perf_hpp_list *list, const char *tok,
 				sort_dimension_add_dynamic_header(sd, env);
 		}
 
-		if (sd->entry == &sort_parent && parent_pattern) {
-			int ret = regcomp(&parent_regex, parent_pattern, REG_EXTENDED);
-			if (ret) {
-				char err[BUFSIZ];
-
-				regerror(ret, &parent_regex, err, sizeof(err));
-				pr_err("Invalid regex: %s\n%s", parent_pattern, err);
-				return -EINVAL;
-			}
-			list->parent = 1;
-		} else if (sd->entry == &sort_sym) {
-			list->sym = 1;
-			/*
-			 * perf diff displays the performance difference amongst
-			 * two or more perf.data files. Those files could come
-			 * from different binaries. So we should not compare
-			 * their ips, but the name of symbol.
-			 */
-			if (sort__mode == SORT_MODE__DIFF)
-				sd->entry->se_collapse = sort__sym_sort;
-
-		} else if (sd->entry == &sort_dso) {
-			list->dso = 1;
-		} else if (sd->entry == &sort_socket) {
-			list->socket = 1;
-		} else if (sd->entry == &sort_thread) {
-			list->thread = 1;
-		} else if (sd->entry == &sort_comm) {
-			list->comm = 1;
-		} else if (sd->entry == &sort_type_offset) {
-			symbol_conf.annotate_data_member = true;
-		}
-
 		return __sort_dimension__add(sd, list, level);
 	}
 
@@ -3702,9 +3719,6 @@ int sort_dimension__add(struct perf_hpp_list *list, const char *tok,
 				    strlen(tok)))
 			return -EINVAL;
 
-		if (sd->entry == &sort_sym_from || sd->entry == &sort_sym_to)
-			list->sym = 1;
-
 		__sort_dimension__add(sd, list, level);
 		return 0;
 	}
@@ -3718,12 +3732,6 @@ int sort_dimension__add(struct perf_hpp_list *list, const char *tok,
 		if (sort__mode != SORT_MODE__MEMORY)
 			return -EINVAL;
 
-		if (sd->entry == &sort_mem_dcacheline && cacheline_size() == 0)
-			return -EINVAL;
-
-		if (sd->entry == &sort_mem_daddr_sym)
-			list->sym = 1;
-
 		__sort_dimension__add(sd, list, level);
 		return 0;
 	}
-- 
cgit v1.2.3


From a05385d84b2af64600fc84b027bea481e8f6261d Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Fri, 5 Dec 2025 16:16:44 -0800
Subject: perf/x86/core: Register a new vector for handling mediated guest PMIs

Wire up system vector 0xf5 for handling PMIs (i.e. interrupts delivered
through the LVTPC) while running KVM guests with a mediated PMU.  Perf
currently delivers all PMIs as NMIs, e.g. so that events that trigger while
IRQs are disabled aren't delayed and generate useless records, but due to
the multiplexing of NMIs throughout the system, correctly identifying NMIs
for a mediated PMU is practically infeasible.

To (greatly) simplify identifying guest mediated PMU PMIs, perf will
switch the CPU's LVTPC between PERF_GUEST_MEDIATED_PMI_VECTOR and NMI when
guest PMU context is loaded/put.  I.e. PMIs that are generated by the CPU
while the guest is active will be identified purely based on the IRQ
vector.

Route the vector through perf, e.g. as opposed to letting KVM attach a
handler directly a la posted interrupt notification vectors, as perf owns
the LVTPC and thus is the rightful owner of PERF_GUEST_MEDIATED_PMI_VECTOR.
Functionally, having KVM directly own the vector would be fine (both KVM
and perf will be completely aware of when a mediated PMU is active), but
would lead to an undesirable split in ownership: perf would be responsible
for installing the vector, but not handling the resulting IRQs.

Add a new perf_guest_info_callbacks hook (and static call) to allow KVM to
register its handler with perf when running guests with mediated PMUs.

Note, because KVM always runs guests with host IRQs enabled, there is no
danger of a PMI being delayed from the guest's perspective due to using a
regular IRQ instead of an NMI.

Signed-off-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Xudong Hao <xudong.hao@intel.com>
Link: https://patch.msgid.link/20251206001720.468579-9-seanjc@google.com
---
 arch/x86/entry/entry_fred.c                           |  1 +
 arch/x86/include/asm/hardirq.h                        |  3 +++
 arch/x86/include/asm/idtentry.h                       |  6 ++++++
 arch/x86/include/asm/irq_vectors.h                    |  4 +++-
 arch/x86/kernel/idt.c                                 |  3 +++
 arch/x86/kernel/irq.c                                 | 19 +++++++++++++++++++
 include/linux/perf_event.h                            |  8 ++++++++
 kernel/events/core.c                                  |  9 +++++++--
 .../trace/beauty/arch/x86/include/asm/irq_vectors.h   |  3 ++-
 virt/kvm/kvm_main.c                                   |  3 +++
 10 files changed, 55 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/arch/x86/entry/entry_fred.c b/arch/x86/entry/entry_fred.c
index 94e626cc6a07..a9b72997103d 100644
--- a/arch/x86/entry/entry_fred.c
+++ b/arch/x86/entry/entry_fred.c
@@ -114,6 +114,7 @@ static idtentry_t sysvec_table[NR_SYSTEM_VECTORS] __ro_after_init = {
 
 	SYSVEC(IRQ_WORK_VECTOR,			irq_work),
 
+	SYSVEC(PERF_GUEST_MEDIATED_PMI_VECTOR,	perf_guest_mediated_pmi_handler),
 	SYSVEC(POSTED_INTR_VECTOR,		kvm_posted_intr_ipi),
 	SYSVEC(POSTED_INTR_WAKEUP_VECTOR,	kvm_posted_intr_wakeup_ipi),
 	SYSVEC(POSTED_INTR_NESTED_VECTOR,	kvm_posted_intr_nested_ipi),
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 6b6d472baa0b..9314642ae93c 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -18,6 +18,9 @@ typedef struct {
 	unsigned int kvm_posted_intr_ipis;
 	unsigned int kvm_posted_intr_wakeup_ipis;
 	unsigned int kvm_posted_intr_nested_ipis;
+#endif
+#ifdef CONFIG_GUEST_PERF_EVENTS
+	unsigned int perf_guest_mediated_pmis;
 #endif
 	unsigned int x86_platform_ipis;	/* arch dependent */
 	unsigned int apic_perf_irqs;
diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h
index 3218770670d3..42bf6a58ec36 100644
--- a/arch/x86/include/asm/idtentry.h
+++ b/arch/x86/include/asm/idtentry.h
@@ -746,6 +746,12 @@ DECLARE_IDTENTRY_SYSVEC(POSTED_INTR_NESTED_VECTOR,	sysvec_kvm_posted_intr_nested
 # define fred_sysvec_kvm_posted_intr_nested_ipi		NULL
 #endif
 
+# ifdef CONFIG_GUEST_PERF_EVENTS
+DECLARE_IDTENTRY_SYSVEC(PERF_GUEST_MEDIATED_PMI_VECTOR,	sysvec_perf_guest_mediated_pmi_handler);
+#else
+# define fred_sysvec_perf_guest_mediated_pmi_handler	NULL
+#endif
+
 # ifdef CONFIG_X86_POSTED_MSI
 DECLARE_IDTENTRY_SYSVEC(POSTED_MSI_NOTIFICATION_VECTOR,	sysvec_posted_msi_notification);
 #else
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 47051871b436..85253fc8e384 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -77,7 +77,9 @@
  */
 #define IRQ_WORK_VECTOR			0xf6
 
-/* 0xf5 - unused, was UV_BAU_MESSAGE */
+/* IRQ vector for PMIs when running a guest with a mediated PMU. */
+#define PERF_GUEST_MEDIATED_PMI_VECTOR	0xf5
+
 #define DEFERRED_ERROR_VECTOR		0xf4
 
 /* Vector on which hypervisor callbacks will be delivered */
diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c
index f445bec516a0..260456588756 100644
--- a/arch/x86/kernel/idt.c
+++ b/arch/x86/kernel/idt.c
@@ -158,6 +158,9 @@ static const __initconst struct idt_data apic_idts[] = {
 	INTG(POSTED_INTR_WAKEUP_VECTOR,		asm_sysvec_kvm_posted_intr_wakeup_ipi),
 	INTG(POSTED_INTR_NESTED_VECTOR,		asm_sysvec_kvm_posted_intr_nested_ipi),
 # endif
+#ifdef CONFIG_GUEST_PERF_EVENTS
+	INTG(PERF_GUEST_MEDIATED_PMI_VECTOR,	asm_sysvec_perf_guest_mediated_pmi_handler),
+#endif
 # ifdef CONFIG_IRQ_WORK
 	INTG(IRQ_WORK_VECTOR,			asm_sysvec_irq_work),
 # endif
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 86f4e574de02..d56185b49a0e 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -192,6 +192,13 @@ int arch_show_interrupts(struct seq_file *p, int prec)
 			   irq_stats(j)->kvm_posted_intr_wakeup_ipis);
 	seq_puts(p, "  Posted-interrupt wakeup event\n");
 #endif
+#ifdef CONFIG_GUEST_PERF_EVENTS
+	seq_printf(p, "%*s: ", prec, "VPMI");
+	for_each_online_cpu(j)
+		seq_printf(p, "%10u ",
+			   irq_stats(j)->perf_guest_mediated_pmis);
+	seq_puts(p, " Perf Guest Mediated PMI\n");
+#endif
 #ifdef CONFIG_X86_POSTED_MSI
 	seq_printf(p, "%*s: ", prec, "PMN");
 	for_each_online_cpu(j)
@@ -349,6 +356,18 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_x86_platform_ipi)
 }
 #endif
 
+#ifdef CONFIG_GUEST_PERF_EVENTS
+/*
+ * Handler for PERF_GUEST_MEDIATED_PMI_VECTOR.
+ */
+DEFINE_IDTENTRY_SYSVEC(sysvec_perf_guest_mediated_pmi_handler)
+{
+	 apic_eoi();
+	 inc_irq_stat(perf_guest_mediated_pmis);
+	 perf_guest_handle_mediated_pmi();
+}
+#endif
+
 #if IS_ENABLED(CONFIG_KVM)
 static void dummy_handler(void) {}
 static void (*kvm_posted_intr_wakeup_handler)(void) = dummy_handler;
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 322cfa9f3d48..82e617fad165 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1677,6 +1677,8 @@ struct perf_guest_info_callbacks {
 	unsigned int			(*state)(void);
 	unsigned long			(*get_ip)(void);
 	unsigned int			(*handle_intel_pt_intr)(void);
+
+	void				(*handle_mediated_pmi)(void);
 };
 
 #ifdef CONFIG_GUEST_PERF_EVENTS
@@ -1686,6 +1688,7 @@ extern struct perf_guest_info_callbacks __rcu *perf_guest_cbs;
 DECLARE_STATIC_CALL(__perf_guest_state, *perf_guest_cbs->state);
 DECLARE_STATIC_CALL(__perf_guest_get_ip, *perf_guest_cbs->get_ip);
 DECLARE_STATIC_CALL(__perf_guest_handle_intel_pt_intr, *perf_guest_cbs->handle_intel_pt_intr);
+DECLARE_STATIC_CALL(__perf_guest_handle_mediated_pmi, *perf_guest_cbs->handle_mediated_pmi);
 
 static inline unsigned int perf_guest_state(void)
 {
@@ -1702,6 +1705,11 @@ static inline unsigned int perf_guest_handle_intel_pt_intr(void)
 	return static_call(__perf_guest_handle_intel_pt_intr)();
 }
 
+static inline void perf_guest_handle_mediated_pmi(void)
+{
+	static_call(__perf_guest_handle_mediated_pmi)();
+}
+
 extern void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs);
 extern void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs);
 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index bbb81a4a3196..dd842a4ca789 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7644,6 +7644,7 @@ struct perf_guest_info_callbacks __rcu *perf_guest_cbs;
 DEFINE_STATIC_CALL_RET0(__perf_guest_state, *perf_guest_cbs->state);
 DEFINE_STATIC_CALL_RET0(__perf_guest_get_ip, *perf_guest_cbs->get_ip);
 DEFINE_STATIC_CALL_RET0(__perf_guest_handle_intel_pt_intr, *perf_guest_cbs->handle_intel_pt_intr);
+DEFINE_STATIC_CALL_RET0(__perf_guest_handle_mediated_pmi, *perf_guest_cbs->handle_mediated_pmi);
 
 void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
 {
@@ -7658,6 +7659,10 @@ void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
 	if (cbs->handle_intel_pt_intr)
 		static_call_update(__perf_guest_handle_intel_pt_intr,
 				   cbs->handle_intel_pt_intr);
+
+	if (cbs->handle_mediated_pmi)
+		static_call_update(__perf_guest_handle_mediated_pmi,
+				   cbs->handle_mediated_pmi);
 }
 EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
 
@@ -7669,8 +7674,8 @@ void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
 	rcu_assign_pointer(perf_guest_cbs, NULL);
 	static_call_update(__perf_guest_state, (void *)&__static_call_return0);
 	static_call_update(__perf_guest_get_ip, (void *)&__static_call_return0);
-	static_call_update(__perf_guest_handle_intel_pt_intr,
-			   (void *)&__static_call_return0);
+	static_call_update(__perf_guest_handle_intel_pt_intr, (void *)&__static_call_return0);
+	static_call_update(__perf_guest_handle_mediated_pmi, (void *)&__static_call_return0);
 	synchronize_rcu();
 }
 EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
diff --git a/tools/perf/trace/beauty/arch/x86/include/asm/irq_vectors.h b/tools/perf/trace/beauty/arch/x86/include/asm/irq_vectors.h
index 47051871b436..6e1d5b955aae 100644
--- a/tools/perf/trace/beauty/arch/x86/include/asm/irq_vectors.h
+++ b/tools/perf/trace/beauty/arch/x86/include/asm/irq_vectors.h
@@ -77,7 +77,8 @@
  */
 #define IRQ_WORK_VECTOR			0xf6
 
-/* 0xf5 - unused, was UV_BAU_MESSAGE */
+#define PERF_GUEST_MEDIATED_PMI_VECTOR	0xf5
+
 #define DEFERRED_ERROR_VECTOR		0xf4
 
 /* Vector on which hypervisor callbacks will be delivered */
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 5fcd401a5897..21a0d226d63f 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -6467,11 +6467,14 @@ static struct perf_guest_info_callbacks kvm_guest_cbs = {
 	.state			= kvm_guest_state,
 	.get_ip			= kvm_guest_get_ip,
 	.handle_intel_pt_intr	= NULL,
+	.handle_mediated_pmi	= NULL,
 };
 
 void kvm_register_perf_callbacks(unsigned int (*pt_intr_handler)(void))
 {
 	kvm_guest_cbs.handle_intel_pt_intr = pt_intr_handler;
+	kvm_guest_cbs.handle_mediated_pmi = NULL;
+
 	perf_register_guest_info_callbacks(&kvm_guest_cbs);
 }
 void kvm_unregister_perf_callbacks(void)
-- 
cgit v1.2.3


From c1c7d61746f42154bad30bc4adb88f5374969408 Mon Sep 17 00:00:00 2001
From: Ihor Solodrai <ihor.solodrai@linux.dev>
Date: Fri, 19 Dec 2025 10:13:14 -0800
Subject: resolve_btfids: Rename object btf field to btf_path

Rename the member of `struct object` holding the path to BTF data if
provided via --btf arg. `btf_path` is less ambiguous.

Signed-off-by: Ihor Solodrai <ihor.solodrai@linux.dev>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Tested-by: Alan Maguire <alan.maguire@oracle.com>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/bpf/20251219181321.1283664-2-ihor.solodrai@linux.dev
---
 tools/bpf/resolve_btfids/main.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'tools')

diff --git a/tools/bpf/resolve_btfids/main.c b/tools/bpf/resolve_btfids/main.c
index d47191c6e55e..164f0c941f04 100644
--- a/tools/bpf/resolve_btfids/main.c
+++ b/tools/bpf/resolve_btfids/main.c
@@ -113,7 +113,7 @@ struct btf_id {
 
 struct object {
 	const char *path;
-	const char *btf;
+	const char *btf_path;
 	const char *base_btf_path;
 
 	struct {
@@ -550,11 +550,11 @@ static int symbols_resolve(struct object *obj)
 		}
 	}
 
-	btf = btf__parse_split(obj->btf ?: obj->path, base_btf);
+	btf = btf__parse_split(obj->btf_path ?: obj->path, base_btf);
 	err = libbpf_get_error(btf);
 	if (err) {
 		pr_err("FAILED: load BTF from %s: %s\n",
-			obj->btf ?: obj->path, strerror(-err));
+			obj->btf_path ?: obj->path, strerror(-err));
 		goto out;
 	}
 
@@ -790,8 +790,8 @@ int main(int argc, const char **argv)
 	struct option btfid_options[] = {
 		OPT_INCR('v', "verbose", &verbose,
 			 "be more verbose (show errors, etc)"),
-		OPT_STRING(0, "btf", &obj.btf, "BTF data",
-			   "BTF data"),
+		OPT_STRING(0, "btf", &obj.btf_path, "file",
+			   "path to a file with input BTF data"),
 		OPT_STRING('b', "btf_base", &obj.base_btf_path, "file",
 			   "path of file providing base BTF"),
 		OPT_BOOLEAN(0, "fatal_warnings", &fatal_warnings,
-- 
cgit v1.2.3


From 5f347a0f781a36927a547904dde4c4dac343010b Mon Sep 17 00:00:00 2001
From: Ihor Solodrai <ihor.solodrai@linux.dev>
Date: Fri, 19 Dec 2025 10:13:15 -0800
Subject: resolve_btfids: Factor out load_btf()

Increase the lifetime of parsed BTF in resolve_btfids by factoring
load_btf() routine out of symbols_resolve() and storing the base_btf
and btf pointers in the struct object.

Signed-off-by: Ihor Solodrai <ihor.solodrai@linux.dev>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Tested-by: Alan Maguire <alan.maguire@oracle.com>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/bpf/20251219181321.1283664-3-ihor.solodrai@linux.dev
---
 tools/bpf/resolve_btfids/main.c | 47 +++++++++++++++++++++++++++++------------
 1 file changed, 34 insertions(+), 13 deletions(-)

(limited to 'tools')

diff --git a/tools/bpf/resolve_btfids/main.c b/tools/bpf/resolve_btfids/main.c
index 164f0c941f04..b4caae1170dd 100644
--- a/tools/bpf/resolve_btfids/main.c
+++ b/tools/bpf/resolve_btfids/main.c
@@ -116,6 +116,9 @@ struct object {
 	const char *btf_path;
 	const char *base_btf_path;
 
+	struct btf *btf;
+	struct btf *base_btf;
+
 	struct {
 		int		 fd;
 		Elf		*elf;
@@ -529,16 +532,10 @@ static int symbols_collect(struct object *obj)
 	return 0;
 }
 
-static int symbols_resolve(struct object *obj)
+static int load_btf(struct object *obj)
 {
-	int nr_typedefs = obj->nr_typedefs;
-	int nr_structs  = obj->nr_structs;
-	int nr_unions   = obj->nr_unions;
-	int nr_funcs    = obj->nr_funcs;
-	struct btf *base_btf = NULL;
-	int err, type_id;
-	struct btf *btf;
-	__u32 nr_types;
+	struct btf *base_btf = NULL, *btf = NULL;
+	int err;
 
 	if (obj->base_btf_path) {
 		base_btf = btf__parse(obj->base_btf_path, NULL);
@@ -546,7 +543,7 @@ static int symbols_resolve(struct object *obj)
 		if (err) {
 			pr_err("FAILED: load base BTF from %s: %s\n",
 			       obj->base_btf_path, strerror(-err));
-			return -1;
+			goto out_err;
 		}
 	}
 
@@ -555,9 +552,30 @@ static int symbols_resolve(struct object *obj)
 	if (err) {
 		pr_err("FAILED: load BTF from %s: %s\n",
 			obj->btf_path ?: obj->path, strerror(-err));
-		goto out;
+		goto out_err;
 	}
 
+	obj->base_btf = base_btf;
+	obj->btf = btf;
+
+	return 0;
+
+out_err:
+	btf__free(base_btf);
+	btf__free(btf);
+	return err;
+}
+
+static int symbols_resolve(struct object *obj)
+{
+	int nr_typedefs = obj->nr_typedefs;
+	int nr_structs  = obj->nr_structs;
+	int nr_unions   = obj->nr_unions;
+	int nr_funcs    = obj->nr_funcs;
+	struct btf *btf = obj->btf;
+	int err, type_id;
+	__u32 nr_types;
+
 	err = -1;
 	nr_types = btf__type_cnt(btf);
 
@@ -615,8 +633,6 @@ static int symbols_resolve(struct object *obj)
 
 	err = 0;
 out:
-	btf__free(base_btf);
-	btf__free(btf);
 	return err;
 }
 
@@ -824,6 +840,9 @@ int main(int argc, const char **argv)
 	if (symbols_collect(&obj))
 		goto out;
 
+	if (load_btf(&obj))
+		goto out;
+
 	if (symbols_resolve(&obj))
 		goto out;
 
@@ -833,6 +852,8 @@ int main(int argc, const char **argv)
 	if (!(fatal_warnings && warnings))
 		err = 0;
 out:
+	btf__free(obj.base_btf);
+	btf__free(obj.btf);
 	if (obj.efile.elf) {
 		elf_end(obj.efile.elf);
 		close(obj.efile.fd);
-- 
cgit v1.2.3


From a4fa885bd52d1711994ad1ef99e989977cd15698 Mon Sep 17 00:00:00 2001
From: Ihor Solodrai <ihor.solodrai@linux.dev>
Date: Fri, 19 Dec 2025 10:13:16 -0800
Subject: resolve_btfids: Introduce enum btf_id_kind

Instead of using multiple flags, make struct btf_id tagged with an
enum value indicating its kind in the context of resolve_btfids.

Signed-off-by: Ihor Solodrai <ihor.solodrai@linux.dev>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Tested-by: Alan Maguire <alan.maguire@oracle.com>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/bpf/20251219181321.1283664-4-ihor.solodrai@linux.dev
---
 tools/bpf/resolve_btfids/main.c | 83 +++++++++++++++++++++++++++++------------
 1 file changed, 60 insertions(+), 23 deletions(-)

(limited to 'tools')

diff --git a/tools/bpf/resolve_btfids/main.c b/tools/bpf/resolve_btfids/main.c
index b4caae1170dd..e721e20a2bbd 100644
--- a/tools/bpf/resolve_btfids/main.c
+++ b/tools/bpf/resolve_btfids/main.c
@@ -98,6 +98,13 @@
 # error "Unknown machine endianness!"
 #endif
 
+enum btf_id_kind {
+	BTF_ID_KIND_NONE,
+	BTF_ID_KIND_SYM,
+	BTF_ID_KIND_SET,
+	BTF_ID_KIND_SET8
+};
+
 struct btf_id {
 	struct rb_node	 rb_node;
 	char		*name;
@@ -105,9 +112,8 @@ struct btf_id {
 		int	 id;
 		int	 cnt;
 	};
+	enum btf_id_kind kind;
 	int		 addr_cnt;
-	bool		 is_set;
-	bool		 is_set8;
 	Elf64_Addr	 addr[ADDR_CNT];
 };
 
@@ -197,8 +203,10 @@ static struct btf_id *btf_id__find(struct rb_root *root, const char *name)
 	return NULL;
 }
 
-static struct btf_id *
-btf_id__add(struct rb_root *root, char *name, bool unique)
+static struct btf_id *__btf_id__add(struct rb_root *root,
+				    char *name,
+				    enum btf_id_kind kind,
+				    bool unique)
 {
 	struct rb_node **p = &root->rb_node;
 	struct rb_node *parent = NULL;
@@ -221,12 +229,23 @@ btf_id__add(struct rb_root *root, char *name, bool unique)
 	if (id) {
 		pr_debug("adding symbol %s\n", name);
 		id->name = name;
+		id->kind = kind;
 		rb_link_node(&id->rb_node, parent, p);
 		rb_insert_color(&id->rb_node, root);
 	}
 	return id;
 }
 
+static inline struct btf_id *btf_id__add(struct rb_root *root, char *name, enum btf_id_kind kind)
+{
+	return __btf_id__add(root, name, kind, false);
+}
+
+static inline struct btf_id *btf_id__add_unique(struct rb_root *root, char *name, enum btf_id_kind kind)
+{
+	return __btf_id__add(root, name, kind, true);
+}
+
 static char *get_id(const char *prefix_end)
 {
 	/*
@@ -260,22 +279,36 @@ static char *get_id(const char *prefix_end)
 	return id;
 }
 
-static struct btf_id *add_set(struct object *obj, char *name, bool is_set8)
+static struct btf_id *add_set(struct object *obj, char *name, enum btf_id_kind kind)
 {
+	int len = strlen(name);
+	int prefixlen;
+	char *id;
+
 	/*
 	 * __BTF_ID__set__name
 	 * name =    ^
 	 * id   =         ^
 	 */
-	char *id = name + (is_set8 ? sizeof(BTF_SET8 "__") : sizeof(BTF_SET "__")) - 1;
-	int len = strlen(name);
+	switch (kind) {
+	case BTF_ID_KIND_SET:
+		prefixlen = sizeof(BTF_SET "__") - 1;
+		break;
+	case BTF_ID_KIND_SET8:
+		prefixlen = sizeof(BTF_SET8 "__") - 1;
+		break;
+	default:
+		pr_err("Unexpected kind %d passed to %s() for symbol %s\n", kind, __func__, name);
+		return NULL;
+	}
 
+	id = name + prefixlen;
 	if (id >= name + len) {
 		pr_err("FAILED to parse set name: %s\n", name);
 		return NULL;
 	}
 
-	return btf_id__add(&obj->sets, id, true);
+	return btf_id__add_unique(&obj->sets, id, kind);
 }
 
 static struct btf_id *add_symbol(struct rb_root *root, char *name, size_t size)
@@ -288,7 +321,7 @@ static struct btf_id *add_symbol(struct rb_root *root, char *name, size_t size)
 		return NULL;
 	}
 
-	return btf_id__add(root, id, false);
+	return btf_id__add(root, id, BTF_ID_KIND_SYM);
 }
 
 /* Older libelf.h and glibc elf.h might not yet define the ELF compression types. */
@@ -491,35 +524,31 @@ static int symbols_collect(struct object *obj)
 			id = add_symbol(&obj->funcs, prefix, sizeof(BTF_FUNC) - 1);
 		/* set8 */
 		} else if (!strncmp(prefix, BTF_SET8, sizeof(BTF_SET8) - 1)) {
-			id = add_set(obj, prefix, true);
+			id = add_set(obj, prefix, BTF_ID_KIND_SET8);
 			/*
 			 * SET8 objects store list's count, which is encoded
 			 * in symbol's size, together with 'cnt' field hence
 			 * that - 1.
 			 */
-			if (id) {
+			if (id)
 				id->cnt = sym.st_size / sizeof(uint64_t) - 1;
-				id->is_set8 = true;
-			}
 		/* set */
 		} else if (!strncmp(prefix, BTF_SET, sizeof(BTF_SET) - 1)) {
-			id = add_set(obj, prefix, false);
+			id = add_set(obj, prefix, BTF_ID_KIND_SET);
 			/*
 			 * SET objects store list's count, which is encoded
 			 * in symbol's size, together with 'cnt' field hence
 			 * that - 1.
 			 */
-			if (id) {
+			if (id)
 				id->cnt = sym.st_size / sizeof(int) - 1;
-				id->is_set = true;
-			}
 		} else {
 			pr_err("FAILED unsupported prefix %s\n", prefix);
 			return -1;
 		}
 
 		if (!id)
-			return -ENOMEM;
+			return -EINVAL;
 
 		if (id->addr_cnt >= ADDR_CNT) {
 			pr_err("FAILED symbol %s crossed the number of allowed lists\n",
@@ -643,7 +672,7 @@ static int id_patch(struct object *obj, struct btf_id *id)
 	int i;
 
 	/* For set, set8, id->id may be 0 */
-	if (!id->id && !id->is_set && !id->is_set8) {
+	if (!id->id && id->kind != BTF_ID_KIND_SET && id->kind != BTF_ID_KIND_SET8) {
 		pr_err("WARN: resolve_btfids: unresolved symbol %s\n", id->name);
 		warnings++;
 	}
@@ -696,6 +725,7 @@ static int sets_patch(struct object *obj)
 {
 	Elf_Data *data = obj->efile.idlist;
 	struct rb_node *next;
+	int cnt;
 
 	next = rb_first(&obj->sets);
 	while (next) {
@@ -715,11 +745,15 @@ static int sets_patch(struct object *obj)
 			return -1;
 		}
 
-		if (id->is_set) {
+		switch (id->kind) {
+		case BTF_ID_KIND_SET:
 			set = data->d_buf + off;
+			cnt = set->cnt;
 			qsort(set->ids, set->cnt, sizeof(set->ids[0]), cmp_id);
-		} else {
+			break;
+		case BTF_ID_KIND_SET8:
 			set8 = data->d_buf + off;
+			cnt = set8->cnt;
 			/*
 			 * Make sure id is at the beginning of the pairs
 			 * struct, otherwise the below qsort would not work.
@@ -744,10 +778,13 @@ static int sets_patch(struct object *obj)
 						bswap_32(set8->pairs[i].flags);
 				}
 			}
+			break;
+		default:
+			pr_err("Unexpected btf_id_kind %d for set '%s'\n", id->kind, id->name);
+			return -1;
 		}
 
-		pr_debug("sorting  addr %5lu: cnt %6d [%s]\n",
-			 off, id->is_set ? set->cnt : set8->cnt, id->name);
+		pr_debug("sorting  addr %5lu: cnt %6d [%s]\n", off, cnt, id->name);
 
 		next = rb_next(next);
 	}
-- 
cgit v1.2.3


From fb348d4fdf5ec08aea8b1f686136584f758e4364 Mon Sep 17 00:00:00 2001
From: Ihor Solodrai <ihor.solodrai@linux.dev>
Date: Fri, 19 Dec 2025 10:13:17 -0800
Subject: resolve_btfids: Always build with -Wall -Werror

resolve_btfids builds without compiler warnings currently, so let's
enforce this for future changes with '-Wall -Werror' flags [1].

[1] https://lore.kernel.org/bpf/1957a60b-6c45-42a7-b525-a6e335a735ff@linux.dev/

Signed-off-by: Ihor Solodrai <ihor.solodrai@linux.dev>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Tested-by: Alan Maguire <alan.maguire@oracle.com>
Link: https://lore.kernel.org/bpf/20251219181321.1283664-5-ihor.solodrai@linux.dev
---
 tools/bpf/resolve_btfids/Makefile | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/bpf/resolve_btfids/Makefile b/tools/bpf/resolve_btfids/Makefile
index ce1b556dfa90..1733a6e93a07 100644
--- a/tools/bpf/resolve_btfids/Makefile
+++ b/tools/bpf/resolve_btfids/Makefile
@@ -70,7 +70,8 @@ HOSTCFLAGS_resolve_btfids += -g \
           -I$(srctree)/tools/include/uapi \
           -I$(LIBBPF_INCLUDE) \
           -I$(SUBCMD_INCLUDE) \
-          $(LIBELF_FLAGS)
+          $(LIBELF_FLAGS) \
+          -Wall -Werror
 
 LIBS = $(LIBELF_LIBS) -lz
 
-- 
cgit v1.2.3


From 903922cfa0e60573234ff895974c23a000035258 Mon Sep 17 00:00:00 2001
From: Ihor Solodrai <ihor.solodrai@linux.dev>
Date: Fri, 19 Dec 2025 10:18:23 -0800
Subject: lib/Kconfig.debug: Set the minimum required pahole version to v1.22

Subsequent patches in the series change vmlinux linking scripts to
unconditionally pass --btf_encode_detached to pahole, which was
introduced in v1.22 [1][2].

This change allows to remove PAHOLE_HAS_SPLIT_BTF Kconfig option and
other checks of older pahole versions.

[1] https://github.com/acmel/dwarves/releases/tag/v1.22
[2] https://lore.kernel.org/bpf/cbafbf4e-9073-4383-8ee6-1353f9e5869c@oracle.com/

Signed-off-by: Ihor Solodrai <ihor.solodrai@linux.dev>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Tested-by: Alan Maguire <alan.maguire@oracle.com>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Acked-by: Nicolas Schier <nsc@kernel.org>
Link: https://lore.kernel.org/bpf/20251219181825.1289460-1-ihor.solodrai@linux.dev
---
 Documentation/process/changes.rst     |  4 ++--
 Documentation/scheduler/sched-ext.rst |  1 -
 lib/Kconfig.debug                     | 13 ++++---------
 scripts/Makefile.btf                  |  9 +--------
 tools/sched_ext/README.md             |  1 -
 5 files changed, 7 insertions(+), 21 deletions(-)

(limited to 'tools')

diff --git a/Documentation/process/changes.rst b/Documentation/process/changes.rst
index 62951cdb13ad..b7e329159d00 100644
--- a/Documentation/process/changes.rst
+++ b/Documentation/process/changes.rst
@@ -38,7 +38,7 @@ bash                   4.2              bash --version
 binutils               2.30             ld -v
 flex                   2.5.35           flex --version
 bison                  2.0              bison --version
-pahole                 1.16             pahole --version
+pahole                 1.22             pahole --version
 util-linux             2.10o            mount --version
 kmod                   13               depmod -V
 e2fsprogs              1.41.4           e2fsck -V
@@ -143,7 +143,7 @@ pahole
 
 Since Linux 5.2, if CONFIG_DEBUG_INFO_BTF is selected, the build system
 generates BTF (BPF Type Format) from DWARF in vmlinux, a bit later from kernel
-modules as well.  This requires pahole v1.16 or later.
+modules as well.  This requires pahole v1.22 or later.
 
 It is found in the 'dwarves' or 'pahole' distro packages or from
 https://fedorapeople.org/~acme/dwarves/.
diff --git a/Documentation/scheduler/sched-ext.rst b/Documentation/scheduler/sched-ext.rst
index 404fe6126a76..9e2882d937b4 100644
--- a/Documentation/scheduler/sched-ext.rst
+++ b/Documentation/scheduler/sched-ext.rst
@@ -43,7 +43,6 @@ options should be enabled to use sched_ext:
     CONFIG_DEBUG_INFO_BTF=y
     CONFIG_BPF_JIT_ALWAYS_ON=y
     CONFIG_BPF_JIT_DEFAULT_ON=y
-    CONFIG_PAHOLE_HAS_SPLIT_BTF=y
     CONFIG_PAHOLE_HAS_BTF_TAG=y
 
 sched_ext is used only when the BPF scheduler is loaded and running.
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index ba36939fda79..60281c4f9e99 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -388,18 +388,13 @@ config DEBUG_INFO_BTF
 	depends on !DEBUG_INFO_SPLIT && !DEBUG_INFO_REDUCED
 	depends on !GCC_PLUGIN_RANDSTRUCT || COMPILE_TEST
 	depends on BPF_SYSCALL
-	depends on PAHOLE_VERSION >= 116
-	depends on DEBUG_INFO_DWARF4 || PAHOLE_VERSION >= 121
+	depends on PAHOLE_VERSION >= 122
 	# pahole uses elfutils, which does not have support for Hexagon relocations
 	depends on !HEXAGON
 	help
 	  Generate deduplicated BTF type information from DWARF debug info.
-	  Turning this on requires pahole v1.16 or later (v1.21 or later to
-	  support DWARF 5), which will convert DWARF type info into equivalent
-	  deduplicated BTF type info.
-
-config PAHOLE_HAS_SPLIT_BTF
-	def_bool PAHOLE_VERSION >= 119
+	  Turning this on requires pahole v1.22 or later, which will convert
+	  DWARF type info into equivalent deduplicated BTF type info.
 
 config PAHOLE_HAS_BTF_TAG
 	def_bool PAHOLE_VERSION >= 123
@@ -421,7 +416,7 @@ config PAHOLE_HAS_LANG_EXCLUDE
 config DEBUG_INFO_BTF_MODULES
 	bool "Generate BTF type information for kernel modules"
 	default y
-	depends on DEBUG_INFO_BTF && MODULES && PAHOLE_HAS_SPLIT_BTF
+	depends on DEBUG_INFO_BTF && MODULES
 	help
 	  Generate compact split BTF type information for kernel modules.
 
diff --git a/scripts/Makefile.btf b/scripts/Makefile.btf
index db76335dd917..840a55de42da 100644
--- a/scripts/Makefile.btf
+++ b/scripts/Makefile.btf
@@ -7,14 +7,7 @@ JOBS := $(patsubst -j%,%,$(filter -j%,$(MAKEFLAGS)))
 
 ifeq ($(call test-le, $(pahole-ver), 125),y)
 
-# pahole 1.18 through 1.21 can't handle zero-sized per-CPU vars
-ifeq ($(call test-le, $(pahole-ver), 121),y)
-pahole-flags-$(call test-ge, $(pahole-ver), 118)	+= --skip_encoding_btf_vars
-endif
-
-pahole-flags-$(call test-ge, $(pahole-ver), 121)	+= --btf_gen_floats
-
-pahole-flags-$(call test-ge, $(pahole-ver), 122)	+= -j$(JOBS)
+pahole-flags-y                                  	+= --btf_gen_floats -j$(JOBS)
 
 pahole-flags-$(call test-ge, $(pahole-ver), 125)	+= --skip_encoding_btf_inconsistent_proto --btf_gen_optimized
 
diff --git a/tools/sched_ext/README.md b/tools/sched_ext/README.md
index 16a42e4060f6..56a9d1557ac4 100644
--- a/tools/sched_ext/README.md
+++ b/tools/sched_ext/README.md
@@ -65,7 +65,6 @@ It's also recommended that you also include the following Kconfig options:
 ```
 CONFIG_BPF_JIT_ALWAYS_ON=y
 CONFIG_BPF_JIT_DEFAULT_ON=y
-CONFIG_PAHOLE_HAS_SPLIT_BTF=y
 CONFIG_PAHOLE_HAS_BTF_TAG=y
 ```
 
-- 
cgit v1.2.3


From 014e1cdb5fad8c6034feb3a97468a91edf23d3d0 Mon Sep 17 00:00:00 2001
From: Ihor Solodrai <ihor.solodrai@linux.dev>
Date: Fri, 19 Dec 2025 10:18:24 -0800
Subject: selftests/bpf: Run resolve_btfids only for relevant .test.o objects

A selftest targeting resolve_btfids functionality relies on a resolved
.BTF_ids section to be available in the TRUNNER_BINARY. The underlying
BTF data is taken from a special BPF program (btf_data.c), and so
resolve_btfids is executed as a part of a TRUNNER_BINARY build recipe
on the final binary.

Subsequent patches in this series allow resolve_btfids to modify BTF
before resolving the symbols, which means that the test needs access
to that modified BTF [1]. Currently the test simply reads in
btf_data.bpf.o on the assumption that BTF hasn't changed.

Implement resolve_btfids call only for particular test objects (just
resolve_btfids.test.o for now). The test objects are linked into the
TRUNNER_BINARY, and so .BTF_ids section will be available there.

This will make it trivial for the resolve_btfids test to access BTF
modified by resolve_btfids.

[1] https://lore.kernel.org/bpf/CAErzpmvsgSDe-QcWH8SFFErL6y3p3zrqNri5-UHJ9iK2ChyiBw@mail.gmail.com/

Signed-off-by: Ihor Solodrai <ihor.solodrai@linux.dev>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Tested-by: Alan Maguire <alan.maguire@oracle.com>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/bpf/20251219181825.1289460-2-ihor.solodrai@linux.dev
---
 tools/testing/selftests/bpf/Makefile | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 4aa60e83ff19..ffd0a4c354c7 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -643,6 +643,9 @@ $(TRUNNER_TESTS_HDR): $(TRUNNER_TESTS_DIR)/*.c
 		 ) > $$@)
 endif
 
+$(TRUNNER_OUTPUT)/resolve_btfids.test.o: $(RESOLVE_BTFIDS) $(TRUNNER_OUTPUT)/btf_data.bpf.o
+$(TRUNNER_OUTPUT)/resolve_btfids.test.o: private TEST_NEEDS_BTFIDS = 1
+
 # compile individual test files
 # Note: we cd into output directory to ensure embedded BPF object is found
 $(TRUNNER_TEST_OBJS): $(TRUNNER_OUTPUT)/%.test.o:			\
@@ -650,6 +653,9 @@ $(TRUNNER_TEST_OBJS): $(TRUNNER_OUTPUT)/%.test.o:			\
 		      | $(TRUNNER_OUTPUT)/%.test.d
 	$$(call msg,TEST-OBJ,$(TRUNNER_BINARY),$$@)
 	$(Q)cd $$(@D) && $$(CC) -I. $$(CFLAGS) -MMD -MT $$@ -c $(CURDIR)/$$< $$(LDLIBS) -o $$(@F)
+	$$(if $$(TEST_NEEDS_BTFIDS),					\
+		$$(call msg,BTFIDS,$(TRUNNER_BINARY),$$@)		\
+		$(RESOLVE_BTFIDS) --btf $(TRUNNER_OUTPUT)/btf_data.bpf.o $$@)
 
 $(TRUNNER_TEST_OBJS:.o=.d): $(TRUNNER_OUTPUT)/%.test.d:			\
 			    $(TRUNNER_TESTS_DIR)/%.c			\
@@ -695,13 +701,11 @@ $(OUTPUT)/$(TRUNNER_BINARY): | $(TRUNNER_BPF_OBJS)
 $(OUTPUT)/$(TRUNNER_BINARY): $(TRUNNER_TEST_OBJS)			\
 			     $(TRUNNER_EXTRA_OBJS) $$(BPFOBJ)		\
 			     $(TRUNNER_LIB_OBJS)			\
-			     $(RESOLVE_BTFIDS)				\
 			     $(TRUNNER_BPFTOOL)				\
 			     $(OUTPUT)/veristat				\
 			     | $(TRUNNER_BINARY)-extras
 	$$(call msg,BINARY,,$$@)
 	$(Q)$$(CC) $$(CFLAGS) $$(filter %.a %.o,$$^) $$(LDLIBS) $$(LLVM_LDLIBS) $$(LDFLAGS) $$(LLVM_LDFLAGS) -o $$@
-	$(Q)$(RESOLVE_BTFIDS) --btf $(TRUNNER_OUTPUT)/btf_data.bpf.o $$@
 	$(Q)ln -sf $(if $2,..,.)/tools/build/bpftool/$(USE_BOOTSTRAP)bpftool \
 		   $(OUTPUT)/$(if $2,$2/)bpftool
 
-- 
cgit v1.2.3


From 522397d05e7d4a7c30b91841492360336b24f833 Mon Sep 17 00:00:00 2001
From: Ihor Solodrai <ihor.solodrai@linux.dev>
Date: Fri, 19 Dec 2025 10:18:25 -0800
Subject: resolve_btfids: Change in-place update with raw binary output

Currently resolve_btfids updates .BTF_ids section of an ELF file
in-place, based on the contents of provided BTF, usually within the
same input file, and optionally a BTF base.

Change resolve_btfids behavior to enable BTF transformations as part
of its main operation. To achieve this, in-place ELF write in
resolve_btfids is replaced with generation of the following binaries:
  * ${1}.BTF with .BTF section data
  * ${1}.BTF_ids with .BTF_ids section data if it existed in ${1}
  * ${1}.BTF.base with .BTF.base section data for out-of-tree modules

The execution of resolve_btfids and consumption of its output is
orchestrated by scripts/gen-btf.sh introduced in this patch.

The motivation for emitting binary data is that it allows simplifying
resolve_btfids implementation by delegating ELF update to the $OBJCOPY
tool [1], which is already widely used across the codebase.

There are two distinct paths for BTF generation and resolve_btfids
application in the kernel build: for vmlinux and for kernel modules.

For the vmlinux binary a .BTF section is added in a roundabout way to
ensure correct linking. The patch doesn't change this approach, only
the implementation is a little different.

Before this patch it worked as follows:

  * pahole consumed .tmp_vmlinux1 [2] and added .BTF section with
    llvm-objcopy [3] to it
  * then everything except the .BTF section was stripped from .tmp_vmlinux1
    into a .tmp_vmlinux1.bpf.o object [2], later linked into vmlinux
  * resolve_btfids was executed later on vmlinux.unstripped [4],
    updating it in-place

After this patch gen-btf.sh implements the following:

  * pahole consumes .tmp_vmlinux1 and produces a *detached* file with
    raw BTF data
  * resolve_btfids consumes .tmp_vmlinux1 and detached BTF to produce
    (potentially modified) .BTF, and .BTF_ids sections data
  * a .tmp_vmlinux1.bpf.o object is then produced with objcopy copying
    BTF output of resolve_btfids
  * .BTF_ids data gets embedded into vmlinux.unstripped in
    link-vmlinux.sh by objcopy --update-section

For kernel modules, creating a special .bpf.o file is not necessary,
and so embedding of sections data produced by resolve_btfids is
straightforward with objcopy.

With this patch an ELF file becomes effectively read-only within
resolve_btfids, which allows deleting elf_update() call and satellite
code (like compressed_section_fix [5]).

Endianness handling of .BTF_ids data is also changed. Previously the
"flags" part of the section was bswapped in sets_patch() [6], and then
Elf_Type was modified before elf_update() to signal to libelf that
bswap may be necessary. With this patch we explicitly bswap entire
data buffer on load and on dump.

[1] https://lore.kernel.org/bpf/131b4190-9c49-4f79-a99d-c00fac97fa44@linux.dev/
[2] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/scripts/link-vmlinux.sh?h=v6.18#n110
[3] https://git.kernel.org/pub/scm/devel/pahole/pahole.git/tree/btf_encoder.c?h=v1.31#n1803
[4] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/scripts/link-vmlinux.sh?h=v6.18#n284
[5] https://lore.kernel.org/bpf/20200819092342.259004-1-jolsa@kernel.org/
[6] https://lore.kernel.org/bpf/cover.1707223196.git.vmalik@redhat.com/

Signed-off-by: Ihor Solodrai <ihor.solodrai@linux.dev>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Tested-by: Alan Maguire <alan.maguire@oracle.com>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/bpf/20251219181825.1289460-3-ihor.solodrai@linux.dev
---
 MAINTAINERS                                        |   1 +
 scripts/Makefile.btf                               |  12 +-
 scripts/Makefile.modfinal                          |   5 +-
 scripts/Makefile.vmlinux                           |   2 +-
 scripts/gen-btf.sh                                 | 157 +++++++++++++++
 scripts/link-vmlinux.sh                            |  42 +---
 tools/bpf/resolve_btfids/main.c                    | 224 +++++++++++++--------
 tools/testing/selftests/bpf/.gitignore             |   3 +
 tools/testing/selftests/bpf/Makefile               |   9 +-
 .../selftests/bpf/prog_tests/resolve_btfids.c      |   4 +-
 10 files changed, 328 insertions(+), 131 deletions(-)
 create mode 100755 scripts/gen-btf.sh

(limited to 'tools')

diff --git a/MAINTAINERS b/MAINTAINERS
index 5b11839cba9d..cb1898a85b05 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4766,6 +4766,7 @@ F:	net/sched/act_bpf.c
 F:	net/sched/cls_bpf.c
 F:	samples/bpf/
 F:	scripts/bpf_doc.py
+F:	scripts/gen-btf.sh
 F:	scripts/Makefile.btf
 F:	scripts/pahole-version.sh
 F:	tools/bpf/
diff --git a/scripts/Makefile.btf b/scripts/Makefile.btf
index 840a55de42da..562a04b40e06 100644
--- a/scripts/Makefile.btf
+++ b/scripts/Makefile.btf
@@ -18,13 +18,15 @@ pahole-flags-$(call test-ge, $(pahole-ver), 126)  = -j$(JOBS) --btf_features=enc
 
 pahole-flags-$(call test-ge, $(pahole-ver), 130) += --btf_features=attributes
 
-ifneq ($(KBUILD_EXTMOD),)
-module-pahole-flags-$(call test-ge, $(pahole-ver), 128) += --btf_features=distilled_base
-endif
-
 endif
 
 pahole-flags-$(CONFIG_PAHOLE_HAS_LANG_EXCLUDE)		+= --lang_exclude=rust
 
 export PAHOLE_FLAGS := $(pahole-flags-y)
-export MODULE_PAHOLE_FLAGS := $(module-pahole-flags-y)
+
+resolve-btfids-flags-y :=
+resolve-btfids-flags-$(CONFIG_WERROR) += --fatal_warnings
+resolve-btfids-flags-$(if $(KBUILD_EXTMOD),y) += --distill_base
+resolve-btfids-flags-$(if $(KBUILD_VERBOSE),y) += --verbose
+
+export RESOLVE_BTFIDS_FLAGS := $(resolve-btfids-flags-y)
diff --git a/scripts/Makefile.modfinal b/scripts/Makefile.modfinal
index 149e12ff5700..422c56dc878e 100644
--- a/scripts/Makefile.modfinal
+++ b/scripts/Makefile.modfinal
@@ -42,9 +42,8 @@ quiet_cmd_btf_ko = BTF [M] $@
       cmd_btf_ko = 							\
 	if [ ! -f $(objtree)/vmlinux ]; then				\
 		printf "Skipping BTF generation for %s due to unavailability of vmlinux\n" $@ 1>&2; \
-	else								\
-		LLVM_OBJCOPY="$(OBJCOPY)" $(PAHOLE) -J $(PAHOLE_FLAGS) $(MODULE_PAHOLE_FLAGS) --btf_base $(objtree)/vmlinux $@; \
-		$(RESOLVE_BTFIDS) -b $(objtree)/vmlinux $@;		\
+	else	\
+		$(srctree)/scripts/gen-btf.sh --btf_base $(objtree)/vmlinux $@; \
 	fi;
 
 # Same as newer-prereqs, but allows to exclude specified extra dependencies
diff --git a/scripts/Makefile.vmlinux b/scripts/Makefile.vmlinux
index cd788cac9d91..20a988f4fe0c 100644
--- a/scripts/Makefile.vmlinux
+++ b/scripts/Makefile.vmlinux
@@ -71,7 +71,7 @@ targets += vmlinux.unstripped .vmlinux.export.o
 vmlinux.unstripped: scripts/link-vmlinux.sh vmlinux.o .vmlinux.export.o $(KBUILD_LDS) FORCE
 	+$(call if_changed_dep,link_vmlinux)
 ifdef CONFIG_DEBUG_INFO_BTF
-vmlinux.unstripped: $(RESOLVE_BTFIDS)
+vmlinux.unstripped: $(RESOLVE_BTFIDS) $(srctree)/scripts/gen-btf.sh
 endif
 
 ifdef CONFIG_BUILDTIME_TABLE_SORT
diff --git a/scripts/gen-btf.sh b/scripts/gen-btf.sh
new file mode 100755
index 000000000000..06c6d8becaa2
--- /dev/null
+++ b/scripts/gen-btf.sh
@@ -0,0 +1,157 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (c) 2025 Meta Platforms, Inc. and affiliates.
+#
+# This script generates BTF data for the provided ELF file.
+#
+# Kernel BTF generation involves these conceptual steps:
+#   1. pahole generates BTF from DWARF data
+#   2. resolve_btfids applies kernel-specific btf2btf
+#      transformations and computes data for .BTF_ids section
+#   3. the result gets linked/objcopied into the target binary
+#
+# How step (3) should be done differs between vmlinux, and
+# kernel modules, which is the primary reason for the existence
+# of this script.
+#
+# For modules the script expects vmlinux passed in as --btf_base.
+# Generated .BTF, .BTF.base and .BTF_ids sections become embedded
+# into the input ELF file with objcopy.
+#
+# For vmlinux the input file remains unchanged and two files are produced:
+#   - ${1}.btf.o ready for linking into vmlinux
+#   - ${1}.BTF_ids with .BTF_ids data blob
+# This output is consumed by scripts/link-vmlinux.sh
+
+set -e
+
+usage()
+{
+	echo "Usage: $0 [--btf_base <file>] <target ELF file>"
+	exit 1
+}
+
+BTF_BASE=""
+
+while [ $# -gt 0 ]; do
+	case "$1" in
+	--btf_base)
+		BTF_BASE="$2"
+		shift 2
+		;;
+	-*)
+		echo "Unknown option: $1" >&2
+		usage
+		;;
+	*)
+		break
+		;;
+	esac
+done
+
+if [ $# -ne 1 ]; then
+	usage
+fi
+
+ELF_FILE="$1"
+shift
+
+is_enabled() {
+	grep -q "^$1=y" ${objtree}/include/config/auto.conf
+}
+
+info()
+{
+	printf "  %-7s %s\n" "${1}" "${2}"
+}
+
+case "${KBUILD_VERBOSE}" in
+*1*)
+	set -x
+	;;
+esac
+
+
+gen_btf_data()
+{
+	info BTF "${ELF_FILE}"
+	btf1="${ELF_FILE}.BTF.1"
+	${PAHOLE} -J ${PAHOLE_FLAGS}			\
+		${BTF_BASE:+--btf_base ${BTF_BASE}}	\
+		--btf_encode_detached=${btf1}		\
+		"${ELF_FILE}"
+
+	info BTFIDS "${ELF_FILE}"
+	${RESOLVE_BTFIDS} ${RESOLVE_BTFIDS_FLAGS}	\
+		${BTF_BASE:+--btf_base ${BTF_BASE}}	\
+		--btf ${btf1} "${ELF_FILE}"
+}
+
+gen_btf_o()
+{
+	local btf_data=${ELF_FILE}.btf.o
+
+	# Create ${btf_data} which contains just .BTF section but no symbols. Add
+	# SHF_ALLOC because .BTF will be part of the vmlinux image. --strip-all
+	# deletes all symbols including __start_BTF and __stop_BTF, which will
+	# be redefined in the linker script.
+	info OBJCOPY "${btf_data}"
+	echo "" | ${CC} ${CLANG_FLAGS} -c -x c -o ${btf_data} -
+	${OBJCOPY} --add-section .BTF=${ELF_FILE}.BTF \
+		--set-section-flags .BTF=alloc,readonly ${btf_data}
+	${OBJCOPY} --only-section=.BTF --strip-all ${btf_data}
+
+	# Change e_type to ET_REL so that it can be used to link final vmlinux.
+	# GNU ld 2.35+ and lld do not allow an ET_EXEC input.
+	if is_enabled CONFIG_CPU_BIG_ENDIAN; then
+		et_rel='\0\1'
+	else
+		et_rel='\1\0'
+	fi
+	printf "${et_rel}" | dd of="${btf_data}" conv=notrunc bs=1 seek=16 status=none
+}
+
+embed_btf_data()
+{
+	info OBJCOPY "${ELF_FILE}.BTF"
+	${OBJCOPY} --add-section .BTF=${ELF_FILE}.BTF ${ELF_FILE}
+
+	# a module might not have a .BTF_ids or .BTF.base section
+	local btf_base="${ELF_FILE}.BTF.base"
+	if [ -f "${btf_base}" ]; then
+		${OBJCOPY} --add-section .BTF.base=${btf_base} ${ELF_FILE}
+	fi
+	local btf_ids="${ELF_FILE}.BTF_ids"
+	if [ -f "${btf_ids}" ]; then
+		${OBJCOPY} --update-section .BTF_ids=${btf_ids} ${ELF_FILE}
+	fi
+}
+
+cleanup()
+{
+	rm -f "${ELF_FILE}.BTF.1"
+	rm -f "${ELF_FILE}.BTF"
+	if [ "${BTFGEN_MODE}" = "module" ]; then
+		rm -f "${ELF_FILE}.BTF.base"
+		rm -f "${ELF_FILE}.BTF_ids"
+	fi
+}
+trap cleanup EXIT
+
+BTFGEN_MODE="vmlinux"
+if [ -n "${BTF_BASE}" ]; then
+	BTFGEN_MODE="module"
+fi
+
+gen_btf_data
+
+case "${BTFGEN_MODE}" in
+vmlinux)
+	gen_btf_o
+	;;
+module)
+	embed_btf_data
+	;;
+esac
+
+exit 0
diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh
index 4ab44c73da4d..e2207e612ac3 100755
--- a/scripts/link-vmlinux.sh
+++ b/scripts/link-vmlinux.sh
@@ -106,34 +106,6 @@ vmlinux_link()
 		${kallsymso} ${btf_vmlinux_bin_o} ${arch_vmlinux_o} ${ldlibs}
 }
 
-# generate .BTF typeinfo from DWARF debuginfo
-# ${1} - vmlinux image
-gen_btf()
-{
-	local btf_data=${1}.btf.o
-
-	info BTF "${btf_data}"
-	LLVM_OBJCOPY="${OBJCOPY}" ${PAHOLE} -J ${PAHOLE_FLAGS} ${1}
-
-	# Create ${btf_data} which contains just .BTF section but no symbols. Add
-	# SHF_ALLOC because .BTF will be part of the vmlinux image. --strip-all
-	# deletes all symbols including __start_BTF and __stop_BTF, which will
-	# be redefined in the linker script. Add 2>/dev/null to suppress GNU
-	# objcopy warnings: "empty loadable segment detected at ..."
-	${OBJCOPY} --only-section=.BTF --set-section-flags .BTF=alloc,readonly \
-		--strip-all ${1} "${btf_data}" 2>/dev/null
-	# Change e_type to ET_REL so that it can be used to link final vmlinux.
-	# GNU ld 2.35+ and lld do not allow an ET_EXEC input.
-	if is_enabled CONFIG_CPU_BIG_ENDIAN; then
-		et_rel='\0\1'
-	else
-		et_rel='\1\0'
-	fi
-	printf "${et_rel}" | dd of="${btf_data}" conv=notrunc bs=1 seek=16 status=none
-
-	btf_vmlinux_bin_o=${btf_data}
-}
-
 # Create ${2}.o file with all symbols from the ${1} object file
 kallsyms()
 {
@@ -205,6 +177,7 @@ if is_enabled CONFIG_ARCH_WANTS_PRE_LINK_VMLINUX; then
 fi
 
 btf_vmlinux_bin_o=
+btfids_vmlinux=
 kallsymso=
 strip_debug=
 generate_map=
@@ -232,11 +205,13 @@ if is_enabled CONFIG_KALLSYMS || is_enabled CONFIG_DEBUG_INFO_BTF; then
 fi
 
 if is_enabled CONFIG_DEBUG_INFO_BTF; then
-	if ! gen_btf .tmp_vmlinux1; then
+	if ! ${srctree}/scripts/gen-btf.sh .tmp_vmlinux1; then
 		echo >&2 "Failed to generate BTF for vmlinux"
 		echo >&2 "Try to disable CONFIG_DEBUG_INFO_BTF"
 		exit 1
 	fi
+	btf_vmlinux_bin_o=.tmp_vmlinux1.btf.o
+	btfids_vmlinux=.tmp_vmlinux1.BTF_ids
 fi
 
 if is_enabled CONFIG_KALLSYMS; then
@@ -289,14 +264,9 @@ fi
 
 vmlinux_link "${VMLINUX}"
 
-# fill in BTF IDs
 if is_enabled CONFIG_DEBUG_INFO_BTF; then
-	info BTFIDS "${VMLINUX}"
-	RESOLVE_BTFIDS_ARGS=""
-	if is_enabled CONFIG_WERROR; then
-		RESOLVE_BTFIDS_ARGS=" --fatal_warnings "
-	fi
-	${RESOLVE_BTFIDS} ${RESOLVE_BTFIDS_ARGS} "${VMLINUX}"
+	info OBJCOPY ${btfids_vmlinux}
+	${OBJCOPY} --update-section .BTF_ids=${btfids_vmlinux} ${VMLINUX}
 fi
 
 mksysmap "${VMLINUX}" System.map
diff --git a/tools/bpf/resolve_btfids/main.c b/tools/bpf/resolve_btfids/main.c
index e721e20a2bbd..2cbc252259be 100644
--- a/tools/bpf/resolve_btfids/main.c
+++ b/tools/bpf/resolve_btfids/main.c
@@ -71,9 +71,11 @@
 #include <fcntl.h>
 #include <errno.h>
 #include <linux/btf_ids.h>
+#include <linux/kallsyms.h>
 #include <linux/rbtree.h>
 #include <linux/zalloc.h>
 #include <linux/err.h>
+#include <linux/limits.h>
 #include <bpf/btf.h>
 #include <bpf/libbpf.h>
 #include <subcmd/parse-options.h>
@@ -124,6 +126,7 @@ struct object {
 
 	struct btf *btf;
 	struct btf *base_btf;
+	bool distill_base;
 
 	struct {
 		int		 fd;
@@ -324,42 +327,16 @@ static struct btf_id *add_symbol(struct rb_root *root, char *name, size_t size)
 	return btf_id__add(root, id, BTF_ID_KIND_SYM);
 }
 
-/* Older libelf.h and glibc elf.h might not yet define the ELF compression types. */
-#ifndef SHF_COMPRESSED
-#define SHF_COMPRESSED (1 << 11) /* Section with compressed data. */
-#endif
-
-/*
- * The data of compressed section should be aligned to 4
- * (for 32bit) or 8 (for 64 bit) bytes. The binutils ld
- * sets sh_addralign to 1, which makes libelf fail with
- * misaligned section error during the update:
- *    FAILED elf_update(WRITE): invalid section alignment
- *
- * While waiting for ld fix, we fix the compressed sections
- * sh_addralign value manualy.
- */
-static int compressed_section_fix(Elf *elf, Elf_Scn *scn, GElf_Shdr *sh)
+static void bswap_32_data(void *data, u32 nr_bytes)
 {
-	int expected = gelf_getclass(elf) == ELFCLASS32 ? 4 : 8;
-
-	if (!(sh->sh_flags & SHF_COMPRESSED))
-		return 0;
-
-	if (sh->sh_addralign == expected)
-		return 0;
+	u32 cnt, i;
+	u32 *ptr;
 
-	pr_debug2(" - fixing wrong alignment sh_addralign %u, expected %u\n",
-		  sh->sh_addralign, expected);
+	cnt = nr_bytes / sizeof(u32);
+	ptr = data;
 
-	sh->sh_addralign = expected;
-
-	if (gelf_update_shdr(scn, sh) == 0) {
-		pr_err("FAILED cannot update section header: %s\n",
-			elf_errmsg(-1));
-		return -1;
-	}
-	return 0;
+	for (i = 0; i < cnt; i++)
+		ptr[i] = bswap_32(ptr[i]);
 }
 
 static int elf_collect(struct object *obj)
@@ -380,7 +357,7 @@ static int elf_collect(struct object *obj)
 
 	elf_version(EV_CURRENT);
 
-	elf = elf_begin(fd, ELF_C_RDWR_MMAP, NULL);
+	elf = elf_begin(fd, ELF_C_READ_MMAP_PRIVATE, NULL);
 	if (!elf) {
 		close(fd);
 		pr_err("FAILED cannot create ELF descriptor: %s\n",
@@ -443,21 +420,20 @@ static int elf_collect(struct object *obj)
 			obj->efile.symbols_shndx = idx;
 			obj->efile.strtabidx     = sh.sh_link;
 		} else if (!strcmp(name, BTF_IDS_SECTION)) {
+			/*
+			 * If target endianness differs from host, we need to bswap32
+			 * the .BTF_ids section data on load, because .BTF_ids has
+			 * Elf_Type = ELF_T_BYTE, and so libelf returns data buffer in
+			 * the target endianness. We repeat this on dump.
+			 */
+			if (obj->efile.encoding != ELFDATANATIVE) {
+				pr_debug("bswap_32 .BTF_ids data from target to host endianness\n");
+				bswap_32_data(data->d_buf, data->d_size);
+			}
 			obj->efile.idlist       = data;
 			obj->efile.idlist_shndx = idx;
 			obj->efile.idlist_addr  = sh.sh_addr;
-		} else if (!strcmp(name, BTF_BASE_ELF_SEC)) {
-			/* If a .BTF.base section is found, do not resolve
-			 * BTF ids relative to vmlinux; resolve relative
-			 * to the .BTF.base section instead.  btf__parse_split()
-			 * will take care of this once the base BTF it is
-			 * passed is NULL.
-			 */
-			obj->base_btf_path = NULL;
 		}
-
-		if (compressed_section_fix(elf, scn, &sh))
-			return -1;
 	}
 
 	return 0;
@@ -587,11 +563,26 @@ static int load_btf(struct object *obj)
 	obj->base_btf = base_btf;
 	obj->btf = btf;
 
+	if (obj->base_btf && obj->distill_base) {
+		err = btf__distill_base(obj->btf, &base_btf, &btf);
+		if (err) {
+			pr_err("FAILED to distill base BTF: %s\n", strerror(errno));
+			goto out_err;
+		}
+
+		btf__free(obj->base_btf);
+		btf__free(obj->btf);
+		obj->base_btf = base_btf;
+		obj->btf = btf;
+	}
+
 	return 0;
 
 out_err:
 	btf__free(base_btf);
 	btf__free(btf);
+	obj->base_btf = NULL;
+	obj->btf = NULL;
 	return err;
 }
 
@@ -760,24 +751,6 @@ static int sets_patch(struct object *obj)
 			 */
 			BUILD_BUG_ON((u32 *)set8->pairs != &set8->pairs[0].id);
 			qsort(set8->pairs, set8->cnt, sizeof(set8->pairs[0]), cmp_id);
-
-			/*
-			 * When ELF endianness does not match endianness of the
-			 * host, libelf will do the translation when updating
-			 * the ELF. This, however, corrupts SET8 flags which are
-			 * already in the target endianness. So, let's bswap
-			 * them to the host endianness and libelf will then
-			 * correctly translate everything.
-			 */
-			if (obj->efile.encoding != ELFDATANATIVE) {
-				int i;
-
-				set8->flags = bswap_32(set8->flags);
-				for (i = 0; i < set8->cnt; i++) {
-					set8->pairs[i].flags =
-						bswap_32(set8->pairs[i].flags);
-				}
-			}
 			break;
 		default:
 			pr_err("Unexpected btf_id_kind %d for set '%s'\n", id->kind, id->name);
@@ -793,8 +766,6 @@ static int sets_patch(struct object *obj)
 
 static int symbols_patch(struct object *obj)
 {
-	off_t err;
-
 	if (__symbols_patch(obj, &obj->structs)  ||
 	    __symbols_patch(obj, &obj->unions)   ||
 	    __symbols_patch(obj, &obj->typedefs) ||
@@ -805,20 +776,90 @@ static int symbols_patch(struct object *obj)
 	if (sets_patch(obj))
 		return -1;
 
-	/* Set type to ensure endian translation occurs. */
-	obj->efile.idlist->d_type = ELF_T_WORD;
+	return 0;
+}
 
-	elf_flagdata(obj->efile.idlist, ELF_C_SET, ELF_F_DIRTY);
+static int dump_raw_data(const char *out_path, const void *data, u32 size)
+{
+	size_t written;
+	FILE *file;
 
-	err = elf_update(obj->efile.elf, ELF_C_WRITE);
-	if (err < 0) {
-		pr_err("FAILED elf_update(WRITE): %s\n",
-			elf_errmsg(-1));
+	file = fopen(out_path, "wb");
+	if (!file) {
+		pr_err("Couldn't open %s for writing\n", out_path);
+		return -1;
+	}
+
+	written = fwrite(data, 1, size, file);
+	if (written != size) {
+		pr_err("Failed to write data to %s\n", out_path);
+		fclose(file);
+		unlink(out_path);
+		return -1;
+	}
+
+	fclose(file);
+	pr_debug("Dumped %lu bytes of data to %s\n", size, out_path);
+
+	return 0;
+}
+
+static int dump_raw_btf_ids(struct object *obj, const char *out_path)
+{
+	Elf_Data *data = obj->efile.idlist;
+	int err;
+
+	if (!data || !data->d_buf) {
+		pr_debug("%s has no BTF_ids data to dump\n", obj->path);
+		return 0;
+	}
+
+	/*
+	 * If target endianness differs from host, we need to bswap32 the
+	 * .BTF_ids section data before dumping so that the output is in
+	 * target endianness.
+	 */
+	if (obj->efile.encoding != ELFDATANATIVE) {
+		pr_debug("bswap_32 .BTF_ids data from host to target endianness\n");
+		bswap_32_data(data->d_buf, data->d_size);
+	}
+
+	err = dump_raw_data(out_path, data->d_buf, data->d_size);
+	if (err)
+		return -1;
+
+	return 0;
+}
+
+static int dump_raw_btf(struct btf *btf, const char *out_path)
+{
+	const void *raw_btf_data;
+	u32 raw_btf_size;
+	int err;
+
+	raw_btf_data = btf__raw_data(btf, &raw_btf_size);
+	if (!raw_btf_data) {
+		pr_err("btf__raw_data() failed\n");
+		return -1;
 	}
 
-	pr_debug("update %s for %s\n",
-		 err >= 0 ? "ok" : "failed", obj->path);
-	return err < 0 ? -1 : 0;
+	err = dump_raw_data(out_path, raw_btf_data, raw_btf_size);
+	if (err)
+		return -1;
+
+	return 0;
+}
+
+static inline int make_out_path(char *buf, u32 buf_sz, const char *in_path, const char *suffix)
+{
+	int len = snprintf(buf, buf_sz, "%s%s", in_path, suffix);
+
+	if (len < 0 || len >= buf_sz) {
+		pr_err("Output path is too long: %s%s\n", in_path, suffix);
+		return -E2BIG;
+	}
+
+	return 0;
 }
 
 static const char * const resolve_btfids_usage[] = {
@@ -840,6 +881,8 @@ int main(int argc, const char **argv)
 		.sets     = RB_ROOT,
 	};
 	bool fatal_warnings = false;
+	char out_path[PATH_MAX];
+
 	struct option btfid_options[] = {
 		OPT_INCR('v', "verbose", &verbose,
 			 "be more verbose (show errors, etc)"),
@@ -849,6 +892,8 @@ int main(int argc, const char **argv)
 			   "path of file providing base BTF"),
 		OPT_BOOLEAN(0, "fatal_warnings", &fatal_warnings,
 			    "turn warnings into errors"),
+		OPT_BOOLEAN(0, "distill_base", &obj.distill_base,
+			    "distill --btf_base and emit .BTF.base section data"),
 		OPT_END()
 	};
 	int err = -1;
@@ -860,6 +905,9 @@ int main(int argc, const char **argv)
 
 	obj.path = argv[0];
 
+	if (load_btf(&obj))
+		goto out;
+
 	if (elf_collect(&obj))
 		goto out;
 
@@ -869,23 +917,37 @@ int main(int argc, const char **argv)
 	 */
 	if (obj.efile.idlist_shndx == -1 ||
 	    obj.efile.symbols_shndx == -1) {
-		pr_debug("Cannot find .BTF_ids or symbols sections, nothing to do\n");
-		err = 0;
-		goto out;
+		pr_debug("Cannot find .BTF_ids or symbols sections, skip symbols resolution\n");
+		goto dump_btf;
 	}
 
 	if (symbols_collect(&obj))
 		goto out;
 
-	if (load_btf(&obj))
-		goto out;
-
 	if (symbols_resolve(&obj))
 		goto out;
 
 	if (symbols_patch(&obj))
 		goto out;
 
+	err = make_out_path(out_path, sizeof(out_path), obj.path, BTF_IDS_SECTION);
+	err = err ?: dump_raw_btf_ids(&obj, out_path);
+	if (err)
+		goto out;
+
+dump_btf:
+	err = make_out_path(out_path, sizeof(out_path), obj.path, BTF_ELF_SEC);
+	err = err ?: dump_raw_btf(obj.btf, out_path);
+	if (err)
+		goto out;
+
+	if (obj.base_btf && obj.distill_base) {
+		err = make_out_path(out_path, sizeof(out_path), obj.path, BTF_BASE_ELF_SEC);
+		err = err ?: dump_raw_btf(obj.base_btf, out_path);
+		if (err)
+			goto out;
+	}
+
 	if (!(fatal_warnings && warnings))
 		err = 0;
 out:
diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore
index 19c1638e312a..b8bf51b7a0b0 100644
--- a/tools/testing/selftests/bpf/.gitignore
+++ b/tools/testing/selftests/bpf/.gitignore
@@ -45,3 +45,6 @@ xdp_synproxy
 xdp_hw_metadata
 xdp_features
 verification_cert.h
+*.BTF
+*.BTF_ids
+*.BTF.base
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index ffd0a4c354c7..f28a32b16ff0 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -4,6 +4,7 @@ include ../../../scripts/Makefile.arch
 include ../../../scripts/Makefile.include
 
 CXX ?= $(CROSS_COMPILE)g++
+OBJCOPY ?= $(CROSS_COMPILE)objcopy
 
 CURDIR := $(abspath .)
 TOOLSDIR := $(abspath ../../..)
@@ -653,9 +654,10 @@ $(TRUNNER_TEST_OBJS): $(TRUNNER_OUTPUT)/%.test.o:			\
 		      | $(TRUNNER_OUTPUT)/%.test.d
 	$$(call msg,TEST-OBJ,$(TRUNNER_BINARY),$$@)
 	$(Q)cd $$(@D) && $$(CC) -I. $$(CFLAGS) -MMD -MT $$@ -c $(CURDIR)/$$< $$(LDLIBS) -o $$(@F)
-	$$(if $$(TEST_NEEDS_BTFIDS),					\
-		$$(call msg,BTFIDS,$(TRUNNER_BINARY),$$@)		\
-		$(RESOLVE_BTFIDS) --btf $(TRUNNER_OUTPUT)/btf_data.bpf.o $$@)
+	$$(if $$(TEST_NEEDS_BTFIDS),						\
+		$$(call msg,BTFIDS,$(TRUNNER_BINARY),$$@)			\
+		$(RESOLVE_BTFIDS) --btf $(TRUNNER_OUTPUT)/btf_data.bpf.o $$@;	\
+		$(OBJCOPY) --update-section .BTF_ids=$$@.BTF_ids $$@)
 
 $(TRUNNER_TEST_OBJS:.o=.d): $(TRUNNER_OUTPUT)/%.test.d:			\
 			    $(TRUNNER_TESTS_DIR)/%.c			\
@@ -894,6 +896,7 @@ EXTRA_CLEAN := $(SCRATCH_DIR) $(HOST_SCRATCH_DIR)			\
 	prog_tests/tests.h map_tests/tests.h verifier/tests.h		\
 	feature bpftool $(TEST_KMOD_TARGETS)				\
 	$(addprefix $(OUTPUT)/,*.o *.d *.skel.h *.lskel.h *.subskel.h	\
+			       *.BTF *.BTF_ids *.BTF.base		\
 			       no_alu32 cpuv4 bpf_gcc			\
 			       liburandom_read.so)			\
 	$(OUTPUT)/FEATURE-DUMP.selftests				\
diff --git a/tools/testing/selftests/bpf/prog_tests/resolve_btfids.c b/tools/testing/selftests/bpf/prog_tests/resolve_btfids.c
index 51544372f52e..41dfaaabb73f 100644
--- a/tools/testing/selftests/bpf/prog_tests/resolve_btfids.c
+++ b/tools/testing/selftests/bpf/prog_tests/resolve_btfids.c
@@ -101,9 +101,9 @@ static int resolve_symbols(void)
 	int type_id;
 	__u32 nr;
 
-	btf = btf__parse_elf("btf_data.bpf.o", NULL);
+	btf = btf__parse_raw("resolve_btfids.test.o.BTF");
 	if (CHECK(libbpf_get_error(btf), "resolve",
-		  "Failed to load BTF from btf_data.bpf.o\n"))
+		  "Failed to load BTF from resolve_btfids.test.o.BTF\n"))
 		return -1;
 
 	nr = btf__type_cnt(btf);
-- 
cgit v1.2.3


From d2749ae85aec685e52e0474f445f6a8552363eb0 Mon Sep 17 00:00:00 2001
From: Matt Bobrowski <mattbobrowski@google.com>
Date: Tue, 16 Dec 2025 13:30:00 +0000
Subject: selftests/bpf: add test case for BPF LSM hook bpf_lsm_mmap_file

Add a trivial test case asserting that the BPF verifier enforces
PTR_MAYBE_NULL semantics on the struct file pointer argument of BPF
LSM hook bpf_lsm_mmap_file().

Dereferencing the struct file pointer passed into bpf_lsm_mmap_file()
without explicitly performing a NULL check first should not be
permitted by the BPF verifier as it can lead to NULL pointer
dereferences and a kernel crash.

Signed-off-by: Matt Bobrowski <mattbobrowski@google.com>
Acked-by: Song Liu <song@kernel.org>
Link: https://lore.kernel.org/r/20251216133000.3690723-2-mattbobrowski@google.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/progs/verifier_lsm.c | 31 +++++++++++++++++++++++-
 1 file changed, 30 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/progs/verifier_lsm.c b/tools/testing/selftests/bpf/progs/verifier_lsm.c
index 6af9100a37ff..38e8e9176862 100644
--- a/tools/testing/selftests/bpf/progs/verifier_lsm.c
+++ b/tools/testing/selftests/bpf/progs/verifier_lsm.c
@@ -1,7 +1,8 @@
 // SPDX-License-Identifier: GPL-2.0
 
-#include <linux/bpf.h>
+#include <vmlinux.h>
 #include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
 #include "bpf_misc.h"
 
 SEC("lsm/file_permission")
@@ -159,4 +160,32 @@ __naked int disabled_hook_test3(void *ctx)
 	::: __clobber_all);
 }
 
+SEC("lsm/mmap_file")
+__description("not null checking nullable pointer in bpf_lsm_mmap_file")
+__failure __msg("R1 invalid mem access 'trusted_ptr_or_null_'")
+int BPF_PROG(no_null_check, struct file *file)
+{
+	struct inode *inode;
+
+	inode = file->f_inode;
+	__sink(inode);
+
+	return 0;
+}
+
+SEC("lsm/mmap_file")
+__description("null checking nullable pointer in bpf_lsm_mmap_file")
+__success
+int BPF_PROG(null_check, struct file *file)
+{
+	struct inode *inode;
+
+	if (file) {
+		inode = file->f_inode;
+		__sink(inode);
+	}
+
+	return 0;
+}
+
 char _license[] SEC("license") = "GPL";
-- 
cgit v1.2.3


From 1045ec382c6019b63cab24428783749a1cecc439 Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Date: Tue, 16 Dec 2025 15:26:12 +0100
Subject: kernel-doc: add support for handling global variables

Specially on kAPI, sometimes it is desirable to be able to
describe global variables that are part of kAPI.

Documenting vars with Sphinx is simple, as we don't need
to parse a data struct. All we need is the variable
declaration and use native C domain ::c:var: to format it
for us.

Add support for it.

Link: https://lore.kernel.org/linux-doc/491c3022-cef8-4860-a945-c9c4a3b63c09@infradead.org/T/#m947c25d95cb1d96a394410ab1131dc8e9e5013f1
Suggested-by: Randy Dunlap <rdunlap@infradead.org>
Tested-by: Randy Dunlap <rdunlap@infradead.org>
Acked-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
Message-ID: <fa7d1c61a8de9150f71b318382f1507d3b13848d.1765894964.git.mchehab+huawei@kernel.org>
---
 tools/lib/python/kdoc/kdoc_output.py | 47 ++++++++++++++++++++++++++++++
 tools/lib/python/kdoc/kdoc_parser.py | 56 +++++++++++++++++++++++++++++++++++-
 2 files changed, 102 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/lib/python/kdoc/kdoc_output.py b/tools/lib/python/kdoc/kdoc_output.py
index b1aaa7fc3604..50aedbb3d6de 100644
--- a/tools/lib/python/kdoc/kdoc_output.py
+++ b/tools/lib/python/kdoc/kdoc_output.py
@@ -199,6 +199,10 @@ class OutputFormat:
             self.out_enum(fname, name, args)
             return self.data
 
+        if dtype == "var":
+            self.out_var(fname, name, args)
+            return self.data
+
         if dtype == "typedef":
             self.out_typedef(fname, name, args)
             return self.data
@@ -227,6 +231,9 @@ class OutputFormat:
     def out_enum(self, fname, name, args):
         """Outputs an enum"""
 
+    def out_var(self, fname, name, args):
+        """Outputs a variable"""
+
     def out_typedef(self, fname, name, args):
         """Outputs a typedef"""
 
@@ -472,6 +479,25 @@ class RestFormat(OutputFormat):
         self.lineprefix = oldprefix
         self.out_section(args)
 
+    def out_var(self, fname, name, args):
+        oldprefix = self.lineprefix
+        ln = args.declaration_start_line
+        full_proto = args.other_stuff["full_proto"]
+
+        self.lineprefix = "  "
+
+        self.data += f"\n\n.. c:macro:: {name}\n\n{self.lineprefix}{full_proto}\n\n"
+
+        self.print_lineno(ln)
+        self.output_highlight(args.get('purpose', ''))
+        self.data += "\n"
+
+        if args.other_stuff["default_val"]:
+            self.data += f'{self.lineprefix}**Initialization**\n\n'
+            self.output_highlight(f'default: ``{args.other_stuff["default_val"]}``')
+
+        self.out_section(args)
+
     def out_typedef(self, fname, name, args):
 
         oldprefix = self.lineprefix
@@ -773,6 +799,27 @@ class ManFormat(OutputFormat):
             self.data += f'.SH "{section}"' + "\n"
             self.output_highlight(text)
 
+    def out_var(self, fname, name, args):
+        out_name = self.arg_name(args, name)
+        prototype = args.other_stuff["var_type"]
+        full_proto = args.other_stuff["full_proto"]
+
+        self.data += f'.TH "{self.modulename}" 9 "{out_name}" "{self.man_date}" "API Manual" LINUX' + "\n"
+
+        self.data += ".SH NAME\n"
+        self.data += f"{prototype} \\- {args['purpose']}\n"
+
+        self.data += ".SH SYNOPSIS\n"
+        self.data += f"{full_proto}\n"
+
+        if args.other_stuff["default_val"]:
+            self.data += f'.SH "Initialization"' + "\n"
+            self.output_highlight(f'default: {args.other_stuff["default_val"]}')
+
+        for section, text in args.sections.items():
+            self.data += f'.SH "{section}"' + "\n"
+            self.output_highlight(text)
+
     def out_typedef(self, fname, name, args):
         module = self.modulename
         purpose = args.get('purpose')
diff --git a/tools/lib/python/kdoc/kdoc_parser.py b/tools/lib/python/kdoc/kdoc_parser.py
index 500aafc50032..06bed1a12a45 100644
--- a/tools/lib/python/kdoc/kdoc_parser.py
+++ b/tools/lib/python/kdoc/kdoc_parser.py
@@ -64,7 +64,7 @@ type_param = KernRe(r"@(\w*((\.\w+)|(->\w+))*(\.\.\.)?)", cache=False)
 # Tests for the beginning of a kerneldoc block in its various forms.
 #
 doc_block = doc_com + KernRe(r'DOC:\s*(.*)?', cache=False)
-doc_begin_data = KernRe(r"^\s*\*?\s*(struct|union|enum|typedef)\b\s*(\w*)", cache = False)
+doc_begin_data = KernRe(r"^\s*\*?\s*(struct|union|enum|typedef|var)\b\s*(\w*)", cache = False)
 doc_begin_func = KernRe(str(doc_com) +			# initial " * '
                         r"(?:\w+\s*\*\s*)?" + 		# type (not captured)
                         r'(?:define\s+)?' + 		# possible "define" (not captured)
@@ -927,6 +927,58 @@ class KernelDoc:
         self.output_declaration('enum', declaration_name,
                                 purpose=self.entry.declaration_purpose)
 
+    def dump_var(self, ln, proto):
+        """
+        Store variables that are part of kAPI.
+        """
+        VAR_ATTRIBS = [
+            "extern",
+        ]
+        OPTIONAL_VAR_ATTR = "^(?:" + "|".join(VAR_ATTRIBS) + ")?"
+
+        sub_prefixes = [
+            (KernRe(r"__read_mostly"), ""),
+            (KernRe(r"__ro_after_init"), ""),
+            (KernRe(r"(?://.*)$"), ""),
+            (KernRe(r"(?:/\*.*\*/)"), ""),
+            (KernRe(r";$"), ""),
+            (KernRe(r"=.*"), ""),
+        ]
+
+        #
+        # Store the full prototype before modifying it
+        #
+        full_proto = proto
+
+        #
+        # Drop comments and macros to have a pure C prototype
+        #
+        for search, sub in sub_prefixes:
+            proto = search.sub(sub, proto)
+
+        proto = proto.rstrip()
+
+        #
+        # Variable name is at the end of the declaration
+        #
+
+        r= KernRe(OPTIONAL_VAR_ATTR + r"\w.*\s+(?:\*+)?([\w_]+)\s*[\d\]\[]*\s*(=.*)?")
+        if not r.match(proto):
+           self.emit_msg(ln,f"{proto}: can't parse variable")
+           return
+
+        var_type = r.group(0)
+        declaration_name = r.group(1)
+        default_val = r.group(2)
+        if default_val:
+            default_val = default_val.lstrip("=").strip()
+
+        self.output_declaration("var", declaration_name,
+                                full_proto=full_proto,
+                                var_type=var_type,
+                                default_val=default_val,
+                                purpose=self.entry.declaration_purpose)
+
     def dump_declaration(self, ln, prototype):
         """
         Stores a data declaration inside self.entries array.
@@ -938,6 +990,8 @@ class KernelDoc:
             self.dump_typedef(ln, prototype)
         elif self.entry.decl_type in ["union", "struct"]:
             self.dump_struct(ln, prototype)
+        elif self.entry.decl_type == "var":
+            self.dump_var(ln, prototype)
         else:
             # This would be a bug
             self.emit_message(ln, f'Unknown declaration type: {self.entry.decl_type}')
-- 
cgit v1.2.3


From bdd1cf87847ff6aaadd53a185209d2bb2db72165 Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Date: Tue, 16 Dec 2025 15:26:13 +0100
Subject: kernel-doc: add support to handle DEFINE_ variables

Improve the parser and output plugin to work with macros,
adding support for the common pattern of using DEFINE_*
to create variables.

Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
Message-ID: <757a45100cfc493984574ff780aa9d90506eecb4.1765894964.git.mchehab+huawei@kernel.org>
---
 tools/lib/python/kdoc/kdoc_output.py |  5 ++---
 tools/lib/python/kdoc/kdoc_parser.py | 25 +++++++++++++++++++++----
 2 files changed, 23 insertions(+), 7 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/python/kdoc/kdoc_output.py b/tools/lib/python/kdoc/kdoc_output.py
index 50aedbb3d6de..d2bf94275d65 100644
--- a/tools/lib/python/kdoc/kdoc_output.py
+++ b/tools/lib/python/kdoc/kdoc_output.py
@@ -486,7 +486,7 @@ class RestFormat(OutputFormat):
 
         self.lineprefix = "  "
 
-        self.data += f"\n\n.. c:macro:: {name}\n\n{self.lineprefix}{full_proto}\n\n"
+        self.data += f"\n\n.. c:macro:: {name}\n\n{self.lineprefix}``{full_proto}``\n\n"
 
         self.print_lineno(ln)
         self.output_highlight(args.get('purpose', ''))
@@ -801,13 +801,12 @@ class ManFormat(OutputFormat):
 
     def out_var(self, fname, name, args):
         out_name = self.arg_name(args, name)
-        prototype = args.other_stuff["var_type"]
         full_proto = args.other_stuff["full_proto"]
 
         self.data += f'.TH "{self.modulename}" 9 "{out_name}" "{self.man_date}" "API Manual" LINUX' + "\n"
 
         self.data += ".SH NAME\n"
-        self.data += f"{prototype} \\- {args['purpose']}\n"
+        self.data += f"{name} \\- {args['purpose']}\n"
 
         self.data += ".SH SYNOPSIS\n"
         self.data += f"{full_proto}\n"
diff --git a/tools/lib/python/kdoc/kdoc_parser.py b/tools/lib/python/kdoc/kdoc_parser.py
index 06bed1a12a45..aaa352855717 100644
--- a/tools/lib/python/kdoc/kdoc_parser.py
+++ b/tools/lib/python/kdoc/kdoc_parser.py
@@ -949,12 +949,27 @@ class KernelDoc:
         # Store the full prototype before modifying it
         #
         full_proto = proto
+        declaration_name = None
+
+        #
+        # Handle macro definitions
+        #
+        macro_prefixes = [
+            KernRe(r"DEFINE_[\w_]+\s*\(([\w_]+)\)"),
+        ]
+
+        for r in macro_prefixes:
+            match = r.search(proto)
+            if match:
+                declaration_name = match.group(1)
+                break
 
         #
         # Drop comments and macros to have a pure C prototype
         #
-        for search, sub in sub_prefixes:
-            proto = search.sub(sub, proto)
+        if not declaration_name:
+            for r, sub in sub_prefixes:
+                proto = r.sub(sub, proto)
 
         proto = proto.rstrip()
 
@@ -968,14 +983,16 @@ class KernelDoc:
            return
 
         var_type = r.group(0)
-        declaration_name = r.group(1)
+
+        if not declaration_name:
+            declaration_name = r.group(1)
+
         default_val = r.group(2)
         if default_val:
             default_val = default_val.lstrip("=").strip()
 
         self.output_declaration("var", declaration_name,
                                 full_proto=full_proto,
-                                var_type=var_type,
                                 default_val=default_val,
                                 purpose=self.entry.declaration_purpose)
 
-- 
cgit v1.2.3


From aaacd70fb77afe75075e8bdf8e493b0af42eeabd Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Date: Tue, 16 Dec 2025 15:26:17 +0100
Subject: docs: kernel-doc.rst: Parse DEFINE_ macros without prefixes

Currently, the logic for vars require a
	type DEFINE_foo();

where type is usually "static".

Make the logic more generic.

Reported-by: Randy Dunlap <rdunlap@infradead.org>
Closes: https://lore.kernel.org/linux-doc/e1dad7e4-a0ca-4be6-a33c-97b75175c12f@infradead.org/
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
Message-ID: <be16e087cbc065fbd041fb6d6f8fa5cf0426cca5.1765894964.git.mchehab+huawei@kernel.org>
---
 tools/lib/python/kdoc/kdoc_parser.py | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/python/kdoc/kdoc_parser.py b/tools/lib/python/kdoc/kdoc_parser.py
index aaa352855717..e137bd9a7dac 100644
--- a/tools/lib/python/kdoc/kdoc_parser.py
+++ b/tools/lib/python/kdoc/kdoc_parser.py
@@ -977,17 +977,23 @@ class KernelDoc:
         # Variable name is at the end of the declaration
         #
 
+        default_val = None
+
         r= KernRe(OPTIONAL_VAR_ATTR + r"\w.*\s+(?:\*+)?([\w_]+)\s*[\d\]\[]*\s*(=.*)?")
-        if not r.match(proto):
-           self.emit_msg(ln,f"{proto}: can't parse variable")
-           return
+        if r.match(proto):
+            if not declaration_name:
+                declaration_name = r.group(1)
 
-        var_type = r.group(0)
+            default_val = r.group(2)
+        else:
+            r= KernRe(OPTIONAL_VAR_ATTR + r"(?:\w.*)?\s+(?:\*+)?(?:[\w_]+)\s*[\d\]\[]*\s*(=.*)?")
+        if r.match(proto):
+            default_val = r.group(1)
 
         if not declaration_name:
-            declaration_name = r.group(1)
+           self.emit_msg(ln,f"{proto}: can't parse variable")
+           return
 
-        default_val = r.group(2)
         if default_val:
             default_val = default_val.lstrip("=").strip()
 
-- 
cgit v1.2.3


From 9dbbd32ecd7bbfa6d3fa150bf8de8bba25e8dab2 Mon Sep 17 00:00:00 2001
From: Steven Price <steven.price@arm.com>
Date: Thu, 11 Dec 2025 10:48:49 +0000
Subject: kdoc: allow dots in inline @param names

Inline kernel-doc blocks failed to parse tags containing dots (e.g.
creator.process_name in panfrost_gem.h) because the @name regex only
matched word characters. Modify the single-line pattern to match
doc_inline_sect so it includes \. and parses the same as a multi-line
comment.

Signed-off-by: Steven Price <steven.price@arm.com>
Acked-by: Randy Dunlap <rdunlap@infradead.org>
Tested-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
Message-ID: <20251211104851.45330-1-steven.price@arm.com>
---
 tools/lib/python/kdoc/kdoc_parser.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/lib/python/kdoc/kdoc_parser.py b/tools/lib/python/kdoc/kdoc_parser.py
index e137bd9a7dac..a9a37519145d 100644
--- a/tools/lib/python/kdoc/kdoc_parser.py
+++ b/tools/lib/python/kdoc/kdoc_parser.py
@@ -53,7 +53,7 @@ doc_content = doc_com_body + KernRe(r'(.*)', cache=False)
 doc_inline_start = KernRe(r'^\s*/\*\*\s*$', cache=False)
 doc_inline_sect = KernRe(r'\s*\*\s*(@\s*[\w][\w\.]*\s*):(.*)', cache=False)
 doc_inline_end = KernRe(r'^\s*\*/\s*$', cache=False)
-doc_inline_oneline = KernRe(r'^\s*/\*\*\s*(@[\w\s]+):\s*(.*)\s*\*/\s*$', cache=False)
+doc_inline_oneline = KernRe(r'^\s*/\*\*\s*(@\s*[\w][\w\.]*\s*):\s*(.*)\s*\*/\s*$', cache=False)
 
 export_symbol = KernRe(r'^\s*EXPORT_SYMBOL(_GPL)?\s*\(\s*(\w+)\s*\)\s*', cache=False)
 export_symbol_ns = KernRe(r'^\s*EXPORT_SYMBOL_NS(_GPL)?\s*\(\s*(\w+)\s*,\s*"\S+"\)\s*', cache=False)
-- 
cgit v1.2.3


From 6bce6ddbe634bbc6d21672b5bfdbb5ad0409bd8d Mon Sep 17 00:00:00 2001
From: JP Kobryn <inwardvessel@gmail.com>
Date: Mon, 22 Dec 2025 20:41:55 -0800
Subject: bpf: selftests: selftests for memcg stat kfuncs

Add test coverage for the kfuncs that fetch memcg stats. Using some common
stats, test scenarios ensuring that the given stat increases by some
arbitrary amount. The stats selected cover the three categories represented
by the enums: node_stat_item, memcg_stat_item, vm_event_item.

Since only a subset of all stats are queried, use a static struct made up
of fields for each stat. Write to the struct with the fetched values when
the bpf program is invoked and read the fields in the user mode program for
verification.

Signed-off-by: JP Kobryn <inwardvessel@gmail.com>
Signed-off-by: Roman Gushchin <roman.gushchin@linux.dev>
Link: https://lore.kernel.org/r/20251223044156.208250-6-roman.gushchin@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/cgroup_iter_memcg.h    |  18 ++
 .../selftests/bpf/prog_tests/cgroup_iter_memcg.c   | 223 +++++++++++++++++++++
 .../selftests/bpf/progs/cgroup_iter_memcg.c        |  39 ++++
 3 files changed, 280 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/cgroup_iter_memcg.h
 create mode 100644 tools/testing/selftests/bpf/prog_tests/cgroup_iter_memcg.c
 create mode 100644 tools/testing/selftests/bpf/progs/cgroup_iter_memcg.c

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/cgroup_iter_memcg.h b/tools/testing/selftests/bpf/cgroup_iter_memcg.h
new file mode 100644
index 000000000000..3f59b127943b
--- /dev/null
+++ b/tools/testing/selftests/bpf/cgroup_iter_memcg.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */
+#ifndef __CGROUP_ITER_MEMCG_H
+#define __CGROUP_ITER_MEMCG_H
+
+struct memcg_query {
+	/* some node_stat_item's */
+	unsigned long nr_anon_mapped;
+	unsigned long nr_shmem;
+	unsigned long nr_file_pages;
+	unsigned long nr_file_mapped;
+	/* some memcg_stat_item */
+	unsigned long memcg_kmem;
+	/* some vm_event_item */
+	unsigned long pgfault;
+};
+
+#endif /* __CGROUP_ITER_MEMCG_H */
diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_iter_memcg.c b/tools/testing/selftests/bpf/prog_tests/cgroup_iter_memcg.c
new file mode 100644
index 000000000000..a5afd16705f0
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/cgroup_iter_memcg.c
@@ -0,0 +1,223 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */
+#include <test_progs.h>
+#include <bpf/libbpf.h>
+#include <bpf/btf.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <unistd.h>
+#include "cgroup_helpers.h"
+#include "cgroup_iter_memcg.h"
+#include "cgroup_iter_memcg.skel.h"
+
+static int read_stats(struct bpf_link *link)
+{
+	int fd, ret = 0;
+	ssize_t bytes;
+
+	fd = bpf_iter_create(bpf_link__fd(link));
+	if (!ASSERT_OK_FD(fd, "bpf_iter_create"))
+		return 1;
+
+	/*
+	 * Invoke iter program by reading from its fd. We're not expecting any
+	 * data to be written by the bpf program so the result should be zero.
+	 * Results will be read directly through the custom data section
+	 * accessible through skel->data_query.memcg_query.
+	 */
+	bytes = read(fd, NULL, 0);
+	if (!ASSERT_EQ(bytes, 0, "read fd"))
+		ret = 1;
+
+	close(fd);
+	return ret;
+}
+
+static void test_anon(struct bpf_link *link, struct memcg_query *memcg_query)
+{
+	void *map;
+	size_t len;
+
+	len = sysconf(_SC_PAGESIZE) * 1024;
+
+	/*
+	 * Increase memcg anon usage by mapping and writing
+	 * to a new anon region.
+	 */
+	map = mmap(NULL, len, PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+	if (!ASSERT_NEQ(map, MAP_FAILED, "mmap anon"))
+		return;
+
+	memset(map, 1, len);
+
+	if (!ASSERT_OK(read_stats(link), "read stats"))
+		goto cleanup;
+
+	ASSERT_GT(memcg_query->nr_anon_mapped, 0, "final anon mapped val");
+
+cleanup:
+	munmap(map, len);
+}
+
+static void test_file(struct bpf_link *link, struct memcg_query *memcg_query)
+{
+	void *map;
+	size_t len;
+	char *path;
+	int fd;
+
+	len = sysconf(_SC_PAGESIZE) * 1024;
+	path = "/tmp/test_cgroup_iter_memcg";
+
+	/*
+	 * Increase memcg file usage by creating and writing
+	 * to a mapped file.
+	 */
+	fd = open(path, O_CREAT | O_RDWR, 0644);
+	if (!ASSERT_OK_FD(fd, "open fd"))
+		return;
+	if (!ASSERT_OK(ftruncate(fd, len), "ftruncate"))
+		goto cleanup_fd;
+
+	map = mmap(NULL, len, PROT_WRITE, MAP_SHARED, fd, 0);
+	if (!ASSERT_NEQ(map, MAP_FAILED, "mmap file"))
+		goto cleanup_fd;
+
+	memset(map, 1, len);
+
+	if (!ASSERT_OK(read_stats(link), "read stats"))
+		goto cleanup_map;
+
+	ASSERT_GT(memcg_query->nr_file_pages, 0, "final file value");
+	ASSERT_GT(memcg_query->nr_file_mapped, 0, "final file mapped value");
+
+cleanup_map:
+	munmap(map, len);
+cleanup_fd:
+	close(fd);
+	unlink(path);
+}
+
+static void test_shmem(struct bpf_link *link, struct memcg_query *memcg_query)
+{
+	size_t len;
+	int fd;
+
+	len = sysconf(_SC_PAGESIZE) * 1024;
+
+	/*
+	 * Increase memcg shmem usage by creating and writing
+	 * to a shmem object.
+	 */
+	fd = shm_open("/tmp_shmem", O_CREAT | O_RDWR, 0644);
+	if (!ASSERT_OK_FD(fd, "shm_open"))
+		return;
+
+	if (!ASSERT_OK(fallocate(fd, 0, 0, len), "fallocate"))
+		goto cleanup;
+
+	if (!ASSERT_OK(read_stats(link), "read stats"))
+		goto cleanup;
+
+	ASSERT_GT(memcg_query->nr_shmem, 0, "final shmem value");
+
+cleanup:
+	close(fd);
+	shm_unlink("/tmp_shmem");
+}
+
+#define NR_PIPES 64
+static void test_kmem(struct bpf_link *link, struct memcg_query *memcg_query)
+{
+	int fds[NR_PIPES][2], i;
+
+	/*
+	 * Increase kmem value by creating pipes which will allocate some
+	 * kernel buffers.
+	 */
+	for (i = 0; i < NR_PIPES; i++) {
+		if (!ASSERT_OK(pipe(fds[i]), "pipe"))
+			goto cleanup;
+	}
+
+	if (!ASSERT_OK(read_stats(link), "read stats"))
+		goto cleanup;
+
+	ASSERT_GT(memcg_query->memcg_kmem, 0, "kmem value");
+
+cleanup:
+	for (i = i - 1; i >= 0; i--) {
+		close(fds[i][0]);
+		close(fds[i][1]);
+	}
+}
+
+static void test_pgfault(struct bpf_link *link, struct memcg_query *memcg_query)
+{
+	void *map;
+	size_t len;
+
+	len = sysconf(_SC_PAGESIZE) * 1024;
+
+	/* Create region to use for triggering a page fault. */
+	map = mmap(NULL, len, PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+	if (!ASSERT_NEQ(map, MAP_FAILED, "mmap anon"))
+		return;
+
+	/* Trigger page fault. */
+	memset(map, 1, len);
+
+	if (!ASSERT_OK(read_stats(link), "read stats"))
+		goto cleanup;
+
+	ASSERT_GT(memcg_query->pgfault, 0, "final pgfault val");
+
+cleanup:
+	munmap(map, len);
+}
+
+void test_cgroup_iter_memcg(void)
+{
+	char *cgroup_rel_path = "/cgroup_iter_memcg_test";
+	struct cgroup_iter_memcg *skel;
+	struct bpf_link *link;
+	int cgroup_fd;
+
+	cgroup_fd = cgroup_setup_and_join(cgroup_rel_path);
+	if (!ASSERT_OK_FD(cgroup_fd, "cgroup_setup_and_join"))
+		return;
+
+	skel = cgroup_iter_memcg__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "cgroup_iter_memcg__open_and_load"))
+		goto cleanup_cgroup_fd;
+
+	DECLARE_LIBBPF_OPTS(bpf_iter_attach_opts, opts);
+	union bpf_iter_link_info linfo = {
+		.cgroup.cgroup_fd = cgroup_fd,
+		.cgroup.order = BPF_CGROUP_ITER_SELF_ONLY,
+	};
+	opts.link_info = &linfo;
+	opts.link_info_len = sizeof(linfo);
+
+	link = bpf_program__attach_iter(skel->progs.cgroup_memcg_query, &opts);
+	if (!ASSERT_OK_PTR(link, "bpf_program__attach_iter"))
+		goto cleanup_skel;
+
+	if (test__start_subtest("cgroup_iter_memcg__anon"))
+		test_anon(link, &skel->data_query->memcg_query);
+	if (test__start_subtest("cgroup_iter_memcg__shmem"))
+		test_shmem(link, &skel->data_query->memcg_query);
+	if (test__start_subtest("cgroup_iter_memcg__file"))
+		test_file(link, &skel->data_query->memcg_query);
+	if (test__start_subtest("cgroup_iter_memcg__kmem"))
+		test_kmem(link, &skel->data_query->memcg_query);
+	if (test__start_subtest("cgroup_iter_memcg__pgfault"))
+		test_pgfault(link, &skel->data_query->memcg_query);
+
+	bpf_link__destroy(link);
+cleanup_skel:
+	cgroup_iter_memcg__destroy(skel);
+cleanup_cgroup_fd:
+	close(cgroup_fd);
+	cleanup_cgroup_environment();
+}
diff --git a/tools/testing/selftests/bpf/progs/cgroup_iter_memcg.c b/tools/testing/selftests/bpf/progs/cgroup_iter_memcg.c
new file mode 100644
index 000000000000..59fb70a3cc50
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/cgroup_iter_memcg.c
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */
+#include <vmlinux.h>
+#include <bpf/bpf_core_read.h>
+#include "cgroup_iter_memcg.h"
+
+char _license[] SEC("license") = "GPL";
+
+/* The latest values read are stored here. */
+struct memcg_query memcg_query SEC(".data.query");
+
+SEC("iter.s/cgroup")
+int cgroup_memcg_query(struct bpf_iter__cgroup *ctx)
+{
+	struct cgroup *cgrp = ctx->cgroup;
+	struct cgroup_subsys_state *css;
+	struct mem_cgroup *memcg;
+
+	if (!cgrp)
+		return 1;
+
+	css = &cgrp->self;
+	memcg = bpf_get_mem_cgroup(css);
+	if (!memcg)
+		return 1;
+
+	bpf_mem_cgroup_flush_stats(memcg);
+
+	memcg_query.nr_anon_mapped = bpf_mem_cgroup_page_state(memcg, NR_ANON_MAPPED);
+	memcg_query.nr_shmem = bpf_mem_cgroup_page_state(memcg, NR_SHMEM);
+	memcg_query.nr_file_pages = bpf_mem_cgroup_page_state(memcg, NR_FILE_PAGES);
+	memcg_query.nr_file_mapped = bpf_mem_cgroup_page_state(memcg, NR_FILE_MAPPED);
+	memcg_query.memcg_kmem = bpf_mem_cgroup_page_state(memcg, MEMCG_KMEM);
+	memcg_query.pgfault = bpf_mem_cgroup_vm_events(memcg, PGFAULT);
+
+	bpf_put_mem_cgroup(memcg);
+
+	return 0;
+}
-- 
cgit v1.2.3


From 83dd46ecb68ecc03cff23e68490ded5d40d79f66 Mon Sep 17 00:00:00 2001
From: Puranjay Mohan <puranjay@kernel.org>
Date: Mon, 22 Dec 2025 05:32:46 -0800
Subject: selftests: bpf: fix tests with raw_tp calling kfuncs

As the previous commit allowed raw_tp programs to call kfuncs, so of the
selftests that were expected to fail will now succeed.

Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
Acked-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/r/20251222133250.1890587-3-puranjay@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/progs/dynptr_fail.c               | 2 +-
 tools/testing/selftests/bpf/progs/verifier_kfunc_prog_types.c | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/progs/dynptr_fail.c b/tools/testing/selftests/bpf/progs/dynptr_fail.c
index dda6a8dada82..8f2ae9640886 100644
--- a/tools/testing/selftests/bpf/progs/dynptr_fail.c
+++ b/tools/testing/selftests/bpf/progs/dynptr_fail.c
@@ -1465,7 +1465,7 @@ int xdp_invalid_data_slice2(struct xdp_md *xdp)
 }
 
 /* Only supported prog type can create skb-type dynptrs */
-SEC("?raw_tp")
+SEC("?xdp")
 __failure __msg("calling kernel function bpf_dynptr_from_skb is not allowed")
 int skb_invalid_ctx(void *ctx)
 {
diff --git a/tools/testing/selftests/bpf/progs/verifier_kfunc_prog_types.c b/tools/testing/selftests/bpf/progs/verifier_kfunc_prog_types.c
index a509cad97e69..1fce7a7e8d03 100644
--- a/tools/testing/selftests/bpf/progs/verifier_kfunc_prog_types.c
+++ b/tools/testing/selftests/bpf/progs/verifier_kfunc_prog_types.c
@@ -32,7 +32,7 @@ static void task_kfunc_load_test(void)
 }
 
 SEC("raw_tp")
-__failure __msg("calling kernel function")
+__success
 int BPF_PROG(task_kfunc_raw_tp)
 {
 	task_kfunc_load_test();
@@ -86,7 +86,7 @@ static void cgrp_kfunc_load_test(void)
 }
 
 SEC("raw_tp")
-__failure __msg("calling kernel function")
+__success
 int BPF_PROG(cgrp_kfunc_raw_tp)
 {
 	cgrp_kfunc_load_test();
@@ -138,7 +138,7 @@ static void cpumask_kfunc_load_test(void)
 }
 
 SEC("raw_tp")
-__failure __msg("calling kernel function")
+__success
 int BPF_PROG(cpumask_kfunc_raw_tp)
 {
 	cpumask_kfunc_load_test();
-- 
cgit v1.2.3


From efecc9e825f4aa3fe616236152604a066a3e776d Mon Sep 17 00:00:00 2001
From: Puranjay Mohan <puranjay@kernel.org>
Date: Mon, 22 Dec 2025 11:50:19 -0800
Subject: selftests: bpf: test non-sleepable arena allocations

As arena kfuncs can now be called from non-sleepable contexts, test this
by adding non-sleepable copies of tests in verifier_arena, this is done
by using a socket program instead of syscall.

Add a new test case in verifier_arena_large to check that the
bpf_arena_alloc_pages() works for more than 1024 pages.
1024 * sizeof(struct page *) is the upper limit of kmalloc_nolock() but
bpf_arena_alloc_pages() should still succeed because it re-uses this
array in a loop.

Augment the arena_list selftest to also run in non-sleepable context by
taking rcu_read_lock.

Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
Link: https://lore.kernel.org/r/20251222195022.431211-5-puranjay@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 .../testing/selftests/bpf/prog_tests/arena_list.c  |  20 ++-
 tools/testing/selftests/bpf/progs/arena_list.c     |  11 ++
 tools/testing/selftests/bpf/progs/verifier_arena.c | 185 +++++++++++++++++++++
 .../selftests/bpf/progs/verifier_arena_large.c     |  29 ++++
 4 files changed, 240 insertions(+), 5 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/arena_list.c b/tools/testing/selftests/bpf/prog_tests/arena_list.c
index d15867cddde0..4f2866a615ce 100644
--- a/tools/testing/selftests/bpf/prog_tests/arena_list.c
+++ b/tools/testing/selftests/bpf/prog_tests/arena_list.c
@@ -27,17 +27,23 @@ static int list_sum(struct arena_list_head *head)
 	return sum;
 }
 
-static void test_arena_list_add_del(int cnt)
+static void test_arena_list_add_del(int cnt, bool nonsleepable)
 {
 	LIBBPF_OPTS(bpf_test_run_opts, opts);
 	struct arena_list *skel;
 	int expected_sum = (u64)cnt * (cnt - 1) / 2;
 	int ret, sum;
 
-	skel = arena_list__open_and_load();
-	if (!ASSERT_OK_PTR(skel, "arena_list__open_and_load"))
+	skel = arena_list__open();
+	if (!ASSERT_OK_PTR(skel, "arena_list__open"))
 		return;
 
+	skel->rodata->nonsleepable = nonsleepable;
+
+	ret = arena_list__load(skel);
+	if (!ASSERT_OK(ret, "arena_list__load"))
+		goto out;
+
 	skel->bss->cnt = cnt;
 	ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.arena_list_add), &opts);
 	ASSERT_OK(ret, "ret_add");
@@ -65,7 +71,11 @@ out:
 void test_arena_list(void)
 {
 	if (test__start_subtest("arena_list_1"))
-		test_arena_list_add_del(1);
+		test_arena_list_add_del(1, false);
 	if (test__start_subtest("arena_list_1000"))
-		test_arena_list_add_del(1000);
+		test_arena_list_add_del(1000, false);
+	if (test__start_subtest("arena_list_1_nonsleepable"))
+		test_arena_list_add_del(1, true);
+	if (test__start_subtest("arena_list_1000_nonsleepable"))
+		test_arena_list_add_del(1000, true);
 }
diff --git a/tools/testing/selftests/bpf/progs/arena_list.c b/tools/testing/selftests/bpf/progs/arena_list.c
index 3a2ddcacbea6..235d8cc95bdd 100644
--- a/tools/testing/selftests/bpf/progs/arena_list.c
+++ b/tools/testing/selftests/bpf/progs/arena_list.c
@@ -30,6 +30,7 @@ struct arena_list_head __arena *list_head;
 int list_sum;
 int cnt;
 bool skip = false;
+const volatile bool nonsleepable = false;
 
 #ifdef __BPF_FEATURE_ADDR_SPACE_CAST
 long __arena arena_sum;
@@ -42,6 +43,9 @@ int test_val SEC(".addr_space.1");
 
 int zero;
 
+void bpf_rcu_read_lock(void) __ksym;
+void bpf_rcu_read_unlock(void) __ksym;
+
 SEC("syscall")
 int arena_list_add(void *ctx)
 {
@@ -71,6 +75,10 @@ int arena_list_del(void *ctx)
 	struct elem __arena *n;
 	int sum = 0;
 
+	/* Take rcu_read_lock to test non-sleepable context */
+	if (nonsleepable)
+		bpf_rcu_read_lock();
+
 	arena_sum = 0;
 	list_for_each_entry(n, list_head, node) {
 		sum += n->value;
@@ -79,6 +87,9 @@ int arena_list_del(void *ctx)
 		bpf_free(n);
 	}
 	list_sum = sum;
+
+	if (nonsleepable)
+		bpf_rcu_read_unlock();
 #else
 	skip = true;
 #endif
diff --git a/tools/testing/selftests/bpf/progs/verifier_arena.c b/tools/testing/selftests/bpf/progs/verifier_arena.c
index 7f4827eede3c..4a9d96344813 100644
--- a/tools/testing/selftests/bpf/progs/verifier_arena.c
+++ b/tools/testing/selftests/bpf/progs/verifier_arena.c
@@ -21,6 +21,37 @@ struct {
 #endif
 } arena SEC(".maps");
 
+SEC("socket")
+__success __retval(0)
+int basic_alloc1_nosleep(void *ctx)
+{
+#if defined(__BPF_FEATURE_ADDR_SPACE_CAST)
+	volatile int __arena *page1, *page2, *no_page;
+
+	page1 = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0);
+	if (!page1)
+		return 1;
+	*page1 = 1;
+	page2 = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0);
+	if (!page2)
+		return 2;
+	*page2 = 2;
+	no_page = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0);
+	if (no_page)
+		return 3;
+	if (*page1 != 1)
+		return 4;
+	if (*page2 != 2)
+		return 5;
+	bpf_arena_free_pages(&arena, (void __arena *)page2, 1);
+	if (*page1 != 1)
+		return 6;
+	if (*page2 != 0 && *page2 != 2) /* use-after-free should return 0 or the stored value */
+		return 7;
+#endif
+	return 0;
+}
+
 SEC("syscall")
 __success __retval(0)
 int basic_alloc1(void *ctx)
@@ -60,6 +91,44 @@ int basic_alloc1(void *ctx)
 	return 0;
 }
 
+SEC("socket")
+__success __retval(0)
+int basic_alloc2_nosleep(void *ctx)
+{
+#if defined(__BPF_FEATURE_ADDR_SPACE_CAST)
+	volatile char __arena *page1, *page2, *page3, *page4;
+
+	page1 = bpf_arena_alloc_pages(&arena, NULL, 2, NUMA_NO_NODE, 0);
+	if (!page1)
+		return 1;
+	page2 = page1 + __PAGE_SIZE;
+	page3 = page1 + __PAGE_SIZE * 2;
+	page4 = page1 - __PAGE_SIZE;
+	*page1 = 1;
+	*page2 = 2;
+	*page3 = 3;
+	*page4 = 4;
+	if (*page1 != 1)
+		return 1;
+	if (*page2 != 2)
+		return 2;
+	if (*page3 != 0)
+		return 3;
+	if (*page4 != 0)
+		return 4;
+	bpf_arena_free_pages(&arena, (void __arena *)page1, 2);
+	if (*page1 != 0 && *page1 != 1)
+		return 5;
+	if (*page2 != 0 && *page2 != 2)
+		return 6;
+	if (*page3 != 0)
+		return 7;
+	if (*page4 != 0)
+		return 8;
+#endif
+	return 0;
+}
+
 SEC("syscall")
 __success __retval(0)
 int basic_alloc2(void *ctx)
@@ -102,6 +171,19 @@ struct bpf_arena___l {
         struct bpf_map map;
 } __attribute__((preserve_access_index));
 
+SEC("socket")
+__success __retval(0) __log_level(2)
+int basic_alloc3_nosleep(void *ctx)
+{
+	struct bpf_arena___l *ar = (struct bpf_arena___l *)&arena;
+	volatile char __arena *pages;
+
+	pages = bpf_arena_alloc_pages(&ar->map, NULL, ar->map.max_entries, NUMA_NO_NODE, 0);
+	if (!pages)
+		return 1;
+	return 0;
+}
+
 SEC("syscall")
 __success __retval(0) __log_level(2)
 int basic_alloc3(void *ctx)
@@ -115,6 +197,38 @@ int basic_alloc3(void *ctx)
 	return 0;
 }
 
+SEC("socket")
+__success __retval(0)
+int basic_reserve1_nosleep(void *ctx)
+{
+#if defined(__BPF_FEATURE_ADDR_SPACE_CAST)
+	char __arena *page;
+	int ret;
+
+	page = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0);
+	if (!page)
+		return 1;
+
+	page += __PAGE_SIZE;
+
+	/* Reserve the second page */
+	ret = bpf_arena_reserve_pages(&arena, page, 1);
+	if (ret)
+		return 2;
+
+	/* Try to explicitly allocate the reserved page. */
+	page = bpf_arena_alloc_pages(&arena, page, 1, NUMA_NO_NODE, 0);
+	if (page)
+		return 3;
+
+	/* Try to implicitly allocate the page (since there's only 2 of them). */
+	page = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0);
+	if (page)
+		return 4;
+#endif
+	return 0;
+}
+
 SEC("syscall")
 __success __retval(0)
 int basic_reserve1(void *ctx)
@@ -147,6 +261,26 @@ int basic_reserve1(void *ctx)
 	return 0;
 }
 
+SEC("socket")
+__success __retval(0)
+int basic_reserve2_nosleep(void *ctx)
+{
+#if defined(__BPF_FEATURE_ADDR_SPACE_CAST)
+	char __arena *page;
+	int ret;
+
+	page = arena_base(&arena);
+	ret = bpf_arena_reserve_pages(&arena, page, 1);
+	if (ret)
+		return 1;
+
+	page = bpf_arena_alloc_pages(&arena, page, 1, NUMA_NO_NODE, 0);
+	if ((u64)page)
+		return 2;
+#endif
+	return 0;
+}
+
 SEC("syscall")
 __success __retval(0)
 int basic_reserve2(void *ctx)
@@ -168,6 +302,27 @@ int basic_reserve2(void *ctx)
 }
 
 /* Reserve the same page twice, should return -EBUSY. */
+SEC("socket")
+__success __retval(0)
+int reserve_twice_nosleep(void *ctx)
+{
+#if defined(__BPF_FEATURE_ADDR_SPACE_CAST)
+	char __arena *page;
+	int ret;
+
+	page = arena_base(&arena);
+
+	ret = bpf_arena_reserve_pages(&arena, page, 1);
+	if (ret)
+		return 1;
+
+	ret = bpf_arena_reserve_pages(&arena, page, 1);
+	if (ret != -EBUSY)
+		return 2;
+#endif
+	return 0;
+}
+
 SEC("syscall")
 __success __retval(0)
 int reserve_twice(void *ctx)
@@ -190,6 +345,36 @@ int reserve_twice(void *ctx)
 }
 
 /* Try to reserve past the end of the arena. */
+SEC("socket")
+__success __retval(0)
+int reserve_invalid_region_nosleep(void *ctx)
+{
+#if defined(__BPF_FEATURE_ADDR_SPACE_CAST)
+	char __arena *page;
+	int ret;
+
+	/* Try a NULL pointer. */
+	ret = bpf_arena_reserve_pages(&arena, NULL, 3);
+	if (ret != -EINVAL)
+		return 1;
+
+	page = arena_base(&arena);
+
+	ret = bpf_arena_reserve_pages(&arena, page, 3);
+	if (ret != -EINVAL)
+		return 2;
+
+	ret = bpf_arena_reserve_pages(&arena, page, 4096);
+	if (ret != -EINVAL)
+		return 3;
+
+	ret = bpf_arena_reserve_pages(&arena, page, (1ULL << 32) - 1);
+	if (ret != -EINVAL)
+		return 4;
+#endif
+	return 0;
+}
+
 SEC("syscall")
 __success __retval(0)
 int reserve_invalid_region(void *ctx)
diff --git a/tools/testing/selftests/bpf/progs/verifier_arena_large.c b/tools/testing/selftests/bpf/progs/verifier_arena_large.c
index 2b8cf2a4d880..4ca491cbe8d1 100644
--- a/tools/testing/selftests/bpf/progs/verifier_arena_large.c
+++ b/tools/testing/selftests/bpf/progs/verifier_arena_large.c
@@ -283,5 +283,34 @@ int big_alloc2(void *ctx)
 		return 9;
 	return 0;
 }
+
+SEC("socket")
+__success __retval(0)
+int big_alloc3(void *ctx)
+{
+#if defined(__BPF_FEATURE_ADDR_SPACE_CAST)
+	char __arena *pages;
+	u64 i;
+
+	/*
+	 * Allocate 2051 pages in one go to check how kmalloc_nolock() handles large requests.
+	 * Since kmalloc_nolock() can allocate up to 1024 struct page * at a time, this call should
+	 * result in three batches: two batches of 1024 pages each, followed by a final batch of 3
+	 * pages.
+	 */
+	pages = bpf_arena_alloc_pages(&arena, NULL, 2051, NUMA_NO_NODE, 0);
+	if (!pages)
+		return -1;
+
+	bpf_for(i, 0, 2051)
+			pages[i * PAGE_SIZE] = 123;
+	bpf_for(i, 0, 2051)
+			if (pages[i * PAGE_SIZE] != 123)
+				return i;
+
+	bpf_arena_free_pages(&arena, pages, 2051);
+#endif
+	return 0;
+}
 #endif
 char _license[] SEC("license") = "GPL";
-- 
cgit v1.2.3


From bd09d9a05cf04028f639e209b416bacaeffd4909 Mon Sep 17 00:00:00 2001
From: Matthieu Buffet <matthieu@buffet.re>
Date: Mon, 27 Oct 2025 20:07:24 +0100
Subject: selftests/landlock: Fix TCP bind(AF_UNSPEC) test case
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The nominal error code for bind(AF_UNSPEC) on an IPv6 socket
is -EAFNOSUPPORT, not -EINVAL. -EINVAL is only returned when
the supplied address struct is too short, which happens to be
the case in current selftests because they treat AF_UNSPEC
like IPv4 sockets do: as an alias for AF_INET (which is a
16-byte struct instead of the 24 bytes required by IPv6
sockets).

Make the union large enough for any address (by adding struct
sockaddr_storage to the union), and make AF_UNSPEC addresses
large enough for any family.

Test for -EAFNOSUPPORT instead, and add a dedicated test case
for truncated inputs with -EINVAL.

Fixes: a549d055a22e ("selftests/landlock: Add network tests")
Signed-off-by: Matthieu Buffet <matthieu@buffet.re>
Link: https://lore.kernel.org/r/20251027190726.626244-2-matthieu@buffet.re
Signed-off-by: Mickaël Salaün <mic@digikod.net>
---
 tools/testing/selftests/landlock/common.h   |  1 +
 tools/testing/selftests/landlock/net_test.c | 16 +++++++++++++++-
 2 files changed, 16 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/landlock/common.h b/tools/testing/selftests/landlock/common.h
index 230b75f6015b..90551650299c 100644
--- a/tools/testing/selftests/landlock/common.h
+++ b/tools/testing/selftests/landlock/common.h
@@ -237,6 +237,7 @@ struct service_fixture {
 			struct sockaddr_un unix_addr;
 			socklen_t unix_addr_len;
 		};
+		struct sockaddr_storage _largest;
 	};
 };
 
diff --git a/tools/testing/selftests/landlock/net_test.c b/tools/testing/selftests/landlock/net_test.c
index 2a45208551e6..3bbc0508420b 100644
--- a/tools/testing/selftests/landlock/net_test.c
+++ b/tools/testing/selftests/landlock/net_test.c
@@ -121,6 +121,10 @@ static socklen_t get_addrlen(const struct service_fixture *const srv,
 {
 	switch (srv->protocol.domain) {
 	case AF_UNSPEC:
+		if (minimal)
+			return sizeof(sa_family_t);
+		return sizeof(struct sockaddr_storage);
+
 	case AF_INET:
 		return sizeof(srv->ipv4_addr);
 
@@ -758,6 +762,11 @@ TEST_F(protocol, bind_unspec)
 	bind_fd = socket_variant(&self->srv0);
 	ASSERT_LE(0, bind_fd);
 
+	/* Tries to bind with too small addrlen. */
+	EXPECT_EQ(-EINVAL, bind_variant_addrlen(
+				   bind_fd, &self->unspec_any0,
+				   get_addrlen(&self->unspec_any0, true) - 1));
+
 	/* Allowed bind on AF_UNSPEC/INADDR_ANY. */
 	ret = bind_variant(bind_fd, &self->unspec_any0);
 	if (variant->prot.domain == AF_INET) {
@@ -766,6 +775,8 @@ TEST_F(protocol, bind_unspec)
 			TH_LOG("Failed to bind to unspec/any socket: %s",
 			       strerror(errno));
 		}
+	} else if (variant->prot.domain == AF_INET6) {
+		EXPECT_EQ(-EAFNOSUPPORT, ret);
 	} else {
 		EXPECT_EQ(-EINVAL, ret);
 	}
@@ -792,6 +803,8 @@ TEST_F(protocol, bind_unspec)
 		} else {
 			EXPECT_EQ(0, ret);
 		}
+	} else if (variant->prot.domain == AF_INET6) {
+		EXPECT_EQ(-EAFNOSUPPORT, ret);
 	} else {
 		EXPECT_EQ(-EINVAL, ret);
 	}
@@ -801,7 +814,8 @@ TEST_F(protocol, bind_unspec)
 	bind_fd = socket_variant(&self->srv0);
 	ASSERT_LE(0, bind_fd);
 	ret = bind_variant(bind_fd, &self->unspec_srv0);
-	if (variant->prot.domain == AF_INET) {
+	if (variant->prot.domain == AF_INET ||
+	    variant->prot.domain == AF_INET6) {
 		EXPECT_EQ(-EAFNOSUPPORT, ret);
 	} else {
 		EXPECT_EQ(-EINVAL, ret)
-- 
cgit v1.2.3


From 6685201ebfacff0c889bcd569181fa6e8af5575e Mon Sep 17 00:00:00 2001
From: Matthieu Buffet <matthieu@buffet.re>
Date: Mon, 27 Oct 2025 20:07:25 +0100
Subject: selftests/landlock: Add missing connect(minimal AF_UNSPEC) test
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

connect_variant(unspec_any0) is called twice. Both calls end
up in connect_variant_addrlen() with an address length of
get_addrlen(minimal=false).
However, the connect() syscall and its variants (e.g.
iouring/compat) accept much shorter addresses of 4 bytes
and that behaviour was not tested.

Replace one of these calls with one using a minimal address
length (just a bare sa_family=AF_UNSPEC field with no actual
address). Also add a call using a truncated address for good
measure.

Signed-off-by: Matthieu Buffet <matthieu@buffet.re>
Link: https://lore.kernel.org/r/20251027190726.626244-3-matthieu@buffet.re
Signed-off-by: Mickaël Salaün <mic@digikod.net>
---
 tools/testing/selftests/landlock/net_test.c | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/landlock/net_test.c b/tools/testing/selftests/landlock/net_test.c
index 3bbc0508420b..b34b139b3f89 100644
--- a/tools/testing/selftests/landlock/net_test.c
+++ b/tools/testing/selftests/landlock/net_test.c
@@ -906,7 +906,19 @@ TEST_F(protocol, connect_unspec)
 			EXPECT_EQ(0, close(ruleset_fd));
 		}
 
-		ret = connect_variant(connect_fd, &self->unspec_any0);
+		/* Try to re-disconnect with a truncated address struct. */
+		EXPECT_EQ(-EINVAL,
+			  connect_variant_addrlen(
+				  connect_fd, &self->unspec_any0,
+				  get_addrlen(&self->unspec_any0, true) - 1));
+
+		/*
+		 * Re-disconnect, with a minimal sockaddr struct (just a
+		 * bare af_family=AF_UNSPEC field).
+		 */
+		ret = connect_variant_addrlen(connect_fd, &self->unspec_any0,
+					      get_addrlen(&self->unspec_any0,
+							  true));
 		if (self->srv0.protocol.domain == AF_UNIX &&
 		    self->srv0.protocol.type == SOCK_STREAM) {
 			EXPECT_EQ(-EINVAL, ret);
-- 
cgit v1.2.3


From e1a57c33590a50a6639798e60a597af4a23b0340 Mon Sep 17 00:00:00 2001
From: Matthieu Buffet <matthieu@buffet.re>
Date: Mon, 1 Dec 2025 01:36:31 +0100
Subject: selftests/landlock: Remove invalid unix socket bind()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Remove bind() call on a client socket that doesn't make sense.
Since strlen(cli_un.sun_path) returns a random value depending on stack
garbage, that many uninitialized bytes are read from the stack as an
unix socket address. This creates random test failures due to the bind
address being invalid or already in use if the same stack value comes up
twice.

Fixes: f83d51a5bdfe ("selftests/landlock: Check IOCTL restrictions for named UNIX domain sockets")
Signed-off-by: Matthieu Buffet <matthieu@buffet.re>
Reviewed-by: Günther Noack <gnoack@google.com>
Link: https://lore.kernel.org/r/20251201003631.190817-1-matthieu@buffet.re
Signed-off-by: Mickaël Salaün <mic@digikod.net>
---
 tools/testing/selftests/landlock/fs_test.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/landlock/fs_test.c b/tools/testing/selftests/landlock/fs_test.c
index eee814e09dd7..7d378bdf3bce 100644
--- a/tools/testing/selftests/landlock/fs_test.c
+++ b/tools/testing/selftests/landlock/fs_test.c
@@ -4391,9 +4391,6 @@ TEST_F_FORK(layout1, named_unix_domain_socket_ioctl)
 	cli_fd = socket(AF_UNIX, SOCK_STREAM, 0);
 	ASSERT_LE(0, cli_fd);
 
-	size = offsetof(struct sockaddr_un, sun_path) + strlen(cli_un.sun_path);
-	ASSERT_EQ(0, bind(cli_fd, (struct sockaddr *)&cli_un, size));
-
 	bzero(&cli_un, sizeof(cli_un));
 	cli_un.sun_family = AF_UNIX;
 	strncpy(cli_un.sun_path, path, sizeof(cli_un.sun_path));
-- 
cgit v1.2.3


From e4aa4461d4acb922ef45785581232f0588a6eea8 Mon Sep 17 00:00:00 2001
From: Matthieu Buffet <matthieu@buffet.re>
Date: Tue, 2 Dec 2025 22:51:41 +0100
Subject: selftests/landlock: NULL-terminate unix pathname addresses
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The size of Unix pathname addresses is computed in selftests using
offsetof(struct sockaddr_un, sun_path) + strlen(xxx). It should have
been that +1, which makes addresses passed to the libc and kernel
non-NULL-terminated. unix_mkname_bsd() fixes that in Linux so there is
no harm, but just using sizeof(the address struct) should improve
readability.

Signed-off-by: Matthieu Buffet <matthieu@buffet.re>
Reviewed-by: Günther Noack <gnoack@google.com>
Link: https://lore.kernel.org/r/20251202215141.689986-1-matthieu@buffet.re
Signed-off-by: Mickaël Salaün <mic@digikod.net>
---
 tools/testing/selftests/landlock/fs_test.c         | 24 ++++++++++------------
 .../selftests/landlock/scoped_abstract_unix_test.c | 21 ++++++++-----------
 2 files changed, 20 insertions(+), 25 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/landlock/fs_test.c b/tools/testing/selftests/landlock/fs_test.c
index 7d378bdf3bce..76491ba54dce 100644
--- a/tools/testing/selftests/landlock/fs_test.c
+++ b/tools/testing/selftests/landlock/fs_test.c
@@ -4362,22 +4362,24 @@ TEST_F_FORK(layout1, named_unix_domain_socket_ioctl)
 {
 	const char *const path = file1_s1d1;
 	int srv_fd, cli_fd, ruleset_fd;
-	socklen_t size;
-	struct sockaddr_un srv_un, cli_un;
+	struct sockaddr_un srv_un = {
+		.sun_family = AF_UNIX,
+	};
+	struct sockaddr_un cli_un = {
+		.sun_family = AF_UNIX,
+	};
 	const struct landlock_ruleset_attr attr = {
 		.handled_access_fs = LANDLOCK_ACCESS_FS_IOCTL_DEV,
 	};
 
 	/* Sets up a server */
-	srv_un.sun_family = AF_UNIX;
-	strncpy(srv_un.sun_path, path, sizeof(srv_un.sun_path));
-
 	ASSERT_EQ(0, unlink(path));
 	srv_fd = socket(AF_UNIX, SOCK_STREAM, 0);
 	ASSERT_LE(0, srv_fd);
 
-	size = offsetof(struct sockaddr_un, sun_path) + strlen(srv_un.sun_path);
-	ASSERT_EQ(0, bind(srv_fd, (struct sockaddr *)&srv_un, size));
+	strncpy(srv_un.sun_path, path, sizeof(srv_un.sun_path));
+	ASSERT_EQ(0, bind(srv_fd, (struct sockaddr *)&srv_un, sizeof(srv_un)));
+
 	ASSERT_EQ(0, listen(srv_fd, 10 /* qlen */));
 
 	/* Enables Landlock. */
@@ -4387,16 +4389,12 @@ TEST_F_FORK(layout1, named_unix_domain_socket_ioctl)
 	ASSERT_EQ(0, close(ruleset_fd));
 
 	/* Sets up a client connection to it */
-	cli_un.sun_family = AF_UNIX;
 	cli_fd = socket(AF_UNIX, SOCK_STREAM, 0);
 	ASSERT_LE(0, cli_fd);
 
-	bzero(&cli_un, sizeof(cli_un));
-	cli_un.sun_family = AF_UNIX;
 	strncpy(cli_un.sun_path, path, sizeof(cli_un.sun_path));
-	size = offsetof(struct sockaddr_un, sun_path) + strlen(cli_un.sun_path);
-
-	ASSERT_EQ(0, connect(cli_fd, (struct sockaddr *)&cli_un, size));
+	ASSERT_EQ(0,
+		  connect(cli_fd, (struct sockaddr *)&cli_un, sizeof(cli_un)));
 
 	/* FIONREAD and other IOCTLs should not be forbidden. */
 	EXPECT_EQ(0, test_fionread_ioctl(cli_fd));
diff --git a/tools/testing/selftests/landlock/scoped_abstract_unix_test.c b/tools/testing/selftests/landlock/scoped_abstract_unix_test.c
index 6825082c079c..2cdf1ba07016 100644
--- a/tools/testing/selftests/landlock/scoped_abstract_unix_test.c
+++ b/tools/testing/selftests/landlock/scoped_abstract_unix_test.c
@@ -779,7 +779,6 @@ FIXTURE_TEARDOWN(various_address_sockets)
 
 TEST_F(various_address_sockets, scoped_pathname_sockets)
 {
-	socklen_t size_stream, size_dgram;
 	pid_t child;
 	int status;
 	char buf_child, buf_parent;
@@ -798,12 +797,8 @@ TEST_F(various_address_sockets, scoped_pathname_sockets)
 	/* Pathname address. */
 	snprintf(stream_pathname_addr.sun_path,
 		 sizeof(stream_pathname_addr.sun_path), "%s", stream_path);
-	size_stream = offsetof(struct sockaddr_un, sun_path) +
-		      strlen(stream_pathname_addr.sun_path);
 	snprintf(dgram_pathname_addr.sun_path,
 		 sizeof(dgram_pathname_addr.sun_path), "%s", dgram_path);
-	size_dgram = offsetof(struct sockaddr_un, sun_path) +
-		     strlen(dgram_pathname_addr.sun_path);
 
 	/* Abstract address. */
 	memset(&stream_abstract_addr, 0, sizeof(stream_abstract_addr));
@@ -841,8 +836,9 @@ TEST_F(various_address_sockets, scoped_pathname_sockets)
 		/* Connects with pathname sockets. */
 		stream_pathname_socket = socket(AF_UNIX, SOCK_STREAM, 0);
 		ASSERT_LE(0, stream_pathname_socket);
-		ASSERT_EQ(0, connect(stream_pathname_socket,
-				     &stream_pathname_addr, size_stream));
+		ASSERT_EQ(0,
+			  connect(stream_pathname_socket, &stream_pathname_addr,
+				  sizeof(stream_pathname_addr)));
 		ASSERT_EQ(1, write(stream_pathname_socket, "b", 1));
 		EXPECT_EQ(0, close(stream_pathname_socket));
 
@@ -850,12 +846,13 @@ TEST_F(various_address_sockets, scoped_pathname_sockets)
 		dgram_pathname_socket = socket(AF_UNIX, SOCK_DGRAM, 0);
 		ASSERT_LE(0, dgram_pathname_socket);
 		err = sendto(dgram_pathname_socket, "c", 1, 0,
-			     &dgram_pathname_addr, size_dgram);
+			     &dgram_pathname_addr, sizeof(dgram_pathname_addr));
 		EXPECT_EQ(1, err);
 
 		/* Sends with connection. */
-		ASSERT_EQ(0, connect(dgram_pathname_socket,
-				     &dgram_pathname_addr, size_dgram));
+		ASSERT_EQ(0,
+			  connect(dgram_pathname_socket, &dgram_pathname_addr,
+				  sizeof(dgram_pathname_addr)));
 		ASSERT_EQ(1, write(dgram_pathname_socket, "d", 1));
 		EXPECT_EQ(0, close(dgram_pathname_socket));
 
@@ -910,13 +907,13 @@ TEST_F(various_address_sockets, scoped_pathname_sockets)
 	stream_pathname_socket = socket(AF_UNIX, SOCK_STREAM, 0);
 	ASSERT_LE(0, stream_pathname_socket);
 	ASSERT_EQ(0, bind(stream_pathname_socket, &stream_pathname_addr,
-			  size_stream));
+			  sizeof(stream_pathname_addr)));
 	ASSERT_EQ(0, listen(stream_pathname_socket, backlog));
 
 	dgram_pathname_socket = socket(AF_UNIX, SOCK_DGRAM, 0);
 	ASSERT_LE(0, dgram_pathname_socket);
 	ASSERT_EQ(0, bind(dgram_pathname_socket, &dgram_pathname_addr,
-			  size_dgram));
+			  sizeof(dgram_pathname_addr)));
 
 	/* Sets up abstract servers. */
 	stream_abstract_socket = socket(AF_UNIX, SOCK_STREAM, 0);
-- 
cgit v1.2.3


From 14c00e30d3a29a7fb6053fcaa54aeb6c07fb1055 Mon Sep 17 00:00:00 2001
From: Tingmao Wang <m@maowtm.org>
Date: Sun, 28 Dec 2025 01:27:31 +0000
Subject: selftests/landlock: Fix typo in fs_test
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Tingmao Wang <m@maowtm.org>
Link: https://lore.kernel.org/r/62d18e06eeb26f62bc49d24c4467b3793c5ba32b.1766885035.git.m@maowtm.org
Signed-off-by: Mickaël Salaün <mic@digikod.net>
---
 tools/testing/selftests/landlock/fs_test.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/landlock/fs_test.c b/tools/testing/selftests/landlock/fs_test.c
index 76491ba54dce..37a5a3df712e 100644
--- a/tools/testing/selftests/landlock/fs_test.c
+++ b/tools/testing/selftests/landlock/fs_test.c
@@ -7069,8 +7069,8 @@ static int matches_log_fs_extra(struct __test_metadata *const _metadata,
 		return -E2BIG;
 
 	/*
-	 * It is assume that absolute_path does not contain control characters nor
-	 * spaces, see audit_string_contains_control().
+	 * It is assumed that absolute_path does not contain control
+	 * characters nor spaces, see audit_string_contains_control().
 	 */
 	absolute_path = realpath(path, NULL);
 	if (!absolute_path)
-- 
cgit v1.2.3


From 7aa593d8fb64b884bf00c13e01387b0733f3d786 Mon Sep 17 00:00:00 2001
From: Tingmao Wang <m@maowtm.org>
Date: Sun, 28 Dec 2025 01:27:32 +0000
Subject: selftests/landlock: Fix missing semicolon
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add missing semicolon after EXPECT_EQ(0, close(stream_server_child)) in
the scoped_vs_unscoped test.  I suspect currently it's just not executing
the close statement after the line, but this causes no observable
difference.

Fixes: fefcf0f7cf47 ("selftests/landlock: Test abstract UNIX socket scoping")
Cc: Tahera Fahimi <fahimitahera@gmail.com>
Signed-off-by: Tingmao Wang <m@maowtm.org>
Link: https://lore.kernel.org/r/d9e968e4cd4ecc9bf487593d7b7220bffbb3b5f5.1766885035.git.m@maowtm.org
Signed-off-by: Mickaël Salaün <mic@digikod.net>
---
 tools/testing/selftests/landlock/scoped_abstract_unix_test.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/landlock/scoped_abstract_unix_test.c b/tools/testing/selftests/landlock/scoped_abstract_unix_test.c
index 2cdf1ba07016..72f97648d4a7 100644
--- a/tools/testing/selftests/landlock/scoped_abstract_unix_test.c
+++ b/tools/testing/selftests/landlock/scoped_abstract_unix_test.c
@@ -543,7 +543,7 @@ TEST_F(scoped_vs_unscoped, unix_scoping)
 
 		ASSERT_EQ(1, write(pipe_child[1], ".", 1));
 		ASSERT_EQ(grand_child, waitpid(grand_child, &status, 0));
-		EXPECT_EQ(0, close(stream_server_child))
+		EXPECT_EQ(0, close(stream_server_child));
 		EXPECT_EQ(0, close(dgram_server_child));
 		return;
 	}
-- 
cgit v1.2.3


From 55dc93a7c2717311d48ca0a47c5f8c1b0755a115 Mon Sep 17 00:00:00 2001
From: Tingmao Wang <m@maowtm.org>
Date: Sun, 28 Dec 2025 01:27:34 +0000
Subject: selftests/landlock: Use scoped_base_variants.h for ptrace_test
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

ptrace_test.c currently contains a duplicated version of the
scoped_domains fixture variants.  This patch removes that and make it use
the shared scoped_base_variants.h instead, like in
scoped_abstract_unix_test and scoped_signal_test.

This required renaming the hierarchy fixture to scoped_domains, but the
test is otherwise the same.

Cc: Tahera Fahimi <fahimitahera@gmail.com>
Signed-off-by: Tingmao Wang <m@maowtm.org>
Link: https://lore.kernel.org/r/48148f0134f95f819a25277486a875a6fd88ecf9.1766885035.git.m@maowtm.org
Signed-off-by: Mickaël Salaün <mic@digikod.net>
---
 tools/testing/selftests/landlock/ptrace_test.c     | 154 +--------------------
 .../selftests/landlock/scoped_base_variants.h      |   9 +-
 2 files changed, 12 insertions(+), 151 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/landlock/ptrace_test.c b/tools/testing/selftests/landlock/ptrace_test.c
index 4e356334ecb7..4f64c90583cd 100644
--- a/tools/testing/selftests/landlock/ptrace_test.c
+++ b/tools/testing/selftests/landlock/ptrace_test.c
@@ -86,16 +86,9 @@ static int get_yama_ptrace_scope(void)
 }
 
 /* clang-format off */
-FIXTURE(hierarchy) {};
+FIXTURE(scoped_domains) {};
 /* clang-format on */
 
-FIXTURE_VARIANT(hierarchy)
-{
-	const bool domain_both;
-	const bool domain_parent;
-	const bool domain_child;
-};
-
 /*
  * Test multiple tracing combinations between a parent process P1 and a child
  * process P2.
@@ -104,155 +97,18 @@ FIXTURE_VARIANT(hierarchy)
  * restriction is enforced in addition to any Landlock check, which means that
  * all P2 requests to trace P1 would be denied.
  */
+#include "scoped_base_variants.h"
 
-/*
- *        No domain
- *
- *   P1-.               P1 -> P2 : allow
- *       \              P2 -> P1 : allow
- *        'P2
- */
-/* clang-format off */
-FIXTURE_VARIANT_ADD(hierarchy, allow_without_domain) {
-	/* clang-format on */
-	.domain_both = false,
-	.domain_parent = false,
-	.domain_child = false,
-};
-
-/*
- *        Child domain
- *
- *   P1--.              P1 -> P2 : allow
- *        \             P2 -> P1 : deny
- *        .'-----.
- *        |  P2  |
- *        '------'
- */
-/* clang-format off */
-FIXTURE_VARIANT_ADD(hierarchy, allow_with_one_domain) {
-	/* clang-format on */
-	.domain_both = false,
-	.domain_parent = false,
-	.domain_child = true,
-};
-
-/*
- *        Parent domain
- * .------.
- * |  P1  --.           P1 -> P2 : deny
- * '------'  \          P2 -> P1 : allow
- *            '
- *            P2
- */
-/* clang-format off */
-FIXTURE_VARIANT_ADD(hierarchy, deny_with_parent_domain) {
-	/* clang-format on */
-	.domain_both = false,
-	.domain_parent = true,
-	.domain_child = false,
-};
-
-/*
- *        Parent + child domain (siblings)
- * .------.
- * |  P1  ---.          P1 -> P2 : deny
- * '------'   \         P2 -> P1 : deny
- *         .---'--.
- *         |  P2  |
- *         '------'
- */
-/* clang-format off */
-FIXTURE_VARIANT_ADD(hierarchy, deny_with_sibling_domain) {
-	/* clang-format on */
-	.domain_both = false,
-	.domain_parent = true,
-	.domain_child = true,
-};
-
-/*
- *         Same domain (inherited)
- * .-------------.
- * | P1----.     |      P1 -> P2 : allow
- * |        \    |      P2 -> P1 : allow
- * |         '   |
- * |         P2  |
- * '-------------'
- */
-/* clang-format off */
-FIXTURE_VARIANT_ADD(hierarchy, allow_sibling_domain) {
-	/* clang-format on */
-	.domain_both = true,
-	.domain_parent = false,
-	.domain_child = false,
-};
-
-/*
- *         Inherited + child domain
- * .-----------------.
- * |  P1----.        |  P1 -> P2 : allow
- * |         \       |  P2 -> P1 : deny
- * |        .-'----. |
- * |        |  P2  | |
- * |        '------' |
- * '-----------------'
- */
-/* clang-format off */
-FIXTURE_VARIANT_ADD(hierarchy, allow_with_nested_domain) {
-	/* clang-format on */
-	.domain_both = true,
-	.domain_parent = false,
-	.domain_child = true,
-};
-
-/*
- *         Inherited + parent domain
- * .-----------------.
- * |.------.         |  P1 -> P2 : deny
- * ||  P1  ----.     |  P2 -> P1 : allow
- * |'------'    \    |
- * |             '   |
- * |             P2  |
- * '-----------------'
- */
-/* clang-format off */
-FIXTURE_VARIANT_ADD(hierarchy, deny_with_nested_and_parent_domain) {
-	/* clang-format on */
-	.domain_both = true,
-	.domain_parent = true,
-	.domain_child = false,
-};
-
-/*
- *         Inherited + parent and child domain (siblings)
- * .-----------------.
- * | .------.        |  P1 -> P2 : deny
- * | |  P1  .        |  P2 -> P1 : deny
- * | '------'\       |
- * |          \      |
- * |        .--'---. |
- * |        |  P2  | |
- * |        '------' |
- * '-----------------'
- */
-/* clang-format off */
-FIXTURE_VARIANT_ADD(hierarchy, deny_with_forked_domain) {
-	/* clang-format on */
-	.domain_both = true,
-	.domain_parent = true,
-	.domain_child = true,
-};
-
-FIXTURE_SETUP(hierarchy)
+FIXTURE_SETUP(scoped_domains)
 {
 }
 
-FIXTURE_TEARDOWN(hierarchy)
+FIXTURE_TEARDOWN(scoped_domains)
 {
 }
 
 /* Test PTRACE_TRACEME and PTRACE_ATTACH for parent and child. */
-TEST_F(hierarchy, trace)
+TEST_F(scoped_domains, trace)
 {
 	pid_t child, parent;
 	int status, err_proc_read;
diff --git a/tools/testing/selftests/landlock/scoped_base_variants.h b/tools/testing/selftests/landlock/scoped_base_variants.h
index d3b1fa8a584e..7116728ebc68 100644
--- a/tools/testing/selftests/landlock/scoped_base_variants.h
+++ b/tools/testing/selftests/landlock/scoped_base_variants.h
@@ -1,8 +1,13 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 /*
- * Landlock scoped_domains variants
+ * Landlock scoped_domains test variant definition.
  *
- * See the hierarchy variants from ptrace_test.c
+ * This file defines a fixture variant "scoped_domains" that has all
+ * permutations of parent/child process being in separate or shared
+ * Landlock domain, or not being in a Landlock domain at all.
+ *
+ * Scoped access tests can include this file to avoid repeating these
+ * combinations.
  *
  * Copyright © 2017-2020 Mickaël Salaün <mic@digikod.net>
  * Copyright © 2019-2020 ANSSI
-- 
cgit v1.2.3


From 317a5df78f24bd77fb770a26eb85bf39620592e0 Mon Sep 17 00:00:00 2001
From: Puranjay Mohan <puranjay@kernel.org>
Date: Tue, 30 Dec 2025 11:51:32 -0800
Subject: selftests/bpf: Fix verifier_arena_large/big_alloc3 test

The big_alloc3() test tries to allocate 2051 pages at once in
non-sleepable context and this can fail sporadically on resource
contrained systems, so skip this test in case of such failures.

Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
Link: https://lore.kernel.org/r/20251230195134.599463-1-puranjay@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/progs/verifier_arena_large.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/progs/verifier_arena_large.c b/tools/testing/selftests/bpf/progs/verifier_arena_large.c
index 4ca491cbe8d1..5f7e7afee169 100644
--- a/tools/testing/selftests/bpf/progs/verifier_arena_large.c
+++ b/tools/testing/selftests/bpf/progs/verifier_arena_large.c
@@ -300,7 +300,7 @@ int big_alloc3(void *ctx)
 	 */
 	pages = bpf_arena_alloc_pages(&arena, NULL, 2051, NUMA_NO_NODE, 0);
 	if (!pages)
-		return -1;
+		return 0;
 
 	bpf_for(i, 0, 2051)
 			pages[i * PAGE_SIZE] = 123;
-- 
cgit v1.2.3


From e6f2612f0e7c23ce991d3094b5387caf1a52a4fe Mon Sep 17 00:00:00 2001
From: Eduard Zingerman <eddyz87@gmail.com>
Date: Mon, 29 Dec 2025 23:13:08 -0800
Subject: selftests/bpf: test cases for bpf_loop SCC and state graph backedges

Test for state graph backedges accumulation for SCCs formed by
bpf_loop(). Equivalent to the following C program:

  int main(void) {
    1: fp[-8] = bpf_get_prandom_u32();
    2: fp[-16] = -32;                       // used in a memory access below
    3: bpf_loop(7, loop_cb4, fp, 0);
    4: return 0;
  }

  int loop_cb4(int i, void *ctx) {
    5: if (unlikely(ctx[-8] > bpf_get_prandom_u32()))
    6:   *(u64 *)(fp + ctx[-16]) = 42;      // aligned access expected
    7: if (unlikely(fp[-8] > bpf_get_prandom_u32()))
    8:   ctx[-16] = -31;                    // makes said access unaligned
    9: return 0;
  }

If state graph backedges are not accumulated properly at the SCC
formed by loop_cb4() call from bpf_loop(), the state {ctx[-16]=-32}
injected at instruction 9 on verification path 1,2,3,5,7,9,4 would be
considered fully verified and would lack precision mark for ctx[-16].
This would lead to early pruning of verification path 1,2,3,5,7,8,9 in
state {ctx[-16]=-31}, which in turn leads to the incorrect assumption
that the above program is safe.

Signed-off-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20251229-scc-for-callbacks-v1-2-ceadfe679900@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/progs/iters.c | 75 +++++++++++++++++++++++++++++++
 1 file changed, 75 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/progs/iters.c b/tools/testing/selftests/bpf/progs/iters.c
index 7dd92a303bf6..69061f030957 100644
--- a/tools/testing/selftests/bpf/progs/iters.c
+++ b/tools/testing/selftests/bpf/progs/iters.c
@@ -1926,4 +1926,79 @@ static int loop1_wrapper(void)
 	);
 }
 
+/*
+ * This is similar to a test case absent_mark_in_the_middle_state(),
+ * but adapted for use with bpf_loop().
+ */
+SEC("raw_tp")
+__flag(BPF_F_TEST_STATE_FREQ)
+__failure __msg("math between fp pointer and register with unbounded min value is not allowed")
+__naked void absent_mark_in_the_middle_state4(void)
+{
+	/*
+	 * Equivalent to a C program below:
+	 *
+	 * int main(void) {
+	 *   fp[-8] = bpf_get_prandom_u32();
+	 *   fp[-16] = -32;                    // used in a memory access below
+	 *   bpf_loop(7, loop_cb4, fp, 0);
+	 *   return 0;
+	 * }
+	 *
+	 * int loop_cb4(int i, void *ctx) {
+	 *   if (unlikely(ctx[-8] > bpf_get_prandom_u32()))
+	 *     *(u64 *)(fp + ctx[-16]) = 42;   // aligned access expected
+	 *   if (unlikely(fp[-8] > bpf_get_prandom_u32()))
+	 *     ctx[-16] = -31;                 // makes said access unaligned
+	 *   return 0;
+	 * }
+	 */
+	asm volatile (
+		"call %[bpf_get_prandom_u32];"
+		"r8 = r0;"
+		"*(u64 *)(r10 - 8) = r0;"
+		"*(u64 *)(r10 - 16) = -32;"
+		"r1 = 7;"
+		"r2 = loop_cb4 ll;"
+		"r3 = r10;"
+		"r4 = 0;"
+		"call %[bpf_loop];"
+		"r0 = 0;"
+		"exit;"
+		:
+		: __imm(bpf_loop),
+		  __imm(bpf_get_prandom_u32)
+		: __clobber_all
+	);
+}
+
+__used __naked
+static void loop_cb4(void)
+{
+	asm volatile (
+		"r9 = r2;"
+		"r8 = *(u64 *)(r9 - 8);"
+		"r6 = *(u64 *)(r9 - 16);"
+		"call %[bpf_get_prandom_u32];"
+		"if r0 > r8 goto use_fp16_%=;"
+	"1:"
+		"call %[bpf_get_prandom_u32];"
+		"if r0 > r8 goto update_fp16_%=;"
+	"2:"
+		"r0 = 0;"
+		"exit;"
+	"use_fp16_%=:"
+		"r1 = r10;"
+		"r1 += r6;"
+		"*(u64 *)(r1 + 0) = 42;"
+		"goto 1b;"
+	"update_fp16_%=:"
+		"*(u64 *)(r9 - 16) = -31;"
+		"goto 2b;"
+		:
+		: __imm(bpf_get_prandom_u32)
+		: __clobber_all
+	);
+}
+
 char _license[] SEC("license") = "GPL";
-- 
cgit v1.2.3


From 4fd99103eef347174b3c9b6071428324a3cf9a60 Mon Sep 17 00:00:00 2001
From: Eduard Zingerman <eddyz87@gmail.com>
Date: Tue, 30 Dec 2025 21:36:04 -0800
Subject: selftests/bpf: iterator based loop and STACK_MISC states pruning

The test case first initializes 9 stack slots as STACK_MISC,
then conditionally updates each of them to SCALAR spill inside an
iterator based loop. This leads to 2**9 combinations of MISC/SPILL
marks for these slots at the iterator next call.
The loop converges only if the verifier treats such states as
equivalent, otherwise visited states are evicted from the states cache
too quickly.

Signed-off-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20251230-loop-stack-misc-pruning-v1-2-585cfd6cec51@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/progs/iters.c | 65 +++++++++++++++++++++++++++++++
 1 file changed, 65 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/progs/iters.c b/tools/testing/selftests/bpf/progs/iters.c
index 69061f030957..7f27b517d5d5 100644
--- a/tools/testing/selftests/bpf/progs/iters.c
+++ b/tools/testing/selftests/bpf/progs/iters.c
@@ -1997,6 +1997,71 @@ static void loop_cb4(void)
 		"goto 2b;"
 		:
 		: __imm(bpf_get_prandom_u32)
+	);
+}
+
+SEC("raw_tp")
+__success
+__naked int stack_misc_vs_scalar_in_a_loop(void)
+{
+	asm volatile(
+		"*(u8 *)(r10 - 15) = 1;" /* This marks stack slot fp[-16] as STACK_MISC. */
+		"*(u8 *)(r10 - 23) = 1;"
+		"*(u8 *)(r10 - 31) = 1;"
+		"*(u8 *)(r10 - 39) = 1;"
+		"*(u8 *)(r10 - 47) = 1;"
+		"*(u8 *)(r10 - 55) = 1;"
+		"*(u8 *)(r10 - 63) = 1;"
+		"*(u8 *)(r10 - 71) = 1;"
+		"*(u8 *)(r10 - 79) = 1;"
+		"r1 = r10;"
+		"r1 += -8;"
+		"r2 = 0;"
+		"r3 = 10;"
+		"call %[bpf_iter_num_new];"
+	"loop_%=:"
+		"r1 = r10;"
+		"r1 += -8;"
+		"call %[bpf_iter_num_next];"
+		"if r0 == 0 goto loop_end_%=;"
+
+#define maybe_change_stack_slot(off) \
+		"call %[bpf_get_prandom_u32];"	\
+		"if r0 == 42 goto +1;"		\
+		"goto +1;"			\
+		"*(u64 *)(r10 " #off ") = r0;"
+
+		/*
+		 * When comparing verifier states fp[-16] will be
+		 * either STACK_MISC or SCALAR. Pruning logic should
+		 * consider old STACK_MISC equivalent to current SCALAR
+		 * to avoid states explosion.
+		 */
+		maybe_change_stack_slot(-16)
+		maybe_change_stack_slot(-24)
+		maybe_change_stack_slot(-32)
+		maybe_change_stack_slot(-40)
+		maybe_change_stack_slot(-48)
+		maybe_change_stack_slot(-56)
+		maybe_change_stack_slot(-64)
+		maybe_change_stack_slot(-72)
+		maybe_change_stack_slot(-80)
+
+#undef maybe_change_stack_slot
+
+		"goto loop_%=;"
+	"loop_end_%=:"
+		"r1 = r10;"
+		"r1 += -8;"
+		"call %[bpf_iter_num_destroy];"
+		"r0 = 0;"
+		"exit;"
+		:
+		: __imm(bpf_get_prandom_u32),
+		  __imm(bpf_iter_num_new),
+		  __imm(bpf_iter_num_next),
+		  __imm(bpf_iter_num_destroy),
+		  __imm_addr(amap)
 		: __clobber_all
 	);
 }
-- 
cgit v1.2.3


From 1a8fa7faf4890d201aad4f5d4943f74d840cd0ba Mon Sep 17 00:00:00 2001
From: Ihor Solodrai <ihor.solodrai@linux.dev>
Date: Tue, 30 Dec 2025 17:25:57 -0800
Subject: resolve_btfids: Implement --patch_btfids

Recent changes in BTF generation [1] rely on ${OBJCOPY} command to
update .BTF_ids section data in target ELF files.

This exposed a bug in llvm-objcopy --update-section code path, that
may lead to corruption of a target ELF file. Specifically, because of
the bug st_shndx of some symbols may be (incorrectly) set to 0xffff
(SHN_XINDEX) [2][3].

While there is a pending fix for LLVM, it'll take some time before it
lands (likely in 22.x). And the kernel build must keep working with
older LLVM toolchains in the foreseeable future.

Using GNU objcopy for .BTF_ids update would work, but it would require
changes to LLVM-based build process, likely breaking existing build
environments as discussed in [2].

To work around llvm-objcopy bug, implement --patch_btfids code path in
resolve_btfids as a drop-in replacement for:

    ${OBJCOPY} --update-section .BTF_ids=${btf_ids} ${elf}

Which works specifically for .BTF_ids section:

    ${RESOLVE_BTFIDS} --patch_btfids ${btf_ids} ${elf}

This feature in resolve_btfids can be removed at some point in the
future, when llvm-objcopy with a relevant bugfix becomes common.

[1] https://lore.kernel.org/bpf/20251219181321.1283664-1-ihor.solodrai@linux.dev/
[2] https://lore.kernel.org/bpf/20251224005752.201911-1-ihor.solodrai@linux.dev/
[3] https://github.com/llvm/llvm-project/issues/168060#issuecomment-3533552952

Signed-off-by: Ihor Solodrai <ihor.solodrai@linux.dev>
Link: https://lore.kernel.org/r/20251231012558.1699758-1-ihor.solodrai@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 scripts/gen-btf.sh                   |   2 +-
 scripts/link-vmlinux.sh              |   2 +-
 tools/bpf/resolve_btfids/main.c      | 117 +++++++++++++++++++++++++++++++++++
 tools/testing/selftests/bpf/Makefile |   2 +-
 4 files changed, 120 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/scripts/gen-btf.sh b/scripts/gen-btf.sh
index 12244dbe097c..0aec86615416 100755
--- a/scripts/gen-btf.sh
+++ b/scripts/gen-btf.sh
@@ -123,7 +123,7 @@ embed_btf_data()
 	fi
 	local btf_ids="${ELF_FILE}.BTF_ids"
 	if [ -f "${btf_ids}" ]; then
-		${OBJCOPY} --update-section .BTF_ids=${btf_ids} ${ELF_FILE}
+		${RESOLVE_BTFIDS} --patch_btfids ${btf_ids} ${ELF_FILE}
 	fi
 }
 
diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh
index e2207e612ac3..1915adf3249b 100755
--- a/scripts/link-vmlinux.sh
+++ b/scripts/link-vmlinux.sh
@@ -266,7 +266,7 @@ vmlinux_link "${VMLINUX}"
 
 if is_enabled CONFIG_DEBUG_INFO_BTF; then
 	info OBJCOPY ${btfids_vmlinux}
-	${OBJCOPY} --update-section .BTF_ids=${btfids_vmlinux} ${VMLINUX}
+	${RESOLVE_BTFIDS} --patch_btfids ${btfids_vmlinux} ${VMLINUX}
 fi
 
 mksysmap "${VMLINUX}" System.map
diff --git a/tools/bpf/resolve_btfids/main.c b/tools/bpf/resolve_btfids/main.c
index 2cbc252259be..df39982f51df 100644
--- a/tools/bpf/resolve_btfids/main.c
+++ b/tools/bpf/resolve_btfids/main.c
@@ -862,8 +862,119 @@ static inline int make_out_path(char *buf, u32 buf_sz, const char *in_path, cons
 	return 0;
 }
 
+/*
+ * Patch the .BTF_ids section of an ELF file with data from provided file.
+ * Equivalent to: objcopy --update-section .BTF_ids=<btfids> <elf>
+ *
+ * 1. Find .BTF_ids section in the ELF
+ * 2. Verify that blob file size matches section size
+ * 3. Update section data buffer with blob data
+ * 4. Write the ELF file
+ */
+static int patch_btfids(const char *btfids_path, const char *elf_path)
+{
+	Elf_Scn *scn = NULL;
+	FILE *btfids_file;
+	size_t shdrstrndx;
+	int fd, err = -1;
+	Elf_Data *data;
+	struct stat st;
+	GElf_Shdr sh;
+	char *name;
+	Elf *elf;
+
+	elf_version(EV_CURRENT);
+
+	fd = open(elf_path, O_RDWR, 0666);
+	if (fd < 0) {
+		pr_err("FAILED to open %s: %s\n", elf_path, strerror(errno));
+		return -1;
+	}
+
+	elf = elf_begin(fd, ELF_C_RDWR_MMAP, NULL);
+	if (!elf) {
+		close(fd);
+		pr_err("FAILED cannot create ELF descriptor: %s\n", elf_errmsg(-1));
+		return -1;
+	}
+
+	elf_flagelf(elf, ELF_C_SET, ELF_F_LAYOUT);
+
+	if (elf_getshdrstrndx(elf, &shdrstrndx) != 0) {
+		pr_err("FAILED cannot get shdr str ndx\n");
+		goto out;
+	}
+
+	while ((scn = elf_nextscn(elf, scn)) != NULL) {
+
+		if (gelf_getshdr(scn, &sh) != &sh) {
+			pr_err("FAILED to get section header\n");
+			goto out;
+		}
+
+		name = elf_strptr(elf, shdrstrndx, sh.sh_name);
+		if (!name)
+			continue;
+
+		if (strcmp(name, BTF_IDS_SECTION) == 0)
+			break;
+	}
+
+	if (!scn) {
+		pr_err("FAILED: section %s not found in %s\n", BTF_IDS_SECTION, elf_path);
+		goto out;
+	}
+
+	data = elf_getdata(scn, NULL);
+	if (!data) {
+		pr_err("FAILED to get %s section data from %s\n", BTF_IDS_SECTION, elf_path);
+		goto out;
+	}
+
+	if (stat(btfids_path, &st) < 0) {
+		pr_err("FAILED to stat %s: %s\n", btfids_path, strerror(errno));
+		goto out;
+	}
+
+	if ((size_t)st.st_size != data->d_size) {
+		pr_err("FAILED: size mismatch - %s section in %s is %zu bytes, %s is %zu bytes\n",
+		       BTF_IDS_SECTION, elf_path, data->d_size, btfids_path, (size_t)st.st_size);
+		goto out;
+	}
+
+	btfids_file = fopen(btfids_path, "rb");
+	if (!btfids_file) {
+		pr_err("FAILED to open %s: %s\n", btfids_path, strerror(errno));
+		goto out;
+	}
+
+	pr_debug("Copying data from %s to %s section of %s (%zu bytes)\n",
+		 btfids_path, BTF_IDS_SECTION, elf_path, data->d_size);
+
+	if (fread(data->d_buf, data->d_size, 1, btfids_file) != 1) {
+		pr_err("FAILED to read %s\n", btfids_path);
+		fclose(btfids_file);
+		goto out;
+	}
+	fclose(btfids_file);
+
+	elf_flagdata(data, ELF_C_SET, ELF_F_DIRTY);
+	if (elf_update(elf, ELF_C_WRITE) < 0) {
+		pr_err("FAILED to update ELF file %s\n", elf_path);
+		goto out;
+	}
+
+	err = 0;
+out:
+	elf_end(elf);
+	close(fd);
+
+	return err;
+}
+
 static const char * const resolve_btfids_usage[] = {
 	"resolve_btfids [<options>] <ELF object>",
+	"resolve_btfids --patch_btfids <.BTF_ids file> <ELF object>",
 	NULL
 };
 
@@ -880,6 +991,7 @@ int main(int argc, const char **argv)
 		.funcs    = RB_ROOT,
 		.sets     = RB_ROOT,
 	};
+	const char *btfids_path = NULL;
 	bool fatal_warnings = false;
 	char out_path[PATH_MAX];
 
@@ -894,6 +1006,8 @@ int main(int argc, const char **argv)
 			    "turn warnings into errors"),
 		OPT_BOOLEAN(0, "distill_base", &obj.distill_base,
 			    "distill --btf_base and emit .BTF.base section data"),
+		OPT_STRING(0, "patch_btfids", &btfids_path, "file",
+			   "path to .BTF_ids section data blob to patch into ELF file"),
 		OPT_END()
 	};
 	int err = -1;
@@ -905,6 +1019,9 @@ int main(int argc, const char **argv)
 
 	obj.path = argv[0];
 
+	if (btfids_path)
+		return patch_btfids(btfids_path, obj.path);
+
 	if (load_btf(&obj))
 		goto out;
 
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index f28a32b16ff0..9488d076c740 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -657,7 +657,7 @@ $(TRUNNER_TEST_OBJS): $(TRUNNER_OUTPUT)/%.test.o:			\
 	$$(if $$(TEST_NEEDS_BTFIDS),						\
 		$$(call msg,BTFIDS,$(TRUNNER_BINARY),$$@)			\
 		$(RESOLVE_BTFIDS) --btf $(TRUNNER_OUTPUT)/btf_data.bpf.o $$@;	\
-		$(OBJCOPY) --update-section .BTF_ids=$$@.BTF_ids $$@)
+		$(RESOLVE_BTFIDS) --patch_btfids $$@.BTF_ids $$@)
 
 $(TRUNNER_TEST_OBJS:.o=.d): $(TRUNNER_OUTPUT)/%.test.d:			\
 			    $(TRUNNER_TESTS_DIR)/%.c			\
-- 
cgit v1.2.3


From 673a55cc49dafe47defb9ad76a73987fe89e5d70 Mon Sep 17 00:00:00 2001
From: Clint George <clintbgeorge@gmail.com>
Date: Mon, 15 Dec 2025 14:17:37 +0530
Subject: kselftest/coredump: use __builtin_trap() instead of null pointer

Use __builtin_trap() to truly crash the program instead of dereferencing
null pointer which may be optimized by the compiler preventing the crash
from occurring

[] Testing:
The diff between before and after of running the kselftest test of the
module shows no regression on system with x86 architecture

[] Error log:
~/Desktop/kernel-dev/linux-v1/tools/testing/selftests/coredump$ make LLVM=1 W=1
  CC       stackdump_test
coredump_test_helpers.c:59:6: warning: indirection of non-volatile null pointer will be deleted, not trap [-Wnull-dereference]
   59 |         i = *(int *)NULL;
      |             ^~~~~~~~~~~~
coredump_test_helpers.c:59:6: note: consider using __builtin_trap() or qualifying pointer with 'volatile'
1 warning generated.
  CC       coredump_socket_test
coredump_test_helpers.c:59:6: warning: indirection of non-volatile null pointer will be deleted, not trap [-Wnull-dereference]
   59 |         i = *(int *)NULL;
      |             ^~~~~~~~~~~~
coredump_test_helpers.c:59:6: note: consider using __builtin_trap() or qualifying pointer with 'volatile'
1 warning generated.
  CC       coredump_socket_protocol_test
coredump_test_helpers.c:59:6: warning: indirection of non-volatile null pointer will be deleted, not trap [-Wnull-dereference]
   59 |         i = *(int *)NULL;
      |             ^~~~~~~~~~~~
coredump_test_helpers.c:59:6: note: consider using __builtin_trap() or qualifying pointer with 'volatile'
1 warning generated.

Link: https://lore.kernel.org/r/20251215084737.7504-1-clintbgeorge@gmail.com
Signed-off-by: Clint George <clintbgeorge@gmail.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/coredump/coredump_test_helpers.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/coredump/coredump_test_helpers.c b/tools/testing/selftests/coredump/coredump_test_helpers.c
index a6f6d5f2ae07..5c8adee63641 100644
--- a/tools/testing/selftests/coredump/coredump_test_helpers.c
+++ b/tools/testing/selftests/coredump/coredump_test_helpers.c
@@ -56,7 +56,7 @@ void crashing_child(void)
 		pthread_create(&thread, NULL, do_nothing, NULL);
 
 	/* crash on purpose */
-	i = *(int *)NULL;
+	__builtin_trap();
 }
 
 int create_detached_tmpfs(void)
-- 
cgit v1.2.3


From 0aaff7b109037c0a45def1bed7b76ffaf253f7d0 Mon Sep 17 00:00:00 2001
From: Clint George <clintbgeorge@gmail.com>
Date: Mon, 15 Dec 2025 14:19:00 +0530
Subject: kselftest/anon_inode: replace null pointers with empty arrays

Make use of empty (NULL-terminated) array instead of NULL pointer to
avoid compiler errors while maintaining the behavior of the function
intact

[] Testing:
The diff between before and after of running the kselftest test of the
module shows no regression on system with x86 architecture

[] Error log:
~/Desktop/kernel-dev/linux-v1/tools/testing/selftests/filesystems$ make LLVM=1 W=1
  CC       devpts_pts
  CC       file_stressor
  CC       anon_inode_test
anon_inode_test.c:45:37: warning: null passed to a callee that requires a non-null argument [-Wnonnull]
   45 |         ASSERT_LT(execveat(fd_context, "", NULL, NULL, AT_EMPTY_PATH), 0);
      |                                            ^~~~
/usr/lib/llvm-18/lib/clang/18/include/__stddef_null.h:26:14: note: expanded from macro 'NULL'
   26 | #define NULL ((void*)0)
      |              ^~~~~~~~~~
../Desktop/kernel-dev/linux-v1/tools/testing/selftests/../../../tools/testing/selftests/kselftest_harness.h:535:11: note: expanded from macro 'ASSERT_LT'
  535 |         __EXPECT(expected, #expected, seen, #seen, <, 1)
      |                  ^~~~~~~~
../Desktop/kernel-dev/linux-v1/tools/testing/selftests/../../../tools/testing/selftests/kselftest_harness.h:758:33: note: expanded from macro '__EXPECT'
  758 |         __typeof__(_expected) __exp = (_expected); \
      |                                        ^~~~~~~~~
1 warning generated.

Link: https://lore.kernel.org/r/20251215084900.7590-1-clintbgeorge@gmail.com
Signed-off-by: Clint George <clintbgeorge@gmail.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/filesystems/anon_inode_test.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/filesystems/anon_inode_test.c b/tools/testing/selftests/filesystems/anon_inode_test.c
index 94c6c81c2301..2c4c50500116 100644
--- a/tools/testing/selftests/filesystems/anon_inode_test.c
+++ b/tools/testing/selftests/filesystems/anon_inode_test.c
@@ -42,7 +42,10 @@ TEST(anon_inode_no_exec)
 	fd_context = sys_fsopen("tmpfs", 0);
 	ASSERT_GE(fd_context, 0);
 
-	ASSERT_LT(execveat(fd_context, "", NULL, NULL, AT_EMPTY_PATH), 0);
+	char *const empty_argv[] = {NULL};
+	char *const empty_envp[] = {NULL};
+
+	ASSERT_LT(execveat(fd_context, "", empty_argv, empty_envp, AT_EMPTY_PATH), 0);
 	ASSERT_EQ(errno, EACCES);
 
 	EXPECT_EQ(close(fd_context), 0);
-- 
cgit v1.2.3


From 3e6ad272bb8b3199bad952e7b077102af2d8df03 Mon Sep 17 00:00:00 2001
From: Clint George <clintbgeorge@gmail.com>
Date: Mon, 15 Dec 2025 14:20:22 +0530
Subject: kselftest/kublk: include message in _Static_assert for C11
 compatibility

Add descriptive message in the _Static_assert to comply with the C11
standard requirement to prevent compiler from throwing out error. The
compiler throws an error when _Static_assert is used without a message as
that is a C23 extension.

[] Testing:
The diff between before and after of running the kselftest test of the
module shows no regression on system with x86 architecture

[] Error log:
~/Desktop/kernel-dev/linux-v1/tools/testing/selftests/ublk$ make LLVM=1 W=1
  CC       kublk
In file included from kublk.c:6:
./kublk.h:220:43: error: '_Static_assert' with no message is a C23 extension [-Werror,-Wc23-extensions]
  220 |         _Static_assert(UBLK_MAX_QUEUES_SHIFT <= 7);
      |                                                  ^
      |                                                  , ""
1 error generated.
In file included from null.c:3:
./kublk.h:220:43: error: '_Static_assert' with no message is a C23 extension [-Werror,-Wc23-extensions]
  220 |         _Static_assert(UBLK_MAX_QUEUES_SHIFT <= 7);
      |                                                  ^
      |                                                  , ""
1 error generated.
In file included from file_backed.c:3:
./kublk.h:220:43: error: '_Static_assert' with no message is a C23 extension [-Werror,-Wc23-extensions]
  220 |         _Static_assert(UBLK_MAX_QUEUES_SHIFT <= 7);
      |                                                  ^
      |                                                  , ""
1 error generated.
In file included from common.c:3:
./kublk.h:220:43: error: '_Static_assert' with no message is a C23 extension [-Werror,-Wc23-extensions]
  220 |         _Static_assert(UBLK_MAX_QUEUES_SHIFT <= 7);
      |                                                  ^
      |                                                  , ""
1 error generated.
In file included from stripe.c:3:
./kublk.h:220:43: error: '_Static_assert' with no message is a C23 extension [-Werror,-Wc23-extensions]
  220 |         _Static_assert(UBLK_MAX_QUEUES_SHIFT <= 7);
      |                                                  ^
      |                                                  , ""
1 error generated.
In file included from fault_inject.c:11:
./kublk.h:220:43: error: '_Static_assert' with no message is a C23 extension [-Werror,-Wc23-extensions]
  220 |         _Static_assert(UBLK_MAX_QUEUES_SHIFT <= 7);
      |                                                  ^
      |                                                  , ""
1 error generated.
make: *** [../lib.mk:225: ~/Desktop/kernel-dev/linux-v1/tools/testing/selftests/ublk/kublk] Error 1

Link: https://lore.kernel.org/r/20251215085022.7642-1-clintbgeorge@gmail.com
Signed-off-by: Clint George <clintbgeorge@gmail.com>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/ublk/kublk.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h
index fe42705c6d42..e5eb5f762c1c 100644
--- a/tools/testing/selftests/ublk/kublk.h
+++ b/tools/testing/selftests/ublk/kublk.h
@@ -217,7 +217,7 @@ static inline __u64 build_user_data(unsigned tag, unsigned op,
 		unsigned tgt_data, unsigned q_id, unsigned is_target_io)
 {
 	/* we only have 7 bits to encode q_id */
-	_Static_assert(UBLK_MAX_QUEUES_SHIFT <= 7);
+	_Static_assert(UBLK_MAX_QUEUES_SHIFT <= 7, "UBLK_MAX_QUEUES_SHIFT must be <= 7");
 	assert(!(tag >> 16) && !(op >> 8) && !(tgt_data >> 16) && !(q_id >> 7));
 
 	return tag | (op << 16) | (tgt_data << 24) |
-- 
cgit v1.2.3


From c286e7e9d1f1f3d90ad11c37e896f582b02d19c4 Mon Sep 17 00:00:00 2001
From: Puranjay Mohan <puranjay@kernel.org>
Date: Wed, 31 Dec 2025 14:10:50 -0800
Subject: selftests/bpf: veristat: fix printing order in output_stats()

The order of the variables in the printf() doesn't match the text and
therefore veristat prints something like this:

Done. Processed 24 files, 0 programs. Skipped 62 files, 0 programs.

When it should print:

Done. Processed 24 files, 62 programs. Skipped 0 files, 0 programs.

Fix the order of variables in the printf() call.

Fixes: 518fee8bfaf2 ("selftests/bpf: make veristat skip non-BPF and failing-to-open BPF objects")
Tested-by: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
Link: https://lore.kernel.org/r/20251231221052.759396-1-puranjay@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/veristat.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/veristat.c b/tools/testing/selftests/bpf/veristat.c
index e962f133250c..1be1e353d40a 100644
--- a/tools/testing/selftests/bpf/veristat.c
+++ b/tools/testing/selftests/bpf/veristat.c
@@ -2580,7 +2580,7 @@ static void output_stats(const struct verif_stats *s, enum resfmt fmt, bool last
 	if (last && fmt == RESFMT_TABLE) {
 		output_header_underlines();
 		printf("Done. Processed %d files, %d programs. Skipped %d files, %d programs.\n",
-		       env.files_processed, env.files_skipped, env.progs_processed, env.progs_skipped);
+		       env.files_processed, env.progs_processed, env.files_skipped, env.progs_skipped);
 	}
 }
 
-- 
cgit v1.2.3


From a73fc3dcc60b6d7a2075e2fbdca64fd53600f855 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Mon, 29 Dec 2025 11:10:58 -0800
Subject: rcu: Clean up after the SRCU-fastification of RCU Tasks Trace

Now that RCU Tasks Trace has been re-implemented in terms of SRCU-fast,
the ->trc_ipi_to_cpu, ->trc_blkd_cpu, ->trc_blkd_node, ->trc_holdout_list,
and ->trc_reader_special task_struct fields are no longer used.

In addition, the rcu_tasks_trace_qs(), rcu_tasks_trace_qs_blkd(),
exit_tasks_rcu_finish_trace(), and rcu_spawn_tasks_trace_kthread(),
show_rcu_tasks_trace_gp_kthread(), rcu_tasks_trace_get_gp_data(),
rcu_tasks_trace_torture_stats_print(), and get_rcu_tasks_trace_gp_kthread()
functions and all the other functions that they invoke are no longer used.

Also, the TRC_NEED_QS and TRC_NEED_QS_CHECKED CPP macros are no longer used.
Neither are the rcu_tasks_trace_lazy_ms and rcu_task_ipi_delay rcupdate
module parameters and the TASKS_TRACE_RCU_READ_MB Kconfig option.

This commit therefore removes all of them.

[ paulmck: Apply Alexei Starovoitov feedback. ]

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Cc: Andrii Nakryiko <andrii@kernel.org>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: bpf@vger.kernel.org
Reviewed-by: Joel Fernandes <joelagnelf@nvidia.com>
Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
---
 Documentation/admin-guide/kernel-parameters.txt    | 15 ----
 include/linux/rcupdate.h                           | 31 +--------
 include/linux/rcupdate_trace.h                     |  2 -
 include/linux/sched.h                              |  5 --
 init/init_task.c                                   |  3 -
 kernel/fork.c                                      |  3 -
 kernel/rcu/Kconfig                                 | 18 -----
 kernel/rcu/rcu.h                                   |  9 ---
 kernel/rcu/rcuscale.c                              |  7 --
 kernel/rcu/rcutorture.c                            |  2 -
 kernel/rcu/tasks.h                                 | 79 +---------------------
 .../selftests/rcutorture/configs/rcu/TRACE01       |  1 -
 .../selftests/rcutorture/configs/rcu/TRACE02       |  1 -
 13 files changed, 2 insertions(+), 174 deletions(-)

(limited to 'tools')

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index a8d0afde7f85..1b8e5cadbecb 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -6249,13 +6249,6 @@ Kernel parameters
 			dynamically) adjusted.	This parameter is intended
 			for use in testing.
 
-	rcupdate.rcu_task_ipi_delay= [KNL]
-			Set time in jiffies during which RCU tasks will
-			avoid sending IPIs, starting with the beginning
-			of a given grace period.  Setting a large
-			number avoids disturbing real-time workloads,
-			but lengthens grace periods.
-
 	rcupdate.rcu_task_lazy_lim= [KNL]
 			Number of callbacks on a given CPU that will
 			cancel laziness on that CPU.  Use -1 to disable
@@ -6299,14 +6292,6 @@ Kernel parameters
 			of zero will disable batching.	Batching is
 			always disabled for synchronize_rcu_tasks().
 
-	rcupdate.rcu_tasks_trace_lazy_ms= [KNL]
-			Set timeout in milliseconds RCU Tasks
-			Trace asynchronous callback batching for
-			call_rcu_tasks_trace().  A negative value
-			will take the default.	A value of zero will
-			disable batching.  Batching is always disabled
-			for synchronize_rcu_tasks_trace().
-
 	rcupdate.rcu_self_test= [KNL]
 			Run the RCU early boot self tests
 
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index c5b30054cd01..bd5a420cf09a 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -175,36 +175,7 @@ void rcu_tasks_torture_stats_print(char *tt, char *tf);
 # define synchronize_rcu_tasks synchronize_rcu
 # endif
 
-# ifdef CONFIG_TASKS_TRACE_RCU
-// Bits for ->trc_reader_special.b.need_qs field.
-#define TRC_NEED_QS		0x1  // Task needs a quiescent state.
-#define TRC_NEED_QS_CHECKED	0x2  // Task has been checked for needing quiescent state.
-
-u8 rcu_trc_cmpxchg_need_qs(struct task_struct *t, u8 old, u8 new);
-void rcu_tasks_trace_qs_blkd(struct task_struct *t);
-
-# define rcu_tasks_trace_qs(t)							\
-	do {									\
-		int ___rttq_nesting = READ_ONCE((t)->trc_reader_nesting);	\
-										\
-		if (unlikely(READ_ONCE((t)->trc_reader_special.b.need_qs) == TRC_NEED_QS) &&	\
-		    likely(!___rttq_nesting)) {					\
-			rcu_trc_cmpxchg_need_qs((t), TRC_NEED_QS, TRC_NEED_QS_CHECKED);	\
-		} else if (___rttq_nesting && ___rttq_nesting != INT_MIN &&	\
-			   !READ_ONCE((t)->trc_reader_special.b.blocked)) {	\
-			rcu_tasks_trace_qs_blkd(t);				\
-		}								\
-	} while (0)
-void rcu_tasks_trace_torture_stats_print(char *tt, char *tf);
-# else
-# define rcu_tasks_trace_qs(t) do { } while (0)
-# endif
-
-#define rcu_tasks_qs(t, preempt)					\
-do {									\
-	rcu_tasks_classic_qs((t), (preempt));				\
-	rcu_tasks_trace_qs(t);						\
-} while (0)
+#define rcu_tasks_qs(t, preempt) rcu_tasks_classic_qs((t), (preempt))
 
 # ifdef CONFIG_TASKS_RUDE_RCU
 void synchronize_rcu_tasks_rude(void);
diff --git a/include/linux/rcupdate_trace.h b/include/linux/rcupdate_trace.h
index 3f46cbe67000..0bd47f12ecd1 100644
--- a/include/linux/rcupdate_trace.h
+++ b/include/linux/rcupdate_trace.h
@@ -136,9 +136,7 @@ static inline void rcu_barrier_tasks_trace(void)
 }
 
 // Placeholders to enable stepwise transition.
-void rcu_tasks_trace_get_gp_data(int *flags, unsigned long *gp_seq);
 void __init rcu_tasks_trace_suppress_unused(void);
-struct task_struct *get_rcu_tasks_trace_gp_kthread(void);
 
 #else
 /*
diff --git a/include/linux/sched.h b/include/linux/sched.h
index fe39d422b37d..56156643ccac 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -946,11 +946,6 @@ struct task_struct {
 #ifdef CONFIG_TASKS_TRACE_RCU
 	int				trc_reader_nesting;
 	struct srcu_ctr __percpu	*trc_reader_scp;
-	int				trc_ipi_to_cpu;
-	union rcu_special		trc_reader_special;
-	struct list_head		trc_holdout_list;
-	struct list_head		trc_blkd_node;
-	int				trc_blkd_cpu;
 #endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
 
 	struct sched_info		sched_info;
diff --git a/init/init_task.c b/init/init_task.c
index 49b13d7c3985..db92c404d59a 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -195,9 +195,6 @@ struct task_struct init_task __aligned(L1_CACHE_BYTES) = {
 #endif
 #ifdef CONFIG_TASKS_TRACE_RCU
 	.trc_reader_nesting = 0,
-	.trc_reader_special.s = 0,
-	.trc_holdout_list = LIST_HEAD_INIT(init_task.trc_holdout_list),
-	.trc_blkd_node = LIST_HEAD_INIT(init_task.trc_blkd_node),
 #endif
 #ifdef CONFIG_CPUSETS
 	.mems_allowed_seq = SEQCNT_SPINLOCK_ZERO(init_task.mems_allowed_seq,
diff --git a/kernel/fork.c b/kernel/fork.c
index b1f3915d5f8e..d7ed107cbb47 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1828,9 +1828,6 @@ static inline void rcu_copy_process(struct task_struct *p)
 #endif /* #ifdef CONFIG_TASKS_RCU */
 #ifdef CONFIG_TASKS_TRACE_RCU
 	p->trc_reader_nesting = 0;
-	p->trc_reader_special.s = 0;
-	INIT_LIST_HEAD(&p->trc_holdout_list);
-	INIT_LIST_HEAD(&p->trc_blkd_node);
 #endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
 }
 
diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
index 4d9b21f69eaa..8d5a1ecb7d56 100644
--- a/kernel/rcu/Kconfig
+++ b/kernel/rcu/Kconfig
@@ -313,24 +313,6 @@ config RCU_NOCB_CPU_CB_BOOST
 	  Say Y here if you want to set RT priority for offloading kthreads.
 	  Say N here if you are building a !PREEMPT_RT kernel and are unsure.
 
-config TASKS_TRACE_RCU_READ_MB
-	bool "Tasks Trace RCU readers use memory barriers in user and idle"
-	depends on RCU_EXPERT && TASKS_TRACE_RCU
-	default PREEMPT_RT || NR_CPUS < 8
-	help
-	  Use this option to further reduce the number of IPIs sent
-	  to CPUs executing in userspace or idle during tasks trace
-	  RCU grace periods.  Given that a reasonable setting of
-	  the rcupdate.rcu_task_ipi_delay kernel boot parameter
-	  eliminates such IPIs for many workloads, proper setting
-	  of this Kconfig option is important mostly for aggressive
-	  real-time installations and for battery-powered devices,
-	  hence the default chosen above.
-
-	  Say Y here if you hate IPIs.
-	  Say N here if you hate read-side memory barriers.
-	  Take the default if you are unsure.
-
 config RCU_LAZY
 	bool "RCU callback lazy invocation functionality"
 	depends on RCU_NOCB_CPU
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 9cf01832a6c3..dc5d614b372c 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -544,10 +544,6 @@ struct task_struct *get_rcu_tasks_rude_gp_kthread(void);
 void rcu_tasks_rude_get_gp_data(int *flags, unsigned long *gp_seq);
 #endif // # ifdef CONFIG_TASKS_RUDE_RCU
 
-#ifdef CONFIG_TASKS_TRACE_RCU
-void rcu_tasks_trace_get_gp_data(int *flags, unsigned long *gp_seq);
-#endif
-
 #ifdef CONFIG_TASKS_RCU_GENERIC
 void tasks_cblist_init_generic(void);
 #else /* #ifdef CONFIG_TASKS_RCU_GENERIC */
@@ -673,11 +669,6 @@ void show_rcu_tasks_rude_gp_kthread(void);
 #else
 static inline void show_rcu_tasks_rude_gp_kthread(void) {}
 #endif
-#if !defined(CONFIG_TINY_RCU) && defined(CONFIG_TASKS_TRACE_RCU)
-void show_rcu_tasks_trace_gp_kthread(void);
-#else
-static inline void show_rcu_tasks_trace_gp_kthread(void) {}
-#endif
 
 #ifdef CONFIG_TINY_RCU
 static inline bool rcu_cpu_beenfullyonline(int cpu) { return true; }
diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c
index 7484d8ad5767..1c50f89fbd6f 100644
--- a/kernel/rcu/rcuscale.c
+++ b/kernel/rcu/rcuscale.c
@@ -400,11 +400,6 @@ static void tasks_trace_scale_read_unlock(int idx)
 	rcu_read_unlock_trace();
 }
 
-static void rcu_tasks_trace_scale_stats(void)
-{
-	rcu_tasks_trace_torture_stats_print(scale_type, SCALE_FLAG);
-}
-
 static struct rcu_scale_ops tasks_tracing_ops = {
 	.ptype		= RCU_TASKS_FLAVOR,
 	.init		= rcu_sync_scale_init,
@@ -416,8 +411,6 @@ static struct rcu_scale_ops tasks_tracing_ops = {
 	.gp_barrier	= rcu_barrier_tasks_trace,
 	.sync		= synchronize_rcu_tasks_trace,
 	.exp_sync	= synchronize_rcu_tasks_trace,
-	.rso_gp_kthread	= get_rcu_tasks_trace_gp_kthread,
-	.stats		= IS_ENABLED(CONFIG_TINY_RCU) ? NULL : rcu_tasks_trace_scale_stats,
 	.name		= "tasks-tracing"
 };
 
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 07e51974b06b..78a6ebe77d35 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -1180,8 +1180,6 @@ static struct rcu_torture_ops tasks_tracing_ops = {
 	.exp_sync	= synchronize_rcu_tasks_trace,
 	.call		= call_rcu_tasks_trace,
 	.cb_barrier	= rcu_barrier_tasks_trace,
-	.gp_kthread_dbg	= show_rcu_tasks_trace_gp_kthread,
-	.get_gp_data    = rcu_tasks_trace_get_gp_data,
 	.cbflood_max	= 50000,
 	.irq_capable	= 1,
 	.slow_gps	= 1,
diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index 1fe789c99f36..1249b47f0a8d 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -161,11 +161,6 @@ static void tasks_rcu_exit_srcu_stall(struct timer_list *unused);
 static DEFINE_TIMER(tasks_rcu_exit_srcu_stall_timer, tasks_rcu_exit_srcu_stall);
 #endif
 
-/* Avoid IPIing CPUs early in the grace period. */
-#define RCU_TASK_IPI_DELAY (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) ? HZ / 2 : 0)
-static int rcu_task_ipi_delay __read_mostly = RCU_TASK_IPI_DELAY;
-module_param(rcu_task_ipi_delay, int, 0644);
-
 /* Control stall timeouts.  Disable with <= 0, otherwise jiffies till stall. */
 #define RCU_TASK_BOOT_STALL_TIMEOUT (HZ * 30)
 #define RCU_TASK_STALL_TIMEOUT (HZ * 60 * 10)
@@ -800,8 +795,6 @@ static void rcu_tasks_torture_stats_print_generic(struct rcu_tasks *rtp, char *t
 
 #endif // #ifndef CONFIG_TINY_RCU
 
-static void exit_tasks_rcu_finish_trace(struct task_struct *t);
-
 #if defined(CONFIG_TASKS_RCU)
 
 ////////////////////////////////////////////////////////////////////////
@@ -1321,13 +1314,11 @@ void exit_tasks_rcu_finish(void)
 	raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
 	list_del_init(&t->rcu_tasks_exit_list);
 	raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
-
-	exit_tasks_rcu_finish_trace(t);
 }
 
 #else /* #ifdef CONFIG_TASKS_RCU */
 void exit_tasks_rcu_start(void) { }
-void exit_tasks_rcu_finish(void) { exit_tasks_rcu_finish_trace(current); }
+void exit_tasks_rcu_finish(void) { }
 #endif /* #else #ifdef CONFIG_TASKS_RCU */
 
 #ifdef CONFIG_TASKS_RUDE_RCU
@@ -1475,69 +1466,6 @@ void __init rcu_tasks_trace_suppress_unused(void)
 #endif // #ifndef CONFIG_TINY_RCU
 }
 
-/*
- * Do a cmpxchg() on ->trc_reader_special.b.need_qs, allowing for
- * the four-byte operand-size restriction of some platforms.
- *
- * Returns the old value, which is often ignored.
- */
-u8 rcu_trc_cmpxchg_need_qs(struct task_struct *t, u8 old, u8 new)
-{
-	return cmpxchg(&t->trc_reader_special.b.need_qs, old, new);
-}
-EXPORT_SYMBOL_GPL(rcu_trc_cmpxchg_need_qs);
-
-/* Add a newly blocked reader task to its CPU's list. */
-void rcu_tasks_trace_qs_blkd(struct task_struct *t)
-{
-}
-EXPORT_SYMBOL_GPL(rcu_tasks_trace_qs_blkd);
-
-/* Communicate task state back to the RCU tasks trace stall warning request. */
-struct trc_stall_chk_rdr {
-	int nesting;
-	int ipi_to_cpu;
-	u8 needqs;
-};
-
-/* Report any needed quiescent state for this exiting task. */
-static void exit_tasks_rcu_finish_trace(struct task_struct *t)
-{
-}
-
-int rcu_tasks_trace_lazy_ms = -1;
-module_param(rcu_tasks_trace_lazy_ms, int, 0444);
-
-static int __init rcu_spawn_tasks_trace_kthread(void)
-{
-	return 0;
-}
-
-#if !defined(CONFIG_TINY_RCU)
-void show_rcu_tasks_trace_gp_kthread(void)
-{
-}
-EXPORT_SYMBOL_GPL(show_rcu_tasks_trace_gp_kthread);
-
-void rcu_tasks_trace_torture_stats_print(char *tt, char *tf)
-{
-}
-EXPORT_SYMBOL_GPL(rcu_tasks_trace_torture_stats_print);
-#endif // !defined(CONFIG_TINY_RCU)
-
-struct task_struct *get_rcu_tasks_trace_gp_kthread(void)
-{
-	return NULL;
-}
-EXPORT_SYMBOL_GPL(get_rcu_tasks_trace_gp_kthread);
-
-void rcu_tasks_trace_get_gp_data(int *flags, unsigned long *gp_seq)
-{
-}
-EXPORT_SYMBOL_GPL(rcu_tasks_trace_get_gp_data);
-
-#else /* #ifdef CONFIG_TASKS_TRACE_RCU */
-static void exit_tasks_rcu_finish_trace(struct task_struct *t) { }
 #endif /* #else #ifdef CONFIG_TASKS_TRACE_RCU */
 
 #ifndef CONFIG_TINY_RCU
@@ -1545,7 +1473,6 @@ void show_rcu_tasks_gp_kthreads(void)
 {
 	show_rcu_tasks_classic_gp_kthread();
 	show_rcu_tasks_rude_gp_kthread();
-	show_rcu_tasks_trace_gp_kthread();
 }
 #endif /* #ifndef CONFIG_TINY_RCU */
 
@@ -1684,10 +1611,6 @@ static int __init rcu_init_tasks_generic(void)
 	rcu_spawn_tasks_rude_kthread();
 #endif
 
-#ifdef CONFIG_TASKS_TRACE_RCU
-	rcu_spawn_tasks_trace_kthread();
-#endif
-
 	// Run the self-tests.
 	rcu_tasks_initiate_self_tests();
 
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TRACE01 b/tools/testing/selftests/rcutorture/configs/rcu/TRACE01
index 85b407467454..18efab346381 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/TRACE01
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TRACE01
@@ -10,5 +10,4 @@ CONFIG_PROVE_LOCKING=n
 #CHECK#CONFIG_PROVE_RCU=n
 CONFIG_FORCE_TASKS_TRACE_RCU=y
 #CHECK#CONFIG_TASKS_TRACE_RCU=y
-CONFIG_TASKS_TRACE_RCU_READ_MB=y
 CONFIG_RCU_EXPERT=y
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TRACE02 b/tools/testing/selftests/rcutorture/configs/rcu/TRACE02
index 9003c56cd764..8da390e82829 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/TRACE02
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TRACE02
@@ -9,6 +9,5 @@ CONFIG_PROVE_LOCKING=y
 #CHECK#CONFIG_PROVE_RCU=y
 CONFIG_FORCE_TASKS_TRACE_RCU=y
 #CHECK#CONFIG_TASKS_TRACE_RCU=y
-CONFIG_TASKS_TRACE_RCU_READ_MB=n
 CONFIG_RCU_EXPERT=y
 CONFIG_DEBUG_OBJECTS=y
-- 
cgit v1.2.3


From 3ce40539cc0055b2df49303979c5b6a4a8321be4 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Mon, 29 Dec 2025 11:13:54 -0800
Subject: torture: Parallelize kvm-series.sh guest-OS execution

Currently, kvm-series.sh builds and runs serially, which makes for
long execution times.  This commit changes its logic to build all of
the needed kernels serially, but then run the corresponding guest OSes
concurrently in batches using the entire machine.  On large systems,
this results in order-of-magnitude speedups of the guest-OS execution
portion of the runtime.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
---
 .../testing/selftests/rcutorture/bin/kvm-series.sh | 174 ++++++++++++++++++---
 1 file changed, 153 insertions(+), 21 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/rcutorture/bin/kvm-series.sh b/tools/testing/selftests/rcutorture/bin/kvm-series.sh
index 2ff905a1853b..6729687861f2 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm-series.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm-series.sh
@@ -15,7 +15,7 @@
 # This script is intended to replace kvm-check-branches.sh by providing
 # ease of use and faster execution.
 
-T="`mktemp -d ${TMPDIR-/tmp}/kvm-series.sh.XXXXXX`"
+T="`mktemp -d ${TMPDIR-/tmp}/kvm-series.sh.XXXXXX`"; export T
 trap 'rm -rf $T' 0
 
 scriptname=$0
@@ -53,40 +53,62 @@ shift
 
 RCUTORTURE="`pwd`/tools/testing/selftests/rcutorture"; export RCUTORTURE
 PATH=${RCUTORTURE}/bin:$PATH; export PATH
+RES="${RCUTORTURE}/res"; export RES
 . functions.sh
 
 ret=0
-nfail=0
+nbuildfail=0
+nrunfail=0
 nsuccess=0
-faillist=
+ncpus=0
+buildfaillist=
+runfaillist=
 successlist=
 cursha1="`git rev-parse --abbrev-ref HEAD`"
 ds="`date +%Y.%m.%d-%H.%M.%S`-series"
+DS="${RES}/${ds}"; export DS
 startdate="`date`"
 starttime="`get_starttime`"
 
 echo " --- " $scriptname $args | tee -a $T/log
 echo " --- Results directory: " $ds | tee -a $T/log
 
+# Do all builds.  Iterate through commits within a given scenario
+# because builds normally go faster from one commit to the next within a
+# given scenario.  In contrast, switching scenarios on each rebuild will
+# often force a full rebuild due to Kconfig differences, for example,
+# turning preemption on and off.  Defer actual runs in order to run
+# lots of them concurrently on large systems.
+touch $T/torunlist
 for config in ${config_list}
 do
 	sha_n=0
 	for sha in ${sha1_list}
 	do
 		sha1=${sha_n}.${sha} # Enable "sort -k1nr" to list commits in order.
+		echo
 		echo Starting ${config}/${sha1} at `date` | tee -a $T/log
-		git checkout "${sha}"
-		time tools/testing/selftests/rcutorture/bin/kvm.sh --configs "$config" --datestamp "$ds/${config}/${sha1}" --duration 1 "$@"
+		git checkout --detach "${sha}"
+		tools/testing/selftests/rcutorture/bin/kvm.sh --configs "$config" --datestamp "$ds/${config}/${sha1}" --duration 1 --build-only --trust-make "$@"
 		curret=$?
 		if test "${curret}" -ne 0
 		then
-			nfail=$((nfail+1))
-			faillist="$faillist ${config}/${sha1}(${curret})"
+			nbuildfail=$((nbuildfail+1))
+			buildfaillist="$buildfaillist ${config}/${sha1}(${curret})"
 		else
-			nsuccess=$((nsuccess+1))
-			successlist="$successlist ${config}/${sha1}"
-			# Successful run, so remove large files.
-			rm -f ${RCUTORTURE}/$ds/${config}/${sha1}/{vmlinux,bzImage,System.map,Module.symvers}
+			batchncpus="`grep -v "^# cpus=" "${DS}/${config}/${sha1}/batches" | awk '{ sum += $3 } END { print sum }'`"
+			echo run_one_qemu ${sha_n} ${config}/${sha1} ${batchncpus} >> $T/torunlist
+			if test "${ncpus}" -eq 0
+			then
+				ncpus="`grep "^# cpus=" "${DS}/${config}/${sha1}/batches" | sed -e 's/^# cpus=//'`"
+				case "${ncpus}" in
+				^[0-9]*$)
+					;;
+				*)
+					ncpus=0
+					;;
+				esac
+			fi
 		fi
 		if test "${ret}" -eq 0
 		then
@@ -95,22 +117,132 @@ do
 		sha_n=$((sha_n+1))
 	done
 done
+
+# If the user did not specify the number of CPUs, use them all.
+if test "${ncpus}" -eq 0
+then
+	ncpus="`identify_qemu_vcpus`"
+fi
+
+cpusused=0
+touch $T/successlistfile
+touch $T/faillistfile
+
+# do_run_one_qemu ds resultsdir qemu_curout
+#
+# Start the specified qemu run and record its success or failure.
+do_run_one_qemu () {
+	local ret
+	local ds="$1"
+	local resultsdir="$2"
+	local qemu_curout="$3"
+
+	tools/testing/selftests/rcutorture/bin/kvm-again.sh "${DS}/${resultsdir}" --link inplace-force > ${qemu_curout} 2>&1
+	ret=$?
+	if test "${ret}" -eq 0
+	then
+		echo ${resultsdir} >> $T/successlistfile
+		# Successful run, so remove large files.
+		rm -f ${DS}/${resultsdir}/{vmlinux,bzImage,System.map,Module.symvers}
+	else
+		echo "${resultsdir}(${ret})" >> $T/faillistfile
+	fi
+}
+
+# cleanup_qemu_batch batchncpus
+#
+# Update success and failure lists, files, and counts at the end of
+# a batch.
+cleanup_qemu_batch () {
+	local batchncpus="$1"
+
+	echo Waiting, cpusused=${cpusused}, ncpus=${ncpus} `date` | tee -a $T/log
+	wait
+	cpusused="${batchncpus}"
+	nsuccessbatch="`wc -l $T/successlistfile | awk '{ print $1 }'`"
+	nsuccess=$((nsuccess+nsuccessbatch))
+	successlist="$successlist `cat $T/successlistfile`"
+	rm $T/successlistfile
+	touch $T/successlistfile
+	nfailbatch="`wc -l $T/faillistfile | awk '{ print $1 }'`"
+	nrunfail=$((nrunfail+nfailbatch))
+	runfaillist="$runfaillist `cat $T/faillistfile`"
+	rm $T/faillistfile
+	touch $T/faillistfile
+}
+
+# run_one_qemu sha_n config/sha1 batchncpus
+#
+# Launch into the background the sha_n-th qemu job whose results directory
+# is config/sha1 and which uses batchncpus CPUs.  Once we reach a job that
+# would overflow the number of available CPUs, wait for the previous jobs
+# to complete and record their results.
+run_one_qemu () {
+	local sha_n="$1"
+	local config_sha1="$2"
+	local batchncpus="$3"
+	local qemu_curout
+
+	cpusused=$((cpusused+batchncpus))
+	if test "${cpusused}" -gt $ncpus
+	then
+		cleanup_qemu_batch "${batchncpus}"
+	fi
+	echo Starting ${config_sha1} using ${batchncpus} CPUs `date`
+	qemu_curout="${DS}/${config_sha1}/qemu-series"
+	do_run_one_qemu "$ds" "${config_sha1}" ${qemu_curout} &
+}
+
+# Re-ordering the runs will mess up the affinity chosen at build time
+# (among other things, over-using CPU 0), so suppress it.
+TORTURE_NO_AFFINITY="no-affinity"; export TORTURE_NO_AFFINITY
+
+# Run the kernels (if any) that built correctly.
+echo | tee -a $T/log # Put a blank line between build and run messages.
+. $T/torunlist
+cleanup_qemu_batch "${batchncpus}"
+
+# Get back to initial checkout/SHA-1.
 git checkout "${cursha1}"
 
-echo ${nsuccess} SUCCESSES: | tee -a $T/log
-echo ${successlist} | fmt | tee -a $T/log
-echo | tee -a $T/log
-echo ${nfail} FAILURES: | tee -a $T/log
-echo ${faillist} | fmt | tee -a $T/log
-if test -n "${faillist}"
+# Throw away leading and trailing space characters for fmt.
+successlist="`echo ${successlist} | sed -e 's/^ *//' -e 's/ *$//'`"
+buildfaillist="`echo ${buildfaillist} | sed -e 's/^ *//' -e 's/ *$//'`"
+runfaillist="`echo ${runfaillist} | sed -e 's/^ *//' -e 's/ *$//'`"
+
+# Print lists of successes, build failures, and run failures, if any.
+if test "${nsuccess}" -gt 0
+then
+	echo | tee -a $T/log
+	echo ${nsuccess} SUCCESSES: | tee -a $T/log
+	echo ${successlist} | fmt | tee -a $T/log
+fi
+if test "${nbuildfail}" -gt 0
 then
 	echo | tee -a $T/log
-	echo Failures across commits: | tee -a $T/log
-	echo ${faillist} | tr ' ' '\012' | sed -e 's,^[^/]*/,,' -e 's/([0-9]*)//' |
+	echo ${nbuildfail} BUILD FAILURES: | tee -a $T/log
+	echo ${buildfaillist} | fmt | tee -a $T/log
+fi
+if test "${nrunfail}" -gt 0
+then
+	echo | tee -a $T/log
+	echo ${nrunfail} RUN FAILURES: | tee -a $T/log
+	echo ${runfaillist} | fmt | tee -a $T/log
+fi
+
+# If there were build or runtime failures, map them to commits.
+if test "${nbuildfail}" -gt 0 || test "${nrunfail}" -gt 0
+then
+	echo | tee -a $T/log
+	echo Build failures across commits: | tee -a $T/log
+	echo ${buildfaillist} | tr ' ' '\012' | sed -e 's,^[^/]*/,,' -e 's/([0-9]*)//' |
 		sort | uniq -c | sort -k2n | tee -a $T/log
 fi
+
+# Print run summary.
+echo | tee -a $T/log
 echo Started at $startdate, ended at `date`, duration `get_starttime_duration $starttime`. | tee -a $T/log
-echo Summary: Successes: ${nsuccess} Failures: ${nfail} | tee -a $T/log
-cp $T/log tools/testing/selftests/rcutorture/res/${ds}
+echo Summary: Successes: ${nsuccess} " "Build Failures: ${nbuildfail} " "Runtime Failures: ${nrunfail}| tee -a $T/log
+cp $T/log ${DS}
 
 exit "${ret}"
-- 
cgit v1.2.3


From 672621773f7df8cda2ff5635edac4aa5339d097f Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Mon, 29 Dec 2025 11:13:55 -0800
Subject: torture: Make kvm-series.sh give build numbers and totals

The kvm-series.sh script can easily be convinced to do on the order
of 1,000 builds, so some sort of progress indicator would be helpful.
This commit therefore updates the "Starting" output lines to read
as in the following example, adding the ("2 of 4"):

Starting TREE01/1.7e0ad1b49057 (2 of 4) at Sat Nov 8 10:08:21 PM PST 2025

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
---
 tools/testing/selftests/rcutorture/bin/kvm-series.sh | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/rcutorture/bin/kvm-series.sh b/tools/testing/selftests/rcutorture/bin/kvm-series.sh
index 6729687861f2..a00d2e96f6cc 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm-series.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm-series.sh
@@ -32,6 +32,7 @@ then
 	echo "$0: Repetition ('*') not allowed in config list."
 	exit 1
 fi
+config_list_len="`echo ${config_list} | wc -w | awk '{ print $1; }'`"
 
 commit_list="${2}"
 if test -z "${commit_list}"
@@ -47,6 +48,7 @@ then
 	exit 2
 fi
 sha1_list=`cat $T/commits`
+sha1_list_len="`echo ${sha1_list} | wc -w | awk '{ print $1; }'`"
 
 shift
 shift
@@ -80,6 +82,8 @@ echo " --- Results directory: " $ds | tee -a $T/log
 # turning preemption on and off.  Defer actual runs in order to run
 # lots of them concurrently on large systems.
 touch $T/torunlist
+n2build="$((config_list_len*sha1_list_len))"
+nbuilt=0
 for config in ${config_list}
 do
 	sha_n=0
@@ -87,7 +91,7 @@ do
 	do
 		sha1=${sha_n}.${sha} # Enable "sort -k1nr" to list commits in order.
 		echo
-		echo Starting ${config}/${sha1} at `date` | tee -a $T/log
+		echo Starting ${config}/${sha1} "($((nbuilt+1)) of ${n2build})" at `date` | tee -a $T/log
 		git checkout --detach "${sha}"
 		tools/testing/selftests/rcutorture/bin/kvm.sh --configs "$config" --datestamp "$ds/${config}/${sha1}" --duration 1 --build-only --trust-make "$@"
 		curret=$?
@@ -115,6 +119,7 @@ do
 			ret=${curret}
 		fi
 		sha_n=$((sha_n+1))
+		nbuilt=$((nbuilt+1))
 	done
 done
 
-- 
cgit v1.2.3


From 3d69b6beb8ba932911096138394b5cd3e21b3b92 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Mon, 29 Dec 2025 11:13:56 -0800
Subject: torture: Make kvm-series.sh give run numbers and totals

The kvm-series.sh script can easily be convinced to run on the order of
1,000 guest OSes, so some sort of progress indicator would be helpful.
This commit therefore updates the "Starting" output lines to read as in
the following example, adding the ("3 of 4"):

Starting TREE02/1.7e0ad1b49057 using 8 CPUs (4 of 4) Sat Nov 8 10:51:06 PM PST 2025

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
---
 tools/testing/selftests/rcutorture/bin/kvm-series.sh | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/rcutorture/bin/kvm-series.sh b/tools/testing/selftests/rcutorture/bin/kvm-series.sh
index a00d2e96f6cc..c4ee5f910931 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm-series.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm-series.sh
@@ -132,6 +132,8 @@ fi
 cpusused=0
 touch $T/successlistfile
 touch $T/faillistfile
+n2run="`wc -l $T/torunlist | awk '{ print $1; }'`"
+nrun=0
 
 # do_run_one_qemu ds resultsdir qemu_curout
 #
@@ -193,9 +195,10 @@ run_one_qemu () {
 	then
 		cleanup_qemu_batch "${batchncpus}"
 	fi
-	echo Starting ${config_sha1} using ${batchncpus} CPUs `date`
+	echo Starting ${config_sha1} using ${batchncpus} CPUs "($((nrun+1)) of ${n2run})" `date`
 	qemu_curout="${DS}/${config_sha1}/qemu-series"
 	do_run_one_qemu "$ds" "${config_sha1}" ${qemu_curout} &
+	nrun="$((nrun+1))"
 }
 
 # Re-ordering the runs will mess up the affinity chosen at build time
-- 
cgit v1.2.3


From dcd6067322ba6750995fbc5486d7c9ada88489ff Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Mon, 29 Dec 2025 11:13:57 -0800
Subject: torture: Make config2csv.sh properly handle comments in .boot files

As in strip the "#" and everything after it and *then* tokenize.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
---
 tools/testing/selftests/rcutorture/bin/config2csv.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/rcutorture/bin/config2csv.sh b/tools/testing/selftests/rcutorture/bin/config2csv.sh
index 0cf55f1bf654..aeab4d6f11ad 100755
--- a/tools/testing/selftests/rcutorture/bin/config2csv.sh
+++ b/tools/testing/selftests/rcutorture/bin/config2csv.sh
@@ -42,7 +42,7 @@ do
 	grep -v '^#' < $i | grep -v '^ *$' > $T/p
 	if test -r $i.boot
 	then
-		tr -s ' ' '\012' < $i.boot | grep -v '^#' >> $T/p
+		sed -e 's/#.*$//' < $i.boot | tr -s ' ' '\012' >> $T/p
 	fi
 	sed -e 's/^[^=]*$/&=?/' < $T/p |
 	sed -e 's/^\([^=]*\)=\(.*\)$/\tp["\1:'"$i"'"] = "\2";\n\tc["\1"] = 1;/' >> $T/p.awk
-- 
cgit v1.2.3


From c89474b9b2ab8ab2c0d2cddadbed781c0f5e8f0c Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Mon, 29 Dec 2025 11:13:58 -0800
Subject: torture: Include commit discription in testid.txt

Currently, the testid.txt file in the top-level directory of the
rcutorture results contains the output of "git rev-parse HEAD", which
just gives the full SHA-1 of the current commit.  This is followed by
the output of "git status", which is further followed by the output of
"git diff".  This works, but is less than helpful to human readers
scanning a list of commits.

This commit therefore instead uses "git show --oneline --no-patch HEAD",
which provides a short SHA-1, but also the names of any branches and
the commit's title.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
---
 tools/testing/selftests/rcutorture/bin/mktestid.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/rcutorture/bin/mktestid.sh b/tools/testing/selftests/rcutorture/bin/mktestid.sh
index 16f9907a4dae..24f6261dab6a 100755
--- a/tools/testing/selftests/rcutorture/bin/mktestid.sh
+++ b/tools/testing/selftests/rcutorture/bin/mktestid.sh
@@ -18,7 +18,7 @@ fi
 echo Build directory: `pwd` > ${resdir}/testid.txt
 if test -d .git
 then
-	echo Current commit: `git rev-parse HEAD` >> ${resdir}/testid.txt
+	echo Current commit: `git show --oneline --no-patch HEAD` >> ${resdir}/testid.txt
 	echo >> ${resdir}/testid.txt
 	echo ' ---' Output of "'"git status"'": >> ${resdir}/testid.txt
 	git status >> ${resdir}/testid.txt
-- 
cgit v1.2.3


From 7646c7afd9a95db0b0cb4ad066ed90f6024da67d Mon Sep 17 00:00:00 2001
From: Puranjay Mohan <puranjay@kernel.org>
Date: Fri, 2 Jan 2026 10:00:28 -0800
Subject: bpf: Remove redundant KF_TRUSTED_ARGS flag from all kfuncs

Now that KF_TRUSTED_ARGS is the default for all kfuncs, remove the
explicit KF_TRUSTED_ARGS flag from all kfunc definitions and remove the
flag itself.

Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Reviewed-by: Emil Tsalapatis <emil@etsalapatis.com>
Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
Link: https://lore.kernel.org/r/20260102180038.2708325-3-puranjay@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 fs/bpf_fs_kfuncs.c                                   | 13 ++++++-------
 fs/verity/measure.c                                  |  2 +-
 include/linux/bpf.h                                  |  2 +-
 include/linux/btf.h                                  |  3 +--
 kernel/bpf/arena.c                                   |  6 +++---
 kernel/bpf/cpumask.c                                 |  2 +-
 kernel/bpf/helpers.c                                 | 20 ++++++++++----------
 kernel/bpf/map_iter.c                                |  2 +-
 kernel/bpf/verifier.c                                |  2 +-
 kernel/sched/ext.c                                   |  8 ++++----
 mm/bpf_memcontrol.c                                  | 10 +++++-----
 net/core/filter.c                                    | 10 +++++-----
 net/core/xdp.c                                       |  2 +-
 net/netfilter/nf_conntrack_bpf.c                     |  8 ++++----
 net/netfilter/nf_flow_table_bpf.c                    |  2 +-
 net/netfilter/nf_nat_bpf.c                           |  2 +-
 net/sched/bpf_qdisc.c                                | 12 ++++++------
 tools/testing/selftests/bpf/progs/cpumask_failure.c  |  2 +-
 tools/testing/selftests/bpf/test_kmods/bpf_testmod.c | 20 ++++++++++----------
 19 files changed, 63 insertions(+), 65 deletions(-)

(limited to 'tools')

diff --git a/fs/bpf_fs_kfuncs.c b/fs/bpf_fs_kfuncs.c
index abd5eaa4892e..e4e51a1d0de2 100644
--- a/fs/bpf_fs_kfuncs.c
+++ b/fs/bpf_fs_kfuncs.c
@@ -356,14 +356,13 @@ __bpf_kfunc int bpf_cgroup_read_xattr(struct cgroup *cgroup, const char *name__s
 __bpf_kfunc_end_defs();
 
 BTF_KFUNCS_START(bpf_fs_kfunc_set_ids)
-BTF_ID_FLAGS(func, bpf_get_task_exe_file,
-	     KF_ACQUIRE | KF_TRUSTED_ARGS | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_get_task_exe_file, KF_ACQUIRE | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_put_file, KF_RELEASE)
-BTF_ID_FLAGS(func, bpf_path_d_path, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_get_dentry_xattr, KF_SLEEPABLE | KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_get_file_xattr, KF_SLEEPABLE | KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_set_dentry_xattr, KF_SLEEPABLE | KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_remove_dentry_xattr, KF_SLEEPABLE | KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_path_d_path)
+BTF_ID_FLAGS(func, bpf_get_dentry_xattr, KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_get_file_xattr, KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_set_dentry_xattr, KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_remove_dentry_xattr, KF_SLEEPABLE)
 BTF_KFUNCS_END(bpf_fs_kfunc_set_ids)
 
 static int bpf_fs_kfuncs_filter(const struct bpf_prog *prog, u32 kfunc_id)
diff --git a/fs/verity/measure.c b/fs/verity/measure.c
index 388734132f01..6a35623ebdf0 100644
--- a/fs/verity/measure.c
+++ b/fs/verity/measure.c
@@ -162,7 +162,7 @@ __bpf_kfunc int bpf_get_fsverity_digest(struct file *file, struct bpf_dynptr *di
 __bpf_kfunc_end_defs();
 
 BTF_KFUNCS_START(fsverity_set_ids)
-BTF_ID_FLAGS(func, bpf_get_fsverity_digest, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_get_fsverity_digest)
 BTF_KFUNCS_END(fsverity_set_ids)
 
 static int bpf_get_fsverity_digest_filter(const struct bpf_prog *prog, u32 kfunc_id)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 4e7d72dfbcd4..9efb2ddf331c 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -753,7 +753,7 @@ enum bpf_type_flag {
 	MEM_ALLOC		= BIT(11 + BPF_BASE_TYPE_BITS),
 
 	/* PTR was passed from the kernel in a trusted context, and may be
-	 * passed to KF_TRUSTED_ARGS kfuncs or BPF helper functions.
+	 * passed to kfuncs or BPF helper functions.
 	 * Confusingly, this is _not_ the opposite of PTR_UNTRUSTED above.
 	 * PTR_UNTRUSTED refers to a kptr that was read directly from a map
 	 * without invoking bpf_kptr_xchg(). What we really need to know is
diff --git a/include/linux/btf.h b/include/linux/btf.h
index f06976ffb63f..691f09784933 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -34,7 +34,7 @@
  *
  * And the following kfunc:
  *
- *	BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS)
+ *	BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE)
  *
  * All invocations to the kfunc must pass the unmodified, unwalked task:
  *
@@ -66,7 +66,6 @@
  *	return 0;
  * }
  */
-#define KF_TRUSTED_ARGS (1 << 4) /* kfunc only takes trusted pointer arguments */
 #define KF_SLEEPABLE    (1 << 5) /* kfunc may sleep */
 #define KF_DESTRUCTIVE  (1 << 6) /* kfunc performs destructive actions */
 #define KF_RCU          (1 << 7) /* kfunc takes either rcu or trusted pointer arguments */
diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c
index 456ac989269d..2274319a95e6 100644
--- a/kernel/bpf/arena.c
+++ b/kernel/bpf/arena.c
@@ -890,9 +890,9 @@ __bpf_kfunc int bpf_arena_reserve_pages(void *p__map, void *ptr__ign, u32 page_c
 __bpf_kfunc_end_defs();
 
 BTF_KFUNCS_START(arena_kfuncs)
-BTF_ID_FLAGS(func, bpf_arena_alloc_pages, KF_TRUSTED_ARGS | KF_ARENA_RET | KF_ARENA_ARG2)
-BTF_ID_FLAGS(func, bpf_arena_free_pages, KF_TRUSTED_ARGS | KF_ARENA_ARG2)
-BTF_ID_FLAGS(func, bpf_arena_reserve_pages, KF_TRUSTED_ARGS | KF_ARENA_ARG2)
+BTF_ID_FLAGS(func, bpf_arena_alloc_pages, KF_ARENA_RET | KF_ARENA_ARG2)
+BTF_ID_FLAGS(func, bpf_arena_free_pages, KF_ARENA_ARG2)
+BTF_ID_FLAGS(func, bpf_arena_reserve_pages, KF_ARENA_ARG2)
 BTF_KFUNCS_END(arena_kfuncs)
 
 static const struct btf_kfunc_id_set common_kfunc_set = {
diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c
index 9876c5fe6c2a..b8c805b4b06a 100644
--- a/kernel/bpf/cpumask.c
+++ b/kernel/bpf/cpumask.c
@@ -477,7 +477,7 @@ __bpf_kfunc_end_defs();
 BTF_KFUNCS_START(cpumask_kfunc_btf_ids)
 BTF_ID_FLAGS(func, bpf_cpumask_create, KF_ACQUIRE | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_cpumask_release, KF_RELEASE)
-BTF_ID_FLAGS(func, bpf_cpumask_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_cpumask_acquire, KF_ACQUIRE)
 BTF_ID_FLAGS(func, bpf_cpumask_first, KF_RCU)
 BTF_ID_FLAGS(func, bpf_cpumask_first_zero, KF_RCU)
 BTF_ID_FLAGS(func, bpf_cpumask_first_and, KF_RCU)
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index db72b96f9c8c..2c15f77c74db 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -4427,7 +4427,7 @@ BTF_ID_FLAGS(func, bpf_task_from_pid, KF_ACQUIRE | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_task_from_vpid, KF_ACQUIRE | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_throw)
 #ifdef CONFIG_BPF_EVENTS
-BTF_ID_FLAGS(func, bpf_send_signal_task, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_send_signal_task)
 #endif
 #ifdef CONFIG_KEYS
 BTF_ID_FLAGS(func, bpf_lookup_user_key, KF_ACQUIRE | KF_RET_NULL | KF_SLEEPABLE)
@@ -4467,14 +4467,14 @@ BTF_ID_FLAGS(func, bpf_iter_task_vma_new, KF_ITER_NEW | KF_RCU)
 BTF_ID_FLAGS(func, bpf_iter_task_vma_next, KF_ITER_NEXT | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_iter_task_vma_destroy, KF_ITER_DESTROY)
 #ifdef CONFIG_CGROUPS
-BTF_ID_FLAGS(func, bpf_iter_css_task_new, KF_ITER_NEW | KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_iter_css_task_new, KF_ITER_NEW)
 BTF_ID_FLAGS(func, bpf_iter_css_task_next, KF_ITER_NEXT | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_iter_css_task_destroy, KF_ITER_DESTROY)
-BTF_ID_FLAGS(func, bpf_iter_css_new, KF_ITER_NEW | KF_TRUSTED_ARGS | KF_RCU_PROTECTED)
+BTF_ID_FLAGS(func, bpf_iter_css_new, KF_ITER_NEW | KF_RCU_PROTECTED)
 BTF_ID_FLAGS(func, bpf_iter_css_next, KF_ITER_NEXT | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_iter_css_destroy, KF_ITER_DESTROY)
 #endif
-BTF_ID_FLAGS(func, bpf_iter_task_new, KF_ITER_NEW | KF_TRUSTED_ARGS | KF_RCU_PROTECTED)
+BTF_ID_FLAGS(func, bpf_iter_task_new, KF_ITER_NEW | KF_RCU_PROTECTED)
 BTF_ID_FLAGS(func, bpf_iter_task_next, KF_ITER_NEXT | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_iter_task_destroy, KF_ITER_DESTROY)
 BTF_ID_FLAGS(func, bpf_dynptr_adjust)
@@ -4510,8 +4510,8 @@ BTF_ID_FLAGS(func, bpf_probe_read_user_str_dynptr)
 BTF_ID_FLAGS(func, bpf_probe_read_kernel_str_dynptr)
 BTF_ID_FLAGS(func, bpf_copy_from_user_dynptr, KF_SLEEPABLE)
 BTF_ID_FLAGS(func, bpf_copy_from_user_str_dynptr, KF_SLEEPABLE)
-BTF_ID_FLAGS(func, bpf_copy_from_user_task_dynptr, KF_SLEEPABLE | KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_copy_from_user_task_str_dynptr, KF_SLEEPABLE | KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_copy_from_user_task_dynptr, KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_copy_from_user_task_str_dynptr, KF_SLEEPABLE)
 #endif
 #ifdef CONFIG_DMA_SHARED_BUFFER
 BTF_ID_FLAGS(func, bpf_iter_dmabuf_new, KF_ITER_NEW | KF_SLEEPABLE)
@@ -4536,10 +4536,10 @@ BTF_ID_FLAGS(func, bpf_strncasestr);
 #if defined(CONFIG_BPF_LSM) && defined(CONFIG_CGROUPS)
 BTF_ID_FLAGS(func, bpf_cgroup_read_xattr, KF_RCU)
 #endif
-BTF_ID_FLAGS(func, bpf_stream_vprintk_impl, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_task_work_schedule_signal_impl, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_task_work_schedule_resume_impl, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_dynptr_from_file, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_stream_vprintk_impl)
+BTF_ID_FLAGS(func, bpf_task_work_schedule_signal_impl)
+BTF_ID_FLAGS(func, bpf_task_work_schedule_resume_impl)
+BTF_ID_FLAGS(func, bpf_dynptr_from_file)
 BTF_ID_FLAGS(func, bpf_dynptr_file_discard)
 BTF_KFUNCS_END(common_btf_ids)
 
diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c
index 9575314f40a6..261a03ea73d3 100644
--- a/kernel/bpf/map_iter.c
+++ b/kernel/bpf/map_iter.c
@@ -214,7 +214,7 @@ __bpf_kfunc s64 bpf_map_sum_elem_count(const struct bpf_map *map)
 __bpf_kfunc_end_defs();
 
 BTF_KFUNCS_START(bpf_map_iter_kfunc_ids)
-BTF_ID_FLAGS(func, bpf_map_sum_elem_count, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_map_sum_elem_count)
 BTF_KFUNCS_END(bpf_map_iter_kfunc_ids)
 
 static const struct btf_kfunc_id_set bpf_map_iter_kfunc_set = {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 359a962d69a1..c9da70dd3e72 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -12619,7 +12619,7 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env,
 
 	/* Enforce strict type matching for calls to kfuncs that are acquiring
 	 * or releasing a reference, or are no-cast aliases. We do _not_
-	 * enforce strict matching for plain KF_TRUSTED_ARGS kfuncs by default,
+	 * enforce strict matching for kfuncs by default,
 	 * as we want to enable BPF programs to pass types that are bitwise
 	 * equivalent without forcing them to explicitly cast with something
 	 * like bpf_cast_to_kern_ctx().
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 94164f2dec6d..fd5423428dde 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -7229,9 +7229,9 @@ BTF_ID_FLAGS(func, scx_bpf_dsq_peek, KF_RCU_PROTECTED | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_iter_scx_dsq_new, KF_ITER_NEW | KF_RCU_PROTECTED)
 BTF_ID_FLAGS(func, bpf_iter_scx_dsq_next, KF_ITER_NEXT | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_iter_scx_dsq_destroy, KF_ITER_DESTROY)
-BTF_ID_FLAGS(func, scx_bpf_exit_bstr, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, scx_bpf_error_bstr, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, scx_bpf_dump_bstr, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, scx_bpf_exit_bstr)
+BTF_ID_FLAGS(func, scx_bpf_error_bstr)
+BTF_ID_FLAGS(func, scx_bpf_dump_bstr)
 BTF_ID_FLAGS(func, scx_bpf_reenqueue_local___v2)
 BTF_ID_FLAGS(func, scx_bpf_cpuperf_cap)
 BTF_ID_FLAGS(func, scx_bpf_cpuperf_cur)
@@ -7250,7 +7250,7 @@ BTF_ID_FLAGS(func, scx_bpf_cpu_curr, KF_RET_NULL | KF_RCU_PROTECTED)
 BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_RCU | KF_ACQUIRE)
 #endif
 BTF_ID_FLAGS(func, scx_bpf_now)
-BTF_ID_FLAGS(func, scx_bpf_events, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, scx_bpf_events)
 BTF_KFUNCS_END(scx_kfunc_ids_any)
 
 static const struct btf_kfunc_id_set scx_kfunc_set_any = {
diff --git a/mm/bpf_memcontrol.c b/mm/bpf_memcontrol.c
index e8fa7f5855f9..716df49d7647 100644
--- a/mm/bpf_memcontrol.c
+++ b/mm/bpf_memcontrol.c
@@ -166,11 +166,11 @@ BTF_ID_FLAGS(func, bpf_get_root_mem_cgroup, KF_ACQUIRE | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_get_mem_cgroup, KF_ACQUIRE | KF_RET_NULL | KF_RCU)
 BTF_ID_FLAGS(func, bpf_put_mem_cgroup, KF_RELEASE)
 
-BTF_ID_FLAGS(func, bpf_mem_cgroup_vm_events, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_mem_cgroup_memory_events, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_mem_cgroup_usage, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_mem_cgroup_page_state, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_mem_cgroup_flush_stats, KF_TRUSTED_ARGS | KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_mem_cgroup_vm_events)
+BTF_ID_FLAGS(func, bpf_mem_cgroup_memory_events)
+BTF_ID_FLAGS(func, bpf_mem_cgroup_usage)
+BTF_ID_FLAGS(func, bpf_mem_cgroup_page_state)
+BTF_ID_FLAGS(func, bpf_mem_cgroup_flush_stats, KF_SLEEPABLE)
 
 BTF_KFUNCS_END(bpf_memcontrol_kfuncs)
 
diff --git a/net/core/filter.c b/net/core/filter.c
index 616e0520a0bb..d43df98e1ded 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -12438,11 +12438,11 @@ int bpf_dynptr_from_skb_rdonly(struct __sk_buff *skb, u64 flags,
 }
 
 BTF_KFUNCS_START(bpf_kfunc_check_set_skb)
-BTF_ID_FLAGS(func, bpf_dynptr_from_skb, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_dynptr_from_skb)
 BTF_KFUNCS_END(bpf_kfunc_check_set_skb)
 
 BTF_KFUNCS_START(bpf_kfunc_check_set_skb_meta)
-BTF_ID_FLAGS(func, bpf_dynptr_from_skb_meta, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_dynptr_from_skb_meta)
 BTF_KFUNCS_END(bpf_kfunc_check_set_skb_meta)
 
 BTF_KFUNCS_START(bpf_kfunc_check_set_xdp)
@@ -12455,11 +12455,11 @@ BTF_ID_FLAGS(func, bpf_sock_addr_set_sun_path)
 BTF_KFUNCS_END(bpf_kfunc_check_set_sock_addr)
 
 BTF_KFUNCS_START(bpf_kfunc_check_set_tcp_reqsk)
-BTF_ID_FLAGS(func, bpf_sk_assign_tcp_reqsk, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_sk_assign_tcp_reqsk)
 BTF_KFUNCS_END(bpf_kfunc_check_set_tcp_reqsk)
 
 BTF_KFUNCS_START(bpf_kfunc_check_set_sock_ops)
-BTF_ID_FLAGS(func, bpf_sock_ops_enable_tx_tstamp, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_sock_ops_enable_tx_tstamp)
 BTF_KFUNCS_END(bpf_kfunc_check_set_sock_ops)
 
 static const struct btf_kfunc_id_set bpf_kfunc_set_skb = {
@@ -12554,7 +12554,7 @@ __bpf_kfunc int bpf_sock_destroy(struct sock_common *sock)
 __bpf_kfunc_end_defs();
 
 BTF_KFUNCS_START(bpf_sk_iter_kfunc_ids)
-BTF_ID_FLAGS(func, bpf_sock_destroy, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_sock_destroy)
 BTF_KFUNCS_END(bpf_sk_iter_kfunc_ids)
 
 static int tracing_iter_filter(const struct bpf_prog *prog, u32 kfunc_id)
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 9100e160113a..fee6d080ee85 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -964,7 +964,7 @@ __bpf_kfunc int bpf_xdp_metadata_rx_vlan_tag(const struct xdp_md *ctx,
 __bpf_kfunc_end_defs();
 
 BTF_KFUNCS_START(xdp_metadata_kfunc_ids)
-#define XDP_METADATA_KFUNC(_, __, name, ___) BTF_ID_FLAGS(func, name, KF_TRUSTED_ARGS)
+#define XDP_METADATA_KFUNC(_, __, name, ___) BTF_ID_FLAGS(func, name)
 XDP_METADATA_KFUNC_xxx
 #undef XDP_METADATA_KFUNC
 BTF_KFUNCS_END(xdp_metadata_kfunc_ids)
diff --git a/net/netfilter/nf_conntrack_bpf.c b/net/netfilter/nf_conntrack_bpf.c
index 4a136fc3a9c0..a630139bd0c3 100644
--- a/net/netfilter/nf_conntrack_bpf.c
+++ b/net/netfilter/nf_conntrack_bpf.c
@@ -516,10 +516,10 @@ BTF_ID_FLAGS(func, bpf_skb_ct_alloc, KF_ACQUIRE | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_skb_ct_lookup, KF_ACQUIRE | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_ct_insert_entry, KF_ACQUIRE | KF_RET_NULL | KF_RELEASE)
 BTF_ID_FLAGS(func, bpf_ct_release, KF_RELEASE)
-BTF_ID_FLAGS(func, bpf_ct_set_timeout, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_ct_change_timeout, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_ct_set_status, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_ct_change_status, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_ct_set_timeout)
+BTF_ID_FLAGS(func, bpf_ct_change_timeout)
+BTF_ID_FLAGS(func, bpf_ct_set_status)
+BTF_ID_FLAGS(func, bpf_ct_change_status)
 BTF_KFUNCS_END(nf_ct_kfunc_set)
 
 static const struct btf_kfunc_id_set nf_conntrack_kfunc_set = {
diff --git a/net/netfilter/nf_flow_table_bpf.c b/net/netfilter/nf_flow_table_bpf.c
index 4a5f5195f2d2..cbd5b97a6329 100644
--- a/net/netfilter/nf_flow_table_bpf.c
+++ b/net/netfilter/nf_flow_table_bpf.c
@@ -105,7 +105,7 @@ __diag_pop()
 __bpf_kfunc_end_defs();
 
 BTF_KFUNCS_START(nf_ft_kfunc_set)
-BTF_ID_FLAGS(func, bpf_xdp_flow_lookup, KF_TRUSTED_ARGS | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_xdp_flow_lookup, KF_RET_NULL)
 BTF_KFUNCS_END(nf_ft_kfunc_set)
 
 static const struct btf_kfunc_id_set nf_flow_kfunc_set = {
diff --git a/net/netfilter/nf_nat_bpf.c b/net/netfilter/nf_nat_bpf.c
index 481be15609b1..f9dd85ccea01 100644
--- a/net/netfilter/nf_nat_bpf.c
+++ b/net/netfilter/nf_nat_bpf.c
@@ -55,7 +55,7 @@ __bpf_kfunc int bpf_ct_set_nat_info(struct nf_conn___init *nfct,
 __bpf_kfunc_end_defs();
 
 BTF_KFUNCS_START(nf_nat_kfunc_set)
-BTF_ID_FLAGS(func, bpf_ct_set_nat_info, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_ct_set_nat_info)
 BTF_KFUNCS_END(nf_nat_kfunc_set)
 
 static const struct btf_kfunc_id_set nf_bpf_nat_kfunc_set = {
diff --git a/net/sched/bpf_qdisc.c b/net/sched/bpf_qdisc.c
index adcb618a2bfc..b9771788b9b3 100644
--- a/net/sched/bpf_qdisc.c
+++ b/net/sched/bpf_qdisc.c
@@ -271,14 +271,14 @@ __bpf_kfunc void bpf_qdisc_bstats_update(struct Qdisc *sch, const struct sk_buff
 __bpf_kfunc_end_defs();
 
 BTF_KFUNCS_START(qdisc_kfunc_ids)
-BTF_ID_FLAGS(func, bpf_skb_get_hash, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_skb_get_hash)
 BTF_ID_FLAGS(func, bpf_kfree_skb, KF_RELEASE)
 BTF_ID_FLAGS(func, bpf_qdisc_skb_drop, KF_RELEASE)
-BTF_ID_FLAGS(func, bpf_dynptr_from_skb, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_qdisc_watchdog_schedule, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_qdisc_init_prologue, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_qdisc_reset_destroy_epilogue, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_qdisc_bstats_update, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_dynptr_from_skb)
+BTF_ID_FLAGS(func, bpf_qdisc_watchdog_schedule)
+BTF_ID_FLAGS(func, bpf_qdisc_init_prologue)
+BTF_ID_FLAGS(func, bpf_qdisc_reset_destroy_epilogue)
+BTF_ID_FLAGS(func, bpf_qdisc_bstats_update)
 BTF_KFUNCS_END(qdisc_kfunc_ids)
 
 BTF_SET_START(qdisc_common_kfunc_set)
diff --git a/tools/testing/selftests/bpf/progs/cpumask_failure.c b/tools/testing/selftests/bpf/progs/cpumask_failure.c
index 8a2fd596c8a3..61c32e91e8c3 100644
--- a/tools/testing/selftests/bpf/progs/cpumask_failure.c
+++ b/tools/testing/selftests/bpf/progs/cpumask_failure.c
@@ -110,7 +110,7 @@ SEC("tp_btf/task_newtask")
 __failure __msg("NULL pointer passed to trusted arg0")
 int BPF_PROG(test_cpumask_null, struct task_struct *task, u64 clone_flags)
 {
-  /* NULL passed to KF_TRUSTED_ARGS kfunc. */
+  /* NULL passed to kfunc. */
 	bpf_cpumask_empty(NULL);
 
 	return 0;
diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c
index 90c4b1a51de6..1c41d03bd5a1 100644
--- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c
+++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c
@@ -693,9 +693,9 @@ BTF_ID_FLAGS(func, bpf_kfunc_dynptr_test)
 BTF_ID_FLAGS(func, bpf_kfunc_nested_acquire_nonzero_offset_test, KF_ACQUIRE)
 BTF_ID_FLAGS(func, bpf_kfunc_nested_acquire_zero_offset_test, KF_ACQUIRE)
 BTF_ID_FLAGS(func, bpf_kfunc_nested_release_test, KF_RELEASE)
-BTF_ID_FLAGS(func, bpf_kfunc_trusted_vma_test, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_kfunc_trusted_task_test, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_kfunc_trusted_num_test, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_kfunc_trusted_vma_test)
+BTF_ID_FLAGS(func, bpf_kfunc_trusted_task_test)
+BTF_ID_FLAGS(func, bpf_kfunc_trusted_num_test)
 BTF_ID_FLAGS(func, bpf_kfunc_rcu_task_test, KF_RCU)
 BTF_ID_FLAGS(func, bpf_kfunc_ret_rcu_test, KF_RET_NULL | KF_RCU_PROTECTED)
 BTF_ID_FLAGS(func, bpf_kfunc_ret_rcu_test_nostruct, KF_RET_NULL | KF_RCU_PROTECTED)
@@ -1158,7 +1158,7 @@ BTF_ID_FLAGS(func, bpf_kfunc_call_test_pass2)
 BTF_ID_FLAGS(func, bpf_kfunc_call_test_fail1)
 BTF_ID_FLAGS(func, bpf_kfunc_call_test_fail2)
 BTF_ID_FLAGS(func, bpf_kfunc_call_test_fail3)
-BTF_ID_FLAGS(func, bpf_kfunc_call_test_ref, KF_TRUSTED_ARGS | KF_RCU)
+BTF_ID_FLAGS(func, bpf_kfunc_call_test_ref, KF_RCU)
 BTF_ID_FLAGS(func, bpf_kfunc_call_test_destructive, KF_DESTRUCTIVE)
 BTF_ID_FLAGS(func, bpf_kfunc_call_test_static_unused_arg)
 BTF_ID_FLAGS(func, bpf_kfunc_call_test_offset)
@@ -1172,12 +1172,12 @@ BTF_ID_FLAGS(func, bpf_kfunc_call_kernel_sendmsg, KF_SLEEPABLE)
 BTF_ID_FLAGS(func, bpf_kfunc_call_sock_sendmsg, KF_SLEEPABLE)
 BTF_ID_FLAGS(func, bpf_kfunc_call_kernel_getsockname, KF_SLEEPABLE)
 BTF_ID_FLAGS(func, bpf_kfunc_call_kernel_getpeername, KF_SLEEPABLE)
-BTF_ID_FLAGS(func, bpf_kfunc_st_ops_test_prologue, KF_TRUSTED_ARGS | KF_SLEEPABLE)
-BTF_ID_FLAGS(func, bpf_kfunc_st_ops_test_epilogue, KF_TRUSTED_ARGS | KF_SLEEPABLE)
-BTF_ID_FLAGS(func, bpf_kfunc_st_ops_test_pro_epilogue, KF_TRUSTED_ARGS | KF_SLEEPABLE)
-BTF_ID_FLAGS(func, bpf_kfunc_st_ops_inc10, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_kfunc_multi_st_ops_test_1, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_kfunc_multi_st_ops_test_1_impl, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_kfunc_st_ops_test_prologue, KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_kfunc_st_ops_test_epilogue, KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_kfunc_st_ops_test_pro_epilogue, KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_kfunc_st_ops_inc10)
+BTF_ID_FLAGS(func, bpf_kfunc_multi_st_ops_test_1)
+BTF_ID_FLAGS(func, bpf_kfunc_multi_st_ops_test_1_impl)
 BTF_KFUNCS_END(bpf_testmod_check_kfunc_ids)
 
 static int bpf_testmod_ops_init(struct btf *btf)
-- 
cgit v1.2.3


From df5004579bbdfe4c734f2cbbf3f44fe3fac440a3 Mon Sep 17 00:00:00 2001
From: Puranjay Mohan <puranjay@kernel.org>
Date: Fri, 2 Jan 2026 10:00:32 -0800
Subject: selftests: bpf: Update kfunc_param_nullable test for new error
 message

With trusted args now being the default, the NULL pointer check runs
before type-specific validation. Update test3 to expect the new error
message "Possibly NULL pointer passed to trusted arg0" instead of the
old dynptr-specific error message.

Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Reviewed-by: Emil Tsalapatis <emil@etsalapatis.com>
Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
Link: https://lore.kernel.org/r/20260102180038.2708325-7-puranjay@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/progs/test_kfunc_param_nullable.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/progs/test_kfunc_param_nullable.c b/tools/testing/selftests/bpf/progs/test_kfunc_param_nullable.c
index 0ad1bf1ede8d..967081bbcfe1 100644
--- a/tools/testing/selftests/bpf/progs/test_kfunc_param_nullable.c
+++ b/tools/testing/selftests/bpf/progs/test_kfunc_param_nullable.c
@@ -29,7 +29,7 @@ int kfunc_dynptr_nullable_test2(struct __sk_buff *skb)
 }
 
 SEC("tc")
-__failure __msg("expected pointer to stack or const struct bpf_dynptr")
+__failure __msg("Possibly NULL pointer passed to trusted arg0")
 int kfunc_dynptr_nullable_test3(struct __sk_buff *skb)
 {
 	struct bpf_dynptr data;
-- 
cgit v1.2.3


From 03cc77b10e009ce87f1a8e93454aadf2912a4c15 Mon Sep 17 00:00:00 2001
From: Puranjay Mohan <puranjay@kernel.org>
Date: Fri, 2 Jan 2026 10:00:33 -0800
Subject: selftests: bpf: Update failure message for rbtree_fail

The rbtree_api_use_unchecked_remove_retval() selftest passes a pointer
received from bpf_rbtree_remove() to bpf_rbtree_add() without checking
for NULL, this was earlier caught by __check_ptr_off_reg() in the
verifier. Now the verifier assumes every kfunc only takes trusted pointer
arguments, so it catches this NULL pointer earlier in the path and
provides a more accurate failure message.

Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Reviewed-by: Emil Tsalapatis <emil@etsalapatis.com>
Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
Link: https://lore.kernel.org/r/20260102180038.2708325-8-puranjay@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/progs/rbtree_fail.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/progs/rbtree_fail.c b/tools/testing/selftests/bpf/progs/rbtree_fail.c
index 4acb6af2dfe3..70b7baf9304b 100644
--- a/tools/testing/selftests/bpf/progs/rbtree_fail.c
+++ b/tools/testing/selftests/bpf/progs/rbtree_fail.c
@@ -153,7 +153,7 @@ long rbtree_api_add_to_multiple_trees(void *ctx)
 }
 
 SEC("?tc")
-__failure __msg("dereference of modified ptr_or_null_ ptr R2 off=16 disallowed")
+__failure __msg("Possibly NULL pointer passed to trusted arg1")
 long rbtree_api_use_unchecked_remove_retval(void *ctx)
 {
 	struct bpf_rb_node *res;
-- 
cgit v1.2.3


From 230b0118e416583a53fc0ad5d1fecb37f496fe34 Mon Sep 17 00:00:00 2001
From: Puranjay Mohan <puranjay@kernel.org>
Date: Fri, 2 Jan 2026 10:00:34 -0800
Subject: selftests: bpf: fix test_kfunc_dynptr_param

As verifier now assumes that all kfuncs only takes trusted pointer
arguments, passing 0 (NULL) to a kfunc that doesn't mark the argument as
__nullable or __opt will be rejected with a failure message of: Possibly
NULL pointer passed to trusted arg<n>

Pass a non-null value to the kfunc to test the expected failure mode.

Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Reviewed-by: Emil Tsalapatis <emil@etsalapatis.com>
Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
Link: https://lore.kernel.org/r/20260102180038.2708325-9-puranjay@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/progs/test_kfunc_dynptr_param.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/progs/test_kfunc_dynptr_param.c b/tools/testing/selftests/bpf/progs/test_kfunc_dynptr_param.c
index 061befb004c2..d249113ed657 100644
--- a/tools/testing/selftests/bpf/progs/test_kfunc_dynptr_param.c
+++ b/tools/testing/selftests/bpf/progs/test_kfunc_dynptr_param.c
@@ -48,10 +48,9 @@ SEC("?lsm.s/bpf")
 __failure __msg("arg#0 expected pointer to stack or const struct bpf_dynptr")
 int BPF_PROG(not_ptr_to_stack, int cmd, union bpf_attr *attr, unsigned int size, bool kernel)
 {
-	unsigned long val = 0;
+	static struct bpf_dynptr val;
 
-	return bpf_verify_pkcs7_signature((struct bpf_dynptr *)val,
-					  (struct bpf_dynptr *)val, NULL);
+	return bpf_verify_pkcs7_signature(&val, &val, NULL);
 }
 
 SEC("lsm.s/bpf")
-- 
cgit v1.2.3


From cf82580c86a91de2aa979260985cadcb39ed28d2 Mon Sep 17 00:00:00 2001
From: Puranjay Mohan <puranjay@kernel.org>
Date: Fri, 2 Jan 2026 10:00:35 -0800
Subject: selftests: bpf: fix cgroup_hierarchical_stats

The cgroup_hierarchical_stats selftests uses an fentry program attached
to cgroup_attach_task and then passes the received &dst_cgrp->self to
the css_rstat_updated() kfunc. The verifier now assumes that all kfuncs
only takes trusted pointer arguments, and pointers received by fentry
are not marked trustes by default.

Use a tp_btf program in place for fentry for this test, pointers
received by tp_btf programs are marked trusted by the verifier.

Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Reviewed-by: Emil Tsalapatis <emil@etsalapatis.com>
Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
Link: https://lore.kernel.org/r/20260102180038.2708325-10-puranjay@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/progs/cgroup_hierarchical_stats.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/progs/cgroup_hierarchical_stats.c b/tools/testing/selftests/bpf/progs/cgroup_hierarchical_stats.c
index ff189a736ad8..8fc38592a87b 100644
--- a/tools/testing/selftests/bpf/progs/cgroup_hierarchical_stats.c
+++ b/tools/testing/selftests/bpf/progs/cgroup_hierarchical_stats.c
@@ -62,9 +62,9 @@ static int create_attach_counter(__u64 cg_id, __u64 state, __u64 pending)
 				   &init, BPF_NOEXIST);
 }
 
-SEC("fentry/cgroup_attach_task")
-int BPF_PROG(counter, struct cgroup *dst_cgrp, struct task_struct *leader,
-	     bool threadgroup)
+SEC("tp_btf/cgroup_attach_task")
+int BPF_PROG(counter, struct cgroup *dst_cgrp, const char *path,
+	     struct task_struct *task, bool threadgroup)
 {
 	__u64 cg_id = cgroup_id(dst_cgrp);
 	struct percpu_attach_counter *pcpu_counter = bpf_map_lookup_elem(
-- 
cgit v1.2.3


From cf503eb2c6c38bf449063f33790a96218a067718 Mon Sep 17 00:00:00 2001
From: Puranjay Mohan <puranjay@kernel.org>
Date: Fri, 2 Jan 2026 10:00:36 -0800
Subject: selftests: bpf: Fix test_bpf_nf for trusted args becoming default

With trusted args now being the default, passing NULL to kfunc
parameters that are pointers causes verifier rejection rather than a
runtime error. The test_bpf_nf test was failing because it attempted to
pass NULL to bpf_xdp_ct_lookup() to verify runtime error handling.

Since the NULL check now happens at verification time, remove the
runtime test case that passed NULL to the bpf_tuple parameter and
instead add verification-time tests to ensure the verifier correctly
rejects programs that pass NULL to trusted arguments.

Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
Link: https://lore.kernel.org/r/20260102180038.2708325-11-puranjay@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/prog_tests/bpf_nf.c    |  5 +-
 tools/testing/selftests/bpf/progs/test_bpf_nf.c    |  7 ---
 .../testing/selftests/bpf/progs/test_bpf_nf_fail.c | 57 ++++++++++++++++++++++
 3 files changed, 61 insertions(+), 8 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_nf.c b/tools/testing/selftests/bpf/prog_tests/bpf_nf.c
index dd6512fa652b..215878ea04de 100644
--- a/tools/testing/selftests/bpf/prog_tests/bpf_nf.c
+++ b/tools/testing/selftests/bpf/prog_tests/bpf_nf.c
@@ -19,6 +19,10 @@ struct {
 	{ "change_timeout_after_alloc", "kernel function bpf_ct_change_timeout args#0 expected pointer to STRUCT nf_conn but" },
 	{ "change_status_after_alloc", "kernel function bpf_ct_change_status args#0 expected pointer to STRUCT nf_conn but" },
 	{ "write_not_allowlisted_field", "no write support to nf_conn at off" },
+	{ "lookup_null_bpf_tuple", "Possibly NULL pointer passed to trusted arg1" },
+	{ "lookup_null_bpf_opts", "Possibly NULL pointer passed to trusted arg3" },
+	{ "xdp_lookup_null_bpf_tuple", "Possibly NULL pointer passed to trusted arg1" },
+	{ "xdp_lookup_null_bpf_opts", "Possibly NULL pointer passed to trusted arg3" },
 };
 
 enum {
@@ -111,7 +115,6 @@ static void test_bpf_nf_ct(int mode)
 	if (!ASSERT_OK(err, "bpf_prog_test_run"))
 		goto end;
 
-	ASSERT_EQ(skel->bss->test_einval_bpf_tuple, -EINVAL, "Test EINVAL for NULL bpf_tuple");
 	ASSERT_EQ(skel->bss->test_einval_reserved, -EINVAL, "Test EINVAL for reserved not set to 0");
 	ASSERT_EQ(skel->bss->test_einval_reserved_new, -EINVAL, "Test EINVAL for reserved in new struct not set to 0");
 	ASSERT_EQ(skel->bss->test_einval_netns_id, -EINVAL, "Test EINVAL for netns_id < -1");
diff --git a/tools/testing/selftests/bpf/progs/test_bpf_nf.c b/tools/testing/selftests/bpf/progs/test_bpf_nf.c
index f7b330ddd007..076fbf03a126 100644
--- a/tools/testing/selftests/bpf/progs/test_bpf_nf.c
+++ b/tools/testing/selftests/bpf/progs/test_bpf_nf.c
@@ -15,7 +15,6 @@
 
 extern unsigned long CONFIG_HZ __kconfig;
 
-int test_einval_bpf_tuple = 0;
 int test_einval_reserved = 0;
 int test_einval_reserved_new = 0;
 int test_einval_netns_id = 0;
@@ -99,12 +98,6 @@ nf_ct_test(struct nf_conn *(*lookup_fn)(void *, struct bpf_sock_tuple *, u32,
 
 	__builtin_memset(&bpf_tuple, 0, sizeof(bpf_tuple.ipv4));
 
-	ct = lookup_fn(ctx, NULL, 0, &opts_def, sizeof(opts_def));
-	if (ct)
-		bpf_ct_release(ct);
-	else
-		test_einval_bpf_tuple = opts_def.error;
-
 	opts_def.reserved[0] = 1;
 	ct = lookup_fn(ctx, &bpf_tuple, sizeof(bpf_tuple.ipv4), &opts_def,
 		       sizeof(opts_def));
diff --git a/tools/testing/selftests/bpf/progs/test_bpf_nf_fail.c b/tools/testing/selftests/bpf/progs/test_bpf_nf_fail.c
index a586f087ffeb..2c156cd166af 100644
--- a/tools/testing/selftests/bpf/progs/test_bpf_nf_fail.c
+++ b/tools/testing/selftests/bpf/progs/test_bpf_nf_fail.c
@@ -4,6 +4,7 @@
 #include <bpf/bpf_tracing.h>
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_core_read.h>
+#include "bpf_misc.h"
 
 struct nf_conn;
 
@@ -18,6 +19,10 @@ struct nf_conn *bpf_skb_ct_alloc(struct __sk_buff *, struct bpf_sock_tuple *, u3
 				 struct bpf_ct_opts___local *, u32) __ksym;
 struct nf_conn *bpf_skb_ct_lookup(struct __sk_buff *, struct bpf_sock_tuple *, u32,
 				  struct bpf_ct_opts___local *, u32) __ksym;
+struct nf_conn *bpf_xdp_ct_alloc(struct xdp_md *, struct bpf_sock_tuple *, u32,
+				 struct bpf_ct_opts___local *, u32) __ksym;
+struct nf_conn *bpf_xdp_ct_lookup(struct xdp_md *, struct bpf_sock_tuple *, u32,
+				  struct bpf_ct_opts___local *, u32) __ksym;
 struct nf_conn *bpf_ct_insert_entry(struct nf_conn *) __ksym;
 void bpf_ct_release(struct nf_conn *) __ksym;
 void bpf_ct_set_timeout(struct nf_conn *, u32) __ksym;
@@ -146,4 +151,56 @@ int change_status_after_alloc(struct __sk_buff *ctx)
 	return 0;
 }
 
+SEC("?tc")
+__failure __msg("Possibly NULL pointer passed to trusted arg1")
+int lookup_null_bpf_tuple(struct __sk_buff *ctx)
+{
+	struct bpf_ct_opts___local opts = {};
+	struct nf_conn *ct;
+
+	ct = bpf_skb_ct_lookup(ctx, NULL, 0, &opts, sizeof(opts));
+	if (ct)
+		bpf_ct_release(ct);
+	return 0;
+}
+
+SEC("?tc")
+__failure __msg("Possibly NULL pointer passed to trusted arg3")
+int lookup_null_bpf_opts(struct __sk_buff *ctx)
+{
+	struct bpf_sock_tuple tup = {};
+	struct nf_conn *ct;
+
+	ct = bpf_skb_ct_lookup(ctx, &tup, sizeof(tup.ipv4), NULL, sizeof(struct bpf_ct_opts___local));
+	if (ct)
+		bpf_ct_release(ct);
+	return 0;
+}
+
+SEC("?xdp")
+__failure __msg("Possibly NULL pointer passed to trusted arg1")
+int xdp_lookup_null_bpf_tuple(struct xdp_md *ctx)
+{
+	struct bpf_ct_opts___local opts = {};
+	struct nf_conn *ct;
+
+	ct = bpf_xdp_ct_lookup(ctx, NULL, 0, &opts, sizeof(opts));
+	if (ct)
+		bpf_ct_release(ct);
+	return 0;
+}
+
+SEC("?xdp")
+__failure __msg("Possibly NULL pointer passed to trusted arg3")
+int xdp_lookup_null_bpf_opts(struct xdp_md *ctx)
+{
+	struct bpf_sock_tuple tup = {};
+	struct nf_conn *ct;
+
+	ct = bpf_xdp_ct_lookup(ctx, &tup, sizeof(tup.ipv4), NULL, sizeof(struct bpf_ct_opts___local));
+	if (ct)
+		bpf_ct_release(ct);
+	return 0;
+}
+
 char _license[] SEC("license") = "GPL";
-- 
cgit v1.2.3


From ec4bb8e8dfa060c699b548f62e4d56133aafbbec Mon Sep 17 00:00:00 2001
From: Benjamin Berg <benjamin.berg@intel.com>
Date: Wed, 24 Sep 2025 16:20:58 +0200
Subject: tools/nolibc: add ptrace support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add ptrace support, as it will be useful in UML.

Signed-off-by: Benjamin Berg <benjamin.berg@intel.com>
[Thomas: drop va_args usage and linux/uio.h inclusion]
Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
---
 tools/include/nolibc/Makefile                |  1 +
 tools/include/nolibc/nolibc.h                |  1 +
 tools/include/nolibc/sys/ptrace.h            | 33 ++++++++++++++++++++++++++++
 tools/testing/selftests/nolibc/nolibc-test.c |  2 ++
 4 files changed, 37 insertions(+)
 create mode 100644 tools/include/nolibc/sys/ptrace.h

(limited to 'tools')

diff --git a/tools/include/nolibc/Makefile b/tools/include/nolibc/Makefile
index 8118e22844f1..8b883a6fe580 100644
--- a/tools/include/nolibc/Makefile
+++ b/tools/include/nolibc/Makefile
@@ -54,6 +54,7 @@ all_files := \
 		sys/mman.h \
 		sys/mount.h \
 		sys/prctl.h \
+		sys/ptrace.h \
 		sys/random.h \
 		sys/reboot.h \
 		sys/resource.h \
diff --git a/tools/include/nolibc/nolibc.h b/tools/include/nolibc/nolibc.h
index 272dfc961158..9c7f43b9218b 100644
--- a/tools/include/nolibc/nolibc.h
+++ b/tools/include/nolibc/nolibc.h
@@ -101,6 +101,7 @@
 #include "sys/mman.h"
 #include "sys/mount.h"
 #include "sys/prctl.h"
+#include "sys/ptrace.h"
 #include "sys/random.h"
 #include "sys/reboot.h"
 #include "sys/resource.h"
diff --git a/tools/include/nolibc/sys/ptrace.h b/tools/include/nolibc/sys/ptrace.h
new file mode 100644
index 000000000000..72ca28541633
--- /dev/null
+++ b/tools/include/nolibc/sys/ptrace.h
@@ -0,0 +1,33 @@
+/* SPDX-License-Identifier: LGPL-2.1 OR MIT */
+/*
+ * ptrace for NOLIBC
+ * Copyright (C) 2017-2021 Willy Tarreau <w@1wt.eu>
+ * Copyright (C) 2025 Intel Corporation
+ */
+
+/* make sure to include all global symbols */
+#include "../nolibc.h"
+
+#ifndef _NOLIBC_SYS_PTRACE_H
+#define _NOLIBC_SYS_PTRACE_H
+
+#include "../sys.h"
+
+#include <linux/ptrace.h>
+
+/*
+ * long ptrace(int op, pid_t pid, void *addr, void *data);
+ */
+static __attribute__((unused))
+long sys_ptrace(int op, pid_t pid, void *addr, void *data)
+{
+	return my_syscall4(__NR_ptrace, op, pid, addr, data);
+}
+
+static __attribute__((unused))
+ssize_t ptrace(int op, pid_t pid, void *addr, void *data)
+{
+	return __sysret(sys_ptrace(op, pid, addr, data));
+}
+
+#endif /* _NOLIBC_SYS_PTRACE_H */
diff --git a/tools/testing/selftests/nolibc/nolibc-test.c b/tools/testing/selftests/nolibc/nolibc-test.c
index 3c5a226dad3a..6888b20af259 100644
--- a/tools/testing/selftests/nolibc/nolibc-test.c
+++ b/tools/testing/selftests/nolibc/nolibc-test.c
@@ -17,6 +17,7 @@
 #include <sys/mman.h>
 #include <sys/mount.h>
 #include <sys/prctl.h>
+#include <sys/ptrace.h>
 #include <sys/random.h>
 #include <sys/reboot.h>
 #include <sys/resource.h>
@@ -1406,6 +1407,7 @@ int run_syscall(int min, int max)
 		CASE_TEST(readv_zero);        EXPECT_SYSZR(1, readv(1, NULL, 0)); break;
 		CASE_TEST(writev_badf);       EXPECT_SYSER(1, writev(-1, &iov_one, 1), -1, EBADF); break;
 		CASE_TEST(writev_zero);       EXPECT_SYSZR(1, writev(1, NULL, 0)); break;
+		CASE_TEST(ptrace);            EXPECT_SYSER(1, ptrace(PTRACE_CONT, getpid(), NULL, NULL), -1, ESRCH); break;
 		CASE_TEST(syscall_noargs);    EXPECT_SYSEQ(1, syscall(__NR_getpid), getpid()); break;
 		CASE_TEST(syscall_args);      EXPECT_SYSER(1, syscall(__NR_statx, 0, NULL, 0, 0, NULL), -1, EFAULT); break;
 		CASE_TEST(namespace);         EXPECT_SYSZR(euid0 && proc, test_namespace()); break;
-- 
cgit v1.2.3


From cc6809f6728456c03db6750fcc94ed8b581a2cf8 Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <linux@weissschuh.net>
Date: Wed, 3 Dec 2025 20:08:00 +0100
Subject: tools/nolibc: always use 64-bit mode for s390 header checks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

32-bit s390 support was recently removed from nolibc.
If the compiler defaults to 32-bit during the header checks, they fail.

Make sure to always use 64-bit mode for s390 heafer checks.

Fixes: 169ebcbb9082 ("tools: Remove s390 compat support")
Acked-by: Willy Tarreau <w@1wt.eu>
Acked-by: Heiko Carstens <hca@linux.ibm.com>
Link: https://patch.msgid.link/20251203-nolibc-headers-check-s390-v1-1-5d35e52a83ba@weissschuh.net
Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
---
 tools/include/nolibc/Makefile | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/include/nolibc/Makefile b/tools/include/nolibc/Makefile
index 8b883a6fe580..1958dda98895 100644
--- a/tools/include/nolibc/Makefile
+++ b/tools/include/nolibc/Makefile
@@ -104,9 +104,12 @@ headers_standalone: headers
 	$(Q)$(MAKE) -C $(srctree) headers
 	$(Q)$(MAKE) -C $(srctree) headers_install INSTALL_HDR_PATH=$(OUTPUT)sysroot
 
+CFLAGS_s390 := -m64
+CFLAGS := $(CFLAGS_$(ARCH))
+
 headers_check: headers_standalone
 	$(Q)for header in $(filter-out crt.h std.h,$(all_files)); do \
-		$(CC) $(CLANG_CROSS_FLAGS) -Wall -Werror -nostdinc -fsyntax-only -x c /dev/null \
+		$(CC) $(CFLAGS) $(CLANG_CROSS_FLAGS) -Wall -Werror -nostdinc -fsyntax-only -x c /dev/null \
 			-I$(or $(objtree),$(srctree))/usr/include -include $$header -include $$header || exit 1; \
 	done
 
-- 
cgit v1.2.3


From f675e35dd28f1ac326b1d6520fee3605019b381b Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <linux@weissschuh.net>
Date: Sat, 20 Dec 2025 14:55:45 +0100
Subject: tools/nolibc/poll: use kernel types for system call invocations
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The system calls expect 'struct __kernel_old_timespec'.
While currently 'struct __kernel_old_timespec' and 'struct timespec' are
compatible, this is confusing. Especially as future patches will change
the definition of 'struct timespec'.

Use the correct kernel type instead.

Suggested-by: Arnd Bergmann <arnd@arndb.de>
Link: https://lore.kernel.org/lkml/fbca1d3e-12e4-4c4e-8091-87464035fe39@app.fastmail.com/
Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
Acked-by: Willy Tarreau <w@1wt.eu>
Reviewed-by: Arnd Bergmann <arnd@arndb.de>
Link: https://patch.msgid.link/20251220-nolibc-uapi-types-v3-1-c662992f75d7@weissschuh.net
---
 tools/include/nolibc/poll.h       | 2 +-
 tools/include/nolibc/sys/select.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/include/nolibc/poll.h b/tools/include/nolibc/poll.h
index 0d053f93ea99..df952bcf0905 100644
--- a/tools/include/nolibc/poll.h
+++ b/tools/include/nolibc/poll.h
@@ -24,7 +24,7 @@ static __attribute__((unused))
 int sys_poll(struct pollfd *fds, int nfds, int timeout)
 {
 #if defined(__NR_ppoll)
-	struct timespec t;
+	struct __kernel_old_timespec t;
 
 	if (timeout >= 0) {
 		t.tv_sec  = timeout / 1000;
diff --git a/tools/include/nolibc/sys/select.h b/tools/include/nolibc/sys/select.h
index 2a5619c01277..9a29e5b98a3c 100644
--- a/tools/include/nolibc/sys/select.h
+++ b/tools/include/nolibc/sys/select.h
@@ -75,7 +75,7 @@ int sys_select(int nfds, fd_set *rfds, fd_set *wfds, fd_set *efds, struct timeva
 #elif defined(__NR_select)
 	return my_syscall5(__NR_select, nfds, rfds, wfds, efds, timeout);
 #elif defined(__NR_pselect6)
-	struct timespec t;
+	struct __kernel_old_timespec t;
 
 	if (timeout) {
 		t.tv_sec  = timeout->tv_sec;
-- 
cgit v1.2.3


From 548d682649f02f4240e8ea4e99f1899978e1cfe4 Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <linux@weissschuh.net>
Date: Sat, 20 Dec 2025 14:55:46 +0100
Subject: tools/nolibc/poll: drop __NR_poll fallback
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This fallback is never used, remove it.

Suggested-by: Arnd Bergmann <arnd@arndb.de>
Link: https://lore.kernel.org/lkml/fbca1d3e-12e4-4c4e-8091-87464035fe39@app.fastmail.com/
Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
Acked-by: Willy Tarreau <w@1wt.eu>
Reviewed-by: Arnd Bergmann <arnd@arndb.de>
Link: https://patch.msgid.link/20251220-nolibc-uapi-types-v3-2-c662992f75d7@weissschuh.net
---
 tools/include/nolibc/poll.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/include/nolibc/poll.h b/tools/include/nolibc/poll.h
index df952bcf0905..5b4fa339fbb5 100644
--- a/tools/include/nolibc/poll.h
+++ b/tools/include/nolibc/poll.h
@@ -31,7 +31,7 @@ int sys_poll(struct pollfd *fds, int nfds, int timeout)
 		t.tv_nsec = (timeout % 1000) * 1000000;
 	}
 	return my_syscall5(__NR_ppoll, fds, nfds, (timeout >= 0) ? &t : NULL, NULL, 0);
-#elif defined(__NR_ppoll_time64)
+#else
 	struct __kernel_timespec t;
 
 	if (timeout >= 0) {
@@ -39,8 +39,6 @@ int sys_poll(struct pollfd *fds, int nfds, int timeout)
 		t.tv_nsec = (timeout % 1000) * 1000000;
 	}
 	return my_syscall5(__NR_ppoll_time64, fds, nfds, (timeout >= 0) ? &t : NULL, NULL, 0);
-#else
-	return my_syscall3(__NR_poll, fds, nfds, timeout);
 #endif
 }
 
-- 
cgit v1.2.3


From 668e43737279284318064bdd4eab689a3aaed652 Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <linux@weissschuh.net>
Date: Sat, 20 Dec 2025 14:55:47 +0100
Subject: tools/nolibc/select: drop non-pselect based implementations
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

These implementations use the libc 'struct timeval' with system calls
which can lead to type mismatches. Currently this is fine, but will
break with upcoming changes to 'struct timeval'.

If the structure needs to be converted anyways, the implementations
based on pselect can be used for all architectures. This simplifies the
logic.

Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
Acked-by: Willy Tarreau <w@1wt.eu>
Reviewed-by: Arnd Bergmann <arnd@arndb.de>
Link: https://patch.msgid.link/20251220-nolibc-uapi-types-v3-3-c662992f75d7@weissschuh.net
---
 tools/include/nolibc/sys/select.h | 13 +------------
 1 file changed, 1 insertion(+), 12 deletions(-)

(limited to 'tools')

diff --git a/tools/include/nolibc/sys/select.h b/tools/include/nolibc/sys/select.h
index 9a29e5b98a3c..50b77dace7ef 100644
--- a/tools/include/nolibc/sys/select.h
+++ b/tools/include/nolibc/sys/select.h
@@ -63,18 +63,7 @@ typedef struct {
 static __attribute__((unused))
 int sys_select(int nfds, fd_set *rfds, fd_set *wfds, fd_set *efds, struct timeval *timeout)
 {
-#if defined(__ARCH_WANT_SYS_OLD_SELECT) && !defined(__NR__newselect)
-	struct sel_arg_struct {
-		unsigned long n;
-		fd_set *r, *w, *e;
-		struct timeval *t;
-	} arg = { .n = nfds, .r = rfds, .w = wfds, .e = efds, .t = timeout };
-	return my_syscall1(__NR_select, &arg);
-#elif defined(__NR__newselect)
-	return my_syscall5(__NR__newselect, nfds, rfds, wfds, efds, timeout);
-#elif defined(__NR_select)
-	return my_syscall5(__NR_select, nfds, rfds, wfds, efds, timeout);
-#elif defined(__NR_pselect6)
+#if defined(__NR_pselect6)
 	struct __kernel_old_timespec t;
 
 	if (timeout) {
-- 
cgit v1.2.3


From b8f4f5d1b99e2ae73fd448e9bbd16dc244e6586c Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <linux@weissschuh.net>
Date: Sat, 20 Dec 2025 14:55:48 +0100
Subject: tools/nolibc/time: drop invocation of gettimeofday system call
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This invocation uses libc types with a system call. While this works
now, upcoming changes to 'struct timeval' would require type
conversions. If types are converted anyways, the clock_gettime() based
fallback can be used everywhere, simplifying the code.

Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
Acked-by: Willy Tarreau <w@1wt.eu>
Reviewed-by: Arnd Bergmann <arnd@arndb.de>
Link: https://patch.msgid.link/20251220-nolibc-uapi-types-v3-4-c662992f75d7@weissschuh.net
---
 tools/include/nolibc/sys/time.h | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'tools')

diff --git a/tools/include/nolibc/sys/time.h b/tools/include/nolibc/sys/time.h
index 33782a19aae9..171187836e6d 100644
--- a/tools/include/nolibc/sys/time.h
+++ b/tools/include/nolibc/sys/time.h
@@ -22,9 +22,6 @@ static int sys_clock_gettime(clockid_t clockid, struct timespec *tp);
 static __attribute__((unused))
 int sys_gettimeofday(struct timeval *tv, struct timezone *tz)
 {
-#ifdef __NR_gettimeofday
-	return my_syscall2(__NR_gettimeofday, tv, tz);
-#else
 	(void) tz; /* Non-NULL tz is undefined behaviour */
 
 	struct timespec tp;
@@ -37,7 +34,6 @@ int sys_gettimeofday(struct timeval *tv, struct timezone *tz)
 	}
 
 	return ret;
-#endif
 }
 
 static __attribute__((unused))
-- 
cgit v1.2.3


From ba7fd0384530e3dd20ea873aac21c473e3e461ae Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <linux@weissschuh.net>
Date: Sat, 20 Dec 2025 14:55:49 +0100
Subject: tools/nolibc: prefer explicit 64-bit time-related system calls
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Make sure to always use the 64-bit safe system calls
in preparation for 64-bit time_t on 32-bit architectures.

Also prevent issues on kernels which disable CONFIG_COMPAT_32BIT_TIME
and therefore don't provide the 32-bit system calls anymore.

Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
Acked-by: Willy Tarreau <w@1wt.eu>
Reviewed-by: Arnd Bergmann <arnd@arndb.de>
Link: https://patch.msgid.link/20251220-nolibc-uapi-types-v3-5-c662992f75d7@weissschuh.net
---
 tools/include/nolibc/poll.h        | 10 +++++-----
 tools/include/nolibc/sys/select.h  | 10 +++++-----
 tools/include/nolibc/sys/timerfd.h | 12 ++++++------
 tools/include/nolibc/time.h        | 36 ++++++++++++++++++------------------
 4 files changed, 34 insertions(+), 34 deletions(-)

(limited to 'tools')

diff --git a/tools/include/nolibc/poll.h b/tools/include/nolibc/poll.h
index 5b4fa339fbb5..e854c94647b1 100644
--- a/tools/include/nolibc/poll.h
+++ b/tools/include/nolibc/poll.h
@@ -23,22 +23,22 @@
 static __attribute__((unused))
 int sys_poll(struct pollfd *fds, int nfds, int timeout)
 {
-#if defined(__NR_ppoll)
-	struct __kernel_old_timespec t;
+#if defined(__NR_ppoll_time64)
+	struct __kernel_timespec t;
 
 	if (timeout >= 0) {
 		t.tv_sec  = timeout / 1000;
 		t.tv_nsec = (timeout % 1000) * 1000000;
 	}
-	return my_syscall5(__NR_ppoll, fds, nfds, (timeout >= 0) ? &t : NULL, NULL, 0);
+	return my_syscall5(__NR_ppoll_time64, fds, nfds, (timeout >= 0) ? &t : NULL, NULL, 0);
 #else
-	struct __kernel_timespec t;
+	struct __kernel_old_timespec t;
 
 	if (timeout >= 0) {
 		t.tv_sec  = timeout / 1000;
 		t.tv_nsec = (timeout % 1000) * 1000000;
 	}
-	return my_syscall5(__NR_ppoll_time64, fds, nfds, (timeout >= 0) ? &t : NULL, NULL, 0);
+	return my_syscall5(__NR_ppoll, fds, nfds, (timeout >= 0) ? &t : NULL, NULL, 0);
 #endif
 }
 
diff --git a/tools/include/nolibc/sys/select.h b/tools/include/nolibc/sys/select.h
index 50b77dace7ef..f8870ad49687 100644
--- a/tools/include/nolibc/sys/select.h
+++ b/tools/include/nolibc/sys/select.h
@@ -63,22 +63,22 @@ typedef struct {
 static __attribute__((unused))
 int sys_select(int nfds, fd_set *rfds, fd_set *wfds, fd_set *efds, struct timeval *timeout)
 {
-#if defined(__NR_pselect6)
-	struct __kernel_old_timespec t;
+#if defined(__NR_pselect6_time64)
+	struct __kernel_timespec t;
 
 	if (timeout) {
 		t.tv_sec  = timeout->tv_sec;
 		t.tv_nsec = timeout->tv_usec * 1000;
 	}
-	return my_syscall6(__NR_pselect6, nfds, rfds, wfds, efds, timeout ? &t : NULL, NULL);
+	return my_syscall6(__NR_pselect6_time64, nfds, rfds, wfds, efds, timeout ? &t : NULL, NULL);
 #else
-	struct __kernel_timespec t;
+	struct __kernel_old_timespec t;
 
 	if (timeout) {
 		t.tv_sec  = timeout->tv_sec;
 		t.tv_nsec = timeout->tv_usec * 1000;
 	}
-	return my_syscall6(__NR_pselect6_time64, nfds, rfds, wfds, efds, timeout ? &t : NULL, NULL);
+	return my_syscall6(__NR_pselect6, nfds, rfds, wfds, efds, timeout ? &t : NULL, NULL);
 #endif
 }
 
diff --git a/tools/include/nolibc/sys/timerfd.h b/tools/include/nolibc/sys/timerfd.h
index 5dd61030c991..66f779553d31 100644
--- a/tools/include/nolibc/sys/timerfd.h
+++ b/tools/include/nolibc/sys/timerfd.h
@@ -32,9 +32,7 @@ int timerfd_create(int clockid, int flags)
 static __attribute__((unused))
 int sys_timerfd_gettime(int fd, struct itimerspec *curr_value)
 {
-#if defined(__NR_timerfd_gettime)
-	return my_syscall2(__NR_timerfd_gettime, fd, curr_value);
-#else
+#if defined(__NR_timerfd_gettime64)
 	struct __kernel_itimerspec kcurr_value;
 	int ret;
 
@@ -42,6 +40,8 @@ int sys_timerfd_gettime(int fd, struct itimerspec *curr_value)
 	__nolibc_timespec_kernel_to_user(&kcurr_value.it_interval, &curr_value->it_interval);
 	__nolibc_timespec_kernel_to_user(&kcurr_value.it_value, &curr_value->it_value);
 	return ret;
+#else
+	return my_syscall2(__NR_timerfd_gettime, fd, curr_value);
 #endif
 }
 
@@ -56,9 +56,7 @@ static __attribute__((unused))
 int sys_timerfd_settime(int fd, int flags,
 			const struct itimerspec *new_value, struct itimerspec *old_value)
 {
-#if defined(__NR_timerfd_settime)
-	return my_syscall4(__NR_timerfd_settime, fd, flags, new_value, old_value);
-#else
+#if defined(__NR_timerfd_settime64)
 	struct __kernel_itimerspec knew_value, kold_value;
 	int ret;
 
@@ -70,6 +68,8 @@ int sys_timerfd_settime(int fd, int flags,
 		__nolibc_timespec_kernel_to_user(&kold_value.it_value, &old_value->it_value);
 	}
 	return ret;
+#else
+	return my_syscall4(__NR_timerfd_settime, fd, flags, new_value, old_value);
 #endif
 }
 
diff --git a/tools/include/nolibc/time.h b/tools/include/nolibc/time.h
index 48e78f8becf9..45df9b09d7b6 100644
--- a/tools/include/nolibc/time.h
+++ b/tools/include/nolibc/time.h
@@ -43,9 +43,7 @@ void __nolibc_timespec_kernel_to_user(const struct __kernel_timespec *kts, struc
 static __attribute__((unused))
 int sys_clock_getres(clockid_t clockid, struct timespec *res)
 {
-#if defined(__NR_clock_getres)
-	return my_syscall2(__NR_clock_getres, clockid, res);
-#else
+#if defined(__NR_clock_getres_time64)
 	struct __kernel_timespec kres;
 	int ret;
 
@@ -53,6 +51,8 @@ int sys_clock_getres(clockid_t clockid, struct timespec *res)
 	if (res)
 		__nolibc_timespec_kernel_to_user(&kres, res);
 	return ret;
+#else
+	return my_syscall2(__NR_clock_getres, clockid, res);
 #endif
 }
 
@@ -65,9 +65,7 @@ int clock_getres(clockid_t clockid, struct timespec *res)
 static __attribute__((unused))
 int sys_clock_gettime(clockid_t clockid, struct timespec *tp)
 {
-#if defined(__NR_clock_gettime)
-	return my_syscall2(__NR_clock_gettime, clockid, tp);
-#else
+#if defined(__NR_clock_gettime64)
 	struct __kernel_timespec ktp;
 	int ret;
 
@@ -75,6 +73,8 @@ int sys_clock_gettime(clockid_t clockid, struct timespec *tp)
 	if (tp)
 		__nolibc_timespec_kernel_to_user(&ktp, tp);
 	return ret;
+#else
+	return my_syscall2(__NR_clock_gettime, clockid, tp);
 #endif
 }
 
@@ -87,13 +87,13 @@ int clock_gettime(clockid_t clockid, struct timespec *tp)
 static __attribute__((unused))
 int sys_clock_settime(clockid_t clockid, struct timespec *tp)
 {
-#if defined(__NR_clock_settime)
-	return my_syscall2(__NR_clock_settime, clockid, tp);
-#else
+#if defined(__NR_clock_settime64)
 	struct __kernel_timespec ktp;
 
 	__nolibc_timespec_user_to_kernel(tp, &ktp);
 	return my_syscall2(__NR_clock_settime64, clockid, &ktp);
+#else
+	return my_syscall2(__NR_clock_settime, clockid, tp);
 #endif
 }
 
@@ -107,9 +107,7 @@ static __attribute__((unused))
 int sys_clock_nanosleep(clockid_t clockid, int flags, const struct timespec *rqtp,
 			struct timespec *rmtp)
 {
-#if defined(__NR_clock_nanosleep)
-	return my_syscall4(__NR_clock_nanosleep, clockid, flags, rqtp, rmtp);
-#else
+#if defined(__NR_clock_nanosleep_time64)
 	struct __kernel_timespec krqtp, krmtp;
 	int ret;
 
@@ -118,6 +116,8 @@ int sys_clock_nanosleep(clockid_t clockid, int flags, const struct timespec *rqt
 	if (rmtp)
 		__nolibc_timespec_kernel_to_user(&krmtp, rmtp);
 	return ret;
+#else
+	return my_syscall4(__NR_clock_nanosleep, clockid, flags, rqtp, rmtp);
 #endif
 }
 
@@ -189,9 +189,7 @@ int timer_delete(timer_t timerid)
 static __attribute__((unused))
 int sys_timer_gettime(timer_t timerid, struct itimerspec *curr_value)
 {
-#if defined(__NR_timer_gettime)
-	return my_syscall2(__NR_timer_gettime, timerid, curr_value);
-#else
+#if defined(__NR_timer_gettime64)
 	struct __kernel_itimerspec kcurr_value;
 	int ret;
 
@@ -199,6 +197,8 @@ int sys_timer_gettime(timer_t timerid, struct itimerspec *curr_value)
 	__nolibc_timespec_kernel_to_user(&kcurr_value.it_interval, &curr_value->it_interval);
 	__nolibc_timespec_kernel_to_user(&kcurr_value.it_value, &curr_value->it_value);
 	return ret;
+#else
+	return my_syscall2(__NR_timer_gettime, timerid, curr_value);
 #endif
 }
 
@@ -212,9 +212,7 @@ static __attribute__((unused))
 int sys_timer_settime(timer_t timerid, int flags,
 		      const struct itimerspec *new_value, struct itimerspec *old_value)
 {
-#if defined(__NR_timer_settime)
-	return my_syscall4(__NR_timer_settime, timerid, flags, new_value, old_value);
-#else
+#if defined(__NR_timer_settime64)
 	struct __kernel_itimerspec knew_value, kold_value;
 	int ret;
 
@@ -226,6 +224,8 @@ int sys_timer_settime(timer_t timerid, int flags,
 		__nolibc_timespec_kernel_to_user(&kold_value.it_value, &old_value->it_value);
 	}
 	return ret;
+#else
+	return my_syscall4(__NR_timer_settime, timerid, flags, new_value, old_value);
 #endif
 }
 
-- 
cgit v1.2.3


From 7efd15d22a9b7e62d0659471bde25c35ef50c9e5 Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <linux@weissschuh.net>
Date: Sat, 20 Dec 2025 14:55:50 +0100
Subject: tools/nolibc/gettimeofday: avoid libgcc 64-bit divisions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

timespec::tv_nsec is going to be 64-bit wide even on 32-bit
architectures. As not all architectures support 64-bit division
instructions, calls to libgcc (__divdi3()) may be emitted by the
compiler which are not provided by nolibc.

As tv_nsec is guaranteed to always fit into an uint32_t, perform a
32-bit division instead.

Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
Reviewed-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Willy Tarreau <w@1wt.eu>
Link: https://patch.msgid.link/20251220-nolibc-uapi-types-v3-6-c662992f75d7@weissschuh.net
---
 tools/include/nolibc/sys/time.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/include/nolibc/sys/time.h b/tools/include/nolibc/sys/time.h
index 171187836e6d..afdb7e326df1 100644
--- a/tools/include/nolibc/sys/time.h
+++ b/tools/include/nolibc/sys/time.h
@@ -30,7 +30,7 @@ int sys_gettimeofday(struct timeval *tv, struct timezone *tz)
 	ret = sys_clock_gettime(CLOCK_REALTIME, &tp);
 	if (!ret && tv) {
 		tv->tv_sec = tp.tv_sec;
-		tv->tv_usec = tp.tv_nsec / 1000;
+		tv->tv_usec = (uint32_t)tp.tv_nsec / 1000;
 	}
 
 	return ret;
-- 
cgit v1.2.3


From 47c17d97681d9c5d080acfdb273fa0856c930e74 Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <linux@weissschuh.net>
Date: Sat, 20 Dec 2025 14:55:51 +0100
Subject: tools/nolibc/select: avoid libgcc 64-bit multiplications
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

timeval::tv_usec is going to be 64-bit wide even on 32-bit
architectures. As not all architectures support 64-bit multiplications
instructions, calls to libgcc (__multi3()) may be emitted by the
compiler which are not provided by nolibc.

As tv_usec and tv_nsec are guaranteed to always fit into an uint32_t,
perform a 32-bit multiplication instead.

Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
Acked-by: Willy Tarreau <w@1wt.eu>
Reviewed-by: Arnd Bergmann <arnd@arndb.de>
Link: https://patch.msgid.link/20251220-nolibc-uapi-types-v3-7-c662992f75d7@weissschuh.net
---
 tools/include/nolibc/sys/select.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/include/nolibc/sys/select.h b/tools/include/nolibc/sys/select.h
index f8870ad49687..80cb3755ba18 100644
--- a/tools/include/nolibc/sys/select.h
+++ b/tools/include/nolibc/sys/select.h
@@ -68,7 +68,7 @@ int sys_select(int nfds, fd_set *rfds, fd_set *wfds, fd_set *efds, struct timeva
 
 	if (timeout) {
 		t.tv_sec  = timeout->tv_sec;
-		t.tv_nsec = timeout->tv_usec * 1000;
+		t.tv_nsec = (uint32_t)timeout->tv_usec * 1000;
 	}
 	return my_syscall6(__NR_pselect6_time64, nfds, rfds, wfds, efds, timeout ? &t : NULL, NULL);
 #else
@@ -76,7 +76,7 @@ int sys_select(int nfds, fd_set *rfds, fd_set *wfds, fd_set *efds, struct timeva
 
 	if (timeout) {
 		t.tv_sec  = timeout->tv_sec;
-		t.tv_nsec = timeout->tv_usec * 1000;
+		t.tv_nsec = (uint32_t)timeout->tv_usec * 1000;
 	}
 	return my_syscall6(__NR_pselect6, nfds, rfds, wfds, efds, timeout ? &t : NULL, NULL);
 #endif
-- 
cgit v1.2.3


From f5aa863aea6c1ec20b85cc0b0a22e99597f0cb50 Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <linux@weissschuh.net>
Date: Sat, 20 Dec 2025 14:55:52 +0100
Subject: tools/nolibc: use custom structs timespec and timeval
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A custom 'struct timespec' and 'struct timeval' will be necessary for
64-bit time types on 32-bit architectures. <linux/time.h> will define
other time-related types in terms of the custom 'struct timespec'.

Add custom struct definitions which for now mirror exactly the ones from
the UAPI headers, but provide the foundation for further changes.

Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
Acked-by: Willy Tarreau <w@1wt.eu>
Reviewed-by: Arnd Bergmann <arnd@arndb.de>
Link: https://patch.msgid.link/20251220-nolibc-uapi-types-v3-8-c662992f75d7@weissschuh.net
---
 tools/include/nolibc/arch-s390.h |  3 +++
 tools/include/nolibc/types.h     | 16 +++++++++++++++-
 2 files changed, 18 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/include/nolibc/arch-s390.h b/tools/include/nolibc/arch-s390.h
index 74125a254ce3..5bee6ecbde0a 100644
--- a/tools/include/nolibc/arch-s390.h
+++ b/tools/include/nolibc/arch-s390.h
@@ -5,6 +5,9 @@
 
 #ifndef _NOLIBC_ARCH_S390_H
 #define _NOLIBC_ARCH_S390_H
+
+#include "types.h"
+
 #include <linux/signal.h>
 #include <linux/unistd.h>
 
diff --git a/tools/include/nolibc/types.h b/tools/include/nolibc/types.h
index 470a5f77bc0f..c8ed4d9cae8a 100644
--- a/tools/include/nolibc/types.h
+++ b/tools/include/nolibc/types.h
@@ -13,9 +13,23 @@
 #include "std.h"
 #include <linux/mman.h>
 #include <linux/stat.h>
-#include <linux/time.h>
+#include <linux/time_types.h>
 #include <linux/wait.h>
 
+struct timespec {
+	__kernel_time_t	tv_sec;
+	long		tv_nsec;
+};
+#define _STRUCT_TIMESPEC
+
+struct timeval {
+	__kernel_time_t		tv_sec;
+	__kernel_suseconds_t	tv_usec;
+};
+
+#define timeval __nolibc_kernel_timeval
+#include <linux/time.h>
+#undef timeval
 
 /* Only the generic macros and types may be defined here. The arch-specific
  * ones such as the O_RDONLY and related macros used by fcntl() and open()
-- 
cgit v1.2.3


From bdcfc417f26ffd1a7e214d1cce78500dc4dbc2d5 Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <linux@weissschuh.net>
Date: Sat, 20 Dec 2025 14:55:53 +0100
Subject: tools/nolibc: always use 64-bit time types
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

32-bit time types will stop working in 2038.

Switch to 64-bit time types everywhere.

Suggested-by: Arnd Bergmann <arnd@arndb.de>
Link: https://lore.kernel.org/lkml/cec27d94-c99d-4c57-9a12-275ea663dda8@app.fastmail.com/
Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
Acked-by: Willy Tarreau <w@1wt.eu>
Reviewed-by: Arnd Bergmann <arnd@arndb.de>
Link: https://patch.msgid.link/20251220-nolibc-uapi-types-v3-9-c662992f75d7@weissschuh.net
---
 tools/include/nolibc/std.h   | 2 +-
 tools/include/nolibc/types.h | 9 +++++----
 2 files changed, 6 insertions(+), 5 deletions(-)

(limited to 'tools')

diff --git a/tools/include/nolibc/std.h b/tools/include/nolibc/std.h
index 392f4dd94158..b9a116123902 100644
--- a/tools/include/nolibc/std.h
+++ b/tools/include/nolibc/std.h
@@ -29,6 +29,6 @@ typedef unsigned long       nlink_t;
 typedef  int64_t              off_t;
 typedef   signed long     blksize_t;
 typedef   signed long      blkcnt_t;
-typedef __kernel_time_t      time_t;
+typedef __kernel_time64_t    time_t;
 
 #endif /* _NOLIBC_STD_H */
diff --git a/tools/include/nolibc/types.h b/tools/include/nolibc/types.h
index c8ed4d9cae8a..8f3cb18df7f1 100644
--- a/tools/include/nolibc/types.h
+++ b/tools/include/nolibc/types.h
@@ -17,14 +17,15 @@
 #include <linux/wait.h>
 
 struct timespec {
-	__kernel_time_t	tv_sec;
-	long		tv_nsec;
+	time_t	tv_sec;
+	int64_t	tv_nsec;
 };
 #define _STRUCT_TIMESPEC
 
+/* Never use with system calls */
 struct timeval {
-	__kernel_time_t		tv_sec;
-	__kernel_suseconds_t	tv_usec;
+	time_t	tv_sec;
+	int64_t	tv_usec;
 };
 
 #define timeval __nolibc_kernel_timeval
-- 
cgit v1.2.3


From a590a79d19046d3e0f2089f83046f3b87f880359 Mon Sep 17 00:00:00 2001
From: Joel Fernandes <joelagnelf@nvidia.com>
Date: Thu, 1 Jan 2026 11:34:16 -0500
Subject: rcutorture: Prevent concurrent kvm.sh runs on same source tree

Add flock-based locking to kvm.sh to prevent multiple instances from
running concurrently on the same source tree. This prevents build
failures caused by one instance's "make clean" deleting generated files
while another instance is building causing build failures.

The lock file is placed in the rcutorture directory and added to
.gitignore.

Signed-off-by: Joel Fernandes <joelagnelf@nvidia.com>
Tested-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
---
 tools/testing/selftests/rcutorture/.gitignore |  1 +
 tools/testing/selftests/rcutorture/bin/kvm.sh | 17 +++++++++++++++++
 2 files changed, 18 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/rcutorture/.gitignore b/tools/testing/selftests/rcutorture/.gitignore
index f6cbce77460b..b8fd42547a6e 100644
--- a/tools/testing/selftests/rcutorture/.gitignore
+++ b/tools/testing/selftests/rcutorture/.gitignore
@@ -3,3 +3,4 @@ initrd
 b[0-9]*
 res
 *.swp
+.kvm.sh.lock
diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh
index fff15821c44c..d1fbd092e22a 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm.sh
@@ -275,6 +275,23 @@ do
 	shift
 done
 
+# Prevent concurrent kvm.sh runs on the same source tree.  The flock
+# is automatically released when the script exits, even if killed.
+TORTURE_LOCK="$RCUTORTURE/.kvm.sh.lock"
+if test -z "$dryrun"
+then
+	# Create a file descriptor and flock it, so that when kvm.sh (and its
+	# children) exit, the flock is released by the kernel automatically.
+	exec 9>"$TORTURE_LOCK"
+	if ! flock -n 9
+	then
+		echo "ERROR: Another kvm.sh instance is already running on this tree."
+		echo "       Lock file: $TORTURE_LOCK"
+		echo "       To run kvm.sh, kill all existing kvm.sh runs first."
+		exit 1
+	fi
+fi
+
 if test -n "$dryrun" || test -z "$TORTURE_INITRD" || tools/testing/selftests/rcutorture/bin/mkinitrd.sh
 then
 	:
-- 
cgit v1.2.3


From cf587c6ff2d09866eb53a4620bc1aa561fb0c000 Mon Sep 17 00:00:00 2001
From: Joel Fernandes <joelagnelf@nvidia.com>
Date: Thu, 1 Jan 2026 11:34:17 -0500
Subject: rcutorture: Add --kill-previous option to terminate previous kvm.sh
 runs

When kvm.sh is killed, its child processes (make, gcc, qemu, etc.) may
continue running. This prevents new kvm.sh instances from starting even
though the parent is gone.

Add a --kill-previous option that uses fuser(1) to terminate all
processes holding the flock file before attempting to acquire it. This
provides a clean way to recover from stale/zombie kvm.sh runs which
sometimes may have lots of qemu and compiler processes still disturbing.

Signed-off-by: Joel Fernandes <joelagnelf@nvidia.com>
Tested-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
---
 tools/testing/selftests/rcutorture/bin/kvm.sh | 25 ++++++++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh
index d1fbd092e22a..65b04b832733 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm.sh
@@ -80,6 +80,7 @@ usage () {
 	echo "       --kasan"
 	echo "       --kconfig Kconfig-options"
 	echo "       --kcsan"
+	echo "       --kill-previous"
 	echo "       --kmake-arg kernel-make-arguments"
 	echo "       --mac nn:nn:nn:nn:nn:nn"
 	echo "       --memory megabytes|nnnG"
@@ -206,6 +207,9 @@ do
 	--kcsan)
 		TORTURE_KCONFIG_KCSAN_ARG="$debuginfo CONFIG_KCSAN=y CONFIG_KCSAN_STRICT=y CONFIG_KCSAN_REPORT_ONCE_IN_MS=100000 CONFIG_KCSAN_VERBOSE=y CONFIG_DEBUG_LOCK_ALLOC=y CONFIG_PROVE_LOCKING=y"; export TORTURE_KCONFIG_KCSAN_ARG
 		;;
+	--kill-previous)
+		TORTURE_KILL_PREVIOUS=1
+		;;
 	--kmake-arg|--kmake-args)
 		checkarg --kmake-arg "(kernel make arguments)" $# "$2" '.*' '^error$'
 		TORTURE_KMAKE_ARG="`echo "$TORTURE_KMAKE_ARG $2" | sed -e 's/^ *//' -e 's/ *$//'`"
@@ -278,6 +282,25 @@ done
 # Prevent concurrent kvm.sh runs on the same source tree.  The flock
 # is automatically released when the script exits, even if killed.
 TORTURE_LOCK="$RCUTORTURE/.kvm.sh.lock"
+
+# Terminate any processes holding the lock file, if requested.
+if test -n "$TORTURE_KILL_PREVIOUS"
+then
+	if test -e "$TORTURE_LOCK"
+	then
+		echo "Killing processes holding $TORTURE_LOCK..."
+		if fuser -k "$TORTURE_LOCK" >/dev/null 2>&1
+		then
+			sleep 2
+			echo "Previous kvm.sh processes killed."
+		else
+			echo "No processes were holding the lock."
+		fi
+	else
+		echo "No lock file exists, nothing to kill."
+	fi
+fi
+
 if test -z "$dryrun"
 then
 	# Create a file descriptor and flock it, so that when kvm.sh (and its
@@ -287,7 +310,7 @@ then
 	then
 		echo "ERROR: Another kvm.sh instance is already running on this tree."
 		echo "       Lock file: $TORTURE_LOCK"
-		echo "       To run kvm.sh, kill all existing kvm.sh runs first."
+		echo "       To run kvm.sh, kill all existing kvm.sh runs first (--kill-previous)."
 		exit 1
 	fi
 fi
-- 
cgit v1.2.3


From c18f35e4904920db4c51620ba634e4d175b24741 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@gmail.com>
Date: Tue, 23 Dec 2025 20:35:38 +0900
Subject: objtool/rust: add one more `noreturn` Rust function

Fix the following warning:

rust/kernel.o: warning: objtool: _RNvXNtNtCs1ewLyjEZ7Le_6kernel3str9parse_intaNtNtB2_7private12FromStrRadix14from_str_radix()
falls through to next function _RNvXNtNtCs1ewLyjEZ7Le_6kernel3str9parse_intaNtNtB2_7private12FromStrRadix16from_u64_negated()

The commit 51d9ee90ea90 ("rust: str: add radix prefixed integer
parsing functions") introduces u64::from_str_radix(), whose
implementation contains a panic path for out-of-range radix values.
The panic helper is core::num::from_ascii_radix_panic().

Note that radix is derived from strip_radix() here and is always
within the valid range, so kernel never panics.

Fixes: 51d9ee90ea90 ("rust: str: add radix prefixed integer parsing functions")
Signed-off-by: FUJITA Tomonori <fujita.tomonori@gmail.com>
Reviewed-by: Alice Ryhl <aliceryhl@google.com>
Tested-by: Alice Ryhl <aliceryhl@google.com>
Link: https://patch.msgid.link/20251223113538.1016078-1-fujita.tomonori@gmail.com
[ Reworded typo. - Miguel ]
Signed-off-by: Miguel Ojeda <ojeda@kernel.org>
---
 tools/objtool/check.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index 3f7999317f4d..719ec727efd4 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -197,7 +197,8 @@ static bool is_rust_noreturn(const struct symbol *func)
 	 * as well as changes to the source code itself between versions (since
 	 * these come from the Rust standard library).
 	 */
-	return str_ends_with(func->name, "_4core5sliceSp15copy_from_slice17len_mismatch_fail")		||
+	return str_ends_with(func->name, "_4core3num22from_ascii_radix_panic")				||
+	       str_ends_with(func->name, "_4core5sliceSp15copy_from_slice17len_mismatch_fail")		||
 	       str_ends_with(func->name, "_4core6option13expect_failed")				||
 	       str_ends_with(func->name, "_4core6option13unwrap_failed")				||
 	       str_ends_with(func->name, "_4core6result13unwrap_failed")				||
-- 
cgit v1.2.3


From be05f571464404432a0f8fe1c81a86a0862da283 Mon Sep 17 00:00:00 2001
From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
Date: Sun, 28 Dec 2025 20:39:42 +0200
Subject: memblock test: include <linux/sizes.h> from tools mm.h stub
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

memblock test compilation fails:

memblock.c: In function ‘memblock_validate_numa_coverage’:
memblock.c:784:58: error: ‘SZ_1M’ undeclared (first use in this function)
  784 |                 mem_size_mb = memblock_phys_mem_size() / SZ_1M;
      |                                                          ^~~~~

The SZ_1M is defined in sizes.h, but it is not included by stub version of
mm.h in tools/include/linux.

Add include of sizes.h to tools/include/linux/mm.h to fix the compilation
of memblock tests.

Link: https://patch.msgid.link/20251228183942.3628918-1-rppt@kernel.org
Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com>
---
 tools/include/linux/mm.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'tools')

diff --git a/tools/include/linux/mm.h b/tools/include/linux/mm.h
index 677c37e4a18c..028f3faf46e7 100644
--- a/tools/include/linux/mm.h
+++ b/tools/include/linux/mm.h
@@ -4,6 +4,7 @@
 
 #include <linux/align.h>
 #include <linux/mmzone.h>
+#include <linux/sizes.h>
 
 #define PAGE_SHIFT		12
 #define PAGE_SIZE		(_AC(1, UL) << PAGE_SHIFT)
-- 
cgit v1.2.3


From e4588c25c9d122b5847b88e18b184404b6959160 Mon Sep 17 00:00:00 2001
From: Marco Elver <elver@google.com>
Date: Fri, 19 Dec 2025 16:40:13 +0100
Subject: compiler-context-analysis: Remove __cond_lock() function-like helper

As discussed in [1], removing __cond_lock() will improve the readability
of trylock code. Now that Sparse context tracking support has been
removed, we can also remove __cond_lock().

Change existing APIs to either drop __cond_lock() completely, or make
use of the __cond_acquires() function attribute instead.

In particular, spinlock and rwlock implementations required switching
over to inline helpers rather than statement-expressions for their
trylock_* variants.

Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Marco Elver <elver@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/all/20250207082832.GU7145@noisy.programming.kicks-ass.net/ [1]
Link: https://patch.msgid.link/20251219154418.3592607-25-elver@google.com
---
 Documentation/dev-tools/context-analysis.rst       |  2 -
 Documentation/mm/process_addrs.rst                 |  6 +--
 drivers/net/wireless/intel/iwlwifi/iwl-trans.c     |  4 +-
 drivers/net/wireless/intel/iwlwifi/iwl-trans.h     |  6 +--
 .../wireless/intel/iwlwifi/pcie/gen1_2/internal.h  |  5 +-
 .../net/wireless/intel/iwlwifi/pcie/gen1_2/trans.c |  4 +-
 include/linux/compiler-context-analysis.h          | 31 -----------
 include/linux/lockref.h                            |  4 +-
 include/linux/mm.h                                 | 33 ++----------
 include/linux/rwlock.h                             | 11 ++--
 include/linux/rwlock_api_smp.h                     | 14 ++++-
 include/linux/rwlock_rt.h                          | 21 ++++----
 include/linux/sched/signal.h                       | 14 +----
 include/linux/spinlock.h                           | 45 ++++++----------
 include/linux/spinlock_api_smp.h                   | 20 +++++++
 include/linux/spinlock_api_up.h                    | 61 ++++++++++++++++++----
 include/linux/spinlock_rt.h                        | 26 +++++----
 kernel/signal.c                                    |  4 +-
 kernel/time/posix-timers.c                         | 13 ++---
 lib/dec_and_lock.c                                 |  8 +--
 lib/lockref.c                                      |  1 -
 mm/memory.c                                        |  4 +-
 mm/pgtable-generic.c                               | 19 ++++---
 tools/include/linux/compiler_types.h               |  2 -
 24 files changed, 163 insertions(+), 195 deletions(-)

(limited to 'tools')

diff --git a/Documentation/dev-tools/context-analysis.rst b/Documentation/dev-tools/context-analysis.rst
index 8dd6c0d695aa..e69896e597b6 100644
--- a/Documentation/dev-tools/context-analysis.rst
+++ b/Documentation/dev-tools/context-analysis.rst
@@ -112,10 +112,8 @@ Keywords
                  __releases_shared
                  __acquire
                  __release
-                 __cond_lock
                  __acquire_shared
                  __release_shared
-                 __cond_lock_shared
                  __acquire_ret
                  __acquire_shared_ret
                  context_unsafe
diff --git a/Documentation/mm/process_addrs.rst b/Documentation/mm/process_addrs.rst
index 7f2f3e87071d..851680ead45f 100644
--- a/Documentation/mm/process_addrs.rst
+++ b/Documentation/mm/process_addrs.rst
@@ -583,7 +583,7 @@ To access PTE-level page tables, a helper like :c:func:`!pte_offset_map_lock` or
 :c:func:`!pte_offset_map` can be used depending on stability requirements.
 These map the page table into kernel memory if required, take the RCU lock, and
 depending on variant, may also look up or acquire the PTE lock.
-See the comment on :c:func:`!__pte_offset_map_lock`.
+See the comment on :c:func:`!pte_offset_map_lock`.
 
 Atomicity
 ^^^^^^^^^
@@ -667,7 +667,7 @@ must be released via :c:func:`!pte_unmap_unlock`.
 .. note:: There are some variants on this, such as
    :c:func:`!pte_offset_map_rw_nolock` when we know we hold the PTE stable but
    for brevity we do not explore this.  See the comment for
-   :c:func:`!__pte_offset_map_lock` for more details.
+   :c:func:`!pte_offset_map_lock` for more details.
 
 When modifying data in ranges we typically only wish to allocate higher page
 tables as necessary, using these locks to avoid races or overwriting anything,
@@ -686,7 +686,7 @@ At the leaf page table, that is the PTE, we can't entirely rely on this pattern
 as we have separate PMD and PTE locks and a THP collapse for instance might have
 eliminated the PMD entry as well as the PTE from under us.
 
-This is why :c:func:`!__pte_offset_map_lock` locklessly retrieves the PMD entry
+This is why :c:func:`!pte_offset_map_lock` locklessly retrieves the PMD entry
 for the PTE, carefully checking it is as expected, before acquiring the
 PTE-specific lock, and then *again* checking that the PMD entry is as expected.
 
diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-trans.c b/drivers/net/wireless/intel/iwlwifi/iwl-trans.c
index cc8a84018f70..fa1442246662 100644
--- a/drivers/net/wireless/intel/iwlwifi/iwl-trans.c
+++ b/drivers/net/wireless/intel/iwlwifi/iwl-trans.c
@@ -548,11 +548,11 @@ int iwl_trans_read_config32(struct iwl_trans *trans, u32 ofs,
 	return iwl_trans_pcie_read_config32(trans, ofs, val);
 }
 
-bool _iwl_trans_grab_nic_access(struct iwl_trans *trans)
+bool iwl_trans_grab_nic_access(struct iwl_trans *trans)
 {
 	return iwl_trans_pcie_grab_nic_access(trans);
 }
-IWL_EXPORT_SYMBOL(_iwl_trans_grab_nic_access);
+IWL_EXPORT_SYMBOL(iwl_trans_grab_nic_access);
 
 void __releases(nic_access)
 iwl_trans_release_nic_access(struct iwl_trans *trans)
diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-trans.h b/drivers/net/wireless/intel/iwlwifi/iwl-trans.h
index a552669db6e2..688f9fee2821 100644
--- a/drivers/net/wireless/intel/iwlwifi/iwl-trans.h
+++ b/drivers/net/wireless/intel/iwlwifi/iwl-trans.h
@@ -1063,11 +1063,7 @@ int iwl_trans_sw_reset(struct iwl_trans *trans);
 void iwl_trans_set_bits_mask(struct iwl_trans *trans, u32 reg,
 			     u32 mask, u32 value);
 
-bool _iwl_trans_grab_nic_access(struct iwl_trans *trans);
-
-#define iwl_trans_grab_nic_access(trans)		\
-	__cond_lock(nic_access,				\
-		    likely(_iwl_trans_grab_nic_access(trans)))
+bool iwl_trans_grab_nic_access(struct iwl_trans *trans);
 
 void __releases(nic_access)
 iwl_trans_release_nic_access(struct iwl_trans *trans);
diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/gen1_2/internal.h b/drivers/net/wireless/intel/iwlwifi/pcie/gen1_2/internal.h
index 207c56e338dd..7b7b35e442f9 100644
--- a/drivers/net/wireless/intel/iwlwifi/pcie/gen1_2/internal.h
+++ b/drivers/net/wireless/intel/iwlwifi/pcie/gen1_2/internal.h
@@ -553,10 +553,7 @@ void iwl_trans_pcie_free(struct iwl_trans *trans);
 void iwl_trans_pcie_free_pnvm_dram_regions(struct iwl_dram_regions *dram_regions,
 					   struct device *dev);
 
-bool __iwl_trans_pcie_grab_nic_access(struct iwl_trans *trans, bool silent);
-#define _iwl_trans_pcie_grab_nic_access(trans, silent)		\
-	__cond_lock(nic_access_nobh,				\
-		    likely(__iwl_trans_pcie_grab_nic_access(trans, silent)))
+bool _iwl_trans_pcie_grab_nic_access(struct iwl_trans *trans, bool silent);
 
 void iwl_trans_pcie_check_product_reset_status(struct pci_dev *pdev);
 void iwl_trans_pcie_check_product_reset_mode(struct pci_dev *pdev);
diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/gen1_2/trans.c b/drivers/net/wireless/intel/iwlwifi/pcie/gen1_2/trans.c
index 164d060ec617..415a19ea9f06 100644
--- a/drivers/net/wireless/intel/iwlwifi/pcie/gen1_2/trans.c
+++ b/drivers/net/wireless/intel/iwlwifi/pcie/gen1_2/trans.c
@@ -2327,7 +2327,7 @@ EXPORT_SYMBOL(iwl_trans_pcie_reset);
  * This version doesn't disable BHs but rather assumes they're
  * already disabled.
  */
-bool __iwl_trans_pcie_grab_nic_access(struct iwl_trans *trans, bool silent)
+bool _iwl_trans_pcie_grab_nic_access(struct iwl_trans *trans, bool silent)
 {
 	int ret;
 	struct iwl_trans_pcie *trans_pcie = IWL_TRANS_GET_PCIE_TRANS(trans);
@@ -2415,7 +2415,7 @@ bool iwl_trans_pcie_grab_nic_access(struct iwl_trans *trans)
 	bool ret;
 
 	local_bh_disable();
-	ret = __iwl_trans_pcie_grab_nic_access(trans, false);
+	ret = _iwl_trans_pcie_grab_nic_access(trans, false);
 	if (ret) {
 		/* keep BHs disabled until iwl_trans_pcie_release_nic_access */
 		return ret;
diff --git a/include/linux/compiler-context-analysis.h b/include/linux/compiler-context-analysis.h
index cb728822343f..4f7559d7ae91 100644
--- a/include/linux/compiler-context-analysis.h
+++ b/include/linux/compiler-context-analysis.h
@@ -341,24 +341,6 @@ static inline void _context_unsafe_alias(void **p) { }
  */
 #define __release(x)		__release_ctx_lock(x)
 
-/**
- * __cond_lock() - function that conditionally acquires a context lock
- *                 exclusively
- * @x: context lock instance pinter
- * @c: boolean expression
- *
- * Return: result of @c
- *
- * No-op function that conditionally acquires context lock instance @x
- * exclusively, if the boolean expression @c is true. The result of @c is the
- * return value; for example:
- *
- * .. code-block:: c
- *
- *	#define spin_trylock(l) __cond_lock(&lock, _spin_trylock(&lock))
- */
-#define __cond_lock(x, c)	__try_acquire_ctx_lock(x, c)
-
 /**
  * __must_hold_shared() - function attribute, caller must hold shared context lock
  *
@@ -417,19 +399,6 @@ static inline void _context_unsafe_alias(void **p) { }
  */
 #define __release_shared(x)	__release_shared_ctx_lock(x)
 
-/**
- * __cond_lock_shared() - function that conditionally acquires a context lock shared
- * @x: context lock instance pinter
- * @c: boolean expression
- *
- * Return: result of @c
- *
- * No-op function that conditionally acquires context lock instance @x with
- * shared access, if the boolean expression @c is true. The result of @c is the
- * return value.
- */
-#define __cond_lock_shared(x, c) __try_acquire_shared_ctx_lock(x, c)
-
 /**
  * __acquire_ret() - helper to acquire context lock of return value
  * @call: call expression
diff --git a/include/linux/lockref.h b/include/linux/lockref.h
index 815d871fadfc..6ded24cdb4a8 100644
--- a/include/linux/lockref.h
+++ b/include/linux/lockref.h
@@ -49,9 +49,7 @@ static inline void lockref_init(struct lockref *lockref)
 void lockref_get(struct lockref *lockref);
 int lockref_put_return(struct lockref *lockref);
 bool lockref_get_not_zero(struct lockref *lockref);
-bool lockref_put_or_lock(struct lockref *lockref);
-#define lockref_put_or_lock(_lockref) \
-	(!__cond_lock((_lockref)->lock, !lockref_put_or_lock(_lockref)))
+bool lockref_put_or_lock(struct lockref *lockref) __cond_acquires(false, &lockref->lock);
 
 void lockref_mark_dead(struct lockref *lockref);
 bool lockref_get_not_dead(struct lockref *lockref);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 15076261d0c2..f369cb633516 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2975,15 +2975,8 @@ static inline pud_t pud_mkspecial(pud_t pud)
 }
 #endif	/* CONFIG_ARCH_SUPPORTS_PUD_PFNMAP */
 
-extern pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
-			       spinlock_t **ptl);
-static inline pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr,
-				    spinlock_t **ptl)
-{
-	pte_t *ptep;
-	__cond_lock(*ptl, ptep = __get_locked_pte(mm, addr, ptl));
-	return ptep;
-}
+extern pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr,
+			     spinlock_t **ptl);
 
 #ifdef __PAGETABLE_P4D_FOLDED
 static inline int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd,
@@ -3337,31 +3330,15 @@ static inline bool pagetable_pte_ctor(struct mm_struct *mm,
 	return true;
 }
 
-pte_t *___pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp);
-static inline pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr,
-			pmd_t *pmdvalp)
-{
-	pte_t *pte;
+pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp);
 
-	__cond_lock(RCU, pte = ___pte_offset_map(pmd, addr, pmdvalp));
-	return pte;
-}
 static inline pte_t *pte_offset_map(pmd_t *pmd, unsigned long addr)
 {
 	return __pte_offset_map(pmd, addr, NULL);
 }
 
-pte_t *__pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd,
-			unsigned long addr, spinlock_t **ptlp);
-static inline pte_t *pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd,
-			unsigned long addr, spinlock_t **ptlp)
-{
-	pte_t *pte;
-
-	__cond_lock(RCU, __cond_lock(*ptlp,
-			pte = __pte_offset_map_lock(mm, pmd, addr, ptlp)));
-	return pte;
-}
+pte_t *pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd,
+			   unsigned long addr, spinlock_t **ptlp);
 
 pte_t *pte_offset_map_ro_nolock(struct mm_struct *mm, pmd_t *pmd,
 				unsigned long addr, spinlock_t **ptlp);
diff --git a/include/linux/rwlock.h b/include/linux/rwlock.h
index 151f9d5f3288..65a5b55e1bcd 100644
--- a/include/linux/rwlock.h
+++ b/include/linux/rwlock.h
@@ -50,8 +50,8 @@ do {								\
  * regardless of whether CONFIG_SMP or CONFIG_PREEMPT are set. The various
  * methods are defined as nops in the case they are not required.
  */
-#define read_trylock(lock)	__cond_lock_shared(lock, _raw_read_trylock(lock))
-#define write_trylock(lock)	__cond_lock(lock, _raw_write_trylock(lock))
+#define read_trylock(lock)	_raw_read_trylock(lock)
+#define write_trylock(lock)	_raw_write_trylock(lock)
 
 #define write_lock(lock)	_raw_write_lock(lock)
 #define read_lock(lock)		_raw_read_lock(lock)
@@ -113,12 +113,7 @@ do {								\
 	} while (0)
 #define write_unlock_bh(lock)		_raw_write_unlock_bh(lock)
 
-#define write_trylock_irqsave(lock, flags)		\
-	__cond_lock(lock, ({				\
-		local_irq_save(flags);			\
-		_raw_write_trylock(lock) ?		\
-		1 : ({ local_irq_restore(flags); 0; });	\
-	}))
+#define write_trylock_irqsave(lock, flags) _raw_write_trylock_irqsave(lock, &(flags))
 
 #ifdef arch_rwlock_is_contended
 #define rwlock_is_contended(lock) \
diff --git a/include/linux/rwlock_api_smp.h b/include/linux/rwlock_api_smp.h
index 6d5cc0b7be1f..d903b17c46ca 100644
--- a/include/linux/rwlock_api_smp.h
+++ b/include/linux/rwlock_api_smp.h
@@ -26,8 +26,8 @@ unsigned long __lockfunc _raw_read_lock_irqsave(rwlock_t *lock)
 							__acquires(lock);
 unsigned long __lockfunc _raw_write_lock_irqsave(rwlock_t *lock)
 							__acquires(lock);
-int __lockfunc _raw_read_trylock(rwlock_t *lock);
-int __lockfunc _raw_write_trylock(rwlock_t *lock);
+int __lockfunc _raw_read_trylock(rwlock_t *lock)	__cond_acquires_shared(true, lock);
+int __lockfunc _raw_write_trylock(rwlock_t *lock)	__cond_acquires(true, lock);
 void __lockfunc _raw_read_unlock(rwlock_t *lock)	__releases_shared(lock);
 void __lockfunc _raw_write_unlock(rwlock_t *lock)	__releases(lock);
 void __lockfunc _raw_read_unlock_bh(rwlock_t *lock)	__releases_shared(lock);
@@ -41,6 +41,16 @@ void __lockfunc
 _raw_write_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
 							__releases(lock);
 
+static inline bool _raw_write_trylock_irqsave(rwlock_t *lock, unsigned long *flags)
+	__cond_acquires(true, lock)
+{
+	local_irq_save(*flags);
+	if (_raw_write_trylock(lock))
+		return true;
+	local_irq_restore(*flags);
+	return false;
+}
+
 #ifdef CONFIG_INLINE_READ_LOCK
 #define _raw_read_lock(lock) __raw_read_lock(lock)
 #endif
diff --git a/include/linux/rwlock_rt.h b/include/linux/rwlock_rt.h
index f64d6d319a47..37b387dcab21 100644
--- a/include/linux/rwlock_rt.h
+++ b/include/linux/rwlock_rt.h
@@ -26,11 +26,11 @@ do {							\
 } while (0)
 
 extern void rt_read_lock(rwlock_t *rwlock)	__acquires_shared(rwlock);
-extern int rt_read_trylock(rwlock_t *rwlock);
+extern int rt_read_trylock(rwlock_t *rwlock)	__cond_acquires_shared(true, rwlock);
 extern void rt_read_unlock(rwlock_t *rwlock)	__releases_shared(rwlock);
 extern void rt_write_lock(rwlock_t *rwlock)	__acquires(rwlock);
 extern void rt_write_lock_nested(rwlock_t *rwlock, int subclass)	__acquires(rwlock);
-extern int rt_write_trylock(rwlock_t *rwlock);
+extern int rt_write_trylock(rwlock_t *rwlock)	__cond_acquires(true, rwlock);
 extern void rt_write_unlock(rwlock_t *rwlock)	__releases(rwlock);
 
 static __always_inline void read_lock(rwlock_t *rwlock)
@@ -59,7 +59,7 @@ static __always_inline void read_lock_irq(rwlock_t *rwlock)
 		flags = 0;				\
 	} while (0)
 
-#define read_trylock(lock)	__cond_lock_shared(lock, rt_read_trylock(lock))
+#define read_trylock(lock)	rt_read_trylock(lock)
 
 static __always_inline void read_unlock(rwlock_t *rwlock)
 	__releases_shared(rwlock)
@@ -123,14 +123,15 @@ static __always_inline void write_lock_irq(rwlock_t *rwlock)
 		flags = 0;				\
 	} while (0)
 
-#define write_trylock(lock)	__cond_lock(lock, rt_write_trylock(lock))
+#define write_trylock(lock)	rt_write_trylock(lock)
 
-#define write_trylock_irqsave(lock, flags)		\
-	__cond_lock(lock, ({				\
-		typecheck(unsigned long, flags);	\
-		flags = 0;				\
-		rt_write_trylock(lock);			\
-	}))
+static __always_inline bool _write_trylock_irqsave(rwlock_t *rwlock, unsigned long *flags)
+	__cond_acquires(true, rwlock)
+{
+	*flags = 0;
+	return rt_write_trylock(rwlock);
+}
+#define write_trylock_irqsave(lock, flags) _write_trylock_irqsave(lock, &(flags))
 
 static __always_inline void write_unlock(rwlock_t *rwlock)
 	__releases(rwlock)
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index 7d6449982822..a63f65aa5bdd 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -737,18 +737,8 @@ static inline int thread_group_empty(struct task_struct *p)
 #define delay_group_leader(p) \
 		(thread_group_leader(p) && !thread_group_empty(p))
 
-extern struct sighand_struct *__lock_task_sighand(struct task_struct *task,
-							unsigned long *flags);
-
-static inline struct sighand_struct *lock_task_sighand(struct task_struct *task,
-						       unsigned long *flags)
-{
-	struct sighand_struct *ret;
-
-	ret = __lock_task_sighand(task, flags);
-	(void)__cond_lock(&task->sighand->siglock, ret);
-	return ret;
-}
+extern struct sighand_struct *lock_task_sighand(struct task_struct *task,
+						unsigned long *flags);
 
 static inline void unlock_task_sighand(struct task_struct *task,
 						unsigned long *flags)
diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h
index 7e560c7a7b23..396b8c5d6c1b 100644
--- a/include/linux/spinlock.h
+++ b/include/linux/spinlock.h
@@ -213,7 +213,7 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock)
  * various methods are defined as nops in the case they are not
  * required.
  */
-#define raw_spin_trylock(lock)	__cond_lock(lock, _raw_spin_trylock(lock))
+#define raw_spin_trylock(lock)	_raw_spin_trylock(lock)
 
 #define raw_spin_lock(lock)	_raw_spin_lock(lock)
 
@@ -284,22 +284,11 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock)
 	} while (0)
 #define raw_spin_unlock_bh(lock)	_raw_spin_unlock_bh(lock)
 
-#define raw_spin_trylock_bh(lock) \
-	__cond_lock(lock, _raw_spin_trylock_bh(lock))
+#define raw_spin_trylock_bh(lock)	_raw_spin_trylock_bh(lock)
 
-#define raw_spin_trylock_irq(lock)			\
-	__cond_lock(lock, ({				\
-		local_irq_disable();			\
-		_raw_spin_trylock(lock) ?		\
-		1 : ({ local_irq_enable(); 0;  });	\
-	}))
+#define raw_spin_trylock_irq(lock)	_raw_spin_trylock_irq(lock)
 
-#define raw_spin_trylock_irqsave(lock, flags)		\
-	__cond_lock(lock, ({				\
-		local_irq_save(flags);			\
-		_raw_spin_trylock(lock) ?		\
-		1 : ({ local_irq_restore(flags); 0; }); \
-	}))
+#define raw_spin_trylock_irqsave(lock, flags) _raw_spin_trylock_irqsave(lock, &(flags))
 
 #ifndef CONFIG_PREEMPT_RT
 /* Include rwlock functions for !RT */
@@ -433,8 +422,12 @@ static __always_inline int spin_trylock_irq(spinlock_t *lock)
 	return raw_spin_trylock_irq(&lock->rlock);
 }
 
-#define spin_trylock_irqsave(lock, flags)			\
-	__cond_lock(lock, raw_spin_trylock_irqsave(spinlock_check(lock), flags))
+static __always_inline bool _spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags)
+	__cond_acquires(true, lock) __no_context_analysis
+{
+	return raw_spin_trylock_irqsave(spinlock_check(lock), *flags);
+}
+#define spin_trylock_irqsave(lock, flags) _spin_trylock_irqsave(lock, &(flags))
 
 /**
  * spin_is_locked() - Check whether a spinlock is locked.
@@ -512,23 +505,17 @@ static inline int rwlock_needbreak(rwlock_t *lock)
  * Decrements @atomic by 1.  If the result is 0, returns true and locks
  * @lock.  Returns false for all other cases.
  */
-extern int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock);
-#define atomic_dec_and_lock(atomic, lock) \
-		__cond_lock(lock, _atomic_dec_and_lock(atomic, lock))
+extern int atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock) __cond_acquires(true, lock);
 
 extern int _atomic_dec_and_lock_irqsave(atomic_t *atomic, spinlock_t *lock,
-					unsigned long *flags);
-#define atomic_dec_and_lock_irqsave(atomic, lock, flags) \
-		__cond_lock(lock, _atomic_dec_and_lock_irqsave(atomic, lock, &(flags)))
+					unsigned long *flags) __cond_acquires(true, lock);
+#define atomic_dec_and_lock_irqsave(atomic, lock, flags) _atomic_dec_and_lock_irqsave(atomic, lock, &(flags))
 
-extern int _atomic_dec_and_raw_lock(atomic_t *atomic, raw_spinlock_t *lock);
-#define atomic_dec_and_raw_lock(atomic, lock) \
-		__cond_lock(lock, _atomic_dec_and_raw_lock(atomic, lock))
+extern int atomic_dec_and_raw_lock(atomic_t *atomic, raw_spinlock_t *lock) __cond_acquires(true, lock);
 
 extern int _atomic_dec_and_raw_lock_irqsave(atomic_t *atomic, raw_spinlock_t *lock,
-					unsigned long *flags);
-#define atomic_dec_and_raw_lock_irqsave(atomic, lock, flags) \
-		__cond_lock(lock, _atomic_dec_and_raw_lock_irqsave(atomic, lock, &(flags)))
+					    unsigned long *flags) __cond_acquires(true, lock);
+#define atomic_dec_and_raw_lock_irqsave(atomic, lock, flags) _atomic_dec_and_raw_lock_irqsave(atomic, lock, &(flags))
 
 int __alloc_bucket_spinlocks(spinlock_t **locks, unsigned int *lock_mask,
 			     size_t max_size, unsigned int cpu_mult,
diff --git a/include/linux/spinlock_api_smp.h b/include/linux/spinlock_api_smp.h
index 7e7d7d373213..bda5e7a390cd 100644
--- a/include/linux/spinlock_api_smp.h
+++ b/include/linux/spinlock_api_smp.h
@@ -95,6 +95,26 @@ static inline int __raw_spin_trylock(raw_spinlock_t *lock)
 	return 0;
 }
 
+static __always_inline bool _raw_spin_trylock_irq(raw_spinlock_t *lock)
+	__cond_acquires(true, lock)
+{
+	local_irq_disable();
+	if (_raw_spin_trylock(lock))
+		return true;
+	local_irq_enable();
+	return false;
+}
+
+static __always_inline bool _raw_spin_trylock_irqsave(raw_spinlock_t *lock, unsigned long *flags)
+	__cond_acquires(true, lock)
+{
+	local_irq_save(*flags);
+	if (_raw_spin_trylock(lock))
+		return true;
+	local_irq_restore(*flags);
+	return false;
+}
+
 /*
  * If lockdep is enabled then we use the non-preemption spin-ops
  * even on CONFIG_PREEMPTION, because lockdep assumes that interrupts are
diff --git a/include/linux/spinlock_api_up.h b/include/linux/spinlock_api_up.h
index 018f5aabc1be..a9d5c7c66e03 100644
--- a/include/linux/spinlock_api_up.h
+++ b/include/linux/spinlock_api_up.h
@@ -24,14 +24,11 @@
  * flags straight, to suppress compiler warnings of unused lock
  * variables, and to add the proper checker annotations:
  */
-#define ___LOCK_void(lock) \
-  do { (void)(lock); } while (0)
-
 #define ___LOCK_(lock) \
-  do { __acquire(lock); ___LOCK_void(lock); } while (0)
+  do { __acquire(lock); (void)(lock); } while (0)
 
 #define ___LOCK_shared(lock) \
-  do { __acquire_shared(lock); ___LOCK_void(lock); } while (0)
+  do { __acquire_shared(lock); (void)(lock); } while (0)
 
 #define __LOCK(lock, ...) \
   do { preempt_disable(); ___LOCK_##__VA_ARGS__(lock); } while (0)
@@ -78,10 +75,56 @@
 #define _raw_spin_lock_irqsave(lock, flags)	__LOCK_IRQSAVE(lock, flags)
 #define _raw_read_lock_irqsave(lock, flags)	__LOCK_IRQSAVE(lock, flags, shared)
 #define _raw_write_lock_irqsave(lock, flags)	__LOCK_IRQSAVE(lock, flags)
-#define _raw_spin_trylock(lock)			({ __LOCK(lock, void); 1; })
-#define _raw_read_trylock(lock)			({ __LOCK(lock, void); 1; })
-#define _raw_write_trylock(lock)			({ __LOCK(lock, void); 1; })
-#define _raw_spin_trylock_bh(lock)		({ __LOCK_BH(lock, void); 1; })
+
+static __always_inline int _raw_spin_trylock(raw_spinlock_t *lock)
+	__cond_acquires(true, lock)
+{
+	__LOCK(lock);
+	return 1;
+}
+
+static __always_inline int _raw_spin_trylock_bh(raw_spinlock_t *lock)
+	__cond_acquires(true, lock)
+{
+	__LOCK_BH(lock);
+	return 1;
+}
+
+static __always_inline int _raw_spin_trylock_irq(raw_spinlock_t *lock)
+	__cond_acquires(true, lock)
+{
+	__LOCK_IRQ(lock);
+	return 1;
+}
+
+static __always_inline int _raw_spin_trylock_irqsave(raw_spinlock_t *lock, unsigned long *flags)
+	__cond_acquires(true, lock)
+{
+	__LOCK_IRQSAVE(lock, *(flags));
+	return 1;
+}
+
+static __always_inline int _raw_read_trylock(rwlock_t *lock)
+	__cond_acquires_shared(true, lock)
+{
+	__LOCK(lock, shared);
+	return 1;
+}
+
+static __always_inline int _raw_write_trylock(rwlock_t *lock)
+	__cond_acquires(true, lock)
+{
+	__LOCK(lock);
+	return 1;
+}
+
+static __always_inline int _raw_write_trylock_irqsave(rwlock_t *lock, unsigned long *flags)
+	__cond_acquires(true, lock)
+{
+	__LOCK_IRQSAVE(lock, *(flags));
+	return 1;
+}
+
 #define _raw_spin_unlock(lock)			__UNLOCK(lock)
 #define _raw_read_unlock(lock)			__UNLOCK(lock, shared)
 #define _raw_write_unlock(lock)			__UNLOCK(lock)
diff --git a/include/linux/spinlock_rt.h b/include/linux/spinlock_rt.h
index 6bab73ee1384..0a585768358f 100644
--- a/include/linux/spinlock_rt.h
+++ b/include/linux/spinlock_rt.h
@@ -37,8 +37,8 @@ extern void rt_spin_lock_nested(spinlock_t *lock, int subclass)	__acquires(lock)
 extern void rt_spin_lock_nest_lock(spinlock_t *lock, struct lockdep_map *nest_lock) __acquires(lock);
 extern void rt_spin_unlock(spinlock_t *lock)	__releases(lock);
 extern void rt_spin_lock_unlock(spinlock_t *lock);
-extern int rt_spin_trylock_bh(spinlock_t *lock);
-extern int rt_spin_trylock(spinlock_t *lock);
+extern int rt_spin_trylock_bh(spinlock_t *lock) __cond_acquires(true, lock);
+extern int rt_spin_trylock(spinlock_t *lock) __cond_acquires(true, lock);
 
 static __always_inline void spin_lock(spinlock_t *lock)
 	__acquires(lock)
@@ -130,21 +130,19 @@ static __always_inline void spin_unlock_irqrestore(spinlock_t *lock,
 	rt_spin_unlock(lock);
 }
 
-#define spin_trylock(lock)				\
-	__cond_lock(lock, rt_spin_trylock(lock))
+#define spin_trylock(lock)	rt_spin_trylock(lock)
 
-#define spin_trylock_bh(lock)				\
-	__cond_lock(lock, rt_spin_trylock_bh(lock))
+#define spin_trylock_bh(lock)	rt_spin_trylock_bh(lock)
 
-#define spin_trylock_irq(lock)				\
-	__cond_lock(lock, rt_spin_trylock(lock))
+#define spin_trylock_irq(lock)	rt_spin_trylock(lock)
 
-#define spin_trylock_irqsave(lock, flags)		\
-	__cond_lock(lock, ({				\
-		typecheck(unsigned long, flags);	\
-		flags = 0;				\
-		rt_spin_trylock(lock);			\
-	}))
+static __always_inline bool _spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags)
+	__cond_acquires(true, lock)
+{
+	*flags = 0;
+	return rt_spin_trylock(lock);
+}
+#define spin_trylock_irqsave(lock, flags) _spin_trylock_irqsave(lock, &(flags))
 
 #define spin_is_contended(lock)		(((void)(lock), 0))
 
diff --git a/kernel/signal.c b/kernel/signal.c
index e42b8bd6922f..d65d0fe24bfb 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1355,8 +1355,8 @@ int zap_other_threads(struct task_struct *p)
 	return count;
 }
 
-struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
-					   unsigned long *flags)
+struct sighand_struct *lock_task_sighand(struct task_struct *tsk,
+					 unsigned long *flags)
 {
 	struct sighand_struct *sighand;
 
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 80a8a09a21a0..413e2389f0a5 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -66,14 +66,7 @@ static const struct k_clock clock_realtime, clock_monotonic;
 #error "SIGEV_THREAD_ID must not share bit with other SIGEV values!"
 #endif
 
-static struct k_itimer *__lock_timer(timer_t timer_id);
-
-#define lock_timer(tid)							\
-({	struct k_itimer *__timr;					\
-	__cond_lock(&__timr->it_lock, __timr = __lock_timer(tid));	\
-	__timr;								\
-})
-
+static struct k_itimer *lock_timer(timer_t timer_id);
 static inline void unlock_timer(struct k_itimer *timr)
 {
 	if (likely((timr)))
@@ -85,7 +78,7 @@ static inline void unlock_timer(struct k_itimer *timr)
 
 #define scoped_timer				(scope)
 
-DEFINE_CLASS(lock_timer, struct k_itimer *, unlock_timer(_T), __lock_timer(id), timer_t id);
+DEFINE_CLASS(lock_timer, struct k_itimer *, unlock_timer(_T), lock_timer(id), timer_t id);
 DEFINE_CLASS_IS_COND_GUARD(lock_timer);
 
 static struct timer_hash_bucket *hash_bucket(struct signal_struct *sig, unsigned int nr)
@@ -600,7 +593,7 @@ COMPAT_SYSCALL_DEFINE3(timer_create, clockid_t, which_clock,
 }
 #endif
 
-static struct k_itimer *__lock_timer(timer_t timer_id)
+static struct k_itimer *lock_timer(timer_t timer_id)
 {
 	struct k_itimer *timr;
 
diff --git a/lib/dec_and_lock.c b/lib/dec_and_lock.c
index 1dcca8f2e194..8c7c398fd770 100644
--- a/lib/dec_and_lock.c
+++ b/lib/dec_and_lock.c
@@ -18,7 +18,7 @@
  * because the spin-lock and the decrement must be
  * "atomic".
  */
-int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock)
+int atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock)
 {
 	/* Subtract 1 from counter unless that drops it to 0 (ie. it was 1) */
 	if (atomic_add_unless(atomic, -1, 1))
@@ -32,7 +32,7 @@ int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock)
 	return 0;
 }
 
-EXPORT_SYMBOL(_atomic_dec_and_lock);
+EXPORT_SYMBOL(atomic_dec_and_lock);
 
 int _atomic_dec_and_lock_irqsave(atomic_t *atomic, spinlock_t *lock,
 				 unsigned long *flags)
@@ -50,7 +50,7 @@ int _atomic_dec_and_lock_irqsave(atomic_t *atomic, spinlock_t *lock,
 }
 EXPORT_SYMBOL(_atomic_dec_and_lock_irqsave);
 
-int _atomic_dec_and_raw_lock(atomic_t *atomic, raw_spinlock_t *lock)
+int atomic_dec_and_raw_lock(atomic_t *atomic, raw_spinlock_t *lock)
 {
 	/* Subtract 1 from counter unless that drops it to 0 (ie. it was 1) */
 	if (atomic_add_unless(atomic, -1, 1))
@@ -63,7 +63,7 @@ int _atomic_dec_and_raw_lock(atomic_t *atomic, raw_spinlock_t *lock)
 	raw_spin_unlock(lock);
 	return 0;
 }
-EXPORT_SYMBOL(_atomic_dec_and_raw_lock);
+EXPORT_SYMBOL(atomic_dec_and_raw_lock);
 
 int _atomic_dec_and_raw_lock_irqsave(atomic_t *atomic, raw_spinlock_t *lock,
 				     unsigned long *flags)
diff --git a/lib/lockref.c b/lib/lockref.c
index 9210fc6ae714..5d8e3ef3860e 100644
--- a/lib/lockref.c
+++ b/lib/lockref.c
@@ -105,7 +105,6 @@ EXPORT_SYMBOL(lockref_put_return);
  * @lockref: pointer to lockref structure
  * Return: 1 if count updated successfully or 0 if count <= 1 and lock taken
  */
-#undef lockref_put_or_lock
 bool lockref_put_or_lock(struct lockref *lockref)
 {
 	CMPXCHG_LOOP(
diff --git a/mm/memory.c b/mm/memory.c
index 2a55edc48a65..b751e1f85abc 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2210,8 +2210,8 @@ static pmd_t *walk_to_pmd(struct mm_struct *mm, unsigned long addr)
 	return pmd;
 }
 
-pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
-			spinlock_t **ptl)
+pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr,
+		      spinlock_t **ptl)
 {
 	pmd_t *pmd = walk_to_pmd(mm, addr);
 
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index d3aec7a9926a..af7966169d69 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -280,7 +280,7 @@ static unsigned long pmdp_get_lockless_start(void) { return 0; }
 static void pmdp_get_lockless_end(unsigned long irqflags) { }
 #endif
 
-pte_t *___pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp)
+pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp)
 {
 	unsigned long irqflags;
 	pmd_t pmdval;
@@ -332,13 +332,12 @@ pte_t *pte_offset_map_rw_nolock(struct mm_struct *mm, pmd_t *pmd,
 }
 
 /*
- * pte_offset_map_lock(mm, pmd, addr, ptlp), and its internal implementation
- * __pte_offset_map_lock() below, is usually called with the pmd pointer for
- * addr, reached by walking down the mm's pgd, p4d, pud for addr: either while
- * holding mmap_lock or vma lock for read or for write; or in truncate or rmap
- * context, while holding file's i_mmap_lock or anon_vma lock for read (or for
- * write). In a few cases, it may be used with pmd pointing to a pmd_t already
- * copied to or constructed on the stack.
+ * pte_offset_map_lock(mm, pmd, addr, ptlp) is usually called with the pmd
+ * pointer for addr, reached by walking down the mm's pgd, p4d, pud for addr:
+ * either while holding mmap_lock or vma lock for read or for write; or in
+ * truncate or rmap context, while holding file's i_mmap_lock or anon_vma lock
+ * for read (or for write). In a few cases, it may be used with pmd pointing to
+ * a pmd_t already copied to or constructed on the stack.
  *
  * When successful, it returns the pte pointer for addr, with its page table
  * kmapped if necessary (when CONFIG_HIGHPTE), and locked against concurrent
@@ -389,8 +388,8 @@ pte_t *pte_offset_map_rw_nolock(struct mm_struct *mm, pmd_t *pmd,
  * table, and may not use RCU at all: "outsiders" like khugepaged should avoid
  * pte_offset_map() and co once the vma is detached from mm or mm_users is zero.
  */
-pte_t *__pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd,
-			     unsigned long addr, spinlock_t **ptlp)
+pte_t *pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd,
+			   unsigned long addr, spinlock_t **ptlp)
 {
 	spinlock_t *ptl;
 	pmd_t pmdval;
diff --git a/tools/include/linux/compiler_types.h b/tools/include/linux/compiler_types.h
index d09f9dc172a4..067a5b4e0f7b 100644
--- a/tools/include/linux/compiler_types.h
+++ b/tools/include/linux/compiler_types.h
@@ -20,7 +20,6 @@
 # define __releases(x)	__attribute__((context(x,1,0)))
 # define __acquire(x)	__context__(x,1)
 # define __release(x)	__context__(x,-1)
-# define __cond_lock(x,c)	((c) ? ({ __acquire(x); 1; }) : 0)
 #else /* __CHECKER__ */
 /* context/locking */
 # define __must_hold(x)
@@ -28,7 +27,6 @@
 # define __releases(x)
 # define __acquire(x)	(void)0
 # define __release(x)	(void)0
-# define __cond_lock(x,c) (c)
 #endif /* __CHECKER__ */
 
 /* Compiler specific macros. */
-- 
cgit v1.2.3


From 623ba6ea45979fb1d06c5c8f03417ecc3565a851 Mon Sep 17 00:00:00 2001
From: Gary Guo <gary@garyguo.net>
Date: Mon, 5 Jan 2026 15:00:57 +0000
Subject: perf symbol: Remove Rust symbol workarounds
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Due to an off-by-one error introduced in commit 73bbb94466fd3f8b
("kallsyms: support "big" kernel symbols"), long symbols (which are
currently only produced by Rust) can have their symbol type being
wrongly parsed by kernel/kallsyms.c.

This has been fixed in commit f3f9f42232dee596 ("kallsyms: Fix wrong
"big" kernel symbol type read from procfs"), and these symbols are now
reported correctly.

Drop the workaround in perf symbol that filter out these symbol types.

Specifically, '1' and 'l' can never be generated by nm -- 'u' does
indicate GNU unique, however such symbols are only generated by G++ for
C++ templates, and are never generated by LLVM (LLVM generates weak
symbols in such cases instead).

'N' can appear if symbols exist inside debug sections, and 'n' may
appear for symbols inside note sections, however these sections do not
typically have symbol (and they're explicitly filtered out by kallsyms).

Therefore, the previous occurrence of these symbols types must be due to
the off-by-one error and can be safely removed.

Signed-off-by: Gary Guo <gary@garyguo.net>
Acked-by: Miguel Ojeda <ojeda@kernel.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Alice Ryhl <aliceryhl@google.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Andreas Hindborg <a.hindborg@kernel.org>
Cc: Benno Lossin <lossin@kernel.org>
Cc: Bill Wendling <morbo@google.com>
Cc: Björn Roy Baron <bjorn3_gh@protonmail.com>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: Dmitriy Vyukov <dvyukov@google.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Justin Stitt <justinstitt@google.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Nathan Chancellor <nathan@kernel.org>
Cc: Nick Desaulniers <nick.desaulniers+lkml@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephen Brennan <stephen.s.brennan@oracle.com>
Cc: Trevor Gross <tmgross@umich.edu>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/symbol.c | 19 ++++---------------
 1 file changed, 4 insertions(+), 15 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c
index 814f960fa8f8..8662001e1e25 100644
--- a/tools/perf/util/symbol.c
+++ b/tools/perf/util/symbol.c
@@ -104,21 +104,10 @@ static enum dso_binary_type binary_type_symtab[] = {
 
 #define DSO_BINARY_TYPE__SYMTAB_CNT ARRAY_SIZE(binary_type_symtab)
 
-static bool symbol_type__filter(char __symbol_type)
-{
-	// Since 'U' == undefined and 'u' == unique global symbol, we can't use toupper there
-	// 'N' is for debugging symbols, 'n' is a non-data, non-code, non-debug read-only section.
-	// According to 'man nm'.
-	// 'N' first seen in:
-	// ffffffff9b35d130 N __pfx__RNCINvNtNtNtCsbDUBuN8AbD4_4core4iter8adapters3map12map_try_foldjNtCs6vVzKs5jPr6_12drm_panic_qr7VersionuINtNtNtBa_3ops12control_flow11ControlFlowB10_ENcB10_0NCINvNvNtNtNtB8_6traits8iterator8Iterator4find5checkB10_NCNvMB12_B10_13from_segments0E0E0B12_
-	// a seemingly Rust mangled name
-	// Ditto for '1':
-	// root@x1:~# grep ' 1 ' /proc/kallsyms
-	// ffffffffb098bc00 1 __pfx__RNCINvNtNtNtCsfwaGRd4cjqE_4core4iter8adapters3map12map_try_foldjNtCskFudTml27HW_12drm_panic_qr7VersionuINtNtNtBa_3ops12control_flow11ControlFlowB10_ENcB10_0NCINvNvNtNtNtB8_6traits8iterator8Iterator4find5checkB10_NCNvMB12_B10_13from_segments0E0E0B12_
-	// ffffffffb098bc10 1 _RNCINvNtNtNtCsfwaGRd4cjqE_4core4iter8adapters3map12map_try_foldjNtCskFudTml27HW_12drm_panic_qr7VersionuINtNtNtBa_3ops12control_flow11ControlFlowB10_ENcB10_0NCINvNvNtNtNtB8_6traits8iterator8Iterator4find5checkB10_NCNvMB12_B10_13from_segments0E0E0B12_
-	char symbol_type = toupper(__symbol_type);
-	return symbol_type == 'T' || symbol_type == 'W' || symbol_type == 'D' || symbol_type == 'B' ||
-	       __symbol_type == 'u' || __symbol_type == 'l' || __symbol_type == 'N' || __symbol_type == '1';
+static bool symbol_type__filter(char symbol_type)
+{
+	symbol_type = toupper(symbol_type);
+	return symbol_type == 'T' || symbol_type == 'W' || symbol_type == 'D' || symbol_type == 'B';
 }
 
 static int prefix_underscores_count(const char *str)
-- 
cgit v1.2.3


From f2546eba53bbe38c4bb950f78625ccf4b1a2cbc8 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Mon, 15 Dec 2025 16:56:15 -0800
Subject: cxl/mem: Drop @host argument to devm_cxl_add_memdev()

In all cases the device that created the 'struct cxl_dev_state' instance is
also the device to host the devm cleanup of devm_cxl_add_memdev(). This
simplifies the function prototype, and limits a degree of freedom of the
API.

Cc: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Reviewed-by: Alison Schofield <alison.schofield@intel.com>
Reviewed-by: Dave Jiang <dave.jiang@intel.com>
Reviewed-by: Ben Cheatham <benjamin.cheatham@amd.com>
Tested-by: Alejandro Lucero <alucerop@amd.com>
Link: https://patch.msgid.link/20251216005616.3090129-6-dan.j.williams@intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
---
 drivers/cxl/core/memdev.c    | 3 +--
 drivers/cxl/cxlmem.h         | 6 ++----
 drivers/cxl/mem.c            | 9 +++++----
 drivers/cxl/pci.c            | 2 +-
 tools/testing/cxl/test/mem.c | 2 +-
 5 files changed, 10 insertions(+), 12 deletions(-)

(limited to 'tools')

diff --git a/drivers/cxl/core/memdev.c b/drivers/cxl/core/memdev.c
index 92aea95859fb..935a163f1527 100644
--- a/drivers/cxl/core/memdev.c
+++ b/drivers/cxl/core/memdev.c
@@ -1093,8 +1093,7 @@ static struct cxl_memdev *cxl_memdev_autoremove(struct cxl_memdev *cxlmd)
  * Core helper for devm_cxl_add_memdev() that wants to both create a device and
  * assert to the caller that upon return cxl_mem::probe() has been invoked.
  */
-struct cxl_memdev *__devm_cxl_add_memdev(struct device *host,
-					 struct cxl_dev_state *cxlds)
+struct cxl_memdev *__devm_cxl_add_memdev(struct cxl_dev_state *cxlds)
 {
 	struct device *dev;
 	int rc;
diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
index 012e68acad34..9db31c7993c4 100644
--- a/drivers/cxl/cxlmem.h
+++ b/drivers/cxl/cxlmem.h
@@ -95,10 +95,8 @@ static inline bool is_cxl_endpoint(struct cxl_port *port)
 	return is_cxl_memdev(port->uport_dev);
 }
 
-struct cxl_memdev *__devm_cxl_add_memdev(struct device *host,
-					 struct cxl_dev_state *cxlds);
-struct cxl_memdev *devm_cxl_add_memdev(struct device *host,
-				       struct cxl_dev_state *cxlds);
+struct cxl_memdev *__devm_cxl_add_memdev(struct cxl_dev_state *cxlds);
+struct cxl_memdev *devm_cxl_add_memdev(struct cxl_dev_state *cxlds);
 int devm_cxl_sanitize_setup_notifier(struct device *host,
 				     struct cxl_memdev *cxlmd);
 struct cxl_memdev_state;
diff --git a/drivers/cxl/mem.c b/drivers/cxl/mem.c
index d62931526fd4..677996c65272 100644
--- a/drivers/cxl/mem.c
+++ b/drivers/cxl/mem.c
@@ -165,17 +165,18 @@ static int cxl_mem_probe(struct device *dev)
 
 /**
  * devm_cxl_add_memdev - Add a CXL memory device
- * @host: devres alloc/release context and parent for the memdev
  * @cxlds: CXL device state to associate with the memdev
  *
  * Upon return the device will have had a chance to attach to the
  * cxl_mem driver, but may fail if the CXL topology is not ready
  * (hardware CXL link down, or software platform CXL root not attached)
+ *
+ * The parent of the resulting device and the devm context for allocations is
+ * @cxlds->dev.
  */
-struct cxl_memdev *devm_cxl_add_memdev(struct device *host,
-				       struct cxl_dev_state *cxlds)
+struct cxl_memdev *devm_cxl_add_memdev(struct cxl_dev_state *cxlds)
 {
-	return __devm_cxl_add_memdev(host, cxlds);
+	return __devm_cxl_add_memdev(cxlds);
 }
 EXPORT_SYMBOL_NS_GPL(devm_cxl_add_memdev, "CXL");
 
diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
index 0be4e508affe..1c6fc5334806 100644
--- a/drivers/cxl/pci.c
+++ b/drivers/cxl/pci.c
@@ -1006,7 +1006,7 @@ static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	if (rc)
 		dev_dbg(&pdev->dev, "No CXL Features discovered\n");
 
-	cxlmd = devm_cxl_add_memdev(&pdev->dev, cxlds);
+	cxlmd = devm_cxl_add_memdev(cxlds);
 	if (IS_ERR(cxlmd))
 		return PTR_ERR(cxlmd);
 
diff --git a/tools/testing/cxl/test/mem.c b/tools/testing/cxl/test/mem.c
index 176dcde570cd..8a22b7601627 100644
--- a/tools/testing/cxl/test/mem.c
+++ b/tools/testing/cxl/test/mem.c
@@ -1767,7 +1767,7 @@ static int cxl_mock_mem_probe(struct platform_device *pdev)
 
 	cxl_mock_add_event_logs(&mdata->mes);
 
-	cxlmd = devm_cxl_add_memdev(&pdev->dev, cxlds);
+	cxlmd = devm_cxl_add_memdev(cxlds);
 	if (IS_ERR(cxlmd))
 		return PTR_ERR(cxlmd);
 
-- 
cgit v1.2.3


From 29317f8dc6ed601ec54575689c2cd55cc470bcce Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Mon, 15 Dec 2025 16:56:16 -0800
Subject: cxl/mem: Introduce cxl_memdev_attach for CXL-dependent operation

Unlike the cxl_pci class driver that opportunistically enables memory
expansion with no other dependent functionality, CXL accelerator drivers
have distinct PCIe-only and CXL-enhanced operation states. If CXL is
available some additional coherent memory/cache operations can be enabled,
otherwise traditional DMA+MMIO over PCIe/CXL.io is a fallback.

This constitutes a new mode of operation where the caller of
devm_cxl_add_memdev() wants to make a "go/no-go" decision about running
in CXL accelerated mode or falling back to PCIe-only operation. Part of
that decision making process likely also includes additional
CXL-acceleration-specific resource setup. Encapsulate both of those
requirements into 'struct cxl_memdev_attach' that provides a ->probe()
callback. The probe callback runs in cxl_mem_probe() context, after the
port topology is successfully attached for the given memdev. It supports
a contract where, upon successful return from devm_cxl_add_memdev(),
everything needed for CXL accelerated operation has been enabled.

Additionally the presence of @cxlmd->attach indicates that the accelerator
driver be detached when CXL operation ends. This conceptually makes a CXL
link loss event mirror a PCIe link loss event which results in triggering
the ->remove() callback of affected devices+drivers. A driver can re-attach
to recover back to PCIe-only operation. Live recovery, i.e. without a
->remove()/->probe() cycle, is left as a future consideration.

[ dj: Repalce with updated commit log from Dan ]

Cc: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
Reviewed-by: Ben Cheatham <benjamin.cheatham@amd.com>
Reviewed-by: Dave Jiang <dave.jiang@intel.com>
Tested-by: Alejandro Lucero <alucerop@amd.com>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Link: https://patch.msgid.link/20251216005616.3090129-7-dan.j.williams@intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
---
 drivers/cxl/core/memdev.c    | 33 +++++++++++++++++++++++++++++----
 drivers/cxl/cxlmem.h         | 12 ++++++++++--
 drivers/cxl/mem.c            | 20 ++++++++++++++++----
 drivers/cxl/pci.c            |  2 +-
 tools/testing/cxl/test/mem.c |  2 +-
 5 files changed, 57 insertions(+), 12 deletions(-)

(limited to 'tools')

diff --git a/drivers/cxl/core/memdev.c b/drivers/cxl/core/memdev.c
index 935a163f1527..af3d0cc65138 100644
--- a/drivers/cxl/core/memdev.c
+++ b/drivers/cxl/core/memdev.c
@@ -641,14 +641,24 @@ static void detach_memdev(struct work_struct *work)
 	struct cxl_memdev *cxlmd;
 
 	cxlmd = container_of(work, typeof(*cxlmd), detach_work);
-	device_release_driver(&cxlmd->dev);
+
+	/*
+	 * When the creator of @cxlmd sets ->attach it indicates CXL operation
+	 * is required. In that case, @cxlmd detach escalates to parent device
+	 * detach.
+	 */
+	if (cxlmd->attach)
+		device_release_driver(cxlmd->dev.parent);
+	else
+		device_release_driver(&cxlmd->dev);
 	put_device(&cxlmd->dev);
 }
 
 static struct lock_class_key cxl_memdev_key;
 
 static struct cxl_memdev *cxl_memdev_alloc(struct cxl_dev_state *cxlds,
-					   const struct file_operations *fops)
+					   const struct file_operations *fops,
+					   const struct cxl_memdev_attach *attach)
 {
 	struct cxl_memdev *cxlmd;
 	struct device *dev;
@@ -664,6 +674,8 @@ static struct cxl_memdev *cxl_memdev_alloc(struct cxl_dev_state *cxlds,
 		goto err;
 	cxlmd->id = rc;
 	cxlmd->depth = -1;
+	cxlmd->attach = attach;
+	cxlmd->endpoint = ERR_PTR(-ENXIO);
 
 	dev = &cxlmd->dev;
 	device_initialize(dev);
@@ -1081,6 +1093,18 @@ static struct cxl_memdev *cxl_memdev_autoremove(struct cxl_memdev *cxlmd)
 {
 	int rc;
 
+	/*
+	 * If @attach is provided fail if the driver is not attached upon
+	 * return. Note that failure here could be the result of a race to
+	 * teardown the CXL port topology. I.e. cxl_mem_probe() could have
+	 * succeeded and then cxl_mem unbound before the lock is acquired.
+	 */
+	guard(device)(&cxlmd->dev);
+	if (cxlmd->attach && !cxlmd->dev.driver) {
+		cxl_memdev_unregister(cxlmd);
+		return ERR_PTR(-ENXIO);
+	}
+
 	rc = devm_add_action_or_reset(cxlmd->cxlds->dev, cxl_memdev_unregister,
 				      cxlmd);
 	if (rc)
@@ -1093,13 +1117,14 @@ static struct cxl_memdev *cxl_memdev_autoremove(struct cxl_memdev *cxlmd)
  * Core helper for devm_cxl_add_memdev() that wants to both create a device and
  * assert to the caller that upon return cxl_mem::probe() has been invoked.
  */
-struct cxl_memdev *__devm_cxl_add_memdev(struct cxl_dev_state *cxlds)
+struct cxl_memdev *__devm_cxl_add_memdev(struct cxl_dev_state *cxlds,
+					 const struct cxl_memdev_attach *attach)
 {
 	struct device *dev;
 	int rc;
 
 	struct cxl_memdev *cxlmd __free(put_cxlmd) =
-		cxl_memdev_alloc(cxlds, &cxl_memdev_fops);
+		cxl_memdev_alloc(cxlds, &cxl_memdev_fops, attach);
 	if (IS_ERR(cxlmd))
 		return cxlmd;
 
diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
index 9db31c7993c4..ef202b34e5ea 100644
--- a/drivers/cxl/cxlmem.h
+++ b/drivers/cxl/cxlmem.h
@@ -34,6 +34,10 @@
 	(FIELD_GET(CXLMDEV_RESET_NEEDED_MASK, status) !=                       \
 	 CXLMDEV_RESET_NEEDED_NOT)
 
+struct cxl_memdev_attach {
+	int (*probe)(struct cxl_memdev *cxlmd);
+};
+
 /**
  * struct cxl_memdev - CXL bus object representing a Type-3 Memory Device
  * @dev: driver core device object
@@ -43,6 +47,7 @@
  * @cxl_nvb: coordinate removal of @cxl_nvd if present
  * @cxl_nvd: optional bridge to an nvdimm if the device supports pmem
  * @endpoint: connection to the CXL port topology for this memory device
+ * @attach: creator of this memdev depends on CXL link attach to operate
  * @id: id number of this memdev instance.
  * @depth: endpoint port depth
  * @scrub_cycle: current scrub cycle set for this device
@@ -59,6 +64,7 @@ struct cxl_memdev {
 	struct cxl_nvdimm_bridge *cxl_nvb;
 	struct cxl_nvdimm *cxl_nvd;
 	struct cxl_port *endpoint;
+	const struct cxl_memdev_attach *attach;
 	int id;
 	int depth;
 	u8 scrub_cycle;
@@ -95,8 +101,10 @@ static inline bool is_cxl_endpoint(struct cxl_port *port)
 	return is_cxl_memdev(port->uport_dev);
 }
 
-struct cxl_memdev *__devm_cxl_add_memdev(struct cxl_dev_state *cxlds);
-struct cxl_memdev *devm_cxl_add_memdev(struct cxl_dev_state *cxlds);
+struct cxl_memdev *__devm_cxl_add_memdev(struct cxl_dev_state *cxlds,
+					 const struct cxl_memdev_attach *attach);
+struct cxl_memdev *devm_cxl_add_memdev(struct cxl_dev_state *cxlds,
+				       const struct cxl_memdev_attach *attach);
 int devm_cxl_sanitize_setup_notifier(struct device *host,
 				     struct cxl_memdev *cxlmd);
 struct cxl_memdev_state;
diff --git a/drivers/cxl/mem.c b/drivers/cxl/mem.c
index 677996c65272..333c366b69e7 100644
--- a/drivers/cxl/mem.c
+++ b/drivers/cxl/mem.c
@@ -142,6 +142,12 @@ static int cxl_mem_probe(struct device *dev)
 			return rc;
 	}
 
+	if (cxlmd->attach) {
+		rc = cxlmd->attach->probe(cxlmd);
+		if (rc)
+			return rc;
+	}
+
 	rc = devm_cxl_memdev_edac_register(cxlmd);
 	if (rc)
 		dev_dbg(dev, "CXL memdev EDAC registration failed rc=%d\n", rc);
@@ -166,17 +172,23 @@ static int cxl_mem_probe(struct device *dev)
 /**
  * devm_cxl_add_memdev - Add a CXL memory device
  * @cxlds: CXL device state to associate with the memdev
+ * @attach: Caller depends on CXL topology attachment
  *
  * Upon return the device will have had a chance to attach to the
- * cxl_mem driver, but may fail if the CXL topology is not ready
- * (hardware CXL link down, or software platform CXL root not attached)
+ * cxl_mem driver, but may fail to attach if the CXL topology is not ready
+ * (hardware CXL link down, or software platform CXL root not attached).
+ *
+ * When @attach is NULL it indicates the caller wants the memdev to remain
+ * registered even if it does not immediately attach to the CXL hierarchy. When
+ * @attach is provided a cxl_mem_probe() failure leads to failure of this routine.
  *
  * The parent of the resulting device and the devm context for allocations is
  * @cxlds->dev.
  */
-struct cxl_memdev *devm_cxl_add_memdev(struct cxl_dev_state *cxlds)
+struct cxl_memdev *devm_cxl_add_memdev(struct cxl_dev_state *cxlds,
+				       const struct cxl_memdev_attach *attach)
 {
-	return __devm_cxl_add_memdev(cxlds);
+	return __devm_cxl_add_memdev(cxlds, attach);
 }
 EXPORT_SYMBOL_NS_GPL(devm_cxl_add_memdev, "CXL");
 
diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
index 1c6fc5334806..549368a9c868 100644
--- a/drivers/cxl/pci.c
+++ b/drivers/cxl/pci.c
@@ -1006,7 +1006,7 @@ static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	if (rc)
 		dev_dbg(&pdev->dev, "No CXL Features discovered\n");
 
-	cxlmd = devm_cxl_add_memdev(cxlds);
+	cxlmd = devm_cxl_add_memdev(cxlds, NULL);
 	if (IS_ERR(cxlmd))
 		return PTR_ERR(cxlmd);
 
diff --git a/tools/testing/cxl/test/mem.c b/tools/testing/cxl/test/mem.c
index 8a22b7601627..cb87e8c0e63c 100644
--- a/tools/testing/cxl/test/mem.c
+++ b/tools/testing/cxl/test/mem.c
@@ -1767,7 +1767,7 @@ static int cxl_mock_mem_probe(struct platform_device *pdev)
 
 	cxl_mock_add_event_logs(&mdata->mes);
 
-	cxlmd = devm_cxl_add_memdev(cxlds);
+	cxlmd = devm_cxl_add_memdev(cxlds, NULL);
 	if (IS_ERR(cxlmd))
 		return PTR_ERR(cxlmd);
 
-- 
cgit v1.2.3


From fb36d71308a770268c771d6697f22615e5ddbd6e Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Fri, 19 Dec 2025 15:29:42 +0000
Subject: kselftest/arm64: Support FORCE_TARGETS

The top level kselftest Makefile supports an option FORCE_TARGETS which
causes any failures during the build to be propagated to the exit status
of the top level make, useful during build testing. Currently the recursion
done by the arm64 selftests ignores this option, meaning arm64 failures are
not reported via this mechanism. Add the logic to implement FORCE_TARGETS
so that it works for arm64.

Signed-off-by: Mark Brown <broonie@kernel.org>
Acked-by: Shuah Khan <skhan@linuxfoundation.org>
Signed-off-by: Will Deacon <will@kernel.org>
---
 tools/testing/selftests/arm64/Makefile | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/arm64/Makefile b/tools/testing/selftests/arm64/Makefile
index c4c72ee2ef55..e456f3b62fa1 100644
--- a/tools/testing/selftests/arm64/Makefile
+++ b/tools/testing/selftests/arm64/Makefile
@@ -30,13 +30,15 @@ all:
 	@for DIR in $(ARM64_SUBTARGETS); do				\
 		BUILD_TARGET=$(OUTPUT)/$$DIR;			\
 		mkdir -p $$BUILD_TARGET;			\
-		make OUTPUT=$$BUILD_TARGET -C $$DIR $@;		\
+		make OUTPUT=$$BUILD_TARGET -C $$DIR $@		\
+			$(if $(FORCE_TARGETS),|| exit); \
 	done
 
 install: all
 	@for DIR in $(ARM64_SUBTARGETS); do				\
 		BUILD_TARGET=$(OUTPUT)/$$DIR;			\
-		make OUTPUT=$$BUILD_TARGET -C $$DIR $@;		\
+		make OUTPUT=$$BUILD_TARGET -C $$DIR $@		\
+			$(if $(FORCE_TARGETS),|| exit); \
 	done
 
 run_tests: all
-- 
cgit v1.2.3


From 5c7a4741431b0a938dcbd22b90a4dc9a2903fc00 Mon Sep 17 00:00:00 2001
From: Ryota Sakamoto <sakamo.ryota@gmail.com>
Date: Tue, 6 Jan 2026 01:41:01 +0900
Subject: kunit: respect KBUILD_OUTPUT env variable by default

Currently, kunit.py ignores the KBUILD_OUTPUT env variable and always
defaults to .kunit in the working directory. This behavior is inconsistent
with standard Kbuild behavior, where KBUILD_OUTPUT defines the build
artifact location.

This patch modifies kunit.py to respect KBUILD_OUTPUT if set.  A .kunit
subdirectory is created inside KBUILD_OUTPUT to avoid polluting the build
directory.

Link: https://lore.kernel.org/r/20260106-kunit-kbuild_output-v2-1-582281797343@gmail.com
Reviewed-by: David Gow <davidgow@google.com>
Signed-off-by: Ryota Sakamoto <sakamo.ryota@gmail.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/kunit/kunit.py           |  7 ++++++-
 tools/testing/kunit/kunit_tool_test.py | 19 +++++++++++++++++++
 2 files changed, 25 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/kunit/kunit.py b/tools/testing/kunit/kunit.py
index cd99c1956331..e3d82a038f93 100755
--- a/tools/testing/kunit/kunit.py
+++ b/tools/testing/kunit/kunit.py
@@ -323,11 +323,16 @@ def get_default_jobs() -> int:
 		return ncpu
 	raise RuntimeError("os.cpu_count() returned None")
 
+def get_default_build_dir() -> str:
+	if 'KBUILD_OUTPUT' in os.environ:
+		return os.path.join(os.environ['KBUILD_OUTPUT'], '.kunit')
+	return '.kunit'
+
 def add_common_opts(parser: argparse.ArgumentParser) -> None:
 	parser.add_argument('--build_dir',
 			    help='As in the make command, it specifies the build '
 			    'directory.',
-			    type=str, default='.kunit', metavar='DIR')
+			    type=str, default=get_default_build_dir(), metavar='DIR')
 	parser.add_argument('--make_options',
 			    help='X=Y make option, can be repeated.',
 			    action='append', metavar='X=Y')
diff --git a/tools/testing/kunit/kunit_tool_test.py b/tools/testing/kunit/kunit_tool_test.py
index bbba921e0eac..a55b5085310d 100755
--- a/tools/testing/kunit/kunit_tool_test.py
+++ b/tools/testing/kunit/kunit_tool_test.py
@@ -601,6 +601,7 @@ class KUnitMainTest(unittest.TestCase):
 			all_passed_log = file.readlines()
 
 		self.print_mock = mock.patch('kunit_printer.Printer.print').start()
+		mock.patch.dict(os.environ, clear=True).start()
 		self.addCleanup(mock.patch.stopall)
 
 		self.mock_linux_init = mock.patch.object(kunit_kernel, 'LinuxSourceTree').start()
@@ -723,6 +724,24 @@ class KUnitMainTest(unittest.TestCase):
 			args=None, build_dir=build_dir, filter_glob='', filter='', filter_action=None, timeout=300)
 		self.print_mock.assert_any_call(StrContains('Testing complete.'))
 
+	@mock.patch.dict(os.environ, {'KBUILD_OUTPUT': '/tmp'})
+	def test_run_builddir_from_env(self):
+		build_dir = '/tmp/.kunit'
+		kunit.main(['run'])
+		self.assertEqual(self.linux_source_mock.build_reconfig.call_count, 1)
+		self.linux_source_mock.run_kernel.assert_called_once_with(
+			args=None, build_dir=build_dir, filter_glob='', filter='', filter_action=None, timeout=300)
+		self.print_mock.assert_any_call(StrContains('Testing complete.'))
+
+	@mock.patch.dict(os.environ, {'KBUILD_OUTPUT': '/tmp'})
+	def test_run_builddir_override(self):
+		build_dir = '.kunit'
+		kunit.main(['run', '--build_dir=.kunit'])
+		self.assertEqual(self.linux_source_mock.build_reconfig.call_count, 1)
+		self.linux_source_mock.run_kernel.assert_called_once_with(
+			args=None, build_dir=build_dir, filter_glob='', filter='', filter_action=None, timeout=300)
+		self.print_mock.assert_any_call(StrContains('Testing complete.'))
+
 	def test_config_builddir(self):
 		build_dir = '.kunit'
 		kunit.main(['config', '--build_dir', build_dir])
-- 
cgit v1.2.3


From 0c5b86c67fb6898d02c8f92de884186297fd302f Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
Date: Tue, 30 Dec 2025 13:26:35 +0100
Subject: kunit: tool: Add test for nested test result reporting
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently there is a lack of tests validating the result reporting from
nested tests. Add one, it will also be used to validate upcoming changes
to the nested test parsing.

Link: https://lore.kernel.org/r/20251230-kunit-nested-failure-v1-1-98cfbeb87823@linutronix.de
Signed-off-by: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
Reviewed-by: Rae Moar <rmoar@google.com>
Reviewed-by: David Gow <davidgow@google.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/kunit/kunit_tool_test.py                         | 10 ++++++++++
 .../kunit/test_data/test_is_test_passed-failure-nested.log     |  7 +++++++
 2 files changed, 17 insertions(+)
 create mode 100644 tools/testing/kunit/test_data/test_is_test_passed-failure-nested.log

(limited to 'tools')

diff --git a/tools/testing/kunit/kunit_tool_test.py b/tools/testing/kunit/kunit_tool_test.py
index a55b5085310d..81a0996edef4 100755
--- a/tools/testing/kunit/kunit_tool_test.py
+++ b/tools/testing/kunit/kunit_tool_test.py
@@ -165,6 +165,16 @@ class KUnitParserTest(unittest.TestCase):
 		self.assertEqual(kunit_parser.TestStatus.FAILURE, result.status)
 		self.assertEqual(result.counts.errors, 0)
 
+	def test_parse_failed_nested_tests_log(self):
+		nested_log = test_data_path('test_is_test_passed-failure-nested.log')
+		with open(nested_log) as file:
+			result = kunit_parser.parse_run_tests(file.readlines(), stdout)
+		self.assertEqual(kunit_parser.TestStatus.FAILURE, result.status)
+		self.assertEqual(result.counts.failed, 2)
+		self.assertEqual(kunit_parser.TestStatus.FAILURE, result.subtests[0].status)
+		self.assertEqual(kunit_parser.TestStatus.FAILURE, result.subtests[1].status)
+		self.assertEqual(kunit_parser.TestStatus.FAILURE, result.subtests[1].subtests[0].status)
+
 	def test_no_header(self):
 		empty_log = test_data_path('test_is_test_passed-no_tests_run_no_header.log')
 		with open(empty_log) as file:
diff --git a/tools/testing/kunit/test_data/test_is_test_passed-failure-nested.log b/tools/testing/kunit/test_data/test_is_test_passed-failure-nested.log
new file mode 100644
index 000000000000..2e528da39ab5
--- /dev/null
+++ b/tools/testing/kunit/test_data/test_is_test_passed-failure-nested.log
@@ -0,0 +1,7 @@
+KTAP version 1
+1..2
+not ok 1 subtest 1
+    KTAP version 1
+    1..1
+        not ok 1 subsubtest 1
+not ok 2 subtest 2
-- 
cgit v1.2.3


From 85aff81b0dba7c42d226d9f7c11c4d30a7906878 Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
Date: Tue, 30 Dec 2025 13:26:36 +0100
Subject: kunit: tool: Don't overwrite test status based on subtest counts
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

If a subtest itself reports success, but the outer testcase fails,
the whole testcase should be reported as a failure. However the status
is recalculated based on the test counts, overwriting the outer test
result. Synthesize a failed test in this case to make sure the failure
is not swallowed.

Link: https://lore.kernel.org/r/20251230-kunit-nested-failure-v1-2-98cfbeb87823@linutronix.de
Signed-off-by: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
Reviewed-by: David Gow <davidgow@google.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/kunit/kunit_parser.py                                  | 3 +++
 tools/testing/kunit/kunit_tool_test.py                               | 1 +
 tools/testing/kunit/test_data/test_is_test_passed-failure-nested.log | 3 +++
 3 files changed, 7 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/kunit/kunit_parser.py b/tools/testing/kunit/kunit_parser.py
index 333cd3a4a56b..5338489dcbe4 100644
--- a/tools/testing/kunit/kunit_parser.py
+++ b/tools/testing/kunit/kunit_parser.py
@@ -689,6 +689,9 @@ def bubble_up_test_results(test: Test) -> None:
 	elif test.counts.get_status() == TestStatus.TEST_CRASHED:
 		test.status = TestStatus.TEST_CRASHED
 
+	if status == TestStatus.FAILURE and test.counts.get_status() == TestStatus.SUCCESS:
+		counts.add_status(status)
+
 def parse_test(lines: LineStream, expected_num: int, log: List[str], is_subtest: bool, printer: Printer) -> Test:
 	"""
 	Finds next test to parse in LineStream, creates new Test object,
diff --git a/tools/testing/kunit/kunit_tool_test.py b/tools/testing/kunit/kunit_tool_test.py
index 81a0996edef4..bdc51b5c7b10 100755
--- a/tools/testing/kunit/kunit_tool_test.py
+++ b/tools/testing/kunit/kunit_tool_test.py
@@ -172,6 +172,7 @@ class KUnitParserTest(unittest.TestCase):
 		self.assertEqual(kunit_parser.TestStatus.FAILURE, result.status)
 		self.assertEqual(result.counts.failed, 2)
 		self.assertEqual(kunit_parser.TestStatus.FAILURE, result.subtests[0].status)
+		self.assertEqual(kunit_parser.TestStatus.SUCCESS, result.subtests[0].subtests[0].status)
 		self.assertEqual(kunit_parser.TestStatus.FAILURE, result.subtests[1].status)
 		self.assertEqual(kunit_parser.TestStatus.FAILURE, result.subtests[1].subtests[0].status)
 
diff --git a/tools/testing/kunit/test_data/test_is_test_passed-failure-nested.log b/tools/testing/kunit/test_data/test_is_test_passed-failure-nested.log
index 2e528da39ab5..5498dfd0b0db 100644
--- a/tools/testing/kunit/test_data/test_is_test_passed-failure-nested.log
+++ b/tools/testing/kunit/test_data/test_is_test_passed-failure-nested.log
@@ -1,5 +1,8 @@
 KTAP version 1
 1..2
+    KTAP version 1
+    1..1
+        ok 1 test 1
 not ok 1 subtest 1
     KTAP version 1
     1..1
-- 
cgit v1.2.3


From ab150c2bbafe9425759eca1e45e08d4ad1456818 Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
Date: Fri, 2 Jan 2026 08:20:39 +0100
Subject: kunit: qemu_configs: Add 32-bit big endian ARM configuration
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a basic config to run kunit tests on 32-bit big endian ARM.

Link: https://lore.kernel.org/r/20260102-kunit-armeb-v1-1-e8e5475d735c@linutronix.de
Signed-off-by: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
Reviewed-by: David Gow <davidgow@google.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/kunit/qemu_configs/armeb.py | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)
 create mode 100644 tools/testing/kunit/qemu_configs/armeb.py

(limited to 'tools')

diff --git a/tools/testing/kunit/qemu_configs/armeb.py b/tools/testing/kunit/qemu_configs/armeb.py
new file mode 100644
index 000000000000..86d326651490
--- /dev/null
+++ b/tools/testing/kunit/qemu_configs/armeb.py
@@ -0,0 +1,16 @@
+# SPDX-License-Identifier: GPL-2.0
+
+from ..qemu_config import QemuArchParams
+
+QEMU_ARCH = QemuArchParams(linux_arch='arm',
+			   kconfig='''
+CONFIG_CPU_BIG_ENDIAN=y
+CONFIG_ARCH_VIRT=y
+CONFIG_SERIAL_AMBA_PL010=y
+CONFIG_SERIAL_AMBA_PL010_CONSOLE=y
+CONFIG_SERIAL_AMBA_PL011=y
+CONFIG_SERIAL_AMBA_PL011_CONSOLE=y''',
+			   qemu_arch='arm',
+			   kernel_path='arch/arm/boot/zImage',
+			   kernel_command_line='console=ttyAMA0',
+			   extra_qemu_params=['-machine', 'virt'])
-- 
cgit v1.2.3


From 11aa4a18094f04a8ba7e403c272a9a5d85c9c9fc Mon Sep 17 00:00:00 2001
From: Costa Shulyupin <costa.shul@redhat.com>
Date: Sun, 12 Oct 2025 10:11:30 +0300
Subject: tools/rtla: Remove unused function declarations

Historically four function declarations remain orphaned or duplicated.

Remove them to keep the source clean.

Signed-off-by: Costa Shulyupin <costa.shul@redhat.com>
Link: https://lore.kernel.org/r/20251012071133.290225-1-costa.shul@redhat.com
Signed-off-by: Tomas Glozar <tglozar@redhat.com>
---
 tools/tracing/rtla/src/osnoise.h | 3 ---
 tools/tracing/rtla/src/utils.h   | 1 -
 2 files changed, 4 deletions(-)

(limited to 'tools')

diff --git a/tools/tracing/rtla/src/osnoise.h b/tools/tracing/rtla/src/osnoise.h
index 895687030c0b..75de0d5c706a 100644
--- a/tools/tracing/rtla/src/osnoise.h
+++ b/tools/tracing/rtla/src/osnoise.h
@@ -58,8 +58,6 @@ int osnoise_set_irq_disable(struct osnoise_context *context, bool onoff);
 void osnoise_report_missed_events(struct osnoise_tool *tool);
 int osnoise_apply_config(struct osnoise_tool *tool, struct osnoise_params *params);
 
-int osnoise_hist_main(int argc, char *argv[]);
-int osnoise_top_main(int argc, char **argv);
 int osnoise_enable(struct osnoise_tool *tool);
 int osnoise_main(int argc, char **argv);
 int hwnoise_main(int argc, char **argv);
@@ -68,4 +66,3 @@ extern struct tool_ops timerlat_top_ops, timerlat_hist_ops;
 extern struct tool_ops osnoise_top_ops, osnoise_hist_ops;
 
 int run_tool(struct tool_ops *ops, int argc, char *argv[]);
-int hist_main_loop(struct osnoise_tool *tool);
diff --git a/tools/tracing/rtla/src/utils.h b/tools/tracing/rtla/src/utils.h
index 091df4ba4587..ed7618842e82 100644
--- a/tools/tracing/rtla/src/utils.h
+++ b/tools/tracing/rtla/src/utils.h
@@ -24,7 +24,6 @@ void fatal(const char *fmt, ...);
 long parse_seconds_duration(char *val);
 void get_duration(time_t start_time, char *output, int output_size);
 
-int parse_cpu_list(char *cpu_list, char **monitored_cpus);
 char *parse_optional_arg(int argc, char **argv);
 long long get_llong_from_str(char *start);
 
-- 
cgit v1.2.3


From ca7206b6ad029d2c35e64f1ea81dba385496e630 Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <linux@weissschuh.net>
Date: Sat, 20 Dec 2025 14:55:54 +0100
Subject: selftests/nolibc: test compatibility of nolibc and kernel time types
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Keeping 'struct timespec' and 'struct __kernel_timespec' compatible
allows the source code to stay simple.

Validate that the types stay compatible.

The test is specific to nolibc and does not compile on other libcs, so
skip it there.

Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
Acked-by: Willy Tarreau <w@1wt.eu>
Reviewed-by: Arnd Bergmann <arnd@arndb.de>
Link: https://patch.msgid.link/20251220-nolibc-uapi-types-v3-10-c662992f75d7@weissschuh.net
---
 tools/testing/selftests/nolibc/nolibc-test.c | 29 ++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/nolibc/nolibc-test.c b/tools/testing/selftests/nolibc/nolibc-test.c
index 6888b20af259..3986d55a6ff6 100644
--- a/tools/testing/selftests/nolibc/nolibc-test.c
+++ b/tools/testing/selftests/nolibc/nolibc-test.c
@@ -1430,6 +1430,34 @@ int test_difftime(void)
 	return 0;
 }
 
+int test_time_types(void)
+{
+#ifdef NOLIBC
+	struct __kernel_timespec kts;
+	struct timespec ts;
+
+	if (!__builtin_types_compatible_p(time_t, __kernel_time64_t))
+		return 1;
+
+	if (sizeof(ts) != sizeof(kts))
+		return 1;
+
+	if (!__builtin_types_compatible_p(__typeof__(ts.tv_sec), __typeof__(kts.tv_sec)))
+		return 1;
+
+	if (!__builtin_types_compatible_p(__typeof__(ts.tv_nsec), __typeof__(kts.tv_nsec)))
+		return 1;
+
+	if (offsetof(__typeof__(ts), tv_sec) != offsetof(__typeof__(kts), tv_sec))
+		return 1;
+
+	if (offsetof(__typeof__(ts), tv_nsec) != offsetof(__typeof__(kts), tv_nsec))
+		return 1;
+#endif /* NOLIBC */
+
+	return 0;
+}
+
 int run_stdlib(int min, int max)
 {
 	int test;
@@ -1555,6 +1583,7 @@ int run_stdlib(int min, int max)
 		CASE_TEST(difftime);                EXPECT_ZR(1, test_difftime()); break;
 		CASE_TEST(memchr_foobar6_o);        EXPECT_STREQ(1, memchr("foobar", 'o', 6), "oobar"); break;
 		CASE_TEST(memchr_foobar3_b);        EXPECT_STRZR(1, memchr("foobar", 'b', 3)); break;
+		CASE_TEST(time_types);              EXPECT_ZR(is_nolibc, test_time_types()); break;
 
 		case __LINE__:
 			return ret; /* must be last */
-- 
cgit v1.2.3


From 6c9be90527207f9beca78e698dd45969813f4c0e Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <linux@weissschuh.net>
Date: Sat, 20 Dec 2025 14:55:55 +0100
Subject: tools/nolibc: remove time conversions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Now that 'struct timespec' and 'struct __kernel_timespec' are
compatible, the conversions are not necessary anymore.
The same holds true for 'struct itimerspec' and 'struct
__kernel_itimerspec'.

Remove the conversions.

Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
Acked-by: Willy Tarreau <w@1wt.eu>
Reviewed-by: Arnd Bergmann <arnd@arndb.de>
Link: https://patch.msgid.link/20251220-nolibc-uapi-types-v3-11-c662992f75d7@weissschuh.net
---
 tools/include/nolibc/sys/timerfd.h | 20 ++----------
 tools/include/nolibc/time.h        | 64 ++++----------------------------------
 2 files changed, 8 insertions(+), 76 deletions(-)

(limited to 'tools')

diff --git a/tools/include/nolibc/sys/timerfd.h b/tools/include/nolibc/sys/timerfd.h
index 66f779553d31..616fcfb416a9 100644
--- a/tools/include/nolibc/sys/timerfd.h
+++ b/tools/include/nolibc/sys/timerfd.h
@@ -33,13 +33,7 @@ static __attribute__((unused))
 int sys_timerfd_gettime(int fd, struct itimerspec *curr_value)
 {
 #if defined(__NR_timerfd_gettime64)
-	struct __kernel_itimerspec kcurr_value;
-	int ret;
-
-	ret = my_syscall2(__NR_timerfd_gettime64, fd, &kcurr_value);
-	__nolibc_timespec_kernel_to_user(&kcurr_value.it_interval, &curr_value->it_interval);
-	__nolibc_timespec_kernel_to_user(&kcurr_value.it_value, &curr_value->it_value);
-	return ret;
+	return my_syscall2(__NR_timerfd_gettime64, fd, curr_value);
 #else
 	return my_syscall2(__NR_timerfd_gettime, fd, curr_value);
 #endif
@@ -57,17 +51,7 @@ int sys_timerfd_settime(int fd, int flags,
 			const struct itimerspec *new_value, struct itimerspec *old_value)
 {
 #if defined(__NR_timerfd_settime64)
-	struct __kernel_itimerspec knew_value, kold_value;
-	int ret;
-
-	__nolibc_timespec_user_to_kernel(&new_value->it_value, &knew_value.it_value);
-	__nolibc_timespec_user_to_kernel(&new_value->it_interval, &knew_value.it_interval);
-	ret = my_syscall4(__NR_timerfd_settime64, fd, flags, &knew_value, &kold_value);
-	if (old_value) {
-		__nolibc_timespec_kernel_to_user(&kold_value.it_interval, &old_value->it_interval);
-		__nolibc_timespec_kernel_to_user(&kold_value.it_value, &old_value->it_value);
-	}
-	return ret;
+	return my_syscall4(__NR_timerfd_settime64, fd, flags, new_value, old_value);
 #else
 	return my_syscall4(__NR_timerfd_settime, fd, flags, new_value, old_value);
 #endif
diff --git a/tools/include/nolibc/time.h b/tools/include/nolibc/time.h
index 45df9b09d7b6..ab67f209c99f 100644
--- a/tools/include/nolibc/time.h
+++ b/tools/include/nolibc/time.h
@@ -18,20 +18,6 @@
 #include <linux/signal.h>
 #include <linux/time.h>
 
-static __inline__
-void __nolibc_timespec_user_to_kernel(const struct timespec *ts, struct __kernel_timespec *kts)
-{
-	kts->tv_sec = ts->tv_sec;
-	kts->tv_nsec = ts->tv_nsec;
-}
-
-static __inline__
-void __nolibc_timespec_kernel_to_user(const struct __kernel_timespec *kts, struct timespec *ts)
-{
-	ts->tv_sec = kts->tv_sec;
-	ts->tv_nsec = kts->tv_nsec;
-}
-
 /*
  * int clock_getres(clockid_t clockid, struct timespec *res);
  * int clock_gettime(clockid_t clockid, struct timespec *tp);
@@ -44,13 +30,7 @@ static __attribute__((unused))
 int sys_clock_getres(clockid_t clockid, struct timespec *res)
 {
 #if defined(__NR_clock_getres_time64)
-	struct __kernel_timespec kres;
-	int ret;
-
-	ret = my_syscall2(__NR_clock_getres_time64, clockid, &kres);
-	if (res)
-		__nolibc_timespec_kernel_to_user(&kres, res);
-	return ret;
+	return my_syscall2(__NR_clock_getres_time64, clockid, res);
 #else
 	return my_syscall2(__NR_clock_getres, clockid, res);
 #endif
@@ -66,13 +46,7 @@ static __attribute__((unused))
 int sys_clock_gettime(clockid_t clockid, struct timespec *tp)
 {
 #if defined(__NR_clock_gettime64)
-	struct __kernel_timespec ktp;
-	int ret;
-
-	ret = my_syscall2(__NR_clock_gettime64, clockid, &ktp);
-	if (tp)
-		__nolibc_timespec_kernel_to_user(&ktp, tp);
-	return ret;
+	return my_syscall2(__NR_clock_gettime64, clockid, tp);
 #else
 	return my_syscall2(__NR_clock_gettime, clockid, tp);
 #endif
@@ -88,10 +62,7 @@ static __attribute__((unused))
 int sys_clock_settime(clockid_t clockid, struct timespec *tp)
 {
 #if defined(__NR_clock_settime64)
-	struct __kernel_timespec ktp;
-
-	__nolibc_timespec_user_to_kernel(tp, &ktp);
-	return my_syscall2(__NR_clock_settime64, clockid, &ktp);
+	return my_syscall2(__NR_clock_settime64, clockid, tp);
 #else
 	return my_syscall2(__NR_clock_settime, clockid, tp);
 #endif
@@ -108,14 +79,7 @@ int sys_clock_nanosleep(clockid_t clockid, int flags, const struct timespec *rqt
 			struct timespec *rmtp)
 {
 #if defined(__NR_clock_nanosleep_time64)
-	struct __kernel_timespec krqtp, krmtp;
-	int ret;
-
-	__nolibc_timespec_user_to_kernel(rqtp, &krqtp);
-	ret = my_syscall4(__NR_clock_nanosleep_time64, clockid, flags, &krqtp, &krmtp);
-	if (rmtp)
-		__nolibc_timespec_kernel_to_user(&krmtp, rmtp);
-	return ret;
+	return my_syscall4(__NR_clock_nanosleep_time64, clockid, flags, rqtp, rmtp);
 #else
 	return my_syscall4(__NR_clock_nanosleep, clockid, flags, rqtp, rmtp);
 #endif
@@ -190,13 +154,7 @@ static __attribute__((unused))
 int sys_timer_gettime(timer_t timerid, struct itimerspec *curr_value)
 {
 #if defined(__NR_timer_gettime64)
-	struct __kernel_itimerspec kcurr_value;
-	int ret;
-
-	ret = my_syscall2(__NR_timer_gettime64, timerid, &kcurr_value);
-	__nolibc_timespec_kernel_to_user(&kcurr_value.it_interval, &curr_value->it_interval);
-	__nolibc_timespec_kernel_to_user(&kcurr_value.it_value, &curr_value->it_value);
-	return ret;
+	return my_syscall2(__NR_timer_gettime64, timerid, curr_value);
 #else
 	return my_syscall2(__NR_timer_gettime, timerid, curr_value);
 #endif
@@ -213,17 +171,7 @@ int sys_timer_settime(timer_t timerid, int flags,
 		      const struct itimerspec *new_value, struct itimerspec *old_value)
 {
 #if defined(__NR_timer_settime64)
-	struct __kernel_itimerspec knew_value, kold_value;
-	int ret;
-
-	__nolibc_timespec_user_to_kernel(&new_value->it_value, &knew_value.it_value);
-	__nolibc_timespec_user_to_kernel(&new_value->it_interval, &knew_value.it_interval);
-	ret = my_syscall4(__NR_timer_settime64, timerid, flags, &knew_value, &kold_value);
-	if (old_value) {
-		__nolibc_timespec_kernel_to_user(&kold_value.it_interval, &old_value->it_interval);
-		__nolibc_timespec_kernel_to_user(&kold_value.it_value, &old_value->it_value);
-	}
-	return ret;
+	return my_syscall4(__NR_timer_settime64, timerid, flags, new_value, old_value);
 #else
 	return my_syscall4(__NR_timer_settime, timerid, flags, new_value, old_value);
 #endif
-- 
cgit v1.2.3


From dd6659efe0529e7177e9270a0fc044a0b17deb8a Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <linux@weissschuh.net>
Date: Sat, 20 Dec 2025 14:55:56 +0100
Subject: tools/nolibc: add compiler version detection macros
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Some upcoming logic needs to depend on the version of GCC or clang.

Add some helper macros to keep the conditionals readable.

Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
Acked-by: Willy Tarreau <w@1wt.eu>
Reviewed-by: Arnd Bergmann <arnd@arndb.de>
Link: https://patch.msgid.link/20251220-nolibc-uapi-types-v3-12-c662992f75d7@weissschuh.net
---
 tools/include/nolibc/compiler.h | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'tools')

diff --git a/tools/include/nolibc/compiler.h b/tools/include/nolibc/compiler.h
index 87090bbc53e0..c9ffd0496dae 100644
--- a/tools/include/nolibc/compiler.h
+++ b/tools/include/nolibc/compiler.h
@@ -47,4 +47,20 @@
 #  define __nolibc_fallthrough do { } while (0)
 #endif /* __nolibc_has_attribute(fallthrough) */
 
+#define __nolibc_version(_major, _minor, _patch) ((_major) * 10000 + (_minor) * 100 + (_patch))
+
+#ifdef __GNUC__
+#  define __nolibc_gnuc_version \
+		__nolibc_version(__GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__)
+#else
+#  define __nolibc_gnuc_version 0
+#endif /* __GNUC__ */
+
+#ifdef __clang__
+#  define __nolibc_clang_version \
+		__nolibc_version(__clang_major__, __clang_minor__, __clang_patchlevel__)
+#else
+#  define __nolibc_clang_version 0
+#endif /* __clang__ */
+
 #endif /* _NOLIBC_COMPILER_H */
-- 
cgit v1.2.3


From 37219aa5b12326cd60f4586779c687f3394e80f5 Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <linux@weissschuh.net>
Date: Sat, 20 Dec 2025 14:55:57 +0100
Subject: tools/nolibc: add __nolibc_static_assert()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a wrapper for _Static_assert() to use within nolibc.
While _Static_assert() itself was only standardized in C11,
in GCC and clang dialects it is also available in older standards.

Link: https://lore.kernel.org/lkml/20251203192330.GA12995@1wt.eu/
Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
Acked-by: Willy Tarreau <w@1wt.eu>
Reviewed-by: Arnd Bergmann <arnd@arndb.de>
Link: https://patch.msgid.link/20251220-nolibc-uapi-types-v3-13-c662992f75d7@weissschuh.net
---
 tools/include/nolibc/compiler.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'tools')

diff --git a/tools/include/nolibc/compiler.h b/tools/include/nolibc/compiler.h
index c9ffd0496dae..a8c7619dcdde 100644
--- a/tools/include/nolibc/compiler.h
+++ b/tools/include/nolibc/compiler.h
@@ -63,4 +63,12 @@
 #  define __nolibc_clang_version 0
 #endif /* __clang__ */
 
+#if __STDC_VERSION__ >= 201112L || \
+	__nolibc_gnuc_version >= __nolibc_version(4, 6, 0) || \
+	__nolibc_clang_version >= __nolibc_version(3, 0, 0)
+#  define __nolibc_static_assert(_t) _Static_assert(_t, "")
+#else
+#  define __nolibc_static_assert(_t)
+#endif
+
 #endif /* _NOLIBC_COMPILER_H */
-- 
cgit v1.2.3


From f3ed932644a671038b31f7f536a066eeef6803b0 Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <linux@weissschuh.net>
Date: Sat, 20 Dec 2025 14:55:58 +0100
Subject: selftests/nolibc: add static assertions around time types handling
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The nolibc system call wrappers expect the libc types to be compatible
to the kernel types.

Make sure these expectations hold at compile-time.

Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
Acked-by: Willy Tarreau <w@1wt.eu>
Reviewed-by: Arnd Bergmann <arnd@arndb.de>
Link: https://patch.msgid.link/20251220-nolibc-uapi-types-v3-14-c662992f75d7@weissschuh.net
---
 tools/include/nolibc/sys/timerfd.h |  4 ++++
 tools/include/nolibc/time.h        | 18 ++++++++++++++++++
 2 files changed, 22 insertions(+)

(limited to 'tools')

diff --git a/tools/include/nolibc/sys/timerfd.h b/tools/include/nolibc/sys/timerfd.h
index 616fcfb416a9..29fd92bd47d2 100644
--- a/tools/include/nolibc/sys/timerfd.h
+++ b/tools/include/nolibc/sys/timerfd.h
@@ -33,8 +33,10 @@ static __attribute__((unused))
 int sys_timerfd_gettime(int fd, struct itimerspec *curr_value)
 {
 #if defined(__NR_timerfd_gettime64)
+	__nolibc_assert_time64_type(curr_value->it_value.tv_sec);
 	return my_syscall2(__NR_timerfd_gettime64, fd, curr_value);
 #else
+	__nolibc_assert_native_time64();
 	return my_syscall2(__NR_timerfd_gettime, fd, curr_value);
 #endif
 }
@@ -51,8 +53,10 @@ int sys_timerfd_settime(int fd, int flags,
 			const struct itimerspec *new_value, struct itimerspec *old_value)
 {
 #if defined(__NR_timerfd_settime64)
+	__nolibc_assert_time64_type(new_value->it_value.tv_sec);
 	return my_syscall4(__NR_timerfd_settime64, fd, flags, new_value, old_value);
 #else
+	__nolibc_assert_native_time64();
 	return my_syscall4(__NR_timerfd_settime, fd, flags, new_value, old_value);
 #endif
 }
diff --git a/tools/include/nolibc/time.h b/tools/include/nolibc/time.h
index ab67f209c99f..f9257d6a7878 100644
--- a/tools/include/nolibc/time.h
+++ b/tools/include/nolibc/time.h
@@ -18,6 +18,12 @@
 #include <linux/signal.h>
 #include <linux/time.h>
 
+#define __nolibc_assert_time64_type(t) \
+	__nolibc_static_assert(sizeof(t) == 8)
+
+#define __nolibc_assert_native_time64() \
+	__nolibc_assert_time64_type(__kernel_old_time_t)
+
 /*
  * int clock_getres(clockid_t clockid, struct timespec *res);
  * int clock_gettime(clockid_t clockid, struct timespec *tp);
@@ -30,8 +36,10 @@ static __attribute__((unused))
 int sys_clock_getres(clockid_t clockid, struct timespec *res)
 {
 #if defined(__NR_clock_getres_time64)
+	__nolibc_assert_time64_type(res->tv_sec);
 	return my_syscall2(__NR_clock_getres_time64, clockid, res);
 #else
+	__nolibc_assert_native_time64();
 	return my_syscall2(__NR_clock_getres, clockid, res);
 #endif
 }
@@ -46,8 +54,10 @@ static __attribute__((unused))
 int sys_clock_gettime(clockid_t clockid, struct timespec *tp)
 {
 #if defined(__NR_clock_gettime64)
+	__nolibc_assert_time64_type(tp->tv_sec);
 	return my_syscall2(__NR_clock_gettime64, clockid, tp);
 #else
+	__nolibc_assert_native_time64();
 	return my_syscall2(__NR_clock_gettime, clockid, tp);
 #endif
 }
@@ -62,8 +72,10 @@ static __attribute__((unused))
 int sys_clock_settime(clockid_t clockid, struct timespec *tp)
 {
 #if defined(__NR_clock_settime64)
+	__nolibc_assert_time64_type(tp->tv_sec);
 	return my_syscall2(__NR_clock_settime64, clockid, tp);
 #else
+	__nolibc_assert_native_time64();
 	return my_syscall2(__NR_clock_settime, clockid, tp);
 #endif
 }
@@ -79,8 +91,10 @@ int sys_clock_nanosleep(clockid_t clockid, int flags, const struct timespec *rqt
 			struct timespec *rmtp)
 {
 #if defined(__NR_clock_nanosleep_time64)
+	__nolibc_assert_time64_type(rqtp->tv_sec);
 	return my_syscall4(__NR_clock_nanosleep_time64, clockid, flags, rqtp, rmtp);
 #else
+	__nolibc_assert_native_time64();
 	return my_syscall4(__NR_clock_nanosleep, clockid, flags, rqtp, rmtp);
 #endif
 }
@@ -154,8 +168,10 @@ static __attribute__((unused))
 int sys_timer_gettime(timer_t timerid, struct itimerspec *curr_value)
 {
 #if defined(__NR_timer_gettime64)
+	__nolibc_assert_time64_type(curr_value->it_value.tv_sec);
 	return my_syscall2(__NR_timer_gettime64, timerid, curr_value);
 #else
+	__nolibc_assert_native_time64();
 	return my_syscall2(__NR_timer_gettime, timerid, curr_value);
 #endif
 }
@@ -171,8 +187,10 @@ int sys_timer_settime(timer_t timerid, int flags,
 		      const struct itimerspec *new_value, struct itimerspec *old_value)
 {
 #if defined(__NR_timer_settime64)
+	__nolibc_assert_time64_type(new_value->it_value.tv_sec);
 	return my_syscall4(__NR_timer_settime64, timerid, flags, new_value, old_value);
 #else
+	__nolibc_assert_native_time64();
 	return my_syscall4(__NR_timer_settime, timerid, flags, new_value, old_value);
 #endif
 }
-- 
cgit v1.2.3


From 03139924859f7930cd1667a278a560b5d4c6a672 Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <linux@weissschuh.net>
Date: Sun, 4 Jan 2026 15:57:51 +0100
Subject: selftests/nolibc: drop NOLIBC_SYSROOT=0 logic
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This logic was added in commit 850fad7de827 ("selftests/nolibc: allow
test -include /path/to/nolibc.h") to allow the testing of -include
/path/to/nolibc.h. As it requires as special variable to activate, this
code is nearly never used. Furthermore it complicates the logic a bit.

Since commit a6a054c8ad32 ("tools/nolibc: add target to check header
usability") and commit 443c6467fcd6 ("selftests/nolibc: always run
nolibc header check") the usability of -include /path/to/nolibc.h is
always checked anyways, making NOLIBC_SYSROOT=0 pointless.

Drop the special logic.

Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
Acked-by: Willy Tarreau <w@1wt.eu>
Link: https://patch.msgid.link/20260104-nolibc-nolibc_sysroot-v1-1-98025ad99add@weissschuh.net
---
 tools/testing/selftests/nolibc/Makefile.nolibc | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/nolibc/Makefile.nolibc b/tools/testing/selftests/nolibc/Makefile.nolibc
index f9d43cbdc894..b17ba2f8fb46 100644
--- a/tools/testing/selftests/nolibc/Makefile.nolibc
+++ b/tools/testing/selftests/nolibc/Makefile.nolibc
@@ -302,15 +302,9 @@ sysroot/$(ARCH)/include:
 	$(Q)$(MAKE) -C $(srctree)/tools/include/nolibc ARCH=$(ARCH) OUTPUT=$(CURDIR)/sysroot/ headers_standalone headers_check
 	$(Q)mv sysroot/sysroot sysroot/$(ARCH)
 
-ifneq ($(NOLIBC_SYSROOT),0)
 nolibc-test: nolibc-test.c nolibc-test-linkage.c sysroot/$(ARCH)/include
 	$(QUIET_CC)$(CC) $(CFLAGS) $(LDFLAGS) -o $@ \
 	  -nostdlib -nostdinc -static -Isysroot/$(ARCH)/include nolibc-test.c nolibc-test-linkage.c $(LIBGCC)
-else
-nolibc-test: nolibc-test.c nolibc-test-linkage.c
-	$(QUIET_CC)$(CC) $(CFLAGS) $(LDFLAGS) -o $@ \
-	  -nostdlib -static -include $(srctree)/tools/include/nolibc/nolibc.h nolibc-test.c nolibc-test-linkage.c $(LIBGCC)
-endif
 
 libc-test: nolibc-test.c nolibc-test-linkage.c
 	$(QUIET_CC)$(HOSTCC) -o $@ nolibc-test.c nolibc-test-linkage.c
-- 
cgit v1.2.3


From 57624b38ce99b906cbb191a1d536bb871ad2d8c2 Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <linux@weissschuh.net>
Date: Sun, 4 Jan 2026 23:43:13 +0100
Subject: tools/nolibc: align sys_vfork() with sys_fork()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently the generic variants of sys_fork() and sys_vfork() differ in
both they precedence of used system calls and the usage of sys_clone()
vs sys_clone3(). While the interface of clone3() in sys_vfork() is more
consistent over different architectures, qemu-user does not support it,
making testing harder. We already handle the different clone()
interfaces for sys_fork() in the architecture-specific headers, and can
do so also for sys_vfork(). In fact SPARC already has such handling and
only s390 is currently missing.

Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
Acked-by: Willy Tarreau <w@1wt.eu>
Link: https://patch.msgid.link/20260104-nolibc-vfork-v1-1-a63464b9e4e6@weissschuh.net
---
 tools/include/nolibc/arch-s390.h |  8 ++++++++
 tools/include/nolibc/sys.h       | 18 +++++-------------
 2 files changed, 13 insertions(+), 13 deletions(-)

(limited to 'tools')

diff --git a/tools/include/nolibc/arch-s390.h b/tools/include/nolibc/arch-s390.h
index 5bee6ecbde0a..904281e95f99 100644
--- a/tools/include/nolibc/arch-s390.h
+++ b/tools/include/nolibc/arch-s390.h
@@ -8,6 +8,7 @@
 
 #include "types.h"
 
+#include <linux/sched.h>
 #include <linux/signal.h>
 #include <linux/unistd.h>
 
@@ -189,4 +190,11 @@ pid_t sys_fork(void)
 }
 #define sys_fork sys_fork
 
+static __attribute__((unused))
+pid_t sys_vfork(void)
+{
+	return my_syscall5(__NR_clone, 0, CLONE_VM | CLONE_VFORK | SIGCHLD, 0, 0, 0);
+}
+#define sys_vfork sys_vfork
+
 #endif /* _NOLIBC_ARCH_S390_H */
diff --git a/tools/include/nolibc/sys.h b/tools/include/nolibc/sys.h
index 847af1ccbdc9..403ee9ce8389 100644
--- a/tools/include/nolibc/sys.h
+++ b/tools/include/nolibc/sys.h
@@ -22,7 +22,7 @@
 #include <linux/time.h>
 #include <linux/auxvec.h>
 #include <linux/fcntl.h> /* for O_* and AT_* */
-#include <linux/sched.h> /* for clone_args */
+#include <linux/sched.h> /* for CLONE_* */
 #include <linux/stat.h>  /* for statx() */
 
 #include "errno.h"
@@ -363,19 +363,11 @@ pid_t fork(void)
 static __attribute__((unused))
 pid_t sys_vfork(void)
 {
-#if defined(__NR_vfork)
+#if defined(__NR_clone)
+	/* See the note in sys_fork(). */
+	return my_syscall5(__NR_clone, CLONE_VM | CLONE_VFORK | SIGCHLD, 0, 0, 0, 0);
+#elif defined(__NR_vfork)
 	return my_syscall0(__NR_vfork);
-#else
-	/*
-	 * clone() could be used but has different argument orders per
-	 * architecture.
-	 */
-	struct clone_args args = {
-		.flags		= CLONE_VM | CLONE_VFORK,
-		.exit_signal	= SIGCHLD,
-	};
-
-	return my_syscall2(__NR_clone3, &args, sizeof(args));
 #endif
 }
 #endif
-- 
cgit v1.2.3


From 6b6dbf3e4ecfd7d1086bd7cd8b31ca8e45d4dc1f Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <linux@weissschuh.net>
Date: Tue, 6 Jan 2026 11:39:55 +0100
Subject: selftests/nolibc: always build sparc32 tests with -mcpu=v8
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Since LLVM commit 39e30508a7f6 ("[Driver][Sparc] Default to -mcpu=v9 for
32-bit Linux/sparc64 (#109278)"), clang defaults to -mcpu=v9 for 32-bit
SPARC builds. -mcpu=v9 generates instructions which are not recognized
by qemu-sparc and qemu-system-sparc.

Explicitly enforce -mcpu=v8 to generate compatible code.

Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
Acked-by: Willy Tarreau <w@1wt.eu>
Link: https://patch.msgid.link/20260106-nolibc-sparc32-fix-v2-1-7c5cd6b175c2@weissschuh.net
---
 tools/testing/selftests/nolibc/Makefile.nolibc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/nolibc/Makefile.nolibc b/tools/testing/selftests/nolibc/Makefile.nolibc
index b17ba2f8fb46..f5704193038f 100644
--- a/tools/testing/selftests/nolibc/Makefile.nolibc
+++ b/tools/testing/selftests/nolibc/Makefile.nolibc
@@ -226,7 +226,7 @@ CFLAGS_mipsn32be = -EB -mabi=n32 -march=mips64r6
 CFLAGS_mips64le = -EL -mabi=64 -march=mips64r6
 CFLAGS_mips64be = -EB -mabi=64 -march=mips64r2
 CFLAGS_loongarch = $(if $(LLVM),-fuse-ld=lld)
-CFLAGS_sparc32 = $(call cc-option,-m32)
+CFLAGS_sparc32 = $(call cc-option,-m32) -mcpu=v8
 CFLAGS_sh4 = -ml -m4
 ifeq ($(origin XARCH),command line)
 CFLAGS_XARCH = $(CFLAGS_$(XARCH))
-- 
cgit v1.2.3


From 2246c24426fbc1069cb2a47e0624ccffe5f2627b Mon Sep 17 00:00:00 2001
From: Zide Chen <zide.chen@intel.com>
Date: Wed, 31 Dec 2025 14:42:28 -0800
Subject: perf pmu: Relax uncore wildcard matching to allow numeric suffix

Diamond Rapids introduces two types of PCIe related uncore PMUs:
"uncore_pcie4_*" and "uncore_pcie6_*".

To ensure that generic PCIe events (e.g., UNC_PCIE_CLOCKTICKS) can match
and collect events from both PMU types, slightly relax the wildcard
matching logic in perf_pmu__match_wildcard().

This change allows a wildcard such as "pcie" to match PMU names that
include a numeric suffix, such as "pcie4_*" and "pcie6_*".

Co-developed-by: Dapeng Mi <dapeng1.mi@linux.intel.com>
Signed-off-by: Dapeng Mi <dapeng1.mi@linux.intel.com>
Signed-off-by: Zide Chen <zide.chen@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Dapeng Mi <dapeng1.mi@linux.intel.com>
Link: https://patch.msgid.link/20251231224233.113839-12-zide.chen@intel.com
---
 tools/perf/util/pmu.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c
index 956ea273c2c7..01a21b6aa031 100644
--- a/tools/perf/util/pmu.c
+++ b/tools/perf/util/pmu.c
@@ -939,6 +939,7 @@ static bool perf_pmu__match_wildcard(const char *pmu_name, const char *tok)
 {
 	const char *p, *suffix;
 	bool has_hex = false;
+	bool has_underscore = false;
 	size_t tok_len = strlen(tok);
 
 	/* Check start of pmu_name for equality. */
@@ -949,13 +950,14 @@ static bool perf_pmu__match_wildcard(const char *pmu_name, const char *tok)
 	if (*p == 0)
 		return true;
 
-	if (*p == '_') {
-		++p;
-		++suffix;
-	}
-
-	/* Ensure we end in a number */
+	/* Ensure we end in a number or a mix of number and "_". */
 	while (1) {
+		if (!has_underscore && (*p == '_')) {
+			has_underscore = true;
+			++p;
+			++suffix;
+		}
+
 		if (!isxdigit(*p))
 			return false;
 		if (!has_hex)
-- 
cgit v1.2.3


From ab86d0bf01f6d0e37fd67761bb62918321b64efc Mon Sep 17 00:00:00 2001
From: Toke Høiland-Jørgensen <toke@redhat.com>
Date: Mon, 5 Jan 2026 12:47:46 +0100
Subject: selftests/bpf: Update xdp_context_test_run test to check maximum
 metadata size
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Update the selftest to check that the metadata size check takes the
xdp_frame size into account in bpf_prog_test_run. The original
check (for meta size 256) was broken because the data frame supplied was
smaller than this, triggering a different EINVAL return. So supply a
larger data frame for this test to make sure we actually exercise the
check we think we are.

Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
Reviewed-by: Amery Hung <ameryhung@gmail.com>
Link: https://lore.kernel.org/r/20260105114747.1358750-2-toke@redhat.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 .../selftests/bpf/prog_tests/xdp_context_test_run.c        | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c b/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c
index ee94c281888a..26159e0499c7 100644
--- a/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c
+++ b/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c
@@ -47,6 +47,7 @@ void test_xdp_context_test_run(void)
 	struct test_xdp_context_test_run *skel = NULL;
 	char data[sizeof(pkt_v4) + sizeof(__u32)];
 	char bad_ctx[sizeof(struct xdp_md) + 1];
+	char large_data[256];
 	struct xdp_md ctx_in, ctx_out;
 	DECLARE_LIBBPF_OPTS(bpf_test_run_opts, opts,
 			    .data_in = &data,
@@ -94,9 +95,6 @@ void test_xdp_context_test_run(void)
 	test_xdp_context_error(prog_fd, opts, 4, sizeof(__u32), sizeof(data),
 			       0, 0, 0);
 
-	/* Meta data must be 255 bytes or smaller */
-	test_xdp_context_error(prog_fd, opts, 0, 256, sizeof(data), 0, 0, 0);
-
 	/* Total size of data must be data_end - data_meta or larger */
 	test_xdp_context_error(prog_fd, opts, 0, sizeof(__u32),
 			       sizeof(data) + 1, 0, 0, 0);
@@ -116,6 +114,16 @@ void test_xdp_context_test_run(void)
 	test_xdp_context_error(prog_fd, opts, 0, sizeof(__u32), sizeof(data),
 			       0, 0, 1);
 
+	/* Meta data must be 216 bytes or smaller (256 - sizeof(struct
+	 * xdp_frame)). Test both nearest invalid size and nearest invalid
+	 * 4-byte-aligned size, and make sure data_in is large enough that we
+	 * actually hit the check on metadata length
+	 */
+	opts.data_in = large_data;
+	opts.data_size_in = sizeof(large_data);
+	test_xdp_context_error(prog_fd, opts, 0, 217, sizeof(large_data), 0, 0, 0);
+	test_xdp_context_error(prog_fd, opts, 0, 220, sizeof(large_data), 0, 0, 0);
+
 	test_xdp_context_test_run__destroy(skel);
 }
 
-- 
cgit v1.2.3


From e970637707f4f8e5bd098b09090b755f2f57898b Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Fri, 2 Jan 2026 12:06:57 -0800
Subject: docs: find-unused-docs.sh: fixup directory usage

The recent move of this script from scripts/ to tools/docs/
did not account for the 'cd' directory usage.
Update "cd .." to "cd ../.." to make the script self-correcting.

This also eliminates a shell warning:
./tools/docs/find-unused-docs.sh: line 33: cd: Documentation/: No such file or directory

Fixes: 184414c6a6ca ("docs: move find-unused-docs.sh to tools/docs")
Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Fixes: 184414c6a6ca (docs: move find-unused-docs.sh to tools/docs)
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
Message-ID: <20260102200657.1040234-1-rdunlap@infradead.org>
---
 tools/docs/find-unused-docs.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/docs/find-unused-docs.sh b/tools/docs/find-unused-docs.sh
index 05552dbda5bc..ca4e607ec3f7 100755
--- a/tools/docs/find-unused-docs.sh
+++ b/tools/docs/find-unused-docs.sh
@@ -28,7 +28,7 @@ if ! [ -d "$1" ]; then
 fi
 
 cd "$( dirname "${BASH_SOURCE[0]}" )"
-cd ..
+cd ../..
 
 cd Documentation/
 
-- 
cgit v1.2.3


From b04d2b9199129f4f0c992a518c0fb78c2efc1064 Mon Sep 17 00:00:00 2001
From: Thomas Richter <tmricht@linux.ibm.com>
Date: Wed, 10 Dec 2025 08:17:52 +0100
Subject: perf test: Fix test case perf evlist tests for s390x

Perf test case 78: perf evlist tests fails on s390.

The failure is causes by grouping events cycles and instructions because
sampling does only support event cycles.  Change the group to software
events to fix this.

Output before:
  # ./perf test 78
  78: perf evlist tests              : FAILED!
  #

Output after:
  # ./perf test 78
  78: perf evlist tests              : Ok
  #

Fixes: db452961de939225 ("perf tests evlist: Add basic evlist test")
Signed-off-by: Thomas Richter <tmricht@linux.ibm.com>
Tested-by: Ian Rogers <irogers@google.com>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Jan Polensky <japo@linux.ibm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Sumanth Korikkar <sumanthk@linux.ibm.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/tests/shell/evlist.sh | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/tests/shell/evlist.sh b/tools/perf/tests/shell/evlist.sh
index 140f099e75c1..5632be391710 100755
--- a/tools/perf/tests/shell/evlist.sh
+++ b/tools/perf/tests/shell/evlist.sh
@@ -38,13 +38,14 @@ test_evlist_simple() {
 
 test_evlist_group() {
 	echo "Group evlist test"
-	if ! perf record -e "{cycles,instructions}" -o "${perfdata}" true 2> /dev/null
+	if ! perf record -e "{cpu-clock,task-clock}" -o "${perfdata}" \
+		-- perf test -w noploop 2> /dev/null
 	then
 		echo "Group evlist [Skipped event group recording failed]"
 		return
 	fi
 
-	if ! perf evlist -i "${perfdata}" -g | grep -q "{.*cycles.*,.*instructions.*}"
+	if ! perf evlist -i "${perfdata}" -g | grep -q "{.*cpu-clock.*,.*task-clock.*}"
 	then
 		echo "Group evlist [Failed to list event group]"
 		err=1
-- 
cgit v1.2.3


From 1ec205e3669c12dfb0adbd2d7099922c195b46ff Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Fri, 5 Dec 2025 11:01:35 -0800
Subject: perf test java symbol: Additional libperf-jvmti.so path check

If perf is built into an output directory then so is
libperf-jvmti.so.

If `perf test` is run from that directory then PWD needn't also be that
directory meaning libperf-jvmti.so won't be found and the test skipped.

Add an additional check for libperf-jvmti.so in the same directory as
the perf binary for this case, this avoids the test skipping.

Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Leo Yan <leo.yan@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Richter <tmricht@linux.ibm.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/tests/shell/test_java_symbol.sh | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'tools')

diff --git a/tools/perf/tests/shell/test_java_symbol.sh b/tools/perf/tests/shell/test_java_symbol.sh
index 499539d1c479..63a2cc9bf13f 100755
--- a/tools/perf/tests/shell/test_java_symbol.sh
+++ b/tools/perf/tests/shell/test_java_symbol.sh
@@ -22,10 +22,13 @@ cleanup_files()
 
 trap cleanup_files exit term int
 
+PERF_DIR=$(dirname "$(which perf)")
 if [ -e "$PWD/tools/perf/libperf-jvmti.so" ]; then
 	LIBJVMTI=$PWD/tools/perf/libperf-jvmti.so
 elif [ -e "$PWD/libperf-jvmti.so" ]; then
 	LIBJVMTI=$PWD/libperf-jvmti.so
+elif [ -e "$PERF_DIR/libperf-jvmti.so" ]; then
+	LIBJVMTI=$PERF_DIR/libperf-jvmti.so
 elif [ -e "$PREFIX/lib64/libperf-jvmti.so" ]; then
 	LIBJVMTI=$PREFIX/lib64/libperf-jvmti.so
 elif [ -e "$PREFIX/lib/libperf-jvmti.so" ]; then
@@ -34,6 +37,7 @@ elif [ -e "/usr/lib/linux-tools-$(uname -a | awk '{ print $3 }' | sed -r 's/-gen
 	LIBJVMTI=/usr/lib/linux-tools-$(uname -a | awk '{ print $3 }' | sed -r 's/-generic//')/libperf-jvmti.so
 else
 	echo "Fail to find libperf-jvmti.so"
+
 	# JVMTI is a build option, skip the test if fail to find lib
 	exit 2
 fi
-- 
cgit v1.2.3


From 7fc37b588aaaf72145764b4c3b6184431471b3e0 Mon Sep 17 00:00:00 2001
From: Roberto Sassu <roberto.sassu@huawei.com>
Date: Tue, 23 Dec 2025 17:00:24 +0000
Subject: perf build: Remove
 FEATURE_CHECK_LDFLAGS-disassembler-{four-args,init-styled} setting

As the building mechanism is now able to retry detection with different
combinations of linking flags, setting FEATURE_CHECK_LDFLAGS-disassembler-four-args
and FEATURE_CHECK_LDFLAGS-disassembler-init-styled is not necessary anymore,
so remove it.

James Clark notes:

Use the same technique to find the set of bfd-related libraries to link as in:

  3308ffc5016e6136 ("tools, build: Retry detection of bfd-related features")

Signed-off-by: Roberto Sassu <roberto.sassu@huawei.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Andres Freund <andres@anarazel.de>
Cc: Andrii Nakryiko <andrii@kernel.org>
Cc: Bill Wendling <morbo@google.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Fastabend <john.fastabend@gmail.com>
Cc: Justin Stitt <justinstitt@google.com>
Cc: KP Singh <kpsingh@kernel.org>
Cc: Leo Yan <leo.yan@arm.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Martin KaFai Lau <martin.lau@linux.dev>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Nathan Chancellor <nathan@kernel.org>
Cc: Nick Desaulniers <ndesaulniers@google.com>
Cc: Nick Terrell <terrelln@fb.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Quentin Monnet <qmo@kernel.org>
Cc: Song Liu <song@kernel.org>
Cc: Stanislav Fomichev <sdf@google.com>
Signed-off-by: James Clark <james.clark@linaro.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/Makefile.config | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config
index d8d25f62aaad..bbfebbe26f4d 100644
--- a/tools/perf/Makefile.config
+++ b/tools/perf/Makefile.config
@@ -379,8 +379,8 @@ ifneq ($(TCMALLOC),)
 endif
 
 ifeq ($(FEATURES_DUMP),)
-# We will display at the end of this Makefile.config, using $(call feature_display_entries)
-# As we may retry some feature detection here, see the disassembler-four-args case, for instance
+# We will display at the end of this Makefile.config, using $(call feature_display_entries),
+# as we may retry some feature detection here.
   FEATURE_DISPLAY_DEFERRED := 1
 include $(srctree)/tools/build/Makefile.feature
 else
@@ -935,8 +935,6 @@ ifdef BUILD_NONDISTRO
 
   ifeq ($(feature-libbfd), 1)
     EXTLIBS += -lbfd -lopcodes
-    FEATURE_CHECK_LDFLAGS-disassembler-four-args = -lbfd -lopcodes -ldl
-    FEATURE_CHECK_LDFLAGS-disassembler-init-styled = -lbfd -lopcodes -ldl
   else
     # we are on a system that requires -liberty and (maybe) -lz
     # to link against -lbfd; test each case individually here
@@ -948,13 +946,9 @@ ifdef BUILD_NONDISTRO
 
     ifeq ($(feature-libbfd-liberty), 1)
       EXTLIBS += -lbfd -lopcodes -liberty
-      FEATURE_CHECK_LDFLAGS-disassembler-four-args += -liberty -ldl
-      FEATURE_CHECK_LDFLAGS-disassembler-init-styled += -liberty -ldl
     else
       ifeq ($(feature-libbfd-liberty-z), 1)
         EXTLIBS += -lbfd -lopcodes -liberty -lz
-        FEATURE_CHECK_LDFLAGS-disassembler-four-args += -liberty -lz -ldl
-        FEATURE_CHECK_LDFLAGS-disassembler-init-styled += -liberty -lz -ldl
       endif
     endif
     $(call feature_check,disassembler-four-args)
@@ -1332,6 +1326,6 @@ endif
 
 # re-generate FEATURE-DUMP as we may have called feature_check, found out
 # extra libraries to add to LDFLAGS of some other test and then redo those
-# tests, see the block about libbfd, disassembler-four-args, for instance.
+# tests.
 $(shell rm -f $(FEATURE_DUMP_FILENAME))
 $(foreach feat,$(FEATURE_TESTS),$(shell echo "$(call feature_assign,$(feat))" >> $(FEATURE_DUMP_FILENAME)))
-- 
cgit v1.2.3


From ae323bc241d25f5ebc56f0b2a6d580b7233647c0 Mon Sep 17 00:00:00 2001
From: James Clark <james.clark@linaro.org>
Date: Tue, 23 Dec 2025 17:00:25 +0000
Subject: perf build: Do all non-distro feature checks in one go

None of the if statements or variable assignments in the non-distro
block actually affect the feature checks. Just do them all in one place
so the flow isn't obscured.

Signed-off-by: James Clark <james.clark@linaro.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Bill Wendling <morbo@google.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Justin Stitt <justinstitt@google.com>
Cc: Leo Yan <leo.yan@arm.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Nathan Chancellor <nathan@kernel.org>
Cc: Nick Desaulniers <nick.desaulniers+lkml@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/Makefile.config | 32 ++++++++++++--------------------
 1 file changed, 12 insertions(+), 20 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config
index bbfebbe26f4d..85075de2aedd 100644
--- a/tools/perf/Makefile.config
+++ b/tools/perf/Makefile.config
@@ -931,36 +931,28 @@ ifneq ($(NO_JEVENTS),1)
 endif
 
 ifdef BUILD_NONDISTRO
+  # call all detections now so we get correct status in VF output
   $(call feature_check,libbfd)
+  $(call feature_check,disassembler-four-args)
+  $(call feature_check,disassembler-init-styled)
+  $(call feature_check,libbfd-buildid)
+  $(call feature_check,libbfd-liberty)
+  $(call feature_check,libbfd-liberty-z)
 
+  # we may be on a system that requires -liberty and (maybe) -lz
+  # to link against -lbfd; test each case individually here
   ifeq ($(feature-libbfd), 1)
     EXTLIBS += -lbfd -lopcodes
-  else
-    # we are on a system that requires -liberty and (maybe) -lz
-    # to link against -lbfd; test each case individually here
-
-    # call all detections now so we get correct
-    # status in VF output
-    $(call feature_check,libbfd-liberty)
-    $(call feature_check,libbfd-liberty-z)
-
-    ifeq ($(feature-libbfd-liberty), 1)
-      EXTLIBS += -lbfd -lopcodes -liberty
-    else
-      ifeq ($(feature-libbfd-liberty-z), 1)
-        EXTLIBS += -lbfd -lopcodes -liberty -lz
-      endif
-    endif
-    $(call feature_check,disassembler-four-args)
-    $(call feature_check,disassembler-init-styled)
+  else ifeq ($(feature-libbfd-liberty), 1)
+    EXTLIBS += -lbfd -lopcodes -liberty
+  else ifeq ($(feature-libbfd-liberty-z), 1)
+    EXTLIBS += -lbfd -lopcodes -liberty -lz
   endif
 
   CFLAGS += -DHAVE_LIBBFD_SUPPORT
   CXXFLAGS += -DHAVE_LIBBFD_SUPPORT
   $(call detected,CONFIG_LIBBFD)
 
-  $(call feature_check,libbfd-buildid)
-
   ifeq ($(feature-libbfd-buildid), 1)
     CFLAGS += -DHAVE_LIBBFD_BUILDID_SUPPORT
   else
-- 
cgit v1.2.3


From c0cb97a275ffa00d91a0715dce8105ae3f627727 Mon Sep 17 00:00:00 2001
From: James Clark <james.clark@linaro.org>
Date: Tue, 23 Dec 2025 17:00:26 +0000
Subject: perf build: Remove unused libbfd-buildid feature test

HAVE_LIBBFD_BUILDID_SUPPORT isn't used in the codebase so remove the
feature test that sets it.

Signed-off-by: James Clark <james.clark@linaro.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Bill Wendling <morbo@google.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Justin Stitt <justinstitt@google.com>
Cc: Leo Yan <leo.yan@arm.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Nathan Chancellor <nathan@kernel.org>
Cc: Nick Desaulniers <nick.desaulniers+lkml@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/build/Makefile.feature              | 1 -
 tools/build/feature/Makefile              | 4 ----
 tools/build/feature/test-libbfd-buildid.c | 8 --------
 tools/perf/Makefile.config                | 7 -------
 4 files changed, 20 deletions(-)
 delete mode 100644 tools/build/feature/test-libbfd-buildid.c

(limited to 'tools')

diff --git a/tools/build/Makefile.feature b/tools/build/Makefile.feature
index 362cf8f4a0a0..bbaa88bb9b30 100644
--- a/tools/build/Makefile.feature
+++ b/tools/build/Makefile.feature
@@ -71,7 +71,6 @@ FEATURE_TESTS_BASIC :=                  \
         gettid				\
         glibc                           \
         libbfd                          \
-        libbfd-buildid			\
         libelf                          \
         libelf-getphdrnum               \
         libelf-gelf_getnote             \
diff --git a/tools/build/feature/Makefile b/tools/build/feature/Makefile
index 0d5a15654b17..d84db7df7988 100644
--- a/tools/build/feature/Makefile
+++ b/tools/build/feature/Makefile
@@ -13,7 +13,6 @@ FILES=                                          \
          test-gtk2-infobar.bin                  \
          test-hello.bin                         \
          test-libbfd.bin                        \
-         test-libbfd-buildid.bin		\
          test-disassembler-four-args.bin        \
          test-disassembler-init-styled.bin	\
          test-reallocarray.bin			\
@@ -268,9 +267,6 @@ $(OUTPUT)test-libpython.bin:
 $(OUTPUT)test-libbfd.bin:
 	$(BUILD_BFD)
 
-$(OUTPUT)test-libbfd-buildid.bin:
-	$(BUILD_BFD) || $(BUILD_BFD) -liberty || $(BUILD_BFD) -liberty -lz
-
 $(OUTPUT)test-disassembler-four-args.bin:
 	$(BUILD_BFD) -lopcodes || $(BUILD_BFD) -lopcodes -liberty || \
 	$(BUILD_BFD) -lopcodes -liberty -lz
diff --git a/tools/build/feature/test-libbfd-buildid.c b/tools/build/feature/test-libbfd-buildid.c
deleted file mode 100644
index 157644b04c05..000000000000
--- a/tools/build/feature/test-libbfd-buildid.c
+++ /dev/null
@@ -1,8 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <bfd.h>
-
-int main(void)
-{
-	bfd *abfd = bfd_openr("Pedro", 0);
-	return abfd && (!abfd->build_id || abfd->build_id->size > 0x506564726f);
-}
diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config
index 85075de2aedd..fb1cf2bf5d83 100644
--- a/tools/perf/Makefile.config
+++ b/tools/perf/Makefile.config
@@ -935,7 +935,6 @@ ifdef BUILD_NONDISTRO
   $(call feature_check,libbfd)
   $(call feature_check,disassembler-four-args)
   $(call feature_check,disassembler-init-styled)
-  $(call feature_check,libbfd-buildid)
   $(call feature_check,libbfd-liberty)
   $(call feature_check,libbfd-liberty-z)
 
@@ -953,12 +952,6 @@ ifdef BUILD_NONDISTRO
   CXXFLAGS += -DHAVE_LIBBFD_SUPPORT
   $(call detected,CONFIG_LIBBFD)
 
-  ifeq ($(feature-libbfd-buildid), 1)
-    CFLAGS += -DHAVE_LIBBFD_BUILDID_SUPPORT
-  else
-    $(warning Old version of libbfd/binutils things like PE executable profiling will not be available)
-  endif
-
   ifeq ($(feature-disassembler-four-args), 1)
     CFLAGS += -DDISASM_FOUR_ARGS_SIGNATURE
   endif
-- 
cgit v1.2.3


From cff602f65988da48cc1b84f6c3588a25a320fa81 Mon Sep 17 00:00:00 2001
From: James Clark <james.clark@linaro.org>
Date: Tue, 23 Dec 2025 17:00:27 +0000
Subject: perf build: Feature test for libbfd thread safety API

The non-distro build requires libbfd 2.42 since commit b72b8132d8fd
("perf libbfd: Ensure libbfd is initialized prior to use"). Add a
feature test so that it's obvious why the build fails if this criteria
isn't met.

Signed-off-by: James Clark <james.clark@linaro.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Bill Wendling <morbo@google.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Justin Stitt <justinstitt@google.com>
Cc: Leo Yan <leo.yan@arm.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Nathan Chancellor <nathan@kernel.org>
Cc: Nick Desaulniers <nick.desaulniers+lkml@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/build/Makefile.feature                 |  1 +
 tools/build/feature/Makefile                 |  4 ++++
 tools/build/feature/test-libbfd-threadsafe.c | 18 ++++++++++++++++++
 tools/perf/Makefile.config                   |  5 +++++
 4 files changed, 28 insertions(+)
 create mode 100644 tools/build/feature/test-libbfd-threadsafe.c

(limited to 'tools')

diff --git a/tools/build/Makefile.feature b/tools/build/Makefile.feature
index bbaa88bb9b30..7f119eafc7c4 100644
--- a/tools/build/Makefile.feature
+++ b/tools/build/Makefile.feature
@@ -71,6 +71,7 @@ FEATURE_TESTS_BASIC :=                  \
         gettid				\
         glibc                           \
         libbfd                          \
+	libbfd-threadsafe		\
         libelf                          \
         libelf-getphdrnum               \
         libelf-gelf_getnote             \
diff --git a/tools/build/feature/Makefile b/tools/build/feature/Makefile
index d84db7df7988..5c15572d505e 100644
--- a/tools/build/feature/Makefile
+++ b/tools/build/feature/Makefile
@@ -13,6 +13,7 @@ FILES=                                          \
          test-gtk2-infobar.bin                  \
          test-hello.bin                         \
          test-libbfd.bin                        \
+	 test-libbfd-threadsafe.bin      	\
          test-disassembler-four-args.bin        \
          test-disassembler-init-styled.bin	\
          test-reallocarray.bin			\
@@ -267,6 +268,9 @@ $(OUTPUT)test-libpython.bin:
 $(OUTPUT)test-libbfd.bin:
 	$(BUILD_BFD)
 
+$(OUTPUT)test-libbfd-threadsafe.bin:
+	$(BUILD_BFD) || $(BUILD_BFD) -liberty || $(BUILD_BFD) -liberty -lz
+
 $(OUTPUT)test-disassembler-four-args.bin:
 	$(BUILD_BFD) -lopcodes || $(BUILD_BFD) -lopcodes -liberty || \
 	$(BUILD_BFD) -lopcodes -liberty -lz
diff --git a/tools/build/feature/test-libbfd-threadsafe.c b/tools/build/feature/test-libbfd-threadsafe.c
new file mode 100644
index 000000000000..fe97f95f6f06
--- /dev/null
+++ b/tools/build/feature/test-libbfd-threadsafe.c
@@ -0,0 +1,18 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <bfd.h>
+
+static bool lock(void *unused)
+{
+	return true;
+}
+
+static bool unlock(void *unused)
+{
+	return true;
+}
+
+int main(void)
+{
+       /* Check for presence of new thread safety API (version 2.42) */
+       return !bfd_thread_init(lock, unlock, NULL);
+}
diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config
index fb1cf2bf5d83..6f2c7bd36e74 100644
--- a/tools/perf/Makefile.config
+++ b/tools/perf/Makefile.config
@@ -935,9 +935,14 @@ ifdef BUILD_NONDISTRO
   $(call feature_check,libbfd)
   $(call feature_check,disassembler-four-args)
   $(call feature_check,disassembler-init-styled)
+  $(call feature_check,libbfd-threadsafe)
   $(call feature_check,libbfd-liberty)
   $(call feature_check,libbfd-liberty-z)
 
+  ifneq ($(feature-libbfd-threadsafe), 1)
+    $(error binutils 2.42 or later is required for non-distro builds)
+  endif
+
   # we may be on a system that requires -liberty and (maybe) -lz
   # to link against -lbfd; test each case individually here
   ifeq ($(feature-libbfd), 1)
-- 
cgit v1.2.3


From 523471c5163659c61132274123c5470286e407ce Mon Sep 17 00:00:00 2001
From: James Clark <james.clark@linaro.org>
Date: Tue, 23 Dec 2025 17:00:28 +0000
Subject: perf build: Skip nondistro build test if libbfd is old

Non distro builds now require a new version of libbfd, so skip the test
if the library is too old.

The grep test isn't a strong as the feature test in
test-libbfd-threadsafe.c, but there seems to be precedent for feature
testing this way here and it's good enough for the build-test rule. If
the function exists but returns an error it will be picked up by the
feature test when attempting the build.

Signed-off-by: James Clark <james.clark@linaro.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Bill Wendling <morbo@google.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Justin Stitt <justinstitt@google.com>
Cc: Leo Yan <leo.yan@arm.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Nathan Chancellor <nathan@kernel.org>
Cc: Nick Desaulniers <nick.desaulniers+lkml@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/tests/make | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'tools')

diff --git a/tools/perf/tests/make b/tools/perf/tests/make
index 6641701e4828..36411b4b6d2b 100644
--- a/tools/perf/tests/make
+++ b/tools/perf/tests/make
@@ -124,6 +124,9 @@ make_minimal        += NO_LIBDW_DWARF_UNWIND=1 NO_LIBBPF=1
 make_minimal        += NO_SDT=1 NO_JVMTI=1 NO_LIBZSTD=1
 make_minimal        += NO_LIBCAP=1 NO_CAPSTONE=1
 
+# binutils 2_42 and newer have bfd_thread_init()
+new_libbfd := $(shell echo '#include <bfd.h>' | $(CC) -E -x c - | grep bfd_thread_init)
+
 # $(run) contains all available tests
 run := make_pure
 # Targets 'clean all' can be run together only through top level
@@ -137,7 +140,9 @@ MAKE_F := $(MAKE) -f $(MK)
 endif
 run += make_python_perf_so
 run += make_debug
+ifneq ($(new_libbfd),)
 run += make_nondistro
+endif
 run += make_extra_tests
 run += make_jevents_all
 run += make_no_bpf_skel
-- 
cgit v1.2.3


From 8e746e95c3e4eb56ae261feb9ae261bce1f96947 Mon Sep 17 00:00:00 2001
From: Derek Foreman <derek.foreman@collabora.com>
Date: Fri, 28 Nov 2025 15:50:17 -0600
Subject: perf data: Allow filtering conversion by time range

This adds a feature to allow restricting the range of converted samples
with a range string like perf-script and perf-report --time.

Committer testing:

Put a probe on the ICMP receive path handling broadcast packets:

  # perf probe icmp_rcv:64
  Added new event:
    probe:icmp_rcv_L64   (on icmp_rcv:64)

  You can now use it in all perf tools, such as:

  	perf record -e probe:icmp_rcv_L64 -aR sleep 1

  # perf record -e probe:icmp_rcv_L64 ping -c 10 -b 127.255.255.255
  WARNING: pinging broadcast address
  PING 127.255.255.255 (127.255.255.255) 56(84) bytes of data.
  ^C
  --- 127.255.255.255 ping statistics ---
  10 packets transmitted, 0 received, 100% packet loss, time 9217ms

  [ perf record: Woken up 1 times to write data ]
  [ perf record: Captured and wrote 0.034 MB perf.data (10 samples) ]

  # perf script
              ping   52785 [009]  5847.300394: probe:icmp_rcv_L64: (ffffffffaadb337e)
              ping   52785 [009]  5848.325018: probe:icmp_rcv_L64: (ffffffffaadb337e)
              ping   52785 [009]  5849.349007: probe:icmp_rcv_L64: (ffffffffaadb337e)
              ping   52785 [009]  5850.372979: probe:icmp_rcv_L64: (ffffffffaadb337e)
              ping   52785 [009]  5851.396988: probe:icmp_rcv_L64: (ffffffffaadb337e)
              ping   52785 [009]  5852.420954: probe:icmp_rcv_L64: (ffffffffaadb337e)
              ping   52785 [009]  5853.444934: probe:icmp_rcv_L64: (ffffffffaadb337e)
              ping   52785 [009]  5854.468926: probe:icmp_rcv_L64: (ffffffffaadb337e)
              ping   52785 [009]  5855.492914: probe:icmp_rcv_L64: (ffffffffaadb337e)
              ping   52785 [009]  5856.516883: probe:icmp_rcv_L64: (ffffffffaadb337e)
  #

Now get some slices using perf script:

  # perf script --time 40%
            ping   52785 [009]  5847.300394: probe:icmp_rcv_L64: (ffffffffaadb337e)
            ping   52785 [009]  5848.325018: probe:icmp_rcv_L64: (ffffffffaadb337e)
            ping   52785 [009]  5849.349007: probe:icmp_rcv_L64: (ffffffffaadb337e)
            ping   52785 [009]  5850.372979: probe:icmp_rcv_L64: (ffffffffaadb337e)
  # perf script --time 40%-60%
            ping   52785 [009]  5851.396988: probe:icmp_rcv_L64: (ffffffffaadb337e)
            ping   52785 [009]  5852.420954: probe:icmp_rcv_L64: (ffffffffaadb337e)
  #

And finally use this new feature:

  # perf data convert --to-json out.json --time 0%-10%
  [ perf data convert: Converted 'perf.data' into JSON data 'out.json' ]
  [ perf data convert: Converted and wrote 0.001 MB (1 samples) ]
  [ perf data convert: Skipped 9 samples ]
  # cat out.json
  {
  	"linux-perf-json-version": 1,
  	"headers": {
  		"header-version": 1,
  		"captured-on": "2026-01-06T22:26:40Z",
  		"data-offset": 520,
  		"data-size": 34648,
  		"feat-offset": 35168,
  		"hostname": "number",
  		"os-release": "6.17.12-300.fc43.x86_64",
  		"arch": "x86_64",
  		"cpu-desc": "AMD Ryzen 9 9950X3D 16-Core Processor",
  		"cpuid": "AuthenticAMD,26,68,0",
  		"nrcpus-online": 32,
  		"nrcpus-avail": 32,
  		"perf-version": "6.19.rc4.gf4c270685d3d",
  		"cmdline": [
  			"/home/acme/bin/perf"
  		]
  	},
  	"samples": [
  		{
  			"timestamp": 5847300394661,
  			"pid": 52785,
  			"tid": 52785,
  			"cpu": 9,
  			"comm": "ping",
  			"callchain": [
  				{
  					"ip": "0xffffffffaadb337f",
  					"symbol": "icmp_rcv",
  					"dso": "[kernel.kallsyms]"
  				}
  			],
  			"__probe_ip": "ffffffffaadb337e"
  		}
  	]
  }
  #

Signed-off-by: Derek Foreman <derek.foreman@collabora.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/Documentation/perf-data.txt | 28 ++++++++++++++++++++++++++++
 tools/perf/builtin-data.c              |  3 +++
 tools/perf/util/data-convert-bt.c      | 31 +++++++++++++++++++++++++++++++
 tools/perf/util/data-convert-json.c    | 33 +++++++++++++++++++++++++++++++++
 tools/perf/util/data-convert.h         |  1 +
 5 files changed, 96 insertions(+)

(limited to 'tools')

diff --git a/tools/perf/Documentation/perf-data.txt b/tools/perf/Documentation/perf-data.txt
index 417bf17e265c..20f178d61ed7 100644
--- a/tools/perf/Documentation/perf-data.txt
+++ b/tools/perf/Documentation/perf-data.txt
@@ -40,6 +40,34 @@ OPTIONS for 'convert'
 --force::
 	Don't complain, do it.
 
+--time::
+	Only convert samples within given time window: <start>,<stop>. Times
+	have the format seconds.nanoseconds. If start is not given (i.e. time
+	string is ',x.y') then analysis starts at the beginning of the file. If
+	stop time is not given (i.e. time string is 'x.y,') then analysis goes
+	to end of file. Multiple ranges can be separated by spaces, which
+	requires the argument to be quoted e.g. --time "1234.567,1234.789 1235,"
+
+	Also support time percent with multiple time ranges. Time string is
+	'a%/n,b%/m,...' or 'a%-b%,c%-%d,...'.
+
+	For example:
+	Select the second 10% time slice:
+
+	  perf data convert --to-json out.json --time 10%/2
+
+	Select from 0% to 10% time slice:
+
+	  perf data convert --to-json out.json --time 0%-10%
+
+	Select the first and second 10% time slices:
+
+	  perf data convert --to-json out.json --time 10%/1,10%/2
+
+	Select from 0% to 10% and 30% to 40% slices:
+
+	  perf data convert --to-json out.json --time 0%-10%,30%-40%
+
 -v::
 --verbose::
         Be more verbose (show counter open errors, etc).
diff --git a/tools/perf/builtin-data.c b/tools/perf/builtin-data.c
index ce51cbf6dc97..85f59886b5cf 100644
--- a/tools/perf/builtin-data.c
+++ b/tools/perf/builtin-data.c
@@ -33,6 +33,7 @@ const char *to_ctf;
 struct perf_data_convert_opts opts = {
 	.force = false,
 	.all = false,
+	.time_str = NULL,
 };
 
 const struct option data_options[] = {
@@ -45,6 +46,8 @@ const struct option data_options[] = {
 #endif
 		OPT_BOOLEAN('f', "force", &opts.force, "don't complain, do it"),
 		OPT_BOOLEAN(0, "all", &opts.all, "Convert all events"),
+		OPT_STRING(0, "time", &opts.time_str, "str",
+			   "Time span of interest (start,stop)"),
 		OPT_END()
 	};
 
diff --git a/tools/perf/util/data-convert-bt.c b/tools/perf/util/data-convert-bt.c
index 3d2e437e1354..0bcbc0e309e0 100644
--- a/tools/perf/util/data-convert-bt.c
+++ b/tools/perf/util/data-convert-bt.c
@@ -34,6 +34,7 @@
 #include "util.h"
 #include "clockid.h"
 #include "util/sample.h"
+#include "util/time-utils.h"
 
 #ifdef HAVE_LIBTRACEEVENT
 #include <event-parse.h>
@@ -91,9 +92,14 @@ struct convert {
 	struct perf_tool	tool;
 	struct ctf_writer	writer;
 
+	struct perf_time_interval *ptime_range;
+	int range_size;
+	int range_num;
+
 	u64			events_size;
 	u64			events_count;
 	u64			non_sample_count;
+	u64			skipped;
 
 	/* Ordered events configured queue size. */
 	u64			queue_size;
@@ -811,6 +817,11 @@ static int process_sample_event(const struct perf_tool *tool,
 	if (WARN_ONCE(!priv, "Failed to setup all events.\n"))
 		return 0;
 
+	if (perf_time__ranges_skip_sample(c->ptime_range, c->range_num, sample->time)) {
+		++c->skipped;
+		return 0;
+	}
+
 	event_class = priv->event_class;
 
 	/* update stats */
@@ -1644,6 +1655,15 @@ int bt_convert__perf2ctf(const char *input, const char *path,
 	if (IS_ERR(session))
 		return PTR_ERR(session);
 
+	if (opts->time_str) {
+		err = perf_time__parse_for_ranges(opts->time_str, session,
+						  &c.ptime_range,
+						  &c.range_size,
+						  &c.range_num);
+		if (err < 0)
+			goto free_session;
+	}
+
 	/* CTF writer */
 	if (ctf_writer__init(cw, path, session, opts->tod))
 		goto free_session;
@@ -1687,6 +1707,14 @@ int bt_convert__perf2ctf(const char *input, const char *path,
 	else
 		fprintf(stderr, ", %" PRIu64 " non-samples) ]\n", c.non_sample_count);
 
+	if (c.skipped) {
+		fprintf(stderr,	"[ perf data convert: Skipped %" PRIu64 " samples ]\n",
+			c.skipped);
+	}
+
+	if (c.ptime_range)
+		zfree(&c.ptime_range);
+
 	cleanup_events(session);
 	perf_session__delete(session);
 	ctf_writer__cleanup(cw);
@@ -1696,6 +1724,9 @@ int bt_convert__perf2ctf(const char *input, const char *path,
 free_writer:
 	ctf_writer__cleanup(cw);
 free_session:
+	if (c.ptime_range)
+		zfree(&c.ptime_range);
+
 	perf_session__delete(session);
 	pr_err("Error during conversion setup.\n");
 	return err;
diff --git a/tools/perf/util/data-convert-json.c b/tools/perf/util/data-convert-json.c
index 9dc1e184cf3c..787039967916 100644
--- a/tools/perf/util/data-convert-json.c
+++ b/tools/perf/util/data-convert-json.c
@@ -25,6 +25,7 @@
 #include "util/session.h"
 #include "util/symbol.h"
 #include "util/thread.h"
+#include "util/time-utils.h"
 #include "util/tool.h"
 
 #ifdef HAVE_LIBTRACEEVENT
@@ -35,7 +36,12 @@ struct convert_json {
 	struct perf_tool tool;
 	FILE *out;
 	bool first;
+	struct perf_time_interval *ptime_range;
+	int range_size;
+	int range_num;
+
 	u64 events_count;
+	u64 skipped;
 };
 
 // Outputs a JSON-encoded string surrounded by quotes with characters escaped.
@@ -165,6 +171,11 @@ static int process_sample_event(const struct perf_tool *tool,
 		return -1;
 	}
 
+	if (perf_time__ranges_skip_sample(c->ptime_range, c->range_num, sample->time)) {
+		++c->skipped;
+		return 0;
+	}
+
 	++c->events_count;
 
 	if (c->first)
@@ -320,6 +331,10 @@ int bt_convert__perf2json(const char *input_name, const char *output_name,
 	struct convert_json c = {
 		.first = true,
 		.events_count = 0,
+		.ptime_range = NULL,
+		.range_size = 0,
+		.range_num = 0,
+		.skipped = 0,
 	};
 	struct perf_data data = {
 		.mode = PERF_DATA_MODE_READ,
@@ -382,6 +397,15 @@ int bt_convert__perf2json(const char *input_name, const char *output_name,
 		goto err_session_delete;
 	}
 
+	if (opts->time_str) {
+		ret = perf_time__parse_for_ranges(opts->time_str, session,
+						  &c.ptime_range,
+						  &c.range_size,
+						  &c.range_num);
+		if (ret < 0)
+			goto err_session_delete;
+	}
+
 	// The opening brace is printed manually because it isn't delimited from a
 	// previous value (i.e. we don't want a leading newline)
 	fputc('{', c.out);
@@ -411,7 +435,16 @@ int bt_convert__perf2json(const char *input_name, const char *output_name,
 			"[ perf data convert: Converted and wrote %.3f MB (%" PRIu64 " samples) ]\n",
 			(ftell(c.out)) / 1024.0 / 1024.0, c.events_count);
 
+	if (c.skipped) {
+		fprintf(stderr,	"[ perf data convert: Skipped %" PRIu64 " samples ]\n",
+			c.skipped);
+	}
+
 	ret = 0;
+
+	if (c.ptime_range)
+		zfree(&c.ptime_range);
+
 err_session_delete:
 	perf_session__delete(session);
 err_fclose:
diff --git a/tools/perf/util/data-convert.h b/tools/perf/util/data-convert.h
index 1b4c5f598415..ee651fa680a1 100644
--- a/tools/perf/util/data-convert.h
+++ b/tools/perf/util/data-convert.h
@@ -8,6 +8,7 @@ struct perf_data_convert_opts {
 	bool force;
 	bool all;
 	bool tod;
+	const char *time_str;
 };
 
 #ifdef HAVE_LIBBABELTRACE_SUPPORT
-- 
cgit v1.2.3


From 75326c67aa8c43000819a2ac29f22eb27846d545 Mon Sep 17 00:00:00 2001
From: Derek Foreman <derek.foreman@collabora.com>
Date: Fri, 28 Nov 2025 15:50:18 -0600
Subject: perf data: Fix coding style

Adjust some oddly indented fprintf() calls.

Signed-off-by: Derek Foreman <derek.foreman@collabora.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/data-convert-bt.c   | 6 ++----
 tools/perf/util/data-convert-json.c | 9 ++++-----
 2 files changed, 6 insertions(+), 9 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/data-convert-bt.c b/tools/perf/util/data-convert-bt.c
index 0bcbc0e309e0..a22e9049ff30 100644
--- a/tools/perf/util/data-convert-bt.c
+++ b/tools/perf/util/data-convert-bt.c
@@ -1693,12 +1693,10 @@ int bt_convert__perf2ctf(const char *input, const char *path,
 	else
 		pr_err("Error during conversion.\n");
 
-	fprintf(stderr,
-		"[ perf data convert: Converted '%s' into CTF data '%s' ]\n",
+	fprintf(stderr,	"[ perf data convert: Converted '%s' into CTF data '%s' ]\n",
 		data.path, path);
 
-	fprintf(stderr,
-		"[ perf data convert: Converted and wrote %.3f MB (%" PRIu64 " samples",
+	fprintf(stderr,	"[ perf data convert: Converted and wrote %.3f MB (%" PRIu64 " samples",
 		(double) c.events_size / 1024.0 / 1024.0,
 		c.events_count);
 
diff --git a/tools/perf/util/data-convert-json.c b/tools/perf/util/data-convert-json.c
index 787039967916..eefa3a94c813 100644
--- a/tools/perf/util/data-convert-json.c
+++ b/tools/perf/util/data-convert-json.c
@@ -427,13 +427,12 @@ int bt_convert__perf2json(const char *input_name, const char *output_name,
 	output_json_format(c.out, false, 0, "}");
 	fputc('\n', c.out);
 
-	fprintf(stderr,
-			"[ perf data convert: Converted '%s' into JSON data '%s' ]\n",
-			data.path, output_name);
+	fprintf(stderr,	"[ perf data convert: Converted '%s' into JSON data '%s' ]\n",
+		data.path, output_name);
 
 	fprintf(stderr,
-			"[ perf data convert: Converted and wrote %.3f MB (%" PRIu64 " samples) ]\n",
-			(ftell(c.out)) / 1024.0 / 1024.0, c.events_count);
+		"[ perf data convert: Converted and wrote %.3f MB (%" PRIu64 " samples) ]\n",
+		(ftell(c.out)) / 1024.0 / 1024.0, c.events_count);
 
 	if (c.skipped) {
 		fprintf(stderr,	"[ perf data convert: Skipped %" PRIu64 " samples ]\n",
-- 
cgit v1.2.3


From 1eb217ab2e737609f8a861b517649e82e7236d05 Mon Sep 17 00:00:00 2001
From: Faisal Bukhari <faisalbukhari523@gmail.com>
Date: Mon, 22 Sep 2025 23:38:34 +0530
Subject: perf parse-events: Fix evsel allocation failure

If evsel__new_idx() returns NULL, the function currently jumps to label
'out_err'.  Here, references to `cpus` and `pmu_cpus` are dropped.
Also, resources held by evsel->name and evsel->metric_id are freed.

But if evsel__new_idx() returns NULL, it can lead to NULL pointer
dereference.

Fixes: cd63c22168257a0b ("perf parse-events: Minor __add_event refactoring")
Signed-off-by: Faisal Bukhari <faisalbukhari523@gmail.com>
Reviewed-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/util/parse-events.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index 17c1c36a7bf9..000c89a1e50d 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -251,8 +251,11 @@ __add_event(struct list_head *list, int *idx,
 		event_attr_init(attr);
 
 	evsel = evsel__new_idx(attr, *idx);
-	if (!evsel)
-		goto out_err;
+	if (!evsel) {
+		perf_cpu_map__put(cpus);
+		perf_cpu_map__put(pmu_cpus);
+		return NULL;
+	}
 
 	if (name) {
 		evsel->name = strdup(name);
-- 
cgit v1.2.3


From cb3de96eea66f5e4a580086c6a1be46e765f97f4 Mon Sep 17 00:00:00 2001
From: Yumei Huang <yuhuang@redhat.com>
Date: Sun, 4 Jan 2026 11:23:57 +0800
Subject: ipv6: preserve insertion order for same-scope addresses

IPv6 addresses with the same scope are returned in reverse insertion
order, unlike IPv4. For example, when adding a -> b -> c, the list is
reported as c -> b -> a, while IPv4 preserves the original order.

This behavior causes:

a. When using `ip -6 a save` and `ip -6 a restore`, addresses are restored
   in the opposite order from which they were saved. See example below
   showing addresses added as 1::1, 1::2, 1::3 but displayed and saved
   in reverse order.

   # ip -6 a a 1::1 dev x
   # ip -6 a a 1::2 dev x
   # ip -6 a a 1::3 dev x
   # ip -6 a s dev x
   2: x: <BROADCAST,MULTICAST> mtu 1500 qdisc noop state DOWN group default qlen 1000
       inet6 1::3/128 scope global tentative
       valid_lft forever preferred_lft forever
       inet6 1::2/128 scope global tentative
       valid_lft forever preferred_lft forever
       inet6 1::1/128 scope global tentative
       valid_lft forever preferred_lft forever
   # ip -6 a save > dump
   # ip -6 a d 1::1 dev x
   # ip -6 a d 1::2 dev x
   # ip -6 a d 1::3 dev x
   # ip a d ::1 dev lo
   # ip a restore < dump
   # ip -6 a s dev x
   2: x: <BROADCAST,MULTICAST> mtu 1500 qdisc noop state DOWN group default qlen 1000
       inet6 1::1/128 scope global tentative
       valid_lft forever preferred_lft forever
       inet6 1::2/128 scope global tentative
       valid_lft forever preferred_lft forever
       inet6 1::3/128 scope global tentative
       valid_lft forever preferred_lft forever
   # ip a showdump < dump
    if1:
        inet6 ::1/128 scope host proto kernel_lo
        valid_lft forever preferred_lft forever
    if2:
        inet6 1::3/128 scope global tentative
        valid_lft forever preferred_lft forever
    if2:
        inet6 1::2/128 scope global tentative
        valid_lft forever preferred_lft forever
    if2:
        inet6 1::1/128 scope global tentative
        valid_lft forever preferred_lft forever

b. Addresses in pasta to appear in reversed order compared to host
   addresses.

The ipv6 addresses were added in reverse order by commit e55ffac60117
("[IPV6]: order addresses by scope"), then it was changed by commit
502a2ffd7376 ("ipv6: convert idev_list to list macros"), and restored by
commit b54c9b98bbfb ("ipv6: Preserve pervious behavior in
ipv6_link_dev_addr()."). However, this reverse ordering within the same
scope causes inconsistency with IPv4 and the issues described above.

This patch aligns IPv6 address ordering with IPv4 for consistency
by changing the comparison from >= to > when inserting addresses
into the address list. Also updates the ioam6 selftest to reflect
the new address ordering behavior. Combine these two changes into
one patch for bisectability.

Link: https://bugs.passt.top/show_bug.cgi?id=175
Suggested-by: Stefano Brivio <sbrivio@redhat.com>
Signed-off-by: Yumei Huang <yuhuang@redhat.com>
Acked-by: Justin Iurman <justin.iurman@gmail.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Link: https://patch.msgid.link/20260104032357.38555-1-yuhuang@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 net/ipv6/addrconf.c                  | 2 +-
 tools/testing/selftests/net/ioam6.sh | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index b66217d1b2f8..fe64988a23ab 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -1013,7 +1013,7 @@ ipv6_link_dev_addr(struct inet6_dev *idev, struct inet6_ifaddr *ifp)
 	list_for_each(p, &idev->addr_list) {
 		struct inet6_ifaddr *ifa
 			= list_entry(p, struct inet6_ifaddr, if_list);
-		if (ifp_scope >= ipv6_addr_src_scope(&ifa->addr))
+		if (ifp_scope > ipv6_addr_src_scope(&ifa->addr))
 			break;
 	}
 
diff --git a/tools/testing/selftests/net/ioam6.sh b/tools/testing/selftests/net/ioam6.sh
index 845c26dd01a9..b2b99889942f 100755
--- a/tools/testing/selftests/net/ioam6.sh
+++ b/tools/testing/selftests/net/ioam6.sh
@@ -273,8 +273,8 @@ setup()
   ip -netns $ioam_node_beta link set ioam-veth-betaR name veth1 &>/dev/null
   ip -netns $ioam_node_gamma link set ioam-veth-gamma name veth0 &>/dev/null
 
-  ip -netns $ioam_node_alpha addr add 2001:db8:1::50/64 dev veth0 &>/dev/null
   ip -netns $ioam_node_alpha addr add 2001:db8:1::2/64 dev veth0 &>/dev/null
+  ip -netns $ioam_node_alpha addr add 2001:db8:1::50/64 dev veth0 &>/dev/null
   ip -netns $ioam_node_alpha link set veth0 up &>/dev/null
   ip -netns $ioam_node_alpha link set lo up &>/dev/null
   ip -netns $ioam_node_alpha route add 2001:db8:2::/64 \
-- 
cgit v1.2.3


From c4df070a57def243dcf5773428dd1b51bc8337ef Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Sun, 4 Jan 2026 10:46:00 -0800
Subject: selftests: hw-net: rss-input-xfrm: try to enable the xfrm at the
 start

The test currently SKIPs if the symmetric RSS xfrm is not enabled
by default. This leads to spurious SKIPs in the Intel CI reporting
results to NIPA.

Testing on CX7:

 # ./drivers/net/hw/rss_input_xfrm.py
  TAP version 13
  1..2
  ok 1 rss_input_xfrm.test_rss_input_xfrm_ipv4 # SKIP Test requires IPv4 connectivity
  # Sym input xfrm already enabled: {'sym-or-xor'}
  ok 2 rss_input_xfrm.test_rss_input_xfrm_ipv6
  # Totals: pass:1 fail:0 xfail:0 xpass:0 skip:1 error:0

 # ethtool -X eth0 xfrm none

 # ./drivers/net/hw/rss_input_xfrm.py
  TAP version 13
  1..2
  ok 1 rss_input_xfrm.test_rss_input_xfrm_ipv4 # SKIP Test requires IPv4 connectivity
  # Sym input xfrm configured: {'sym-or-xor'}
  ok 2 rss_input_xfrm.test_rss_input_xfrm_ipv6
  # Totals: pass:1 fail:0 xfail:0 xpass:0 skip:1 error:0

Link: https://patch.msgid.link/20260104184600.795280-1-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../selftests/drivers/net/hw/rss_input_xfrm.py     | 44 +++++++++++++++++++---
 1 file changed, 38 insertions(+), 6 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/drivers/net/hw/rss_input_xfrm.py b/tools/testing/selftests/drivers/net/hw/rss_input_xfrm.py
index 72880e388478..503f1a2a2872 100755
--- a/tools/testing/selftests/drivers/net/hw/rss_input_xfrm.py
+++ b/tools/testing/selftests/drivers/net/hw/rss_input_xfrm.py
@@ -5,9 +5,9 @@ import multiprocessing
 import socket
 from lib.py import ksft_run, ksft_exit, ksft_eq, ksft_ge, cmd, fd_read_timeout
 from lib.py import NetDrvEpEnv
-from lib.py import EthtoolFamily, NetdevFamily
+from lib.py import EthtoolFamily, NetdevFamily, NlError
 from lib.py import KsftSkipEx, KsftFailEx
-from lib.py import rand_port
+from lib.py import defer, ksft_pr, rand_port
 
 
 def traffic(cfg, local_port, remote_port, ipver):
@@ -21,6 +21,40 @@ def traffic(cfg, local_port, remote_port, ipver):
     return sock.getsockopt(socket.SOL_SOCKET, socket.SO_INCOMING_CPU)
 
 
+def _rss_input_xfrm_try_enable(cfg):
+    """
+    Check if symmetric input-xfrm is already enabled, if not try to enable it
+    and register a cleanup.
+    """
+    rss = cfg.ethnl.rss_get({'header': {'dev-name': cfg.ifname}})
+    orig_xfrm = rss.get('input-xfrm', set())
+    sym_xfrm = set(filter(lambda x: 'sym' in x, orig_xfrm))
+
+    if sym_xfrm:
+        ksft_pr("Sym input xfrm already enabled:", sym_xfrm)
+        return sym_xfrm
+
+    for xfrm in cfg.ethnl.consts["input-xfrm"].entries:
+        # Skip non-symmetric transforms
+        if "sym" not in xfrm:
+            continue
+
+        try_xfrm = {xfrm} | orig_xfrm
+        try:
+            cfg.ethnl.rss_set({"header": {"dev-index": cfg.ifindex},
+                               "input-xfrm": try_xfrm})
+        except NlError:
+            continue
+
+        ksft_pr("Sym input xfrm configured:", try_xfrm)
+        defer(cfg.ethnl.rss_set,
+              {"header": {"dev-index": cfg.ifindex},
+               "input-xfrm": orig_xfrm})
+        return {xfrm}
+
+    return set()
+
+
 def test_rss_input_xfrm(cfg, ipver):
     """
     Test symmetric input_xfrm.
@@ -37,12 +71,10 @@ def test_rss_input_xfrm(cfg, ipver):
     if not hasattr(socket, "SO_INCOMING_CPU"):
         raise KsftSkipEx("socket.SO_INCOMING_CPU was added in Python 3.11")
 
-    rss = cfg.ethnl.rss_get({'header': {'dev-name': cfg.ifname}})
-    input_xfrm = set(filter(lambda x: 'sym' in x, rss.get('input-xfrm', {})))
-
     # Check for symmetric xor/or-xor
+    input_xfrm = _rss_input_xfrm_try_enable(cfg)
     if not input_xfrm:
-        raise KsftSkipEx("Symmetric RSS hash not requested")
+        raise KsftSkipEx("Symmetric RSS hash not supported by device")
 
     cpus = set()
     successful = 0
-- 
cgit v1.2.3


From 3b7a108c4197f9fd0b593c6b4b0de457d9ed4c87 Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Mon, 5 Jan 2026 12:25:02 -0500
Subject: selftests/net: packetdrill: add minimal client and server tests

Introduce minimal tests. These can serve as simple illustrative
examples, and as templates when writing new tests.

When adding new cases, it can be easier to extend an existing base
test rather than start from scratch. The existing tests all focus on
real, often non-trivial, features. It is not obvious which to take as
starting point, and arguably none really qualify.

Add two tests
- the client test performs the active open and initial close
- the server test implements the passive open and final close

Signed-off-by: Willem de Bruijn <willemb@google.com>
Link: https://patch.msgid.link/20260105172529.3514786-1-willemdebruijn.kernel@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../selftests/net/packetdrill/tcp_basic_client.pkt | 24 +++++++++++++++
 .../selftests/net/packetdrill/tcp_basic_server.pkt | 35 ++++++++++++++++++++++
 2 files changed, 59 insertions(+)
 create mode 100644 tools/testing/selftests/net/packetdrill/tcp_basic_client.pkt
 create mode 100644 tools/testing/selftests/net/packetdrill/tcp_basic_server.pkt

(limited to 'tools')

diff --git a/tools/testing/selftests/net/packetdrill/tcp_basic_client.pkt b/tools/testing/selftests/net/packetdrill/tcp_basic_client.pkt
new file mode 100644
index 000000000000..319f81dd717d
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_basic_client.pkt
@@ -0,0 +1,24 @@
+// SPDX-License-Identifier: GPL-2.0
+//
+// Minimal active open.
+// First to close connection.
+
+`./defaults.sh`
+
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 4
+
+   // Connect to server: active open: three-way handshake
+   +0...0 connect(4, ..., ...) = 0
+   +0 > S 0:0(0) <mss 1460,sackOK,TS val 0 ecr 0,nop,wscale 8>
+   +0 < S. 0:0(0) ack 1 win 65535 <mss 1460,sackOK,nop,nop,nop,wscale 7>
+   +0 > . 1:1(0) ack 1
+
+   // Send data
+   +0 send(4, ..., 1000, 0) = 1000
+   +0 > P. 1:1001(1000) ack 1
+   +0 < . 1:1(0) ack 1001 win 257
+
+   +0 close(4) = 0
+   +0 > F. 1001:1001(0) ack 1
+   +0 < F. 1:1(0) ack 1002 win 257
+   +0 > . 1002:1002(0) ack 2
diff --git a/tools/testing/selftests/net/packetdrill/tcp_basic_server.pkt b/tools/testing/selftests/net/packetdrill/tcp_basic_server.pkt
new file mode 100644
index 000000000000..e72a291b666e
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_basic_server.pkt
@@ -0,0 +1,35 @@
+// SPDX-License-Identifier: GPL-2.0
+//
+// Minimal passive open.
+// Peer is first to close.
+
+`./defaults.sh`
+
+   // Open listener socket
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+   +0 bind(3, ..., ...) = 0
+   +0 listen(3, 1) = 0
+
+   // Incoming connection: passive open: three-way handshake
+   +0 < S 0:0(0) win 65535 <mss 1000,sackOK,nop,nop,nop,wscale 8>
+   +0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8>
+   +0 < . 1:1(0) ack 1 win 257
+
+   // Open connection socket and close listener socket
+   +0 accept(3, ..., ...) = 4
+   +0 close(3) = 0
+
+   // Peer sends data: acknowledge and receive
+   +0 < P. 1:1001(1000) ack 1 win 257
+   +0 > . 1:1(0) ack 1001
+   +0 recv(4, ..., 1000, 0) = 1000
+
+   // Peer initiates connection close
+   +0 < F. 1001:1001(0) ack 1 win 257
+ +.04 > . 1:1(0) ack 1002
+
+   // Local socket also closes its side
+   +0 close(4) = 0
+   +0 > F. 1:1(0) ack 1002
+   +0 < . 1002:1002(0) ack 2 win 257
-- 
cgit v1.2.3


From b81d5e9d965e0af2c1f21fc392a23e598171f9d6 Mon Sep 17 00:00:00 2001
From: Emil Tsalapatis <emil@etsalapatis.com>
Date: Tue, 6 Jan 2026 18:36:45 -0500
Subject: selftests/bpf: add tests for arena kfuncs under lock

Add selftests to ensure the verifier permits calling the arena
kfunc API while holding a lock.

Signed-off-by: Emil Tsalapatis <emil@etsalapatis.com>
Link: https://lore.kernel.org/r/20260106-arena-under-lock-v2-3-378e9eab3066@etsalapatis.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/progs/verifier_arena.c | 38 ++++++++++++++++++++++
 1 file changed, 38 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/progs/verifier_arena.c b/tools/testing/selftests/bpf/progs/verifier_arena.c
index 4a9d96344813..c4b8daac4388 100644
--- a/tools/testing/selftests/bpf/progs/verifier_arena.c
+++ b/tools/testing/selftests/bpf/progs/verifier_arena.c
@@ -10,6 +10,8 @@
 #include "bpf_experimental.h"
 #include "bpf_arena_common.h"
 
+#define private(name) SEC(".bss." #name) __hidden __attribute__((aligned(8)))
+
 struct {
 	__uint(type, BPF_MAP_TYPE_ARENA);
 	__uint(map_flags, BPF_F_MMAPABLE);
@@ -439,4 +441,40 @@ int iter_maps3(struct bpf_iter__bpf_map *ctx)
 	return 0;
 }
 
+private(ARENA_TESTS) struct bpf_spin_lock arena_bpf_test_lock;
+
+/* Use the arena kfunc API while under a BPF lock. */
+SEC("syscall")
+__success __retval(0)
+int arena_kfuncs_under_bpf_lock(void *ctx)
+{
+#if defined(__BPF_FEATURE_ADDR_SPACE_CAST)
+	char __arena *page;
+	int ret;
+
+	bpf_spin_lock(&arena_bpf_test_lock);
+
+	/* Get a separate region of the arena. */
+	page = arena_base(&arena);
+	ret = bpf_arena_reserve_pages(&arena, page, 1);
+	if (ret) {
+		bpf_spin_unlock(&arena_bpf_test_lock);
+		return 1;
+	}
+
+	bpf_arena_free_pages(&arena, page, 1);
+
+	page = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0);
+	if (!page) {
+		bpf_spin_unlock(&arena_bpf_test_lock);
+		return 2;
+	}
+
+	bpf_arena_free_pages(&arena, page, 1);
+
+	bpf_spin_unlock(&arena_bpf_test_lock);
+#endif
+
+	return 0;
+}
 char _license[] SEC("license") = "GPL";
-- 
cgit v1.2.3


From 2b421662c7887a0649fe409155a1f101562d0fa9 Mon Sep 17 00:00:00 2001
From: Leon Hwang <leon.hwang@linux.dev>
Date: Wed, 7 Jan 2026 10:20:16 +0800
Subject: bpf: Introduce BPF_F_CPU and BPF_F_ALL_CPUS flags

Introduce BPF_F_CPU and BPF_F_ALL_CPUS flags and check them for
following APIs:

* 'map_lookup_elem()'
* 'map_update_elem()'
* 'generic_map_lookup_batch()'
* 'generic_map_update_batch()'

And, get the correct value size for these APIs.

Acked-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Leon Hwang <leon.hwang@linux.dev>
Link: https://lore.kernel.org/r/20260107022022.12843-2-leon.hwang@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h            | 23 ++++++++++++++++++++++-
 include/uapi/linux/bpf.h       |  2 ++
 kernel/bpf/syscall.c           | 31 +++++++++++++++++--------------
 tools/include/uapi/linux/bpf.h |  2 ++
 4 files changed, 43 insertions(+), 15 deletions(-)

(limited to 'tools')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index a63e47d2109c..108bab1bda9d 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -3915,14 +3915,35 @@ bpf_prog_update_insn_ptrs(struct bpf_prog *prog, u32 *offsets, void *image)
 }
 #endif
 
+static inline bool bpf_map_supports_cpu_flags(enum bpf_map_type map_type)
+{
+	return false;
+}
+
 static inline int bpf_map_check_op_flags(struct bpf_map *map, u64 flags, u64 allowed_flags)
 {
-	if (flags & ~allowed_flags)
+	u32 cpu;
+
+	if ((u32)flags & ~allowed_flags)
 		return -EINVAL;
 
 	if ((flags & BPF_F_LOCK) && !btf_record_has_field(map->record, BPF_SPIN_LOCK))
 		return -EINVAL;
 
+	if (!(flags & BPF_F_CPU) && flags >> 32)
+		return -EINVAL;
+
+	if (flags & (BPF_F_CPU | BPF_F_ALL_CPUS)) {
+		if (!bpf_map_supports_cpu_flags(map->map_type))
+			return -EINVAL;
+		if ((flags & BPF_F_CPU) && (flags & BPF_F_ALL_CPUS))
+			return -EINVAL;
+
+		cpu = flags >> 32;
+		if ((flags & BPF_F_CPU) && cpu >= num_possible_cpus())
+			return -ERANGE;
+	}
+
 	return 0;
 }
 
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 84ced3ed2d21..2a2ade4be60f 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1384,6 +1384,8 @@ enum {
 	BPF_NOEXIST	= 1, /* create new element if it didn't exist */
 	BPF_EXIST	= 2, /* update existing element */
 	BPF_F_LOCK	= 4, /* spin_lock-ed map_lookup/map_update */
+	BPF_F_CPU	= 8, /* cpu flag for percpu maps, upper 32-bit of flags is a cpu number */
+	BPF_F_ALL_CPUS	= 16, /* update value across all CPUs for percpu maps */
 };
 
 /* flags for BPF_MAP_CREATE command */
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 6dd2ad2f9e81..e8cfe9d67e64 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -133,12 +133,14 @@ bool bpf_map_write_active(const struct bpf_map *map)
 	return atomic64_read(&map->writecnt) != 0;
 }
 
-static u32 bpf_map_value_size(const struct bpf_map *map)
-{
-	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
-	    map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
-	    map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY ||
-	    map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
+static u32 bpf_map_value_size(const struct bpf_map *map, u64 flags)
+{
+	if (flags & (BPF_F_CPU | BPF_F_ALL_CPUS))
+		return map->value_size;
+	else if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+		 map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
+		 map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY ||
+		 map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
 		return round_up(map->value_size, 8) * num_possible_cpus();
 	else if (IS_FD_MAP(map))
 		return sizeof(u32);
@@ -1729,7 +1731,7 @@ static int map_lookup_elem(union bpf_attr *attr)
 	if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ))
 		return -EPERM;
 
-	err = bpf_map_check_op_flags(map, attr->flags, BPF_F_LOCK);
+	err = bpf_map_check_op_flags(map, attr->flags, BPF_F_LOCK | BPF_F_CPU);
 	if (err)
 		return err;
 
@@ -1737,7 +1739,7 @@ static int map_lookup_elem(union bpf_attr *attr)
 	if (IS_ERR(key))
 		return PTR_ERR(key);
 
-	value_size = bpf_map_value_size(map);
+	value_size = bpf_map_value_size(map, attr->flags);
 
 	err = -ENOMEM;
 	value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN);
@@ -1804,7 +1806,7 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr)
 		goto err_put;
 	}
 
-	value_size = bpf_map_value_size(map);
+	value_size = bpf_map_value_size(map, attr->flags);
 	value = kvmemdup_bpfptr(uvalue, value_size);
 	if (IS_ERR(value)) {
 		err = PTR_ERR(value);
@@ -2000,11 +2002,12 @@ int generic_map_update_batch(struct bpf_map *map, struct file *map_file,
 	void *key, *value;
 	int err = 0;
 
-	err = bpf_map_check_op_flags(map, attr->batch.elem_flags, BPF_F_LOCK);
+	err = bpf_map_check_op_flags(map, attr->batch.elem_flags,
+				     BPF_F_LOCK | BPF_F_CPU | BPF_F_ALL_CPUS);
 	if (err)
 		return err;
 
-	value_size = bpf_map_value_size(map);
+	value_size = bpf_map_value_size(map, attr->batch.elem_flags);
 
 	max_count = attr->batch.count;
 	if (!max_count)
@@ -2059,11 +2062,11 @@ int generic_map_lookup_batch(struct bpf_map *map,
 	u32 value_size, cp, max_count;
 	int err;
 
-	err = bpf_map_check_op_flags(map, attr->batch.elem_flags, BPF_F_LOCK);
+	err = bpf_map_check_op_flags(map, attr->batch.elem_flags, BPF_F_LOCK | BPF_F_CPU);
 	if (err)
 		return err;
 
-	value_size = bpf_map_value_size(map);
+	value_size = bpf_map_value_size(map, attr->batch.elem_flags);
 
 	max_count = attr->batch.count;
 	if (!max_count)
@@ -2185,7 +2188,7 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr)
 		goto err_put;
 	}
 
-	value_size = bpf_map_value_size(map);
+	value_size = bpf_map_value_size(map, 0);
 
 	err = -ENOMEM;
 	value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN);
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 6b92b0847ec2..b816bc53d2e1 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1384,6 +1384,8 @@ enum {
 	BPF_NOEXIST	= 1, /* create new element if it didn't exist */
 	BPF_EXIST	= 2, /* update existing element */
 	BPF_F_LOCK	= 4, /* spin_lock-ed map_lookup/map_update */
+	BPF_F_CPU	= 8, /* cpu flag for percpu maps, upper 32-bit of flags is a cpu number */
+	BPF_F_ALL_CPUS	= 16, /* update value across all CPUs for percpu maps */
 };
 
 /* flags for BPF_MAP_CREATE command */
-- 
cgit v1.2.3


From 2546863b4a723c96f55af7127827d62632cfbc9c Mon Sep 17 00:00:00 2001
From: Leon Hwang <leon.hwang@linux.dev>
Date: Wed, 7 Jan 2026 10:20:21 +0800
Subject: libbpf: Add BPF_F_CPU and BPF_F_ALL_CPUS flags support for percpu
 maps

Add libbpf support for the BPF_F_CPU flag for percpu maps by embedding the
cpu info into the high 32 bits of:

1. **flags**: bpf_map_lookup_elem_flags(), bpf_map__lookup_elem(),
   bpf_map_update_elem() and bpf_map__update_elem()
2. **opts->elem_flags**: bpf_map_lookup_batch() and
   bpf_map_update_batch()

And the flag can be BPF_F_ALL_CPUS, but cannot be
'BPF_F_CPU | BPF_F_ALL_CPUS'.

Behavior:

* If the flag is BPF_F_ALL_CPUS, the update is applied across all CPUs.
* If the flag is BPF_F_CPU, it updates value only to the specified CPU.
* If the flag is BPF_F_CPU, lookup value only from the specified CPU.
* lookup does not support BPF_F_ALL_CPUS.

Acked-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Leon Hwang <leon.hwang@linux.dev>
Link: https://lore.kernel.org/r/20260107022022.12843-7-leon.hwang@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/lib/bpf/bpf.h    |  8 ++++++++
 tools/lib/bpf/libbpf.c | 26 ++++++++++++++++++++------
 tools/lib/bpf/libbpf.h | 21 ++++++++-------------
 3 files changed, 36 insertions(+), 19 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h
index 1f9c28d27795..2c8e88ddb674 100644
--- a/tools/lib/bpf/bpf.h
+++ b/tools/lib/bpf/bpf.h
@@ -289,6 +289,14 @@ LIBBPF_API int bpf_map_lookup_and_delete_batch(int fd, void *in_batch,
  *    Update spin_lock-ed map elements. This must be
  *    specified if the map value contains a spinlock.
  *
+ * **BPF_F_CPU**
+ *    As for percpu maps, update value on the specified CPU. And the cpu
+ *    info is embedded into the high 32 bits of **opts->elem_flags**.
+ *
+ * **BPF_F_ALL_CPUS**
+ *    As for percpu maps, update value across all CPUs. This flag cannot
+ *    be used with BPF_F_CPU at the same time.
+ *
  * @param fd BPF map file descriptor
  * @param keys pointer to an array of *count* keys
  * @param values pointer to an array of *count* values
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 1a52d818a76c..6ea81701e274 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -10919,7 +10919,7 @@ bpf_object__find_map_fd_by_name(const struct bpf_object *obj, const char *name)
 }
 
 static int validate_map_op(const struct bpf_map *map, size_t key_sz,
-			   size_t value_sz, bool check_value_sz)
+			   size_t value_sz, bool check_value_sz, __u64 flags)
 {
 	if (!map_is_created(map)) /* map is not yet created */
 		return -ENOENT;
@@ -10946,6 +10946,20 @@ static int validate_map_op(const struct bpf_map *map, size_t key_sz,
 		int num_cpu = libbpf_num_possible_cpus();
 		size_t elem_sz = roundup(map->def.value_size, 8);
 
+		if (flags & (BPF_F_CPU | BPF_F_ALL_CPUS)) {
+			if ((flags & BPF_F_CPU) && (flags & BPF_F_ALL_CPUS)) {
+				pr_warn("map '%s': BPF_F_CPU and BPF_F_ALL_CPUS are mutually exclusive\n",
+					map->name);
+				return -EINVAL;
+			}
+			if (map->def.value_size != value_sz) {
+				pr_warn("map '%s': unexpected value size %zu provided for either BPF_F_CPU or BPF_F_ALL_CPUS, expected %u\n",
+					map->name, value_sz, map->def.value_size);
+				return -EINVAL;
+			}
+			break;
+		}
+
 		if (value_sz != num_cpu * elem_sz) {
 			pr_warn("map '%s': unexpected value size %zu provided for per-CPU map, expected %d * %zu = %zd\n",
 				map->name, value_sz, num_cpu, elem_sz, num_cpu * elem_sz);
@@ -10970,7 +10984,7 @@ int bpf_map__lookup_elem(const struct bpf_map *map,
 {
 	int err;
 
-	err = validate_map_op(map, key_sz, value_sz, true);
+	err = validate_map_op(map, key_sz, value_sz, true, flags);
 	if (err)
 		return libbpf_err(err);
 
@@ -10983,7 +10997,7 @@ int bpf_map__update_elem(const struct bpf_map *map,
 {
 	int err;
 
-	err = validate_map_op(map, key_sz, value_sz, true);
+	err = validate_map_op(map, key_sz, value_sz, true, flags);
 	if (err)
 		return libbpf_err(err);
 
@@ -10995,7 +11009,7 @@ int bpf_map__delete_elem(const struct bpf_map *map,
 {
 	int err;
 
-	err = validate_map_op(map, key_sz, 0, false /* check_value_sz */);
+	err = validate_map_op(map, key_sz, 0, false /* check_value_sz */, flags);
 	if (err)
 		return libbpf_err(err);
 
@@ -11008,7 +11022,7 @@ int bpf_map__lookup_and_delete_elem(const struct bpf_map *map,
 {
 	int err;
 
-	err = validate_map_op(map, key_sz, value_sz, true);
+	err = validate_map_op(map, key_sz, value_sz, true, flags);
 	if (err)
 		return libbpf_err(err);
 
@@ -11020,7 +11034,7 @@ int bpf_map__get_next_key(const struct bpf_map *map,
 {
 	int err;
 
-	err = validate_map_op(map, key_sz, 0, false /* check_value_sz */);
+	err = validate_map_op(map, key_sz, 0, false /* check_value_sz */, 0);
 	if (err)
 		return libbpf_err(err);
 
diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index e14d9e349f9c..dfc37a615578 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -1216,12 +1216,13 @@ LIBBPF_API struct bpf_map *bpf_map__inner_map(struct bpf_map *map);
  * @param key_sz size in bytes of key data, needs to match BPF map definition's **key_size**
  * @param value pointer to memory in which looked up value will be stored
  * @param value_sz size in byte of value data memory; it has to match BPF map
- * definition's **value_size**. For per-CPU BPF maps value size has to be
- * a product of BPF map value size and number of possible CPUs in the system
- * (could be fetched with **libbpf_num_possible_cpus()**). Note also that for
- * per-CPU values value size has to be aligned up to closest 8 bytes for
- * alignment reasons, so expected size is: `round_up(value_size, 8)
- * * libbpf_num_possible_cpus()`.
+ * definition's **value_size**. For per-CPU BPF maps, value size can be
+ * `value_size` if either **BPF_F_CPU** or **BPF_F_ALL_CPUS** is specified
+ * in **flags**, otherwise a product of BPF map value size and number of
+ * possible CPUs in the system (could be fetched with
+ * **libbpf_num_possible_cpus()**). Note also that for per-CPU values value
+ * size has to be aligned up to closest 8 bytes, so expected size is:
+ * `round_up(value_size, 8) * libbpf_num_possible_cpus()`.
  * @param flags extra flags passed to kernel for this operation
  * @return 0, on success; negative error, otherwise
  *
@@ -1239,13 +1240,7 @@ LIBBPF_API int bpf_map__lookup_elem(const struct bpf_map *map,
  * @param key pointer to memory containing bytes of the key
  * @param key_sz size in bytes of key data, needs to match BPF map definition's **key_size**
  * @param value pointer to memory containing bytes of the value
- * @param value_sz size in byte of value data memory; it has to match BPF map
- * definition's **value_size**. For per-CPU BPF maps value size has to be
- * a product of BPF map value size and number of possible CPUs in the system
- * (could be fetched with **libbpf_num_possible_cpus()**). Note also that for
- * per-CPU values value size has to be aligned up to closest 8 bytes for
- * alignment reasons, so expected size is: `round_up(value_size, 8)
- * * libbpf_num_possible_cpus()`.
+ * @param value_sz refer to **bpf_map__lookup_elem**'s description.'
  * @param flags extra flags passed to kernel for this operation
  * @return 0, on success; negative error, otherwise
  *
-- 
cgit v1.2.3


From 07bf7aa58e5e7fb27b8addcc33052400a7d9ce32 Mon Sep 17 00:00:00 2001
From: Leon Hwang <leon.hwang@linux.dev>
Date: Wed, 7 Jan 2026 10:20:22 +0800
Subject: selftests/bpf: Add cases to test BPF_F_CPU and BPF_F_ALL_CPUS flags

Add test coverage for the new BPF_F_CPU and BPF_F_ALL_CPUS flags support
in percpu maps. The following APIs are exercised:

* bpf_map_update_batch()
* bpf_map_lookup_batch()
* bpf_map_update_elem()
* bpf_map__update_elem()
* bpf_map_lookup_elem_flags()
* bpf_map__lookup_elem()

For lru_percpu_hash map, set max_entries to
'libbpf_num_possible_cpus() + 1' and only use the first
'libbpf_num_possible_cpus()' entries. This ensures a spare entry is always
available in the LRU free list, avoiding eviction.

When updating an existing key in lru_percpu_hash map:

1. l_new = prealloc_lru_pop();  /* Borrow from free list */
2. l_old = lookup_elem_raw();   /* Found, key exists */
3. pcpu_copy_value();           /* In-place update */
4. bpf_lru_push_free();         /* Return l_new to free list */

Also add negative tests to verify that non-percpu array and hash maps
reject the BPF_F_CPU and BPF_F_ALL_CPUS flags.

Signed-off-by: Leon Hwang <leon.hwang@linux.dev>
Link: https://lore.kernel.org/r/20260107022022.12843-8-leon.hwang@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 .../selftests/bpf/prog_tests/percpu_alloc.c        | 328 +++++++++++++++++++++
 .../selftests/bpf/progs/percpu_alloc_array.c       |  32 ++
 2 files changed, 360 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/percpu_alloc.c b/tools/testing/selftests/bpf/prog_tests/percpu_alloc.c
index 343da65864d6..c1d0949f093f 100644
--- a/tools/testing/selftests/bpf/prog_tests/percpu_alloc.c
+++ b/tools/testing/selftests/bpf/prog_tests/percpu_alloc.c
@@ -1,5 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 #include <test_progs.h>
+#include "cgroup_helpers.h"
 #include "percpu_alloc_array.skel.h"
 #include "percpu_alloc_cgrp_local_storage.skel.h"
 #include "percpu_alloc_fail.skel.h"
@@ -115,6 +116,321 @@ static void test_failure(void) {
 	RUN_TESTS(percpu_alloc_fail);
 }
 
+static void test_percpu_map_op_cpu_flag(struct bpf_map *map, void *keys, size_t key_sz, u32 entries,
+					int nr_cpus, bool test_batch)
+{
+	size_t value_sz = sizeof(u32), value_sz_cpus, value_sz_total;
+	u32 *values = NULL, *values_percpu = NULL;
+	const u32 value = 0xDEADC0DE;
+	int i, j, cpu, map_fd, err;
+	u64 batch = 0, flags;
+	void *values_row;
+	u32 count, v;
+	LIBBPF_OPTS(bpf_map_batch_opts, batch_opts);
+
+	value_sz_cpus = value_sz * nr_cpus;
+	values = calloc(entries, value_sz_cpus);
+	if (!ASSERT_OK_PTR(values, "calloc values"))
+		return;
+
+	values_percpu = calloc(entries, roundup(value_sz, 8) * nr_cpus);
+	if (!ASSERT_OK_PTR(values_percpu, "calloc values_percpu")) {
+		free(values);
+		return;
+	}
+
+	value_sz_total = value_sz_cpus * entries;
+	memset(values, 0, value_sz_total);
+
+	map_fd = bpf_map__fd(map);
+	flags = BPF_F_CPU | BPF_F_ALL_CPUS;
+	err = bpf_map_lookup_elem_flags(map_fd, keys, values, flags);
+	if (!ASSERT_ERR(err, "bpf_map_lookup_elem_flags cpu|all_cpus"))
+		goto out;
+
+	err = bpf_map_update_elem(map_fd, keys, values, flags);
+	if (!ASSERT_ERR(err, "bpf_map_update_elem cpu|all_cpus"))
+		goto out;
+
+	flags = BPF_F_ALL_CPUS;
+	err = bpf_map_lookup_elem_flags(map_fd, keys, values, flags);
+	if (!ASSERT_ERR(err, "bpf_map_lookup_elem_flags all_cpus"))
+		goto out;
+
+	flags = BPF_F_LOCK | BPF_F_CPU;
+	err = bpf_map_lookup_elem_flags(map_fd, keys, values, flags);
+	if (!ASSERT_ERR(err, "bpf_map_lookup_elem_flags BPF_F_LOCK"))
+		goto out;
+
+	flags = BPF_F_LOCK | BPF_F_ALL_CPUS;
+	err = bpf_map_update_elem(map_fd, keys, values, flags);
+	if (!ASSERT_ERR(err, "bpf_map_update_elem BPF_F_LOCK"))
+		goto out;
+
+	flags = (u64)nr_cpus << 32 | BPF_F_CPU;
+	err = bpf_map_update_elem(map_fd, keys, values, flags);
+	if (!ASSERT_EQ(err, -ERANGE, "bpf_map_update_elem -ERANGE"))
+		goto out;
+
+	err = bpf_map__update_elem(map, keys, key_sz, values, value_sz, flags);
+	if (!ASSERT_EQ(err, -ERANGE, "bpf_map__update_elem -ERANGE"))
+		goto out;
+
+	err = bpf_map_lookup_elem_flags(map_fd, keys, values, flags);
+	if (!ASSERT_EQ(err, -ERANGE, "bpf_map_lookup_elem_flags -ERANGE"))
+		goto out;
+
+	err = bpf_map__lookup_elem(map, keys, key_sz, values, value_sz, flags);
+	if (!ASSERT_EQ(err, -ERANGE, "bpf_map__lookup_elem -ERANGE"))
+		goto out;
+
+	for (cpu = 0; cpu < nr_cpus; cpu++) {
+		/* clear value on all cpus */
+		values[0] = 0;
+		flags = BPF_F_ALL_CPUS;
+		for (i = 0; i < entries; i++) {
+			err = bpf_map__update_elem(map, keys + i * key_sz, key_sz, values,
+						   value_sz, flags);
+			if (!ASSERT_OK(err, "bpf_map__update_elem all_cpus"))
+				goto out;
+		}
+
+		/* update value on specified cpu */
+		for (i = 0; i < entries; i++) {
+			values[0] = value;
+			flags = (u64)cpu << 32 | BPF_F_CPU;
+			err = bpf_map__update_elem(map, keys + i * key_sz, key_sz, values,
+						   value_sz, flags);
+			if (!ASSERT_OK(err, "bpf_map__update_elem specified cpu"))
+				goto out;
+
+			/* lookup then check value on CPUs */
+			for (j = 0; j < nr_cpus; j++) {
+				flags = (u64)j << 32 | BPF_F_CPU;
+				err = bpf_map__lookup_elem(map, keys + i * key_sz, key_sz, values,
+							   value_sz, flags);
+				if (!ASSERT_OK(err, "bpf_map__lookup_elem specified cpu"))
+					goto out;
+				if (!ASSERT_EQ(values[0], j != cpu ? 0 : value,
+					       "bpf_map__lookup_elem value on specified cpu"))
+					goto out;
+			}
+		}
+	}
+
+	if (!test_batch)
+		goto out;
+
+	count = entries;
+	batch_opts.elem_flags = (u64)nr_cpus << 32 | BPF_F_CPU;
+	err = bpf_map_update_batch(map_fd, keys, values, &count, &batch_opts);
+	if (!ASSERT_EQ(err, -ERANGE, "bpf_map_update_batch -ERANGE"))
+		goto out;
+
+	for (cpu = 0; cpu < nr_cpus; cpu++) {
+		memset(values, 0, value_sz_total);
+
+		/* clear values across all CPUs */
+		count = entries;
+		batch_opts.elem_flags = BPF_F_ALL_CPUS;
+		err = bpf_map_update_batch(map_fd, keys, values, &count, &batch_opts);
+		if (!ASSERT_OK(err, "bpf_map_update_batch all_cpus"))
+			goto out;
+
+		/* update values on specified CPU */
+		for (i = 0; i < entries; i++)
+			values[i] = value;
+
+		count = entries;
+		batch_opts.elem_flags = (u64)cpu << 32 | BPF_F_CPU;
+		err = bpf_map_update_batch(map_fd, keys, values, &count, &batch_opts);
+		if (!ASSERT_OK(err, "bpf_map_update_batch specified cpu"))
+			goto out;
+
+		/* lookup values on specified CPU */
+		batch = 0;
+		count = entries;
+		memset(values, 0, entries * value_sz);
+		err = bpf_map_lookup_batch(map_fd, NULL, &batch, keys, values, &count, &batch_opts);
+		if (!ASSERT_TRUE(!err || err == -ENOENT, "bpf_map_lookup_batch specified cpu"))
+			goto out;
+
+		for (i = 0; i < entries; i++)
+			if (!ASSERT_EQ(values[i], value,
+				       "bpf_map_lookup_batch value on specified cpu"))
+				goto out;
+
+		/* lookup values from all CPUs */
+		batch = 0;
+		count = entries;
+		batch_opts.elem_flags = 0;
+		memset(values_percpu, 0, roundup(value_sz, 8) * nr_cpus * entries);
+		err = bpf_map_lookup_batch(map_fd, NULL, &batch, keys, values_percpu, &count,
+					   &batch_opts);
+		if (!ASSERT_TRUE(!err || err == -ENOENT, "bpf_map_lookup_batch all_cpus"))
+			goto out;
+
+		for (i = 0; i < entries; i++) {
+			values_row = (void *) values_percpu +
+				     roundup(value_sz, 8) * i * nr_cpus;
+			for (j = 0; j < nr_cpus; j++) {
+				v = *(u32 *) (values_row + roundup(value_sz, 8) * j);
+				if (!ASSERT_EQ(v, j != cpu ? 0 : value,
+					       "bpf_map_lookup_batch value all_cpus"))
+					goto out;
+			}
+		}
+	}
+
+out:
+	free(values_percpu);
+	free(values);
+}
+
+
+static void test_percpu_map_cpu_flag(enum bpf_map_type map_type)
+{
+	struct percpu_alloc_array *skel;
+	size_t key_sz = sizeof(int);
+	int *keys, nr_cpus, i, err;
+	struct bpf_map *map;
+	u32 max_entries;
+
+	nr_cpus = libbpf_num_possible_cpus();
+	if (!ASSERT_GT(nr_cpus, 0, "libbpf_num_possible_cpus"))
+		return;
+
+	max_entries = nr_cpus + 1;
+	keys = calloc(max_entries, key_sz);
+	if (!ASSERT_OK_PTR(keys, "calloc keys"))
+		return;
+
+	for (i = 0; i < max_entries; i++)
+		keys[i] = i;
+
+	skel = percpu_alloc_array__open();
+	if (!ASSERT_OK_PTR(skel, "percpu_alloc_array__open")) {
+		free(keys);
+		return;
+	}
+
+	map = skel->maps.percpu;
+	bpf_map__set_type(map, map_type);
+	bpf_map__set_max_entries(map, max_entries);
+
+	err = percpu_alloc_array__load(skel);
+	if (!ASSERT_OK(err, "test_percpu_alloc__load"))
+		goto out;
+
+	test_percpu_map_op_cpu_flag(map, keys, key_sz, max_entries - 1, nr_cpus, true);
+out:
+	percpu_alloc_array__destroy(skel);
+	free(keys);
+}
+
+static void test_percpu_array_cpu_flag(void)
+{
+	test_percpu_map_cpu_flag(BPF_MAP_TYPE_PERCPU_ARRAY);
+}
+
+static void test_percpu_hash_cpu_flag(void)
+{
+	test_percpu_map_cpu_flag(BPF_MAP_TYPE_PERCPU_HASH);
+}
+
+static void test_lru_percpu_hash_cpu_flag(void)
+{
+	test_percpu_map_cpu_flag(BPF_MAP_TYPE_LRU_PERCPU_HASH);
+}
+
+static void test_percpu_cgroup_storage_cpu_flag(void)
+{
+	struct percpu_alloc_array *skel = NULL;
+	struct bpf_cgroup_storage_key key;
+	int cgroup, prog_fd, nr_cpus, err;
+	struct bpf_map *map;
+
+	nr_cpus = libbpf_num_possible_cpus();
+	if (!ASSERT_GT(nr_cpus, 0, "libbpf_num_possible_cpus"))
+		return;
+
+	err = setup_cgroup_environment();
+	if (!ASSERT_OK(err, "setup_cgroup_environment"))
+		return;
+
+	cgroup = create_and_get_cgroup("/cg_percpu");
+	if (!ASSERT_GE(cgroup, 0, "create_and_get_cgroup")) {
+		cleanup_cgroup_environment();
+		return;
+	}
+
+	err = join_cgroup("/cg_percpu");
+	if (!ASSERT_OK(err, "join_cgroup"))
+		goto out;
+
+	skel = percpu_alloc_array__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "percpu_alloc_array__open_and_load"))
+		goto out;
+
+	prog_fd = bpf_program__fd(skel->progs.cgroup_egress);
+	err = bpf_prog_attach(prog_fd, cgroup, BPF_CGROUP_INET_EGRESS, 0);
+	if (!ASSERT_OK(err, "bpf_prog_attach"))
+		goto out;
+
+	map = skel->maps.percpu_cgroup_storage;
+	err = bpf_map_get_next_key(bpf_map__fd(map), NULL, &key);
+	if (!ASSERT_OK(err, "bpf_map_get_next_key"))
+		goto out;
+
+	test_percpu_map_op_cpu_flag(map, &key, sizeof(key), 1, nr_cpus, false);
+out:
+	bpf_prog_detach2(-1, cgroup, BPF_CGROUP_INET_EGRESS);
+	close(cgroup);
+	cleanup_cgroup_environment();
+	percpu_alloc_array__destroy(skel);
+}
+
+static void test_map_op_cpu_flag(enum bpf_map_type map_type)
+{
+	u32 max_entries = 1, count = max_entries;
+	u64 flags, batch = 0, val = 0;
+	int err, map_fd, key = 0;
+	LIBBPF_OPTS(bpf_map_batch_opts, batch_opts);
+
+	map_fd = bpf_map_create(map_type, "test_cpu_flag", sizeof(int), sizeof(u64), max_entries,
+				NULL);
+	if (!ASSERT_GE(map_fd, 0, "bpf_map_create"))
+		return;
+
+	flags = BPF_F_ALL_CPUS;
+	err = bpf_map_update_elem(map_fd, &key, &val, flags);
+	ASSERT_ERR(err, "bpf_map_update_elem all_cpus");
+
+	batch_opts.elem_flags = BPF_F_ALL_CPUS;
+	err = bpf_map_update_batch(map_fd, &key, &val, &count, &batch_opts);
+	ASSERT_ERR(err, "bpf_map_update_batch all_cpus");
+
+	flags = BPF_F_CPU;
+	err = bpf_map_lookup_elem_flags(map_fd, &key, &val, flags);
+	ASSERT_ERR(err, "bpf_map_lookup_elem_flags cpu");
+
+	batch_opts.elem_flags = BPF_F_CPU;
+	err = bpf_map_lookup_batch(map_fd, NULL, &batch, &key, &val, &count, &batch_opts);
+	ASSERT_ERR(err, "bpf_map_lookup_batch cpu");
+
+	close(map_fd);
+}
+
+static void test_array_cpu_flag(void)
+{
+	test_map_op_cpu_flag(BPF_MAP_TYPE_ARRAY);
+}
+
+static void test_hash_cpu_flag(void)
+{
+	test_map_op_cpu_flag(BPF_MAP_TYPE_HASH);
+}
+
 void test_percpu_alloc(void)
 {
 	if (test__start_subtest("array"))
@@ -125,4 +441,16 @@ void test_percpu_alloc(void)
 		test_cgrp_local_storage();
 	if (test__start_subtest("failure_tests"))
 		test_failure();
+	if (test__start_subtest("cpu_flag_percpu_array"))
+		test_percpu_array_cpu_flag();
+	if (test__start_subtest("cpu_flag_percpu_hash"))
+		test_percpu_hash_cpu_flag();
+	if (test__start_subtest("cpu_flag_lru_percpu_hash"))
+		test_lru_percpu_hash_cpu_flag();
+	if (test__start_subtest("cpu_flag_percpu_cgroup_storage"))
+		test_percpu_cgroup_storage_cpu_flag();
+	if (test__start_subtest("cpu_flag_array"))
+		test_array_cpu_flag();
+	if (test__start_subtest("cpu_flag_hash"))
+		test_hash_cpu_flag();
 }
diff --git a/tools/testing/selftests/bpf/progs/percpu_alloc_array.c b/tools/testing/selftests/bpf/progs/percpu_alloc_array.c
index 37c2d2608ec0..ed6a2a93d5a5 100644
--- a/tools/testing/selftests/bpf/progs/percpu_alloc_array.c
+++ b/tools/testing/selftests/bpf/progs/percpu_alloc_array.c
@@ -187,4 +187,36 @@ out:
 	return 0;
 }
 
+struct {
+	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+	__uint(max_entries, 2);
+	__type(key, int);
+	__type(value, u32);
+} percpu SEC(".maps");
+
+SEC("?fentry/bpf_fentry_test1")
+int BPF_PROG(test_percpu_array, int x)
+{
+	u64 value = 0xDEADC0DE;
+	int key = 0;
+
+	bpf_map_update_elem(&percpu, &key, &value, BPF_ANY);
+	return 0;
+}
+
+struct {
+	__uint(type, BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE);
+	__type(key, struct bpf_cgroup_storage_key);
+	__type(value, u32);
+} percpu_cgroup_storage SEC(".maps");
+
+SEC("cgroup_skb/egress")
+int cgroup_egress(struct __sk_buff *skb)
+{
+	u32 *val = bpf_get_local_storage(&percpu_cgroup_storage, 0);
+
+	*val = 1;
+	return 1;
+}
+
 char _license[] SEC("license") = "GPL";
-- 
cgit v1.2.3


From 97fb54d86d2194ea8a4cbe6cf074e6ba47b054ea Mon Sep 17 00:00:00 2001
From: "Jose E. Marchesi" <jose.marchesi@oracle.com>
Date: Tue, 6 Jan 2026 18:36:49 +0100
Subject: bpf: adapt selftests to GCC 16 -Wunused-but-set-variable

GCC 16 has changed the semantics of -Wunused-but-set-variable, as well
as introducing new options -Wunused-but-set-variable={0,1,2,3} to
adjust the level of support.

One of the changes is that GCC now treats 'sum += 1' and 'sum++' as
non-usage, whereas clang (and GCC < 16) considers the first as usage
and the second as non-usage, which is sort of inconsistent.

The GCC 16 -Wunused-but-set-variable=2 option implements the previous
semantics of -Wunused-but-set-variable, but since it is a new option,
it cannot be used unconditionally for forward-compatibility, just for
backwards-compatibility.

So this patch adds pragmas to the two self-tests impacted by this,
progs/free_timer.c and progs/rcu_read_lock.c, to make gcc to ignore
-Wunused-but-set-variable warnings when compiling them with GCC > 15.

See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=44677#c25 for details
on why this regression got introduced in GCC upstream.

Signed-off-by: Jose E. Marchesi <jose.marchesi@oracle.com>
Cc: david.faust@oracle.com
Cc: cupertino.miranda@oracle.com
Cc: Eduard Zingerman <eddyz87@gmail.com>
Cc: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/r/20260106173650.18191-2-jose.marchesi@oracle.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/progs/free_timer.c    | 10 ++++++++++
 tools/testing/selftests/bpf/progs/rcu_read_lock.c | 10 ++++++++++
 2 files changed, 20 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/progs/free_timer.c b/tools/testing/selftests/bpf/progs/free_timer.c
index 4501ae8fc414..eccb2d47db43 100644
--- a/tools/testing/selftests/bpf/progs/free_timer.c
+++ b/tools/testing/selftests/bpf/progs/free_timer.c
@@ -7,6 +7,16 @@
 
 #define MAX_ENTRIES 8
 
+/* clang considers 'sum += 1' as usage but 'sum++' as non-usage.  GCC
+ * is more consistent and considers both 'sum += 1' and 'sum++' as
+ * non-usage.  This triggers warnings in the functions below.
+ *
+ * Starting with GCC 16 -Wunused-but-set-variable=2 can be used to
+ * mimic clang's behavior.  */
+#if !defined(__clang__) && __GNUC__ > 15
+#pragma GCC diagnostic ignored "-Wunused-but-set-variable"
+#endif
+
 struct map_value {
 	struct bpf_timer timer;
 };
diff --git a/tools/testing/selftests/bpf/progs/rcu_read_lock.c b/tools/testing/selftests/bpf/progs/rcu_read_lock.c
index d70c28824bbe..b4e073168fb1 100644
--- a/tools/testing/selftests/bpf/progs/rcu_read_lock.c
+++ b/tools/testing/selftests/bpf/progs/rcu_read_lock.c
@@ -7,6 +7,16 @@
 #include "bpf_tracing_net.h"
 #include "bpf_misc.h"
 
+/* clang considers 'sum += 1' as usage but 'sum++' as non-usage.  GCC
+ * is more consistent and considers both 'sum += 1' and 'sum++' as
+ * non-usage.  This triggers warnings in the functions below.
+ *
+ * Starting with GCC 16 -Wunused-but-set-variable=2 can be used to
+ * mimic clang's behavior.  */
+#if !defined(__clang__) && __GNUC__ > 15
+#pragma GCC diagnostic ignored "-Wunused-but-set-variable"
+#endif
+
 char _license[] SEC("license") = "GPL";
 
 struct {
-- 
cgit v1.2.3


From 681600647c59050546939da5e490c736e567fe91 Mon Sep 17 00:00:00 2001
From: "Jose E. Marchesi" <jose.marchesi@oracle.com>
Date: Tue, 6 Jan 2026 18:36:50 +0100
Subject: bpf: GCC requires function attributes before the declarator

GCC insists in placing attributes before the declarators in function
declarations.  Now that GCC supports btf_decl_tag and therefore __tag1
and __tag2 expand to actual attributes, the compiler is complaining
about it for

  static __noinline int foo(int x __tag1 __tag2) __tag1 __tag2

progs/test_btf_decl_tag.c:36:1: error: attributes should be specified \
before the declarator in a function definition

This patch simply places the tags before the declarator.

Signed-off-by: Jose E. Marchesi <jose.marchesi@oracle.com>
Cc: david.faust@oracle.com
Cc: cupertino.miranda@oracle.com
Cc: Eduard Zingerman <eddyz87@gmail.com>
Cc: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/r/20260106173650.18191-3-jose.marchesi@oracle.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/progs/test_btf_decl_tag.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/progs/test_btf_decl_tag.c b/tools/testing/selftests/bpf/progs/test_btf_decl_tag.c
index c88ccc53529a..0c3df19626cb 100644
--- a/tools/testing/selftests/bpf/progs/test_btf_decl_tag.c
+++ b/tools/testing/selftests/bpf/progs/test_btf_decl_tag.c
@@ -33,7 +33,7 @@ struct {
 } hashmap1 SEC(".maps");
 
 
-static __noinline int foo(int x __tag1 __tag2) __tag1 __tag2
+static __noinline __tag1 __tag2 int foo(int x __tag1 __tag2)
 {
 	struct key_t key;
 	value_t val = {};
-- 
cgit v1.2.3


From c219d4ee1d63b772d5fa8ed453b9cec18a9e2f6a Mon Sep 17 00:00:00 2001
From: Crystal Wood <crwood@redhat.com>
Date: Wed, 12 Nov 2025 09:25:29 -0600
Subject: rtla: Set stop threshold after all instances are enabled

This avoids startup races where one of the instances hit a threshold
before all instances were enabled, and thus tracing stops without
the relevant event.  In particular, this is not uncommon with the
tests that set a very tight threshold and then complain if there's
no analysis.

This also ensures that we don't stop tracing during a warmup.

The downside is a small chance of having an event over the threshold
early in the output, without stopping on it, which could cause user
confusion.  This should be less likely if the warmup feature is used, but
that doesn't eliminate the race window, just the odds of an unusual spike
right at that moment.

Signed-off-by: Crystal Wood <crwood@redhat.com>
Link: https://lore.kernel.org/r/20251112152529.956778-6-crwood@redhat.com
Signed-off-by: Tomas Glozar <tglozar@redhat.com>
---
 tools/tracing/rtla/src/common.c   | 20 ++++++++++++++++++++
 tools/tracing/rtla/src/common.h   |  4 ++++
 tools/tracing/rtla/src/osnoise.c  | 17 ++++-------------
 tools/tracing/rtla/src/osnoise.h  |  5 -----
 tools/tracing/rtla/src/timerlat.c | 29 ++++++++++-------------------
 5 files changed, 38 insertions(+), 37 deletions(-)

(limited to 'tools')

diff --git a/tools/tracing/rtla/src/common.c b/tools/tracing/rtla/src/common.c
index b197037fc58b..46e0263d6ae8 100644
--- a/tools/tracing/rtla/src/common.c
+++ b/tools/tracing/rtla/src/common.c
@@ -348,3 +348,23 @@ int hist_main_loop(struct osnoise_tool *tool)
 
 	return retval;
 }
+
+int osn_set_stop(struct osnoise_tool *tool)
+{
+	struct common_params *params = tool->params;
+	int retval;
+
+	retval = osnoise_set_stop_us(tool->context, params->stop_us);
+	if (retval) {
+		err_msg("Failed to set stop us\n");
+		return retval;
+	}
+
+	retval = osnoise_set_stop_total_us(tool->context, params->stop_total_us);
+	if (retval) {
+		err_msg("Failed to set stop total us\n");
+		return retval;
+	}
+
+	return 0;
+}
diff --git a/tools/tracing/rtla/src/common.h b/tools/tracing/rtla/src/common.h
index 9ec2b7632c37..c5e73d4600a0 100644
--- a/tools/tracing/rtla/src/common.h
+++ b/tools/tracing/rtla/src/common.h
@@ -152,7 +152,11 @@ void osnoise_destroy_tool(struct osnoise_tool *top);
 struct osnoise_tool *osnoise_init_tool(char *tool_name);
 struct osnoise_tool *osnoise_init_trace_tool(const char *tracer);
 bool osnoise_trace_is_off(struct osnoise_tool *tool, struct osnoise_tool *record);
+int osnoise_set_stop_us(struct osnoise_context *context, long long stop_us);
+int osnoise_set_stop_total_us(struct osnoise_context *context,
+			      long long stop_total_us);
 
 int common_apply_config(struct osnoise_tool *tool, struct common_params *params);
 int top_main_loop(struct osnoise_tool *tool);
 int hist_main_loop(struct osnoise_tool *tool);
+int osn_set_stop(struct osnoise_tool *tool);
diff --git a/tools/tracing/rtla/src/osnoise.c b/tools/tracing/rtla/src/osnoise.c
index 312c511fa004..945eb61efc46 100644
--- a/tools/tracing/rtla/src/osnoise.c
+++ b/tools/tracing/rtla/src/osnoise.c
@@ -1128,18 +1128,6 @@ osnoise_apply_config(struct osnoise_tool *tool, struct osnoise_params *params)
 		goto out_err;
 	}
 
-	retval = osnoise_set_stop_us(tool->context, params->common.stop_us);
-	if (retval) {
-		err_msg("Failed to set stop us\n");
-		goto out_err;
-	}
-
-	retval = osnoise_set_stop_total_us(tool->context, params->common.stop_total_us);
-	if (retval) {
-		err_msg("Failed to set stop total us\n");
-		goto out_err;
-	}
-
 	retval = osnoise_set_tracing_thresh(tool->context, params->threshold);
 	if (retval) {
 		err_msg("Failed to set tracing_thresh\n");
@@ -1184,9 +1172,12 @@ int osnoise_enable(struct osnoise_tool *tool)
 			debug_msg("Error cleaning up the buffer");
 			return retval;
 		}
-
 	}
 
+	retval = osn_set_stop(tool);
+	if (retval)
+		return retval;
+
 	return 0;
 }
 
diff --git a/tools/tracing/rtla/src/osnoise.h b/tools/tracing/rtla/src/osnoise.h
index 75de0d5c706a..168669aa7e0d 100644
--- a/tools/tracing/rtla/src/osnoise.h
+++ b/tools/tracing/rtla/src/osnoise.h
@@ -34,12 +34,7 @@ int osnoise_set_runtime_period(struct osnoise_context *context,
 			       unsigned long long period);
 void osnoise_restore_runtime_period(struct osnoise_context *context);
 
-int osnoise_set_stop_us(struct osnoise_context *context,
-			long long stop_us);
 void osnoise_restore_stop_us(struct osnoise_context *context);
-
-int osnoise_set_stop_total_us(struct osnoise_context *context,
-			      long long stop_total_us);
 void osnoise_restore_stop_total_us(struct osnoise_context *context);
 
 int osnoise_set_timerlat_period_us(struct osnoise_context *context,
diff --git a/tools/tracing/rtla/src/timerlat.c b/tools/tracing/rtla/src/timerlat.c
index df4f9bfe3433..ee15e344cf37 100644
--- a/tools/tracing/rtla/src/timerlat.c
+++ b/tools/tracing/rtla/src/timerlat.c
@@ -48,25 +48,6 @@ timerlat_apply_config(struct osnoise_tool *tool, struct timerlat_params *params)
 		}
 	}
 
-	if (params->mode != TRACING_MODE_BPF) {
-		/*
-		 * In tracefs and mixed mode, timerlat tracer handles stopping
-		 * on threshold
-		 */
-		retval = osnoise_set_stop_us(tool->context, params->common.stop_us);
-		if (retval) {
-			err_msg("Failed to set stop us\n");
-			goto out_err;
-		}
-
-		retval = osnoise_set_stop_total_us(tool->context, params->common.stop_total_us);
-		if (retval) {
-			err_msg("Failed to set stop total us\n");
-			goto out_err;
-		}
-	}
-
-
 	retval = osnoise_set_timerlat_period_us(tool->context,
 						params->timerlat_period_us ?
 						params->timerlat_period_us :
@@ -184,6 +165,16 @@ int timerlat_enable(struct osnoise_tool *tool)
 		}
 	}
 
+	/*
+	 * In tracefs and mixed mode, timerlat tracer handles stopping
+	 * on threshold
+	 */
+	if (params->mode != TRACING_MODE_BPF) {
+		retval = osn_set_stop(tool);
+		if (retval)
+			return retval;
+	}
+
 	return 0;
 }
 
-- 
cgit v1.2.3


From a08e012e814d346c191726a877b18901c3bc204f Mon Sep 17 00:00:00 2001
From: Costa Shulyupin <costa.shul@redhat.com>
Date: Mon, 24 Nov 2025 08:31:46 +0200
Subject: tools/rtla: Add common_usage()

The rtla tools have significant code quadruplication in their usage
functions. Each tool implements its own version of the same help text
formatting and option descriptions, leading to maintenance overhead and
inconsistencies.  Documentation/tools/rtla/common_options.rst lists 14
common options.

Add common_usage() infrastructure to consolidate help formatting.
Subsequent patches will extend this to handle other common options.

The refactored output is almost identical to the original, with the
following changes:
- add square brackets to specify optionality: `usage: [rtla] ...`
- remove `-q` from timerlat hist because hist tools don't support it
- minor spacing

Signed-off-by: Costa Shulyupin <costa.shul@redhat.com>
Link: https://lore.kernel.org/r/20251124063204.845425-1-costa.shul@redhat.com
Signed-off-by: Tomas Glozar <tglozar@redhat.com>
---
 tools/tracing/rtla/src/common.c        | 39 ++++++++++++++++++++++++++++++++++
 tools/tracing/rtla/src/common.h        |  3 +++
 tools/tracing/rtla/src/osnoise_hist.c  | 22 +++++++------------
 tools/tracing/rtla/src/osnoise_top.c   | 37 +++++++++++++-------------------
 tools/tracing/rtla/src/timerlat_hist.c | 22 +++++++------------
 tools/tracing/rtla/src/timerlat_top.c  | 22 +++++++------------
 6 files changed, 81 insertions(+), 64 deletions(-)

(limited to 'tools')

diff --git a/tools/tracing/rtla/src/common.c b/tools/tracing/rtla/src/common.c
index 46e0263d6ae8..009a4bce9737 100644
--- a/tools/tracing/rtla/src/common.c
+++ b/tools/tracing/rtla/src/common.c
@@ -4,6 +4,7 @@
 #include <pthread.h>
 #include <signal.h>
 #include <stdlib.h>
+#include <string.h>
 #include <unistd.h>
 #include "common.h"
 
@@ -368,3 +369,41 @@ int osn_set_stop(struct osnoise_tool *tool)
 
 	return 0;
 }
+
+static void print_msg_array(const char * const *msgs)
+{
+	if (!msgs)
+		return;
+
+	for (int i = 0; msgs[i]; i++)
+		fprintf(stderr, "%s\n", msgs[i]);
+}
+
+/*
+ * common_usage - print complete usage information
+ */
+void common_usage(const char *tool, const char *mode,
+		  const char *desc, const char * const *start_msgs, const char * const *opt_msgs)
+{
+	static const char * const common_options[] = {
+		"	  -h/--help: print this menu",
+		NULL
+	};
+	fprintf(stderr, "rtla %s", tool);
+	if (strcmp(mode, ""))
+		fprintf(stderr, " %s", mode);
+	fprintf(stderr, ": %s (version %s)\n\n", desc, VERSION);
+	fprintf(stderr, "  usage: [rtla] %s ", tool);
+
+	if (strcmp(mode, "top") == 0)
+		fprintf(stderr, "[top] [-h] ");
+	else
+		fprintf(stderr, "%s [-h] ", mode);
+
+	print_msg_array(start_msgs);
+	fprintf(stderr, "\n");
+	print_msg_array(common_options);
+	print_msg_array(opt_msgs);
+
+	exit(EXIT_SUCCESS);
+}
diff --git a/tools/tracing/rtla/src/common.h b/tools/tracing/rtla/src/common.h
index c5e73d4600a0..c48c9bfd20e3 100644
--- a/tools/tracing/rtla/src/common.h
+++ b/tools/tracing/rtla/src/common.h
@@ -160,3 +160,6 @@ int common_apply_config(struct osnoise_tool *tool, struct common_params *params)
 int top_main_loop(struct osnoise_tool *tool);
 int hist_main_loop(struct osnoise_tool *tool);
 int osn_set_stop(struct osnoise_tool *tool);
+
+void common_usage(const char *tool, const char *mode,
+		  const char *desc, const char * const *start_msgs, const char * const *opt_msgs);
diff --git a/tools/tracing/rtla/src/osnoise_hist.c b/tools/tracing/rtla/src/osnoise_hist.c
index ff8c231e47c4..372128db9e4a 100644
--- a/tools/tracing/rtla/src/osnoise_hist.c
+++ b/tools/tracing/rtla/src/osnoise_hist.c
@@ -409,16 +409,15 @@ osnoise_print_stats(struct osnoise_tool *tool)
  */
 static void osnoise_hist_usage(void)
 {
-	int i;
-
-	static const char * const msg[] = {
-		"",
-		"  usage: rtla osnoise hist [-h] [-D] [-d s] [-a us] [-p us] [-r us] [-s us] [-S us] \\",
+	static const char * const msg_start[] = {
+		"[-D] [-d s] [-a us] [-p us] [-r us] [-s us] [-S us] \\",
 		"	  [-T us] [-t [file]] [-e sys[:event]] [--filter <filter>] [--trigger <trigger>] \\",
 		"	  [-c cpu-list] [-H cpu-list] [-P priority] [-b N] [-E N] [--no-header] [--no-summary] \\",
 		"	  [--no-index] [--with-zeros] [-C [cgroup_name]] [--warm-up]",
-		"",
-		"	  -h/--help: print this menu",
+		NULL,
+	};
+
+	static const char * const msg_opts[] = {
 		"	  -a/--auto: set automatic trace mode, stopping the session if argument in us sample is hit",
 		"	  -p/--period us: osnoise period in us",
 		"	  -r/--runtime us: osnoise runtime in us",
@@ -453,13 +452,8 @@ static void osnoise_hist_usage(void)
 		NULL,
 	};
 
-	fprintf(stderr, "rtla osnoise hist: a per-cpu histogram of the OS noise (version %s)\n",
-			VERSION);
-
-	for (i = 0; msg[i]; i++)
-		fprintf(stderr, "%s\n", msg[i]);
-
-	exit(EXIT_SUCCESS);
+	common_usage("osnoise", "hist", "a per-cpu histogram of the OS noise",
+		     msg_start, msg_opts);
 }
 
 /*
diff --git a/tools/tracing/rtla/src/osnoise_top.c b/tools/tracing/rtla/src/osnoise_top.c
index 04c699bdd736..1db1d946b600 100644
--- a/tools/tracing/rtla/src/osnoise_top.c
+++ b/tools/tracing/rtla/src/osnoise_top.c
@@ -257,14 +257,16 @@ osnoise_print_stats(struct osnoise_tool *top)
  */
 static void osnoise_top_usage(struct osnoise_params *params)
 {
-	int i;
+	const char *tool, *mode, *desc;
 
-	static const char * const msg[] = {
-		" [-h] [-q] [-D] [-d s] [-a us] [-p us] [-r us] [-s us] [-S us] \\",
+	static const char * const msg_start[] = {
+		"[-q] [-D] [-d s] [-a us] [-p us] [-r us] [-s us] [-S us] \\",
 		"	  [-T us] [-t [file]] [-e sys[:event]] [--filter <filter>] [--trigger <trigger>] \\",
 		"	  [-c cpu-list] [-H cpu-list] [-P priority] [-C [cgroup_name]] [--warm-up s]",
-		"",
-		"	  -h/--help: print this menu",
+		NULL,
+	};
+
+	static const char * const msg_opts[] = {
 		"	  -a/--auto: set automatic trace mode, stopping the session if argument in us sample is hit",
 		"	  -p/--period us: osnoise period in us",
 		"	  -r/--runtime us: osnoise runtime in us",
@@ -295,25 +297,16 @@ static void osnoise_top_usage(struct osnoise_params *params)
 	};
 
 	if (params->mode == MODE_OSNOISE) {
-		fprintf(stderr,
-			"rtla osnoise top: a per-cpu summary of the OS noise (version %s)\n",
-			VERSION);
-
-		fprintf(stderr, "  usage: rtla osnoise [top]");
+		tool = "osnoise";
+		mode = "top";
+		desc = "a per-cpu summary of the OS noise";
+	} else {
+		tool = "hwnoise";
+		mode = "";
+		desc = "a summary of hardware-related noise";
 	}
 
-	if (params->mode == MODE_HWNOISE) {
-		fprintf(stderr,
-			"rtla hwnoise: a summary of hardware-related noise (version %s)\n",
-			VERSION);
-
-		fprintf(stderr, "  usage: rtla hwnoise");
-	}
-
-	for (i = 0; msg[i]; i++)
-		fprintf(stderr, "%s\n", msg[i]);
-
-	exit(EXIT_SUCCESS);
+	common_usage(tool, mode, desc, msg_start, msg_opts);
 }
 
 /*
diff --git a/tools/tracing/rtla/src/timerlat_hist.c b/tools/tracing/rtla/src/timerlat_hist.c
index 1fb471a787b7..2a5c543217ba 100644
--- a/tools/tracing/rtla/src/timerlat_hist.c
+++ b/tools/tracing/rtla/src/timerlat_hist.c
@@ -696,17 +696,16 @@ timerlat_print_stats(struct osnoise_tool *tool)
  */
 static void timerlat_hist_usage(void)
 {
-	int i;
-
-	char *msg[] = {
-		"",
-		"  usage: [rtla] timerlat hist [-h] [-q] [-d s] [-D] [-n] [-a us] [-p us] [-i us] [-T us] [-s us] \\",
+	static const char * const msg_start[] = {
+		"[-d s] [-D] [-n] [-a us] [-p us] [-i us] [-T us] [-s us] \\",
 		"         [-t [file]] [-e sys[:event]] [--filter <filter>] [--trigger <trigger>] [-c cpu-list] [-H cpu-list]\\",
 		"	  [-P priority] [-E N] [-b N] [--no-irq] [--no-thread] [--no-header] [--no-summary] \\",
 		"	  [--no-index] [--with-zeros] [--dma-latency us] [-C [cgroup_name]] [--no-aa] [--dump-task] [-u|-k]",
 		"	  [--warm-up s] [--deepest-idle-state n]",
-		"",
-		"	  -h/--help: print this menu",
+		NULL,
+	};
+
+	static const char * const msg_opts[] = {
 		"	  -a/--auto: set automatic trace mode, stopping the session if argument in us latency is hit",
 		"	  -p/--period us: timerlat period in us",
 		"	  -i/--irq us: stop trace if the irq latency is higher than the argument in us",
@@ -750,13 +749,8 @@ static void timerlat_hist_usage(void)
 		NULL,
 	};
 
-	fprintf(stderr, "rtla timerlat hist: a per-cpu histogram of the timer latency (version %s)\n",
-			VERSION);
-
-	for (i = 0; msg[i]; i++)
-		fprintf(stderr, "%s\n", msg[i]);
-
-	exit(EXIT_SUCCESS);
+	common_usage("timerlat", "hist", "a per-cpu histogram of the timer latency",
+		     msg_start, msg_opts);
 }
 
 /*
diff --git a/tools/tracing/rtla/src/timerlat_top.c b/tools/tracing/rtla/src/timerlat_top.c
index 29c2c1f717ed..9ed8b931552f 100644
--- a/tools/tracing/rtla/src/timerlat_top.c
+++ b/tools/tracing/rtla/src/timerlat_top.c
@@ -476,15 +476,14 @@ timerlat_print_stats(struct osnoise_tool *top)
  */
 static void timerlat_top_usage(void)
 {
-	int i;
-
-	static const char *const msg[] = {
-		"",
-		"  usage: rtla timerlat [top] [-h] [-q] [-a us] [-d s] [-D] [-n] [-p us] [-i us] [-T us] [-s us] \\",
+	static const char *const msg_start[] = {
+		"[-q] [-a us] [-d s] [-D] [-n] [-p us] [-i us] [-T us] [-s us] \\",
 		"	  [[-t [file]] [-e sys[:event]] [--filter <filter>] [--trigger <trigger>] [-c cpu-list] [-H cpu-list]\\",
 		"	  [-P priority] [--dma-latency us] [--aa-only us] [-C [cgroup_name]] [-u|-k] [--warm-up s] [--deepest-idle-state n]",
-		"",
-		"	  -h/--help: print this menu",
+		NULL,
+	};
+
+	static const char *const msg_opts[] = {
 		"	  -a/--auto: set automatic trace mode, stopping the session if argument in us latency is hit",
 		"	     --aa-only us: stop if <us> latency is hit, only printing the auto analysis (reduces CPU usage)",
 		"	  -p/--period us: timerlat period in us",
@@ -522,13 +521,8 @@ static void timerlat_top_usage(void)
 		NULL,
 	};
 
-	fprintf(stderr, "rtla timerlat top: a per-cpu summary of the timer latency (version %s)\n",
-			VERSION);
-
-	for (i = 0; msg[i]; i++)
-		fprintf(stderr, "%s\n", msg[i]);
-
-	exit(EXIT_SUCCESS);
+	common_usage("timerlat", "top", "a per-cpu summary of the timer latency",
+		     msg_start, msg_opts);
 }
 
 /*
-- 
cgit v1.2.3


From 8cd0f08ac72e25e2a048c72d76730676ab0106f3 Mon Sep 17 00:00:00 2001
From: Tomas Glozar <tglozar@redhat.com>
Date: Wed, 26 Nov 2025 15:41:59 +0100
Subject: rtla/timerlat: Support tail call from BPF program

Add a map to the rtla-timerlat BPF program that holds a file descriptor
of another BPF program, to be executed on threshold overflow.

timerlat_bpf_set_action() is added as an interface to set the program.

Link: https://lore.kernel.org/r/20251126144205.331954-2-tglozar@redhat.com
Signed-off-by: Tomas Glozar <tglozar@redhat.com>
---
 tools/tracing/rtla/src/timerlat.bpf.c | 25 +++++++++++++++++++++----
 tools/tracing/rtla/src/timerlat_bpf.c | 13 +++++++++++++
 tools/tracing/rtla/src/timerlat_bpf.h |  1 +
 3 files changed, 35 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/tracing/rtla/src/timerlat.bpf.c b/tools/tracing/rtla/src/timerlat.bpf.c
index e2265b5d6491..549d2d2191d2 100644
--- a/tools/tracing/rtla/src/timerlat.bpf.c
+++ b/tools/tracing/rtla/src/timerlat.bpf.c
@@ -40,6 +40,17 @@ struct {
 	__uint(max_entries, 1);
 } signal_stop_tracing SEC(".maps");
 
+struct {
+	__uint(type, BPF_MAP_TYPE_PROG_ARRAY);
+	__uint(key_size, sizeof(unsigned int));
+	__uint(max_entries, 1);
+	__array(values, unsigned int (void *));
+} bpf_action SEC(".maps") = {
+	.values = {
+		[0] = 0
+	},
+};
+
 /* Params to be set by rtla */
 const volatile int bucket_size = 1;
 const volatile int output_divisor = 1000;
@@ -109,7 +120,7 @@ nosubprog void update_summary(void *map,
 	map_set(map, SUMMARY_SUM, map_get(map, SUMMARY_SUM) + latency);
 }
 
-nosubprog void set_stop_tracing(void)
+nosubprog void set_stop_tracing(struct trace_event_raw_timerlat_sample *tp_args)
 {
 	int value = 0;
 
@@ -118,6 +129,12 @@ nosubprog void set_stop_tracing(void)
 
 	/* Signal to userspace */
 	bpf_ringbuf_output(&signal_stop_tracing, &value, sizeof(value), 0);
+
+	/*
+	 * Call into BPF action program, if attached.
+	 * Otherwise, just silently fail.
+	 */
+	bpf_tail_call(tp_args, &bpf_action, 0);
 }
 
 SEC("tp/osnoise/timerlat_sample")
@@ -138,19 +155,19 @@ int handle_timerlat_sample(struct trace_event_raw_timerlat_sample *tp_args)
 		update_summary(&summary_irq, latency, bucket);
 
 		if (irq_threshold != 0 && latency_us >= irq_threshold)
-			set_stop_tracing();
+			set_stop_tracing(tp_args);
 	} else if (tp_args->context == 1) {
 		update_main_hist(&hist_thread, bucket);
 		update_summary(&summary_thread, latency, bucket);
 
 		if (thread_threshold != 0 && latency_us >= thread_threshold)
-			set_stop_tracing();
+			set_stop_tracing(tp_args);
 	} else {
 		update_main_hist(&hist_user, bucket);
 		update_summary(&summary_user, latency, bucket);
 
 		if (thread_threshold != 0 && latency_us >= thread_threshold)
-			set_stop_tracing();
+			set_stop_tracing(tp_args);
 	}
 
 	return 0;
diff --git a/tools/tracing/rtla/src/timerlat_bpf.c b/tools/tracing/rtla/src/timerlat_bpf.c
index e97d16646bcd..1d619e502c65 100644
--- a/tools/tracing/rtla/src/timerlat_bpf.c
+++ b/tools/tracing/rtla/src/timerlat_bpf.c
@@ -59,6 +59,19 @@ int timerlat_bpf_init(struct timerlat_params *params)
 	return 0;
 }
 
+/*
+ * timerlat_bpf_set_action - set action on threshold executed on BPF side
+ */
+static int timerlat_bpf_set_action(struct bpf_program *prog)
+{
+	unsigned int key = 0, value = bpf_program__fd(prog);
+
+	return bpf_map__update_elem(bpf->maps.bpf_action,
+				    &key, sizeof(key),
+				    &value, sizeof(value),
+				    BPF_ANY);
+}
+
 /*
  * timerlat_bpf_attach - attach BPF program to collect timerlat data
  */
diff --git a/tools/tracing/rtla/src/timerlat_bpf.h b/tools/tracing/rtla/src/timerlat_bpf.h
index 118487436d30..b5009092c7a3 100644
--- a/tools/tracing/rtla/src/timerlat_bpf.h
+++ b/tools/tracing/rtla/src/timerlat_bpf.h
@@ -12,6 +12,7 @@ enum summary_field {
 };
 
 #ifndef __bpf__
+#include <bpf/libbpf.h>
 #ifdef HAVE_BPF_SKEL
 int timerlat_bpf_init(struct timerlat_params *params);
 int timerlat_bpf_attach(void);
-- 
cgit v1.2.3


From f967d1eca7d0bde7c896014577ea876096831c6e Mon Sep 17 00:00:00 2001
From: Tomas Glozar <tglozar@redhat.com>
Date: Wed, 26 Nov 2025 15:42:00 +0100
Subject: rtla/timerlat: Add --bpf-action option

Add option --bpf-action that allows the user to attach an external BPF
program that will be executed via BPF tail call on latency threshold
overflow.

Executing additional BPF code on latency threshold overflow allows doing
low-latency and in-kernel troubleshooting of the cause of the overflow.

The option takes an argument, which is a path to a BPF ELF file
expected to contain a function named "action_handler" in a section named
"tp/timerlat_action" (the section is necessary for libbpf to assign the
correct BPF program type to it).

Link: https://lore.kernel.org/r/20251126144205.331954-3-tglozar@redhat.com
Signed-off-by: Tomas Glozar <tglozar@redhat.com>
---
 tools/tracing/rtla/src/timerlat.c      | 11 +++++++
 tools/tracing/rtla/src/timerlat.h      |  2 +-
 tools/tracing/rtla/src/timerlat_bpf.c  | 53 ++++++++++++++++++++++++++++++++++
 tools/tracing/rtla/src/timerlat_bpf.h  |  6 +++-
 tools/tracing/rtla/src/timerlat_hist.c |  5 ++++
 tools/tracing/rtla/src/timerlat_top.c  |  5 ++++
 6 files changed, 80 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/tracing/rtla/src/timerlat.c b/tools/tracing/rtla/src/timerlat.c
index ee15e344cf37..8f6cf55f4a94 100644
--- a/tools/tracing/rtla/src/timerlat.c
+++ b/tools/tracing/rtla/src/timerlat.c
@@ -48,6 +48,17 @@ timerlat_apply_config(struct osnoise_tool *tool, struct timerlat_params *params)
 		}
 	}
 
+	/* Check if BPF action program is requested but BPF is not available */
+	if (params->bpf_action_program) {
+		if (params->mode == TRACING_MODE_TRACEFS) {
+			err_msg("BPF actions are not supported in tracefs-only mode\n");
+			goto out_err;
+		}
+
+		if (timerlat_load_bpf_action_program(params->bpf_action_program))
+			goto out_err;
+	}
+
 	retval = osnoise_set_timerlat_period_us(tool->context,
 						params->timerlat_period_us ?
 						params->timerlat_period_us :
diff --git a/tools/tracing/rtla/src/timerlat.h b/tools/tracing/rtla/src/timerlat.h
index fd6065f48bb7..8dd5d134ce08 100644
--- a/tools/tracing/rtla/src/timerlat.h
+++ b/tools/tracing/rtla/src/timerlat.h
@@ -27,6 +27,7 @@ struct timerlat_params {
 	int			dump_tasks;
 	int			deepest_idle_state;
 	enum timerlat_tracing_mode mode;
+	const char		*bpf_action_program;
 };
 
 #define to_timerlat_params(ptr) container_of(ptr, struct timerlat_params, common)
@@ -36,4 +37,3 @@ int timerlat_main(int argc, char *argv[]);
 int timerlat_enable(struct osnoise_tool *tool);
 void timerlat_analyze(struct osnoise_tool *tool, bool stopped);
 void timerlat_free(struct osnoise_tool *tool);
-
diff --git a/tools/tracing/rtla/src/timerlat_bpf.c b/tools/tracing/rtla/src/timerlat_bpf.c
index 1d619e502c65..05adf18303df 100644
--- a/tools/tracing/rtla/src/timerlat_bpf.c
+++ b/tools/tracing/rtla/src/timerlat_bpf.c
@@ -7,6 +7,10 @@
 
 static struct timerlat_bpf *bpf;
 
+/* BPF object and program for action program */
+static struct bpf_object *obj;
+static struct bpf_program *prog;
+
 /*
  * timerlat_bpf_init - load and initialize BPF program to collect timerlat data
  */
@@ -96,6 +100,11 @@ void timerlat_bpf_detach(void)
 void timerlat_bpf_destroy(void)
 {
 	timerlat_bpf__destroy(bpf);
+	bpf = NULL;
+	if (obj)
+		bpf_object__close(obj);
+	obj = NULL;
+	prog = NULL;
 }
 
 static int handle_rb_event(void *ctx, void *data, size_t data_sz)
@@ -190,4 +199,48 @@ int timerlat_bpf_get_summary_value(enum summary_field key,
 			 bpf->maps.summary_user,
 			 key, value_irq, value_thread, value_user, cpus);
 }
+
+/*
+ * timerlat_load_bpf_action_program - load and register a BPF action program
+ */
+int timerlat_load_bpf_action_program(const char *program_path)
+{
+	int err;
+
+	obj = bpf_object__open_file(program_path, NULL);
+	if (!obj) {
+		err_msg("Failed to open BPF action program: %s\n", program_path);
+		goto out_err;
+	}
+
+	err = bpf_object__load(obj);
+	if (err) {
+		err_msg("Failed to load BPF action program: %s\n", program_path);
+		goto out_obj_err;
+	}
+
+	prog = bpf_object__find_program_by_name(obj, "action_handler");
+	if (!prog) {
+		err_msg("BPF action program must have 'action_handler' function: %s\n",
+			program_path);
+		goto out_obj_err;
+	}
+
+	err = timerlat_bpf_set_action(prog);
+	if (err) {
+		err_msg("Failed to register BPF action program: %s\n", program_path);
+		goto out_prog_err;
+	}
+
+	return 0;
+
+out_prog_err:
+	prog = NULL;
+out_obj_err:
+	bpf_object__close(obj);
+	obj = NULL;
+out_err:
+	return 1;
+}
+
 #endif /* HAVE_BPF_SKEL */
diff --git a/tools/tracing/rtla/src/timerlat_bpf.h b/tools/tracing/rtla/src/timerlat_bpf.h
index b5009092c7a3..169abeaf4363 100644
--- a/tools/tracing/rtla/src/timerlat_bpf.h
+++ b/tools/tracing/rtla/src/timerlat_bpf.h
@@ -30,7 +30,7 @@ int timerlat_bpf_get_summary_value(enum summary_field key,
 				   long long *value_thread,
 				   long long *value_user,
 				   int cpus);
-
+int timerlat_load_bpf_action_program(const char *program_path);
 static inline int have_libbpf_support(void) { return 1; }
 #else
 static inline int timerlat_bpf_init(struct timerlat_params *params)
@@ -58,6 +58,10 @@ static inline int timerlat_bpf_get_summary_value(enum summary_field key,
 {
 	return -1;
 }
+static inline int timerlat_load_bpf_action_program(const char *program_path)
+{
+	return -1;
+}
 static inline int have_libbpf_support(void) { return 0; }
 #endif /* HAVE_BPF_SKEL */
 #endif /* __bpf__ */
diff --git a/tools/tracing/rtla/src/timerlat_hist.c b/tools/tracing/rtla/src/timerlat_hist.c
index 2a5c543217ba..ec43e6fda743 100644
--- a/tools/tracing/rtla/src/timerlat_hist.c
+++ b/tools/tracing/rtla/src/timerlat_hist.c
@@ -746,6 +746,7 @@ static void timerlat_hist_usage(void)
 		"	     --deepest-idle-state n: only go down to idle state n on cpus used by timerlat to reduce exit from idle latency",
 		"	     --on-threshold <action>: define action to be executed at latency threshold, multiple are allowed",
 		"	     --on-end <action>: define action to be executed at measurement end, multiple are allowed",
+		"	     --bpf-action <program>: load and execute BPF program when latency threshold is exceeded",
 		NULL,
 	};
 
@@ -825,6 +826,7 @@ static struct common_params
 			{"deepest-idle-state",	required_argument,	0, '\4'},
 			{"on-threshold",	required_argument,	0, '\5'},
 			{"on-end",		required_argument,	0, '\6'},
+			{"bpf-action",		required_argument,	0, '\7'},
 			{0, 0, 0, 0}
 		};
 
@@ -1006,6 +1008,9 @@ static struct common_params
 			if (retval)
 				fatal("Invalid action %s", optarg);
 			break;
+		case '\7':
+			params->bpf_action_program = optarg;
+			break;
 		default:
 			fatal("Invalid option");
 		}
diff --git a/tools/tracing/rtla/src/timerlat_top.c b/tools/tracing/rtla/src/timerlat_top.c
index 9ed8b931552f..af20b3eee472 100644
--- a/tools/tracing/rtla/src/timerlat_top.c
+++ b/tools/tracing/rtla/src/timerlat_top.c
@@ -518,6 +518,7 @@ static void timerlat_top_usage(void)
 		"	     --deepest-idle-state n: only go down to idle state n on cpus used by timerlat to reduce exit from idle latency",
 		"	     --on-threshold <action>: define action to be executed at latency threshold, multiple are allowed",
 		"	     --on-end: define action to be executed at measurement end, multiple are allowed",
+		"	     --bpf-action <program>: load and execute BPF program when latency threshold is exceeded",
 		NULL,
 	};
 
@@ -589,6 +590,7 @@ static struct common_params
 			{"deepest-idle-state",	required_argument,	0, '8'},
 			{"on-threshold",	required_argument,	0, '9'},
 			{"on-end",		required_argument,	0, '\1'},
+			{"bpf-action",		required_argument,	0, '\2'},
 			{0, 0, 0, 0}
 		};
 
@@ -756,6 +758,9 @@ static struct common_params
 			if (retval)
 				fatal("Invalid action %s", optarg);
 			break;
+		case '\2':
+			params->bpf_action_program = optarg;
+			break;
 		default:
 			fatal("Invalid option");
 		}
-- 
cgit v1.2.3


From 0304a3b7ec9a207637ab6f360a41af5fb25e1f44 Mon Sep 17 00:00:00 2001
From: Tomas Glozar <tglozar@redhat.com>
Date: Wed, 26 Nov 2025 15:42:01 +0100
Subject: rtla/timerlat: Add example for BPF action program

Add an example BPF action program that prints the measured latency to
the tracefs buffer via bpf_printk().

A new Makefile target, "examples", is added to build the example. In
addition, "sample/" subfolder is renamed to "example".

If BPF skeleton support is unavailable or disabled, a warning will be
displayed when building the BPF action program example.

Link: https://lore.kernel.org/r/20251126144205.331954-4-tglozar@redhat.com
Signed-off-by: Tomas Glozar <tglozar@redhat.com>
---
 tools/tracing/rtla/Makefile                      |  9 ++-
 tools/tracing/rtla/example/timerlat_bpf_action.c | 16 +++++
 tools/tracing/rtla/example/timerlat_load.py      | 78 ++++++++++++++++++++++++
 tools/tracing/rtla/sample/timerlat_load.py       | 78 ------------------------
 4 files changed, 102 insertions(+), 79 deletions(-)
 create mode 100644 tools/tracing/rtla/example/timerlat_bpf_action.c
 create mode 100644 tools/tracing/rtla/example/timerlat_load.py
 delete mode 100644 tools/tracing/rtla/sample/timerlat_load.py

(limited to 'tools')

diff --git a/tools/tracing/rtla/Makefile b/tools/tracing/rtla/Makefile
index 746ccf2f5808..5f1529ce3693 100644
--- a/tools/tracing/rtla/Makefile
+++ b/tools/tracing/rtla/Makefile
@@ -73,9 +73,15 @@ src/timerlat.bpf.o: src/timerlat.bpf.c
 
 src/timerlat.skel.h: src/timerlat.bpf.o
 	$(QUIET_GENSKEL)$(SYSTEM_BPFTOOL) gen skeleton $< > $@
+
+example/timerlat_bpf_action.o: example/timerlat_bpf_action.c
+	$(QUIET_CLANG)$(CLANG) -g -O2 -target bpf -c $(filter %.c,$^) -o $@
 else
 src/timerlat.skel.h:
 	$(Q)echo '/* BPF skeleton is disabled */' > src/timerlat.skel.h
+
+example/timerlat_bpf_action.o: example/timerlat_bpf_action.c
+	$(Q)echo "BPF skeleton support is disabled, skipping example/timerlat_bpf_action.o"
 endif
 
 $(RTLA): $(RTLA_IN)
@@ -96,7 +102,8 @@ clean: doc_clean fixdep-clean
 	$(Q)find . -name '*.o' -delete -o -name '\.*.cmd' -delete -o -name '\.*.d' -delete
 	$(Q)rm -f rtla rtla-static fixdep FEATURE-DUMP rtla-*
 	$(Q)rm -rf feature
-	$(Q)rm -f src/timerlat.bpf.o src/timerlat.skel.h
+	$(Q)rm -f src/timerlat.bpf.o src/timerlat.skel.h example/timerlat_bpf_action.o
 check: $(RTLA)
 	RTLA=$(RTLA) prove -o -f tests/
+examples: example/timerlat_bpf_action.o
 .PHONY: FORCE clean check
diff --git a/tools/tracing/rtla/example/timerlat_bpf_action.c b/tools/tracing/rtla/example/timerlat_bpf_action.c
new file mode 100644
index 000000000000..ac1be049a848
--- /dev/null
+++ b/tools/tracing/rtla/example/timerlat_bpf_action.c
@@ -0,0 +1,16 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/bpf.h>
+#include <bpf/bpf_tracing.h>
+
+char LICENSE[] SEC("license") = "GPL";
+
+struct trace_event_raw_timerlat_sample {
+	unsigned long long timer_latency;
+} __attribute__((preserve_access_index));
+
+SEC("tp/timerlat_action")
+int action_handler(struct trace_event_raw_timerlat_sample *tp_args)
+{
+	bpf_printk("Latency: %lld\n", tp_args->timer_latency);
+	return 0;
+}
diff --git a/tools/tracing/rtla/example/timerlat_load.py b/tools/tracing/rtla/example/timerlat_load.py
new file mode 100644
index 000000000000..a819c3588073
--- /dev/null
+++ b/tools/tracing/rtla/example/timerlat_load.py
@@ -0,0 +1,78 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# Copyright (C) 2024 Red Hat, Inc. Daniel Bristot de Oliveira <bristot@kernel.org>
+#
+# This is a sample code about how to use timerlat's timer by any workload
+# so rtla can measure and provide auto-analysis for the overall latency (IOW
+# the response time) for a task.
+#
+# Before running it, you need to dispatch timerlat with -U option in a terminal.
+# Then # run this script pinned to a CPU on another terminal. For example:
+#
+# timerlat_load.py 1 -p 95
+#
+# The "Timerlat IRQ" is the IRQ latency, The thread latency is the latency
+# for the python process to get the CPU. The Ret from user Timer Latency is
+# the overall latency. In other words, it is the response time for that
+# activation.
+#
+# This is just an example, the load is reading 20MB of data from /dev/full
+# It is in python because it is easy to read :-)
+
+import argparse
+import sys
+import os
+
+parser = argparse.ArgumentParser(description='user-space timerlat thread in Python')
+parser.add_argument("cpu", type=int, help='CPU to run timerlat thread')
+parser.add_argument("-p", "--prio", type=int, help='FIFO priority')
+args = parser.parse_args()
+
+try:
+    affinity_mask = {args.cpu}
+    os.sched_setaffinity(0, affinity_mask)
+except Exception as e:
+    print(f"Error setting affinity: {e}")
+    sys.exit(1)
+
+if args.prio:
+    try:
+        param = os.sched_param(args.prio)
+        os.sched_setscheduler(0, os.SCHED_FIFO, param)
+    except Exception as e:
+        print(f"Error setting priority: {e}")
+        sys.exit(1)
+
+try:
+    timerlat_path = f"/sys/kernel/tracing/osnoise/per_cpu/cpu{args.cpu}/timerlat_fd"
+    timerlat_fd = open(timerlat_path, 'r')
+except PermissionError:
+    print("Permission denied. Please check your access rights.")
+    sys.exit(1)
+except OSError:
+    print("Error opening timerlat fd, did you run timerlat -U?")
+    sys.exit(1)
+
+try:
+    data_fd = open("/dev/full", 'r')
+except Exception as e:
+    print(f"Error opening data fd: {e}")
+    sys.exit(1)
+
+while True:
+    try:
+        timerlat_fd.read(1)
+        data_fd.read(20 * 1024 * 1024)
+    except KeyboardInterrupt:
+        print("Leaving")
+        break
+    except IOError as e:
+        print(f"I/O error occurred: {e}")
+        break
+    except Exception as e:
+        print(f"Unexpected error: {e}")
+        break
+
+timerlat_fd.close()
+data_fd.close()
diff --git a/tools/tracing/rtla/sample/timerlat_load.py b/tools/tracing/rtla/sample/timerlat_load.py
deleted file mode 100644
index a819c3588073..000000000000
--- a/tools/tracing/rtla/sample/timerlat_load.py
+++ /dev/null
@@ -1,78 +0,0 @@
-#!/usr/bin/env python3
-# SPDX-License-Identifier: GPL-2.0-only
-#
-# Copyright (C) 2024 Red Hat, Inc. Daniel Bristot de Oliveira <bristot@kernel.org>
-#
-# This is a sample code about how to use timerlat's timer by any workload
-# so rtla can measure and provide auto-analysis for the overall latency (IOW
-# the response time) for a task.
-#
-# Before running it, you need to dispatch timerlat with -U option in a terminal.
-# Then # run this script pinned to a CPU on another terminal. For example:
-#
-# timerlat_load.py 1 -p 95
-#
-# The "Timerlat IRQ" is the IRQ latency, The thread latency is the latency
-# for the python process to get the CPU. The Ret from user Timer Latency is
-# the overall latency. In other words, it is the response time for that
-# activation.
-#
-# This is just an example, the load is reading 20MB of data from /dev/full
-# It is in python because it is easy to read :-)
-
-import argparse
-import sys
-import os
-
-parser = argparse.ArgumentParser(description='user-space timerlat thread in Python')
-parser.add_argument("cpu", type=int, help='CPU to run timerlat thread')
-parser.add_argument("-p", "--prio", type=int, help='FIFO priority')
-args = parser.parse_args()
-
-try:
-    affinity_mask = {args.cpu}
-    os.sched_setaffinity(0, affinity_mask)
-except Exception as e:
-    print(f"Error setting affinity: {e}")
-    sys.exit(1)
-
-if args.prio:
-    try:
-        param = os.sched_param(args.prio)
-        os.sched_setscheduler(0, os.SCHED_FIFO, param)
-    except Exception as e:
-        print(f"Error setting priority: {e}")
-        sys.exit(1)
-
-try:
-    timerlat_path = f"/sys/kernel/tracing/osnoise/per_cpu/cpu{args.cpu}/timerlat_fd"
-    timerlat_fd = open(timerlat_path, 'r')
-except PermissionError:
-    print("Permission denied. Please check your access rights.")
-    sys.exit(1)
-except OSError:
-    print("Error opening timerlat fd, did you run timerlat -U?")
-    sys.exit(1)
-
-try:
-    data_fd = open("/dev/full", 'r')
-except Exception as e:
-    print(f"Error opening data fd: {e}")
-    sys.exit(1)
-
-while True:
-    try:
-        timerlat_fd.read(1)
-        data_fd.read(20 * 1024 * 1024)
-    except KeyboardInterrupt:
-        print("Leaving")
-        break
-    except IOError as e:
-        print(f"I/O error occurred: {e}")
-        break
-    except Exception as e:
-        print(f"Unexpected error: {e}")
-        break
-
-timerlat_fd.close()
-data_fd.close()
-- 
cgit v1.2.3


From 5525aebd4e0c6f7d92ec1cb074218bbcf3d46f13 Mon Sep 17 00:00:00 2001
From: Tomas Glozar <tglozar@redhat.com>
Date: Wed, 26 Nov 2025 15:42:02 +0100
Subject: rtla/tests: Test BPF action program

Add a test that implements a BPF program writing to a test map, which
is attached to RTLA via --bpf-action to be executed on theshold
overflow.

A combination of --on-threshold shell with bpftool (which is always
present if BPF support is enabled) is used to check whether the BPF
program has executed successfully.

Suggested-by: Crystal Wood <crwood@redhat.com>
Link: https://lore.kernel.org/r/20251126144205.331954-5-tglozar@redhat.com
Signed-off-by: Tomas Glozar <tglozar@redhat.com>
---
 tools/tracing/rtla/Makefile                   | 10 ++++++++--
 tools/tracing/rtla/tests/bpf/bpf_action_map.c | 25 +++++++++++++++++++++++++
 tools/tracing/rtla/tests/timerlat.t           | 15 +++++++++++++++
 3 files changed, 48 insertions(+), 2 deletions(-)
 create mode 100644 tools/tracing/rtla/tests/bpf/bpf_action_map.c

(limited to 'tools')

diff --git a/tools/tracing/rtla/Makefile b/tools/tracing/rtla/Makefile
index 5f1529ce3693..aef814b639b7 100644
--- a/tools/tracing/rtla/Makefile
+++ b/tools/tracing/rtla/Makefile
@@ -76,12 +76,18 @@ src/timerlat.skel.h: src/timerlat.bpf.o
 
 example/timerlat_bpf_action.o: example/timerlat_bpf_action.c
 	$(QUIET_CLANG)$(CLANG) -g -O2 -target bpf -c $(filter %.c,$^) -o $@
+
+tests/bpf/bpf_action_map.o: tests/bpf/bpf_action_map.c
+	$(QUIET_CLANG)$(CLANG) -g -O2 -target bpf -c $(filter %.c,$^) -o $@
 else
 src/timerlat.skel.h:
 	$(Q)echo '/* BPF skeleton is disabled */' > src/timerlat.skel.h
 
 example/timerlat_bpf_action.o: example/timerlat_bpf_action.c
 	$(Q)echo "BPF skeleton support is disabled, skipping example/timerlat_bpf_action.o"
+
+tests/bpf/bpf_action_map.o: tests/bpf/bpf_action_map.c
+	$(Q)echo "BPF skeleton support is disabled, skipping tests/bpf/bpf_action_map.o"
 endif
 
 $(RTLA): $(RTLA_IN)
@@ -103,7 +109,7 @@ clean: doc_clean fixdep-clean
 	$(Q)rm -f rtla rtla-static fixdep FEATURE-DUMP rtla-*
 	$(Q)rm -rf feature
 	$(Q)rm -f src/timerlat.bpf.o src/timerlat.skel.h example/timerlat_bpf_action.o
-check: $(RTLA)
-	RTLA=$(RTLA) prove -o -f tests/
+check: $(RTLA) tests/bpf/bpf_action_map.o
+	RTLA=$(RTLA) BPFTOOL=$(SYSTEM_BPFTOOL) prove -o -f tests/
 examples: example/timerlat_bpf_action.o
 .PHONY: FORCE clean check
diff --git a/tools/tracing/rtla/tests/bpf/bpf_action_map.c b/tools/tracing/rtla/tests/bpf/bpf_action_map.c
new file mode 100644
index 000000000000..1686e0b858e6
--- /dev/null
+++ b/tools/tracing/rtla/tests/bpf/bpf_action_map.c
@@ -0,0 +1,25 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/bpf.h>
+#include <bpf/bpf_tracing.h>
+
+char LICENSE[] SEC("license") = "GPL";
+
+struct {
+	__uint(type, BPF_MAP_TYPE_ARRAY);
+	__uint(max_entries, 1);
+	__type(key, unsigned int);
+	__type(value, unsigned long long);
+} rtla_test_map SEC(".maps");
+
+struct trace_event_raw_timerlat_sample;
+
+SEC("tp/timerlat_action")
+int action_handler(struct trace_event_raw_timerlat_sample *tp_args)
+{
+	unsigned int key = 0;
+	unsigned long long value = 42;
+
+	bpf_map_update_elem(&rtla_test_map, &key, &value, BPF_ANY);
+
+	return 0;
+}
diff --git a/tools/tracing/rtla/tests/timerlat.t b/tools/tracing/rtla/tests/timerlat.t
index bbaa1897d8a8..fd4935fd7b49 100644
--- a/tools/tracing/rtla/tests/timerlat.t
+++ b/tools/tracing/rtla/tests/timerlat.t
@@ -67,6 +67,21 @@ check "hist with trace output at end" \
 	"timerlat hist -d 1s --on-end trace" 0 "^  Saving trace to timerlat_trace.txt$"
 check "top with trace output at end" \
 	"timerlat top -d 1s --on-end trace" 0 "^  Saving trace to timerlat_trace.txt$"
+
+# BPF action program tests
+if [ "$option" -eq 0 ]
+then
+	# Test BPF action program properly in BPF mode
+	[ -z "$BPFTOOL" ] && BPFTOOL=bpftool
+	check "hist with BPF action program (BPF mode)" \
+		"timerlat hist -T 2 --bpf-action tests/bpf/bpf_action_map.o --on-threshold shell,command='$BPFTOOL map dump name rtla_test_map'" \
+		2 '"value": 42'
+else
+	# Test BPF action program failure in non-BPF mode
+	check "hist with BPF action program (non-BPF mode)" \
+		"timerlat hist -T 2 --bpf-action tests/bpf/bpf_action_map.o" \
+		1 "BPF actions are not supported in tracefs-only mode"
+fi
 done
 
 test_end
-- 
cgit v1.2.3


From fbb8ed6682f84e6e27c798a3117b0bcd4d0623c4 Mon Sep 17 00:00:00 2001
From: Tomas Glozar <tglozar@redhat.com>
Date: Wed, 26 Nov 2025 15:42:03 +0100
Subject: rtla/tests: Run Test::Harness in verbose mode

Add -v flag to prove command to also print the names of tests that
succeeded, not only those that failed, to allow easier debugging of the
test suite.

Also, drop printing the option and value to stdout in
check_with_osnoise_options, which was a debugging print that was
accidentally left in the final commit, and which would be otherwise now
visible in make check output, as stdout is no longer suppressed.

Suggested-by: Crystal Wood <crwood@redhat.com>
Reviewed-by: Wander Lairson Costa <wander@redhat.com>
Link: https://lore.kernel.org/r/20251126144205.331954-6-tglozar@redhat.com
Signed-off-by: Tomas Glozar <tglozar@redhat.com>
---
 tools/tracing/rtla/Makefile        | 2 +-
 tools/tracing/rtla/tests/engine.sh | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/tracing/rtla/Makefile b/tools/tracing/rtla/Makefile
index aef814b639b7..2701256abaf3 100644
--- a/tools/tracing/rtla/Makefile
+++ b/tools/tracing/rtla/Makefile
@@ -110,6 +110,6 @@ clean: doc_clean fixdep-clean
 	$(Q)rm -rf feature
 	$(Q)rm -f src/timerlat.bpf.o src/timerlat.skel.h example/timerlat_bpf_action.o
 check: $(RTLA) tests/bpf/bpf_action_map.o
-	RTLA=$(RTLA) BPFTOOL=$(SYSTEM_BPFTOOL) prove -o -f tests/
+	RTLA=$(RTLA) BPFTOOL=$(SYSTEM_BPFTOOL) prove -o -f -v tests/
 examples: example/timerlat_bpf_action.o
 .PHONY: FORCE clean check
diff --git a/tools/tracing/rtla/tests/engine.sh b/tools/tracing/rtla/tests/engine.sh
index c7de3d6ed6a8..ed261e07c6d9 100644
--- a/tools/tracing/rtla/tests/engine.sh
+++ b/tools/tracing/rtla/tests/engine.sh
@@ -105,7 +105,6 @@ check_with_osnoise_options() {
 			[ "$1" == "" ] && continue
 			option=$(echo $1 | cut -d '=' -f 1)
 			value=$(echo $1 | cut -d '=' -f 2)
-			echo "option: $option, value: $value"
 			echo "$value" > "/sys/kernel/tracing/osnoise/$option" || return 1
 		done
 	fi
-- 
cgit v1.2.3


From 850cd24cb6d648262b994b99e189409b21a2c09b Mon Sep 17 00:00:00 2001
From: Costa Shulyupin <costa.shul@redhat.com>
Date: Tue, 9 Dec 2025 12:00:40 +0200
Subject: tools/rtla: Add common_parse_options()

Each rtla tool duplicates parsing of many common options. This creates
maintenance overhead and risks inconsistencies when updating these
options.

Add common_parse_options() to centralize parsing of options used across
all tools.

Common options to be migrated in future patches.

Changes since v1:
- restore opterr

Signed-off-by: Costa Shulyupin <costa.shul@redhat.com>
Link: https://lore.kernel.org/r/20251209100047.2692515-1-costa.shul@redhat.com
Signed-off-by: Tomas Glozar <tglozar@redhat.com>
---
 tools/tracing/rtla/src/common.c        | 35 ++++++++++++++++++++++++++++++++++
 tools/tracing/rtla/src/common.h        |  1 +
 tools/tracing/rtla/src/osnoise_hist.c  |  3 +++
 tools/tracing/rtla/src/osnoise_top.c   |  3 +++
 tools/tracing/rtla/src/timerlat_hist.c |  3 +++
 tools/tracing/rtla/src/timerlat_top.c  |  3 +++
 6 files changed, 48 insertions(+)

(limited to 'tools')

diff --git a/tools/tracing/rtla/src/common.c b/tools/tracing/rtla/src/common.c
index 009a4bce9737..c01de7972bea 100644
--- a/tools/tracing/rtla/src/common.c
+++ b/tools/tracing/rtla/src/common.c
@@ -6,6 +6,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
+#include <getopt.h>
 #include "common.h"
 
 struct trace_instance *trace_inst;
@@ -38,6 +39,40 @@ static void set_signals(struct common_params *params)
 	}
 }
 
+/*
+ * common_parse_options - parse common command line options
+ *
+ * @argc: argument count
+ * @argv: argument vector
+ * @common: common parameters structure
+ *
+ * Parse command line options that are common to all rtla tools.
+ *
+ * Returns: non zero if a common option was parsed, or 0
+ * if the option should be handled by tool-specific parsing.
+ */
+int common_parse_options(int argc, char **argv, struct common_params *common)
+{
+	int saved_state = optind;
+	int c;
+
+	static struct option long_options[] = {
+		{0, 0, 0, 0}
+	};
+
+	opterr = 0;
+	c = getopt_long(argc, argv, "", long_options, NULL);
+	opterr = 1;
+
+	switch (c) {
+	default:
+		optind = saved_state;
+		return 0;
+	}
+
+	return c;
+}
+
 /*
  * common_apply_config - apply common configs to the initialized tool
  */
diff --git a/tools/tracing/rtla/src/common.h b/tools/tracing/rtla/src/common.h
index c48c9bfd20e3..ef17ea5be540 100644
--- a/tools/tracing/rtla/src/common.h
+++ b/tools/tracing/rtla/src/common.h
@@ -156,6 +156,7 @@ int osnoise_set_stop_us(struct osnoise_context *context, long long stop_us);
 int osnoise_set_stop_total_us(struct osnoise_context *context,
 			      long long stop_total_us);
 
+int common_parse_options(int argc, char **argv, struct common_params *common);
 int common_apply_config(struct osnoise_tool *tool, struct common_params *params);
 int top_main_loop(struct osnoise_tool *tool);
 int hist_main_loop(struct osnoise_tool *tool);
diff --git a/tools/tracing/rtla/src/osnoise_hist.c b/tools/tracing/rtla/src/osnoise_hist.c
index 372128db9e4a..d5c78e07bf60 100644
--- a/tools/tracing/rtla/src/osnoise_hist.c
+++ b/tools/tracing/rtla/src/osnoise_hist.c
@@ -512,6 +512,9 @@ static struct common_params
 			{0, 0, 0, 0}
 		};
 
+		if (common_parse_options(argc, argv, &params->common))
+			continue;
+
 		c = getopt_long(argc, argv, "a:c:C::b:d:e:E:DhH:p:P:r:s:S:t::T:01234:5:6:7:",
 				 long_options, NULL);
 
diff --git a/tools/tracing/rtla/src/osnoise_top.c b/tools/tracing/rtla/src/osnoise_top.c
index 1db1d946b600..2bb154da1139 100644
--- a/tools/tracing/rtla/src/osnoise_top.c
+++ b/tools/tracing/rtla/src/osnoise_top.c
@@ -363,6 +363,9 @@ struct common_params *osnoise_top_parse_args(int argc, char **argv)
 			{0, 0, 0, 0}
 		};
 
+		if (common_parse_options(argc, argv, &params->common))
+			continue;
+
 		c = getopt_long(argc, argv, "a:c:C::d:De:hH:p:P:qr:s:S:t::T:0:1:2:3:",
 				 long_options, NULL);
 
diff --git a/tools/tracing/rtla/src/timerlat_hist.c b/tools/tracing/rtla/src/timerlat_hist.c
index ec43e6fda743..a17111b6aa6d 100644
--- a/tools/tracing/rtla/src/timerlat_hist.c
+++ b/tools/tracing/rtla/src/timerlat_hist.c
@@ -830,6 +830,9 @@ static struct common_params
 			{0, 0, 0, 0}
 		};
 
+		if (common_parse_options(argc, argv, &params->common))
+			continue;
+
 		c = getopt_long(argc, argv, "a:c:C::b:d:e:E:DhH:i:knp:P:s:t::T:uU0123456:7:8:9\1\2:\3:",
 				 long_options, NULL);
 
diff --git a/tools/tracing/rtla/src/timerlat_top.c b/tools/tracing/rtla/src/timerlat_top.c
index af20b3eee472..b14a785361b1 100644
--- a/tools/tracing/rtla/src/timerlat_top.c
+++ b/tools/tracing/rtla/src/timerlat_top.c
@@ -594,6 +594,9 @@ static struct common_params
 			{0, 0, 0, 0}
 		};
 
+		if (common_parse_options(argc, argv, &params->common))
+			continue;
+
 		c = getopt_long(argc, argv, "a:c:C::d:De:hH:i:knp:P:qs:t::T:uU0:1:2:345:6:7:",
 				 long_options, NULL);
 
-- 
cgit v1.2.3


From 28dc445919bf4019ffdaf65d94bf26ace25d4a5e Mon Sep 17 00:00:00 2001
From: Costa Shulyupin <costa.shul@redhat.com>
Date: Tue, 9 Dec 2025 12:00:41 +0200
Subject: tools/rtla: Consolidate -c/--cpus option parsing

Each rtla tool duplicates parsing of -c/--cpus.

Migrate the option parsing from individual tools to the
common_parse_options().

Signed-off-by: Costa Shulyupin <costa.shul@redhat.com>
Link: https://lore.kernel.org/r/20251209100047.2692515-2-costa.shul@redhat.com
Signed-off-by: Tomas Glozar <tglozar@redhat.com>
---
 tools/tracing/rtla/src/common.c        | 8 +++++++-
 tools/tracing/rtla/src/osnoise_hist.c  | 9 +--------
 tools/tracing/rtla/src/osnoise_top.c   | 9 +--------
 tools/tracing/rtla/src/timerlat_hist.c | 9 +--------
 tools/tracing/rtla/src/timerlat_top.c  | 9 +--------
 5 files changed, 11 insertions(+), 33 deletions(-)

(limited to 'tools')

diff --git a/tools/tracing/rtla/src/common.c b/tools/tracing/rtla/src/common.c
index c01de7972bea..1b9e0108b0ba 100644
--- a/tools/tracing/rtla/src/common.c
+++ b/tools/tracing/rtla/src/common.c
@@ -57,14 +57,20 @@ int common_parse_options(int argc, char **argv, struct common_params *common)
 	int c;
 
 	static struct option long_options[] = {
+		{"cpus",                required_argument,      0, 'c'},
 		{0, 0, 0, 0}
 	};
 
 	opterr = 0;
-	c = getopt_long(argc, argv, "", long_options, NULL);
+	c = getopt_long(argc, argv, "c:", long_options, NULL);
 	opterr = 1;
 
 	switch (c) {
+	case 'c':
+		if (parse_cpu_set(optarg, &common->monitored_cpus))
+			fatal("Invalid -c cpu list");
+		common->cpus = optarg;
+		break;
 	default:
 		optind = saved_state;
 		return 0;
diff --git a/tools/tracing/rtla/src/osnoise_hist.c b/tools/tracing/rtla/src/osnoise_hist.c
index d5c78e07bf60..443cb7f0e3a2 100644
--- a/tools/tracing/rtla/src/osnoise_hist.c
+++ b/tools/tracing/rtla/src/osnoise_hist.c
@@ -485,7 +485,6 @@ static struct common_params
 			{"auto",		required_argument,	0, 'a'},
 			{"bucket-size",		required_argument,	0, 'b'},
 			{"entries",		required_argument,	0, 'E'},
-			{"cpus",		required_argument,	0, 'c'},
 			{"cgroup",		optional_argument,	0, 'C'},
 			{"debug",		no_argument,		0, 'D'},
 			{"duration",		required_argument,	0, 'd'},
@@ -515,7 +514,7 @@ static struct common_params
 		if (common_parse_options(argc, argv, &params->common))
 			continue;
 
-		c = getopt_long(argc, argv, "a:c:C::b:d:e:E:DhH:p:P:r:s:S:t::T:01234:5:6:7:",
+		c = getopt_long(argc, argv, "a:C::b:d:e:E:DhH:p:P:r:s:S:t::T:01234:5:6:7:",
 				 long_options, NULL);
 
 		/* detect the end of the options. */
@@ -541,12 +540,6 @@ static struct common_params
 			    params->common.hist.bucket_size >= 1000000)
 				fatal("Bucket size needs to be > 0 and <= 1000000");
 			break;
-		case 'c':
-			retval = parse_cpu_set(optarg, &params->common.monitored_cpus);
-			if (retval)
-				fatal("Invalid -c cpu list");
-			params->common.cpus = optarg;
-			break;
 		case 'C':
 			params->common.cgroup = 1;
 			params->common.cgroup_name = parse_optional_arg(argc, argv);
diff --git a/tools/tracing/rtla/src/osnoise_top.c b/tools/tracing/rtla/src/osnoise_top.c
index 2bb154da1139..8ba0757225b3 100644
--- a/tools/tracing/rtla/src/osnoise_top.c
+++ b/tools/tracing/rtla/src/osnoise_top.c
@@ -339,7 +339,6 @@ struct common_params *osnoise_top_parse_args(int argc, char **argv)
 	while (1) {
 		static struct option long_options[] = {
 			{"auto",		required_argument,	0, 'a'},
-			{"cpus",		required_argument,	0, 'c'},
 			{"cgroup",		optional_argument,	0, 'C'},
 			{"debug",		no_argument,		0, 'D'},
 			{"duration",		required_argument,	0, 'd'},
@@ -366,7 +365,7 @@ struct common_params *osnoise_top_parse_args(int argc, char **argv)
 		if (common_parse_options(argc, argv, &params->common))
 			continue;
 
-		c = getopt_long(argc, argv, "a:c:C::d:De:hH:p:P:qr:s:S:t::T:0:1:2:3:",
+		c = getopt_long(argc, argv, "a:C::d:De:hH:p:P:qr:s:S:t::T:0:1:2:3:",
 				 long_options, NULL);
 
 		/* Detect the end of the options. */
@@ -385,12 +384,6 @@ struct common_params *osnoise_top_parse_args(int argc, char **argv)
 			if (!trace_output)
 				trace_output = "osnoise_trace.txt";
 
-			break;
-		case 'c':
-			retval = parse_cpu_set(optarg, &params->common.monitored_cpus);
-			if (retval)
-				fatal("Invalid -c cpu list");
-			params->common.cpus = optarg;
 			break;
 		case 'C':
 			params->common.cgroup = 1;
diff --git a/tools/tracing/rtla/src/timerlat_hist.c b/tools/tracing/rtla/src/timerlat_hist.c
index a17111b6aa6d..3571aa8233c1 100644
--- a/tools/tracing/rtla/src/timerlat_hist.c
+++ b/tools/tracing/rtla/src/timerlat_hist.c
@@ -791,7 +791,6 @@ static struct common_params
 	while (1) {
 		static struct option long_options[] = {
 			{"auto",		required_argument,	0, 'a'},
-			{"cpus",		required_argument,	0, 'c'},
 			{"cgroup",		optional_argument,	0, 'C'},
 			{"bucket-size",		required_argument,	0, 'b'},
 			{"debug",		no_argument,		0, 'D'},
@@ -833,7 +832,7 @@ static struct common_params
 		if (common_parse_options(argc, argv, &params->common))
 			continue;
 
-		c = getopt_long(argc, argv, "a:c:C::b:d:e:E:DhH:i:knp:P:s:t::T:uU0123456:7:8:9\1\2:\3:",
+		c = getopt_long(argc, argv, "a:C::b:d:e:E:DhH:i:knp:P:s:t::T:uU0123456:7:8:9\1\2:\3:",
 				 long_options, NULL);
 
 		/* detect the end of the options. */
@@ -855,12 +854,6 @@ static struct common_params
 			if (!trace_output)
 				trace_output = "timerlat_trace.txt";
 
-			break;
-		case 'c':
-			retval = parse_cpu_set(optarg, &params->common.monitored_cpus);
-			if (retval)
-				fatal("Invalid -c cpu list");
-			params->common.cpus = optarg;
 			break;
 		case 'C':
 			params->common.cgroup = 1;
diff --git a/tools/tracing/rtla/src/timerlat_top.c b/tools/tracing/rtla/src/timerlat_top.c
index b14a785361b1..b0a048cded29 100644
--- a/tools/tracing/rtla/src/timerlat_top.c
+++ b/tools/tracing/rtla/src/timerlat_top.c
@@ -561,7 +561,6 @@ static struct common_params
 	while (1) {
 		static struct option long_options[] = {
 			{"auto",		required_argument,	0, 'a'},
-			{"cpus",		required_argument,	0, 'c'},
 			{"cgroup",		optional_argument,	0, 'C'},
 			{"debug",		no_argument,		0, 'D'},
 			{"duration",		required_argument,	0, 'd'},
@@ -597,7 +596,7 @@ static struct common_params
 		if (common_parse_options(argc, argv, &params->common))
 			continue;
 
-		c = getopt_long(argc, argv, "a:c:C::d:De:hH:i:knp:P:qs:t::T:uU0:1:2:345:6:7:",
+		c = getopt_long(argc, argv, "a:C::d:De:hH:i:knp:P:qs:t::T:uU0:1:2:345:6:7:",
 				 long_options, NULL);
 
 		/* detect the end of the options. */
@@ -634,12 +633,6 @@ static struct common_params
 			/* set aa_only to avoid parsing the trace */
 			params->common.aa_only = 1;
 			break;
-		case 'c':
-			retval = parse_cpu_set(optarg, &params->common.monitored_cpus);
-			if (retval)
-				fatal("Invalid -c cpu list");
-			params->common.cpus = optarg;
-			break;
 		case 'C':
 			params->common.cgroup = 1;
 			params->common.cgroup_name = optarg;
-- 
cgit v1.2.3


From edb23c8372222395fd4e4297240cbe2191425dbf Mon Sep 17 00:00:00 2001
From: Costa Shulyupin <costa.shul@redhat.com>
Date: Tue, 9 Dec 2025 12:00:42 +0200
Subject: tools/rtla: Consolidate -C/--cgroup option parsing

Each rtla tool duplicates parsing of -C/--cgroup.

Migrate the option parsing from individual tools to the
common_parse_options().

Signed-off-by: Costa Shulyupin <costa.shul@redhat.com>
Link: https://lore.kernel.org/r/20251209100047.2692515-3-costa.shul@redhat.com
Signed-off-by: Tomas Glozar <tglozar@redhat.com>
---
 tools/tracing/rtla/src/common.c        | 7 ++++++-
 tools/tracing/rtla/src/osnoise_hist.c  | 7 +------
 tools/tracing/rtla/src/osnoise_top.c   | 7 +------
 tools/tracing/rtla/src/timerlat_hist.c | 7 +------
 tools/tracing/rtla/src/timerlat_top.c  | 7 +------
 5 files changed, 10 insertions(+), 25 deletions(-)

(limited to 'tools')

diff --git a/tools/tracing/rtla/src/common.c b/tools/tracing/rtla/src/common.c
index 1b9e0108b0ba..3400836f66ef 100644
--- a/tools/tracing/rtla/src/common.c
+++ b/tools/tracing/rtla/src/common.c
@@ -58,11 +58,12 @@ int common_parse_options(int argc, char **argv, struct common_params *common)
 
 	static struct option long_options[] = {
 		{"cpus",                required_argument,      0, 'c'},
+		{"cgroup",              optional_argument,      0, 'C'},
 		{0, 0, 0, 0}
 	};
 
 	opterr = 0;
-	c = getopt_long(argc, argv, "c:", long_options, NULL);
+	c = getopt_long(argc, argv, "c:C::", long_options, NULL);
 	opterr = 1;
 
 	switch (c) {
@@ -71,6 +72,10 @@ int common_parse_options(int argc, char **argv, struct common_params *common)
 			fatal("Invalid -c cpu list");
 		common->cpus = optarg;
 		break;
+	case 'C':
+		common->cgroup = 1;
+		common->cgroup_name = parse_optional_arg(argc, argv);
+		break;
 	default:
 		optind = saved_state;
 		return 0;
diff --git a/tools/tracing/rtla/src/osnoise_hist.c b/tools/tracing/rtla/src/osnoise_hist.c
index 443cb7f0e3a2..bcd4b4c96354 100644
--- a/tools/tracing/rtla/src/osnoise_hist.c
+++ b/tools/tracing/rtla/src/osnoise_hist.c
@@ -485,7 +485,6 @@ static struct common_params
 			{"auto",		required_argument,	0, 'a'},
 			{"bucket-size",		required_argument,	0, 'b'},
 			{"entries",		required_argument,	0, 'E'},
-			{"cgroup",		optional_argument,	0, 'C'},
 			{"debug",		no_argument,		0, 'D'},
 			{"duration",		required_argument,	0, 'd'},
 			{"house-keeping",	required_argument,		0, 'H'},
@@ -514,7 +513,7 @@ static struct common_params
 		if (common_parse_options(argc, argv, &params->common))
 			continue;
 
-		c = getopt_long(argc, argv, "a:C::b:d:e:E:DhH:p:P:r:s:S:t::T:01234:5:6:7:",
+		c = getopt_long(argc, argv, "a:b:d:e:E:DhH:p:P:r:s:S:t::T:01234:5:6:7:",
 				 long_options, NULL);
 
 		/* detect the end of the options. */
@@ -540,10 +539,6 @@ static struct common_params
 			    params->common.hist.bucket_size >= 1000000)
 				fatal("Bucket size needs to be > 0 and <= 1000000");
 			break;
-		case 'C':
-			params->common.cgroup = 1;
-			params->common.cgroup_name = parse_optional_arg(argc, argv);
-			break;
 		case 'D':
 			config_debug = 1;
 			break;
diff --git a/tools/tracing/rtla/src/osnoise_top.c b/tools/tracing/rtla/src/osnoise_top.c
index 8ba0757225b3..2799dd75a4e2 100644
--- a/tools/tracing/rtla/src/osnoise_top.c
+++ b/tools/tracing/rtla/src/osnoise_top.c
@@ -339,7 +339,6 @@ struct common_params *osnoise_top_parse_args(int argc, char **argv)
 	while (1) {
 		static struct option long_options[] = {
 			{"auto",		required_argument,	0, 'a'},
-			{"cgroup",		optional_argument,	0, 'C'},
 			{"debug",		no_argument,		0, 'D'},
 			{"duration",		required_argument,	0, 'd'},
 			{"event",		required_argument,	0, 'e'},
@@ -365,7 +364,7 @@ struct common_params *osnoise_top_parse_args(int argc, char **argv)
 		if (common_parse_options(argc, argv, &params->common))
 			continue;
 
-		c = getopt_long(argc, argv, "a:C::d:De:hH:p:P:qr:s:S:t::T:0:1:2:3:",
+		c = getopt_long(argc, argv, "a:d:De:hH:p:P:qr:s:S:t::T:0:1:2:3:",
 				 long_options, NULL);
 
 		/* Detect the end of the options. */
@@ -384,10 +383,6 @@ struct common_params *osnoise_top_parse_args(int argc, char **argv)
 			if (!trace_output)
 				trace_output = "osnoise_trace.txt";
 
-			break;
-		case 'C':
-			params->common.cgroup = 1;
-			params->common.cgroup_name = parse_optional_arg(argc, argv);
 			break;
 		case 'D':
 			config_debug = 1;
diff --git a/tools/tracing/rtla/src/timerlat_hist.c b/tools/tracing/rtla/src/timerlat_hist.c
index 3571aa8233c1..64c1dbb1fccc 100644
--- a/tools/tracing/rtla/src/timerlat_hist.c
+++ b/tools/tracing/rtla/src/timerlat_hist.c
@@ -791,7 +791,6 @@ static struct common_params
 	while (1) {
 		static struct option long_options[] = {
 			{"auto",		required_argument,	0, 'a'},
-			{"cgroup",		optional_argument,	0, 'C'},
 			{"bucket-size",		required_argument,	0, 'b'},
 			{"debug",		no_argument,		0, 'D'},
 			{"entries",		required_argument,	0, 'E'},
@@ -832,7 +831,7 @@ static struct common_params
 		if (common_parse_options(argc, argv, &params->common))
 			continue;
 
-		c = getopt_long(argc, argv, "a:C::b:d:e:E:DhH:i:knp:P:s:t::T:uU0123456:7:8:9\1\2:\3:",
+		c = getopt_long(argc, argv, "a:b:d:e:E:DhH:i:knp:P:s:t::T:uU0123456:7:8:9\1\2:\3:",
 				 long_options, NULL);
 
 		/* detect the end of the options. */
@@ -854,10 +853,6 @@ static struct common_params
 			if (!trace_output)
 				trace_output = "timerlat_trace.txt";
 
-			break;
-		case 'C':
-			params->common.cgroup = 1;
-			params->common.cgroup_name = parse_optional_arg(argc, argv);
 			break;
 		case 'b':
 			params->common.hist.bucket_size = get_llong_from_str(optarg);
diff --git a/tools/tracing/rtla/src/timerlat_top.c b/tools/tracing/rtla/src/timerlat_top.c
index b0a048cded29..bdc8bf265836 100644
--- a/tools/tracing/rtla/src/timerlat_top.c
+++ b/tools/tracing/rtla/src/timerlat_top.c
@@ -561,7 +561,6 @@ static struct common_params
 	while (1) {
 		static struct option long_options[] = {
 			{"auto",		required_argument,	0, 'a'},
-			{"cgroup",		optional_argument,	0, 'C'},
 			{"debug",		no_argument,		0, 'D'},
 			{"duration",		required_argument,	0, 'd'},
 			{"event",		required_argument,	0, 'e'},
@@ -596,7 +595,7 @@ static struct common_params
 		if (common_parse_options(argc, argv, &params->common))
 			continue;
 
-		c = getopt_long(argc, argv, "a:C::d:De:hH:i:knp:P:qs:t::T:uU0:1:2:345:6:7:",
+		c = getopt_long(argc, argv, "a:d:De:hH:i:knp:P:qs:t::T:uU0:1:2:345:6:7:",
 				 long_options, NULL);
 
 		/* detect the end of the options. */
@@ -633,10 +632,6 @@ static struct common_params
 			/* set aa_only to avoid parsing the trace */
 			params->common.aa_only = 1;
 			break;
-		case 'C':
-			params->common.cgroup = 1;
-			params->common.cgroup_name = optarg;
-			break;
 		case 'D':
 			config_debug = 1;
 			break;
-- 
cgit v1.2.3


From fd788c49a90328f5b2edaa87aa5af18648ade718 Mon Sep 17 00:00:00 2001
From: Costa Shulyupin <costa.shul@redhat.com>
Date: Tue, 9 Dec 2025 12:00:43 +0200
Subject: tools/rtla: Consolidate -D/--debug option parsing

Each rtla tool duplicates parsing of -D/--debug.

Migrate the option parsing from individual tools to the
common_parse_options().

Signed-off-by: Costa Shulyupin <costa.shul@redhat.com>
Link: https://lore.kernel.org/r/20251209100047.2692515-4-costa.shul@redhat.com
Signed-off-by: Tomas Glozar <tglozar@redhat.com>
---
 tools/tracing/rtla/src/common.c        | 6 +++++-
 tools/tracing/rtla/src/osnoise_hist.c  | 6 +-----
 tools/tracing/rtla/src/osnoise_top.c   | 6 +-----
 tools/tracing/rtla/src/timerlat_hist.c | 6 +-----
 tools/tracing/rtla/src/timerlat_top.c  | 6 +-----
 5 files changed, 9 insertions(+), 21 deletions(-)

(limited to 'tools')

diff --git a/tools/tracing/rtla/src/common.c b/tools/tracing/rtla/src/common.c
index 3400836f66ef..f71cf7c7f4e3 100644
--- a/tools/tracing/rtla/src/common.c
+++ b/tools/tracing/rtla/src/common.c
@@ -59,11 +59,12 @@ int common_parse_options(int argc, char **argv, struct common_params *common)
 	static struct option long_options[] = {
 		{"cpus",                required_argument,      0, 'c'},
 		{"cgroup",              optional_argument,      0, 'C'},
+		{"debug",               no_argument,            0, 'D'},
 		{0, 0, 0, 0}
 	};
 
 	opterr = 0;
-	c = getopt_long(argc, argv, "c:C::", long_options, NULL);
+	c = getopt_long(argc, argv, "c:C::D", long_options, NULL);
 	opterr = 1;
 
 	switch (c) {
@@ -76,6 +77,9 @@ int common_parse_options(int argc, char **argv, struct common_params *common)
 		common->cgroup = 1;
 		common->cgroup_name = parse_optional_arg(argc, argv);
 		break;
+	case 'D':
+		config_debug = 1;
+		break;
 	default:
 		optind = saved_state;
 		return 0;
diff --git a/tools/tracing/rtla/src/osnoise_hist.c b/tools/tracing/rtla/src/osnoise_hist.c
index bcd4b4c96354..c9422b596622 100644
--- a/tools/tracing/rtla/src/osnoise_hist.c
+++ b/tools/tracing/rtla/src/osnoise_hist.c
@@ -485,7 +485,6 @@ static struct common_params
 			{"auto",		required_argument,	0, 'a'},
 			{"bucket-size",		required_argument,	0, 'b'},
 			{"entries",		required_argument,	0, 'E'},
-			{"debug",		no_argument,		0, 'D'},
 			{"duration",		required_argument,	0, 'd'},
 			{"house-keeping",	required_argument,		0, 'H'},
 			{"help",		no_argument,		0, 'h'},
@@ -513,7 +512,7 @@ static struct common_params
 		if (common_parse_options(argc, argv, &params->common))
 			continue;
 
-		c = getopt_long(argc, argv, "a:b:d:e:E:DhH:p:P:r:s:S:t::T:01234:5:6:7:",
+		c = getopt_long(argc, argv, "a:b:d:e:E:hH:p:P:r:s:S:t::T:01234:5:6:7:",
 				 long_options, NULL);
 
 		/* detect the end of the options. */
@@ -539,9 +538,6 @@ static struct common_params
 			    params->common.hist.bucket_size >= 1000000)
 				fatal("Bucket size needs to be > 0 and <= 1000000");
 			break;
-		case 'D':
-			config_debug = 1;
-			break;
 		case 'd':
 			params->common.duration = parse_seconds_duration(optarg);
 			if (!params->common.duration)
diff --git a/tools/tracing/rtla/src/osnoise_top.c b/tools/tracing/rtla/src/osnoise_top.c
index 2799dd75a4e2..8d49042d10f0 100644
--- a/tools/tracing/rtla/src/osnoise_top.c
+++ b/tools/tracing/rtla/src/osnoise_top.c
@@ -339,7 +339,6 @@ struct common_params *osnoise_top_parse_args(int argc, char **argv)
 	while (1) {
 		static struct option long_options[] = {
 			{"auto",		required_argument,	0, 'a'},
-			{"debug",		no_argument,		0, 'D'},
 			{"duration",		required_argument,	0, 'd'},
 			{"event",		required_argument,	0, 'e'},
 			{"house-keeping",	required_argument,	0, 'H'},
@@ -364,7 +363,7 @@ struct common_params *osnoise_top_parse_args(int argc, char **argv)
 		if (common_parse_options(argc, argv, &params->common))
 			continue;
 
-		c = getopt_long(argc, argv, "a:d:De:hH:p:P:qr:s:S:t::T:0:1:2:3:",
+		c = getopt_long(argc, argv, "a:d:e:hH:p:P:qr:s:S:t::T:0:1:2:3:",
 				 long_options, NULL);
 
 		/* Detect the end of the options. */
@@ -383,9 +382,6 @@ struct common_params *osnoise_top_parse_args(int argc, char **argv)
 			if (!trace_output)
 				trace_output = "osnoise_trace.txt";
 
-			break;
-		case 'D':
-			config_debug = 1;
 			break;
 		case 'd':
 			params->common.duration = parse_seconds_duration(optarg);
diff --git a/tools/tracing/rtla/src/timerlat_hist.c b/tools/tracing/rtla/src/timerlat_hist.c
index 64c1dbb1fccc..c08f628047c1 100644
--- a/tools/tracing/rtla/src/timerlat_hist.c
+++ b/tools/tracing/rtla/src/timerlat_hist.c
@@ -792,7 +792,6 @@ static struct common_params
 		static struct option long_options[] = {
 			{"auto",		required_argument,	0, 'a'},
 			{"bucket-size",		required_argument,	0, 'b'},
-			{"debug",		no_argument,		0, 'D'},
 			{"entries",		required_argument,	0, 'E'},
 			{"duration",		required_argument,	0, 'd'},
 			{"house-keeping",	required_argument,	0, 'H'},
@@ -831,7 +830,7 @@ static struct common_params
 		if (common_parse_options(argc, argv, &params->common))
 			continue;
 
-		c = getopt_long(argc, argv, "a:b:d:e:E:DhH:i:knp:P:s:t::T:uU0123456:7:8:9\1\2:\3:",
+		c = getopt_long(argc, argv, "a:b:d:e:E:hH:i:knp:P:s:t::T:uU0123456:7:8:9\1\2:\3:",
 				 long_options, NULL);
 
 		/* detect the end of the options. */
@@ -860,9 +859,6 @@ static struct common_params
 			    params->common.hist.bucket_size >= 1000000)
 				fatal("Bucket size needs to be > 0 and <= 1000000");
 			break;
-		case 'D':
-			config_debug = 1;
-			break;
 		case 'd':
 			params->common.duration = parse_seconds_duration(optarg);
 			if (!params->common.duration)
diff --git a/tools/tracing/rtla/src/timerlat_top.c b/tools/tracing/rtla/src/timerlat_top.c
index bdc8bf265836..7c0a3f582273 100644
--- a/tools/tracing/rtla/src/timerlat_top.c
+++ b/tools/tracing/rtla/src/timerlat_top.c
@@ -561,7 +561,6 @@ static struct common_params
 	while (1) {
 		static struct option long_options[] = {
 			{"auto",		required_argument,	0, 'a'},
-			{"debug",		no_argument,		0, 'D'},
 			{"duration",		required_argument,	0, 'd'},
 			{"event",		required_argument,	0, 'e'},
 			{"help",		no_argument,		0, 'h'},
@@ -595,7 +594,7 @@ static struct common_params
 		if (common_parse_options(argc, argv, &params->common))
 			continue;
 
-		c = getopt_long(argc, argv, "a:d:De:hH:i:knp:P:qs:t::T:uU0:1:2:345:6:7:",
+		c = getopt_long(argc, argv, "a:d:e:hH:i:knp:P:qs:t::T:uU0:1:2:345:6:7:",
 				 long_options, NULL);
 
 		/* detect the end of the options. */
@@ -632,9 +631,6 @@ static struct common_params
 			/* set aa_only to avoid parsing the trace */
 			params->common.aa_only = 1;
 			break;
-		case 'D':
-			config_debug = 1;
-			break;
 		case 'd':
 			params->common.duration = parse_seconds_duration(optarg);
 			if (!params->common.duration)
-- 
cgit v1.2.3


From 76975581fb0eba03820fe312094981c995c225f9 Mon Sep 17 00:00:00 2001
From: Costa Shulyupin <costa.shul@redhat.com>
Date: Tue, 9 Dec 2025 12:00:44 +0200
Subject: tools/rtla: Consolidate -d/--duration option parsing

Each rtla tool duplicates parsing of -d/--duration.

Migrate the option parsing from individual tools to the
common_parse_options().

Signed-off-by: Costa Shulyupin <costa.shul@redhat.com>
Link: https://lore.kernel.org/r/20251209100047.2692515-5-costa.shul@redhat.com
Signed-off-by: Tomas Glozar <tglozar@redhat.com>
---
 tools/tracing/rtla/src/common.c        | 8 +++++++-
 tools/tracing/rtla/src/osnoise_hist.c  | 8 +-------
 tools/tracing/rtla/src/osnoise_top.c   | 8 +-------
 tools/tracing/rtla/src/timerlat_hist.c | 8 +-------
 tools/tracing/rtla/src/timerlat_top.c  | 8 +-------
 5 files changed, 11 insertions(+), 29 deletions(-)

(limited to 'tools')

diff --git a/tools/tracing/rtla/src/common.c b/tools/tracing/rtla/src/common.c
index f71cf7c7f4e3..0776f1568d23 100644
--- a/tools/tracing/rtla/src/common.c
+++ b/tools/tracing/rtla/src/common.c
@@ -60,11 +60,12 @@ int common_parse_options(int argc, char **argv, struct common_params *common)
 		{"cpus",                required_argument,      0, 'c'},
 		{"cgroup",              optional_argument,      0, 'C'},
 		{"debug",               no_argument,            0, 'D'},
+		{"duration",            required_argument,      0, 'd'},
 		{0, 0, 0, 0}
 	};
 
 	opterr = 0;
-	c = getopt_long(argc, argv, "c:C::D", long_options, NULL);
+	c = getopt_long(argc, argv, "c:C::Dd:", long_options, NULL);
 	opterr = 1;
 
 	switch (c) {
@@ -80,6 +81,11 @@ int common_parse_options(int argc, char **argv, struct common_params *common)
 	case 'D':
 		config_debug = 1;
 		break;
+	case 'd':
+		common->duration = parse_seconds_duration(optarg);
+		if (!common->duration)
+			fatal("Invalid -d duration");
+		break;
 	default:
 		optind = saved_state;
 		return 0;
diff --git a/tools/tracing/rtla/src/osnoise_hist.c b/tools/tracing/rtla/src/osnoise_hist.c
index c9422b596622..f34c88fd57e2 100644
--- a/tools/tracing/rtla/src/osnoise_hist.c
+++ b/tools/tracing/rtla/src/osnoise_hist.c
@@ -485,7 +485,6 @@ static struct common_params
 			{"auto",		required_argument,	0, 'a'},
 			{"bucket-size",		required_argument,	0, 'b'},
 			{"entries",		required_argument,	0, 'E'},
-			{"duration",		required_argument,	0, 'd'},
 			{"house-keeping",	required_argument,		0, 'H'},
 			{"help",		no_argument,		0, 'h'},
 			{"period",		required_argument,	0, 'p'},
@@ -512,7 +511,7 @@ static struct common_params
 		if (common_parse_options(argc, argv, &params->common))
 			continue;
 
-		c = getopt_long(argc, argv, "a:b:d:e:E:hH:p:P:r:s:S:t::T:01234:5:6:7:",
+		c = getopt_long(argc, argv, "a:b:e:E:hH:p:P:r:s:S:t::T:01234:5:6:7:",
 				 long_options, NULL);
 
 		/* detect the end of the options. */
@@ -538,11 +537,6 @@ static struct common_params
 			    params->common.hist.bucket_size >= 1000000)
 				fatal("Bucket size needs to be > 0 and <= 1000000");
 			break;
-		case 'd':
-			params->common.duration = parse_seconds_duration(optarg);
-			if (!params->common.duration)
-				fatal("Invalid -D duration");
-			break;
 		case 'e':
 			tevent = trace_event_alloc(optarg);
 			if (!tevent)
diff --git a/tools/tracing/rtla/src/osnoise_top.c b/tools/tracing/rtla/src/osnoise_top.c
index 8d49042d10f0..695c6ecf0098 100644
--- a/tools/tracing/rtla/src/osnoise_top.c
+++ b/tools/tracing/rtla/src/osnoise_top.c
@@ -339,7 +339,6 @@ struct common_params *osnoise_top_parse_args(int argc, char **argv)
 	while (1) {
 		static struct option long_options[] = {
 			{"auto",		required_argument,	0, 'a'},
-			{"duration",		required_argument,	0, 'd'},
 			{"event",		required_argument,	0, 'e'},
 			{"house-keeping",	required_argument,	0, 'H'},
 			{"help",		no_argument,		0, 'h'},
@@ -363,7 +362,7 @@ struct common_params *osnoise_top_parse_args(int argc, char **argv)
 		if (common_parse_options(argc, argv, &params->common))
 			continue;
 
-		c = getopt_long(argc, argv, "a:d:e:hH:p:P:qr:s:S:t::T:0:1:2:3:",
+		c = getopt_long(argc, argv, "a:e:hH:p:P:qr:s:S:t::T:0:1:2:3:",
 				 long_options, NULL);
 
 		/* Detect the end of the options. */
@@ -383,11 +382,6 @@ struct common_params *osnoise_top_parse_args(int argc, char **argv)
 				trace_output = "osnoise_trace.txt";
 
 			break;
-		case 'd':
-			params->common.duration = parse_seconds_duration(optarg);
-			if (!params->common.duration)
-				fatal("Invalid -d duration");
-			break;
 		case 'e':
 			tevent = trace_event_alloc(optarg);
 			if (!tevent)
diff --git a/tools/tracing/rtla/src/timerlat_hist.c b/tools/tracing/rtla/src/timerlat_hist.c
index c08f628047c1..d625dbe44676 100644
--- a/tools/tracing/rtla/src/timerlat_hist.c
+++ b/tools/tracing/rtla/src/timerlat_hist.c
@@ -793,7 +793,6 @@ static struct common_params
 			{"auto",		required_argument,	0, 'a'},
 			{"bucket-size",		required_argument,	0, 'b'},
 			{"entries",		required_argument,	0, 'E'},
-			{"duration",		required_argument,	0, 'd'},
 			{"house-keeping",	required_argument,	0, 'H'},
 			{"help",		no_argument,		0, 'h'},
 			{"irq",			required_argument,	0, 'i'},
@@ -830,7 +829,7 @@ static struct common_params
 		if (common_parse_options(argc, argv, &params->common))
 			continue;
 
-		c = getopt_long(argc, argv, "a:b:d:e:E:hH:i:knp:P:s:t::T:uU0123456:7:8:9\1\2:\3:",
+		c = getopt_long(argc, argv, "a:b:e:E:hH:i:knp:P:s:t::T:uU0123456:7:8:9\1\2:\3:",
 				 long_options, NULL);
 
 		/* detect the end of the options. */
@@ -859,11 +858,6 @@ static struct common_params
 			    params->common.hist.bucket_size >= 1000000)
 				fatal("Bucket size needs to be > 0 and <= 1000000");
 			break;
-		case 'd':
-			params->common.duration = parse_seconds_duration(optarg);
-			if (!params->common.duration)
-				fatal("Invalid -D duration");
-			break;
 		case 'e':
 			tevent = trace_event_alloc(optarg);
 			if (!tevent)
diff --git a/tools/tracing/rtla/src/timerlat_top.c b/tools/tracing/rtla/src/timerlat_top.c
index 7c0a3f582273..95e949f49cbd 100644
--- a/tools/tracing/rtla/src/timerlat_top.c
+++ b/tools/tracing/rtla/src/timerlat_top.c
@@ -561,7 +561,6 @@ static struct common_params
 	while (1) {
 		static struct option long_options[] = {
 			{"auto",		required_argument,	0, 'a'},
-			{"duration",		required_argument,	0, 'd'},
 			{"event",		required_argument,	0, 'e'},
 			{"help",		no_argument,		0, 'h'},
 			{"house-keeping",	required_argument,	0, 'H'},
@@ -594,7 +593,7 @@ static struct common_params
 		if (common_parse_options(argc, argv, &params->common))
 			continue;
 
-		c = getopt_long(argc, argv, "a:d:e:hH:i:knp:P:qs:t::T:uU0:1:2:345:6:7:",
+		c = getopt_long(argc, argv, "a:e:hH:i:knp:P:qs:t::T:uU0:1:2:345:6:7:",
 				 long_options, NULL);
 
 		/* detect the end of the options. */
@@ -631,11 +630,6 @@ static struct common_params
 			/* set aa_only to avoid parsing the trace */
 			params->common.aa_only = 1;
 			break;
-		case 'd':
-			params->common.duration = parse_seconds_duration(optarg);
-			if (!params->common.duration)
-				fatal("Invalid -d duration");
-			break;
 		case 'e':
 			tevent = trace_event_alloc(optarg);
 			if (!tevent)
-- 
cgit v1.2.3


From c93c25fca5ab3c27b42f1f941871209573c0b41b Mon Sep 17 00:00:00 2001
From: Costa Shulyupin <costa.shul@redhat.com>
Date: Tue, 9 Dec 2025 12:00:45 +0200
Subject: tools/rtla: Consolidate -e/--event option parsing

Each rtla tool duplicates parsing of -e/--event.

Migrate the option parsing from individual tools to the
common_parse_options().

Signed-off-by: Costa Shulyupin <costa.shul@redhat.com>
Link: https://lore.kernel.org/r/20251209100047.2692515-6-costa.shul@redhat.com
Signed-off-by: Tomas Glozar <tglozar@redhat.com>
---
 tools/tracing/rtla/src/common.c        | 13 ++++++++++++-
 tools/tracing/rtla/src/osnoise_hist.c  | 14 +-------------
 tools/tracing/rtla/src/osnoise_top.c   | 14 +-------------
 tools/tracing/rtla/src/timerlat_hist.c | 14 +-------------
 tools/tracing/rtla/src/timerlat_top.c  | 13 +------------
 5 files changed, 16 insertions(+), 52 deletions(-)

(limited to 'tools')

diff --git a/tools/tracing/rtla/src/common.c b/tools/tracing/rtla/src/common.c
index 0776f1568d23..fbd38d80f1ac 100644
--- a/tools/tracing/rtla/src/common.c
+++ b/tools/tracing/rtla/src/common.c
@@ -53,6 +53,7 @@ static void set_signals(struct common_params *params)
  */
 int common_parse_options(int argc, char **argv, struct common_params *common)
 {
+	struct trace_events *tevent;
 	int saved_state = optind;
 	int c;
 
@@ -61,11 +62,12 @@ int common_parse_options(int argc, char **argv, struct common_params *common)
 		{"cgroup",              optional_argument,      0, 'C'},
 		{"debug",               no_argument,            0, 'D'},
 		{"duration",            required_argument,      0, 'd'},
+		{"event",               required_argument,      0, 'e'},
 		{0, 0, 0, 0}
 	};
 
 	opterr = 0;
-	c = getopt_long(argc, argv, "c:C::Dd:", long_options, NULL);
+	c = getopt_long(argc, argv, "c:C::Dd:e:", long_options, NULL);
 	opterr = 1;
 
 	switch (c) {
@@ -86,6 +88,15 @@ int common_parse_options(int argc, char **argv, struct common_params *common)
 		if (!common->duration)
 			fatal("Invalid -d duration");
 		break;
+	case 'e':
+		tevent = trace_event_alloc(optarg);
+		if (!tevent)
+			fatal("Error alloc trace event");
+
+		if (common->events)
+			tevent->next = common->events;
+		common->events = tevent;
+		break;
 	default:
 		optind = saved_state;
 		return 0;
diff --git a/tools/tracing/rtla/src/osnoise_hist.c b/tools/tracing/rtla/src/osnoise_hist.c
index f34c88fd57e2..8b3eab6092bb 100644
--- a/tools/tracing/rtla/src/osnoise_hist.c
+++ b/tools/tracing/rtla/src/osnoise_hist.c
@@ -463,7 +463,6 @@ static struct common_params
 *osnoise_hist_parse_args(int argc, char *argv[])
 {
 	struct osnoise_params *params;
-	struct trace_events *tevent;
 	int retval;
 	int c;
 	char *trace_output = NULL;
@@ -493,7 +492,6 @@ static struct common_params
 			{"stop",		required_argument,	0, 's'},
 			{"stop-total",		required_argument,	0, 'S'},
 			{"trace",		optional_argument,	0, 't'},
-			{"event",		required_argument,	0, 'e'},
 			{"threshold",		required_argument,	0, 'T'},
 			{"no-header",		no_argument,		0, '0'},
 			{"no-summary",		no_argument,		0, '1'},
@@ -511,7 +509,7 @@ static struct common_params
 		if (common_parse_options(argc, argv, &params->common))
 			continue;
 
-		c = getopt_long(argc, argv, "a:b:e:E:hH:p:P:r:s:S:t::T:01234:5:6:7:",
+		c = getopt_long(argc, argv, "a:b:E:hH:p:P:r:s:S:t::T:01234:5:6:7:",
 				 long_options, NULL);
 
 		/* detect the end of the options. */
@@ -537,16 +535,6 @@ static struct common_params
 			    params->common.hist.bucket_size >= 1000000)
 				fatal("Bucket size needs to be > 0 and <= 1000000");
 			break;
-		case 'e':
-			tevent = trace_event_alloc(optarg);
-			if (!tevent)
-				fatal("Error alloc trace event");
-
-			if (params->common.events)
-				tevent->next = params->common.events;
-
-			params->common.events = tevent;
-			break;
 		case 'E':
 			params->common.hist.entries = get_llong_from_str(optarg);
 			if (params->common.hist.entries < 10 ||
diff --git a/tools/tracing/rtla/src/osnoise_top.c b/tools/tracing/rtla/src/osnoise_top.c
index 695c6ecf0098..47aac00e2848 100644
--- a/tools/tracing/rtla/src/osnoise_top.c
+++ b/tools/tracing/rtla/src/osnoise_top.c
@@ -315,7 +315,6 @@ static void osnoise_top_usage(struct osnoise_params *params)
 struct common_params *osnoise_top_parse_args(int argc, char **argv)
 {
 	struct osnoise_params *params;
-	struct trace_events *tevent;
 	int retval;
 	int c;
 	char *trace_output = NULL;
@@ -339,7 +338,6 @@ struct common_params *osnoise_top_parse_args(int argc, char **argv)
 	while (1) {
 		static struct option long_options[] = {
 			{"auto",		required_argument,	0, 'a'},
-			{"event",		required_argument,	0, 'e'},
 			{"house-keeping",	required_argument,	0, 'H'},
 			{"help",		no_argument,		0, 'h'},
 			{"period",		required_argument,	0, 'p'},
@@ -362,7 +360,7 @@ struct common_params *osnoise_top_parse_args(int argc, char **argv)
 		if (common_parse_options(argc, argv, &params->common))
 			continue;
 
-		c = getopt_long(argc, argv, "a:e:hH:p:P:qr:s:S:t::T:0:1:2:3:",
+		c = getopt_long(argc, argv, "a:hH:p:P:qr:s:S:t::T:0:1:2:3:",
 				 long_options, NULL);
 
 		/* Detect the end of the options. */
@@ -381,16 +379,6 @@ struct common_params *osnoise_top_parse_args(int argc, char **argv)
 			if (!trace_output)
 				trace_output = "osnoise_trace.txt";
 
-			break;
-		case 'e':
-			tevent = trace_event_alloc(optarg);
-			if (!tevent)
-				fatal("Error alloc trace event");
-
-			if (params->common.events)
-				tevent->next = params->common.events;
-			params->common.events = tevent;
-
 			break;
 		case 'h':
 		case '?':
diff --git a/tools/tracing/rtla/src/timerlat_hist.c b/tools/tracing/rtla/src/timerlat_hist.c
index d625dbe44676..32424e2bd34a 100644
--- a/tools/tracing/rtla/src/timerlat_hist.c
+++ b/tools/tracing/rtla/src/timerlat_hist.c
@@ -761,7 +761,6 @@ static struct common_params
 *timerlat_hist_parse_args(int argc, char *argv[])
 {
 	struct timerlat_params *params;
-	struct trace_events *tevent;
 	int auto_thresh;
 	int retval;
 	int c;
@@ -805,7 +804,6 @@ static struct common_params
 			{"user-threads",	no_argument,		0, 'u'},
 			{"kernel-threads",	no_argument,		0, 'k'},
 			{"user-load",		no_argument,		0, 'U'},
-			{"event",		required_argument,	0, 'e'},
 			{"no-irq",		no_argument,		0, '0'},
 			{"no-thread",		no_argument,		0, '1'},
 			{"no-header",		no_argument,		0, '2'},
@@ -829,7 +827,7 @@ static struct common_params
 		if (common_parse_options(argc, argv, &params->common))
 			continue;
 
-		c = getopt_long(argc, argv, "a:b:e:E:hH:i:knp:P:s:t::T:uU0123456:7:8:9\1\2:\3:",
+		c = getopt_long(argc, argv, "a:b:E:hH:i:knp:P:s:t::T:uU0123456:7:8:9\1\2:\3:",
 				 long_options, NULL);
 
 		/* detect the end of the options. */
@@ -858,16 +856,6 @@ static struct common_params
 			    params->common.hist.bucket_size >= 1000000)
 				fatal("Bucket size needs to be > 0 and <= 1000000");
 			break;
-		case 'e':
-			tevent = trace_event_alloc(optarg);
-			if (!tevent)
-				fatal("Error alloc trace event");
-
-			if (params->common.events)
-				tevent->next = params->common.events;
-
-			params->common.events = tevent;
-			break;
 		case 'E':
 			params->common.hist.entries = get_llong_from_str(optarg);
 			if (params->common.hist.entries < 10 ||
diff --git a/tools/tracing/rtla/src/timerlat_top.c b/tools/tracing/rtla/src/timerlat_top.c
index 95e949f49cbd..928d887e0c2e 100644
--- a/tools/tracing/rtla/src/timerlat_top.c
+++ b/tools/tracing/rtla/src/timerlat_top.c
@@ -533,7 +533,6 @@ static struct common_params
 *timerlat_top_parse_args(int argc, char **argv)
 {
 	struct timerlat_params *params;
-	struct trace_events *tevent;
 	long long auto_thresh;
 	int retval;
 	int c;
@@ -561,7 +560,6 @@ static struct common_params
 	while (1) {
 		static struct option long_options[] = {
 			{"auto",		required_argument,	0, 'a'},
-			{"event",		required_argument,	0, 'e'},
 			{"help",		no_argument,		0, 'h'},
 			{"house-keeping",	required_argument,	0, 'H'},
 			{"irq",			required_argument,	0, 'i'},
@@ -593,7 +591,7 @@ static struct common_params
 		if (common_parse_options(argc, argv, &params->common))
 			continue;
 
-		c = getopt_long(argc, argv, "a:e:hH:i:knp:P:qs:t::T:uU0:1:2:345:6:7:",
+		c = getopt_long(argc, argv, "a:hH:i:knp:P:qs:t::T:uU0:1:2:345:6:7:",
 				 long_options, NULL);
 
 		/* detect the end of the options. */
@@ -630,15 +628,6 @@ static struct common_params
 			/* set aa_only to avoid parsing the trace */
 			params->common.aa_only = 1;
 			break;
-		case 'e':
-			tevent = trace_event_alloc(optarg);
-			if (!tevent)
-				fatal("Error alloc trace event");
-
-			if (params->common.events)
-				tevent->next = params->common.events;
-			params->common.events = tevent;
-			break;
 		case 'h':
 		case '?':
 			timerlat_top_usage();
-- 
cgit v1.2.3


From 5cc90b14ee54591b890ad026ad5e01b2960c3a31 Mon Sep 17 00:00:00 2001
From: Costa Shulyupin <costa.shul@redhat.com>
Date: Tue, 9 Dec 2025 12:00:46 +0200
Subject: tools/rtla: Consolidate -P/--priority option parsing

Each rtla tool duplicates parsing of -P/--priority.

Migrate the option parsing from individual tools to the
common_parse_options().

Signed-off-by: Costa Shulyupin <costa.shul@redhat.com>
Link: https://lore.kernel.org/r/20251209100047.2692515-7-costa.shul@redhat.com
Signed-off-by: Tomas Glozar <tglozar@redhat.com>
---
 tools/tracing/rtla/src/common.c        | 8 +++++++-
 tools/tracing/rtla/src/osnoise_hist.c  | 9 +--------
 tools/tracing/rtla/src/osnoise_top.c   | 9 +--------
 tools/tracing/rtla/src/timerlat_hist.c | 9 +--------
 tools/tracing/rtla/src/timerlat_top.c  | 9 +--------
 5 files changed, 11 insertions(+), 33 deletions(-)

(limited to 'tools')

diff --git a/tools/tracing/rtla/src/common.c b/tools/tracing/rtla/src/common.c
index fbd38d80f1ac..90f1bbb7e189 100644
--- a/tools/tracing/rtla/src/common.c
+++ b/tools/tracing/rtla/src/common.c
@@ -63,11 +63,12 @@ int common_parse_options(int argc, char **argv, struct common_params *common)
 		{"debug",               no_argument,            0, 'D'},
 		{"duration",            required_argument,      0, 'd'},
 		{"event",               required_argument,      0, 'e'},
+		{"priority",            required_argument,      0, 'P'},
 		{0, 0, 0, 0}
 	};
 
 	opterr = 0;
-	c = getopt_long(argc, argv, "c:C::Dd:e:", long_options, NULL);
+	c = getopt_long(argc, argv, "c:C::Dd:e:P:", long_options, NULL);
 	opterr = 1;
 
 	switch (c) {
@@ -97,6 +98,11 @@ int common_parse_options(int argc, char **argv, struct common_params *common)
 			tevent->next = common->events;
 		common->events = tevent;
 		break;
+	case 'P':
+		if (parse_prio(optarg, &common->sched_param) == -1)
+			fatal("Invalid -P priority");
+		common->set_sched = 1;
+		break;
 	default:
 		optind = saved_state;
 		return 0;
diff --git a/tools/tracing/rtla/src/osnoise_hist.c b/tools/tracing/rtla/src/osnoise_hist.c
index 8b3eab6092bb..6e66726766a1 100644
--- a/tools/tracing/rtla/src/osnoise_hist.c
+++ b/tools/tracing/rtla/src/osnoise_hist.c
@@ -487,7 +487,6 @@ static struct common_params
 			{"house-keeping",	required_argument,		0, 'H'},
 			{"help",		no_argument,		0, 'h'},
 			{"period",		required_argument,	0, 'p'},
-			{"priority",		required_argument,	0, 'P'},
 			{"runtime",		required_argument,	0, 'r'},
 			{"stop",		required_argument,	0, 's'},
 			{"stop-total",		required_argument,	0, 'S'},
@@ -509,7 +508,7 @@ static struct common_params
 		if (common_parse_options(argc, argv, &params->common))
 			continue;
 
-		c = getopt_long(argc, argv, "a:b:E:hH:p:P:r:s:S:t::T:01234:5:6:7:",
+		c = getopt_long(argc, argv, "a:b:E:hH:p:r:s:S:t::T:01234:5:6:7:",
 				 long_options, NULL);
 
 		/* detect the end of the options. */
@@ -556,12 +555,6 @@ static struct common_params
 			if (params->period > 10000000)
 				fatal("Period longer than 10 s");
 			break;
-		case 'P':
-			retval = parse_prio(optarg, &params->common.sched_param);
-			if (retval == -1)
-				fatal("Invalid -P priority");
-			params->common.set_sched = 1;
-			break;
 		case 'r':
 			params->runtime = get_llong_from_str(optarg);
 			if (params->runtime < 100)
diff --git a/tools/tracing/rtla/src/osnoise_top.c b/tools/tracing/rtla/src/osnoise_top.c
index 47aac00e2848..7ac992ec7439 100644
--- a/tools/tracing/rtla/src/osnoise_top.c
+++ b/tools/tracing/rtla/src/osnoise_top.c
@@ -341,7 +341,6 @@ struct common_params *osnoise_top_parse_args(int argc, char **argv)
 			{"house-keeping",	required_argument,	0, 'H'},
 			{"help",		no_argument,		0, 'h'},
 			{"period",		required_argument,	0, 'p'},
-			{"priority",		required_argument,	0, 'P'},
 			{"quiet",		no_argument,		0, 'q'},
 			{"runtime",		required_argument,	0, 'r'},
 			{"stop",		required_argument,	0, 's'},
@@ -360,7 +359,7 @@ struct common_params *osnoise_top_parse_args(int argc, char **argv)
 		if (common_parse_options(argc, argv, &params->common))
 			continue;
 
-		c = getopt_long(argc, argv, "a:hH:p:P:qr:s:S:t::T:0:1:2:3:",
+		c = getopt_long(argc, argv, "a:hH:p:qr:s:S:t::T:0:1:2:3:",
 				 long_options, NULL);
 
 		/* Detect the end of the options. */
@@ -395,12 +394,6 @@ struct common_params *osnoise_top_parse_args(int argc, char **argv)
 			if (params->period > 10000000)
 				fatal("Period longer than 10 s");
 			break;
-		case 'P':
-			retval = parse_prio(optarg, &params->common.sched_param);
-			if (retval == -1)
-				fatal("Invalid -P priority");
-			params->common.set_sched = 1;
-			break;
 		case 'q':
 			params->common.quiet = 1;
 			break;
diff --git a/tools/tracing/rtla/src/timerlat_hist.c b/tools/tracing/rtla/src/timerlat_hist.c
index 32424e2bd34a..99b416ccfc5b 100644
--- a/tools/tracing/rtla/src/timerlat_hist.c
+++ b/tools/tracing/rtla/src/timerlat_hist.c
@@ -797,7 +797,6 @@ static struct common_params
 			{"irq",			required_argument,	0, 'i'},
 			{"nano",		no_argument,		0, 'n'},
 			{"period",		required_argument,	0, 'p'},
-			{"priority",		required_argument,	0, 'P'},
 			{"stack",		required_argument,	0, 's'},
 			{"thread",		required_argument,	0, 'T'},
 			{"trace",		optional_argument,	0, 't'},
@@ -827,7 +826,7 @@ static struct common_params
 		if (common_parse_options(argc, argv, &params->common))
 			continue;
 
-		c = getopt_long(argc, argv, "a:b:E:hH:i:knp:P:s:t::T:uU0123456:7:8:9\1\2:\3:",
+		c = getopt_long(argc, argv, "a:b:E:hH:i:knp:s:t::T:uU0123456:7:8:9\1\2:\3:",
 				 long_options, NULL);
 
 		/* detect the end of the options. */
@@ -886,12 +885,6 @@ static struct common_params
 			if (params->timerlat_period_us > 1000000)
 				fatal("Period longer than 1 s");
 			break;
-		case 'P':
-			retval = parse_prio(optarg, &params->common.sched_param);
-			if (retval == -1)
-				fatal("Invalid -P priority");
-			params->common.set_sched = 1;
-			break;
 		case 's':
 			params->print_stack = get_llong_from_str(optarg);
 			break;
diff --git a/tools/tracing/rtla/src/timerlat_top.c b/tools/tracing/rtla/src/timerlat_top.c
index 928d887e0c2e..027aad1b639f 100644
--- a/tools/tracing/rtla/src/timerlat_top.c
+++ b/tools/tracing/rtla/src/timerlat_top.c
@@ -565,7 +565,6 @@ static struct common_params
 			{"irq",			required_argument,	0, 'i'},
 			{"nano",		no_argument,		0, 'n'},
 			{"period",		required_argument,	0, 'p'},
-			{"priority",		required_argument,	0, 'P'},
 			{"quiet",		no_argument,		0, 'q'},
 			{"stack",		required_argument,	0, 's'},
 			{"thread",		required_argument,	0, 'T'},
@@ -591,7 +590,7 @@ static struct common_params
 		if (common_parse_options(argc, argv, &params->common))
 			continue;
 
-		c = getopt_long(argc, argv, "a:hH:i:knp:P:qs:t::T:uU0:1:2:345:6:7:",
+		c = getopt_long(argc, argv, "a:hH:i:knp:qs:t::T:uU0:1:2:345:6:7:",
 				 long_options, NULL);
 
 		/* detect the end of the options. */
@@ -652,12 +651,6 @@ static struct common_params
 			if (params->timerlat_period_us > 1000000)
 				fatal("Period longer than 1 s");
 			break;
-		case 'P':
-			retval = parse_prio(optarg, &params->common.sched_param);
-			if (retval == -1)
-				fatal("Invalid -P priority");
-			params->common.set_sched = 1;
-			break;
 		case 'q':
 			params->common.quiet = 1;
 			break;
-- 
cgit v1.2.3


From 0576be469ef18a9f3460f6f207183033ae8b90c5 Mon Sep 17 00:00:00 2001
From: Costa Shulyupin <costa.shul@redhat.com>
Date: Tue, 9 Dec 2025 12:00:47 +0200
Subject: tools/rtla: Consolidate -H/--house-keeping option parsing

Each rtla tool duplicates parsing of -H/--house-keeping.

Migrate the option parsing from individual tools to the
common_parse_options().

Signed-off-by: Costa Shulyupin <costa.shul@redhat.com>
Link: https://lore.kernel.org/r/20251209100047.2692515-8-costa.shul@redhat.com
Signed-off-by: Tomas Glozar <tglozar@redhat.com>
---
 tools/tracing/rtla/src/common.c        | 8 +++++++-
 tools/tracing/rtla/src/osnoise_hist.c  | 9 +--------
 tools/tracing/rtla/src/osnoise_top.c   | 9 +--------
 tools/tracing/rtla/src/timerlat_hist.c | 9 +--------
 tools/tracing/rtla/src/timerlat_top.c  | 9 +--------
 5 files changed, 11 insertions(+), 33 deletions(-)

(limited to 'tools')

diff --git a/tools/tracing/rtla/src/common.c b/tools/tracing/rtla/src/common.c
index 90f1bbb7e189..6f64c1fc1b62 100644
--- a/tools/tracing/rtla/src/common.c
+++ b/tools/tracing/rtla/src/common.c
@@ -63,12 +63,13 @@ int common_parse_options(int argc, char **argv, struct common_params *common)
 		{"debug",               no_argument,            0, 'D'},
 		{"duration",            required_argument,      0, 'd'},
 		{"event",               required_argument,      0, 'e'},
+		{"house-keeping",       required_argument,      0, 'H'},
 		{"priority",            required_argument,      0, 'P'},
 		{0, 0, 0, 0}
 	};
 
 	opterr = 0;
-	c = getopt_long(argc, argv, "c:C::Dd:e:P:", long_options, NULL);
+	c = getopt_long(argc, argv, "c:C::Dd:e:H:P:", long_options, NULL);
 	opterr = 1;
 
 	switch (c) {
@@ -98,6 +99,11 @@ int common_parse_options(int argc, char **argv, struct common_params *common)
 			tevent->next = common->events;
 		common->events = tevent;
 		break;
+	case 'H':
+		common->hk_cpus = 1;
+		if (parse_cpu_set(optarg, &common->hk_cpu_set))
+			fatal("Error parsing house keeping CPUs");
+		break;
 	case 'P':
 		if (parse_prio(optarg, &common->sched_param) == -1)
 			fatal("Invalid -P priority");
diff --git a/tools/tracing/rtla/src/osnoise_hist.c b/tools/tracing/rtla/src/osnoise_hist.c
index 6e66726766a1..705c73d55102 100644
--- a/tools/tracing/rtla/src/osnoise_hist.c
+++ b/tools/tracing/rtla/src/osnoise_hist.c
@@ -484,7 +484,6 @@ static struct common_params
 			{"auto",		required_argument,	0, 'a'},
 			{"bucket-size",		required_argument,	0, 'b'},
 			{"entries",		required_argument,	0, 'E'},
-			{"house-keeping",	required_argument,		0, 'H'},
 			{"help",		no_argument,		0, 'h'},
 			{"period",		required_argument,	0, 'p'},
 			{"runtime",		required_argument,	0, 'r'},
@@ -508,7 +507,7 @@ static struct common_params
 		if (common_parse_options(argc, argv, &params->common))
 			continue;
 
-		c = getopt_long(argc, argv, "a:b:E:hH:p:r:s:S:t::T:01234:5:6:7:",
+		c = getopt_long(argc, argv, "a:b:E:hp:r:s:S:t::T:01234:5:6:7:",
 				 long_options, NULL);
 
 		/* detect the end of the options. */
@@ -544,12 +543,6 @@ static struct common_params
 		case '?':
 			osnoise_hist_usage();
 			break;
-		case 'H':
-			params->common.hk_cpus = 1;
-			retval = parse_cpu_set(optarg, &params->common.hk_cpu_set);
-			if (retval)
-				fatal("Error parsing house keeping CPUs");
-			break;
 		case 'p':
 			params->period = get_llong_from_str(optarg);
 			if (params->period > 10000000)
diff --git a/tools/tracing/rtla/src/osnoise_top.c b/tools/tracing/rtla/src/osnoise_top.c
index 7ac992ec7439..d54d47947fb4 100644
--- a/tools/tracing/rtla/src/osnoise_top.c
+++ b/tools/tracing/rtla/src/osnoise_top.c
@@ -338,7 +338,6 @@ struct common_params *osnoise_top_parse_args(int argc, char **argv)
 	while (1) {
 		static struct option long_options[] = {
 			{"auto",		required_argument,	0, 'a'},
-			{"house-keeping",	required_argument,	0, 'H'},
 			{"help",		no_argument,		0, 'h'},
 			{"period",		required_argument,	0, 'p'},
 			{"quiet",		no_argument,		0, 'q'},
@@ -359,7 +358,7 @@ struct common_params *osnoise_top_parse_args(int argc, char **argv)
 		if (common_parse_options(argc, argv, &params->common))
 			continue;
 
-		c = getopt_long(argc, argv, "a:hH:p:qr:s:S:t::T:0:1:2:3:",
+		c = getopt_long(argc, argv, "a:hp:qr:s:S:t::T:0:1:2:3:",
 				 long_options, NULL);
 
 		/* Detect the end of the options. */
@@ -383,12 +382,6 @@ struct common_params *osnoise_top_parse_args(int argc, char **argv)
 		case '?':
 			osnoise_top_usage(params);
 			break;
-		case 'H':
-			params->common.hk_cpus = 1;
-			retval = parse_cpu_set(optarg, &params->common.hk_cpu_set);
-			if (retval)
-				fatal("Error parsing house keeping CPUs");
-			break;
 		case 'p':
 			params->period = get_llong_from_str(optarg);
 			if (params->period > 10000000)
diff --git a/tools/tracing/rtla/src/timerlat_hist.c b/tools/tracing/rtla/src/timerlat_hist.c
index 99b416ccfc5b..4e8c38a61197 100644
--- a/tools/tracing/rtla/src/timerlat_hist.c
+++ b/tools/tracing/rtla/src/timerlat_hist.c
@@ -792,7 +792,6 @@ static struct common_params
 			{"auto",		required_argument,	0, 'a'},
 			{"bucket-size",		required_argument,	0, 'b'},
 			{"entries",		required_argument,	0, 'E'},
-			{"house-keeping",	required_argument,	0, 'H'},
 			{"help",		no_argument,		0, 'h'},
 			{"irq",			required_argument,	0, 'i'},
 			{"nano",		no_argument,		0, 'n'},
@@ -826,7 +825,7 @@ static struct common_params
 		if (common_parse_options(argc, argv, &params->common))
 			continue;
 
-		c = getopt_long(argc, argv, "a:b:E:hH:i:knp:s:t::T:uU0123456:7:8:9\1\2:\3:",
+		c = getopt_long(argc, argv, "a:b:E:hi:knp:s:t::T:uU0123456:7:8:9\1\2:\3:",
 				 long_options, NULL);
 
 		/* detect the end of the options. */
@@ -865,12 +864,6 @@ static struct common_params
 		case '?':
 			timerlat_hist_usage();
 			break;
-		case 'H':
-			params->common.hk_cpus = 1;
-			retval = parse_cpu_set(optarg, &params->common.hk_cpu_set);
-			if (retval)
-				fatal("Error parsing house keeping CPUs");
-			break;
 		case 'i':
 			params->common.stop_us = get_llong_from_str(optarg);
 			break;
diff --git a/tools/tracing/rtla/src/timerlat_top.c b/tools/tracing/rtla/src/timerlat_top.c
index 027aad1b639f..f5a809344913 100644
--- a/tools/tracing/rtla/src/timerlat_top.c
+++ b/tools/tracing/rtla/src/timerlat_top.c
@@ -561,7 +561,6 @@ static struct common_params
 		static struct option long_options[] = {
 			{"auto",		required_argument,	0, 'a'},
 			{"help",		no_argument,		0, 'h'},
-			{"house-keeping",	required_argument,	0, 'H'},
 			{"irq",			required_argument,	0, 'i'},
 			{"nano",		no_argument,		0, 'n'},
 			{"period",		required_argument,	0, 'p'},
@@ -590,7 +589,7 @@ static struct common_params
 		if (common_parse_options(argc, argv, &params->common))
 			continue;
 
-		c = getopt_long(argc, argv, "a:hH:i:knp:qs:t::T:uU0:1:2:345:6:7:",
+		c = getopt_long(argc, argv, "a:hi:knp:qs:t::T:uU0:1:2:345:6:7:",
 				 long_options, NULL);
 
 		/* detect the end of the options. */
@@ -631,12 +630,6 @@ static struct common_params
 		case '?':
 			timerlat_top_usage();
 			break;
-		case 'H':
-			params->common.hk_cpus = 1;
-			retval = parse_cpu_set(optarg, &params->common.hk_cpu_set);
-			if (retval)
-				fatal("Error parsing house keeping CPUs");
-			break;
 		case 'i':
 			params->common.stop_us = get_llong_from_str(optarg);
 			break;
-- 
cgit v1.2.3


From 2a3a25336b1ba632a0a98249a7d4bbee454065aa Mon Sep 17 00:00:00 2001
From: Costa Shulyupin <costa.shul@redhat.com>
Date: Wed, 24 Dec 2025 14:50:56 +0200
Subject: tools/rtla: Deduplicate cgroup path opening code

Both set_pid_cgroup() and set_comm_cgroup() functions contain
identical code for opening the cgroup.procs file.

Extract this common code into a new helper function open_cgroup_procs()
to reduce code duplication and improve maintainability.

Signed-off-by: Costa Shulyupin <costa.shul@redhat.com>
Link: https://lore.kernel.org/r/20251224125058.1771519-1-costa.shul@redhat.com
Signed-off-by: Tomas Glozar <tglozar@redhat.com>
---
 tools/tracing/rtla/src/utils.c | 65 +++++++++++++++++++++---------------------
 1 file changed, 32 insertions(+), 33 deletions(-)

(limited to 'tools')

diff --git a/tools/tracing/rtla/src/utils.c b/tools/tracing/rtla/src/utils.c
index 9cf5a0098e9a..0b84e02b13df 100644
--- a/tools/tracing/rtla/src/utils.c
+++ b/tools/tracing/rtla/src/utils.c
@@ -784,27 +784,27 @@ static int get_self_cgroup(char *self_cg, int sizeof_self_cg)
 }
 
 /*
- * set_comm_cgroup - Set cgroup to pid_t pid
+ * open_cgroup_procs - Open the cgroup.procs file for the given cgroup
  *
- * If cgroup argument is not NULL, the threads will move to the given cgroup.
- * Otherwise, the cgroup of the calling, i.e., rtla, thread will be used.
+ * If cgroup argument is not NULL, the cgroup.procs file for that cgroup
+ * will be opened. Otherwise, the cgroup of the calling, i.e., rtla, thread
+ * will be used.
  *
  * Supports cgroup v2.
  *
- * Returns 1 on success, 0 otherwise.
+ * Returns the file descriptor on success, -1 otherwise.
  */
-int set_pid_cgroup(pid_t pid, const char *cgroup)
+static int open_cgroup_procs(const char *cgroup)
 {
 	char cgroup_path[MAX_PATH - strlen("/cgroup.procs")];
 	char cgroup_procs[MAX_PATH];
-	char pid_str[24];
 	int retval;
 	int cg_fd;
 
 	retval = find_mount("cgroup2", cgroup_path, sizeof(cgroup_path));
 	if (!retval) {
 		err_msg("Did not find cgroupv2 mount point\n");
-		return 0;
+		return -1;
 	}
 
 	if (!cgroup) {
@@ -812,7 +812,7 @@ int set_pid_cgroup(pid_t pid, const char *cgroup)
 				sizeof(cgroup_path) - strlen(cgroup_path));
 		if (!retval) {
 			err_msg("Did not find self cgroup\n");
-			return 0;
+			return -1;
 		}
 	} else {
 		snprintf(&cgroup_path[strlen(cgroup_path)],
@@ -824,6 +824,29 @@ int set_pid_cgroup(pid_t pid, const char *cgroup)
 	debug_msg("Using cgroup path at: %s\n", cgroup_procs);
 
 	cg_fd = open(cgroup_procs, O_RDWR);
+	if (cg_fd < 0)
+		return -1;
+
+	return cg_fd;
+}
+
+/*
+ * set_pid_cgroup - Set cgroup to pid_t pid
+ *
+ * If cgroup argument is not NULL, the threads will move to the given cgroup.
+ * Otherwise, the cgroup of the calling, i.e., rtla, thread will be used.
+ *
+ * Supports cgroup v2.
+ *
+ * Returns 1 on success, 0 otherwise.
+ */
+int set_pid_cgroup(pid_t pid, const char *cgroup)
+{
+	char pid_str[24];
+	int retval;
+	int cg_fd;
+
+	cg_fd = open_cgroup_procs(cgroup);
 	if (cg_fd < 0)
 		return 0;
 
@@ -853,8 +876,6 @@ int set_pid_cgroup(pid_t pid, const char *cgroup)
  */
 int set_comm_cgroup(const char *comm_prefix, const char *cgroup)
 {
-	char cgroup_path[MAX_PATH - strlen("/cgroup.procs")];
-	char cgroup_procs[MAX_PATH];
 	struct dirent *proc_entry;
 	DIR *procfs;
 	int retval;
@@ -866,29 +887,7 @@ int set_comm_cgroup(const char *comm_prefix, const char *cgroup)
 		return 0;
 	}
 
-	retval = find_mount("cgroup2", cgroup_path, sizeof(cgroup_path));
-	if (!retval) {
-		err_msg("Did not find cgroupv2 mount point\n");
-		return 0;
-	}
-
-	if (!cgroup) {
-		retval = get_self_cgroup(&cgroup_path[strlen(cgroup_path)],
-				sizeof(cgroup_path) - strlen(cgroup_path));
-		if (!retval) {
-			err_msg("Did not find self cgroup\n");
-			return 0;
-		}
-	} else {
-		snprintf(&cgroup_path[strlen(cgroup_path)],
-				sizeof(cgroup_path) - strlen(cgroup_path), "%s/", cgroup);
-	}
-
-	snprintf(cgroup_procs, MAX_PATH, "%s/cgroup.procs", cgroup_path);
-
-	debug_msg("Using cgroup path at: %s\n", cgroup_procs);
-
-	cg_fd = open(cgroup_procs, O_RDWR);
+	cg_fd = open_cgroup_procs(cgroup);
 	if (cg_fd < 0)
 		return 0;
 
-- 
cgit v1.2.3


From 648634d17c813b35da775982662e56ea8ce750de Mon Sep 17 00:00:00 2001
From: Wander Lairson Costa <wander@redhat.com>
Date: Tue, 6 Jan 2026 08:49:39 -0300
Subject: rtla: Introduce for_each_action() helper

The for loop to iterate over the list of actions is used in
more than one place. To avoid code duplication and improve
readability, introduce a for_each_action() helper macro.

Replace the open-coded for loops with the new helper.

Signed-off-by: Wander Lairson Costa <wander@redhat.com>
Link: https://lore.kernel.org/r/20260106133655.249887-4-wander@redhat.com
Signed-off-by: Tomas Glozar <tglozar@redhat.com>
---
 tools/tracing/rtla/src/actions.c | 6 ++++--
 tools/tracing/rtla/src/actions.h | 5 +++++
 2 files changed, 9 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/tracing/rtla/src/actions.c b/tools/tracing/rtla/src/actions.c
index 8945aee58d51..31bc98db9228 100644
--- a/tools/tracing/rtla/src/actions.c
+++ b/tools/tracing/rtla/src/actions.c
@@ -32,7 +32,9 @@ void
 actions_destroy(struct actions *self)
 {
 	/* Free any action-specific data */
-	for (struct action *action = self->list; action < self->list + self->len; action++) {
+	struct action *action;
+
+	for_each_action(self, action) {
 		if (action->type == ACTION_SHELL)
 			free(action->command);
 		if (action->type == ACTION_TRACE_OUTPUT)
@@ -223,7 +225,7 @@ actions_perform(struct actions *self)
 	int pid, retval;
 	const struct action *action;
 
-	for (action = self->list; action < self->list + self->len; action++) {
+	for_each_action(self, action) {
 		switch (action->type) {
 		case ACTION_TRACE_OUTPUT:
 			retval = save_trace_to_file(self->trace_output_inst, action->trace_output);
diff --git a/tools/tracing/rtla/src/actions.h b/tools/tracing/rtla/src/actions.h
index a4f9b570775b..fb77069c972b 100644
--- a/tools/tracing/rtla/src/actions.h
+++ b/tools/tracing/rtla/src/actions.h
@@ -42,6 +42,11 @@ struct actions {
 	struct tracefs_instance *trace_output_inst;
 };
 
+#define for_each_action(actions, action)			\
+	for ((action) = (actions)->list;			\
+	     (action) < (actions)->list + (actions)->len;	\
+	     (action)++)
+
 void actions_init(struct actions *self);
 void actions_destroy(struct actions *self);
 int actions_add_trace_output(struct actions *self, const char *trace_output);
-- 
cgit v1.2.3


From 7e9dfccf8f11c26208211457c4597a466135b56a Mon Sep 17 00:00:00 2001
From: Wander Lairson Costa <wander@redhat.com>
Date: Tue, 6 Jan 2026 08:49:40 -0300
Subject: rtla: Replace atoi() with a robust strtoi()

The atoi() function does not perform error checking, which can lead to
undefined behavior when parsing invalid or out-of-range strings. This
can cause issues when parsing user-provided numerical inputs, such as
signal numbers, PIDs, or CPU lists.

To address this, introduce a new strtoi() helper function that safely
converts a string to an integer. This function validates the input and
checks for overflows, returning a negative value on  failure.

Replace all calls to atoi() with the new strtoi() function and add
proper error handling to make the parsing more robust and prevent
potential issues.

Signed-off-by: Wander Lairson Costa <wander@redhat.com>
Link: https://lore.kernel.org/r/20260106133655.249887-5-wander@redhat.com
Signed-off-by: Tomas Glozar <tglozar@redhat.com>
---
 tools/tracing/rtla/src/actions.c |  7 ++++---
 tools/tracing/rtla/src/utils.c   | 40 +++++++++++++++++++++++++++++++++++-----
 tools/tracing/rtla/src/utils.h   |  2 ++
 3 files changed, 41 insertions(+), 8 deletions(-)

(limited to 'tools')

diff --git a/tools/tracing/rtla/src/actions.c b/tools/tracing/rtla/src/actions.c
index 31bc98db9228..ace9965ebd55 100644
--- a/tools/tracing/rtla/src/actions.c
+++ b/tools/tracing/rtla/src/actions.c
@@ -181,12 +181,13 @@ actions_parse(struct actions *self, const char *trigger, const char *tracefn)
 		/* Takes two arguments, num (signal) and pid */
 		while (token != NULL) {
 			if (strlen(token) > 4 && strncmp(token, "num=", 4) == 0) {
-				signal = atoi(token + 4);
+				if (strtoi(token + 4, &signal))
+					return -1;
 			} else if (strlen(token) > 4 && strncmp(token, "pid=", 4) == 0) {
 				if (strncmp(token + 4, "parent", 7) == 0)
 					pid = -1;
-				else
-					pid = atoi(token + 4);
+				else if (strtoi(token + 4, &pid))
+					return -1;
 			} else {
 				/* Invalid argument */
 				return -1;
diff --git a/tools/tracing/rtla/src/utils.c b/tools/tracing/rtla/src/utils.c
index 0b84e02b13df..748b86e6c2cc 100644
--- a/tools/tracing/rtla/src/utils.c
+++ b/tools/tracing/rtla/src/utils.c
@@ -17,6 +17,7 @@
 #include <fcntl.h>
 #include <sched.h>
 #include <stdio.h>
+#include <limits.h>
 
 #include "utils.h"
 
@@ -127,16 +128,18 @@ int parse_cpu_set(char *cpu_list, cpu_set_t *set)
 	nr_cpus = sysconf(_SC_NPROCESSORS_CONF);
 
 	for (p = cpu_list; *p; ) {
-		cpu = atoi(p);
-		if (cpu < 0 || (!cpu && *p != '0') || cpu >= nr_cpus)
+		if (strtoi(p, &cpu))
+			goto err;
+		if (cpu < 0 || cpu >= nr_cpus)
 			goto err;
 
 		while (isdigit(*p))
 			p++;
 		if (*p == '-') {
 			p++;
-			end_cpu = atoi(p);
-			if (end_cpu < cpu || (!end_cpu && *p != '0') || end_cpu >= nr_cpus)
+			if (strtoi(p, &end_cpu))
+				goto err;
+			if (end_cpu < cpu || end_cpu >= nr_cpus)
 				goto err;
 			while (isdigit(*p))
 				p++;
@@ -337,6 +340,7 @@ int set_comm_sched_attr(const char *comm_prefix, struct sched_attr *attr)
 	struct dirent *proc_entry;
 	DIR *procfs;
 	int retval;
+	int pid;
 
 	if (strlen(comm_prefix) >= MAX_PATH) {
 		err_msg("Command prefix is too long: %d < strlen(%s)\n",
@@ -356,8 +360,12 @@ int set_comm_sched_attr(const char *comm_prefix, struct sched_attr *attr)
 		if (!retval)
 			continue;
 
+		if (strtoi(proc_entry->d_name, &pid)) {
+			err_msg("'%s' is not a valid pid", proc_entry->d_name);
+			goto out_err;
+		}
 		/* procfs_is_workload_pid confirmed it is a pid */
-		retval = __set_sched_attr(atoi(proc_entry->d_name), attr);
+		retval = __set_sched_attr(pid, attr);
 		if (retval) {
 			err_msg("Error setting sched attributes for pid:%s\n", proc_entry->d_name);
 			goto out_err;
@@ -999,3 +1007,25 @@ char *parse_optional_arg(int argc, char **argv)
 		return NULL;
 	}
 }
+
+/*
+ * strtoi - convert string to integer with error checking
+ *
+ * Returns 0 on success, -1 if conversion fails or result is out of int range.
+ */
+int strtoi(const char *s, int *res)
+{
+	char *end_ptr;
+	long lres;
+
+	if (!*s)
+		return -1;
+
+	errno = 0;
+	lres = strtol(s, &end_ptr, 0);
+	if (errno || *end_ptr || lres > INT_MAX || lres < INT_MIN)
+		return -1;
+
+	*res = (int) lres;
+	return 0;
+}
diff --git a/tools/tracing/rtla/src/utils.h b/tools/tracing/rtla/src/utils.h
index ed7618842e82..974f7e0188c0 100644
--- a/tools/tracing/rtla/src/utils.h
+++ b/tools/tracing/rtla/src/utils.h
@@ -3,6 +3,7 @@
 #include <stdint.h>
 #include <time.h>
 #include <sched.h>
+#include <stdbool.h>
 
 /*
  * '18446744073709551615\0'
@@ -81,6 +82,7 @@ static inline int set_deepest_cpu_idle_state(unsigned int cpu, unsigned int stat
 static inline int have_libcpupower_support(void) { return 0; }
 #endif /* HAVE_LIBCPUPOWER_SUPPORT */
 int auto_house_keeping(cpu_set_t *monitored_cpus);
+__attribute__((__warn_unused_result__)) int strtoi(const char *s, int *res);
 
 #define ns_to_usf(x) (((double)x/1000))
 #define ns_to_per(total, part) ((part * 100) / (double)total)
-- 
cgit v1.2.3


From 9bf942f3c370c9b3af639df04cb5f34daf512dab Mon Sep 17 00:00:00 2001
From: Wander Lairson Costa <wander@redhat.com>
Date: Tue, 6 Jan 2026 08:49:44 -0300
Subject: rtla: Use standard exit codes for result enum

The result enum defines custom values for PASSED, ERROR, and FAILED.
These values correspond to standard exit codes EXIT_SUCCESS and
EXIT_FAILURE.

Update the enum to use the standard macros EXIT_SUCCESS and
EXIT_FAILURE to improve readability and adherence to standard C
practices.

The FAILED value is implicitly assigned EXIT_FAILURE + 1, so there
is no need to assign an explicit value.

Signed-off-by: Wander Lairson Costa <wander@redhat.com>
Link: https://lore.kernel.org/r/20260106133655.249887-9-wander@redhat.com
Signed-off-by: Tomas Glozar <tglozar@redhat.com>
---
 tools/tracing/rtla/src/utils.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/tracing/rtla/src/utils.h b/tools/tracing/rtla/src/utils.h
index 974f7e0188c0..f7c2a52a0ab5 100644
--- a/tools/tracing/rtla/src/utils.h
+++ b/tools/tracing/rtla/src/utils.h
@@ -4,6 +4,7 @@
 #include <time.h>
 #include <sched.h>
 #include <stdbool.h>
+#include <stdlib.h>
 
 /*
  * '18446744073709551615\0'
@@ -88,7 +89,7 @@ __attribute__((__warn_unused_result__)) int strtoi(const char *s, int *res);
 #define ns_to_per(total, part) ((part * 100) / (double)total)
 
 enum result {
-	PASSED = 0, /* same as EXIT_SUCCESS */
-	ERROR = 1,  /* same as EXIT_FAILURE, an error in arguments */
-	FAILED = 2, /* test hit the stop tracing condition */
+	PASSED	= EXIT_SUCCESS,
+	ERROR	= EXIT_FAILURE,
+	FAILED, /* test hit the stop tracing condition */
 };
-- 
cgit v1.2.3


From d849f3af1cc7a53e3b150a9bbade8f9629445b36 Mon Sep 17 00:00:00 2001
From: Wander Lairson Costa <wander@redhat.com>
Date: Tue, 6 Jan 2026 08:49:45 -0300
Subject: rtla: Remove redundant memset after calloc

The actions struct is allocated using calloc, which already returns
zeroed memory. The subsequent memset call to zero the 'present' member
is therefore redundant.

Signed-off-by: Wander Lairson Costa <wander@redhat.com>
Link: https://lore.kernel.org/r/20260106133655.249887-10-wander@redhat.com
Signed-off-by: Tomas Glozar <tglozar@redhat.com>
---
 tools/tracing/rtla/src/actions.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'tools')

diff --git a/tools/tracing/rtla/src/actions.c b/tools/tracing/rtla/src/actions.c
index ace9965ebd55..d9c1db5d97d4 100644
--- a/tools/tracing/rtla/src/actions.c
+++ b/tools/tracing/rtla/src/actions.c
@@ -19,8 +19,6 @@ actions_init(struct actions *self)
 	self->len = 0;
 	self->continue_flag = false;
 
-	memset(&self->present, 0, sizeof(self->present));
-
 	/* This has to be set by the user */
 	self->trace_output_inst = NULL;
 }
-- 
cgit v1.2.3


From f3cc3e4b5116929ebff27c3b0a565b34ae4969b3 Mon Sep 17 00:00:00 2001
From: Wander Lairson Costa <wander@redhat.com>
Date: Tue, 6 Jan 2026 08:49:47 -0300
Subject: rtla: Remove unused headers

Remove unused includes for <errno.h> and <signal.h> to clean up the
code and reduce unnecessary dependencies.

Signed-off-by: Wander Lairson Costa <wander@redhat.com>
Link: https://lore.kernel.org/r/20260106133655.249887-12-wander@redhat.com
Signed-off-by: Tomas Glozar <tglozar@redhat.com>
---
 tools/tracing/rtla/src/osnoise_hist.c | 1 -
 tools/tracing/rtla/src/timerlat.c     | 1 -
 tools/tracing/rtla/src/timerlat_top.c | 1 -
 tools/tracing/rtla/src/trace.c        | 1 -
 4 files changed, 4 deletions(-)

(limited to 'tools')

diff --git a/tools/tracing/rtla/src/osnoise_hist.c b/tools/tracing/rtla/src/osnoise_hist.c
index 705c73d55102..9d70ea34807f 100644
--- a/tools/tracing/rtla/src/osnoise_hist.c
+++ b/tools/tracing/rtla/src/osnoise_hist.c
@@ -9,7 +9,6 @@
 #include <string.h>
 #include <signal.h>
 #include <unistd.h>
-#include <errno.h>
 #include <stdio.h>
 #include <time.h>
 
diff --git a/tools/tracing/rtla/src/timerlat.c b/tools/tracing/rtla/src/timerlat.c
index 8f6cf55f4a94..8f8811f7a13b 100644
--- a/tools/tracing/rtla/src/timerlat.c
+++ b/tools/tracing/rtla/src/timerlat.c
@@ -9,7 +9,6 @@
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
-#include <errno.h>
 #include <fcntl.h>
 #include <stdio.h>
 #include <sched.h>
diff --git a/tools/tracing/rtla/src/timerlat_top.c b/tools/tracing/rtla/src/timerlat_top.c
index f5a809344913..284b74773c2b 100644
--- a/tools/tracing/rtla/src/timerlat_top.c
+++ b/tools/tracing/rtla/src/timerlat_top.c
@@ -11,7 +11,6 @@
 #include <unistd.h>
 #include <stdio.h>
 #include <time.h>
-#include <errno.h>
 #include <sched.h>
 #include <pthread.h>
 
diff --git a/tools/tracing/rtla/src/trace.c b/tools/tracing/rtla/src/trace.c
index 69cbc48d53d3..b8be3e28680e 100644
--- a/tools/tracing/rtla/src/trace.c
+++ b/tools/tracing/rtla/src/trace.c
@@ -2,7 +2,6 @@
 #define _GNU_SOURCE
 #include <sys/sendfile.h>
 #include <tracefs.h>
-#include <signal.h>
 #include <stdlib.h>
 #include <unistd.h>
 #include <errno.h>
-- 
cgit v1.2.3


From a0890f9dbd24b302d327fe7dad9b9c5be0e278aa Mon Sep 17 00:00:00 2001
From: Wander Lairson Costa <wander@redhat.com>
Date: Tue, 6 Jan 2026 08:49:48 -0300
Subject: rtla: Fix NULL pointer dereference in actions_parse

The actions_parse() function uses strtok() to tokenize the trigger
string, but does not check if the returned token is NULL before
passing it to strcmp(). If the trigger parameter is an empty string
or contains only delimiter characters, strtok() returns NULL, causing
strcmp() to dereference a NULL pointer and crash the program.

This issue can be triggered by malformed user input or edge cases in
trigger string parsing. Add a NULL check immediately after the strtok()
call to validate that a token was successfully extracted before using
it. If no token is found, the function now returns -1 to indicate a
parsing error.

Signed-off-by: Wander Lairson Costa <wander@redhat.com>
Link: https://lore.kernel.org/r/20260106133655.249887-13-wander@redhat.com
Signed-off-by: Tomas Glozar <tglozar@redhat.com>
---
 tools/tracing/rtla/src/actions.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'tools')

diff --git a/tools/tracing/rtla/src/actions.c b/tools/tracing/rtla/src/actions.c
index d9c1db5d97d4..a42615011962 100644
--- a/tools/tracing/rtla/src/actions.c
+++ b/tools/tracing/rtla/src/actions.c
@@ -141,6 +141,8 @@ actions_parse(struct actions *self, const char *trigger, const char *tracefn)
 
 	strcpy(trigger_c, trigger);
 	token = strtok(trigger_c, ",");
+	if (!token)
+		return -1;
 
 	if (strcmp(token, "trace") == 0)
 		type = ACTION_TRACE_OUTPUT;
-- 
cgit v1.2.3


From 02689ae385c5e84874620947ac010cf7b4950375 Mon Sep 17 00:00:00 2001
From: Wander Lairson Costa <wander@redhat.com>
Date: Tue, 6 Jan 2026 08:49:50 -0300
Subject: rtla: Add generated output files to gitignore

The rtla tool generates various output files during testing and
execution, including custom trace outputs and histogram data. These
files are artifacts of running the tool with different options and
should not be tracked in version control.

Add gitignore entries for custom_filename.txt, osnoise_irq_noise_hist.txt,
osnoise_trace.txt, and timerlat_trace.txt to prevent accidentally
committing these generated files. This aligns with the existing pattern
of ignoring build artifacts and generated headers like *.skel.h.

Signed-off-by: Wander Lairson Costa <wander@redhat.com>
Link: https://lore.kernel.org/r/20260106133655.249887-15-wander@redhat.com
Signed-off-by: Tomas Glozar <tglozar@redhat.com>
---
 tools/tracing/rtla/.gitignore | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'tools')

diff --git a/tools/tracing/rtla/.gitignore b/tools/tracing/rtla/.gitignore
index 1a394ad26cc1..4d39d64ac08c 100644
--- a/tools/tracing/rtla/.gitignore
+++ b/tools/tracing/rtla/.gitignore
@@ -5,3 +5,7 @@ fixdep
 feature
 FEATURE-DUMP
 *.skel.h
+custom_filename.txt
+osnoise_irq_noise_hist.txt
+osnoise_trace.txt
+timerlat_trace.txt
-- 
cgit v1.2.3


From af2962d68b970b15d8910be2b0386b4f147ed78b Mon Sep 17 00:00:00 2001
From: Wander Lairson Costa <wander@redhat.com>
Date: Tue, 6 Jan 2026 08:49:51 -0300
Subject: rtla: Make stop_tracing variable volatile

The stop_tracing global variable is accessed from both the signal
handler context and the main program flow without synchronization.
This creates a potential race condition where compiler optimizations
could cache the variable value in registers, preventing the signal
handler's updates from being visible to other parts of the program.

Add the volatile qualifier to stop_tracing in both common.c and
common.h to ensure all accesses to this variable bypass compiler
optimizations and read directly from memory. This guarantees that
when the signal handler sets stop_tracing, the change is immediately
visible to the main program loop, preventing potential hangs or
delayed shutdown when termination signals are received.

Signed-off-by: Wander Lairson Costa <wander@redhat.com>
Link: https://lore.kernel.org/r/20260106133655.249887-16-wander@redhat.com
Signed-off-by: Tomas Glozar <tglozar@redhat.com>
---
 tools/tracing/rtla/src/common.c | 2 +-
 tools/tracing/rtla/src/common.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/tracing/rtla/src/common.c b/tools/tracing/rtla/src/common.c
index 6f64c1fc1b62..ceff76a62a30 100644
--- a/tools/tracing/rtla/src/common.c
+++ b/tools/tracing/rtla/src/common.c
@@ -10,7 +10,7 @@
 #include "common.h"
 
 struct trace_instance *trace_inst;
-int stop_tracing;
+volatile int stop_tracing;
 
 static void stop_trace(int sig)
 {
diff --git a/tools/tracing/rtla/src/common.h b/tools/tracing/rtla/src/common.h
index ef17ea5be540..7602c5593ef5 100644
--- a/tools/tracing/rtla/src/common.h
+++ b/tools/tracing/rtla/src/common.h
@@ -54,7 +54,7 @@ struct osnoise_context {
 };
 
 extern struct trace_instance *trace_inst;
-extern int stop_tracing;
+extern volatile int stop_tracing;
 
 struct hist_params {
 	char			no_irq;
-- 
cgit v1.2.3


From 33e3c807ab22bd4002640c8fe47fa30fd4f44ca0 Mon Sep 17 00:00:00 2001
From: Wander Lairson Costa <wander@redhat.com>
Date: Tue, 6 Jan 2026 08:49:52 -0300
Subject: rtla: Ensure null termination after read operations in utils.c

Add explicit null termination and buffer initialization for read()
operations in procfs_is_workload_pid() and get_self_cgroup() functions.
The read() system call does not null-terminate the data it reads, and
when the buffer is filled to capacity, subsequent string operations
will read past the buffer boundary searching for a null terminator.

In procfs_is_workload_pid(), explicitly set buffer[MAX_PATH-1] to '\0'
to ensure the buffer is always null-terminated before passing it to
strncmp(). In get_self_cgroup(), use memset() to zero the path buffer
before reading, which ensures null termination when retval is less than
MAX_PATH. Additionally, set path[MAX_PATH-1] to '\0' after the read to
handle the case where the buffer is filled completely.

These defensive buffer handling practices prevent potential buffer
overruns and align with the ongoing buffer safety improvements across
the rtla codebase.

Signed-off-by: Wander Lairson Costa <wander@redhat.com>
Link: https://lore.kernel.org/r/20260106133655.249887-17-wander@redhat.com
Signed-off-by: Tomas Glozar <tglozar@redhat.com>
---
 tools/tracing/rtla/src/utils.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'tools')

diff --git a/tools/tracing/rtla/src/utils.c b/tools/tracing/rtla/src/utils.c
index 748b86e6c2cc..5273745bc8df 100644
--- a/tools/tracing/rtla/src/utils.c
+++ b/tools/tracing/rtla/src/utils.c
@@ -317,6 +317,7 @@ static int procfs_is_workload_pid(const char *comm_prefix, struct dirent *proc_e
 	if (retval <= 0)
 		return 0;
 
+	buffer[MAX_PATH-1] = '\0';
 	retval = strncmp(comm_prefix, buffer, strlen(comm_prefix));
 	if (retval)
 		return 0;
@@ -750,6 +751,7 @@ static int get_self_cgroup(char *self_cg, int sizeof_self_cg)
 	if (fd < 0)
 		return 0;
 
+	memset(path, 0, sizeof(path));
 	retval = read(fd, path, MAX_PATH);
 
 	close(fd);
@@ -757,6 +759,7 @@ static int get_self_cgroup(char *self_cg, int sizeof_self_cg)
 	if (retval <= 0)
 		return 0;
 
+	path[MAX_PATH-1] = '\0';
 	start = path;
 
 	start = strstr(start, ":");
-- 
cgit v1.2.3


From fb8b8183208d8efe824e8d2c73fb1ab5ad1191fd Mon Sep 17 00:00:00 2001
From: Wander Lairson Costa <wander@redhat.com>
Date: Tue, 6 Jan 2026 08:49:53 -0300
Subject: rtla: Fix parse_cpu_set() return value documentation

Correct the return value documentation for parse_cpu_set() function
in utils.c. The comment incorrectly stated that the function returns
1 on success and 0 on failure, but the actual implementation returns
0 on success and 1 on failure, following the common error-on-nonzero
convention used throughout the codebase.

This documentation fix ensures that developers reading the code
understand the correct return value semantics and prevents potential
misuse of the function's return value in conditional checks.

Signed-off-by: Wander Lairson Costa <wander@redhat.com>
Link: https://lore.kernel.org/r/20260106133655.249887-18-wander@redhat.com
Signed-off-by: Tomas Glozar <tglozar@redhat.com>
---
 tools/tracing/rtla/src/utils.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/tracing/rtla/src/utils.c b/tools/tracing/rtla/src/utils.c
index 5273745bc8df..18986a5aed3c 100644
--- a/tools/tracing/rtla/src/utils.c
+++ b/tools/tracing/rtla/src/utils.c
@@ -113,7 +113,7 @@ void get_duration(time_t start_time, char *output, int output_size)
  * Receives a cpu list, like 1-3,5 (cpus 1, 2, 3, 5), and then set
  * filling cpu_set_t argument.
  *
- * Returns 1 on success, 0 otherwise.
+ * Returns 0 on success, 1 otherwise.
  */
 int parse_cpu_set(char *cpu_list, cpu_set_t *set)
 {
-- 
cgit v1.2.3


From 1cabad3a00ab2e3d6bf19c5ab8fc9212d0b81e18 Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
Date: Wed, 7 Jan 2026 09:59:33 +0800
Subject: kunit: tool: test: Rename test_data_path() to _test_data_path()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Running the KUnit testsuite through pytest fails, as the function
test_data_path() is recognized as a test function. Its execution fails
as pytest tries to resolve the 'path' argument as a fixture which does
not exist.

Rename the function, so the helper function is not incorrectly
recognized as a test function.

Link: https://lore.kernel.org/r/20260107015936.2316047-1-davidgow@google.com
Signed-off-by: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
Reviewed-by: David Gow <davidgow@google.com>
Signed-off-by: David Gow <davidgow@google.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/kunit/kunit_tool_test.py | 56 +++++++++++++++++-----------------
 1 file changed, 28 insertions(+), 28 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/kunit/kunit_tool_test.py b/tools/testing/kunit/kunit_tool_test.py
index bdc51b5c7b10..30ac1cb6c8ed 100755
--- a/tools/testing/kunit/kunit_tool_test.py
+++ b/tools/testing/kunit/kunit_tool_test.py
@@ -36,7 +36,7 @@ def setUpModule():
 def tearDownModule():
 	shutil.rmtree(test_tmpdir)
 
-def test_data_path(path):
+def _test_data_path(path):
 	return os.path.join(abs_test_data_dir, path)
 
 class KconfigTest(unittest.TestCase):
@@ -52,7 +52,7 @@ class KconfigTest(unittest.TestCase):
 		self.assertFalse(kconfig1.is_subset_of(kconfig0))
 
 	def test_read_from_file(self):
-		kconfig_path = test_data_path('test_read_from_file.kconfig')
+		kconfig_path = _test_data_path('test_read_from_file.kconfig')
 
 		kconfig = kunit_config.parse_file(kconfig_path)
 
@@ -98,7 +98,7 @@ class KUnitParserTest(unittest.TestCase):
 		raise AssertionError(f'"{needle}" not found in {list(backup)}!')
 
 	def test_output_isolated_correctly(self):
-		log_path = test_data_path('test_output_isolated_correctly.log')
+		log_path = _test_data_path('test_output_isolated_correctly.log')
 		with open(log_path) as file:
 			result = kunit_parser.extract_tap_lines(file.readlines())
 		self.assertContains('TAP version 14', result)
@@ -109,7 +109,7 @@ class KUnitParserTest(unittest.TestCase):
 		self.assertContains('ok 1 - example', result)
 
 	def test_output_with_prefix_isolated_correctly(self):
-		log_path = test_data_path('test_pound_sign.log')
+		log_path = _test_data_path('test_pound_sign.log')
 		with open(log_path) as file:
 			result = kunit_parser.extract_tap_lines(file.readlines())
 		self.assertContains('TAP version 14', result)
@@ -138,35 +138,35 @@ class KUnitParserTest(unittest.TestCase):
 		self.assertContains('ok 3 - string-stream-test', result)
 
 	def test_parse_successful_test_log(self):
-		all_passed_log = test_data_path('test_is_test_passed-all_passed.log')
+		all_passed_log = _test_data_path('test_is_test_passed-all_passed.log')
 		with open(all_passed_log) as file:
 			result = kunit_parser.parse_run_tests(file.readlines(), stdout)
 		self.assertEqual(kunit_parser.TestStatus.SUCCESS, result.status)
 		self.assertEqual(result.counts.errors, 0)
 
 	def test_parse_successful_nested_tests_log(self):
-		all_passed_log = test_data_path('test_is_test_passed-all_passed_nested.log')
+		all_passed_log = _test_data_path('test_is_test_passed-all_passed_nested.log')
 		with open(all_passed_log) as file:
 			result = kunit_parser.parse_run_tests(file.readlines(), stdout)
 		self.assertEqual(kunit_parser.TestStatus.SUCCESS, result.status)
 		self.assertEqual(result.counts.errors, 0)
 
 	def test_kselftest_nested(self):
-		kselftest_log = test_data_path('test_is_test_passed-kselftest.log')
+		kselftest_log = _test_data_path('test_is_test_passed-kselftest.log')
 		with open(kselftest_log) as file:
 			result = kunit_parser.parse_run_tests(file.readlines(), stdout)
 		self.assertEqual(kunit_parser.TestStatus.SUCCESS, result.status)
 		self.assertEqual(result.counts.errors, 0)
 
 	def test_parse_failed_test_log(self):
-		failed_log = test_data_path('test_is_test_passed-failure.log')
+		failed_log = _test_data_path('test_is_test_passed-failure.log')
 		with open(failed_log) as file:
 			result = kunit_parser.parse_run_tests(file.readlines(), stdout)
 		self.assertEqual(kunit_parser.TestStatus.FAILURE, result.status)
 		self.assertEqual(result.counts.errors, 0)
 
 	def test_parse_failed_nested_tests_log(self):
-		nested_log = test_data_path('test_is_test_passed-failure-nested.log')
+		nested_log = _test_data_path('test_is_test_passed-failure-nested.log')
 		with open(nested_log) as file:
 			result = kunit_parser.parse_run_tests(file.readlines(), stdout)
 		self.assertEqual(kunit_parser.TestStatus.FAILURE, result.status)
@@ -177,7 +177,7 @@ class KUnitParserTest(unittest.TestCase):
 		self.assertEqual(kunit_parser.TestStatus.FAILURE, result.subtests[1].subtests[0].status)
 
 	def test_no_header(self):
-		empty_log = test_data_path('test_is_test_passed-no_tests_run_no_header.log')
+		empty_log = _test_data_path('test_is_test_passed-no_tests_run_no_header.log')
 		with open(empty_log) as file:
 			result = kunit_parser.parse_run_tests(
 				kunit_parser.extract_tap_lines(file.readlines()), stdout)
@@ -186,7 +186,7 @@ class KUnitParserTest(unittest.TestCase):
 		self.assertEqual(result.counts.errors, 1)
 
 	def test_missing_test_plan(self):
-		missing_plan_log = test_data_path('test_is_test_passed-'
+		missing_plan_log = _test_data_path('test_is_test_passed-'
 			'missing_plan.log')
 		with open(missing_plan_log) as file:
 			result = kunit_parser.parse_run_tests(
@@ -197,7 +197,7 @@ class KUnitParserTest(unittest.TestCase):
 		self.assertEqual(kunit_parser.TestStatus.SUCCESS, result.status)
 
 	def test_no_tests(self):
-		header_log = test_data_path('test_is_test_passed-no_tests_run_with_header.log')
+		header_log = _test_data_path('test_is_test_passed-no_tests_run_with_header.log')
 		with open(header_log) as file:
 			result = kunit_parser.parse_run_tests(
 				kunit_parser.extract_tap_lines(file.readlines()), stdout)
@@ -206,7 +206,7 @@ class KUnitParserTest(unittest.TestCase):
 		self.assertEqual(result.counts.errors, 1)
 
 	def test_no_tests_no_plan(self):
-		no_plan_log = test_data_path('test_is_test_passed-no_tests_no_plan.log')
+		no_plan_log = _test_data_path('test_is_test_passed-no_tests_no_plan.log')
 		with open(no_plan_log) as file:
 			result = kunit_parser.parse_run_tests(
 				kunit_parser.extract_tap_lines(file.readlines()), stdout)
@@ -218,7 +218,7 @@ class KUnitParserTest(unittest.TestCase):
 
 
 	def test_no_kunit_output(self):
-		crash_log = test_data_path('test_insufficient_memory.log')
+		crash_log = _test_data_path('test_insufficient_memory.log')
 		print_mock = mock.patch('kunit_printer.Printer.print').start()
 		with open(crash_log) as file:
 			result = kunit_parser.parse_run_tests(
@@ -229,7 +229,7 @@ class KUnitParserTest(unittest.TestCase):
 		self.assertEqual(result.counts.errors, 1)
 
 	def test_skipped_test(self):
-		skipped_log = test_data_path('test_skip_tests.log')
+		skipped_log = _test_data_path('test_skip_tests.log')
 		with open(skipped_log) as file:
 			result = kunit_parser.parse_run_tests(file.readlines(), stdout)
 
@@ -238,7 +238,7 @@ class KUnitParserTest(unittest.TestCase):
 		self.assertEqual(result.counts, kunit_parser.TestCounts(passed=4, skipped=1))
 
 	def test_skipped_all_tests(self):
-		skipped_log = test_data_path('test_skip_all_tests.log')
+		skipped_log = _test_data_path('test_skip_all_tests.log')
 		with open(skipped_log) as file:
 			result = kunit_parser.parse_run_tests(file.readlines(), stdout)
 
@@ -246,7 +246,7 @@ class KUnitParserTest(unittest.TestCase):
 		self.assertEqual(result.counts, kunit_parser.TestCounts(skipped=5))
 
 	def test_ignores_hyphen(self):
-		hyphen_log = test_data_path('test_strip_hyphen.log')
+		hyphen_log = _test_data_path('test_strip_hyphen.log')
 		with open(hyphen_log) as file:
 			result = kunit_parser.parse_run_tests(file.readlines(), stdout)
 
@@ -260,7 +260,7 @@ class KUnitParserTest(unittest.TestCase):
 			result.subtests[1].name)
 
 	def test_ignores_prefix_printk_time(self):
-		prefix_log = test_data_path('test_config_printk_time.log')
+		prefix_log = _test_data_path('test_config_printk_time.log')
 		with open(prefix_log) as file:
 			result = kunit_parser.parse_run_tests(file.readlines(), stdout)
 		self.assertEqual(kunit_parser.TestStatus.SUCCESS, result.status)
@@ -268,7 +268,7 @@ class KUnitParserTest(unittest.TestCase):
 		self.assertEqual(result.counts.errors, 0)
 
 	def test_ignores_multiple_prefixes(self):
-		prefix_log = test_data_path('test_multiple_prefixes.log')
+		prefix_log = _test_data_path('test_multiple_prefixes.log')
 		with open(prefix_log) as file:
 			result = kunit_parser.parse_run_tests(file.readlines(), stdout)
 		self.assertEqual(kunit_parser.TestStatus.SUCCESS, result.status)
@@ -276,7 +276,7 @@ class KUnitParserTest(unittest.TestCase):
 		self.assertEqual(result.counts.errors, 0)
 
 	def test_prefix_mixed_kernel_output(self):
-		mixed_prefix_log = test_data_path('test_interrupted_tap_output.log')
+		mixed_prefix_log = _test_data_path('test_interrupted_tap_output.log')
 		with open(mixed_prefix_log) as file:
 			result = kunit_parser.parse_run_tests(file.readlines(), stdout)
 		self.assertEqual(kunit_parser.TestStatus.SUCCESS, result.status)
@@ -284,7 +284,7 @@ class KUnitParserTest(unittest.TestCase):
 		self.assertEqual(result.counts.errors, 0)
 
 	def test_prefix_poundsign(self):
-		pound_log = test_data_path('test_pound_sign.log')
+		pound_log = _test_data_path('test_pound_sign.log')
 		with open(pound_log) as file:
 			result = kunit_parser.parse_run_tests(file.readlines(), stdout)
 		self.assertEqual(kunit_parser.TestStatus.SUCCESS, result.status)
@@ -292,7 +292,7 @@ class KUnitParserTest(unittest.TestCase):
 		self.assertEqual(result.counts.errors, 0)
 
 	def test_kernel_panic_end(self):
-		panic_log = test_data_path('test_kernel_panic_interrupt.log')
+		panic_log = _test_data_path('test_kernel_panic_interrupt.log')
 		with open(panic_log) as file:
 			result = kunit_parser.parse_run_tests(file.readlines(), stdout)
 		self.assertEqual(kunit_parser.TestStatus.TEST_CRASHED, result.status)
@@ -300,7 +300,7 @@ class KUnitParserTest(unittest.TestCase):
 		self.assertGreaterEqual(result.counts.errors, 1)
 
 	def test_pound_no_prefix(self):
-		pound_log = test_data_path('test_pound_no_prefix.log')
+		pound_log = _test_data_path('test_pound_no_prefix.log')
 		with open(pound_log) as file:
 			result = kunit_parser.parse_run_tests(file.readlines(), stdout)
 		self.assertEqual(kunit_parser.TestStatus.SUCCESS, result.status)
@@ -329,7 +329,7 @@ class KUnitParserTest(unittest.TestCase):
 			'Failures: all_failed_suite, some_failed_suite.test2')
 
 	def test_ktap_format(self):
-		ktap_log = test_data_path('test_parse_ktap_output.log')
+		ktap_log = _test_data_path('test_parse_ktap_output.log')
 		with open(ktap_log) as file:
 			result = kunit_parser.parse_run_tests(file.readlines(), stdout)
 		self.assertEqual(result.counts, kunit_parser.TestCounts(passed=3))
@@ -338,13 +338,13 @@ class KUnitParserTest(unittest.TestCase):
 		self.assertEqual('case_2', result.subtests[0].subtests[1].name)
 
 	def test_parse_subtest_header(self):
-		ktap_log = test_data_path('test_parse_subtest_header.log')
+		ktap_log = _test_data_path('test_parse_subtest_header.log')
 		with open(ktap_log) as file:
 			kunit_parser.parse_run_tests(file.readlines(), stdout)
 		self.print_mock.assert_any_call(StrContains('suite (1 subtest)'))
 
 	def test_parse_attributes(self):
-		ktap_log = test_data_path('test_parse_attributes.log')
+		ktap_log = _test_data_path('test_parse_attributes.log')
 		with open(ktap_log) as file:
 			result = kunit_parser.parse_run_tests(file.readlines(), stdout)
 
@@ -566,7 +566,7 @@ class KUnitJsonTest(unittest.TestCase):
 		self.addCleanup(mock.patch.stopall)
 
 	def _json_for(self, log_file):
-		with open(test_data_path(log_file)) as file:
+		with open(_test_data_path(log_file)) as file:
 			test_result = kunit_parser.parse_run_tests(file, stdout)
 			json_obj = kunit_json.get_json_result(
 				test=test_result,
@@ -607,7 +607,7 @@ class StrContains(str):
 
 class KUnitMainTest(unittest.TestCase):
 	def setUp(self):
-		path = test_data_path('test_is_test_passed-all_passed.log')
+		path = _test_data_path('test_is_test_passed-all_passed.log')
 		with open(path) as file:
 			all_passed_log = file.readlines()
 
-- 
cgit v1.2.3


From f126d688193b4dd6d0044c19771469724c03f8f8 Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
Date: Wed, 7 Jan 2026 09:59:34 +0800
Subject: kunit: tool: test: Don't rely on implicit working directory change
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

If no kunitconfig_paths are passed to LinuxSourceTree() it falls back to
DEFAULT_KUNITCONFIG_PATH. This resolution only works when the current
working directory is the root of the source tree. This works by chance
when running the full testsuite through the default unittest runner, as
some tests will change the current working directory as a side-effect of
'kunit.main()'. When running a single testcase or using pytest, which
resets the working directory for each test, this assumption breaks.

Explicitly specify an empty kunitconfig for the affected tests.

Link: https://lore.kernel.org/r/20260107015936.2316047-2-davidgow@google.com
Signed-off-by: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
Reviewed-by: David Gow <davidgow@google.com>
Signed-off-by: David Gow <davidgow@google.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/kunit/kunit_tool_test.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/kunit/kunit_tool_test.py b/tools/testing/kunit/kunit_tool_test.py
index 30ac1cb6c8ed..238a31a5cc29 100755
--- a/tools/testing/kunit/kunit_tool_test.py
+++ b/tools/testing/kunit/kunit_tool_test.py
@@ -477,7 +477,8 @@ class LinuxSourceTreeTest(unittest.TestCase):
 		want_kconfig = kunit_config.Kconfig()
 		want_kconfig.add_entry('NOT_REAL', 'y')
 
-		tree = kunit_kernel.LinuxSourceTree('', kconfig_add=['CONFIG_NOT_REAL=y'])
+		tree = kunit_kernel.LinuxSourceTree('', kunitconfig_paths=[os.devnull],
+						    kconfig_add=['CONFIG_NOT_REAL=y'])
 		self.assertTrue(want_kconfig.is_subset_of(tree._kconfig), msg=tree._kconfig)
 
 	def test_invalid_arch(self):
@@ -489,7 +490,7 @@ class LinuxSourceTreeTest(unittest.TestCase):
 			return subprocess.Popen(['echo "hi\nbye"'], shell=True, text=True, stdout=subprocess.PIPE)
 
 		with tempfile.TemporaryDirectory('') as build_dir:
-			tree = kunit_kernel.LinuxSourceTree(build_dir)
+			tree = kunit_kernel.LinuxSourceTree(build_dir, kunitconfig_paths=[os.devnull])
 			mock.patch.object(tree._ops, 'start', side_effect=fake_start).start()
 
 			with self.assertRaises(ValueError):
-- 
cgit v1.2.3


From 8190b9ea30fef5b9067825b91fb3ec6d678ee5e3 Mon Sep 17 00:00:00 2001
From: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Date: Thu, 18 Dec 2025 14:25:59 -0800
Subject: thermal: intel: selftests: workload_hint: Support slow workload hints

Add option to enable slow workload type hints. User can specify
"slow" as the command line argument to enable slow workload type hints.
There are two slow workload type hints: "power" and "performance".

Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Link: https://patch.msgid.link/20251218222559.4110027-3-srinivas.pandruvada@linux.intel.com
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 .../intel/workload_hint/workload_hint_test.c       | 74 +++++++++++++++-------
 1 file changed, 52 insertions(+), 22 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/thermal/intel/workload_hint/workload_hint_test.c b/tools/testing/selftests/thermal/intel/workload_hint/workload_hint_test.c
index ca2bd03154e4..569d44f22835 100644
--- a/tools/testing/selftests/thermal/intel/workload_hint/workload_hint_test.c
+++ b/tools/testing/selftests/thermal/intel/workload_hint/workload_hint_test.c
@@ -12,6 +12,7 @@
 
 #define WORKLOAD_NOTIFICATION_DELAY_ATTRIBUTE "/sys/bus/pci/devices/0000:00:04.0/workload_hint/notification_delay_ms"
 #define WORKLOAD_ENABLE_ATTRIBUTE "/sys/bus/pci/devices/0000:00:04.0/workload_hint/workload_hint_enable"
+#define WORKLOAD_SLOW_ENABLE_ATTRIBUTE "/sys/bus/pci/devices/0000:00:04.0/workload_hint/workload_slow_hint_enable"
 #define WORKLOAD_TYPE_INDEX_ATTRIBUTE  "/sys/bus/pci/devices/0000:00:04.0/workload_hint/workload_type_index"
 
 static const char * const workload_types[] = {
@@ -22,6 +23,9 @@ static const char * const workload_types[] = {
 	NULL
 };
 
+static int wlt_slow;
+static char *wlt_enable_attr;
+
 #define WORKLOAD_TYPE_MAX_INDEX	3
 
 void workload_hint_exit(int signum)
@@ -30,7 +34,7 @@ void workload_hint_exit(int signum)
 
 	/* Disable feature via sysfs knob */
 
-	fd = open(WORKLOAD_ENABLE_ATTRIBUTE, O_RDWR);
+	fd = open(wlt_enable_attr, O_RDWR);
 	if (fd < 0) {
 		perror("Unable to open workload type feature enable file");
 		exit(1);
@@ -46,6 +50,26 @@ void workload_hint_exit(int signum)
 	close(fd);
 }
 
+static void update_delay(char *delay_str)
+{
+	int fd;
+
+	printf("Setting notification delay in ms to %s\n", delay_str);
+
+	fd = open(WORKLOAD_NOTIFICATION_DELAY_ATTRIBUTE, O_RDWR);
+	if (fd < 0) {
+		perror("Unable to open workload notification delay");
+		exit(1);
+	}
+
+	if (write(fd, delay_str, strlen(delay_str)) < 0) {
+		perror("Can't set delay");
+		exit(1);
+	}
+
+	close(fd);
+}
+
 int main(int argc, char **argv)
 {
 	struct pollfd ufd;
@@ -54,32 +78,26 @@ int main(int argc, char **argv)
 	char delay_str[64];
 	int delay = 0;
 
-	printf("Usage: workload_hint_test [notification delay in milli seconds]\n");
+	printf("Usage: workload_hint_test [notification delay in milli seconds][slow]\n");
 
 	if (argc > 1) {
-		ret = sscanf(argv[1], "%d", &delay);
-		if (ret < 0) {
-			printf("Invalid delay\n");
-			exit(1);
-		}
+		int i;
 
-		printf("Setting notification delay to %d ms\n", delay);
-		if (delay < 0)
-			exit(1);
+		for (i = 1; i < argc; ++i) {
+			if (!strcmp(argv[i], "slow")) {
+				wlt_slow = 1;
+				continue;
+			}
 
-		sprintf(delay_str, "%s\n", argv[1]);
-		fd = open(WORKLOAD_NOTIFICATION_DELAY_ATTRIBUTE, O_RDWR);
-		if (fd < 0) {
-			perror("Unable to open workload notification delay");
-			exit(1);
-		}
+			ret = sscanf(argv[1], "%d", &delay);
+			if (ret < 0) {
+				printf("Invalid delay\n");
+				exit(1);
+			}
 
-		if (write(fd, delay_str, strlen(delay_str)) < 0) {
-			perror("Can't set delay");
-			exit(1);
+			sprintf(delay_str, "%s\n", argv[1]);
+			update_delay(delay_str);
 		}
-
-		close(fd);
 	}
 
 	if (signal(SIGINT, workload_hint_exit) == SIG_IGN)
@@ -89,8 +107,13 @@ int main(int argc, char **argv)
 	if (signal(SIGTERM, workload_hint_exit) == SIG_IGN)
 		signal(SIGTERM, SIG_IGN);
 
+	if (wlt_slow)
+		wlt_enable_attr = WORKLOAD_SLOW_ENABLE_ATTRIBUTE;
+	else
+		wlt_enable_attr = WORKLOAD_ENABLE_ATTRIBUTE;
+
 	/* Enable feature via sysfs knob */
-	fd = open(WORKLOAD_ENABLE_ATTRIBUTE, O_RDWR);
+	fd = open(wlt_enable_attr, O_RDWR);
 	if (fd < 0) {
 		perror("Unable to open workload type feature enable file");
 		exit(1);
@@ -145,6 +168,13 @@ int main(int argc, char **argv)
 			if (ret < 0)
 				break;
 
+			if (wlt_slow) {
+				if (index & 0x10)
+					printf("workload type slow:%s\n", "power");
+				else
+					printf("workload type slow:%s\n", "performance");
+			}
+
 			index &= 0x0f;
 			if (index > WORKLOAD_TYPE_MAX_INDEX)
 				printf("Invalid workload type index\n");
-- 
cgit v1.2.3


From 0b28194c4c8e3a6c2552bfa6451f71b1879dd61f Mon Sep 17 00:00:00 2001
From: "Maciej S. Szmigiero" <maciej.szmigiero@oracle.com>
Date: Fri, 5 Dec 2025 14:49:37 -0800
Subject: KVM: selftests: Test TPR / CR8 sync and interrupt masking

Add a few extra TPR / CR8 tests to x86's xapic_state_test to see if:
  * TPR is 0 on reset,
  * TPR, PPR and CR8 are equal inside the guest,
  * TPR and CR8 read equal by the host after a VMExit
  * TPR borderline values set by the host correctly mask interrupts in the
    guest.

These hopefully will catch the most obvious cases of improper TPR sync or
interrupt masking.

Do these tests both in x2APIC and xAPIC modes.
The x2APIC mode uses SELF_IPI register to trigger interrupts to give it a
bit of exercise too.

Signed-off-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com>
Acked-by: Naveen N Rao (AMD) <naveen@kernel.org>
[sean: put code in separate test]
Link: https://patch.msgid.link/20251205224937.428122-1-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 tools/testing/selftests/kvm/Makefile.kvm         |   1 +
 tools/testing/selftests/kvm/include/x86/apic.h   |   3 +
 tools/testing/selftests/kvm/x86/xapic_tpr_test.c | 276 +++++++++++++++++++++++
 3 files changed, 280 insertions(+)
 create mode 100644 tools/testing/selftests/kvm/x86/xapic_tpr_test.c

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm
index ba5c2b643efa..3789890421bd 100644
--- a/tools/testing/selftests/kvm/Makefile.kvm
+++ b/tools/testing/selftests/kvm/Makefile.kvm
@@ -124,6 +124,7 @@ TEST_GEN_PROGS_x86 += x86/vmx_set_nested_state_test
 TEST_GEN_PROGS_x86 += x86/apic_bus_clock_test
 TEST_GEN_PROGS_x86 += x86/xapic_ipi_test
 TEST_GEN_PROGS_x86 += x86/xapic_state_test
+TEST_GEN_PROGS_x86 += x86/xapic_tpr_test
 TEST_GEN_PROGS_x86 += x86/xcr0_cpuid_test
 TEST_GEN_PROGS_x86 += x86/xss_msr_test
 TEST_GEN_PROGS_x86 += x86/debug_regs
diff --git a/tools/testing/selftests/kvm/include/x86/apic.h b/tools/testing/selftests/kvm/include/x86/apic.h
index 80fe9f69b38d..e9b9aebaac97 100644
--- a/tools/testing/selftests/kvm/include/x86/apic.h
+++ b/tools/testing/selftests/kvm/include/x86/apic.h
@@ -28,6 +28,8 @@
 #define		GET_APIC_ID_FIELD(x)	(((x) >> 24) & 0xFF)
 #define	APIC_TASKPRI	0x80
 #define	APIC_PROCPRI	0xA0
+#define	GET_APIC_PRI(x) (((x) & GENMASK(7, 4)) >> 4)
+#define	SET_APIC_PRI(x, y) (((x) & ~GENMASK(7, 4)) | (y << 4))
 #define	APIC_EOI	0xB0
 #define	APIC_SPIV	0xF0
 #define		APIC_SPIV_FOCUS_DISABLED	(1 << 9)
@@ -67,6 +69,7 @@
 #define	APIC_TMICT	0x380
 #define	APIC_TMCCT	0x390
 #define	APIC_TDCR	0x3E0
+#define	APIC_SELF_IPI	0x3F0
 
 void apic_disable(void);
 void xapic_enable(void);
diff --git a/tools/testing/selftests/kvm/x86/xapic_tpr_test.c b/tools/testing/selftests/kvm/x86/xapic_tpr_test.c
new file mode 100644
index 000000000000..3862134d9d40
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86/xapic_tpr_test.c
@@ -0,0 +1,276 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <fcntl.h>
+#include <stdatomic.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <unistd.h>
+
+#include "apic.h"
+#include "kvm_util.h"
+#include "processor.h"
+#include "test_util.h"
+
+static bool is_x2apic;
+
+#define IRQ_VECTOR 0x20
+
+/* See also the comment at similar assertion in memslot_perf_test.c */
+static_assert(ATOMIC_INT_LOCK_FREE == 2, "atomic int is not lockless");
+
+static atomic_uint tpr_guest_irq_sync_val;
+
+static void tpr_guest_irq_sync_flag_reset(void)
+{
+	atomic_store_explicit(&tpr_guest_irq_sync_val, 0,
+			      memory_order_release);
+}
+
+static unsigned int tpr_guest_irq_sync_val_get(void)
+{
+	return atomic_load_explicit(&tpr_guest_irq_sync_val,
+				    memory_order_acquire);
+}
+
+static void tpr_guest_irq_sync_val_inc(void)
+{
+	atomic_fetch_add_explicit(&tpr_guest_irq_sync_val, 1,
+				  memory_order_acq_rel);
+}
+
+static void tpr_guest_irq_handler_xapic(struct ex_regs *regs)
+{
+	tpr_guest_irq_sync_val_inc();
+
+	xapic_write_reg(APIC_EOI, 0);
+}
+
+static void tpr_guest_irq_handler_x2apic(struct ex_regs *regs)
+{
+	tpr_guest_irq_sync_val_inc();
+
+	x2apic_write_reg(APIC_EOI, 0);
+}
+
+static void tpr_guest_irq_queue(void)
+{
+	if (is_x2apic) {
+		x2apic_write_reg(APIC_SELF_IPI, IRQ_VECTOR);
+	} else {
+		uint32_t icr, icr2;
+
+		icr = APIC_DEST_SELF | APIC_DEST_PHYSICAL | APIC_DM_FIXED |
+			IRQ_VECTOR;
+		icr2 = 0;
+
+		xapic_write_reg(APIC_ICR2, icr2);
+		xapic_write_reg(APIC_ICR, icr);
+	}
+}
+
+static uint8_t tpr_guest_tpr_get(void)
+{
+	uint32_t taskpri;
+
+	if (is_x2apic)
+		taskpri = x2apic_read_reg(APIC_TASKPRI);
+	else
+		taskpri = xapic_read_reg(APIC_TASKPRI);
+
+	return GET_APIC_PRI(taskpri);
+}
+
+static uint8_t tpr_guest_ppr_get(void)
+{
+	uint32_t procpri;
+
+	if (is_x2apic)
+		procpri = x2apic_read_reg(APIC_PROCPRI);
+	else
+		procpri = xapic_read_reg(APIC_PROCPRI);
+
+	return GET_APIC_PRI(procpri);
+}
+
+static uint8_t tpr_guest_cr8_get(void)
+{
+	uint64_t cr8;
+
+	asm volatile ("mov %%cr8, %[cr8]\n\t" : [cr8] "=r"(cr8));
+
+	return cr8 & GENMASK(3, 0);
+}
+
+static void tpr_guest_check_tpr_ppr_cr8_equal(void)
+{
+	uint8_t tpr;
+
+	tpr = tpr_guest_tpr_get();
+
+	GUEST_ASSERT_EQ(tpr_guest_ppr_get(), tpr);
+	GUEST_ASSERT_EQ(tpr_guest_cr8_get(), tpr);
+}
+
+static void tpr_guest_code(void)
+{
+	cli();
+
+	if (is_x2apic)
+		x2apic_enable();
+	else
+		xapic_enable();
+
+	GUEST_ASSERT_EQ(tpr_guest_tpr_get(), 0);
+	tpr_guest_check_tpr_ppr_cr8_equal();
+
+	tpr_guest_irq_queue();
+
+	/* TPR = 0 but IRQ masked by IF=0, should not fire */
+	udelay(1000);
+	GUEST_ASSERT_EQ(tpr_guest_irq_sync_val_get(), 0);
+
+	sti();
+
+	/* IF=1 now, IRQ should fire */
+	while (tpr_guest_irq_sync_val_get() == 0)
+		cpu_relax();
+	GUEST_ASSERT_EQ(tpr_guest_irq_sync_val_get(), 1);
+
+	GUEST_SYNC(true);
+	tpr_guest_check_tpr_ppr_cr8_equal();
+
+	tpr_guest_irq_queue();
+
+	/* IRQ masked by barely high enough TPR now, should not fire */
+	udelay(1000);
+	GUEST_ASSERT_EQ(tpr_guest_irq_sync_val_get(), 1);
+
+	GUEST_SYNC(false);
+	tpr_guest_check_tpr_ppr_cr8_equal();
+
+	/* TPR barely low enough now to unmask IRQ, should fire */
+	while (tpr_guest_irq_sync_val_get() == 1)
+		cpu_relax();
+	GUEST_ASSERT_EQ(tpr_guest_irq_sync_val_get(), 2);
+
+	GUEST_DONE();
+}
+
+static uint8_t lapic_tpr_get(struct kvm_lapic_state *xapic)
+{
+	return GET_APIC_PRI(*((u32 *)&xapic->regs[APIC_TASKPRI]));
+}
+
+static void lapic_tpr_set(struct kvm_lapic_state *xapic, uint8_t val)
+{
+	u32 *taskpri = (u32 *)&xapic->regs[APIC_TASKPRI];
+
+	*taskpri = SET_APIC_PRI(*taskpri, val);
+}
+
+static uint8_t sregs_tpr(struct kvm_sregs *sregs)
+{
+	return sregs->cr8 & GENMASK(3, 0);
+}
+
+static void test_tpr_check_tpr_zero(struct kvm_vcpu *vcpu)
+{
+	struct kvm_lapic_state xapic;
+
+	vcpu_ioctl(vcpu, KVM_GET_LAPIC, &xapic);
+
+	TEST_ASSERT_EQ(lapic_tpr_get(&xapic), 0);
+}
+
+static void test_tpr_check_tpr_cr8_equal(struct kvm_vcpu *vcpu)
+{
+	struct kvm_sregs sregs;
+	struct kvm_lapic_state xapic;
+
+	vcpu_sregs_get(vcpu, &sregs);
+	vcpu_ioctl(vcpu, KVM_GET_LAPIC, &xapic);
+
+	TEST_ASSERT_EQ(sregs_tpr(&sregs), lapic_tpr_get(&xapic));
+}
+
+static void test_tpr_set_tpr_for_irq(struct kvm_vcpu *vcpu, bool mask)
+{
+	struct kvm_lapic_state xapic;
+	uint8_t tpr;
+
+	static_assert(IRQ_VECTOR >= 16, "invalid IRQ vector number");
+	tpr = IRQ_VECTOR / 16;
+	if (!mask)
+		tpr--;
+
+	vcpu_ioctl(vcpu, KVM_GET_LAPIC, &xapic);
+	lapic_tpr_set(&xapic, tpr);
+	vcpu_ioctl(vcpu, KVM_SET_LAPIC, &xapic);
+}
+
+static void test_tpr(bool __is_x2apic)
+{
+	struct kvm_vcpu *vcpu;
+	struct kvm_vm *vm;
+	bool done = false;
+
+	is_x2apic = __is_x2apic;
+
+	vm = vm_create_with_one_vcpu(&vcpu, tpr_guest_code);
+	if (is_x2apic) {
+		vm_install_exception_handler(vm, IRQ_VECTOR,
+					     tpr_guest_irq_handler_x2apic);
+	} else {
+		vm_install_exception_handler(vm, IRQ_VECTOR,
+					     tpr_guest_irq_handler_xapic);
+		vcpu_clear_cpuid_feature(vcpu, X86_FEATURE_X2APIC);
+		virt_pg_map(vm, APIC_DEFAULT_GPA, APIC_DEFAULT_GPA);
+	}
+
+	sync_global_to_guest(vcpu->vm, is_x2apic);
+
+	/* According to the SDM/APM the TPR value at reset is 0 */
+	test_tpr_check_tpr_zero(vcpu);
+	test_tpr_check_tpr_cr8_equal(vcpu);
+
+	tpr_guest_irq_sync_flag_reset();
+	sync_global_to_guest(vcpu->vm, tpr_guest_irq_sync_val);
+
+	while (!done) {
+		struct ucall uc;
+
+		alarm(2);
+		vcpu_run(vcpu);
+		alarm(0);
+
+		switch (get_ucall(vcpu, &uc)) {
+		case UCALL_ABORT:
+			REPORT_GUEST_ASSERT(uc);
+			break;
+		case UCALL_DONE:
+			test_tpr_check_tpr_cr8_equal(vcpu);
+			done = true;
+			break;
+		case UCALL_SYNC:
+			test_tpr_check_tpr_cr8_equal(vcpu);
+			test_tpr_set_tpr_for_irq(vcpu, uc.args[1]);
+			break;
+		default:
+			TEST_FAIL("Unknown ucall result 0x%lx", uc.cmd);
+			break;
+		}
+	}
+	kvm_vm_free(vm);
+}
+
+int main(int argc, char *argv[])
+{
+	/*
+	 * Use separate VMs for the xAPIC and x2APIC tests so that x2APIC can
+	 * be fully hidden from the guest.  KVM disallows changing CPUID after
+	 * KVM_RUN and AVIC is disabled if _any_ vCPU is allowed to use x2APIC.
+	 */
+	test_tpr(false);
+	test_tpr(true);
+}
-- 
cgit v1.2.3


From 7fe9f5366bd5d7ee3cfd9f66868a4410d6e4792d Mon Sep 17 00:00:00 2001
From: MJ Pooladkhay <mj@pooladkhay.com>
Date: Mon, 22 Dec 2025 17:42:07 +0000
Subject: KVM: selftests: Fix sign extension bug in get_desc64_base()

The function get_desc64_base() performs a series of bitwise left shifts on
fields of various sizes. More specifically, when performing '<< 24' on
'desc->base2' (which is a u8), 'base2' is promoted to a signed integer
before shifting.

In a scenario where base2 >= 0x80, the shift places a 1 into bit 31,
causing the 32-bit intermediate value to become negative. When this
result is cast to uint64_t or ORed into the return value, sign extension
occurs, corrupting the upper 32 bits of the address (base3).

Example:
Given:
  base0 = 0x5000
  base1 = 0xd6
  base2 = 0xf8
  base3 = 0xfffffe7c

Expected return: 0xfffffe7cf8d65000
Actual return:   0xfffffffff8d65000

Fix this by explicitly casting the fields to 'uint64_t' before shifting
to prevent sign extension.

Signed-off-by: MJ Pooladkhay <mj@pooladkhay.com>
Link: https://patch.msgid.link/20251222174207.107331-1-mj@pooladkhay.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 tools/testing/selftests/kvm/include/x86/processor.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/include/x86/processor.h b/tools/testing/selftests/kvm/include/x86/processor.h
index 57d62a425109..26a91bb73c93 100644
--- a/tools/testing/selftests/kvm/include/x86/processor.h
+++ b/tools/testing/selftests/kvm/include/x86/processor.h
@@ -436,8 +436,10 @@ struct kvm_x86_state {
 
 static inline uint64_t get_desc64_base(const struct desc64 *desc)
 {
-	return ((uint64_t)desc->base3 << 32) |
-		(desc->base0 | ((desc->base1) << 16) | ((desc->base2) << 24));
+	return (uint64_t)desc->base3 << 32 |
+	       (uint64_t)desc->base2 << 24 |
+	       (uint64_t)desc->base1 << 16 |
+	       (uint64_t)desc->base0;
 }
 
 static inline uint64_t rdtsc(void)
-- 
cgit v1.2.3


From 69e81ed5e6a59c12c0c6756c3f0524e2ddb023f4 Mon Sep 17 00:00:00 2001
From: Yosry Ahmed <yosry.ahmed@linux.dev>
Date: Tue, 30 Dec 2025 15:01:30 -0800
Subject: KVM: selftests: Make __vm_get_page_table_entry() static

The function is only used in processor.c, drop the declaration in
processor.h and make it static.

No functional change intended.

Signed-off-by: Yosry Ahmed <yosry.ahmed@linux.dev>
Link: https://patch.msgid.link/20251230230150.4150236-2-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 tools/testing/selftests/kvm/include/x86/processor.h | 2 --
 tools/testing/selftests/kvm/lib/x86/processor.c     | 4 ++--
 2 files changed, 2 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/include/x86/processor.h b/tools/testing/selftests/kvm/include/x86/processor.h
index 26a91bb73c93..1cb5b4c46b99 100644
--- a/tools/testing/selftests/kvm/include/x86/processor.h
+++ b/tools/testing/selftests/kvm/include/x86/processor.h
@@ -1369,8 +1369,6 @@ static inline bool kvm_is_ignore_msrs(void)
 	return get_kvm_param_bool("ignore_msrs");
 }
 
-uint64_t *__vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr,
-				    int *level);
 uint64_t *vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr);
 
 uint64_t kvm_hypercall(uint64_t nr, uint64_t a0, uint64_t a1, uint64_t a2,
diff --git a/tools/testing/selftests/kvm/lib/x86/processor.c b/tools/testing/selftests/kvm/lib/x86/processor.c
index 36104d27f3d9..c14bf2b5f28f 100644
--- a/tools/testing/selftests/kvm/lib/x86/processor.c
+++ b/tools/testing/selftests/kvm/lib/x86/processor.c
@@ -306,8 +306,8 @@ static bool vm_is_target_pte(uint64_t *pte, int *level, int current_level)
 	return *level == current_level;
 }
 
-uint64_t *__vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr,
-				    int *level)
+static uint64_t *__vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr,
+					   int *level)
 {
 	int va_width = 12 + (vm->pgtable_levels) * 9;
 	uint64_t *pte = &vm->pgd;
-- 
cgit v1.2.3


From 97dfbdfea405a0820ccfcf00afdda4c0f47c3df8 Mon Sep 17 00:00:00 2001
From: Yosry Ahmed <yosry.ahmed@linux.dev>
Date: Tue, 30 Dec 2025 15:01:31 -0800
Subject: KVM: selftests: Stop passing a memslot to nested_map_memslot()

On x86, KVM selftests use memslot 0 for all the default regions used by
the test infrastructure. This is an implementation detail.
nested_map_memslot() is currently used to map the default regions by
explicitly passing slot 0, which leaks the library implementation into
the caller.

Rename the function to a very verbose
nested_identity_map_default_memslots() to reflect what it actually does.
Add an assertion that only memslot 0 is being used so that the
implementation does not change from under us.

No functional change intended.

Signed-off-by: Yosry Ahmed <yosry.ahmed@linux.dev>
Link: https://patch.msgid.link/20251230230150.4150236-3-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 tools/testing/selftests/kvm/include/x86/vmx.h        |  4 ++--
 tools/testing/selftests/kvm/lib/x86/vmx.c            | 12 ++++++++----
 tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c |  2 +-
 3 files changed, 11 insertions(+), 7 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/include/x86/vmx.h b/tools/testing/selftests/kvm/include/x86/vmx.h
index 96e2b4c630a9..91916b8aa94b 100644
--- a/tools/testing/selftests/kvm/include/x86/vmx.h
+++ b/tools/testing/selftests/kvm/include/x86/vmx.h
@@ -563,8 +563,8 @@ void nested_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm,
 		   uint64_t nested_paddr, uint64_t paddr);
 void nested_map(struct vmx_pages *vmx, struct kvm_vm *vm,
 		 uint64_t nested_paddr, uint64_t paddr, uint64_t size);
-void nested_map_memslot(struct vmx_pages *vmx, struct kvm_vm *vm,
-			uint32_t memslot);
+void nested_identity_map_default_memslots(struct vmx_pages *vmx,
+					  struct kvm_vm *vm);
 void nested_identity_map_1g(struct vmx_pages *vmx, struct kvm_vm *vm,
 			    uint64_t addr, uint64_t size);
 bool kvm_cpu_has_ept(void);
diff --git a/tools/testing/selftests/kvm/lib/x86/vmx.c b/tools/testing/selftests/kvm/lib/x86/vmx.c
index 29b082a58daa..eec33ec63811 100644
--- a/tools/testing/selftests/kvm/lib/x86/vmx.c
+++ b/tools/testing/selftests/kvm/lib/x86/vmx.c
@@ -494,12 +494,16 @@ void nested_map(struct vmx_pages *vmx, struct kvm_vm *vm,
 /* Prepare an identity extended page table that maps all the
  * physical pages in VM.
  */
-void nested_map_memslot(struct vmx_pages *vmx, struct kvm_vm *vm,
-			uint32_t memslot)
+void nested_identity_map_default_memslots(struct vmx_pages *vmx,
+					  struct kvm_vm *vm)
 {
+	uint32_t s, memslot = 0;
 	sparsebit_idx_t i, last;
-	struct userspace_mem_region *region =
-		memslot2region(vm, memslot);
+	struct userspace_mem_region *region = memslot2region(vm, memslot);
+
+	/* Only memslot 0 is mapped here, ensure it's the only one being used */
+	for (s = 0; s < NR_MEM_REGIONS; s++)
+		TEST_ASSERT_EQ(vm->memslots[s], 0);
 
 	i = (region->region.guest_phys_addr >> vm->page_shift) - 1;
 	last = i + (region->region.memory_size >> vm->page_shift);
diff --git a/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c b/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c
index 98cb6bdab3e6..aab7333aaef0 100644
--- a/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c
+++ b/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c
@@ -121,7 +121,7 @@ static void test_vmx_dirty_log(bool enable_ept)
 	 */
 	if (enable_ept) {
 		prepare_eptp(vmx, vm);
-		nested_map_memslot(vmx, vm, 0);
+		nested_identity_map_default_memslots(vmx, vm);
 		nested_map(vmx, vm, NESTED_TEST_MEM1, GUEST_TEST_MEM, PAGE_SIZE);
 		nested_map(vmx, vm, NESTED_TEST_MEM2, GUEST_TEST_MEM, PAGE_SIZE);
 	}
-- 
cgit v1.2.3


From 60de423781ad9967bfdd3a2f02ba0a31787b0d2c Mon Sep 17 00:00:00 2001
From: Yosry Ahmed <yosry.ahmed@linux.dev>
Date: Tue, 30 Dec 2025 15:01:32 -0800
Subject: KVM: selftests: Rename nested TDP mapping functions

Rename the functions from nested_* to tdp_* to make their purpose
clearer.

No functional change intended.

Suggested-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Yosry Ahmed <yosry.ahmed@linux.dev>
Link: https://patch.msgid.link/20251230230150.4150236-4-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 tools/testing/selftests/kvm/include/x86/vmx.h      | 16 +++----
 tools/testing/selftests/kvm/lib/x86/memstress.c    |  4 +-
 tools/testing/selftests/kvm/lib/x86/vmx.c          | 50 +++++++++++-----------
 .../testing/selftests/kvm/x86/vmx_dirty_log_test.c |  6 +--
 4 files changed, 37 insertions(+), 39 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/include/x86/vmx.h b/tools/testing/selftests/kvm/include/x86/vmx.h
index 91916b8aa94b..04b8231d032a 100644
--- a/tools/testing/selftests/kvm/include/x86/vmx.h
+++ b/tools/testing/selftests/kvm/include/x86/vmx.h
@@ -559,14 +559,14 @@ bool load_vmcs(struct vmx_pages *vmx);
 
 bool ept_1g_pages_supported(void);
 
-void nested_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm,
-		   uint64_t nested_paddr, uint64_t paddr);
-void nested_map(struct vmx_pages *vmx, struct kvm_vm *vm,
-		 uint64_t nested_paddr, uint64_t paddr, uint64_t size);
-void nested_identity_map_default_memslots(struct vmx_pages *vmx,
-					  struct kvm_vm *vm);
-void nested_identity_map_1g(struct vmx_pages *vmx, struct kvm_vm *vm,
-			    uint64_t addr, uint64_t size);
+void tdp_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, uint64_t nested_paddr,
+		uint64_t paddr);
+void tdp_map(struct vmx_pages *vmx, struct kvm_vm *vm, uint64_t nested_paddr,
+	     uint64_t paddr, uint64_t size);
+void tdp_identity_map_default_memslots(struct vmx_pages *vmx,
+				       struct kvm_vm *vm);
+void tdp_identity_map_1g(struct vmx_pages *vmx, struct kvm_vm *vm,
+			 uint64_t addr, uint64_t size);
 bool kvm_cpu_has_ept(void);
 void prepare_eptp(struct vmx_pages *vmx, struct kvm_vm *vm);
 void prepare_virtualize_apic_accesses(struct vmx_pages *vmx, struct kvm_vm *vm);
diff --git a/tools/testing/selftests/kvm/lib/x86/memstress.c b/tools/testing/selftests/kvm/lib/x86/memstress.c
index 0b1f288ad556..1928b00bde51 100644
--- a/tools/testing/selftests/kvm/lib/x86/memstress.c
+++ b/tools/testing/selftests/kvm/lib/x86/memstress.c
@@ -70,11 +70,11 @@ void memstress_setup_ept(struct vmx_pages *vmx, struct kvm_vm *vm)
 	 * KVM can shadow the EPT12 with the maximum huge page size supported
 	 * by the backing source.
 	 */
-	nested_identity_map_1g(vmx, vm, 0, 0x100000000ULL);
+	tdp_identity_map_1g(vmx, vm, 0, 0x100000000ULL);
 
 	start = align_down(memstress_args.gpa, PG_SIZE_1G);
 	end = align_up(memstress_args.gpa + memstress_args.size, PG_SIZE_1G);
-	nested_identity_map_1g(vmx, vm, start, end - start);
+	tdp_identity_map_1g(vmx, vm, start, end - start);
 }
 
 void memstress_setup_nested(struct kvm_vm *vm, int nr_vcpus, struct kvm_vcpu *vcpus[])
diff --git a/tools/testing/selftests/kvm/lib/x86/vmx.c b/tools/testing/selftests/kvm/lib/x86/vmx.c
index eec33ec63811..1954ccdfc353 100644
--- a/tools/testing/selftests/kvm/lib/x86/vmx.c
+++ b/tools/testing/selftests/kvm/lib/x86/vmx.c
@@ -362,12 +362,12 @@ void prepare_vmcs(struct vmx_pages *vmx, void *guest_rip, void *guest_rsp)
 	init_vmcs_guest_state(guest_rip, guest_rsp);
 }
 
-static void nested_create_pte(struct kvm_vm *vm,
-			      struct eptPageTableEntry *pte,
-			      uint64_t nested_paddr,
-			      uint64_t paddr,
-			      int current_level,
-			      int target_level)
+static void tdp_create_pte(struct kvm_vm *vm,
+			   struct eptPageTableEntry *pte,
+			   uint64_t nested_paddr,
+			   uint64_t paddr,
+			   int current_level,
+			   int target_level)
 {
 	if (!pte->readable) {
 		pte->writable = true;
@@ -394,8 +394,8 @@ static void nested_create_pte(struct kvm_vm *vm,
 }
 
 
-void __nested_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm,
-		     uint64_t nested_paddr, uint64_t paddr, int target_level)
+void __tdp_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm,
+		  uint64_t nested_paddr, uint64_t paddr, int target_level)
 {
 	const uint64_t page_size = PG_LEVEL_SIZE(target_level);
 	struct eptPageTableEntry *pt = vmx->eptp_hva, *pte;
@@ -428,7 +428,7 @@ void __nested_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm,
 		index = (nested_paddr >> PG_LEVEL_SHIFT(level)) & 0x1ffu;
 		pte = &pt[index];
 
-		nested_create_pte(vm, pte, nested_paddr, paddr, level, target_level);
+		tdp_create_pte(vm, pte, nested_paddr, paddr, level, target_level);
 
 		if (pte->page_size)
 			break;
@@ -445,10 +445,10 @@ void __nested_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm,
 
 }
 
-void nested_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm,
-		   uint64_t nested_paddr, uint64_t paddr)
+void tdp_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm,
+		uint64_t nested_paddr, uint64_t paddr)
 {
-	__nested_pg_map(vmx, vm, nested_paddr, paddr, PG_LEVEL_4K);
+	__tdp_pg_map(vmx, vm, nested_paddr, paddr, PG_LEVEL_4K);
 }
 
 /*
@@ -468,8 +468,8 @@ void nested_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm,
  * Within the VM given by vm, creates a nested guest translation for the
  * page range starting at nested_paddr to the page range starting at paddr.
  */
-void __nested_map(struct vmx_pages *vmx, struct kvm_vm *vm,
-		  uint64_t nested_paddr, uint64_t paddr, uint64_t size,
+void __tdp_map(struct vmx_pages *vmx, struct kvm_vm *vm,
+	       uint64_t nested_paddr, uint64_t paddr, uint64_t size,
 		  int level)
 {
 	size_t page_size = PG_LEVEL_SIZE(level);
@@ -479,23 +479,23 @@ void __nested_map(struct vmx_pages *vmx, struct kvm_vm *vm,
 	TEST_ASSERT(paddr + size > paddr, "Paddr overflow");
 
 	while (npages--) {
-		__nested_pg_map(vmx, vm, nested_paddr, paddr, level);
+		__tdp_pg_map(vmx, vm, nested_paddr, paddr, level);
 		nested_paddr += page_size;
 		paddr += page_size;
 	}
 }
 
-void nested_map(struct vmx_pages *vmx, struct kvm_vm *vm,
-		uint64_t nested_paddr, uint64_t paddr, uint64_t size)
+void tdp_map(struct vmx_pages *vmx, struct kvm_vm *vm,
+	     uint64_t nested_paddr, uint64_t paddr, uint64_t size)
 {
-	__nested_map(vmx, vm, nested_paddr, paddr, size, PG_LEVEL_4K);
+	__tdp_map(vmx, vm, nested_paddr, paddr, size, PG_LEVEL_4K);
 }
 
 /* Prepare an identity extended page table that maps all the
  * physical pages in VM.
  */
-void nested_identity_map_default_memslots(struct vmx_pages *vmx,
-					  struct kvm_vm *vm)
+void tdp_identity_map_default_memslots(struct vmx_pages *vmx,
+				       struct kvm_vm *vm)
 {
 	uint32_t s, memslot = 0;
 	sparsebit_idx_t i, last;
@@ -512,18 +512,16 @@ void nested_identity_map_default_memslots(struct vmx_pages *vmx,
 		if (i > last)
 			break;
 
-		nested_map(vmx, vm,
-			   (uint64_t)i << vm->page_shift,
-			   (uint64_t)i << vm->page_shift,
-			   1 << vm->page_shift);
+		tdp_map(vmx, vm, (uint64_t)i << vm->page_shift,
+			(uint64_t)i << vm->page_shift, 1 << vm->page_shift);
 	}
 }
 
 /* Identity map a region with 1GiB Pages. */
-void nested_identity_map_1g(struct vmx_pages *vmx, struct kvm_vm *vm,
+void tdp_identity_map_1g(struct vmx_pages *vmx, struct kvm_vm *vm,
 			    uint64_t addr, uint64_t size)
 {
-	__nested_map(vmx, vm, addr, addr, size, PG_LEVEL_1G);
+	__tdp_map(vmx, vm, addr, addr, size, PG_LEVEL_1G);
 }
 
 bool kvm_cpu_has_ept(void)
diff --git a/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c b/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c
index aab7333aaef0..e7d0c08ba29d 100644
--- a/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c
+++ b/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c
@@ -121,9 +121,9 @@ static void test_vmx_dirty_log(bool enable_ept)
 	 */
 	if (enable_ept) {
 		prepare_eptp(vmx, vm);
-		nested_identity_map_default_memslots(vmx, vm);
-		nested_map(vmx, vm, NESTED_TEST_MEM1, GUEST_TEST_MEM, PAGE_SIZE);
-		nested_map(vmx, vm, NESTED_TEST_MEM2, GUEST_TEST_MEM, PAGE_SIZE);
+		tdp_identity_map_default_memslots(vmx, vm);
+		tdp_map(vmx, vm, NESTED_TEST_MEM1, GUEST_TEST_MEM, PAGE_SIZE);
+		tdp_map(vmx, vm, NESTED_TEST_MEM2, GUEST_TEST_MEM, PAGE_SIZE);
 	}
 
 	bmap = bitmap_zalloc(TEST_MEM_PAGES);
-- 
cgit v1.2.3


From b320c03d685704df51cf0774edd799e96c505c74 Mon Sep 17 00:00:00 2001
From: Yosry Ahmed <yosry.ahmed@linux.dev>
Date: Tue, 30 Dec 2025 15:01:33 -0800
Subject: KVM: selftests: Kill eptPageTablePointer

Replace the struct overlay with explicit bitmasks, which is clearer and
less error-prone. See commit f18b4aebe107 ("kvm: selftests: do not use
bitfields larger than 32-bits for PTEs") for an example of why bitfields
are not preferable.

Remove the unused PAGE_SHIFT_4K definition while at it.

No functional change intended.

Signed-off-by: Yosry Ahmed <yosry.ahmed@linux.dev>
Link: https://patch.msgid.link/20251230230150.4150236-5-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 tools/testing/selftests/kvm/lib/x86/vmx.c | 37 ++++++++++++++-----------------
 1 file changed, 17 insertions(+), 20 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/lib/x86/vmx.c b/tools/testing/selftests/kvm/lib/x86/vmx.c
index 1954ccdfc353..85043bb1ec4d 100644
--- a/tools/testing/selftests/kvm/lib/x86/vmx.c
+++ b/tools/testing/selftests/kvm/lib/x86/vmx.c
@@ -10,10 +10,16 @@
 #include "processor.h"
 #include "vmx.h"
 
-#define PAGE_SHIFT_4K  12
-
 #define KVM_EPT_PAGE_TABLE_MIN_PADDR 0x1c0000
 
+#define EPTP_MT_SHIFT		0 /* EPTP memtype bits 2:0 */
+#define EPTP_PWL_SHIFT		3 /* EPTP page walk length bits 5:3 */
+#define EPTP_AD_ENABLED_SHIFT	6 /* EPTP AD enabled bit 6 */
+
+#define EPTP_WB			(X86_MEMTYPE_WB << EPTP_MT_SHIFT)
+#define EPTP_PWL_4		(3ULL << EPTP_PWL_SHIFT) /* PWL is (levels - 1) */
+#define EPTP_AD_ENABLED		(1ULL << EPTP_AD_ENABLED_SHIFT)
+
 bool enable_evmcs;
 
 struct hv_enlightened_vmcs *current_evmcs;
@@ -34,14 +40,6 @@ struct eptPageTableEntry {
 	uint64_t suppress_ve:1;
 };
 
-struct eptPageTablePointer {
-	uint64_t memory_type:3;
-	uint64_t page_walk_length:3;
-	uint64_t ad_enabled:1;
-	uint64_t reserved_11_07:5;
-	uint64_t address:40;
-	uint64_t reserved_63_52:12;
-};
 int vcpu_enable_evmcs(struct kvm_vcpu *vcpu)
 {
 	uint16_t evmcs_ver;
@@ -196,16 +194,15 @@ static inline void init_vmcs_control_fields(struct vmx_pages *vmx)
 	vmwrite(PIN_BASED_VM_EXEC_CONTROL, rdmsr(MSR_IA32_VMX_TRUE_PINBASED_CTLS));
 
 	if (vmx->eptp_gpa) {
-		uint64_t ept_paddr;
-		struct eptPageTablePointer eptp = {
-			.memory_type = X86_MEMTYPE_WB,
-			.page_walk_length = 3, /* + 1 */
-			.ad_enabled = ept_vpid_cap_supported(VMX_EPT_VPID_CAP_AD_BITS),
-			.address = vmx->eptp_gpa >> PAGE_SHIFT_4K,
-		};
-
-		memcpy(&ept_paddr, &eptp, sizeof(ept_paddr));
-		vmwrite(EPT_POINTER, ept_paddr);
+		uint64_t eptp = vmx->eptp_gpa | EPTP_WB | EPTP_PWL_4;
+
+		TEST_ASSERT((vmx->eptp_gpa & ~PHYSICAL_PAGE_MASK) == 0,
+			    "Illegal bits set in vmx->eptp_gpa");
+
+		if (ept_vpid_cap_supported(VMX_EPT_VPID_CAP_AD_BITS))
+			eptp |= EPTP_AD_ENABLED;
+
+		vmwrite(EPT_POINTER, eptp);
 		sec_exec_ctl |= SECONDARY_EXEC_ENABLE_EPT;
 	}
 
-- 
cgit v1.2.3


From 3cd5002807bebf504cbb6645e73d01204324e54a Mon Sep 17 00:00:00 2001
From: Yosry Ahmed <yosry.ahmed@linux.dev>
Date: Tue, 30 Dec 2025 15:01:34 -0800
Subject: KVM: selftests: Stop setting A/D bits when creating EPT PTEs

Stop setting Accessed/Dirty bits when creating EPT entries for L2 so that
the stage-1 and stage-2 (a.k.a. TDP) page table APIs can use common code
without bleeding the EPT hack into the common APIs.

While commit 094444204570 ("selftests: kvm: add test for dirty logging
inside nested guests") is _very_ light on details, the most likely
explanation is that vmx_dirty_log_test was attempting to avoid taking an
EPT Violation on the first _write_ from L2.

  static void l2_guest_code(u64 *a, u64 *b)
  {
	READ_ONCE(*a);
	WRITE_ONCE(*a, 1);   <===
	GUEST_SYNC(true);

	...
  }

When handling read faults in the shadow MMU, KVM opportunistically creates
a writable SPTE if the mapping can be writable *and* the gPTE is dirty (or
doesn't support the Dirty bit), i.e. if KVM doesn't need to intercept
writes in order to emulate Dirty-bit updates.  By setting A/D bits in the
test's EPT entries, the above READ+WRITE will fault only on the read, and
in theory expose the bug fixed by KVM commit 1f4e5fc83a42 ("KVM: x86: fix
nested guest live migration with PML").  If the Dirty bit is NOT set, the
test will get a false pass due; though again, in theory.

However, the test is flawed (and always was, at least in the versions
posted publicly), as KVM (correctly) marks the corresponding L1 GFN as
dirty (in the dirty bitmap) when creating the writable SPTE.  I.e. without
a check on the dirty bitmap after the READ_ONCE(), the check after the
first WRITE_ONCE() will get a false pass due to the dirty bitmap/log having
been updated by the read fault, not by PML.

Furthermore, the subsequent behavior in the test's l2_guest_code()
effectively hides the flawed test behavior, as the straight writes to a
new L2 GPA fault also trigger the KVM bug, and so the test will still
detect the failure due to lack of isolation between the two testcases
(Read=>Write vs. Write=>Write).

	WRITE_ONCE(*b, 1);
	GUEST_SYNC(true);
	WRITE_ONCE(*b, 1);
	GUEST_SYNC(true);
	GUEST_SYNC(false);

Punt on fixing vmx_dirty_log_test for the moment as it will be easier to
properly fix the test once the TDP code uses the common MMU APIs, at which
point it will be trivially easy for the test to retrieve the EPT PTE and
set the Dirty bit as needed.

Signed-off-by: Yosry Ahmed <yosry.ahmed@linux.dev>
[sean: rewrite changelog to explain the situation]
Link: https://patch.msgid.link/20251230230150.4150236-6-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 tools/testing/selftests/kvm/lib/x86/vmx.c | 8 --------
 1 file changed, 8 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/lib/x86/vmx.c b/tools/testing/selftests/kvm/lib/x86/vmx.c
index 85043bb1ec4d..a3e2eae981da 100644
--- a/tools/testing/selftests/kvm/lib/x86/vmx.c
+++ b/tools/testing/selftests/kvm/lib/x86/vmx.c
@@ -432,14 +432,6 @@ void __tdp_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm,
 
 		pt = addr_gpa2hva(vm, pte->address * vm->page_size);
 	}
-
-	/*
-	 * For now mark these as accessed and dirty because the only
-	 * testcase we have needs that.  Can be reconsidered later.
-	 */
-	pte->accessed = true;
-	pte->dirty = true;
-
 }
 
 void tdp_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm,
-- 
cgit v1.2.3


From 9f073ac25b4c4cf3b3ea13b155035108c54148bb Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Tue, 30 Dec 2025 15:01:35 -0800
Subject: KVM: selftests: Add "struct kvm_mmu" to track a given MMU instance

Add a "struct kvm_mmu" to track a given MMU instance, e.g. a VM's stage-1
MMU versus a VM's stage-2 MMU, so that x86 can share MMU functionality for
both stage-1 and stage-2 MMUs, without creating the potential for subtle
bugs, e.g. due to consuming on vm->pgtable_levels when operating a stage-2
MMU.

Encapsulate the existing de facto MMU in "struct kvm_vm", e.g instead of
burying the MMU details in "struct kvm_vm_arch", to avoid more #ifdefs in
____vm_create(), and in the hopes that other architectures can utilize the
formalized MMU structure if/when they too support stage-2 page tables.

No functional change intended.

Reviewed-by: Yosry Ahmed <yosry.ahmed@linux.dev>
Link: https://patch.msgid.link/20251230230150.4150236-7-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 tools/testing/selftests/kvm/include/kvm_util.h     | 11 +++++--
 tools/testing/selftests/kvm/lib/arm64/processor.c  | 38 +++++++++++-----------
 tools/testing/selftests/kvm/lib/kvm_util.c         | 28 ++++++++--------
 .../selftests/kvm/lib/loongarch/processor.c        | 28 ++++++++--------
 tools/testing/selftests/kvm/lib/riscv/processor.c  | 31 +++++++++---------
 tools/testing/selftests/kvm/lib/s390/processor.c   | 16 ++++-----
 tools/testing/selftests/kvm/lib/x86/processor.c    | 28 ++++++++--------
 .../selftests/kvm/x86/vmx_nested_la57_state_test.c |  2 +-
 8 files changed, 94 insertions(+), 88 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h
index 81f4355ff28a..39558c05c0bf 100644
--- a/tools/testing/selftests/kvm/include/kvm_util.h
+++ b/tools/testing/selftests/kvm/include/kvm_util.h
@@ -88,12 +88,17 @@ enum kvm_mem_region_type {
 	NR_MEM_REGIONS,
 };
 
+struct kvm_mmu {
+	bool pgd_created;
+	uint64_t pgd;
+	int pgtable_levels;
+};
+
 struct kvm_vm {
 	int mode;
 	unsigned long type;
 	int kvm_fd;
 	int fd;
-	unsigned int pgtable_levels;
 	unsigned int page_size;
 	unsigned int page_shift;
 	unsigned int pa_bits;
@@ -104,13 +109,13 @@ struct kvm_vm {
 	struct sparsebit *vpages_valid;
 	struct sparsebit *vpages_mapped;
 	bool has_irqchip;
-	bool pgd_created;
 	vm_paddr_t ucall_mmio_addr;
-	vm_paddr_t pgd;
 	vm_vaddr_t handlers;
 	uint32_t dirty_ring_size;
 	uint64_t gpa_tag_mask;
 
+	struct kvm_mmu mmu;
+
 	struct kvm_vm_arch arch;
 
 	struct kvm_binary_stats stats;
diff --git a/tools/testing/selftests/kvm/lib/arm64/processor.c b/tools/testing/selftests/kvm/lib/arm64/processor.c
index d46e4b13b92c..c40f59d48311 100644
--- a/tools/testing/selftests/kvm/lib/arm64/processor.c
+++ b/tools/testing/selftests/kvm/lib/arm64/processor.c
@@ -28,7 +28,7 @@ static uint64_t page_align(struct kvm_vm *vm, uint64_t v)
 
 static uint64_t pgd_index(struct kvm_vm *vm, vm_vaddr_t gva)
 {
-	unsigned int shift = (vm->pgtable_levels - 1) * (vm->page_shift - 3) + vm->page_shift;
+	unsigned int shift = (vm->mmu.pgtable_levels - 1) * (vm->page_shift - 3) + vm->page_shift;
 	uint64_t mask = (1UL << (vm->va_bits - shift)) - 1;
 
 	return (gva >> shift) & mask;
@@ -39,7 +39,7 @@ static uint64_t pud_index(struct kvm_vm *vm, vm_vaddr_t gva)
 	unsigned int shift = 2 * (vm->page_shift - 3) + vm->page_shift;
 	uint64_t mask = (1UL << (vm->page_shift - 3)) - 1;
 
-	TEST_ASSERT(vm->pgtable_levels == 4,
+	TEST_ASSERT(vm->mmu.pgtable_levels == 4,
 		"Mode %d does not have 4 page table levels", vm->mode);
 
 	return (gva >> shift) & mask;
@@ -50,7 +50,7 @@ static uint64_t pmd_index(struct kvm_vm *vm, vm_vaddr_t gva)
 	unsigned int shift = (vm->page_shift - 3) + vm->page_shift;
 	uint64_t mask = (1UL << (vm->page_shift - 3)) - 1;
 
-	TEST_ASSERT(vm->pgtable_levels >= 3,
+	TEST_ASSERT(vm->mmu.pgtable_levels >= 3,
 		"Mode %d does not have >= 3 page table levels", vm->mode);
 
 	return (gva >> shift) & mask;
@@ -104,7 +104,7 @@ static uint64_t pte_addr(struct kvm_vm *vm, uint64_t pte)
 
 static uint64_t ptrs_per_pgd(struct kvm_vm *vm)
 {
-	unsigned int shift = (vm->pgtable_levels - 1) * (vm->page_shift - 3) + vm->page_shift;
+	unsigned int shift = (vm->mmu.pgtable_levels - 1) * (vm->page_shift - 3) + vm->page_shift;
 	return 1 << (vm->va_bits - shift);
 }
 
@@ -117,13 +117,13 @@ void virt_arch_pgd_alloc(struct kvm_vm *vm)
 {
 	size_t nr_pages = page_align(vm, ptrs_per_pgd(vm) * 8) / vm->page_size;
 
-	if (vm->pgd_created)
+	if (vm->mmu.pgd_created)
 		return;
 
-	vm->pgd = vm_phy_pages_alloc(vm, nr_pages,
-				     KVM_GUEST_PAGE_TABLE_MIN_PADDR,
-				     vm->memslots[MEM_REGION_PT]);
-	vm->pgd_created = true;
+	vm->mmu.pgd = vm_phy_pages_alloc(vm, nr_pages,
+					 KVM_GUEST_PAGE_TABLE_MIN_PADDR,
+					 vm->memslots[MEM_REGION_PT]);
+	vm->mmu.pgd_created = true;
 }
 
 static void _virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
@@ -147,12 +147,12 @@ static void _virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
 		"  paddr: 0x%lx vm->max_gfn: 0x%lx vm->page_size: 0x%x",
 		paddr, vm->max_gfn, vm->page_size);
 
-	ptep = addr_gpa2hva(vm, vm->pgd) + pgd_index(vm, vaddr) * 8;
+	ptep = addr_gpa2hva(vm, vm->mmu.pgd) + pgd_index(vm, vaddr) * 8;
 	if (!*ptep)
 		*ptep = addr_pte(vm, vm_alloc_page_table(vm),
 				 PGD_TYPE_TABLE | PTE_VALID);
 
-	switch (vm->pgtable_levels) {
+	switch (vm->mmu.pgtable_levels) {
 	case 4:
 		ptep = addr_gpa2hva(vm, pte_addr(vm, *ptep)) + pud_index(vm, vaddr) * 8;
 		if (!*ptep)
@@ -190,16 +190,16 @@ uint64_t *virt_get_pte_hva_at_level(struct kvm_vm *vm, vm_vaddr_t gva, int level
 {
 	uint64_t *ptep;
 
-	if (!vm->pgd_created)
+	if (!vm->mmu.pgd_created)
 		goto unmapped_gva;
 
-	ptep = addr_gpa2hva(vm, vm->pgd) + pgd_index(vm, gva) * 8;
+	ptep = addr_gpa2hva(vm, vm->mmu.pgd) + pgd_index(vm, gva) * 8;
 	if (!ptep)
 		goto unmapped_gva;
 	if (level == 0)
 		return ptep;
 
-	switch (vm->pgtable_levels) {
+	switch (vm->mmu.pgtable_levels) {
 	case 4:
 		ptep = addr_gpa2hva(vm, pte_addr(vm, *ptep)) + pud_index(vm, gva) * 8;
 		if (!ptep)
@@ -263,13 +263,13 @@ static void pte_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent, uint64_t p
 
 void virt_arch_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent)
 {
-	int level = 4 - (vm->pgtable_levels - 1);
+	int level = 4 - (vm->mmu.pgtable_levels - 1);
 	uint64_t pgd, *ptep;
 
-	if (!vm->pgd_created)
+	if (!vm->mmu.pgd_created)
 		return;
 
-	for (pgd = vm->pgd; pgd < vm->pgd + ptrs_per_pgd(vm) * 8; pgd += 8) {
+	for (pgd = vm->mmu.pgd; pgd < vm->mmu.pgd + ptrs_per_pgd(vm) * 8; pgd += 8) {
 		ptep = addr_gpa2hva(vm, pgd);
 		if (!*ptep)
 			continue;
@@ -350,7 +350,7 @@ void aarch64_vcpu_setup(struct kvm_vcpu *vcpu, struct kvm_vcpu_init *init)
 		TEST_FAIL("Unknown guest mode, mode: 0x%x", vm->mode);
 	}
 
-	ttbr0_el1 = vm->pgd & GENMASK(47, vm->page_shift);
+	ttbr0_el1 = vm->mmu.pgd & GENMASK(47, vm->page_shift);
 
 	/* Configure output size */
 	switch (vm->mode) {
@@ -358,7 +358,7 @@ void aarch64_vcpu_setup(struct kvm_vcpu *vcpu, struct kvm_vcpu_init *init)
 	case VM_MODE_P52V48_16K:
 	case VM_MODE_P52V48_64K:
 		tcr_el1 |= TCR_IPS_52_BITS;
-		ttbr0_el1 |= FIELD_GET(GENMASK(51, 48), vm->pgd) << 2;
+		ttbr0_el1 |= FIELD_GET(GENMASK(51, 48), vm->mmu.pgd) << 2;
 		break;
 	case VM_MODE_P48V48_4K:
 	case VM_MODE_P48V48_16K:
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index 8279b6ced8d2..65752daeed90 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -281,34 +281,34 @@ struct kvm_vm *____vm_create(struct vm_shape shape)
 	/* Setup mode specific traits. */
 	switch (vm->mode) {
 	case VM_MODE_P52V48_4K:
-		vm->pgtable_levels = 4;
+		vm->mmu.pgtable_levels = 4;
 		break;
 	case VM_MODE_P52V48_64K:
-		vm->pgtable_levels = 3;
+		vm->mmu.pgtable_levels = 3;
 		break;
 	case VM_MODE_P48V48_4K:
-		vm->pgtable_levels = 4;
+		vm->mmu.pgtable_levels = 4;
 		break;
 	case VM_MODE_P48V48_64K:
-		vm->pgtable_levels = 3;
+		vm->mmu.pgtable_levels = 3;
 		break;
 	case VM_MODE_P40V48_4K:
 	case VM_MODE_P36V48_4K:
-		vm->pgtable_levels = 4;
+		vm->mmu.pgtable_levels = 4;
 		break;
 	case VM_MODE_P40V48_64K:
 	case VM_MODE_P36V48_64K:
-		vm->pgtable_levels = 3;
+		vm->mmu.pgtable_levels = 3;
 		break;
 	case VM_MODE_P52V48_16K:
 	case VM_MODE_P48V48_16K:
 	case VM_MODE_P40V48_16K:
 	case VM_MODE_P36V48_16K:
-		vm->pgtable_levels = 4;
+		vm->mmu.pgtable_levels = 4;
 		break;
 	case VM_MODE_P47V47_16K:
 	case VM_MODE_P36V47_16K:
-		vm->pgtable_levels = 3;
+		vm->mmu.pgtable_levels = 3;
 		break;
 	case VM_MODE_PXXVYY_4K:
 #ifdef __x86_64__
@@ -321,22 +321,22 @@ struct kvm_vm *____vm_create(struct vm_shape shape)
 			 vm->va_bits);
 
 		if (vm->va_bits == 57) {
-			vm->pgtable_levels = 5;
+			vm->mmu.pgtable_levels = 5;
 		} else {
 			TEST_ASSERT(vm->va_bits == 48,
 				    "Unexpected guest virtual address width: %d",
 				    vm->va_bits);
-			vm->pgtable_levels = 4;
+			vm->mmu.pgtable_levels = 4;
 		}
 #else
 		TEST_FAIL("VM_MODE_PXXVYY_4K not supported on non-x86 platforms");
 #endif
 		break;
 	case VM_MODE_P47V64_4K:
-		vm->pgtable_levels = 5;
+		vm->mmu.pgtable_levels = 5;
 		break;
 	case VM_MODE_P44V64_4K:
-		vm->pgtable_levels = 5;
+		vm->mmu.pgtable_levels = 5;
 		break;
 	default:
 		TEST_FAIL("Unknown guest mode: 0x%x", vm->mode);
@@ -1956,8 +1956,8 @@ void vm_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent)
 	fprintf(stream, "%*sMapped Virtual Pages:\n", indent, "");
 	sparsebit_dump(stream, vm->vpages_mapped, indent + 2);
 	fprintf(stream, "%*spgd_created: %u\n", indent, "",
-		vm->pgd_created);
-	if (vm->pgd_created) {
+		vm->mmu.pgd_created);
+	if (vm->mmu.pgd_created) {
 		fprintf(stream, "%*sVirtual Translation Tables:\n",
 			indent + 2, "");
 		virt_dump(stream, vm, indent + 4);
diff --git a/tools/testing/selftests/kvm/lib/loongarch/processor.c b/tools/testing/selftests/kvm/lib/loongarch/processor.c
index 07c103369ddb..17aa55a2047a 100644
--- a/tools/testing/selftests/kvm/lib/loongarch/processor.c
+++ b/tools/testing/selftests/kvm/lib/loongarch/processor.c
@@ -50,11 +50,11 @@ void virt_arch_pgd_alloc(struct kvm_vm *vm)
 	int i;
 	vm_paddr_t child, table;
 
-	if (vm->pgd_created)
+	if (vm->mmu.pgd_created)
 		return;
 
 	child = table = 0;
-	for (i = 0; i < vm->pgtable_levels; i++) {
+	for (i = 0; i < vm->mmu.pgtable_levels; i++) {
 		invalid_pgtable[i] = child;
 		table = vm_phy_page_alloc(vm, LOONGARCH_PAGE_TABLE_PHYS_MIN,
 				vm->memslots[MEM_REGION_PT]);
@@ -62,8 +62,8 @@ void virt_arch_pgd_alloc(struct kvm_vm *vm)
 		virt_set_pgtable(vm, table, child);
 		child = table;
 	}
-	vm->pgd = table;
-	vm->pgd_created = true;
+	vm->mmu.pgd = table;
+	vm->mmu.pgd_created = true;
 }
 
 static int virt_pte_none(uint64_t *ptep, int level)
@@ -77,11 +77,11 @@ static uint64_t *virt_populate_pte(struct kvm_vm *vm, vm_vaddr_t gva, int alloc)
 	uint64_t *ptep;
 	vm_paddr_t child;
 
-	if (!vm->pgd_created)
+	if (!vm->mmu.pgd_created)
 		goto unmapped_gva;
 
-	child = vm->pgd;
-	level = vm->pgtable_levels - 1;
+	child = vm->mmu.pgd;
+	level = vm->mmu.pgtable_levels - 1;
 	while (level > 0) {
 		ptep = addr_gpa2hva(vm, child) + virt_pte_index(vm, gva, level) * 8;
 		if (virt_pte_none(ptep, level)) {
@@ -161,11 +161,11 @@ void virt_arch_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent)
 {
 	int level;
 
-	if (!vm->pgd_created)
+	if (!vm->mmu.pgd_created)
 		return;
 
-	level = vm->pgtable_levels - 1;
-	pte_dump(stream, vm, indent, vm->pgd, level);
+	level = vm->mmu.pgtable_levels - 1;
+	pte_dump(stream, vm, indent, vm->mmu.pgd, level);
 }
 
 void vcpu_arch_dump(FILE *stream, struct kvm_vcpu *vcpu, uint8_t indent)
@@ -297,7 +297,7 @@ static void loongarch_vcpu_setup(struct kvm_vcpu *vcpu)
 
 	width = vm->page_shift - 3;
 
-	switch (vm->pgtable_levels) {
+	switch (vm->mmu.pgtable_levels) {
 	case 4:
 		/* pud page shift and width */
 		val = (vm->page_shift + width * 2) << 20 | (width << 25);
@@ -309,15 +309,15 @@ static void loongarch_vcpu_setup(struct kvm_vcpu *vcpu)
 		val |= vm->page_shift | width << 5;
 		break;
 	default:
-		TEST_FAIL("Got %u page table levels, expected 3 or 4", vm->pgtable_levels);
+		TEST_FAIL("Got %u page table levels, expected 3 or 4", vm->mmu.pgtable_levels);
 	}
 
 	loongarch_set_csr(vcpu, LOONGARCH_CSR_PWCTL0, val);
 
 	/* PGD page shift and width */
-	val = (vm->page_shift + width * (vm->pgtable_levels - 1)) | width << 6;
+	val = (vm->page_shift + width * (vm->mmu.pgtable_levels - 1)) | width << 6;
 	loongarch_set_csr(vcpu, LOONGARCH_CSR_PWCTL1, val);
-	loongarch_set_csr(vcpu, LOONGARCH_CSR_PGDL, vm->pgd);
+	loongarch_set_csr(vcpu, LOONGARCH_CSR_PGDL, vm->mmu.pgd);
 
 	/*
 	 * Refill exception runs on real mode
diff --git a/tools/testing/selftests/kvm/lib/riscv/processor.c b/tools/testing/selftests/kvm/lib/riscv/processor.c
index 2eac7d4b59e9..e6ec7c224fc3 100644
--- a/tools/testing/selftests/kvm/lib/riscv/processor.c
+++ b/tools/testing/selftests/kvm/lib/riscv/processor.c
@@ -60,7 +60,7 @@ static uint64_t pte_index(struct kvm_vm *vm, vm_vaddr_t gva, int level)
 {
 	TEST_ASSERT(level > -1,
 		"Negative page table level (%d) not possible", level);
-	TEST_ASSERT(level < vm->pgtable_levels,
+	TEST_ASSERT(level < vm->mmu.pgtable_levels,
 		"Invalid page table level (%d)", level);
 
 	return (gva & pte_index_mask[level]) >> pte_index_shift[level];
@@ -70,19 +70,19 @@ void virt_arch_pgd_alloc(struct kvm_vm *vm)
 {
 	size_t nr_pages = page_align(vm, ptrs_per_pte(vm) * 8) / vm->page_size;
 
-	if (vm->pgd_created)
+	if (vm->mmu.pgd_created)
 		return;
 
-	vm->pgd = vm_phy_pages_alloc(vm, nr_pages,
-				     KVM_GUEST_PAGE_TABLE_MIN_PADDR,
-				     vm->memslots[MEM_REGION_PT]);
-	vm->pgd_created = true;
+	vm->mmu.pgd = vm_phy_pages_alloc(vm, nr_pages,
+					 KVM_GUEST_PAGE_TABLE_MIN_PADDR,
+					 vm->memslots[MEM_REGION_PT]);
+	vm->mmu.pgd_created = true;
 }
 
 void virt_arch_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr)
 {
 	uint64_t *ptep, next_ppn;
-	int level = vm->pgtable_levels - 1;
+	int level = vm->mmu.pgtable_levels - 1;
 
 	TEST_ASSERT((vaddr % vm->page_size) == 0,
 		"Virtual address not on page boundary,\n"
@@ -98,7 +98,7 @@ void virt_arch_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr)
 		"  paddr: 0x%lx vm->max_gfn: 0x%lx vm->page_size: 0x%x",
 		paddr, vm->max_gfn, vm->page_size);
 
-	ptep = addr_gpa2hva(vm, vm->pgd) + pte_index(vm, vaddr, level) * 8;
+	ptep = addr_gpa2hva(vm, vm->mmu.pgd) + pte_index(vm, vaddr, level) * 8;
 	if (!*ptep) {
 		next_ppn = vm_alloc_page_table(vm) >> PGTBL_PAGE_SIZE_SHIFT;
 		*ptep = (next_ppn << PGTBL_PTE_ADDR_SHIFT) |
@@ -126,12 +126,12 @@ void virt_arch_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr)
 vm_paddr_t addr_arch_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva)
 {
 	uint64_t *ptep;
-	int level = vm->pgtable_levels - 1;
+	int level = vm->mmu.pgtable_levels - 1;
 
-	if (!vm->pgd_created)
+	if (!vm->mmu.pgd_created)
 		goto unmapped_gva;
 
-	ptep = addr_gpa2hva(vm, vm->pgd) + pte_index(vm, gva, level) * 8;
+	ptep = addr_gpa2hva(vm, vm->mmu.pgd) + pte_index(vm, gva, level) * 8;
 	if (!ptep)
 		goto unmapped_gva;
 	level--;
@@ -176,13 +176,14 @@ static void pte_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent,
 
 void virt_arch_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent)
 {
-	int level = vm->pgtable_levels - 1;
+	struct kvm_mmu *mmu = &vm->mmu;
+	int level = mmu->pgtable_levels - 1;
 	uint64_t pgd, *ptep;
 
-	if (!vm->pgd_created)
+	if (!mmu->pgd_created)
 		return;
 
-	for (pgd = vm->pgd; pgd < vm->pgd + ptrs_per_pte(vm) * 8; pgd += 8) {
+	for (pgd = mmu->pgd; pgd < mmu->pgd + ptrs_per_pte(vm) * 8; pgd += 8) {
 		ptep = addr_gpa2hva(vm, pgd);
 		if (!*ptep)
 			continue;
@@ -211,7 +212,7 @@ void riscv_vcpu_mmu_setup(struct kvm_vcpu *vcpu)
 		TEST_FAIL("Unknown guest mode, mode: 0x%x", vm->mode);
 	}
 
-	satp = (vm->pgd >> PGTBL_PAGE_SIZE_SHIFT) & SATP_PPN;
+	satp = (vm->mmu.pgd >> PGTBL_PAGE_SIZE_SHIFT) & SATP_PPN;
 	satp |= SATP_MODE_48;
 
 	vcpu_set_reg(vcpu, RISCV_GENERAL_CSR_REG(satp), satp);
diff --git a/tools/testing/selftests/kvm/lib/s390/processor.c b/tools/testing/selftests/kvm/lib/s390/processor.c
index 8ceeb17c819a..6a9a660413a7 100644
--- a/tools/testing/selftests/kvm/lib/s390/processor.c
+++ b/tools/testing/selftests/kvm/lib/s390/processor.c
@@ -17,7 +17,7 @@ void virt_arch_pgd_alloc(struct kvm_vm *vm)
 	TEST_ASSERT(vm->page_size == PAGE_SIZE, "Unsupported page size: 0x%x",
 		    vm->page_size);
 
-	if (vm->pgd_created)
+	if (vm->mmu.pgd_created)
 		return;
 
 	paddr = vm_phy_pages_alloc(vm, PAGES_PER_REGION,
@@ -25,8 +25,8 @@ void virt_arch_pgd_alloc(struct kvm_vm *vm)
 				   vm->memslots[MEM_REGION_PT]);
 	memset(addr_gpa2hva(vm, paddr), 0xff, PAGES_PER_REGION * vm->page_size);
 
-	vm->pgd = paddr;
-	vm->pgd_created = true;
+	vm->mmu.pgd = paddr;
+	vm->mmu.pgd_created = true;
 }
 
 /*
@@ -70,7 +70,7 @@ void virt_arch_pg_map(struct kvm_vm *vm, uint64_t gva, uint64_t gpa)
 		gva, vm->max_gfn, vm->page_size);
 
 	/* Walk through region and segment tables */
-	entry = addr_gpa2hva(vm, vm->pgd);
+	entry = addr_gpa2hva(vm, vm->mmu.pgd);
 	for (ri = 1; ri <= 4; ri++) {
 		idx = (gva >> (64 - 11 * ri)) & 0x7ffu;
 		if (entry[idx] & REGION_ENTRY_INVALID)
@@ -94,7 +94,7 @@ vm_paddr_t addr_arch_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva)
 	TEST_ASSERT(vm->page_size == PAGE_SIZE, "Unsupported page size: 0x%x",
 		    vm->page_size);
 
-	entry = addr_gpa2hva(vm, vm->pgd);
+	entry = addr_gpa2hva(vm, vm->mmu.pgd);
 	for (ri = 1; ri <= 4; ri++) {
 		idx = (gva >> (64 - 11 * ri)) & 0x7ffu;
 		TEST_ASSERT(!(entry[idx] & REGION_ENTRY_INVALID),
@@ -149,10 +149,10 @@ static void virt_dump_region(FILE *stream, struct kvm_vm *vm, uint8_t indent,
 
 void virt_arch_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent)
 {
-	if (!vm->pgd_created)
+	if (!vm->mmu.pgd_created)
 		return;
 
-	virt_dump_region(stream, vm, indent, vm->pgd);
+	virt_dump_region(stream, vm, indent, vm->mmu.pgd);
 }
 
 void vcpu_arch_set_entry_point(struct kvm_vcpu *vcpu, void *guest_code)
@@ -184,7 +184,7 @@ struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id)
 
 	vcpu_sregs_get(vcpu, &sregs);
 	sregs.crs[0] |= 0x00040000;		/* Enable floating point regs */
-	sregs.crs[1] = vm->pgd | 0xf;		/* Primary region table */
+	sregs.crs[1] = vm->mmu.pgd | 0xf;	/* Primary region table */
 	vcpu_sregs_set(vcpu, &sregs);
 
 	vcpu->run->psw_mask = 0x0400000180000000ULL;  /* DAT enabled + 64 bit mode */
diff --git a/tools/testing/selftests/kvm/lib/x86/processor.c b/tools/testing/selftests/kvm/lib/x86/processor.c
index c14bf2b5f28f..f027f86d1535 100644
--- a/tools/testing/selftests/kvm/lib/x86/processor.c
+++ b/tools/testing/selftests/kvm/lib/x86/processor.c
@@ -162,9 +162,9 @@ void virt_arch_pgd_alloc(struct kvm_vm *vm)
 		    "Unknown or unsupported guest mode: 0x%x", vm->mode);
 
 	/* If needed, create the top-level page table. */
-	if (!vm->pgd_created) {
-		vm->pgd = vm_alloc_page_table(vm);
-		vm->pgd_created = true;
+	if (!vm->mmu.pgd_created) {
+		vm->mmu.pgd = vm_alloc_page_table(vm);
+		vm->mmu.pgd_created = true;
 	}
 }
 
@@ -175,7 +175,7 @@ static void *virt_get_pte(struct kvm_vm *vm, uint64_t *parent_pte,
 	uint64_t *page_table = addr_gpa2hva(vm, pt_gpa);
 	int index = (vaddr >> PG_LEVEL_SHIFT(level)) & 0x1ffu;
 
-	TEST_ASSERT((*parent_pte & PTE_PRESENT_MASK) || parent_pte == &vm->pgd,
+	TEST_ASSERT((*parent_pte & PTE_PRESENT_MASK) || parent_pte == &vm->mmu.pgd,
 		    "Parent PTE (level %d) not PRESENT for gva: 0x%08lx",
 		    level + 1, vaddr);
 
@@ -218,7 +218,7 @@ static uint64_t *virt_create_upper_pte(struct kvm_vm *vm,
 void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, int level)
 {
 	const uint64_t pg_size = PG_LEVEL_SIZE(level);
-	uint64_t *pte = &vm->pgd;
+	uint64_t *pte = &vm->mmu.pgd;
 	int current_level;
 
 	TEST_ASSERT(vm->mode == VM_MODE_PXXVYY_4K,
@@ -243,7 +243,7 @@ void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, int level)
 	 * Allocate upper level page tables, if not already present.  Return
 	 * early if a hugepage was created.
 	 */
-	for (current_level = vm->pgtable_levels;
+	for (current_level = vm->mmu.pgtable_levels;
 	     current_level > PG_LEVEL_4K;
 	     current_level--) {
 		pte = virt_create_upper_pte(vm, pte, vaddr, paddr,
@@ -309,14 +309,14 @@ static bool vm_is_target_pte(uint64_t *pte, int *level, int current_level)
 static uint64_t *__vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr,
 					   int *level)
 {
-	int va_width = 12 + (vm->pgtable_levels) * 9;
-	uint64_t *pte = &vm->pgd;
+	int va_width = 12 + (vm->mmu.pgtable_levels) * 9;
+	uint64_t *pte = &vm->mmu.pgd;
 	int current_level;
 
 	TEST_ASSERT(!vm->arch.is_pt_protected,
 		    "Walking page tables of protected guests is impossible");
 
-	TEST_ASSERT(*level >= PG_LEVEL_NONE && *level <= vm->pgtable_levels,
+	TEST_ASSERT(*level >= PG_LEVEL_NONE && *level <= vm->mmu.pgtable_levels,
 		    "Invalid PG_LEVEL_* '%d'", *level);
 
 	TEST_ASSERT(vm->mode == VM_MODE_PXXVYY_4K,
@@ -332,7 +332,7 @@ static uint64_t *__vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr,
 		    (((int64_t)vaddr << (64 - va_width) >> (64 - va_width))),
 		    "Canonical check failed.  The virtual address is invalid.");
 
-	for (current_level = vm->pgtable_levels;
+	for (current_level = vm->mmu.pgtable_levels;
 	     current_level > PG_LEVEL_4K;
 	     current_level--) {
 		pte = virt_get_pte(vm, pte, vaddr, current_level);
@@ -357,7 +357,7 @@ void virt_arch_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent)
 	uint64_t *pde, *pde_start;
 	uint64_t *pte, *pte_start;
 
-	if (!vm->pgd_created)
+	if (!vm->mmu.pgd_created)
 		return;
 
 	fprintf(stream, "%*s                                          "
@@ -365,7 +365,7 @@ void virt_arch_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent)
 	fprintf(stream, "%*s      index hvaddr         gpaddr         "
 		"addr         w exec dirty\n",
 		indent, "");
-	pml4e_start = (uint64_t *) addr_gpa2hva(vm, vm->pgd);
+	pml4e_start = (uint64_t *) addr_gpa2hva(vm, vm->mmu.pgd);
 	for (uint16_t n1 = 0; n1 <= 0x1ffu; n1++) {
 		pml4e = &pml4e_start[n1];
 		if (!(*pml4e & PTE_PRESENT_MASK))
@@ -538,7 +538,7 @@ static void vcpu_init_sregs(struct kvm_vm *vm, struct kvm_vcpu *vcpu)
 	sregs.cr4 |= X86_CR4_PAE | X86_CR4_OSFXSR;
 	if (kvm_cpu_has(X86_FEATURE_XSAVE))
 		sregs.cr4 |= X86_CR4_OSXSAVE;
-	if (vm->pgtable_levels == 5)
+	if (vm->mmu.pgtable_levels == 5)
 		sregs.cr4 |= X86_CR4_LA57;
 	sregs.efer |= (EFER_LME | EFER_LMA | EFER_NX);
 
@@ -549,7 +549,7 @@ static void vcpu_init_sregs(struct kvm_vm *vm, struct kvm_vcpu *vcpu)
 	kvm_seg_set_kernel_data_64bit(&sregs.gs);
 	kvm_seg_set_tss_64bit(vm->arch.tss, &sregs.tr);
 
-	sregs.cr3 = vm->pgd;
+	sregs.cr3 = vm->mmu.pgd;
 	vcpu_sregs_set(vcpu, &sregs);
 }
 
diff --git a/tools/testing/selftests/kvm/x86/vmx_nested_la57_state_test.c b/tools/testing/selftests/kvm/x86/vmx_nested_la57_state_test.c
index cf1d2d1f2a8f..915c42001dba 100644
--- a/tools/testing/selftests/kvm/x86/vmx_nested_la57_state_test.c
+++ b/tools/testing/selftests/kvm/x86/vmx_nested_la57_state_test.c
@@ -90,7 +90,7 @@ int main(int argc, char *argv[])
 	 * L1 needs to read its own PML5 table to set up L2. Identity map
 	 * the PML5 table to facilitate this.
 	 */
-	virt_map(vm, vm->pgd, vm->pgd, 1);
+	virt_map(vm, vm->mmu.pgd, vm->mmu.pgd, 1);
 
 	vcpu_alloc_vmx(vm, &vmx_pages_gva);
 	vcpu_args_set(vcpu, 1, vmx_pages_gva);
-- 
cgit v1.2.3


From 11825209f5494098da6cab666d8a767650c1c0cb Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Tue, 30 Dec 2025 15:01:36 -0800
Subject: KVM: selftests: Plumb "struct kvm_mmu" into x86's MMU APIs

In preparation for generalizing the x86 virt mapping APIs to work with
TDP (stage-2) page tables, plumb "struct kvm_mmu" into all of the helper
functions instead of operating on vm->mmu directly.

Opportunistically swap the order of the check in virt_get_pte() to first
assert that the parent is the PGD, and then check that the PTE is present,
as it makes more sense to check if the parent PTE is the PGD/root (i.e.
not a PTE) before checking that the PTE is PRESENT.

No functional change intended.

Suggested-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Yosry Ahmed <yosry.ahmed@linux.dev>
[sean: rebase on common kvm_mmu structure, rewrite changelog]
Link: https://patch.msgid.link/20251230230150.4150236-8-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 .../testing/selftests/kvm/include/x86/processor.h  |  3 +-
 tools/testing/selftests/kvm/lib/x86/processor.c    | 64 +++++++++++++---------
 2 files changed, 39 insertions(+), 28 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/include/x86/processor.h b/tools/testing/selftests/kvm/include/x86/processor.h
index 1cb5b4c46b99..43970a96baed 100644
--- a/tools/testing/selftests/kvm/include/x86/processor.h
+++ b/tools/testing/selftests/kvm/include/x86/processor.h
@@ -1451,7 +1451,8 @@ enum pg_level {
 #define PG_SIZE_2M PG_LEVEL_SIZE(PG_LEVEL_2M)
 #define PG_SIZE_1G PG_LEVEL_SIZE(PG_LEVEL_1G)
 
-void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, int level);
+void __virt_pg_map(struct kvm_vm *vm, struct kvm_mmu *mmu, uint64_t vaddr,
+		   uint64_t paddr,  int level);
 void virt_map_level(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
 		    uint64_t nr_bytes, int level);
 
diff --git a/tools/testing/selftests/kvm/lib/x86/processor.c b/tools/testing/selftests/kvm/lib/x86/processor.c
index f027f86d1535..f25742a804b0 100644
--- a/tools/testing/selftests/kvm/lib/x86/processor.c
+++ b/tools/testing/selftests/kvm/lib/x86/processor.c
@@ -156,26 +156,31 @@ bool kvm_is_tdp_enabled(void)
 		return get_kvm_amd_param_bool("npt");
 }
 
+static void virt_mmu_init(struct kvm_vm *vm, struct kvm_mmu *mmu)
+{
+	/* If needed, create the top-level page table. */
+	if (!mmu->pgd_created) {
+		mmu->pgd = vm_alloc_page_table(vm);
+		mmu->pgd_created = true;
+	}
+}
+
 void virt_arch_pgd_alloc(struct kvm_vm *vm)
 {
 	TEST_ASSERT(vm->mode == VM_MODE_PXXVYY_4K,
 		    "Unknown or unsupported guest mode: 0x%x", vm->mode);
 
-	/* If needed, create the top-level page table. */
-	if (!vm->mmu.pgd_created) {
-		vm->mmu.pgd = vm_alloc_page_table(vm);
-		vm->mmu.pgd_created = true;
-	}
+	virt_mmu_init(vm, &vm->mmu);
 }
 
-static void *virt_get_pte(struct kvm_vm *vm, uint64_t *parent_pte,
-			  uint64_t vaddr, int level)
+static void *virt_get_pte(struct kvm_vm *vm, struct kvm_mmu *mmu,
+			  uint64_t *parent_pte, uint64_t vaddr, int level)
 {
 	uint64_t pt_gpa = PTE_GET_PA(*parent_pte);
 	uint64_t *page_table = addr_gpa2hva(vm, pt_gpa);
 	int index = (vaddr >> PG_LEVEL_SHIFT(level)) & 0x1ffu;
 
-	TEST_ASSERT((*parent_pte & PTE_PRESENT_MASK) || parent_pte == &vm->mmu.pgd,
+	TEST_ASSERT((*parent_pte == mmu->pgd) || (*parent_pte & PTE_PRESENT_MASK),
 		    "Parent PTE (level %d) not PRESENT for gva: 0x%08lx",
 		    level + 1, vaddr);
 
@@ -183,13 +188,14 @@ static void *virt_get_pte(struct kvm_vm *vm, uint64_t *parent_pte,
 }
 
 static uint64_t *virt_create_upper_pte(struct kvm_vm *vm,
+				       struct kvm_mmu *mmu,
 				       uint64_t *parent_pte,
 				       uint64_t vaddr,
 				       uint64_t paddr,
 				       int current_level,
 				       int target_level)
 {
-	uint64_t *pte = virt_get_pte(vm, parent_pte, vaddr, current_level);
+	uint64_t *pte = virt_get_pte(vm, mmu, parent_pte, vaddr, current_level);
 
 	paddr = vm_untag_gpa(vm, paddr);
 
@@ -215,10 +221,11 @@ static uint64_t *virt_create_upper_pte(struct kvm_vm *vm,
 	return pte;
 }
 
-void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, int level)
+void __virt_pg_map(struct kvm_vm *vm, struct kvm_mmu *mmu, uint64_t vaddr,
+		   uint64_t paddr, int level)
 {
 	const uint64_t pg_size = PG_LEVEL_SIZE(level);
-	uint64_t *pte = &vm->mmu.pgd;
+	uint64_t *pte = &mmu->pgd;
 	int current_level;
 
 	TEST_ASSERT(vm->mode == VM_MODE_PXXVYY_4K,
@@ -243,17 +250,17 @@ void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, int level)
 	 * Allocate upper level page tables, if not already present.  Return
 	 * early if a hugepage was created.
 	 */
-	for (current_level = vm->mmu.pgtable_levels;
+	for (current_level = mmu->pgtable_levels;
 	     current_level > PG_LEVEL_4K;
 	     current_level--) {
-		pte = virt_create_upper_pte(vm, pte, vaddr, paddr,
+		pte = virt_create_upper_pte(vm, mmu, pte, vaddr, paddr,
 					    current_level, level);
 		if (*pte & PTE_LARGE_MASK)
 			return;
 	}
 
 	/* Fill in page table entry. */
-	pte = virt_get_pte(vm, pte, vaddr, PG_LEVEL_4K);
+	pte = virt_get_pte(vm, mmu, pte, vaddr, PG_LEVEL_4K);
 	TEST_ASSERT(!(*pte & PTE_PRESENT_MASK),
 		    "PTE already present for 4k page at vaddr: 0x%lx", vaddr);
 	*pte = PTE_PRESENT_MASK | PTE_WRITABLE_MASK | (paddr & PHYSICAL_PAGE_MASK);
@@ -270,7 +277,7 @@ void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, int level)
 
 void virt_arch_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr)
 {
-	__virt_pg_map(vm, vaddr, paddr, PG_LEVEL_4K);
+	__virt_pg_map(vm, &vm->mmu, vaddr, paddr, PG_LEVEL_4K);
 }
 
 void virt_map_level(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
@@ -285,7 +292,7 @@ void virt_map_level(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
 		    nr_bytes, pg_size);
 
 	for (i = 0; i < nr_pages; i++) {
-		__virt_pg_map(vm, vaddr, paddr, level);
+		__virt_pg_map(vm, &vm->mmu, vaddr, paddr, level);
 		sparsebit_set_num(vm->vpages_mapped, vaddr >> vm->page_shift,
 				  nr_bytes / PAGE_SIZE);
 
@@ -294,7 +301,8 @@ void virt_map_level(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
 	}
 }
 
-static bool vm_is_target_pte(uint64_t *pte, int *level, int current_level)
+static bool vm_is_target_pte(struct kvm_mmu *mmu, uint64_t *pte,
+			     int *level, int current_level)
 {
 	if (*pte & PTE_LARGE_MASK) {
 		TEST_ASSERT(*level == PG_LEVEL_NONE ||
@@ -306,17 +314,19 @@ static bool vm_is_target_pte(uint64_t *pte, int *level, int current_level)
 	return *level == current_level;
 }
 
-static uint64_t *__vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr,
+static uint64_t *__vm_get_page_table_entry(struct kvm_vm *vm,
+					   struct kvm_mmu *mmu,
+					   uint64_t vaddr,
 					   int *level)
 {
-	int va_width = 12 + (vm->mmu.pgtable_levels) * 9;
-	uint64_t *pte = &vm->mmu.pgd;
+	int va_width = 12 + (mmu->pgtable_levels) * 9;
+	uint64_t *pte = &mmu->pgd;
 	int current_level;
 
 	TEST_ASSERT(!vm->arch.is_pt_protected,
 		    "Walking page tables of protected guests is impossible");
 
-	TEST_ASSERT(*level >= PG_LEVEL_NONE && *level <= vm->mmu.pgtable_levels,
+	TEST_ASSERT(*level >= PG_LEVEL_NONE && *level <= mmu->pgtable_levels,
 		    "Invalid PG_LEVEL_* '%d'", *level);
 
 	TEST_ASSERT(vm->mode == VM_MODE_PXXVYY_4K,
@@ -332,22 +342,22 @@ static uint64_t *__vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr,
 		    (((int64_t)vaddr << (64 - va_width) >> (64 - va_width))),
 		    "Canonical check failed.  The virtual address is invalid.");
 
-	for (current_level = vm->mmu.pgtable_levels;
+	for (current_level = mmu->pgtable_levels;
 	     current_level > PG_LEVEL_4K;
 	     current_level--) {
-		pte = virt_get_pte(vm, pte, vaddr, current_level);
-		if (vm_is_target_pte(pte, level, current_level))
+		pte = virt_get_pte(vm, mmu, pte, vaddr, current_level);
+		if (vm_is_target_pte(mmu, pte, level, current_level))
 			return pte;
 	}
 
-	return virt_get_pte(vm, pte, vaddr, PG_LEVEL_4K);
+	return virt_get_pte(vm, mmu, pte, vaddr, PG_LEVEL_4K);
 }
 
 uint64_t *vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr)
 {
 	int level = PG_LEVEL_4K;
 
-	return __vm_get_page_table_entry(vm, vaddr, &level);
+	return __vm_get_page_table_entry(vm, &vm->mmu, vaddr, &level);
 }
 
 void virt_arch_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent)
@@ -497,7 +507,7 @@ static void kvm_seg_set_kernel_data_64bit(struct kvm_segment *segp)
 vm_paddr_t addr_arch_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva)
 {
 	int level = PG_LEVEL_NONE;
-	uint64_t *pte = __vm_get_page_table_entry(vm, gva, &level);
+	uint64_t *pte = __vm_get_page_table_entry(vm, &vm->mmu, gva, &level);
 
 	TEST_ASSERT(*pte & PTE_PRESENT_MASK,
 		    "Leaf PTE not PRESENT for gva: 0x%08lx", gva);
-- 
cgit v1.2.3


From 3d0e7595e81017558189d2252ae9453c57ab3436 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Tue, 30 Dec 2025 15:01:37 -0800
Subject: KVM: selftests: Add a "struct kvm_mmu_arch arch" member to kvm_mmu

Add an arch structure+field in "struct kvm_mmu" so that architectures can
track arch-specific information for a given MMU.

No functional change intended.

Reviewed-by: Yosry Ahmed <yosry.ahmed@linux.dev>
Link: https://patch.msgid.link/20251230230150.4150236-9-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 tools/testing/selftests/kvm/include/arm64/kvm_util_arch.h     | 2 ++
 tools/testing/selftests/kvm/include/kvm_util.h                | 2 ++
 tools/testing/selftests/kvm/include/loongarch/kvm_util_arch.h | 1 +
 tools/testing/selftests/kvm/include/riscv/kvm_util_arch.h     | 1 +
 tools/testing/selftests/kvm/include/s390/kvm_util_arch.h      | 1 +
 tools/testing/selftests/kvm/include/x86/kvm_util_arch.h       | 2 ++
 6 files changed, 9 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/include/arm64/kvm_util_arch.h b/tools/testing/selftests/kvm/include/arm64/kvm_util_arch.h
index b973bb2c64a6..4a2033708227 100644
--- a/tools/testing/selftests/kvm/include/arm64/kvm_util_arch.h
+++ b/tools/testing/selftests/kvm/include/arm64/kvm_util_arch.h
@@ -2,6 +2,8 @@
 #ifndef SELFTEST_KVM_UTIL_ARCH_H
 #define SELFTEST_KVM_UTIL_ARCH_H
 
+struct kvm_mmu_arch {};
+
 struct kvm_vm_arch {
 	bool	has_gic;
 	int	gic_fd;
diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h
index 39558c05c0bf..c1497515fa6a 100644
--- a/tools/testing/selftests/kvm/include/kvm_util.h
+++ b/tools/testing/selftests/kvm/include/kvm_util.h
@@ -92,6 +92,8 @@ struct kvm_mmu {
 	bool pgd_created;
 	uint64_t pgd;
 	int pgtable_levels;
+
+	struct kvm_mmu_arch arch;
 };
 
 struct kvm_vm {
diff --git a/tools/testing/selftests/kvm/include/loongarch/kvm_util_arch.h b/tools/testing/selftests/kvm/include/loongarch/kvm_util_arch.h
index e43a57d99b56..d5095900e442 100644
--- a/tools/testing/selftests/kvm/include/loongarch/kvm_util_arch.h
+++ b/tools/testing/selftests/kvm/include/loongarch/kvm_util_arch.h
@@ -2,6 +2,7 @@
 #ifndef SELFTEST_KVM_UTIL_ARCH_H
 #define SELFTEST_KVM_UTIL_ARCH_H
 
+struct kvm_mmu_arch {};
 struct kvm_vm_arch {};
 
 #endif  // SELFTEST_KVM_UTIL_ARCH_H
diff --git a/tools/testing/selftests/kvm/include/riscv/kvm_util_arch.h b/tools/testing/selftests/kvm/include/riscv/kvm_util_arch.h
index e43a57d99b56..d5095900e442 100644
--- a/tools/testing/selftests/kvm/include/riscv/kvm_util_arch.h
+++ b/tools/testing/selftests/kvm/include/riscv/kvm_util_arch.h
@@ -2,6 +2,7 @@
 #ifndef SELFTEST_KVM_UTIL_ARCH_H
 #define SELFTEST_KVM_UTIL_ARCH_H
 
+struct kvm_mmu_arch {};
 struct kvm_vm_arch {};
 
 #endif  // SELFTEST_KVM_UTIL_ARCH_H
diff --git a/tools/testing/selftests/kvm/include/s390/kvm_util_arch.h b/tools/testing/selftests/kvm/include/s390/kvm_util_arch.h
index e43a57d99b56..d5095900e442 100644
--- a/tools/testing/selftests/kvm/include/s390/kvm_util_arch.h
+++ b/tools/testing/selftests/kvm/include/s390/kvm_util_arch.h
@@ -2,6 +2,7 @@
 #ifndef SELFTEST_KVM_UTIL_ARCH_H
 #define SELFTEST_KVM_UTIL_ARCH_H
 
+struct kvm_mmu_arch {};
 struct kvm_vm_arch {};
 
 #endif  // SELFTEST_KVM_UTIL_ARCH_H
diff --git a/tools/testing/selftests/kvm/include/x86/kvm_util_arch.h b/tools/testing/selftests/kvm/include/x86/kvm_util_arch.h
index 972bb1c4ab4c..456e5ca170df 100644
--- a/tools/testing/selftests/kvm/include/x86/kvm_util_arch.h
+++ b/tools/testing/selftests/kvm/include/x86/kvm_util_arch.h
@@ -10,6 +10,8 @@
 
 extern bool is_forced_emulation_enabled;
 
+struct kvm_mmu_arch {};
+
 struct kvm_vm_arch {
 	vm_vaddr_t gdt;
 	vm_vaddr_t tss;
-- 
cgit v1.2.3


From 6dd70757213fc046e5f86901e4baf11ed894da6b Mon Sep 17 00:00:00 2001
From: Yosry Ahmed <yosry.ahmed@linux.dev>
Date: Tue, 30 Dec 2025 15:01:38 -0800
Subject: KVM: selftests: Move PTE bitmasks to kvm_mmu

Move the PTE bitmasks into kvm_mmu to parameterize them for virt mapping
functions. Introduce helpers to read/write different PTE bits given a
kvm_mmu.

Drop the 'global' bit definition as it's currently unused, but leave the
'user' bit as it will be used in coming changes. Opportunisitcally
rename 'large' to 'huge' as it's more consistent with the kernel naming.

Leave PHYSICAL_PAGE_MASK alone, it's fixed in all page table formats and
a lot of other macros depend on it. It's tempting to move all the other
macros to be per-struct instead, but it would be too much noise for
little benefit.

Keep c_bit and s_bit in vm->arch as they used before the MMU is
initialized, through  __vmcreate() -> vm_userspace_mem_region_add() ->
vm_mem_add() -> vm_arch_has_protected_memory().

No functional change intended.

Signed-off-by: Yosry Ahmed <yosry.ahmed@linux.dev>
[sean: rename accessors to is_<adjective>_pte()]
Link: https://patch.msgid.link/20251230230150.4150236-10-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 .../selftests/kvm/include/x86/kvm_util_arch.h      | 16 ++++-
 .../testing/selftests/kvm/include/x86/processor.h  | 28 ++++++---
 tools/testing/selftests/kvm/lib/x86/processor.c    | 71 +++++++++++++---------
 3 files changed, 76 insertions(+), 39 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/include/x86/kvm_util_arch.h b/tools/testing/selftests/kvm/include/x86/kvm_util_arch.h
index 456e5ca170df..bad381d63b6a 100644
--- a/tools/testing/selftests/kvm/include/x86/kvm_util_arch.h
+++ b/tools/testing/selftests/kvm/include/x86/kvm_util_arch.h
@@ -10,7 +10,21 @@
 
 extern bool is_forced_emulation_enabled;
 
-struct kvm_mmu_arch {};
+struct pte_masks {
+	uint64_t present;
+	uint64_t writable;
+	uint64_t user;
+	uint64_t accessed;
+	uint64_t dirty;
+	uint64_t huge;
+	uint64_t nx;
+	uint64_t c;
+	uint64_t s;
+};
+
+struct kvm_mmu_arch {
+	struct pte_masks pte_masks;
+};
 
 struct kvm_vm_arch {
 	vm_vaddr_t gdt;
diff --git a/tools/testing/selftests/kvm/include/x86/processor.h b/tools/testing/selftests/kvm/include/x86/processor.h
index 43970a96baed..55dac84cd4a7 100644
--- a/tools/testing/selftests/kvm/include/x86/processor.h
+++ b/tools/testing/selftests/kvm/include/x86/processor.h
@@ -362,16 +362,6 @@ static inline unsigned int x86_model(unsigned int eax)
 	return ((eax >> 12) & 0xf0) | ((eax >> 4) & 0x0f);
 }
 
-/* Page table bitfield declarations */
-#define PTE_PRESENT_MASK        BIT_ULL(0)
-#define PTE_WRITABLE_MASK       BIT_ULL(1)
-#define PTE_USER_MASK           BIT_ULL(2)
-#define PTE_ACCESSED_MASK       BIT_ULL(5)
-#define PTE_DIRTY_MASK          BIT_ULL(6)
-#define PTE_LARGE_MASK          BIT_ULL(7)
-#define PTE_GLOBAL_MASK         BIT_ULL(8)
-#define PTE_NX_MASK             BIT_ULL(63)
-
 #define PHYSICAL_PAGE_MASK      GENMASK_ULL(51, 12)
 
 #define PAGE_SHIFT		12
@@ -1451,6 +1441,24 @@ enum pg_level {
 #define PG_SIZE_2M PG_LEVEL_SIZE(PG_LEVEL_2M)
 #define PG_SIZE_1G PG_LEVEL_SIZE(PG_LEVEL_1G)
 
+#define PTE_PRESENT_MASK(mmu)		((mmu)->arch.pte_masks.present)
+#define PTE_WRITABLE_MASK(mmu)		((mmu)->arch.pte_masks.writable)
+#define PTE_USER_MASK(mmu)		((mmu)->arch.pte_masks.user)
+#define PTE_ACCESSED_MASK(mmu)		((mmu)->arch.pte_masks.accessed)
+#define PTE_DIRTY_MASK(mmu)		((mmu)->arch.pte_masks.dirty)
+#define PTE_HUGE_MASK(mmu)		((mmu)->arch.pte_masks.huge)
+#define PTE_NX_MASK(mmu)		((mmu)->arch.pte_masks.nx)
+#define PTE_C_BIT_MASK(mmu)		((mmu)->arch.pte_masks.c)
+#define PTE_S_BIT_MASK(mmu)		((mmu)->arch.pte_masks.s)
+
+#define is_present_pte(mmu, pte)	(!!(*(pte) & PTE_PRESENT_MASK(mmu)))
+#define is_writable_pte(mmu, pte)	(!!(*(pte) & PTE_WRITABLE_MASK(mmu)))
+#define is_user_pte(mmu, pte)		(!!(*(pte) & PTE_USER_MASK(mmu)))
+#define is_accessed_pte(mmu, pte)	(!!(*(pte) & PTE_ACCESSED_MASK(mmu)))
+#define is_dirty_pte(mmu, pte)		(!!(*(pte) & PTE_DIRTY_MASK(mmu)))
+#define is_huge_pte(mmu, pte)		(!!(*(pte) & PTE_HUGE_MASK(mmu)))
+#define is_nx_pte(mmu, pte)		(!!(*(pte) & PTE_NX_MASK(mmu)))
+
 void __virt_pg_map(struct kvm_vm *vm, struct kvm_mmu *mmu, uint64_t vaddr,
 		   uint64_t paddr,  int level);
 void virt_map_level(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
diff --git a/tools/testing/selftests/kvm/lib/x86/processor.c b/tools/testing/selftests/kvm/lib/x86/processor.c
index f25742a804b0..3800f4ff6770 100644
--- a/tools/testing/selftests/kvm/lib/x86/processor.c
+++ b/tools/testing/selftests/kvm/lib/x86/processor.c
@@ -156,12 +156,14 @@ bool kvm_is_tdp_enabled(void)
 		return get_kvm_amd_param_bool("npt");
 }
 
-static void virt_mmu_init(struct kvm_vm *vm, struct kvm_mmu *mmu)
+static void virt_mmu_init(struct kvm_vm *vm, struct kvm_mmu *mmu,
+			  struct pte_masks *pte_masks)
 {
 	/* If needed, create the top-level page table. */
 	if (!mmu->pgd_created) {
 		mmu->pgd = vm_alloc_page_table(vm);
 		mmu->pgd_created = true;
+		mmu->arch.pte_masks = *pte_masks;
 	}
 }
 
@@ -170,7 +172,19 @@ void virt_arch_pgd_alloc(struct kvm_vm *vm)
 	TEST_ASSERT(vm->mode == VM_MODE_PXXVYY_4K,
 		    "Unknown or unsupported guest mode: 0x%x", vm->mode);
 
-	virt_mmu_init(vm, &vm->mmu);
+	struct pte_masks pte_masks = (struct pte_masks){
+		.present	=	BIT_ULL(0),
+		.writable	=	BIT_ULL(1),
+		.user		=	BIT_ULL(2),
+		.accessed	=	BIT_ULL(5),
+		.dirty		=	BIT_ULL(6),
+		.huge		=	BIT_ULL(7),
+		.nx		=	BIT_ULL(63),
+		.c		=	vm->arch.c_bit,
+		.s		=	vm->arch.s_bit,
+	};
+
+	virt_mmu_init(vm, &vm->mmu, &pte_masks);
 }
 
 static void *virt_get_pte(struct kvm_vm *vm, struct kvm_mmu *mmu,
@@ -180,7 +194,7 @@ static void *virt_get_pte(struct kvm_vm *vm, struct kvm_mmu *mmu,
 	uint64_t *page_table = addr_gpa2hva(vm, pt_gpa);
 	int index = (vaddr >> PG_LEVEL_SHIFT(level)) & 0x1ffu;
 
-	TEST_ASSERT((*parent_pte == mmu->pgd) || (*parent_pte & PTE_PRESENT_MASK),
+	TEST_ASSERT((*parent_pte == mmu->pgd) || is_present_pte(mmu, parent_pte),
 		    "Parent PTE (level %d) not PRESENT for gva: 0x%08lx",
 		    level + 1, vaddr);
 
@@ -199,10 +213,10 @@ static uint64_t *virt_create_upper_pte(struct kvm_vm *vm,
 
 	paddr = vm_untag_gpa(vm, paddr);
 
-	if (!(*pte & PTE_PRESENT_MASK)) {
-		*pte = PTE_PRESENT_MASK | PTE_WRITABLE_MASK;
+	if (!is_present_pte(mmu, pte)) {
+		*pte = PTE_PRESENT_MASK(mmu) | PTE_WRITABLE_MASK(mmu);
 		if (current_level == target_level)
-			*pte |= PTE_LARGE_MASK | (paddr & PHYSICAL_PAGE_MASK);
+			*pte |= PTE_HUGE_MASK(mmu) | (paddr & PHYSICAL_PAGE_MASK);
 		else
 			*pte |= vm_alloc_page_table(vm) & PHYSICAL_PAGE_MASK;
 	} else {
@@ -214,7 +228,7 @@ static uint64_t *virt_create_upper_pte(struct kvm_vm *vm,
 		TEST_ASSERT(current_level != target_level,
 			    "Cannot create hugepage at level: %u, vaddr: 0x%lx",
 			    current_level, vaddr);
-		TEST_ASSERT(!(*pte & PTE_LARGE_MASK),
+		TEST_ASSERT(!is_huge_pte(mmu, pte),
 			    "Cannot create page table at level: %u, vaddr: 0x%lx",
 			    current_level, vaddr);
 	}
@@ -255,24 +269,24 @@ void __virt_pg_map(struct kvm_vm *vm, struct kvm_mmu *mmu, uint64_t vaddr,
 	     current_level--) {
 		pte = virt_create_upper_pte(vm, mmu, pte, vaddr, paddr,
 					    current_level, level);
-		if (*pte & PTE_LARGE_MASK)
+		if (is_huge_pte(mmu, pte))
 			return;
 	}
 
 	/* Fill in page table entry. */
 	pte = virt_get_pte(vm, mmu, pte, vaddr, PG_LEVEL_4K);
-	TEST_ASSERT(!(*pte & PTE_PRESENT_MASK),
+	TEST_ASSERT(!is_present_pte(mmu, pte),
 		    "PTE already present for 4k page at vaddr: 0x%lx", vaddr);
-	*pte = PTE_PRESENT_MASK | PTE_WRITABLE_MASK | (paddr & PHYSICAL_PAGE_MASK);
+	*pte = PTE_PRESENT_MASK(mmu) | PTE_WRITABLE_MASK(mmu) | (paddr & PHYSICAL_PAGE_MASK);
 
 	/*
 	 * Neither SEV nor TDX supports shared page tables, so only the final
 	 * leaf PTE needs manually set the C/S-bit.
 	 */
 	if (vm_is_gpa_protected(vm, paddr))
-		*pte |= vm->arch.c_bit;
+		*pte |= PTE_C_BIT_MASK(mmu);
 	else
-		*pte |= vm->arch.s_bit;
+		*pte |= PTE_S_BIT_MASK(mmu);
 }
 
 void virt_arch_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr)
@@ -304,7 +318,7 @@ void virt_map_level(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
 static bool vm_is_target_pte(struct kvm_mmu *mmu, uint64_t *pte,
 			     int *level, int current_level)
 {
-	if (*pte & PTE_LARGE_MASK) {
+	if (is_huge_pte(mmu, pte)) {
 		TEST_ASSERT(*level == PG_LEVEL_NONE ||
 			    *level == current_level,
 			    "Unexpected hugepage at level %d", current_level);
@@ -362,12 +376,13 @@ uint64_t *vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr)
 
 void virt_arch_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent)
 {
+	struct kvm_mmu *mmu = &vm->mmu;
 	uint64_t *pml4e, *pml4e_start;
 	uint64_t *pdpe, *pdpe_start;
 	uint64_t *pde, *pde_start;
 	uint64_t *pte, *pte_start;
 
-	if (!vm->mmu.pgd_created)
+	if (!mmu->pgd_created)
 		return;
 
 	fprintf(stream, "%*s                                          "
@@ -375,47 +390,47 @@ void virt_arch_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent)
 	fprintf(stream, "%*s      index hvaddr         gpaddr         "
 		"addr         w exec dirty\n",
 		indent, "");
-	pml4e_start = (uint64_t *) addr_gpa2hva(vm, vm->mmu.pgd);
+	pml4e_start = (uint64_t *) addr_gpa2hva(vm, mmu->pgd);
 	for (uint16_t n1 = 0; n1 <= 0x1ffu; n1++) {
 		pml4e = &pml4e_start[n1];
-		if (!(*pml4e & PTE_PRESENT_MASK))
+		if (!is_present_pte(mmu, pml4e))
 			continue;
 		fprintf(stream, "%*spml4e 0x%-3zx %p 0x%-12lx 0x%-10llx %u "
 			" %u\n",
 			indent, "",
 			pml4e - pml4e_start, pml4e,
 			addr_hva2gpa(vm, pml4e), PTE_GET_PFN(*pml4e),
-			!!(*pml4e & PTE_WRITABLE_MASK), !!(*pml4e & PTE_NX_MASK));
+			is_writable_pte(mmu, pml4e), is_nx_pte(mmu, pml4e));
 
 		pdpe_start = addr_gpa2hva(vm, *pml4e & PHYSICAL_PAGE_MASK);
 		for (uint16_t n2 = 0; n2 <= 0x1ffu; n2++) {
 			pdpe = &pdpe_start[n2];
-			if (!(*pdpe & PTE_PRESENT_MASK))
+			if (!is_present_pte(mmu, pdpe))
 				continue;
 			fprintf(stream, "%*spdpe  0x%-3zx %p 0x%-12lx 0x%-10llx "
 				"%u  %u\n",
 				indent, "",
 				pdpe - pdpe_start, pdpe,
 				addr_hva2gpa(vm, pdpe),
-				PTE_GET_PFN(*pdpe), !!(*pdpe & PTE_WRITABLE_MASK),
-				!!(*pdpe & PTE_NX_MASK));
+				PTE_GET_PFN(*pdpe), is_writable_pte(mmu, pdpe),
+				is_nx_pte(mmu, pdpe));
 
 			pde_start = addr_gpa2hva(vm, *pdpe & PHYSICAL_PAGE_MASK);
 			for (uint16_t n3 = 0; n3 <= 0x1ffu; n3++) {
 				pde = &pde_start[n3];
-				if (!(*pde & PTE_PRESENT_MASK))
+				if (!is_present_pte(mmu, pde))
 					continue;
 				fprintf(stream, "%*spde   0x%-3zx %p "
 					"0x%-12lx 0x%-10llx %u  %u\n",
 					indent, "", pde - pde_start, pde,
 					addr_hva2gpa(vm, pde),
-					PTE_GET_PFN(*pde), !!(*pde & PTE_WRITABLE_MASK),
-					!!(*pde & PTE_NX_MASK));
+					PTE_GET_PFN(*pde), is_writable_pte(mmu, pde),
+					is_nx_pte(mmu, pde));
 
 				pte_start = addr_gpa2hva(vm, *pde & PHYSICAL_PAGE_MASK);
 				for (uint16_t n4 = 0; n4 <= 0x1ffu; n4++) {
 					pte = &pte_start[n4];
-					if (!(*pte & PTE_PRESENT_MASK))
+					if (!is_present_pte(mmu, pte))
 						continue;
 					fprintf(stream, "%*spte   0x%-3zx %p "
 						"0x%-12lx 0x%-10llx %u  %u "
@@ -424,9 +439,9 @@ void virt_arch_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent)
 						pte - pte_start, pte,
 						addr_hva2gpa(vm, pte),
 						PTE_GET_PFN(*pte),
-						!!(*pte & PTE_WRITABLE_MASK),
-						!!(*pte & PTE_NX_MASK),
-						!!(*pte & PTE_DIRTY_MASK),
+						is_writable_pte(mmu, pte),
+						is_nx_pte(mmu, pte),
+						is_dirty_pte(mmu, pte),
 						((uint64_t) n1 << 27)
 							| ((uint64_t) n2 << 18)
 							| ((uint64_t) n3 << 9)
@@ -509,7 +524,7 @@ vm_paddr_t addr_arch_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva)
 	int level = PG_LEVEL_NONE;
 	uint64_t *pte = __vm_get_page_table_entry(vm, &vm->mmu, gva, &level);
 
-	TEST_ASSERT(*pte & PTE_PRESENT_MASK,
+	TEST_ASSERT(is_present_pte(&vm->mmu, pte),
 		    "Leaf PTE not PRESENT for gva: 0x%08lx", gva);
 
 	/*
-- 
cgit v1.2.3


From f00f519cebcd4280c4ce4fab8133ed2dcfa8f95a Mon Sep 17 00:00:00 2001
From: Yosry Ahmed <yosry.ahmed@linux.dev>
Date: Tue, 30 Dec 2025 15:01:39 -0800
Subject: KVM: selftests: Use a TDP MMU to share EPT page tables between vCPUs

prepare_eptp() currently allocates new EPTs for each vCPU.  memstress has
its own hack to share the EPTs between vCPUs.  Currently, there is no
reason to have separate EPTs for each vCPU, and the complexity is
significant.  The only reason it doesn't matter now is because memstress
is the only user with multiple vCPUs.

Add vm_enable_ept() to allocate EPT page tables for an entire VM, and use
it everywhere to replace prepare_eptp().  Drop 'eptp' and 'eptp_hva' from
'struct vmx_pages' as they serve no purpose (e.g. the EPTP can be built
from the PGD), but keep 'eptp_gpa' so that the MMU structure doesn't need
to be passed in along with vmx_pages.  Dynamically allocate the TDP MMU
structure to avoid a cyclical dependency between kvm_util_arch.h and
kvm_util.h.

Remove the workaround in memstress to copy the EPT root between vCPUs
since that's now the default behavior.

Name the MMU tdp_mmu instead of e.g. nested_mmu or nested.mmu to avoid
recreating the same mess that KVM has with respect to "nested" MMUs, e.g.
does nested refer to the stage-2 page tables created by L1, or the stage-1
page tables created by L2?

Signed-off-by: Yosry Ahmed <yosry.ahmed@linux.dev>
Co-developed-by: Sean Christopherson <seanjc@google.com>
Link: https://patch.msgid.link/20251230230150.4150236-11-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 .../selftests/kvm/include/x86/kvm_util_arch.h      |  4 +++
 .../testing/selftests/kvm/include/x86/processor.h  |  3 +++
 tools/testing/selftests/kvm/include/x86/vmx.h      |  8 +++---
 tools/testing/selftests/kvm/lib/x86/memstress.c    | 19 +++++---------
 tools/testing/selftests/kvm/lib/x86/processor.c    |  9 +++++++
 tools/testing/selftests/kvm/lib/x86/vmx.c          | 30 ++++++++++++++--------
 .../testing/selftests/kvm/x86/vmx_dirty_log_test.c |  7 +++--
 7 files changed, 48 insertions(+), 32 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/include/x86/kvm_util_arch.h b/tools/testing/selftests/kvm/include/x86/kvm_util_arch.h
index bad381d63b6a..05a1fc1780f2 100644
--- a/tools/testing/selftests/kvm/include/x86/kvm_util_arch.h
+++ b/tools/testing/selftests/kvm/include/x86/kvm_util_arch.h
@@ -26,6 +26,8 @@ struct kvm_mmu_arch {
 	struct pte_masks pte_masks;
 };
 
+struct kvm_mmu;
+
 struct kvm_vm_arch {
 	vm_vaddr_t gdt;
 	vm_vaddr_t tss;
@@ -35,6 +37,8 @@ struct kvm_vm_arch {
 	uint64_t s_bit;
 	int sev_fd;
 	bool is_pt_protected;
+
+	struct kvm_mmu *tdp_mmu;
 };
 
 static inline bool __vm_arch_has_protected_memory(struct kvm_vm_arch *arch)
diff --git a/tools/testing/selftests/kvm/include/x86/processor.h b/tools/testing/selftests/kvm/include/x86/processor.h
index 55dac84cd4a7..0164ef090787 100644
--- a/tools/testing/selftests/kvm/include/x86/processor.h
+++ b/tools/testing/selftests/kvm/include/x86/processor.h
@@ -1459,6 +1459,9 @@ enum pg_level {
 #define is_huge_pte(mmu, pte)		(!!(*(pte) & PTE_HUGE_MASK(mmu)))
 #define is_nx_pte(mmu, pte)		(!!(*(pte) & PTE_NX_MASK(mmu)))
 
+void tdp_mmu_init(struct kvm_vm *vm, int pgtable_levels,
+		  struct pte_masks *pte_masks);
+
 void __virt_pg_map(struct kvm_vm *vm, struct kvm_mmu *mmu, uint64_t vaddr,
 		   uint64_t paddr,  int level);
 void virt_map_level(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
diff --git a/tools/testing/selftests/kvm/include/x86/vmx.h b/tools/testing/selftests/kvm/include/x86/vmx.h
index 04b8231d032a..1fd83c23529a 100644
--- a/tools/testing/selftests/kvm/include/x86/vmx.h
+++ b/tools/testing/selftests/kvm/include/x86/vmx.h
@@ -520,13 +520,11 @@ struct vmx_pages {
 	uint64_t vmwrite_gpa;
 	void *vmwrite;
 
-	void *eptp_hva;
-	uint64_t eptp_gpa;
-	void *eptp;
-
 	void *apic_access_hva;
 	uint64_t apic_access_gpa;
 	void *apic_access;
+
+	uint64_t eptp_gpa;
 };
 
 union vmx_basic {
@@ -568,7 +566,7 @@ void tdp_identity_map_default_memslots(struct vmx_pages *vmx,
 void tdp_identity_map_1g(struct vmx_pages *vmx, struct kvm_vm *vm,
 			 uint64_t addr, uint64_t size);
 bool kvm_cpu_has_ept(void);
-void prepare_eptp(struct vmx_pages *vmx, struct kvm_vm *vm);
+void vm_enable_ept(struct kvm_vm *vm);
 void prepare_virtualize_apic_accesses(struct vmx_pages *vmx, struct kvm_vm *vm);
 
 #endif /* SELFTEST_KVM_VMX_H */
diff --git a/tools/testing/selftests/kvm/lib/x86/memstress.c b/tools/testing/selftests/kvm/lib/x86/memstress.c
index 1928b00bde51..00f7f11e5f0e 100644
--- a/tools/testing/selftests/kvm/lib/x86/memstress.c
+++ b/tools/testing/selftests/kvm/lib/x86/memstress.c
@@ -59,12 +59,10 @@ uint64_t memstress_nested_pages(int nr_vcpus)
 	return 513 + 10 * nr_vcpus;
 }
 
-void memstress_setup_ept(struct vmx_pages *vmx, struct kvm_vm *vm)
+static void memstress_setup_ept_mappings(struct vmx_pages *vmx, struct kvm_vm *vm)
 {
 	uint64_t start, end;
 
-	prepare_eptp(vmx, vm);
-
 	/*
 	 * Identity map the first 4G and the test region with 1G pages so that
 	 * KVM can shadow the EPT12 with the maximum huge page size supported
@@ -79,7 +77,7 @@ void memstress_setup_ept(struct vmx_pages *vmx, struct kvm_vm *vm)
 
 void memstress_setup_nested(struct kvm_vm *vm, int nr_vcpus, struct kvm_vcpu *vcpus[])
 {
-	struct vmx_pages *vmx, *vmx0 = NULL;
+	struct vmx_pages *vmx;
 	struct kvm_regs regs;
 	vm_vaddr_t vmx_gva;
 	int vcpu_id;
@@ -87,18 +85,13 @@ void memstress_setup_nested(struct kvm_vm *vm, int nr_vcpus, struct kvm_vcpu *vc
 	TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX));
 	TEST_REQUIRE(kvm_cpu_has_ept());
 
+	vm_enable_ept(vm);
 	for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) {
 		vmx = vcpu_alloc_vmx(vm, &vmx_gva);
 
-		if (vcpu_id == 0) {
-			memstress_setup_ept(vmx, vm);
-			vmx0 = vmx;
-		} else {
-			/* Share the same EPT table across all vCPUs. */
-			vmx->eptp = vmx0->eptp;
-			vmx->eptp_hva = vmx0->eptp_hva;
-			vmx->eptp_gpa = vmx0->eptp_gpa;
-		}
+		/* The EPTs are shared across vCPUs, setup the mappings once */
+		if (vcpu_id == 0)
+			memstress_setup_ept_mappings(vmx, vm);
 
 		/*
 		 * Override the vCPU to run memstress_l1_guest_code() which will
diff --git a/tools/testing/selftests/kvm/lib/x86/processor.c b/tools/testing/selftests/kvm/lib/x86/processor.c
index 3800f4ff6770..8a9298a72897 100644
--- a/tools/testing/selftests/kvm/lib/x86/processor.c
+++ b/tools/testing/selftests/kvm/lib/x86/processor.c
@@ -187,6 +187,15 @@ void virt_arch_pgd_alloc(struct kvm_vm *vm)
 	virt_mmu_init(vm, &vm->mmu, &pte_masks);
 }
 
+void tdp_mmu_init(struct kvm_vm *vm, int pgtable_levels,
+		  struct pte_masks *pte_masks)
+{
+	TEST_ASSERT(!vm->arch.tdp_mmu, "TDP MMU already initialized");
+
+	vm->arch.tdp_mmu = calloc(1, sizeof(*vm->arch.tdp_mmu));
+	virt_mmu_init(vm, vm->arch.tdp_mmu, pte_masks);
+}
+
 static void *virt_get_pte(struct kvm_vm *vm, struct kvm_mmu *mmu,
 			  uint64_t *parent_pte, uint64_t vaddr, int level)
 {
diff --git a/tools/testing/selftests/kvm/lib/x86/vmx.c b/tools/testing/selftests/kvm/lib/x86/vmx.c
index a3e2eae981da..9d4e391fdf2c 100644
--- a/tools/testing/selftests/kvm/lib/x86/vmx.c
+++ b/tools/testing/selftests/kvm/lib/x86/vmx.c
@@ -56,6 +56,21 @@ int vcpu_enable_evmcs(struct kvm_vcpu *vcpu)
 	return evmcs_ver;
 }
 
+void vm_enable_ept(struct kvm_vm *vm)
+{
+	TEST_ASSERT(kvm_cpu_has_ept(), "KVM doesn't support nested EPT");
+	if (vm->arch.tdp_mmu)
+		return;
+
+	/* TODO: Drop eptPageTableEntry in favor of PTE masks. */
+	struct pte_masks pte_masks = (struct pte_masks) {
+
+	};
+
+	/* TODO: Add support for 5-level EPT. */
+	tdp_mmu_init(vm, 4, &pte_masks);
+}
+
 /* Allocate memory regions for nested VMX tests.
  *
  * Input Args:
@@ -105,6 +120,9 @@ vcpu_alloc_vmx(struct kvm_vm *vm, vm_vaddr_t *p_vmx_gva)
 	vmx->vmwrite_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->vmwrite);
 	memset(vmx->vmwrite_hva, 0, getpagesize());
 
+	if (vm->arch.tdp_mmu)
+		vmx->eptp_gpa = vm->arch.tdp_mmu->pgd;
+
 	*p_vmx_gva = vmx_gva;
 	return vmx;
 }
@@ -395,7 +413,8 @@ void __tdp_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm,
 		  uint64_t nested_paddr, uint64_t paddr, int target_level)
 {
 	const uint64_t page_size = PG_LEVEL_SIZE(target_level);
-	struct eptPageTableEntry *pt = vmx->eptp_hva, *pte;
+	void *eptp_hva = addr_gpa2hva(vm, vm->arch.tdp_mmu->pgd);
+	struct eptPageTableEntry *pt = eptp_hva, *pte;
 	uint16_t index;
 
 	TEST_ASSERT(vm->mode == VM_MODE_PXXVYY_4K,
@@ -525,15 +544,6 @@ bool kvm_cpu_has_ept(void)
 	return ctrl & SECONDARY_EXEC_ENABLE_EPT;
 }
 
-void prepare_eptp(struct vmx_pages *vmx, struct kvm_vm *vm)
-{
-	TEST_ASSERT(kvm_cpu_has_ept(), "KVM doesn't support nested EPT");
-
-	vmx->eptp = (void *)vm_vaddr_alloc_page(vm);
-	vmx->eptp_hva = addr_gva2hva(vm, (uintptr_t)vmx->eptp);
-	vmx->eptp_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->eptp);
-}
-
 void prepare_virtualize_apic_accesses(struct vmx_pages *vmx, struct kvm_vm *vm)
 {
 	vmx->apic_access = (void *)vm_vaddr_alloc_page(vm);
diff --git a/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c b/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c
index e7d0c08ba29d..5c8cf8ac42a2 100644
--- a/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c
+++ b/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c
@@ -93,6 +93,9 @@ static void test_vmx_dirty_log(bool enable_ept)
 
 	/* Create VM */
 	vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code);
+	if (enable_ept)
+		vm_enable_ept(vm);
+
 	vmx = vcpu_alloc_vmx(vm, &vmx_pages_gva);
 	vcpu_args_set(vcpu, 1, vmx_pages_gva);
 
@@ -113,14 +116,10 @@ static void test_vmx_dirty_log(bool enable_ept)
 	 * ... pages in the L2 GPA range [0xc0001000, 0xc0003000) will map to
 	 * 0xc0000000.
 	 *
-	 * Note that prepare_eptp should be called only L1's GPA map is done,
-	 * meaning after the last call to virt_map.
-	 *
 	 * When EPT is disabled, the L2 guest code will still access the same L1
 	 * GPAs as the EPT enabled case.
 	 */
 	if (enable_ept) {
-		prepare_eptp(vmx, vm);
 		tdp_identity_map_default_memslots(vmx, vm);
 		tdp_map(vmx, vm, NESTED_TEST_MEM1, GUEST_TEST_MEM, PAGE_SIZE);
 		tdp_map(vmx, vm, NESTED_TEST_MEM2, GUEST_TEST_MEM, PAGE_SIZE);
-- 
cgit v1.2.3


From e40e72fec0dea9ac55aea84a0d76ccb7d7f32204 Mon Sep 17 00:00:00 2001
From: Yosry Ahmed <yosry.ahmed@linux.dev>
Date: Tue, 30 Dec 2025 15:01:40 -0800
Subject: KVM: selftests: Stop passing VMX metadata to TDP mapping functions

The root GPA is now retrieved from the nested MMU, stop passing VMX
metadata. This is in preparation for making these functions work for
NPTs as well.

Opportunistically drop tdp_pg_map() since it's unused.

No functional change intended.

Signed-off-by: Yosry Ahmed <yosry.ahmed@linux.dev>
Link: https://patch.msgid.link/20251230230150.4150236-12-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 tools/testing/selftests/kvm/include/x86/vmx.h      | 11 ++------
 tools/testing/selftests/kvm/lib/x86/memstress.c    | 11 ++++----
 tools/testing/selftests/kvm/lib/x86/vmx.c          | 33 ++++++++--------------
 .../testing/selftests/kvm/x86/vmx_dirty_log_test.c |  9 +++---
 4 files changed, 24 insertions(+), 40 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/include/x86/vmx.h b/tools/testing/selftests/kvm/include/x86/vmx.h
index 1fd83c23529a..4dd4c2094ee6 100644
--- a/tools/testing/selftests/kvm/include/x86/vmx.h
+++ b/tools/testing/selftests/kvm/include/x86/vmx.h
@@ -557,14 +557,9 @@ bool load_vmcs(struct vmx_pages *vmx);
 
 bool ept_1g_pages_supported(void);
 
-void tdp_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, uint64_t nested_paddr,
-		uint64_t paddr);
-void tdp_map(struct vmx_pages *vmx, struct kvm_vm *vm, uint64_t nested_paddr,
-	     uint64_t paddr, uint64_t size);
-void tdp_identity_map_default_memslots(struct vmx_pages *vmx,
-				       struct kvm_vm *vm);
-void tdp_identity_map_1g(struct vmx_pages *vmx, struct kvm_vm *vm,
-			 uint64_t addr, uint64_t size);
+void tdp_map(struct kvm_vm *vm, uint64_t nested_paddr, uint64_t paddr, uint64_t size);
+void tdp_identity_map_default_memslots(struct kvm_vm *vm);
+void tdp_identity_map_1g(struct kvm_vm *vm,  uint64_t addr, uint64_t size);
 bool kvm_cpu_has_ept(void);
 void vm_enable_ept(struct kvm_vm *vm);
 void prepare_virtualize_apic_accesses(struct vmx_pages *vmx, struct kvm_vm *vm);
diff --git a/tools/testing/selftests/kvm/lib/x86/memstress.c b/tools/testing/selftests/kvm/lib/x86/memstress.c
index 00f7f11e5f0e..3319cb57a78d 100644
--- a/tools/testing/selftests/kvm/lib/x86/memstress.c
+++ b/tools/testing/selftests/kvm/lib/x86/memstress.c
@@ -59,7 +59,7 @@ uint64_t memstress_nested_pages(int nr_vcpus)
 	return 513 + 10 * nr_vcpus;
 }
 
-static void memstress_setup_ept_mappings(struct vmx_pages *vmx, struct kvm_vm *vm)
+static void memstress_setup_ept_mappings(struct kvm_vm *vm)
 {
 	uint64_t start, end;
 
@@ -68,16 +68,15 @@ static void memstress_setup_ept_mappings(struct vmx_pages *vmx, struct kvm_vm *v
 	 * KVM can shadow the EPT12 with the maximum huge page size supported
 	 * by the backing source.
 	 */
-	tdp_identity_map_1g(vmx, vm, 0, 0x100000000ULL);
+	tdp_identity_map_1g(vm, 0, 0x100000000ULL);
 
 	start = align_down(memstress_args.gpa, PG_SIZE_1G);
 	end = align_up(memstress_args.gpa + memstress_args.size, PG_SIZE_1G);
-	tdp_identity_map_1g(vmx, vm, start, end - start);
+	tdp_identity_map_1g(vm, start, end - start);
 }
 
 void memstress_setup_nested(struct kvm_vm *vm, int nr_vcpus, struct kvm_vcpu *vcpus[])
 {
-	struct vmx_pages *vmx;
 	struct kvm_regs regs;
 	vm_vaddr_t vmx_gva;
 	int vcpu_id;
@@ -87,11 +86,11 @@ void memstress_setup_nested(struct kvm_vm *vm, int nr_vcpus, struct kvm_vcpu *vc
 
 	vm_enable_ept(vm);
 	for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) {
-		vmx = vcpu_alloc_vmx(vm, &vmx_gva);
+		vcpu_alloc_vmx(vm, &vmx_gva);
 
 		/* The EPTs are shared across vCPUs, setup the mappings once */
 		if (vcpu_id == 0)
-			memstress_setup_ept_mappings(vmx, vm);
+			memstress_setup_ept_mappings(vm);
 
 		/*
 		 * Override the vCPU to run memstress_l1_guest_code() which will
diff --git a/tools/testing/selftests/kvm/lib/x86/vmx.c b/tools/testing/selftests/kvm/lib/x86/vmx.c
index 9d4e391fdf2c..ea1c09f9e8ab 100644
--- a/tools/testing/selftests/kvm/lib/x86/vmx.c
+++ b/tools/testing/selftests/kvm/lib/x86/vmx.c
@@ -409,8 +409,8 @@ static void tdp_create_pte(struct kvm_vm *vm,
 }
 
 
-void __tdp_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm,
-		  uint64_t nested_paddr, uint64_t paddr, int target_level)
+void __tdp_pg_map(struct kvm_vm *vm, uint64_t nested_paddr, uint64_t paddr,
+		  int target_level)
 {
 	const uint64_t page_size = PG_LEVEL_SIZE(target_level);
 	void *eptp_hva = addr_gpa2hva(vm, vm->arch.tdp_mmu->pgd);
@@ -453,12 +453,6 @@ void __tdp_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm,
 	}
 }
 
-void tdp_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm,
-		uint64_t nested_paddr, uint64_t paddr)
-{
-	__tdp_pg_map(vmx, vm, nested_paddr, paddr, PG_LEVEL_4K);
-}
-
 /*
  * Map a range of EPT guest physical addresses to the VM's physical address
  *
@@ -476,9 +470,8 @@ void tdp_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm,
  * Within the VM given by vm, creates a nested guest translation for the
  * page range starting at nested_paddr to the page range starting at paddr.
  */
-void __tdp_map(struct vmx_pages *vmx, struct kvm_vm *vm,
-	       uint64_t nested_paddr, uint64_t paddr, uint64_t size,
-		  int level)
+void __tdp_map(struct kvm_vm *vm, uint64_t nested_paddr, uint64_t paddr,
+	       uint64_t size, int level)
 {
 	size_t page_size = PG_LEVEL_SIZE(level);
 	size_t npages = size / page_size;
@@ -487,23 +480,22 @@ void __tdp_map(struct vmx_pages *vmx, struct kvm_vm *vm,
 	TEST_ASSERT(paddr + size > paddr, "Paddr overflow");
 
 	while (npages--) {
-		__tdp_pg_map(vmx, vm, nested_paddr, paddr, level);
+		__tdp_pg_map(vm, nested_paddr, paddr, level);
 		nested_paddr += page_size;
 		paddr += page_size;
 	}
 }
 
-void tdp_map(struct vmx_pages *vmx, struct kvm_vm *vm,
-	     uint64_t nested_paddr, uint64_t paddr, uint64_t size)
+void tdp_map(struct kvm_vm *vm, uint64_t nested_paddr, uint64_t paddr,
+	     uint64_t size)
 {
-	__tdp_map(vmx, vm, nested_paddr, paddr, size, PG_LEVEL_4K);
+	__tdp_map(vm, nested_paddr, paddr, size, PG_LEVEL_4K);
 }
 
 /* Prepare an identity extended page table that maps all the
  * physical pages in VM.
  */
-void tdp_identity_map_default_memslots(struct vmx_pages *vmx,
-				       struct kvm_vm *vm)
+void tdp_identity_map_default_memslots(struct kvm_vm *vm)
 {
 	uint32_t s, memslot = 0;
 	sparsebit_idx_t i, last;
@@ -520,16 +512,15 @@ void tdp_identity_map_default_memslots(struct vmx_pages *vmx,
 		if (i > last)
 			break;
 
-		tdp_map(vmx, vm, (uint64_t)i << vm->page_shift,
+		tdp_map(vm, (uint64_t)i << vm->page_shift,
 			(uint64_t)i << vm->page_shift, 1 << vm->page_shift);
 	}
 }
 
 /* Identity map a region with 1GiB Pages. */
-void tdp_identity_map_1g(struct vmx_pages *vmx, struct kvm_vm *vm,
-			    uint64_t addr, uint64_t size)
+void tdp_identity_map_1g(struct kvm_vm *vm, uint64_t addr, uint64_t size)
 {
-	__tdp_map(vmx, vm, addr, addr, size, PG_LEVEL_1G);
+	__tdp_map(vm, addr, addr, size, PG_LEVEL_1G);
 }
 
 bool kvm_cpu_has_ept(void)
diff --git a/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c b/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c
index 5c8cf8ac42a2..370f8d3117c2 100644
--- a/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c
+++ b/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c
@@ -80,7 +80,6 @@ void l1_guest_code(struct vmx_pages *vmx)
 static void test_vmx_dirty_log(bool enable_ept)
 {
 	vm_vaddr_t vmx_pages_gva = 0;
-	struct vmx_pages *vmx;
 	unsigned long *bmap;
 	uint64_t *host_test_mem;
 
@@ -96,7 +95,7 @@ static void test_vmx_dirty_log(bool enable_ept)
 	if (enable_ept)
 		vm_enable_ept(vm);
 
-	vmx = vcpu_alloc_vmx(vm, &vmx_pages_gva);
+	vcpu_alloc_vmx(vm, &vmx_pages_gva);
 	vcpu_args_set(vcpu, 1, vmx_pages_gva);
 
 	/* Add an extra memory slot for testing dirty logging */
@@ -120,9 +119,9 @@ static void test_vmx_dirty_log(bool enable_ept)
 	 * GPAs as the EPT enabled case.
 	 */
 	if (enable_ept) {
-		tdp_identity_map_default_memslots(vmx, vm);
-		tdp_map(vmx, vm, NESTED_TEST_MEM1, GUEST_TEST_MEM, PAGE_SIZE);
-		tdp_map(vmx, vm, NESTED_TEST_MEM2, GUEST_TEST_MEM, PAGE_SIZE);
+		tdp_identity_map_default_memslots(vm);
+		tdp_map(vm, NESTED_TEST_MEM1, GUEST_TEST_MEM, PAGE_SIZE);
+		tdp_map(vm, NESTED_TEST_MEM2, GUEST_TEST_MEM, PAGE_SIZE);
 	}
 
 	bmap = bitmap_zalloc(TEST_MEM_PAGES);
-- 
cgit v1.2.3


From 8296b16c0a2ba018c3235db5325d679e603899d6 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Tue, 30 Dec 2025 15:01:41 -0800
Subject: KVM: selftests: Add a stage-2 MMU instance to kvm_vm

Add a stage-2 MMU instance so that architectures that support nested
virtualization (more specifically, nested stage-2 page tables) can create
and track stage-2 page tables for running L2 guests.  Plumb the structure
into common code to avoid cyclical dependencies, and to provide some line
of sight to having common APIs for creating stage-2 mappings.

As a bonus, putting the member in common code justifies using stage2_mmu
instead of tdp_mmu for x86.

Reviewed-by: Yosry Ahmed <yosry.ahmed@linux.dev>
Link: https://patch.msgid.link/20251230230150.4150236-13-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 tools/testing/selftests/kvm/include/kvm_util.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h
index c1497515fa6a..371d55e0366e 100644
--- a/tools/testing/selftests/kvm/include/kvm_util.h
+++ b/tools/testing/selftests/kvm/include/kvm_util.h
@@ -116,7 +116,12 @@ struct kvm_vm {
 	uint32_t dirty_ring_size;
 	uint64_t gpa_tag_mask;
 
+	/*
+	 * "mmu" is the guest's stage-1, with a short name because the vast
+	 * majority of tests only care about the stage-1 MMU.
+	 */
 	struct kvm_mmu mmu;
+	struct kvm_mmu stage2_mmu;
 
 	struct kvm_vm_arch arch;
 
-- 
cgit v1.2.3


From 508d1cc3ca0ac428b1d5d614519bc497868c2e9f Mon Sep 17 00:00:00 2001
From: Yosry Ahmed <yosry.ahmed@linux.dev>
Date: Tue, 30 Dec 2025 15:01:42 -0800
Subject: KVM: selftests: Reuse virt mapping functions for nested EPTs

Rework tdp_map() and friends to use __virt_pg_map() and drop the custom
EPT code in __tdp_pg_map() and tdp_create_pte().  The EPT code and
__virt_pg_map() are practically identical, the main differences are:
  - EPT uses the EPT struct overlay instead of the PTE masks.
  - EPT always assumes 4-level EPTs.

To reuse __virt_pg_map(), extend the PTE masks to work with EPT's RWX and
X-only capabilities, and provide a tdp_mmu_init() API so that EPT can pass
in the EPT PTE masks along with the root page level (which is currently
hardcoded to '4').

Don't reuse KVM's insane overloading of the USER bit for EPT_R as there's
no reason to multiplex bits in the selftests, e.g. selftests aren't trying
to shadow guest PTEs and thus don't care about funnelling protections into
a common permissions check.

Another benefit of reusing the code is having separate handling for
upper-level PTEs vs 4K PTEs, which avoids some quirks like setting the
large bit on a 4K PTE in the EPTs.

For all intents and purposes, no functional change intended.

Suggested-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Yosry Ahmed <yosry.ahmed@linux.dev>
Co-developed-by: Sean Christopherson <seanjc@google.com>
Link: https://patch.msgid.link/20251230230150.4150236-14-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 .../selftests/kvm/include/x86/kvm_util_arch.h      |   4 +-
 .../testing/selftests/kvm/include/x86/processor.h  |  16 ++-
 tools/testing/selftests/kvm/lib/x86/processor.c    |  21 +++-
 tools/testing/selftests/kvm/lib/x86/vmx.c          | 119 ++++-----------------
 4 files changed, 52 insertions(+), 108 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/include/x86/kvm_util_arch.h b/tools/testing/selftests/kvm/include/x86/kvm_util_arch.h
index 05a1fc1780f2..1cf84b8212c6 100644
--- a/tools/testing/selftests/kvm/include/x86/kvm_util_arch.h
+++ b/tools/testing/selftests/kvm/include/x86/kvm_util_arch.h
@@ -14,6 +14,8 @@ struct pte_masks {
 	uint64_t present;
 	uint64_t writable;
 	uint64_t user;
+	uint64_t readable;
+	uint64_t executable;
 	uint64_t accessed;
 	uint64_t dirty;
 	uint64_t huge;
@@ -37,8 +39,6 @@ struct kvm_vm_arch {
 	uint64_t s_bit;
 	int sev_fd;
 	bool is_pt_protected;
-
-	struct kvm_mmu *tdp_mmu;
 };
 
 static inline bool __vm_arch_has_protected_memory(struct kvm_vm_arch *arch)
diff --git a/tools/testing/selftests/kvm/include/x86/processor.h b/tools/testing/selftests/kvm/include/x86/processor.h
index 0164ef090787..e17cbbe71b8f 100644
--- a/tools/testing/selftests/kvm/include/x86/processor.h
+++ b/tools/testing/selftests/kvm/include/x86/processor.h
@@ -1444,6 +1444,8 @@ enum pg_level {
 #define PTE_PRESENT_MASK(mmu)		((mmu)->arch.pte_masks.present)
 #define PTE_WRITABLE_MASK(mmu)		((mmu)->arch.pte_masks.writable)
 #define PTE_USER_MASK(mmu)		((mmu)->arch.pte_masks.user)
+#define PTE_READABLE_MASK(mmu)		((mmu)->arch.pte_masks.readable)
+#define PTE_EXECUTABLE_MASK(mmu)	((mmu)->arch.pte_masks.executable)
 #define PTE_ACCESSED_MASK(mmu)		((mmu)->arch.pte_masks.accessed)
 #define PTE_DIRTY_MASK(mmu)		((mmu)->arch.pte_masks.dirty)
 #define PTE_HUGE_MASK(mmu)		((mmu)->arch.pte_masks.huge)
@@ -1451,13 +1453,23 @@ enum pg_level {
 #define PTE_C_BIT_MASK(mmu)		((mmu)->arch.pte_masks.c)
 #define PTE_S_BIT_MASK(mmu)		((mmu)->arch.pte_masks.s)
 
-#define is_present_pte(mmu, pte)	(!!(*(pte) & PTE_PRESENT_MASK(mmu)))
+/*
+ * For PTEs without a PRESENT bit (i.e. EPT entries), treat the PTE as present
+ * if it's executable or readable, as EPT supports execute-only PTEs, but not
+ * write-only PTEs.
+ */
+#define is_present_pte(mmu, pte)		\
+	(PTE_PRESENT_MASK(mmu) ?		\
+	 !!(*(pte) & PTE_PRESENT_MASK(mmu)) :	\
+	 !!(*(pte) & (PTE_READABLE_MASK(mmu) | PTE_EXECUTABLE_MASK(mmu))))
+#define is_executable_pte(mmu, pte)	\
+	((*(pte) & (PTE_EXECUTABLE_MASK(mmu) | PTE_NX_MASK(mmu))) == PTE_EXECUTABLE_MASK(mmu))
 #define is_writable_pte(mmu, pte)	(!!(*(pte) & PTE_WRITABLE_MASK(mmu)))
 #define is_user_pte(mmu, pte)		(!!(*(pte) & PTE_USER_MASK(mmu)))
 #define is_accessed_pte(mmu, pte)	(!!(*(pte) & PTE_ACCESSED_MASK(mmu)))
 #define is_dirty_pte(mmu, pte)		(!!(*(pte) & PTE_DIRTY_MASK(mmu)))
 #define is_huge_pte(mmu, pte)		(!!(*(pte) & PTE_HUGE_MASK(mmu)))
-#define is_nx_pte(mmu, pte)		(!!(*(pte) & PTE_NX_MASK(mmu)))
+#define is_nx_pte(mmu, pte)		(!is_executable_pte(mmu, pte))
 
 void tdp_mmu_init(struct kvm_vm *vm, int pgtable_levels,
 		  struct pte_masks *pte_masks);
diff --git a/tools/testing/selftests/kvm/lib/x86/processor.c b/tools/testing/selftests/kvm/lib/x86/processor.c
index 8a9298a72897..41316cac94e0 100644
--- a/tools/testing/selftests/kvm/lib/x86/processor.c
+++ b/tools/testing/selftests/kvm/lib/x86/processor.c
@@ -165,6 +165,10 @@ static void virt_mmu_init(struct kvm_vm *vm, struct kvm_mmu *mmu,
 		mmu->pgd_created = true;
 		mmu->arch.pte_masks = *pte_masks;
 	}
+
+	TEST_ASSERT(mmu->pgtable_levels == 4 || mmu->pgtable_levels == 5,
+		    "Selftests MMU only supports 4-level and 5-level paging, not %u-level paging",
+		    mmu->pgtable_levels);
 }
 
 void virt_arch_pgd_alloc(struct kvm_vm *vm)
@@ -180,6 +184,7 @@ void virt_arch_pgd_alloc(struct kvm_vm *vm)
 		.dirty		=	BIT_ULL(6),
 		.huge		=	BIT_ULL(7),
 		.nx		=	BIT_ULL(63),
+		.executable	=	0,
 		.c		=	vm->arch.c_bit,
 		.s		=	vm->arch.s_bit,
 	};
@@ -190,10 +195,10 @@ void virt_arch_pgd_alloc(struct kvm_vm *vm)
 void tdp_mmu_init(struct kvm_vm *vm, int pgtable_levels,
 		  struct pte_masks *pte_masks)
 {
-	TEST_ASSERT(!vm->arch.tdp_mmu, "TDP MMU already initialized");
+	TEST_ASSERT(!vm->stage2_mmu.pgtable_levels, "TDP MMU already initialized");
 
-	vm->arch.tdp_mmu = calloc(1, sizeof(*vm->arch.tdp_mmu));
-	virt_mmu_init(vm, vm->arch.tdp_mmu, pte_masks);
+	vm->stage2_mmu.pgtable_levels = pgtable_levels;
+	virt_mmu_init(vm, &vm->stage2_mmu, pte_masks);
 }
 
 static void *virt_get_pte(struct kvm_vm *vm, struct kvm_mmu *mmu,
@@ -223,7 +228,8 @@ static uint64_t *virt_create_upper_pte(struct kvm_vm *vm,
 	paddr = vm_untag_gpa(vm, paddr);
 
 	if (!is_present_pte(mmu, pte)) {
-		*pte = PTE_PRESENT_MASK(mmu) | PTE_WRITABLE_MASK(mmu);
+		*pte = PTE_PRESENT_MASK(mmu) | PTE_READABLE_MASK(mmu) |
+		       PTE_WRITABLE_MASK(mmu) | PTE_EXECUTABLE_MASK(mmu);
 		if (current_level == target_level)
 			*pte |= PTE_HUGE_MASK(mmu) | (paddr & PHYSICAL_PAGE_MASK);
 		else
@@ -269,6 +275,9 @@ void __virt_pg_map(struct kvm_vm *vm, struct kvm_mmu *mmu, uint64_t vaddr,
 	TEST_ASSERT(vm_untag_gpa(vm, paddr) == paddr,
 		    "Unexpected bits in paddr: %lx", paddr);
 
+	TEST_ASSERT(!PTE_EXECUTABLE_MASK(mmu) || !PTE_NX_MASK(mmu),
+		    "X and NX bit masks cannot be used simultaneously");
+
 	/*
 	 * Allocate upper level page tables, if not already present.  Return
 	 * early if a hugepage was created.
@@ -286,7 +295,9 @@ void __virt_pg_map(struct kvm_vm *vm, struct kvm_mmu *mmu, uint64_t vaddr,
 	pte = virt_get_pte(vm, mmu, pte, vaddr, PG_LEVEL_4K);
 	TEST_ASSERT(!is_present_pte(mmu, pte),
 		    "PTE already present for 4k page at vaddr: 0x%lx", vaddr);
-	*pte = PTE_PRESENT_MASK(mmu) | PTE_WRITABLE_MASK(mmu) | (paddr & PHYSICAL_PAGE_MASK);
+	*pte = PTE_PRESENT_MASK(mmu) | PTE_READABLE_MASK(mmu) |
+	       PTE_WRITABLE_MASK(mmu) | PTE_EXECUTABLE_MASK(mmu) |
+	       (paddr & PHYSICAL_PAGE_MASK);
 
 	/*
 	 * Neither SEV nor TDX supports shared page tables, so only the final
diff --git a/tools/testing/selftests/kvm/lib/x86/vmx.c b/tools/testing/selftests/kvm/lib/x86/vmx.c
index ea1c09f9e8ab..e3737b3d9120 100644
--- a/tools/testing/selftests/kvm/lib/x86/vmx.c
+++ b/tools/testing/selftests/kvm/lib/x86/vmx.c
@@ -25,21 +25,6 @@ bool enable_evmcs;
 struct hv_enlightened_vmcs *current_evmcs;
 struct hv_vp_assist_page *current_vp_assist;
 
-struct eptPageTableEntry {
-	uint64_t readable:1;
-	uint64_t writable:1;
-	uint64_t executable:1;
-	uint64_t memory_type:3;
-	uint64_t ignore_pat:1;
-	uint64_t page_size:1;
-	uint64_t accessed:1;
-	uint64_t dirty:1;
-	uint64_t ignored_11_10:2;
-	uint64_t address:40;
-	uint64_t ignored_62_52:11;
-	uint64_t suppress_ve:1;
-};
-
 int vcpu_enable_evmcs(struct kvm_vcpu *vcpu)
 {
 	uint16_t evmcs_ver;
@@ -58,13 +43,24 @@ int vcpu_enable_evmcs(struct kvm_vcpu *vcpu)
 
 void vm_enable_ept(struct kvm_vm *vm)
 {
-	TEST_ASSERT(kvm_cpu_has_ept(), "KVM doesn't support nested EPT");
-	if (vm->arch.tdp_mmu)
-		return;
+	struct pte_masks pte_masks;
 
-	/* TODO: Drop eptPageTableEntry in favor of PTE masks. */
-	struct pte_masks pte_masks = (struct pte_masks) {
+	TEST_ASSERT(kvm_cpu_has_ept(), "KVM doesn't support nested EPT");
 
+	/*
+	 * EPTs do not have 'present' or 'user' bits, instead bit 0 is the
+	 * 'readable' bit.
+	 */
+	pte_masks = (struct pte_masks) {
+		.present	=	0,
+		.user		=	0,
+		.readable	=	BIT_ULL(0),
+		.writable	=	BIT_ULL(1),
+		.executable	=	BIT_ULL(2),
+		.huge		=	BIT_ULL(7),
+		.accessed	=	BIT_ULL(8),
+		.dirty		=	BIT_ULL(9),
+		.nx		=	0,
 	};
 
 	/* TODO: Add support for 5-level EPT. */
@@ -120,8 +116,8 @@ vcpu_alloc_vmx(struct kvm_vm *vm, vm_vaddr_t *p_vmx_gva)
 	vmx->vmwrite_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->vmwrite);
 	memset(vmx->vmwrite_hva, 0, getpagesize());
 
-	if (vm->arch.tdp_mmu)
-		vmx->eptp_gpa = vm->arch.tdp_mmu->pgd;
+	if (vm->stage2_mmu.pgd_created)
+		vmx->eptp_gpa = vm->stage2_mmu.pgd;
 
 	*p_vmx_gva = vmx_gva;
 	return vmx;
@@ -377,82 +373,6 @@ void prepare_vmcs(struct vmx_pages *vmx, void *guest_rip, void *guest_rsp)
 	init_vmcs_guest_state(guest_rip, guest_rsp);
 }
 
-static void tdp_create_pte(struct kvm_vm *vm,
-			   struct eptPageTableEntry *pte,
-			   uint64_t nested_paddr,
-			   uint64_t paddr,
-			   int current_level,
-			   int target_level)
-{
-	if (!pte->readable) {
-		pte->writable = true;
-		pte->readable = true;
-		pte->executable = true;
-		pte->page_size = (current_level == target_level);
-		if (pte->page_size)
-			pte->address = paddr >> vm->page_shift;
-		else
-			pte->address = vm_alloc_page_table(vm) >> vm->page_shift;
-	} else {
-		/*
-		 * Entry already present.  Assert that the caller doesn't want
-		 * a hugepage at this level, and that there isn't a hugepage at
-		 * this level.
-		 */
-		TEST_ASSERT(current_level != target_level,
-			    "Cannot create hugepage at level: %u, nested_paddr: 0x%lx",
-			    current_level, nested_paddr);
-		TEST_ASSERT(!pte->page_size,
-			    "Cannot create page table at level: %u, nested_paddr: 0x%lx",
-			    current_level, nested_paddr);
-	}
-}
-
-
-void __tdp_pg_map(struct kvm_vm *vm, uint64_t nested_paddr, uint64_t paddr,
-		  int target_level)
-{
-	const uint64_t page_size = PG_LEVEL_SIZE(target_level);
-	void *eptp_hva = addr_gpa2hva(vm, vm->arch.tdp_mmu->pgd);
-	struct eptPageTableEntry *pt = eptp_hva, *pte;
-	uint16_t index;
-
-	TEST_ASSERT(vm->mode == VM_MODE_PXXVYY_4K,
-		    "Unknown or unsupported guest mode: 0x%x", vm->mode);
-
-	TEST_ASSERT((nested_paddr >> 48) == 0,
-		    "Nested physical address 0x%lx is > 48-bits and requires 5-level EPT",
-		    nested_paddr);
-	TEST_ASSERT((nested_paddr % page_size) == 0,
-		    "Nested physical address not on page boundary,\n"
-		    "  nested_paddr: 0x%lx page_size: 0x%lx",
-		    nested_paddr, page_size);
-	TEST_ASSERT((nested_paddr >> vm->page_shift) <= vm->max_gfn,
-		    "Physical address beyond beyond maximum supported,\n"
-		    "  nested_paddr: 0x%lx vm->max_gfn: 0x%lx vm->page_size: 0x%x",
-		    paddr, vm->max_gfn, vm->page_size);
-	TEST_ASSERT((paddr % page_size) == 0,
-		    "Physical address not on page boundary,\n"
-		    "  paddr: 0x%lx page_size: 0x%lx",
-		    paddr, page_size);
-	TEST_ASSERT((paddr >> vm->page_shift) <= vm->max_gfn,
-		    "Physical address beyond beyond maximum supported,\n"
-		    "  paddr: 0x%lx vm->max_gfn: 0x%lx vm->page_size: 0x%x",
-		    paddr, vm->max_gfn, vm->page_size);
-
-	for (int level = PG_LEVEL_512G; level >= PG_LEVEL_4K; level--) {
-		index = (nested_paddr >> PG_LEVEL_SHIFT(level)) & 0x1ffu;
-		pte = &pt[index];
-
-		tdp_create_pte(vm, pte, nested_paddr, paddr, level, target_level);
-
-		if (pte->page_size)
-			break;
-
-		pt = addr_gpa2hva(vm, pte->address * vm->page_size);
-	}
-}
-
 /*
  * Map a range of EPT guest physical addresses to the VM's physical address
  *
@@ -473,6 +393,7 @@ void __tdp_pg_map(struct kvm_vm *vm, uint64_t nested_paddr, uint64_t paddr,
 void __tdp_map(struct kvm_vm *vm, uint64_t nested_paddr, uint64_t paddr,
 	       uint64_t size, int level)
 {
+	struct kvm_mmu *mmu = &vm->stage2_mmu;
 	size_t page_size = PG_LEVEL_SIZE(level);
 	size_t npages = size / page_size;
 
@@ -480,7 +401,7 @@ void __tdp_map(struct kvm_vm *vm, uint64_t nested_paddr, uint64_t paddr,
 	TEST_ASSERT(paddr + size > paddr, "Paddr overflow");
 
 	while (npages--) {
-		__tdp_pg_map(vm, nested_paddr, paddr, level);
+		__virt_pg_map(vm, mmu, nested_paddr, paddr, level);
 		nested_paddr += page_size;
 		paddr += page_size;
 	}
-- 
cgit v1.2.3


From 07676c04bd753f32a9fe2f247b0ba2d213dd7a99 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Tue, 30 Dec 2025 15:01:43 -0800
Subject: KVM: selftests: Move TDP mapping functions outside of vmx.c

Now that the functions are no longer VMX-specific, move them to
processor.c. Do a minor comment tweak replacing 'EPT' with 'TDP'.

No functional change intended.

Signed-off-by: Yosry Ahmed <yosry.ahmed@linux.dev>
Link: https://patch.msgid.link/20251230230150.4150236-15-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 .../testing/selftests/kvm/include/x86/processor.h  |  4 ++
 tools/testing/selftests/kvm/include/x86/vmx.h      |  3 -
 tools/testing/selftests/kvm/lib/x86/processor.c    | 53 ++++++++++++++++
 tools/testing/selftests/kvm/lib/x86/vmx.c          | 71 ----------------------
 4 files changed, 57 insertions(+), 74 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/include/x86/processor.h b/tools/testing/selftests/kvm/include/x86/processor.h
index e17cbbe71b8f..461cf155b96e 100644
--- a/tools/testing/selftests/kvm/include/x86/processor.h
+++ b/tools/testing/selftests/kvm/include/x86/processor.h
@@ -1479,6 +1479,10 @@ void __virt_pg_map(struct kvm_vm *vm, struct kvm_mmu *mmu, uint64_t vaddr,
 void virt_map_level(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
 		    uint64_t nr_bytes, int level);
 
+void tdp_map(struct kvm_vm *vm, uint64_t nested_paddr, uint64_t paddr, uint64_t size);
+void tdp_identity_map_default_memslots(struct kvm_vm *vm);
+void tdp_identity_map_1g(struct kvm_vm *vm,  uint64_t addr, uint64_t size);
+
 /*
  * Basic CPU control in CR0
  */
diff --git a/tools/testing/selftests/kvm/include/x86/vmx.h b/tools/testing/selftests/kvm/include/x86/vmx.h
index 4dd4c2094ee6..92b918700d24 100644
--- a/tools/testing/selftests/kvm/include/x86/vmx.h
+++ b/tools/testing/selftests/kvm/include/x86/vmx.h
@@ -557,9 +557,6 @@ bool load_vmcs(struct vmx_pages *vmx);
 
 bool ept_1g_pages_supported(void);
 
-void tdp_map(struct kvm_vm *vm, uint64_t nested_paddr, uint64_t paddr, uint64_t size);
-void tdp_identity_map_default_memslots(struct kvm_vm *vm);
-void tdp_identity_map_1g(struct kvm_vm *vm,  uint64_t addr, uint64_t size);
 bool kvm_cpu_has_ept(void);
 void vm_enable_ept(struct kvm_vm *vm);
 void prepare_virtualize_apic_accesses(struct vmx_pages *vmx, struct kvm_vm *vm);
diff --git a/tools/testing/selftests/kvm/lib/x86/processor.c b/tools/testing/selftests/kvm/lib/x86/processor.c
index 41316cac94e0..29e7d172f945 100644
--- a/tools/testing/selftests/kvm/lib/x86/processor.c
+++ b/tools/testing/selftests/kvm/lib/x86/processor.c
@@ -472,6 +472,59 @@ void virt_arch_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent)
 	}
 }
 
+void __tdp_map(struct kvm_vm *vm, uint64_t nested_paddr, uint64_t paddr,
+	       uint64_t size, int level)
+{
+	size_t page_size = PG_LEVEL_SIZE(level);
+	size_t npages = size / page_size;
+
+	TEST_ASSERT(nested_paddr + size > nested_paddr, "Vaddr overflow");
+	TEST_ASSERT(paddr + size > paddr, "Paddr overflow");
+
+	while (npages--) {
+		__virt_pg_map(vm, &vm->stage2_mmu, nested_paddr, paddr, level);
+		nested_paddr += page_size;
+		paddr += page_size;
+	}
+}
+
+void tdp_map(struct kvm_vm *vm, uint64_t nested_paddr, uint64_t paddr,
+	     uint64_t size)
+{
+	__tdp_map(vm, nested_paddr, paddr, size, PG_LEVEL_4K);
+}
+
+/* Prepare an identity extended page table that maps all the
+ * physical pages in VM.
+ */
+void tdp_identity_map_default_memslots(struct kvm_vm *vm)
+{
+	uint32_t s, memslot = 0;
+	sparsebit_idx_t i, last;
+	struct userspace_mem_region *region = memslot2region(vm, memslot);
+
+	/* Only memslot 0 is mapped here, ensure it's the only one being used */
+	for (s = 0; s < NR_MEM_REGIONS; s++)
+		TEST_ASSERT_EQ(vm->memslots[s], 0);
+
+	i = (region->region.guest_phys_addr >> vm->page_shift) - 1;
+	last = i + (region->region.memory_size >> vm->page_shift);
+	for (;;) {
+		i = sparsebit_next_clear(region->unused_phy_pages, i);
+		if (i > last)
+			break;
+
+		tdp_map(vm, (uint64_t)i << vm->page_shift,
+			(uint64_t)i << vm->page_shift, 1 << vm->page_shift);
+	}
+}
+
+/* Identity map a region with 1GiB Pages. */
+void tdp_identity_map_1g(struct kvm_vm *vm, uint64_t addr, uint64_t size)
+{
+	__tdp_map(vm, addr, addr, size, PG_LEVEL_1G);
+}
+
 /*
  * Set Unusable Segment
  *
diff --git a/tools/testing/selftests/kvm/lib/x86/vmx.c b/tools/testing/selftests/kvm/lib/x86/vmx.c
index e3737b3d9120..448a63457467 100644
--- a/tools/testing/selftests/kvm/lib/x86/vmx.c
+++ b/tools/testing/selftests/kvm/lib/x86/vmx.c
@@ -373,77 +373,6 @@ void prepare_vmcs(struct vmx_pages *vmx, void *guest_rip, void *guest_rsp)
 	init_vmcs_guest_state(guest_rip, guest_rsp);
 }
 
-/*
- * Map a range of EPT guest physical addresses to the VM's physical address
- *
- * Input Args:
- *   vm - Virtual Machine
- *   nested_paddr - Nested guest physical address to map
- *   paddr - VM Physical Address
- *   size - The size of the range to map
- *   level - The level at which to map the range
- *
- * Output Args: None
- *
- * Return: None
- *
- * Within the VM given by vm, creates a nested guest translation for the
- * page range starting at nested_paddr to the page range starting at paddr.
- */
-void __tdp_map(struct kvm_vm *vm, uint64_t nested_paddr, uint64_t paddr,
-	       uint64_t size, int level)
-{
-	struct kvm_mmu *mmu = &vm->stage2_mmu;
-	size_t page_size = PG_LEVEL_SIZE(level);
-	size_t npages = size / page_size;
-
-	TEST_ASSERT(nested_paddr + size > nested_paddr, "Vaddr overflow");
-	TEST_ASSERT(paddr + size > paddr, "Paddr overflow");
-
-	while (npages--) {
-		__virt_pg_map(vm, mmu, nested_paddr, paddr, level);
-		nested_paddr += page_size;
-		paddr += page_size;
-	}
-}
-
-void tdp_map(struct kvm_vm *vm, uint64_t nested_paddr, uint64_t paddr,
-	     uint64_t size)
-{
-	__tdp_map(vm, nested_paddr, paddr, size, PG_LEVEL_4K);
-}
-
-/* Prepare an identity extended page table that maps all the
- * physical pages in VM.
- */
-void tdp_identity_map_default_memslots(struct kvm_vm *vm)
-{
-	uint32_t s, memslot = 0;
-	sparsebit_idx_t i, last;
-	struct userspace_mem_region *region = memslot2region(vm, memslot);
-
-	/* Only memslot 0 is mapped here, ensure it's the only one being used */
-	for (s = 0; s < NR_MEM_REGIONS; s++)
-		TEST_ASSERT_EQ(vm->memslots[s], 0);
-
-	i = (region->region.guest_phys_addr >> vm->page_shift) - 1;
-	last = i + (region->region.memory_size >> vm->page_shift);
-	for (;;) {
-		i = sparsebit_next_clear(region->unused_phy_pages, i);
-		if (i > last)
-			break;
-
-		tdp_map(vm, (uint64_t)i << vm->page_shift,
-			(uint64_t)i << vm->page_shift, 1 << vm->page_shift);
-	}
-}
-
-/* Identity map a region with 1GiB Pages. */
-void tdp_identity_map_1g(struct kvm_vm *vm, uint64_t addr, uint64_t size)
-{
-	__tdp_map(vm, addr, addr, size, PG_LEVEL_1G);
-}
-
 bool kvm_cpu_has_ept(void)
 {
 	uint64_t ctrl;
-- 
cgit v1.2.3


From 9cb1944f6bf09ecebcc7609f35178b85aa26f165 Mon Sep 17 00:00:00 2001
From: Yosry Ahmed <yosry.ahmed@linux.dev>
Date: Tue, 30 Dec 2025 15:01:44 -0800
Subject: KVM: selftests: Allow kvm_cpu_has_ept() to be called on AMD CPUs

In preparation for generalizing the nested dirty logging test, checking
if either EPT or NPT is enabled will be needed. To avoid needing to gate
the kvm_cpu_has_ept() call by the CPU type, make sure the function
returns false if VMX is not available instead of trying to read VMX-only
MSRs.

No functional change intended.

Signed-off-by: Yosry Ahmed <yosry.ahmed@linux.dev>
Link: https://patch.msgid.link/20251230230150.4150236-16-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 tools/testing/selftests/kvm/lib/x86/vmx.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/lib/x86/vmx.c b/tools/testing/selftests/kvm/lib/x86/vmx.c
index 448a63457467..c87b340362a9 100644
--- a/tools/testing/selftests/kvm/lib/x86/vmx.c
+++ b/tools/testing/selftests/kvm/lib/x86/vmx.c
@@ -377,6 +377,9 @@ bool kvm_cpu_has_ept(void)
 {
 	uint64_t ctrl;
 
+	if (!kvm_cpu_has(X86_FEATURE_VMX))
+		return false;
+
 	ctrl = kvm_get_feature_msr(MSR_IA32_VMX_TRUE_PROCBASED_CTLS) >> 32;
 	if (!(ctrl & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
 		return false;
-- 
cgit v1.2.3


From 753c0d5a507b939d6efc60c7b437d5330880cce3 Mon Sep 17 00:00:00 2001
From: Yosry Ahmed <yosry.ahmed@linux.dev>
Date: Tue, 30 Dec 2025 15:01:45 -0800
Subject: KVM: selftests: Add support for nested NPTs

Implement nCR3 and NPT initialization functions, similar to the EPT
equivalents, and create common TDP helpers for enablement checking and
initialization. Enable NPT for nested guests by default if the TDP MMU
was initialized, similar to VMX.

Reuse the PTE masks from the main MMU in the NPT MMU, except for the C
and S bits related to confidential VMs.

Signed-off-by: Yosry Ahmed <yosry.ahmed@linux.dev>
Link: https://patch.msgid.link/20251230230150.4150236-17-seanjc@google.com
[sean: apply Yosry's fixup for ncr3_gpa]
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 .../testing/selftests/kvm/include/x86/processor.h  |  2 ++
 tools/testing/selftests/kvm/include/x86/svm_util.h |  9 ++++++++
 tools/testing/selftests/kvm/lib/x86/memstress.c    |  4 ++--
 tools/testing/selftests/kvm/lib/x86/processor.c    | 15 ++++++++++++++
 tools/testing/selftests/kvm/lib/x86/svm.c          | 24 ++++++++++++++++++++++
 .../testing/selftests/kvm/x86/vmx_dirty_log_test.c |  4 ++--
 6 files changed, 54 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/include/x86/processor.h b/tools/testing/selftests/kvm/include/x86/processor.h
index 461cf155b96e..115bec5eb1eb 100644
--- a/tools/testing/selftests/kvm/include/x86/processor.h
+++ b/tools/testing/selftests/kvm/include/x86/processor.h
@@ -1479,6 +1479,8 @@ void __virt_pg_map(struct kvm_vm *vm, struct kvm_mmu *mmu, uint64_t vaddr,
 void virt_map_level(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
 		    uint64_t nr_bytes, int level);
 
+void vm_enable_tdp(struct kvm_vm *vm);
+bool kvm_cpu_has_tdp(void);
 void tdp_map(struct kvm_vm *vm, uint64_t nested_paddr, uint64_t paddr, uint64_t size);
 void tdp_identity_map_default_memslots(struct kvm_vm *vm);
 void tdp_identity_map_1g(struct kvm_vm *vm,  uint64_t addr, uint64_t size);
diff --git a/tools/testing/selftests/kvm/include/x86/svm_util.h b/tools/testing/selftests/kvm/include/x86/svm_util.h
index b74c6dcddcbd..5d7c42534bc4 100644
--- a/tools/testing/selftests/kvm/include/x86/svm_util.h
+++ b/tools/testing/selftests/kvm/include/x86/svm_util.h
@@ -27,6 +27,9 @@ struct svm_test_data {
 	void *msr; /* gva */
 	void *msr_hva;
 	uint64_t msr_gpa;
+
+	/* NPT */
+	uint64_t ncr3_gpa;
 };
 
 static inline void vmmcall(void)
@@ -57,6 +60,12 @@ struct svm_test_data *vcpu_alloc_svm(struct kvm_vm *vm, vm_vaddr_t *p_svm_gva);
 void generic_svm_setup(struct svm_test_data *svm, void *guest_rip, void *guest_rsp);
 void run_guest(struct vmcb *vmcb, uint64_t vmcb_gpa);
 
+static inline bool kvm_cpu_has_npt(void)
+{
+	return kvm_cpu_has(X86_FEATURE_NPT);
+}
+void vm_enable_npt(struct kvm_vm *vm);
+
 int open_sev_dev_path_or_exit(void);
 
 #endif /* SELFTEST_KVM_SVM_UTILS_H */
diff --git a/tools/testing/selftests/kvm/lib/x86/memstress.c b/tools/testing/selftests/kvm/lib/x86/memstress.c
index 3319cb57a78d..407abfc34909 100644
--- a/tools/testing/selftests/kvm/lib/x86/memstress.c
+++ b/tools/testing/selftests/kvm/lib/x86/memstress.c
@@ -82,9 +82,9 @@ void memstress_setup_nested(struct kvm_vm *vm, int nr_vcpus, struct kvm_vcpu *vc
 	int vcpu_id;
 
 	TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX));
-	TEST_REQUIRE(kvm_cpu_has_ept());
+	TEST_REQUIRE(kvm_cpu_has_tdp());
 
-	vm_enable_ept(vm);
+	vm_enable_tdp(vm);
 	for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) {
 		vcpu_alloc_vmx(vm, &vmx_gva);
 
diff --git a/tools/testing/selftests/kvm/lib/x86/processor.c b/tools/testing/selftests/kvm/lib/x86/processor.c
index 29e7d172f945..a3a4c9a4cbcb 100644
--- a/tools/testing/selftests/kvm/lib/x86/processor.c
+++ b/tools/testing/selftests/kvm/lib/x86/processor.c
@@ -8,7 +8,9 @@
 #include "kvm_util.h"
 #include "pmu.h"
 #include "processor.h"
+#include "svm_util.h"
 #include "sev.h"
+#include "vmx.h"
 
 #ifndef NUM_INTERRUPTS
 #define NUM_INTERRUPTS 256
@@ -472,6 +474,19 @@ void virt_arch_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent)
 	}
 }
 
+void vm_enable_tdp(struct kvm_vm *vm)
+{
+	if (kvm_cpu_has(X86_FEATURE_VMX))
+		vm_enable_ept(vm);
+	else
+		vm_enable_npt(vm);
+}
+
+bool kvm_cpu_has_tdp(void)
+{
+	return kvm_cpu_has_ept() || kvm_cpu_has_npt();
+}
+
 void __tdp_map(struct kvm_vm *vm, uint64_t nested_paddr, uint64_t paddr,
 	       uint64_t size, int level)
 {
diff --git a/tools/testing/selftests/kvm/lib/x86/svm.c b/tools/testing/selftests/kvm/lib/x86/svm.c
index d239c2097391..a25a3471f5f6 100644
--- a/tools/testing/selftests/kvm/lib/x86/svm.c
+++ b/tools/testing/selftests/kvm/lib/x86/svm.c
@@ -46,6 +46,9 @@ vcpu_alloc_svm(struct kvm_vm *vm, vm_vaddr_t *p_svm_gva)
 	svm->msr_gpa = addr_gva2gpa(vm, (uintptr_t)svm->msr);
 	memset(svm->msr_hva, 0, getpagesize());
 
+	if (vm->stage2_mmu.pgd_created)
+		svm->ncr3_gpa = vm->stage2_mmu.pgd;
+
 	*p_svm_gva = svm_gva;
 	return svm;
 }
@@ -59,6 +62,22 @@ static void vmcb_set_seg(struct vmcb_seg *seg, u16 selector,
 	seg->base = base;
 }
 
+void vm_enable_npt(struct kvm_vm *vm)
+{
+	struct pte_masks pte_masks;
+
+	TEST_ASSERT(kvm_cpu_has_npt(), "KVM doesn't supported nested NPT");
+
+	/*
+	 * NPTs use the same PTE format, but deliberately drop the C-bit as the
+	 * per-VM shared vs. private information is only meant for stage-1.
+	 */
+	pte_masks = vm->mmu.arch.pte_masks;
+	pte_masks.c = 0;
+
+	tdp_mmu_init(vm, vm->mmu.pgtable_levels, &pte_masks);
+}
+
 void generic_svm_setup(struct svm_test_data *svm, void *guest_rip, void *guest_rsp)
 {
 	struct vmcb *vmcb = svm->vmcb;
@@ -102,6 +121,11 @@ void generic_svm_setup(struct svm_test_data *svm, void *guest_rip, void *guest_r
 	vmcb->save.rip = (u64)guest_rip;
 	vmcb->save.rsp = (u64)guest_rsp;
 	guest_regs.rdi = (u64)svm;
+
+	if (svm->ncr3_gpa) {
+		ctrl->nested_ctl |= SVM_NESTED_CTL_NP_ENABLE;
+		ctrl->nested_cr3 = svm->ncr3_gpa;
+	}
 }
 
 /*
diff --git a/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c b/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c
index 370f8d3117c2..032ab8bf60a4 100644
--- a/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c
+++ b/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c
@@ -93,7 +93,7 @@ static void test_vmx_dirty_log(bool enable_ept)
 	/* Create VM */
 	vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code);
 	if (enable_ept)
-		vm_enable_ept(vm);
+		vm_enable_tdp(vm);
 
 	vcpu_alloc_vmx(vm, &vmx_pages_gva);
 	vcpu_args_set(vcpu, 1, vmx_pages_gva);
@@ -170,7 +170,7 @@ int main(int argc, char *argv[])
 
 	test_vmx_dirty_log(/*enable_ept=*/false);
 
-	if (kvm_cpu_has_ept())
+	if (kvm_cpu_has_tdp())
 		test_vmx_dirty_log(/*enable_ept=*/true);
 
 	return 0;
-- 
cgit v1.2.3


From 251e4849a79b258fd3e889ff095ba083ce301c13 Mon Sep 17 00:00:00 2001
From: Yosry Ahmed <yosry.ahmed@linux.dev>
Date: Tue, 30 Dec 2025 15:01:46 -0800
Subject: KVM: selftests: Set the user bit on nested NPT PTEs

According to the APM, NPT walks are treated as user accesses. In
preparation for supporting NPT mappings, set the 'user' bit on NPTs by
adding a mask of bits to always be set on PTEs in kvm_mmu.

Signed-off-by: Yosry Ahmed <yosry.ahmed@linux.dev>
Link: https://patch.msgid.link/20251230230150.4150236-18-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 tools/testing/selftests/kvm/include/x86/kvm_util_arch.h | 2 ++
 tools/testing/selftests/kvm/include/x86/processor.h     | 1 +
 tools/testing/selftests/kvm/lib/x86/processor.c         | 5 +++--
 tools/testing/selftests/kvm/lib/x86/svm.c               | 3 +++
 4 files changed, 9 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/include/x86/kvm_util_arch.h b/tools/testing/selftests/kvm/include/x86/kvm_util_arch.h
index 1cf84b8212c6..be35d26bb320 100644
--- a/tools/testing/selftests/kvm/include/x86/kvm_util_arch.h
+++ b/tools/testing/selftests/kvm/include/x86/kvm_util_arch.h
@@ -22,6 +22,8 @@ struct pte_masks {
 	uint64_t nx;
 	uint64_t c;
 	uint64_t s;
+
+	uint64_t always_set;
 };
 
 struct kvm_mmu_arch {
diff --git a/tools/testing/selftests/kvm/include/x86/processor.h b/tools/testing/selftests/kvm/include/x86/processor.h
index 115bec5eb1eb..995277cae94e 100644
--- a/tools/testing/selftests/kvm/include/x86/processor.h
+++ b/tools/testing/selftests/kvm/include/x86/processor.h
@@ -1452,6 +1452,7 @@ enum pg_level {
 #define PTE_NX_MASK(mmu)		((mmu)->arch.pte_masks.nx)
 #define PTE_C_BIT_MASK(mmu)		((mmu)->arch.pte_masks.c)
 #define PTE_S_BIT_MASK(mmu)		((mmu)->arch.pte_masks.s)
+#define PTE_ALWAYS_SET_MASK(mmu)	((mmu)->arch.pte_masks.always_set)
 
 /*
  * For PTEs without a PRESENT bit (i.e. EPT entries), treat the PTE as present
diff --git a/tools/testing/selftests/kvm/lib/x86/processor.c b/tools/testing/selftests/kvm/lib/x86/processor.c
index a3a4c9a4cbcb..5a3385d48902 100644
--- a/tools/testing/selftests/kvm/lib/x86/processor.c
+++ b/tools/testing/selftests/kvm/lib/x86/processor.c
@@ -231,7 +231,8 @@ static uint64_t *virt_create_upper_pte(struct kvm_vm *vm,
 
 	if (!is_present_pte(mmu, pte)) {
 		*pte = PTE_PRESENT_MASK(mmu) | PTE_READABLE_MASK(mmu) |
-		       PTE_WRITABLE_MASK(mmu) | PTE_EXECUTABLE_MASK(mmu);
+		       PTE_WRITABLE_MASK(mmu) | PTE_EXECUTABLE_MASK(mmu) |
+		       PTE_ALWAYS_SET_MASK(mmu);
 		if (current_level == target_level)
 			*pte |= PTE_HUGE_MASK(mmu) | (paddr & PHYSICAL_PAGE_MASK);
 		else
@@ -299,7 +300,7 @@ void __virt_pg_map(struct kvm_vm *vm, struct kvm_mmu *mmu, uint64_t vaddr,
 		    "PTE already present for 4k page at vaddr: 0x%lx", vaddr);
 	*pte = PTE_PRESENT_MASK(mmu) | PTE_READABLE_MASK(mmu) |
 	       PTE_WRITABLE_MASK(mmu) | PTE_EXECUTABLE_MASK(mmu) |
-	       (paddr & PHYSICAL_PAGE_MASK);
+	       PTE_ALWAYS_SET_MASK(mmu) | (paddr & PHYSICAL_PAGE_MASK);
 
 	/*
 	 * Neither SEV nor TDX supports shared page tables, so only the final
diff --git a/tools/testing/selftests/kvm/lib/x86/svm.c b/tools/testing/selftests/kvm/lib/x86/svm.c
index a25a3471f5f6..2e5c480c9afd 100644
--- a/tools/testing/selftests/kvm/lib/x86/svm.c
+++ b/tools/testing/selftests/kvm/lib/x86/svm.c
@@ -75,6 +75,9 @@ void vm_enable_npt(struct kvm_vm *vm)
 	pte_masks = vm->mmu.arch.pte_masks;
 	pte_masks.c = 0;
 
+	/* NPT walks are treated as user accesses, so set the 'user' bit. */
+	pte_masks.always_set = pte_masks.user;
+
 	tdp_mmu_init(vm, vm->mmu.pgtable_levels, &pte_masks);
 }
 
-- 
cgit v1.2.3


From 6794d916f87e0a6cd51a3d8c2c0e6ffd48fa7a79 Mon Sep 17 00:00:00 2001
From: Yosry Ahmed <yosry.ahmed@linux.dev>
Date: Tue, 30 Dec 2025 15:01:47 -0800
Subject: KVM: selftests: Extend vmx_dirty_log_test to cover SVM

Generalize the code in vmx_dirty_log_test.c by adding SVM-specific L1
code, doing some renaming (e.g. EPT -> TDP), and having setup code for
both SVM and VMX in test_dirty_log().

Signed-off-by: Yosry Ahmed <yosry.ahmed@linux.dev>
Link: https://patch.msgid.link/20251230230150.4150236-19-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 tools/testing/selftests/kvm/Makefile.kvm           |   2 +-
 .../selftests/kvm/x86/nested_dirty_log_test.c      | 210 +++++++++++++++++++++
 .../testing/selftests/kvm/x86/vmx_dirty_log_test.c | 177 -----------------
 3 files changed, 211 insertions(+), 178 deletions(-)
 create mode 100644 tools/testing/selftests/kvm/x86/nested_dirty_log_test.c
 delete mode 100644 tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm
index 3789890421bd..ffbf891b31d3 100644
--- a/tools/testing/selftests/kvm/Makefile.kvm
+++ b/tools/testing/selftests/kvm/Makefile.kvm
@@ -89,6 +89,7 @@ TEST_GEN_PROGS_x86 += x86/kvm_buslock_test
 TEST_GEN_PROGS_x86 += x86/monitor_mwait_test
 TEST_GEN_PROGS_x86 += x86/msrs_test
 TEST_GEN_PROGS_x86 += x86/nested_close_kvm_test
+TEST_GEN_PROGS_x86 += x86/nested_dirty_log_test
 TEST_GEN_PROGS_x86 += x86/nested_emulation_test
 TEST_GEN_PROGS_x86 += x86/nested_exceptions_test
 TEST_GEN_PROGS_x86 += x86/nested_invalid_cr3_test
@@ -115,7 +116,6 @@ TEST_GEN_PROGS_x86 += x86/ucna_injection_test
 TEST_GEN_PROGS_x86 += x86/userspace_io_test
 TEST_GEN_PROGS_x86 += x86/userspace_msr_exit_test
 TEST_GEN_PROGS_x86 += x86/vmx_apic_access_test
-TEST_GEN_PROGS_x86 += x86/vmx_dirty_log_test
 TEST_GEN_PROGS_x86 += x86/vmx_exception_with_invalid_guest_state
 TEST_GEN_PROGS_x86 += x86/vmx_msrs_test
 TEST_GEN_PROGS_x86 += x86/vmx_invalid_nested_guest_state
diff --git a/tools/testing/selftests/kvm/x86/nested_dirty_log_test.c b/tools/testing/selftests/kvm/x86/nested_dirty_log_test.c
new file mode 100644
index 000000000000..89d2e86a0db9
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86/nested_dirty_log_test.c
@@ -0,0 +1,210 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KVM dirty page logging test
+ *
+ * Copyright (C) 2018, Red Hat, Inc.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <linux/bitmap.h>
+#include <linux/bitops.h>
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+#include "svm_util.h"
+#include "vmx.h"
+
+/* The memory slot index to track dirty pages */
+#define TEST_MEM_SLOT_INDEX		1
+#define TEST_MEM_PAGES			3
+
+/* L1 guest test virtual memory offset */
+#define GUEST_TEST_MEM			0xc0000000
+
+/* L2 guest test virtual memory offset */
+#define NESTED_TEST_MEM1		0xc0001000
+#define NESTED_TEST_MEM2		0xc0002000
+
+#define L2_GUEST_STACK_SIZE 64
+
+static void l2_guest_code(u64 *a, u64 *b)
+{
+	READ_ONCE(*a);
+	WRITE_ONCE(*a, 1);
+	GUEST_SYNC(true);
+	GUEST_SYNC(false);
+
+	WRITE_ONCE(*b, 1);
+	GUEST_SYNC(true);
+	WRITE_ONCE(*b, 1);
+	GUEST_SYNC(true);
+	GUEST_SYNC(false);
+
+	/* Exit to L1 and never come back.  */
+	vmcall();
+}
+
+static void l2_guest_code_tdp_enabled(void)
+{
+	l2_guest_code((u64 *)NESTED_TEST_MEM1, (u64 *)NESTED_TEST_MEM2);
+}
+
+static void l2_guest_code_tdp_disabled(void)
+{
+	/* Access the same L1 GPAs as l2_guest_code_tdp_enabled() */
+	l2_guest_code((u64 *)GUEST_TEST_MEM, (u64 *)GUEST_TEST_MEM);
+}
+
+void l1_vmx_code(struct vmx_pages *vmx)
+{
+	unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
+	void *l2_rip;
+
+	GUEST_ASSERT(vmx->vmcs_gpa);
+	GUEST_ASSERT(prepare_for_vmx_operation(vmx));
+	GUEST_ASSERT(load_vmcs(vmx));
+
+	if (vmx->eptp_gpa)
+		l2_rip = l2_guest_code_tdp_enabled;
+	else
+		l2_rip = l2_guest_code_tdp_disabled;
+
+	prepare_vmcs(vmx, l2_rip, &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+
+	GUEST_SYNC(false);
+	GUEST_ASSERT(!vmlaunch());
+	GUEST_SYNC(false);
+	GUEST_ASSERT_EQ(vmreadz(VM_EXIT_REASON), EXIT_REASON_VMCALL);
+	GUEST_DONE();
+}
+
+static void l1_svm_code(struct svm_test_data *svm)
+{
+	unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
+	void *l2_rip;
+
+	if (svm->ncr3_gpa)
+		l2_rip = l2_guest_code_tdp_enabled;
+	else
+		l2_rip = l2_guest_code_tdp_disabled;
+
+	generic_svm_setup(svm, l2_rip, &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+
+	GUEST_SYNC(false);
+	run_guest(svm->vmcb, svm->vmcb_gpa);
+	GUEST_SYNC(false);
+	GUEST_ASSERT_EQ(svm->vmcb->control.exit_code, SVM_EXIT_VMMCALL);
+	GUEST_DONE();
+}
+
+static void l1_guest_code(void *data)
+{
+	if (this_cpu_has(X86_FEATURE_VMX))
+		l1_vmx_code(data);
+	else
+		l1_svm_code(data);
+}
+
+static void test_dirty_log(bool nested_tdp)
+{
+	vm_vaddr_t nested_gva = 0;
+	unsigned long *bmap;
+	uint64_t *host_test_mem;
+
+	struct kvm_vcpu *vcpu;
+	struct kvm_vm *vm;
+	struct ucall uc;
+	bool done = false;
+
+	pr_info("Nested TDP: %s\n", nested_tdp ? "enabled" : "disabled");
+
+	/* Create VM */
+	vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code);
+	if (nested_tdp)
+		vm_enable_tdp(vm);
+
+	if (kvm_cpu_has(X86_FEATURE_VMX))
+		vcpu_alloc_vmx(vm, &nested_gva);
+	else
+		vcpu_alloc_svm(vm, &nested_gva);
+
+	vcpu_args_set(vcpu, 1, nested_gva);
+
+	/* Add an extra memory slot for testing dirty logging */
+	vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS,
+				    GUEST_TEST_MEM,
+				    TEST_MEM_SLOT_INDEX,
+				    TEST_MEM_PAGES,
+				    KVM_MEM_LOG_DIRTY_PAGES);
+
+	/*
+	 * Add an identity map for GVA range [0xc0000000, 0xc0002000).  This
+	 * affects both L1 and L2.  However...
+	 */
+	virt_map(vm, GUEST_TEST_MEM, GUEST_TEST_MEM, TEST_MEM_PAGES);
+
+	/*
+	 * ... pages in the L2 GPA range [0xc0001000, 0xc0003000) will map to
+	 * 0xc0000000.
+	 *
+	 * When TDP is disabled, the L2 guest code will still access the same L1
+	 * GPAs as the TDP enabled case.
+	 */
+	if (nested_tdp) {
+		tdp_identity_map_default_memslots(vm);
+		tdp_map(vm, NESTED_TEST_MEM1, GUEST_TEST_MEM, PAGE_SIZE);
+		tdp_map(vm, NESTED_TEST_MEM2, GUEST_TEST_MEM, PAGE_SIZE);
+	}
+
+	bmap = bitmap_zalloc(TEST_MEM_PAGES);
+	host_test_mem = addr_gpa2hva(vm, GUEST_TEST_MEM);
+
+	while (!done) {
+		memset(host_test_mem, 0xaa, TEST_MEM_PAGES * PAGE_SIZE);
+		vcpu_run(vcpu);
+		TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
+
+		switch (get_ucall(vcpu, &uc)) {
+		case UCALL_ABORT:
+			REPORT_GUEST_ASSERT(uc);
+			/* NOT REACHED */
+		case UCALL_SYNC:
+			/*
+			 * The nested guest wrote at offset 0x1000 in the memslot, but the
+			 * dirty bitmap must be filled in according to L1 GPA, not L2.
+			 */
+			kvm_vm_get_dirty_log(vm, TEST_MEM_SLOT_INDEX, bmap);
+			if (uc.args[1]) {
+				TEST_ASSERT(test_bit(0, bmap), "Page 0 incorrectly reported clean");
+				TEST_ASSERT(host_test_mem[0] == 1, "Page 0 not written by guest");
+			} else {
+				TEST_ASSERT(!test_bit(0, bmap), "Page 0 incorrectly reported dirty");
+				TEST_ASSERT(host_test_mem[0] == 0xaaaaaaaaaaaaaaaaULL, "Page 0 written by guest");
+			}
+
+			TEST_ASSERT(!test_bit(1, bmap), "Page 1 incorrectly reported dirty");
+			TEST_ASSERT(host_test_mem[PAGE_SIZE / 8] == 0xaaaaaaaaaaaaaaaaULL, "Page 1 written by guest");
+			TEST_ASSERT(!test_bit(2, bmap), "Page 2 incorrectly reported dirty");
+			TEST_ASSERT(host_test_mem[PAGE_SIZE*2 / 8] == 0xaaaaaaaaaaaaaaaaULL, "Page 2 written by guest");
+			break;
+		case UCALL_DONE:
+			done = true;
+			break;
+		default:
+			TEST_FAIL("Unknown ucall %lu", uc.cmd);
+		}
+	}
+}
+
+int main(int argc, char *argv[])
+{
+	TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX) || kvm_cpu_has(X86_FEATURE_SVM));
+
+	test_dirty_log(/*nested_tdp=*/false);
+
+	if (kvm_cpu_has_tdp())
+		test_dirty_log(/*nested_tdp=*/true);
+
+	return 0;
+}
diff --git a/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c b/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c
deleted file mode 100644
index 032ab8bf60a4..000000000000
--- a/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c
+++ /dev/null
@@ -1,177 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * KVM dirty page logging test
- *
- * Copyright (C) 2018, Red Hat, Inc.
- */
-#include <stdio.h>
-#include <stdlib.h>
-#include <linux/bitmap.h>
-#include <linux/bitops.h>
-
-#include "test_util.h"
-#include "kvm_util.h"
-#include "processor.h"
-#include "vmx.h"
-
-/* The memory slot index to track dirty pages */
-#define TEST_MEM_SLOT_INDEX		1
-#define TEST_MEM_PAGES			3
-
-/* L1 guest test virtual memory offset */
-#define GUEST_TEST_MEM			0xc0000000
-
-/* L2 guest test virtual memory offset */
-#define NESTED_TEST_MEM1		0xc0001000
-#define NESTED_TEST_MEM2		0xc0002000
-
-static void l2_guest_code(u64 *a, u64 *b)
-{
-	READ_ONCE(*a);
-	WRITE_ONCE(*a, 1);
-	GUEST_SYNC(true);
-	GUEST_SYNC(false);
-
-	WRITE_ONCE(*b, 1);
-	GUEST_SYNC(true);
-	WRITE_ONCE(*b, 1);
-	GUEST_SYNC(true);
-	GUEST_SYNC(false);
-
-	/* Exit to L1 and never come back.  */
-	vmcall();
-}
-
-static void l2_guest_code_ept_enabled(void)
-{
-	l2_guest_code((u64 *)NESTED_TEST_MEM1, (u64 *)NESTED_TEST_MEM2);
-}
-
-static void l2_guest_code_ept_disabled(void)
-{
-	/* Access the same L1 GPAs as l2_guest_code_ept_enabled() */
-	l2_guest_code((u64 *)GUEST_TEST_MEM, (u64 *)GUEST_TEST_MEM);
-}
-
-void l1_guest_code(struct vmx_pages *vmx)
-{
-#define L2_GUEST_STACK_SIZE 64
-	unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
-	void *l2_rip;
-
-	GUEST_ASSERT(vmx->vmcs_gpa);
-	GUEST_ASSERT(prepare_for_vmx_operation(vmx));
-	GUEST_ASSERT(load_vmcs(vmx));
-
-	if (vmx->eptp_gpa)
-		l2_rip = l2_guest_code_ept_enabled;
-	else
-		l2_rip = l2_guest_code_ept_disabled;
-
-	prepare_vmcs(vmx, l2_rip, &l2_guest_stack[L2_GUEST_STACK_SIZE]);
-
-	GUEST_SYNC(false);
-	GUEST_ASSERT(!vmlaunch());
-	GUEST_SYNC(false);
-	GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL);
-	GUEST_DONE();
-}
-
-static void test_vmx_dirty_log(bool enable_ept)
-{
-	vm_vaddr_t vmx_pages_gva = 0;
-	unsigned long *bmap;
-	uint64_t *host_test_mem;
-
-	struct kvm_vcpu *vcpu;
-	struct kvm_vm *vm;
-	struct ucall uc;
-	bool done = false;
-
-	pr_info("Nested EPT: %s\n", enable_ept ? "enabled" : "disabled");
-
-	/* Create VM */
-	vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code);
-	if (enable_ept)
-		vm_enable_tdp(vm);
-
-	vcpu_alloc_vmx(vm, &vmx_pages_gva);
-	vcpu_args_set(vcpu, 1, vmx_pages_gva);
-
-	/* Add an extra memory slot for testing dirty logging */
-	vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS,
-				    GUEST_TEST_MEM,
-				    TEST_MEM_SLOT_INDEX,
-				    TEST_MEM_PAGES,
-				    KVM_MEM_LOG_DIRTY_PAGES);
-
-	/*
-	 * Add an identity map for GVA range [0xc0000000, 0xc0002000).  This
-	 * affects both L1 and L2.  However...
-	 */
-	virt_map(vm, GUEST_TEST_MEM, GUEST_TEST_MEM, TEST_MEM_PAGES);
-
-	/*
-	 * ... pages in the L2 GPA range [0xc0001000, 0xc0003000) will map to
-	 * 0xc0000000.
-	 *
-	 * When EPT is disabled, the L2 guest code will still access the same L1
-	 * GPAs as the EPT enabled case.
-	 */
-	if (enable_ept) {
-		tdp_identity_map_default_memslots(vm);
-		tdp_map(vm, NESTED_TEST_MEM1, GUEST_TEST_MEM, PAGE_SIZE);
-		tdp_map(vm, NESTED_TEST_MEM2, GUEST_TEST_MEM, PAGE_SIZE);
-	}
-
-	bmap = bitmap_zalloc(TEST_MEM_PAGES);
-	host_test_mem = addr_gpa2hva(vm, GUEST_TEST_MEM);
-
-	while (!done) {
-		memset(host_test_mem, 0xaa, TEST_MEM_PAGES * PAGE_SIZE);
-		vcpu_run(vcpu);
-		TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
-
-		switch (get_ucall(vcpu, &uc)) {
-		case UCALL_ABORT:
-			REPORT_GUEST_ASSERT(uc);
-			/* NOT REACHED */
-		case UCALL_SYNC:
-			/*
-			 * The nested guest wrote at offset 0x1000 in the memslot, but the
-			 * dirty bitmap must be filled in according to L1 GPA, not L2.
-			 */
-			kvm_vm_get_dirty_log(vm, TEST_MEM_SLOT_INDEX, bmap);
-			if (uc.args[1]) {
-				TEST_ASSERT(test_bit(0, bmap), "Page 0 incorrectly reported clean");
-				TEST_ASSERT(host_test_mem[0] == 1, "Page 0 not written by guest");
-			} else {
-				TEST_ASSERT(!test_bit(0, bmap), "Page 0 incorrectly reported dirty");
-				TEST_ASSERT(host_test_mem[0] == 0xaaaaaaaaaaaaaaaaULL, "Page 0 written by guest");
-			}
-
-			TEST_ASSERT(!test_bit(1, bmap), "Page 1 incorrectly reported dirty");
-			TEST_ASSERT(host_test_mem[PAGE_SIZE / 8] == 0xaaaaaaaaaaaaaaaaULL, "Page 1 written by guest");
-			TEST_ASSERT(!test_bit(2, bmap), "Page 2 incorrectly reported dirty");
-			TEST_ASSERT(host_test_mem[PAGE_SIZE*2 / 8] == 0xaaaaaaaaaaaaaaaaULL, "Page 2 written by guest");
-			break;
-		case UCALL_DONE:
-			done = true;
-			break;
-		default:
-			TEST_FAIL("Unknown ucall %lu", uc.cmd);
-		}
-	}
-}
-
-int main(int argc, char *argv[])
-{
-	TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX));
-
-	test_vmx_dirty_log(/*enable_ept=*/false);
-
-	if (kvm_cpu_has_tdp())
-		test_vmx_dirty_log(/*enable_ept=*/true);
-
-	return 0;
-}
-- 
cgit v1.2.3


From 59eef1a47b8c264d09ee84c909a1da60b4e70bd7 Mon Sep 17 00:00:00 2001
From: Yosry Ahmed <yosry.ahmed@linux.dev>
Date: Tue, 30 Dec 2025 15:01:48 -0800
Subject: KVM: selftests: Extend memstress to run on nested SVM

Add L1 SVM code and generalize the setup code to work for both VMX and
SVM. This allows running 'dirty_log_perf_test -n' on AMD CPUs.

Signed-off-by: Yosry Ahmed <yosry.ahmed@linux.dev>
Link: https://patch.msgid.link/20251230230150.4150236-20-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 tools/testing/selftests/kvm/lib/x86/memstress.c | 42 ++++++++++++++++++++-----
 1 file changed, 35 insertions(+), 7 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/lib/x86/memstress.c b/tools/testing/selftests/kvm/lib/x86/memstress.c
index 407abfc34909..86f4c5e4c430 100644
--- a/tools/testing/selftests/kvm/lib/x86/memstress.c
+++ b/tools/testing/selftests/kvm/lib/x86/memstress.c
@@ -13,6 +13,7 @@
 #include "kvm_util.h"
 #include "memstress.h"
 #include "processor.h"
+#include "svm_util.h"
 #include "vmx.h"
 
 void memstress_l2_guest_code(uint64_t vcpu_id)
@@ -29,9 +30,10 @@ __asm__(
 "	ud2;"
 );
 
-static void memstress_l1_guest_code(struct vmx_pages *vmx, uint64_t vcpu_id)
-{
 #define L2_GUEST_STACK_SIZE 64
+
+static void l1_vmx_code(struct vmx_pages *vmx, uint64_t vcpu_id)
+{
 	unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
 	unsigned long *rsp;
 
@@ -45,10 +47,34 @@ static void memstress_l1_guest_code(struct vmx_pages *vmx, uint64_t vcpu_id)
 	prepare_vmcs(vmx, memstress_l2_guest_entry, rsp);
 
 	GUEST_ASSERT(!vmlaunch());
-	GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL);
+	GUEST_ASSERT_EQ(vmreadz(VM_EXIT_REASON), EXIT_REASON_VMCALL);
+	GUEST_DONE();
+}
+
+static void l1_svm_code(struct svm_test_data *svm, uint64_t vcpu_id)
+{
+	unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
+	unsigned long *rsp;
+
+
+	rsp = &l2_guest_stack[L2_GUEST_STACK_SIZE - 1];
+	*rsp = vcpu_id;
+	generic_svm_setup(svm, memstress_l2_guest_entry, rsp);
+
+	run_guest(svm->vmcb, svm->vmcb_gpa);
+	GUEST_ASSERT_EQ(svm->vmcb->control.exit_code, SVM_EXIT_VMMCALL);
 	GUEST_DONE();
 }
 
+
+static void memstress_l1_guest_code(void *data, uint64_t vcpu_id)
+{
+	if (this_cpu_has(X86_FEATURE_VMX))
+		l1_vmx_code(data, vcpu_id);
+	else
+		l1_svm_code(data, vcpu_id);
+}
+
 uint64_t memstress_nested_pages(int nr_vcpus)
 {
 	/*
@@ -78,15 +104,17 @@ static void memstress_setup_ept_mappings(struct kvm_vm *vm)
 void memstress_setup_nested(struct kvm_vm *vm, int nr_vcpus, struct kvm_vcpu *vcpus[])
 {
 	struct kvm_regs regs;
-	vm_vaddr_t vmx_gva;
+	vm_vaddr_t nested_gva;
 	int vcpu_id;
 
-	TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX));
 	TEST_REQUIRE(kvm_cpu_has_tdp());
 
 	vm_enable_tdp(vm);
 	for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) {
-		vcpu_alloc_vmx(vm, &vmx_gva);
+		if (kvm_cpu_has(X86_FEATURE_VMX))
+			vcpu_alloc_vmx(vm, &nested_gva);
+		else
+			vcpu_alloc_svm(vm, &nested_gva);
 
 		/* The EPTs are shared across vCPUs, setup the mappings once */
 		if (vcpu_id == 0)
@@ -99,6 +127,6 @@ void memstress_setup_nested(struct kvm_vm *vm, int nr_vcpus, struct kvm_vcpu *vc
 		vcpu_regs_get(vcpus[vcpu_id], &regs);
 		regs.rip = (unsigned long) memstress_l1_guest_code;
 		vcpu_regs_set(vcpus[vcpu_id], &regs);
-		vcpu_args_set(vcpus[vcpu_id], 2, vmx_gva, vcpu_id);
+		vcpu_args_set(vcpus[vcpu_id], 2, nested_gva, vcpu_id);
 	}
 }
-- 
cgit v1.2.3


From e353850499c717f1f984f7c208f49a8618beff2f Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Tue, 30 Dec 2025 15:01:49 -0800
Subject: KVM: selftests: Rename vm_get_page_table_entry() to vm_get_pte()

Shorten the API to get a PTE as the "PTE" acronym is ubiquitous, and the
"page table entry" makes it unnecessarily difficult to quickly understand
what callers are doing.

No functional change intended.

Reviewed-by: Yosry Ahmed <yosry.ahmed@linux.dev>
Link: https://patch.msgid.link/20251230230150.4150236-21-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 tools/testing/selftests/kvm/include/x86/processor.h                 | 2 +-
 tools/testing/selftests/kvm/lib/x86/processor.c                     | 2 +-
 tools/testing/selftests/kvm/x86/hyperv_tlb_flush.c                  | 2 +-
 tools/testing/selftests/kvm/x86/smaller_maxphyaddr_emulation_test.c | 4 +---
 4 files changed, 4 insertions(+), 6 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/include/x86/processor.h b/tools/testing/selftests/kvm/include/x86/processor.h
index 995277cae94e..8f130e7d7048 100644
--- a/tools/testing/selftests/kvm/include/x86/processor.h
+++ b/tools/testing/selftests/kvm/include/x86/processor.h
@@ -1359,7 +1359,7 @@ static inline bool kvm_is_ignore_msrs(void)
 	return get_kvm_param_bool("ignore_msrs");
 }
 
-uint64_t *vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr);
+uint64_t *vm_get_pte(struct kvm_vm *vm, uint64_t vaddr);
 
 uint64_t kvm_hypercall(uint64_t nr, uint64_t a0, uint64_t a1, uint64_t a2,
 		       uint64_t a3);
diff --git a/tools/testing/selftests/kvm/lib/x86/processor.c b/tools/testing/selftests/kvm/lib/x86/processor.c
index 5a3385d48902..ab869a98bbdc 100644
--- a/tools/testing/selftests/kvm/lib/x86/processor.c
+++ b/tools/testing/selftests/kvm/lib/x86/processor.c
@@ -390,7 +390,7 @@ static uint64_t *__vm_get_page_table_entry(struct kvm_vm *vm,
 	return virt_get_pte(vm, mmu, pte, vaddr, PG_LEVEL_4K);
 }
 
-uint64_t *vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr)
+uint64_t *vm_get_pte(struct kvm_vm *vm, uint64_t vaddr)
 {
 	int level = PG_LEVEL_4K;
 
diff --git a/tools/testing/selftests/kvm/x86/hyperv_tlb_flush.c b/tools/testing/selftests/kvm/x86/hyperv_tlb_flush.c
index a3b7ce155981..c542cc4762b1 100644
--- a/tools/testing/selftests/kvm/x86/hyperv_tlb_flush.c
+++ b/tools/testing/selftests/kvm/x86/hyperv_tlb_flush.c
@@ -619,7 +619,7 @@ int main(int argc, char *argv[])
 	 */
 	gva = vm_vaddr_unused_gap(vm, NTEST_PAGES * PAGE_SIZE, KVM_UTIL_MIN_VADDR);
 	for (i = 0; i < NTEST_PAGES; i++) {
-		pte = vm_get_page_table_entry(vm, data->test_pages + i * PAGE_SIZE);
+		pte = vm_get_pte(vm, data->test_pages + i * PAGE_SIZE);
 		gpa = addr_hva2gpa(vm, pte);
 		virt_pg_map(vm, gva + PAGE_SIZE * i, gpa & PAGE_MASK);
 		data->test_pages_pte[i] = gva + (gpa & ~PAGE_MASK);
diff --git a/tools/testing/selftests/kvm/x86/smaller_maxphyaddr_emulation_test.c b/tools/testing/selftests/kvm/x86/smaller_maxphyaddr_emulation_test.c
index fabeeaddfb3a..0e8aec568010 100644
--- a/tools/testing/selftests/kvm/x86/smaller_maxphyaddr_emulation_test.c
+++ b/tools/testing/selftests/kvm/x86/smaller_maxphyaddr_emulation_test.c
@@ -47,7 +47,6 @@ int main(int argc, char *argv[])
 	struct kvm_vcpu *vcpu;
 	struct kvm_vm *vm;
 	struct ucall uc;
-	uint64_t *pte;
 	uint64_t *hva;
 	uint64_t gpa;
 	int rc;
@@ -73,8 +72,7 @@ int main(int argc, char *argv[])
 	hva = addr_gpa2hva(vm, MEM_REGION_GPA);
 	memset(hva, 0, PAGE_SIZE);
 
-	pte = vm_get_page_table_entry(vm, MEM_REGION_GVA);
-	*pte |= BIT_ULL(MAXPHYADDR);
+	*vm_get_pte(vm, MEM_REGION_GVA) |= BIT_ULL(MAXPHYADDR);
 
 	vcpu_run(vcpu);
 
-- 
cgit v1.2.3


From bda6ae6f29664b659671f872a2adda3c1c2f5dd6 Mon Sep 17 00:00:00 2001
From: Yosry Ahmed <yosry.ahmed@linux.dev>
Date: Fri, 21 Nov 2025 20:48:02 +0000
Subject: KVM: selftests: Use TEST_ASSERT_EQ() in test_vmx_nested_state()

The assert messages do not add much value, so use TEST_ASSERT_EQ(),
which also nicely displays the addresses in hex. While at it, also
assert the values of state->flags.

Signed-off-by: Yosry Ahmed <yosry.ahmed@linux.dev>
Link: https://patch.msgid.link/20251121204803.991707-4-yosry.ahmed@linux.dev
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 tools/testing/selftests/kvm/x86/vmx_set_nested_state_test.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/x86/vmx_set_nested_state_test.c b/tools/testing/selftests/kvm/x86/vmx_set_nested_state_test.c
index 67a62a5a8895..b59a8a17084d 100644
--- a/tools/testing/selftests/kvm/x86/vmx_set_nested_state_test.c
+++ b/tools/testing/selftests/kvm/x86/vmx_set_nested_state_test.c
@@ -241,8 +241,10 @@ void test_vmx_nested_state(struct kvm_vcpu *vcpu)
 	TEST_ASSERT(state->size >= sizeof(*state) && state->size <= state_sz,
 		    "Size must be between %ld and %d.  The size returned was %d.",
 		    sizeof(*state), state_sz, state->size);
-	TEST_ASSERT(state->hdr.vmx.vmxon_pa == -1ull, "vmxon_pa must be -1ull.");
-	TEST_ASSERT(state->hdr.vmx.vmcs12_pa == -1ull, "vmcs_pa must be -1ull.");
+
+	TEST_ASSERT_EQ(state->hdr.vmx.vmxon_pa, -1ull);
+	TEST_ASSERT_EQ(state->hdr.vmx.vmcs12_pa, -1ull);
+	TEST_ASSERT_EQ(state->flags, 0);
 
 	free(state);
 }
-- 
cgit v1.2.3


From ca2eccb953fd33ef38701e33e660b21f7e84aa14 Mon Sep 17 00:00:00 2001
From: Yosry Ahmed <yosry.ahmed@linux.dev>
Date: Fri, 21 Nov 2025 20:48:03 +0000
Subject: KVM: selftests: Extend vmx_set_nested_state_test to cover SVM

Add test cases for the validation checks in svm_set_nested_state(), and
allow the test to run with SVM as well as VMX. The SVM test also makes
sure that KVM_SET_NESTED_STATE accepts GIF being set or cleared if
EFER.SVME is cleared, verifying a recently fixed bug where GIF was
incorrectly expected to always be set when EFER.SVME is cleared.

Signed-off-by: Yosry Ahmed <yosry.ahmed@linux.dev>
Link: https://patch.msgid.link/20251121204803.991707-5-yosry.ahmed@linux.dev
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 tools/testing/selftests/kvm/Makefile.kvm           |   2 +-
 .../selftests/kvm/x86/nested_set_state_test.c      | 406 +++++++++++++++++++++
 .../selftests/kvm/x86/vmx_set_nested_state_test.c  | 306 ----------------
 3 files changed, 407 insertions(+), 307 deletions(-)
 create mode 100644 tools/testing/selftests/kvm/x86/nested_set_state_test.c
 delete mode 100644 tools/testing/selftests/kvm/x86/vmx_set_nested_state_test.c

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm
index ba5c2b643efa..4ddece4ee365 100644
--- a/tools/testing/selftests/kvm/Makefile.kvm
+++ b/tools/testing/selftests/kvm/Makefile.kvm
@@ -92,6 +92,7 @@ TEST_GEN_PROGS_x86 += x86/nested_close_kvm_test
 TEST_GEN_PROGS_x86 += x86/nested_emulation_test
 TEST_GEN_PROGS_x86 += x86/nested_exceptions_test
 TEST_GEN_PROGS_x86 += x86/nested_invalid_cr3_test
+TEST_GEN_PROGS_x86 += x86/nested_set_state_test
 TEST_GEN_PROGS_x86 += x86/nested_tsc_adjust_test
 TEST_GEN_PROGS_x86 += x86/nested_tsc_scaling_test
 TEST_GEN_PROGS_x86 += x86/platform_info_test
@@ -120,7 +121,6 @@ TEST_GEN_PROGS_x86 += x86/vmx_exception_with_invalid_guest_state
 TEST_GEN_PROGS_x86 += x86/vmx_msrs_test
 TEST_GEN_PROGS_x86 += x86/vmx_invalid_nested_guest_state
 TEST_GEN_PROGS_x86 += x86/vmx_nested_la57_state_test
-TEST_GEN_PROGS_x86 += x86/vmx_set_nested_state_test
 TEST_GEN_PROGS_x86 += x86/apic_bus_clock_test
 TEST_GEN_PROGS_x86 += x86/xapic_ipi_test
 TEST_GEN_PROGS_x86 += x86/xapic_state_test
diff --git a/tools/testing/selftests/kvm/x86/nested_set_state_test.c b/tools/testing/selftests/kvm/x86/nested_set_state_test.c
new file mode 100644
index 000000000000..0f2102b43629
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86/nested_set_state_test.c
@@ -0,0 +1,406 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2019, Google LLC.
+ *
+ * This test verifies the integrity of calling the ioctl KVM_SET_NESTED_STATE.
+ */
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+#include "vmx.h"
+#include "svm_util.h"
+
+#include <errno.h>
+#include <linux/kvm.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <unistd.h>
+
+/*
+ * Mirror of VMCS12_REVISION in arch/x86/kvm/vmx/vmcs12.h. If that value
+ * changes this should be updated.
+ */
+#define VMCS12_REVISION 0x11e57ed0
+
+bool have_evmcs;
+
+void test_nested_state(struct kvm_vcpu *vcpu, struct kvm_nested_state *state)
+{
+	vcpu_nested_state_set(vcpu, state);
+}
+
+void test_nested_state_expect_errno(struct kvm_vcpu *vcpu,
+				    struct kvm_nested_state *state,
+				    int expected_errno)
+{
+	int rv;
+
+	rv = __vcpu_nested_state_set(vcpu, state);
+	TEST_ASSERT(rv == -1 && errno == expected_errno,
+		"Expected %s (%d) from vcpu_nested_state_set but got rv: %i errno: %s (%d)",
+		strerror(expected_errno), expected_errno, rv, strerror(errno),
+		errno);
+}
+
+void test_nested_state_expect_einval(struct kvm_vcpu *vcpu,
+				     struct kvm_nested_state *state)
+{
+	test_nested_state_expect_errno(vcpu, state, EINVAL);
+}
+
+void test_nested_state_expect_efault(struct kvm_vcpu *vcpu,
+				     struct kvm_nested_state *state)
+{
+	test_nested_state_expect_errno(vcpu, state, EFAULT);
+}
+
+void set_revision_id_for_vmcs12(struct kvm_nested_state *state,
+				u32 vmcs12_revision)
+{
+	/* Set revision_id in vmcs12 to vmcs12_revision. */
+	memcpy(&state->data, &vmcs12_revision, sizeof(u32));
+}
+
+void set_default_state(struct kvm_nested_state *state)
+{
+	memset(state, 0, sizeof(*state));
+	state->flags = KVM_STATE_NESTED_RUN_PENDING |
+		       KVM_STATE_NESTED_GUEST_MODE;
+	state->format = 0;
+	state->size = sizeof(*state);
+}
+
+void set_default_vmx_state(struct kvm_nested_state *state, int size)
+{
+	memset(state, 0, size);
+	if (have_evmcs)
+		state->flags = KVM_STATE_NESTED_EVMCS;
+	state->format = 0;
+	state->size = size;
+	state->hdr.vmx.vmxon_pa = 0x1000;
+	state->hdr.vmx.vmcs12_pa = 0x2000;
+	state->hdr.vmx.smm.flags = 0;
+	set_revision_id_for_vmcs12(state, VMCS12_REVISION);
+}
+
+void test_vmx_nested_state(struct kvm_vcpu *vcpu)
+{
+	/* Add a page for VMCS12. */
+	const int state_sz = sizeof(struct kvm_nested_state) + getpagesize();
+	struct kvm_nested_state *state =
+		(struct kvm_nested_state *)malloc(state_sz);
+
+	/* The format must be set to 0. 0 for VMX, 1 for SVM. */
+	set_default_vmx_state(state, state_sz);
+	state->format = 1;
+	test_nested_state_expect_einval(vcpu, state);
+
+	/*
+	 * We cannot virtualize anything if the guest does not have VMX
+	 * enabled.
+	 */
+	set_default_vmx_state(state, state_sz);
+	test_nested_state_expect_einval(vcpu, state);
+
+	/*
+	 * We cannot virtualize anything if the guest does not have VMX
+	 * enabled.  We expect KVM_SET_NESTED_STATE to return 0 if vmxon_pa
+	 * is set to -1ull, but the flags must be zero.
+	 */
+	set_default_vmx_state(state, state_sz);
+	state->hdr.vmx.vmxon_pa = -1ull;
+	test_nested_state_expect_einval(vcpu, state);
+
+	state->hdr.vmx.vmcs12_pa = -1ull;
+	state->flags = KVM_STATE_NESTED_EVMCS;
+	test_nested_state_expect_einval(vcpu, state);
+
+	state->flags = 0;
+	test_nested_state(vcpu, state);
+
+	/* Enable VMX in the guest CPUID. */
+	vcpu_set_cpuid_feature(vcpu, X86_FEATURE_VMX);
+
+	/*
+	 * Setting vmxon_pa == -1ull and vmcs_pa == -1ull exits early without
+	 * setting the nested state. When the eVMCS flag is not set, the
+	 * expected return value is '0'.
+	 */
+	set_default_vmx_state(state, state_sz);
+	state->flags = 0;
+	state->hdr.vmx.vmxon_pa = -1ull;
+	state->hdr.vmx.vmcs12_pa = -1ull;
+	test_nested_state(vcpu, state);
+
+	/*
+	 * When eVMCS is supported, the eVMCS flag can only be set if the
+	 * enlightened VMCS capability has been enabled.
+	 */
+	if (have_evmcs) {
+		state->flags = KVM_STATE_NESTED_EVMCS;
+		test_nested_state_expect_einval(vcpu, state);
+		vcpu_enable_evmcs(vcpu);
+		test_nested_state(vcpu, state);
+	}
+
+	/* It is invalid to have vmxon_pa == -1ull and SMM flags non-zero. */
+	state->hdr.vmx.smm.flags = 1;
+	test_nested_state_expect_einval(vcpu, state);
+
+	/* Invalid flags are rejected. */
+	set_default_vmx_state(state, state_sz);
+	state->hdr.vmx.flags = ~0;
+	test_nested_state_expect_einval(vcpu, state);
+
+	/* It is invalid to have vmxon_pa == -1ull and vmcs_pa != -1ull. */
+	set_default_vmx_state(state, state_sz);
+	state->hdr.vmx.vmxon_pa = -1ull;
+	state->flags = 0;
+	test_nested_state_expect_einval(vcpu, state);
+
+	/* It is invalid to have vmxon_pa set to a non-page aligned address. */
+	set_default_vmx_state(state, state_sz);
+	state->hdr.vmx.vmxon_pa = 1;
+	test_nested_state_expect_einval(vcpu, state);
+
+	/*
+	 * It is invalid to have KVM_STATE_NESTED_SMM_GUEST_MODE and
+	 * KVM_STATE_NESTED_GUEST_MODE set together.
+	 */
+	set_default_vmx_state(state, state_sz);
+	state->flags = KVM_STATE_NESTED_GUEST_MODE  |
+		      KVM_STATE_NESTED_RUN_PENDING;
+	state->hdr.vmx.smm.flags = KVM_STATE_NESTED_SMM_GUEST_MODE;
+	test_nested_state_expect_einval(vcpu, state);
+
+	/*
+	 * It is invalid to have any of the SMM flags set besides:
+	 *	KVM_STATE_NESTED_SMM_GUEST_MODE
+	 *	KVM_STATE_NESTED_SMM_VMXON
+	 */
+	set_default_vmx_state(state, state_sz);
+	state->hdr.vmx.smm.flags = ~(KVM_STATE_NESTED_SMM_GUEST_MODE |
+				KVM_STATE_NESTED_SMM_VMXON);
+	test_nested_state_expect_einval(vcpu, state);
+
+	/* Outside SMM, SMM flags must be zero. */
+	set_default_vmx_state(state, state_sz);
+	state->flags = 0;
+	state->hdr.vmx.smm.flags = KVM_STATE_NESTED_SMM_GUEST_MODE;
+	test_nested_state_expect_einval(vcpu, state);
+
+	/*
+	 * Size must be large enough to fit kvm_nested_state and vmcs12
+	 * if VMCS12 physical address is set
+	 */
+	set_default_vmx_state(state, state_sz);
+	state->size = sizeof(*state);
+	state->flags = 0;
+	test_nested_state_expect_einval(vcpu, state);
+
+	set_default_vmx_state(state, state_sz);
+	state->size = sizeof(*state);
+	state->flags = 0;
+	state->hdr.vmx.vmcs12_pa = -1;
+	test_nested_state(vcpu, state);
+
+	/*
+	 * KVM_SET_NESTED_STATE succeeds with invalid VMCS
+	 * contents but L2 not running.
+	 */
+	set_default_vmx_state(state, state_sz);
+	state->flags = 0;
+	test_nested_state(vcpu, state);
+
+	/* Invalid flags are rejected, even if no VMCS loaded. */
+	set_default_vmx_state(state, state_sz);
+	state->size = sizeof(*state);
+	state->flags = 0;
+	state->hdr.vmx.vmcs12_pa = -1;
+	state->hdr.vmx.flags = ~0;
+	test_nested_state_expect_einval(vcpu, state);
+
+	/* vmxon_pa cannot be the same address as vmcs_pa. */
+	set_default_vmx_state(state, state_sz);
+	state->hdr.vmx.vmxon_pa = 0;
+	state->hdr.vmx.vmcs12_pa = 0;
+	test_nested_state_expect_einval(vcpu, state);
+
+	/*
+	 * Test that if we leave nesting the state reflects that when we get
+	 * it again.
+	 */
+	set_default_vmx_state(state, state_sz);
+	state->hdr.vmx.vmxon_pa = -1ull;
+	state->hdr.vmx.vmcs12_pa = -1ull;
+	state->flags = 0;
+	test_nested_state(vcpu, state);
+	vcpu_nested_state_get(vcpu, state);
+	TEST_ASSERT(state->size >= sizeof(*state) && state->size <= state_sz,
+		    "Size must be between %ld and %d.  The size returned was %d.",
+		    sizeof(*state), state_sz, state->size);
+
+	TEST_ASSERT_EQ(state->hdr.vmx.vmxon_pa, -1ull);
+	TEST_ASSERT_EQ(state->hdr.vmx.vmcs12_pa, -1ull);
+	TEST_ASSERT_EQ(state->flags, 0);
+
+	free(state);
+}
+
+static void vcpu_efer_enable_svm(struct kvm_vcpu *vcpu)
+{
+	uint64_t old_efer = vcpu_get_msr(vcpu, MSR_EFER);
+
+	vcpu_set_msr(vcpu, MSR_EFER, old_efer | EFER_SVME);
+}
+
+static void vcpu_efer_disable_svm(struct kvm_vcpu *vcpu)
+{
+	uint64_t old_efer = vcpu_get_msr(vcpu, MSR_EFER);
+
+	vcpu_set_msr(vcpu, MSR_EFER, old_efer & ~EFER_SVME);
+}
+
+void set_default_svm_state(struct kvm_nested_state *state, int size)
+{
+	memset(state, 0, size);
+	state->format = 1;
+	state->size = size;
+	state->hdr.svm.vmcb_pa = 0x3000;
+}
+
+void test_svm_nested_state(struct kvm_vcpu *vcpu)
+{
+	/* Add a page for VMCB. */
+	const int state_sz = sizeof(struct kvm_nested_state) + getpagesize();
+	struct kvm_nested_state *state =
+		(struct kvm_nested_state *)malloc(state_sz);
+
+	vcpu_set_cpuid_feature(vcpu, X86_FEATURE_SVM);
+
+	/* The format must be set to 1. 0 for VMX, 1 for SVM. */
+	set_default_svm_state(state, state_sz);
+	state->format = 0;
+	test_nested_state_expect_einval(vcpu, state);
+
+	/* Invalid flags are rejected, KVM_STATE_NESTED_EVMCS is VMX-only  */
+	set_default_svm_state(state, state_sz);
+	state->flags = KVM_STATE_NESTED_EVMCS;
+	test_nested_state_expect_einval(vcpu, state);
+
+	/*
+	 * If EFER.SVME is clear, guest mode is disallowed and GIF can be set or
+	 * cleared.
+	 */
+	vcpu_efer_disable_svm(vcpu);
+
+	set_default_svm_state(state, state_sz);
+	state->flags = KVM_STATE_NESTED_GUEST_MODE;
+	test_nested_state_expect_einval(vcpu, state);
+
+	state->flags = 0;
+	test_nested_state(vcpu, state);
+
+	state->flags = KVM_STATE_NESTED_GIF_SET;
+	test_nested_state(vcpu, state);
+
+	/* Enable SVM in the guest EFER. */
+	vcpu_efer_enable_svm(vcpu);
+
+	/* Setting vmcb_pa to a non-aligned address is only fine when not entering guest mode */
+	set_default_svm_state(state, state_sz);
+	state->hdr.svm.vmcb_pa = -1ull;
+	state->flags = 0;
+	test_nested_state(vcpu, state);
+	state->flags = KVM_STATE_NESTED_GUEST_MODE;
+	test_nested_state_expect_einval(vcpu, state);
+
+	/*
+	 * Size must be large enough to fit kvm_nested_state and VMCB
+	 * only when entering guest mode.
+	 */
+	set_default_svm_state(state, state_sz/2);
+	state->flags = 0;
+	test_nested_state(vcpu, state);
+	state->flags = KVM_STATE_NESTED_GUEST_MODE;
+	test_nested_state_expect_einval(vcpu, state);
+
+	/*
+	 * Test that if we leave nesting the state reflects that when we get it
+	 * again, except for vmcb_pa, which is always returned as 0 when not in
+	 * guest mode.
+	 */
+	set_default_svm_state(state, state_sz);
+	state->hdr.svm.vmcb_pa = -1ull;
+	state->flags = KVM_STATE_NESTED_GIF_SET;
+	test_nested_state(vcpu, state);
+	vcpu_nested_state_get(vcpu, state);
+	TEST_ASSERT(state->size >= sizeof(*state) && state->size <= state_sz,
+		    "Size must be between %ld and %d.  The size returned was %d.",
+		    sizeof(*state), state_sz, state->size);
+
+	TEST_ASSERT_EQ(state->hdr.svm.vmcb_pa, 0);
+	TEST_ASSERT_EQ(state->flags, KVM_STATE_NESTED_GIF_SET);
+
+	free(state);
+}
+
+int main(int argc, char *argv[])
+{
+	struct kvm_vm *vm;
+	struct kvm_nested_state state;
+	struct kvm_vcpu *vcpu;
+
+	have_evmcs = kvm_check_cap(KVM_CAP_HYPERV_ENLIGHTENED_VMCS);
+
+	TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX) ||
+		     kvm_cpu_has(X86_FEATURE_SVM));
+	TEST_REQUIRE(kvm_has_cap(KVM_CAP_NESTED_STATE));
+
+	vm = vm_create_with_one_vcpu(&vcpu, NULL);
+
+	/*
+	 * First run tests with VMX/SVM disabled to check error handling.
+	 * test_{vmx/svm}_nested_state() will re-enable as needed.
+	 */
+	if (kvm_cpu_has(X86_FEATURE_VMX))
+		vcpu_clear_cpuid_feature(vcpu, X86_FEATURE_VMX);
+	else
+		vcpu_clear_cpuid_feature(vcpu, X86_FEATURE_SVM);
+
+	/* Passing a NULL kvm_nested_state causes a EFAULT. */
+	test_nested_state_expect_efault(vcpu, NULL);
+
+	/* 'size' cannot be smaller than sizeof(kvm_nested_state). */
+	set_default_state(&state);
+	state.size = 0;
+	test_nested_state_expect_einval(vcpu, &state);
+
+	/*
+	 * Setting the flags 0xf fails the flags check.  The only flags that
+	 * can be used are:
+	 *     KVM_STATE_NESTED_GUEST_MODE
+	 *     KVM_STATE_NESTED_RUN_PENDING
+	 *     KVM_STATE_NESTED_EVMCS
+	 */
+	set_default_state(&state);
+	state.flags = 0xf;
+	test_nested_state_expect_einval(vcpu, &state);
+
+	/*
+	 * If KVM_STATE_NESTED_RUN_PENDING is set then
+	 * KVM_STATE_NESTED_GUEST_MODE has to be set as well.
+	 */
+	set_default_state(&state);
+	state.flags = KVM_STATE_NESTED_RUN_PENDING;
+	test_nested_state_expect_einval(vcpu, &state);
+
+	if (kvm_cpu_has(X86_FEATURE_VMX))
+		test_vmx_nested_state(vcpu);
+	else
+		test_svm_nested_state(vcpu);
+
+	kvm_vm_free(vm);
+	return 0;
+}
diff --git a/tools/testing/selftests/kvm/x86/vmx_set_nested_state_test.c b/tools/testing/selftests/kvm/x86/vmx_set_nested_state_test.c
deleted file mode 100644
index b59a8a17084d..000000000000
--- a/tools/testing/selftests/kvm/x86/vmx_set_nested_state_test.c
+++ /dev/null
@@ -1,306 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * vmx_set_nested_state_test
- *
- * Copyright (C) 2019, Google LLC.
- *
- * This test verifies the integrity of calling the ioctl KVM_SET_NESTED_STATE.
- */
-
-#include "test_util.h"
-#include "kvm_util.h"
-#include "processor.h"
-#include "vmx.h"
-
-#include <errno.h>
-#include <linux/kvm.h>
-#include <string.h>
-#include <sys/ioctl.h>
-#include <unistd.h>
-
-/*
- * Mirror of VMCS12_REVISION in arch/x86/kvm/vmx/vmcs12.h. If that value
- * changes this should be updated.
- */
-#define VMCS12_REVISION 0x11e57ed0
-
-bool have_evmcs;
-
-void test_nested_state(struct kvm_vcpu *vcpu, struct kvm_nested_state *state)
-{
-	vcpu_nested_state_set(vcpu, state);
-}
-
-void test_nested_state_expect_errno(struct kvm_vcpu *vcpu,
-				    struct kvm_nested_state *state,
-				    int expected_errno)
-{
-	int rv;
-
-	rv = __vcpu_nested_state_set(vcpu, state);
-	TEST_ASSERT(rv == -1 && errno == expected_errno,
-		"Expected %s (%d) from vcpu_nested_state_set but got rv: %i errno: %s (%d)",
-		strerror(expected_errno), expected_errno, rv, strerror(errno),
-		errno);
-}
-
-void test_nested_state_expect_einval(struct kvm_vcpu *vcpu,
-				     struct kvm_nested_state *state)
-{
-	test_nested_state_expect_errno(vcpu, state, EINVAL);
-}
-
-void test_nested_state_expect_efault(struct kvm_vcpu *vcpu,
-				     struct kvm_nested_state *state)
-{
-	test_nested_state_expect_errno(vcpu, state, EFAULT);
-}
-
-void set_revision_id_for_vmcs12(struct kvm_nested_state *state,
-				u32 vmcs12_revision)
-{
-	/* Set revision_id in vmcs12 to vmcs12_revision. */
-	memcpy(&state->data, &vmcs12_revision, sizeof(u32));
-}
-
-void set_default_state(struct kvm_nested_state *state)
-{
-	memset(state, 0, sizeof(*state));
-	state->flags = KVM_STATE_NESTED_RUN_PENDING |
-		       KVM_STATE_NESTED_GUEST_MODE;
-	state->format = 0;
-	state->size = sizeof(*state);
-}
-
-void set_default_vmx_state(struct kvm_nested_state *state, int size)
-{
-	memset(state, 0, size);
-	if (have_evmcs)
-		state->flags = KVM_STATE_NESTED_EVMCS;
-	state->format = 0;
-	state->size = size;
-	state->hdr.vmx.vmxon_pa = 0x1000;
-	state->hdr.vmx.vmcs12_pa = 0x2000;
-	state->hdr.vmx.smm.flags = 0;
-	set_revision_id_for_vmcs12(state, VMCS12_REVISION);
-}
-
-void test_vmx_nested_state(struct kvm_vcpu *vcpu)
-{
-	/* Add a page for VMCS12. */
-	const int state_sz = sizeof(struct kvm_nested_state) + getpagesize();
-	struct kvm_nested_state *state =
-		(struct kvm_nested_state *)malloc(state_sz);
-
-	/* The format must be set to 0. 0 for VMX, 1 for SVM. */
-	set_default_vmx_state(state, state_sz);
-	state->format = 1;
-	test_nested_state_expect_einval(vcpu, state);
-
-	/*
-	 * We cannot virtualize anything if the guest does not have VMX
-	 * enabled.
-	 */
-	set_default_vmx_state(state, state_sz);
-	test_nested_state_expect_einval(vcpu, state);
-
-	/*
-	 * We cannot virtualize anything if the guest does not have VMX
-	 * enabled.  We expect KVM_SET_NESTED_STATE to return 0 if vmxon_pa
-	 * is set to -1ull, but the flags must be zero.
-	 */
-	set_default_vmx_state(state, state_sz);
-	state->hdr.vmx.vmxon_pa = -1ull;
-	test_nested_state_expect_einval(vcpu, state);
-
-	state->hdr.vmx.vmcs12_pa = -1ull;
-	state->flags = KVM_STATE_NESTED_EVMCS;
-	test_nested_state_expect_einval(vcpu, state);
-
-	state->flags = 0;
-	test_nested_state(vcpu, state);
-
-	/* Enable VMX in the guest CPUID. */
-	vcpu_set_cpuid_feature(vcpu, X86_FEATURE_VMX);
-
-	/*
-	 * Setting vmxon_pa == -1ull and vmcs_pa == -1ull exits early without
-	 * setting the nested state. When the eVMCS flag is not set, the
-	 * expected return value is '0'.
-	 */
-	set_default_vmx_state(state, state_sz);
-	state->flags = 0;
-	state->hdr.vmx.vmxon_pa = -1ull;
-	state->hdr.vmx.vmcs12_pa = -1ull;
-	test_nested_state(vcpu, state);
-
-	/*
-	 * When eVMCS is supported, the eVMCS flag can only be set if the
-	 * enlightened VMCS capability has been enabled.
-	 */
-	if (have_evmcs) {
-		state->flags = KVM_STATE_NESTED_EVMCS;
-		test_nested_state_expect_einval(vcpu, state);
-		vcpu_enable_evmcs(vcpu);
-		test_nested_state(vcpu, state);
-	}
-
-	/* It is invalid to have vmxon_pa == -1ull and SMM flags non-zero. */
-	state->hdr.vmx.smm.flags = 1;
-	test_nested_state_expect_einval(vcpu, state);
-
-	/* Invalid flags are rejected. */
-	set_default_vmx_state(state, state_sz);
-	state->hdr.vmx.flags = ~0;
-	test_nested_state_expect_einval(vcpu, state);
-
-	/* It is invalid to have vmxon_pa == -1ull and vmcs_pa != -1ull. */
-	set_default_vmx_state(state, state_sz);
-	state->hdr.vmx.vmxon_pa = -1ull;
-	state->flags = 0;
-	test_nested_state_expect_einval(vcpu, state);
-
-	/* It is invalid to have vmxon_pa set to a non-page aligned address. */
-	set_default_vmx_state(state, state_sz);
-	state->hdr.vmx.vmxon_pa = 1;
-	test_nested_state_expect_einval(vcpu, state);
-
-	/*
-	 * It is invalid to have KVM_STATE_NESTED_SMM_GUEST_MODE and
-	 * KVM_STATE_NESTED_GUEST_MODE set together.
-	 */
-	set_default_vmx_state(state, state_sz);
-	state->flags = KVM_STATE_NESTED_GUEST_MODE  |
-		      KVM_STATE_NESTED_RUN_PENDING;
-	state->hdr.vmx.smm.flags = KVM_STATE_NESTED_SMM_GUEST_MODE;
-	test_nested_state_expect_einval(vcpu, state);
-
-	/*
-	 * It is invalid to have any of the SMM flags set besides:
-	 *	KVM_STATE_NESTED_SMM_GUEST_MODE
-	 *	KVM_STATE_NESTED_SMM_VMXON
-	 */
-	set_default_vmx_state(state, state_sz);
-	state->hdr.vmx.smm.flags = ~(KVM_STATE_NESTED_SMM_GUEST_MODE |
-				KVM_STATE_NESTED_SMM_VMXON);
-	test_nested_state_expect_einval(vcpu, state);
-
-	/* Outside SMM, SMM flags must be zero. */
-	set_default_vmx_state(state, state_sz);
-	state->flags = 0;
-	state->hdr.vmx.smm.flags = KVM_STATE_NESTED_SMM_GUEST_MODE;
-	test_nested_state_expect_einval(vcpu, state);
-
-	/*
-	 * Size must be large enough to fit kvm_nested_state and vmcs12
-	 * if VMCS12 physical address is set
-	 */
-	set_default_vmx_state(state, state_sz);
-	state->size = sizeof(*state);
-	state->flags = 0;
-	test_nested_state_expect_einval(vcpu, state);
-
-	set_default_vmx_state(state, state_sz);
-	state->size = sizeof(*state);
-	state->flags = 0;
-	state->hdr.vmx.vmcs12_pa = -1;
-	test_nested_state(vcpu, state);
-
-	/*
-	 * KVM_SET_NESTED_STATE succeeds with invalid VMCS
-	 * contents but L2 not running.
-	 */
-	set_default_vmx_state(state, state_sz);
-	state->flags = 0;
-	test_nested_state(vcpu, state);
-
-	/* Invalid flags are rejected, even if no VMCS loaded. */
-	set_default_vmx_state(state, state_sz);
-	state->size = sizeof(*state);
-	state->flags = 0;
-	state->hdr.vmx.vmcs12_pa = -1;
-	state->hdr.vmx.flags = ~0;
-	test_nested_state_expect_einval(vcpu, state);
-
-	/* vmxon_pa cannot be the same address as vmcs_pa. */
-	set_default_vmx_state(state, state_sz);
-	state->hdr.vmx.vmxon_pa = 0;
-	state->hdr.vmx.vmcs12_pa = 0;
-	test_nested_state_expect_einval(vcpu, state);
-
-	/*
-	 * Test that if we leave nesting the state reflects that when we get
-	 * it again.
-	 */
-	set_default_vmx_state(state, state_sz);
-	state->hdr.vmx.vmxon_pa = -1ull;
-	state->hdr.vmx.vmcs12_pa = -1ull;
-	state->flags = 0;
-	test_nested_state(vcpu, state);
-	vcpu_nested_state_get(vcpu, state);
-	TEST_ASSERT(state->size >= sizeof(*state) && state->size <= state_sz,
-		    "Size must be between %ld and %d.  The size returned was %d.",
-		    sizeof(*state), state_sz, state->size);
-
-	TEST_ASSERT_EQ(state->hdr.vmx.vmxon_pa, -1ull);
-	TEST_ASSERT_EQ(state->hdr.vmx.vmcs12_pa, -1ull);
-	TEST_ASSERT_EQ(state->flags, 0);
-
-	free(state);
-}
-
-int main(int argc, char *argv[])
-{
-	struct kvm_vm *vm;
-	struct kvm_nested_state state;
-	struct kvm_vcpu *vcpu;
-
-	have_evmcs = kvm_check_cap(KVM_CAP_HYPERV_ENLIGHTENED_VMCS);
-
-	TEST_REQUIRE(kvm_has_cap(KVM_CAP_NESTED_STATE));
-
-	/*
-	 * AMD currently does not implement set_nested_state, so for now we
-	 * just early out.
-	 */
-	TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX));
-
-	vm = vm_create_with_one_vcpu(&vcpu, NULL);
-
-	/*
-	 * First run tests with VMX disabled to check error handling.
-	 */
-	vcpu_clear_cpuid_feature(vcpu, X86_FEATURE_VMX);
-
-	/* Passing a NULL kvm_nested_state causes a EFAULT. */
-	test_nested_state_expect_efault(vcpu, NULL);
-
-	/* 'size' cannot be smaller than sizeof(kvm_nested_state). */
-	set_default_state(&state);
-	state.size = 0;
-	test_nested_state_expect_einval(vcpu, &state);
-
-	/*
-	 * Setting the flags 0xf fails the flags check.  The only flags that
-	 * can be used are:
-	 *     KVM_STATE_NESTED_GUEST_MODE
-	 *     KVM_STATE_NESTED_RUN_PENDING
-	 *     KVM_STATE_NESTED_EVMCS
-	 */
-	set_default_state(&state);
-	state.flags = 0xf;
-	test_nested_state_expect_einval(vcpu, &state);
-
-	/*
-	 * If KVM_STATE_NESTED_RUN_PENDING is set then
-	 * KVM_STATE_NESTED_GUEST_MODE has to be set as well.
-	 */
-	set_default_state(&state);
-	state.flags = KVM_STATE_NESTED_RUN_PENDING;
-	test_nested_state_expect_einval(vcpu, &state);
-
-	test_vmx_nested_state(vcpu);
-
-	kvm_vm_free(vm);
-	return 0;
-}
-- 
cgit v1.2.3


From 58e3e5265484a1bf39569903630a45a924621aaa Mon Sep 17 00:00:00 2001
From: Shengming Hu <hu.shengming@zte.com.cn>
Date: Mon, 29 Dec 2025 21:52:27 +0800
Subject: memblock: drop redundant 'struct page *' argument from
 memblock_free_pages()

memblock_free_pages() currently takes both a struct page * and the
corresponding PFN. The page pointer is always derived from the PFN at
call sites (pfn_to_page(pfn)), making the parameter redundant and also
allowing accidental mismatches between the two arguments.

Simplify the interface by removing the struct page * argument and
deriving the page locally from the PFN, after the deferred struct page
initialization check. This keeps the behavior unchanged while making
the helper harder to misuse.

Signed-off-by: Shengming Hu <hu.shengming@zte.com.cn>
Reviewed-by: David Hildenbrand (Red Hat) <david@kernel.org>
Link: https://patch.msgid.link/tencent_F741CE6ECC49EE099736685E60C0DBD4A209@qq.com
Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
---
 mm/internal.h                     | 3 +--
 mm/memblock.c                     | 4 ++--
 mm/mm_init.c                      | 5 +++--
 tools/testing/memblock/internal.h | 3 +--
 4 files changed, 7 insertions(+), 8 deletions(-)

(limited to 'tools')

diff --git a/mm/internal.h b/mm/internal.h
index e430da900430..5f93ee1459d9 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -742,8 +742,7 @@ static inline void clear_zone_contiguous(struct zone *zone)
 extern int __isolate_free_page(struct page *page, unsigned int order);
 extern void __putback_isolated_page(struct page *page, unsigned int order,
 				    int mt);
-extern void memblock_free_pages(struct page *page, unsigned long pfn,
-					unsigned int order);
+extern void memblock_free_pages(unsigned long pfn, unsigned int order);
 extern void __free_pages_core(struct page *page, unsigned int order,
 		enum meminit_context context);
 
diff --git a/mm/memblock.c b/mm/memblock.c
index 905d06b16348..6e11f81c4870 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1771,7 +1771,7 @@ void __init memblock_free_late(phys_addr_t base, phys_addr_t size)
 	end = PFN_DOWN(base + size);
 
 	for (; cursor < end; cursor++) {
-		memblock_free_pages(pfn_to_page(cursor), cursor, 0);
+		memblock_free_pages(cursor, 0);
 		totalram_pages_inc();
 	}
 }
@@ -2216,7 +2216,7 @@ static void __init __free_pages_memory(unsigned long start, unsigned long end)
 		while (start + (1UL << order) > end)
 			order--;
 
-		memblock_free_pages(pfn_to_page(start), start, order);
+		memblock_free_pages(start, order);
 
 		start += (1UL << order);
 	}
diff --git a/mm/mm_init.c b/mm/mm_init.c
index fc2a6f1e518f..d5b91602ff2a 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -2480,9 +2480,10 @@ void *__init alloc_large_system_hash(const char *tablename,
 	return table;
 }
 
-void __init memblock_free_pages(struct page *page, unsigned long pfn,
-							unsigned int order)
+void __init memblock_free_pages(unsigned long pfn, unsigned int order)
 {
+	struct page *page = pfn_to_page(pfn);
+
 	if (IS_ENABLED(CONFIG_DEFERRED_STRUCT_PAGE_INIT)) {
 		int nid = early_pfn_to_nid(pfn);
 
diff --git a/tools/testing/memblock/internal.h b/tools/testing/memblock/internal.h
index 0ab4b53bb4f3..009b97bbdd22 100644
--- a/tools/testing/memblock/internal.h
+++ b/tools/testing/memblock/internal.h
@@ -15,8 +15,7 @@ bool mirrored_kernelcore = false;
 
 struct page {};
 
-void memblock_free_pages(struct page *page, unsigned long pfn,
-			 unsigned int order)
+void memblock_free_pages(unsigned long pfn, unsigned int order)
 {
 }
 
-- 
cgit v1.2.3


From 736a2dcfdae72483a36793bc92182f33bd61d30e Mon Sep 17 00:00:00 2001
From: "Borislav Petkov (AMD)" <bp@alien8.de>
Date: Tue, 30 Dec 2025 12:07:31 +0100
Subject: x86/CPU/AMD: Simplify the spectral chicken fix

msr_set_bit() takes a bit number to set but MSR_ZEN2_SPECTRAL_CHICKEN_BIT
is a bit mask. The usual pattern that code uses is a _BIT-named type
macro instead of a mask.

So convert it to a bit number to reflect that.

Also, msr_set_bit() already does the reading and checking whether the
bit needs to be set so use that instead of a local variable.

Fixup tabbing while at it.

No functional changes.

Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Reviewed-by: Nikolay Borisov <nik.borisov@suse.com>
Link: https://patch.msgid.link/20251230110731.28108-1-bp@kernel.org
---
 arch/x86/include/asm/msr-index.h       |  4 ++--
 arch/x86/kernel/cpu/amd.c              | 10 ++--------
 tools/arch/x86/include/asm/msr-index.h |  4 ++--
 3 files changed, 6 insertions(+), 12 deletions(-)

(limited to 'tools')

diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 3d0a0950d20a..43adc38d31d5 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -794,8 +794,8 @@
 #define MSR_F19H_UMC_PERF_CTR           0xc0010801
 
 /* Zen 2 */
-#define MSR_ZEN2_SPECTRAL_CHICKEN       0xc00110e3
-#define MSR_ZEN2_SPECTRAL_CHICKEN_BIT   BIT_ULL(1)
+#define MSR_ZEN2_SPECTRAL_CHICKEN	0xc00110e3
+#define MSR_ZEN2_SPECTRAL_CHICKEN_BIT	1
 
 /* Fam 17h MSRs */
 #define MSR_F17H_IRPERF			0xc00000e9
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index bc94ff1e250a..ab9158c94f8c 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -900,20 +900,14 @@ static void fix_erratum_1386(struct cpuinfo_x86 *c)
 void init_spectral_chicken(struct cpuinfo_x86 *c)
 {
 #ifdef CONFIG_MITIGATION_UNRET_ENTRY
-	u64 value;
-
 	/*
 	 * On Zen2 we offer this chicken (bit) on the altar of Speculation.
 	 *
 	 * This suppresses speculation from the middle of a basic block, i.e. it
 	 * suppresses non-branch predictions.
 	 */
-	if (!cpu_has(c, X86_FEATURE_HYPERVISOR)) {
-		if (!rdmsrq_safe(MSR_ZEN2_SPECTRAL_CHICKEN, &value)) {
-			value |= MSR_ZEN2_SPECTRAL_CHICKEN_BIT;
-			wrmsrq_safe(MSR_ZEN2_SPECTRAL_CHICKEN, value);
-		}
-	}
+	if (!cpu_has(c, X86_FEATURE_HYPERVISOR))
+		msr_set_bit(MSR_ZEN2_SPECTRAL_CHICKEN, MSR_ZEN2_SPECTRAL_CHICKEN_BIT);
 #endif
 }
 
diff --git a/tools/arch/x86/include/asm/msr-index.h b/tools/arch/x86/include/asm/msr-index.h
index 9e1720d73244..d4137a302793 100644
--- a/tools/arch/x86/include/asm/msr-index.h
+++ b/tools/arch/x86/include/asm/msr-index.h
@@ -770,8 +770,8 @@
 #define MSR_F19H_UMC_PERF_CTR           0xc0010801
 
 /* Zen 2 */
-#define MSR_ZEN2_SPECTRAL_CHICKEN       0xc00110e3
-#define MSR_ZEN2_SPECTRAL_CHICKEN_BIT   BIT_ULL(1)
+#define MSR_ZEN2_SPECTRAL_CHICKEN	0xc00110e3
+#define MSR_ZEN2_SPECTRAL_CHICKEN_BIT	1
 
 /* Fam 17h MSRs */
 #define MSR_F17H_IRPERF			0xc00000e9
-- 
cgit v1.2.3


From 15e8d739fda1084d81f7d3813e9600eba6e0f134 Mon Sep 17 00:00:00 2001
From: Günther Noack <gnoack3000@gmail.com>
Date: Thu, 1 Jan 2026 14:40:58 +0100
Subject: selftests/landlock: Properly close a file descriptor
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a missing close(srv_fd) call, and use EXPECT_EQ() to check the
result.

Signed-off-by: Günther Noack <gnoack3000@gmail.com>
Fixes: f83d51a5bdfe ("selftests/landlock: Check IOCTL restrictions for named UNIX domain sockets")
Link: https://lore.kernel.org/r/20260101134102.25938-2-gnoack3000@gmail.com
[mic: Use EXPECT_EQ() and update commit message]
Signed-off-by: Mickaël Salaün <mic@digikod.net>
---
 tools/testing/selftests/landlock/fs_test.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/landlock/fs_test.c b/tools/testing/selftests/landlock/fs_test.c
index 37a5a3df712e..968a91c927a4 100644
--- a/tools/testing/selftests/landlock/fs_test.c
+++ b/tools/testing/selftests/landlock/fs_test.c
@@ -4399,7 +4399,8 @@ TEST_F_FORK(layout1, named_unix_domain_socket_ioctl)
 	/* FIONREAD and other IOCTLs should not be forbidden. */
 	EXPECT_EQ(0, test_fionread_ioctl(cli_fd));
 
-	ASSERT_EQ(0, close(cli_fd));
+	EXPECT_EQ(0, close(cli_fd));
+	EXPECT_EQ(0, close(srv_fd));
 }
 
 /* clang-format off */
-- 
cgit v1.2.3


From 37488ae6ceff9c912dab1a7b2217c563b43f99d2 Mon Sep 17 00:00:00 2001
From: Donald Hunter <donald.hunter@gmail.com>
Date: Thu, 8 Jan 2026 16:13:27 +0000
Subject: tools: ynl: pylint suppressions and docstrings

Add some docstrings and suppress all the pylint warnings that won't get
fixed yet:

- no-name-in-module,wrong-import-position
- too-many-locals
- too-many-branches
- too-many-statements
- too-many-nested-blocks
- too-many-instance-attributes
- too-many-arguments
- too-many-positional-arguments
- too-few-public-methods
- missing-class-docstring
- missing-function-docstring

Signed-off-by: Donald Hunter <donald.hunter@gmail.com>
Link: https://patch.msgid.link/20260108161339.29166-2-donald.hunter@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/net/ynl/pyynl/cli.py          | 17 +++++++++++++++++
 tools/net/ynl/pyynl/ethtool.py      |  1 +
 tools/net/ynl/pyynl/lib/__init__.py |  2 ++
 tools/net/ynl/pyynl/lib/nlspec.py   |  7 +++++++
 tools/net/ynl/pyynl/lib/ynl.py      | 18 ++++++++++++++++++
 5 files changed, 45 insertions(+)

(limited to 'tools')

diff --git a/tools/net/ynl/pyynl/cli.py b/tools/net/ynl/pyynl/cli.py
index af02a5b7e5a2..996c76be1403 100755
--- a/tools/net/ynl/pyynl/cli.py
+++ b/tools/net/ynl/pyynl/cli.py
@@ -1,6 +1,10 @@
 #!/usr/bin/env python3
 # SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
 
+"""
+YNL cli tool
+"""
+
 import argparse
 import json
 import os
@@ -9,6 +13,7 @@ import pprint
 import sys
 import textwrap
 
+# pylint: disable=no-name-in-module,wrong-import-position
 sys.path.append(pathlib.Path(__file__).resolve().parent.as_posix())
 from lib import YnlFamily, Netlink, NlError, SpecFamily
 
@@ -16,6 +21,10 @@ sys_schema_dir='/usr/share/ynl'
 relative_schema_dir='../../../../Documentation/netlink'
 
 def schema_dir():
+    """
+    Return the effective schema directory, preferring in-tree before
+    system schema directory.
+    """
     script_dir = os.path.dirname(os.path.abspath(__file__))
     schema_dir = os.path.abspath(f"{script_dir}/{relative_schema_dir}")
     if not os.path.isdir(schema_dir):
@@ -25,6 +34,10 @@ def schema_dir():
     return schema_dir
 
 def spec_dir():
+    """
+    Return the effective spec directory, relative to the effective
+    schema directory.
+    """
     spec_dir = schema_dir() + '/specs'
     if not os.path.isdir(spec_dir):
         raise Exception(f"Spec directory {spec_dir} does not exist")
@@ -32,6 +45,7 @@ def spec_dir():
 
 
 class YnlEncoder(json.JSONEncoder):
+    """A custom encoder for emitting JSON with ynl-specific instance types"""
     def default(self, obj):
         if isinstance(obj, bytes):
             return bytes.hex(obj)
@@ -94,7 +108,10 @@ def print_mode_attrs(ynl, mode, mode_spec, attr_set, print_request=True):
         print_attr_list(ynl, mode_spec['attributes'], attr_set)
 
 
+# pylint: disable=too-many-locals,too-many-branches,too-many-statements
 def main():
+    """YNL cli tool"""
+
     description = """
     YNL CLI utility - a general purpose netlink utility that uses YAML
     specs to drive protocol encoding and decoding.
diff --git a/tools/net/ynl/pyynl/ethtool.py b/tools/net/ynl/pyynl/ethtool.py
index fd0f6b8d54d1..40a8ba8d296f 100755
--- a/tools/net/ynl/pyynl/ethtool.py
+++ b/tools/net/ynl/pyynl/ethtool.py
@@ -8,6 +8,7 @@ import sys
 import re
 import os
 
+# pylint: disable=no-name-in-module,wrong-import-position
 sys.path.append(pathlib.Path(__file__).resolve().parent.as_posix())
 from lib import YnlFamily
 from cli import schema_dir, spec_dir
diff --git a/tools/net/ynl/pyynl/lib/__init__.py b/tools/net/ynl/pyynl/lib/__init__.py
index ec9ea00071be..c40dd788fe8a 100644
--- a/tools/net/ynl/pyynl/lib/__init__.py
+++ b/tools/net/ynl/pyynl/lib/__init__.py
@@ -1,5 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
 
+""" YNL library """
+
 from .nlspec import SpecAttr, SpecAttrSet, SpecEnumEntry, SpecEnumSet, \
     SpecFamily, SpecOperation, SpecSubMessage, SpecSubMessageFormat
 from .ynl import YnlFamily, Netlink, NlError
diff --git a/tools/net/ynl/pyynl/lib/nlspec.py b/tools/net/ynl/pyynl/lib/nlspec.py
index 85c17fe01e35..2ffeccf0b99b 100644
--- a/tools/net/ynl/pyynl/lib/nlspec.py
+++ b/tools/net/ynl/pyynl/lib/nlspec.py
@@ -1,4 +1,11 @@
 # SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
+#
+# pylint: disable=missing-function-docstring, too-many-instance-attributes, too-many-branches
+
+"""
+The nlspec is a python library for parsing and using YNL netlink
+specifications.
+"""
 
 import collections
 import importlib
diff --git a/tools/net/ynl/pyynl/lib/ynl.py b/tools/net/ynl/pyynl/lib/ynl.py
index 36d36eb7e3b8..27169ff8dafc 100644
--- a/tools/net/ynl/pyynl/lib/ynl.py
+++ b/tools/net/ynl/pyynl/lib/ynl.py
@@ -1,4 +1,14 @@
 # SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
+#
+# pylint: disable=missing-class-docstring, missing-function-docstring
+# pylint: disable=too-many-branches, too-many-locals, too-many-instance-attributes
+# pylint: disable=too-many-lines
+
+"""
+YAML Netlink Library
+
+An implementation of the genetlink and raw netlink protocols.
+"""
 
 from collections import namedtuple
 from enum import Enum
@@ -22,6 +32,7 @@ from .nlspec import SpecFamily
 #
 
 
+# pylint: disable=too-few-public-methods
 class Netlink:
     # Netlink socket
     SOL_NETLINK = 270
@@ -289,6 +300,7 @@ class NlMsg:
         return msg
 
 
+# pylint: disable=too-few-public-methods
 class NlMsgs:
     def __init__(self, data):
         self.msgs = []
@@ -319,6 +331,7 @@ def _genl_msg_finalize(msg):
     return struct.pack("I", len(msg) + 4) + msg
 
 
+# pylint: disable=too-many-nested-blocks
 def _genl_load_families():
     with socket.socket(socket.AF_NETLINK, socket.SOCK_RAW, Netlink.NETLINK_GENERIC) as sock:
         sock.setsockopt(Netlink.SOL_NETLINK, Netlink.NETLINK_CAP_ACK, 1)
@@ -447,6 +460,7 @@ class GenlProtocol(NetlinkProtocol):
         return super().msghdr_size() + 4
 
 
+# pylint: disable=too-few-public-methods
 class SpaceAttrs:
     SpecValuesPair = namedtuple('SpecValuesPair', ['spec', 'values'])
 
@@ -555,6 +569,7 @@ class YnlFamily(SpecFamily):
                 return self._from_string(value, attr_spec)
             raise e
 
+    # pylint: disable=too-many-statements
     def _add_attr(self, space, name, value, search_attrs):
         try:
             attr = self.attr_sets[space][name]
@@ -778,6 +793,7 @@ class YnlFamily(SpecFamily):
                 raise Exception(f"Unknown attribute-set '{msg_format.attr_set}' when decoding '{attr_spec.name}'")
         return decoded
 
+    # pylint: disable=too-many-statements
     def _decode(self, attrs, space, outer_attrs = None):
         rsp = dict()
         if space:
@@ -838,6 +854,7 @@ class YnlFamily(SpecFamily):
 
         return rsp
 
+    # pylint: disable=too-many-arguments, too-many-positional-arguments
     def _decode_extack_path(self, attrs, attr_set, offset, target, search_attrs):
         for attr in attrs:
             try:
@@ -1081,6 +1098,7 @@ class YnlFamily(SpecFamily):
         msg = _genl_msg_finalize(msg)
         return msg
 
+    # pylint: disable=too-many-statements
     def _ops(self, ops):
         reqs_by_seq = {}
         req_seq = random.randint(1024, 65535)
-- 
cgit v1.2.3


From bcdd8ea73f750a4a6c38859a3f06027aa40b84c5 Mon Sep 17 00:00:00 2001
From: Donald Hunter <donald.hunter@gmail.com>
Date: Thu, 8 Jan 2026 16:13:28 +0000
Subject: tools: ynl: fix pylint redefinition, encoding errors

Fix pylint warnings for:

- invalid-name
- arguments-renamed
- redefined-outer-name
- unspecified-encoding
- consider-using-sys-exit

Signed-off-by: Donald Hunter <donald.hunter@gmail.com>
Link: https://patch.msgid.link/20260108161339.29166-3-donald.hunter@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/net/ynl/pyynl/cli.py        | 44 +++++++++++++++----------------
 tools/net/ynl/pyynl/lib/nlspec.py | 18 ++++++-------
 tools/net/ynl/pyynl/lib/ynl.py    | 54 +++++++++++++++++++--------------------
 3 files changed, 58 insertions(+), 58 deletions(-)

(limited to 'tools')

diff --git a/tools/net/ynl/pyynl/cli.py b/tools/net/ynl/pyynl/cli.py
index 996c76be1403..37efa8c4f0e2 100755
--- a/tools/net/ynl/pyynl/cli.py
+++ b/tools/net/ynl/pyynl/cli.py
@@ -17,8 +17,8 @@ import textwrap
 sys.path.append(pathlib.Path(__file__).resolve().parent.as_posix())
 from lib import YnlFamily, Netlink, NlError, SpecFamily
 
-sys_schema_dir='/usr/share/ynl'
-relative_schema_dir='../../../../Documentation/netlink'
+SYS_SCHEMA_DIR='/usr/share/ynl'
+RELATIVE_SCHEMA_DIR='../../../../Documentation/netlink'
 
 def schema_dir():
     """
@@ -26,32 +26,32 @@ def schema_dir():
     system schema directory.
     """
     script_dir = os.path.dirname(os.path.abspath(__file__))
-    schema_dir = os.path.abspath(f"{script_dir}/{relative_schema_dir}")
-    if not os.path.isdir(schema_dir):
-        schema_dir = sys_schema_dir
-    if not os.path.isdir(schema_dir):
-        raise Exception(f"Schema directory {schema_dir} does not exist")
-    return schema_dir
+    schema_dir_ = os.path.abspath(f"{script_dir}/{RELATIVE_SCHEMA_DIR}")
+    if not os.path.isdir(schema_dir_):
+        schema_dir_ = SYS_SCHEMA_DIR
+    if not os.path.isdir(schema_dir_):
+        raise Exception(f"Schema directory {schema_dir_} does not exist")
+    return schema_dir_
 
 def spec_dir():
     """
     Return the effective spec directory, relative to the effective
     schema directory.
     """
-    spec_dir = schema_dir() + '/specs'
-    if not os.path.isdir(spec_dir):
-        raise Exception(f"Spec directory {spec_dir} does not exist")
-    return spec_dir
+    spec_dir_ = schema_dir() + '/specs'
+    if not os.path.isdir(spec_dir_):
+        raise Exception(f"Spec directory {spec_dir_} does not exist")
+    return spec_dir_
 
 
 class YnlEncoder(json.JSONEncoder):
     """A custom encoder for emitting JSON with ynl-specific instance types"""
-    def default(self, obj):
-        if isinstance(obj, bytes):
-            return bytes.hex(obj)
-        if isinstance(obj, set):
-            return list(obj)
-        return json.JSONEncoder.default(self, obj)
+    def default(self, o):
+        if isinstance(o, bytes):
+            return bytes.hex(o)
+        if isinstance(o, set):
+            return list(o)
+        return json.JSONEncoder.default(self, o)
 
 
 def print_attr_list(ynl, attr_names, attr_set, indent=2):
@@ -196,11 +196,11 @@ def main():
             SpecFamily(spec, args.schema)
         except Exception as error:
             print(error)
-            exit(1)
+            sys.exit(1)
         return
 
     if args.family: # set behaviour when using installed specs
-        if args.schema is None and spec.startswith(sys_schema_dir):
+        if args.schema is None and spec.startswith(SYS_SCHEMA_DIR):
             args.schema = '' # disable schema validation when installed
         if args.process_unknown is None:
             args.process_unknown = True
@@ -224,7 +224,7 @@ def main():
         op = ynl.msgs.get(args.list_attrs)
         if not op:
             print(f'Operation {args.list_attrs} not found')
-            exit(1)
+            sys.exit(1)
 
         print(f'Operation: {op.name}')
         print(op.yaml['doc'])
@@ -259,7 +259,7 @@ def main():
                 output(msg)
     except NlError as e:
         print(e)
-        exit(1)
+        sys.exit(1)
     except KeyboardInterrupt:
         pass
     except BrokenPipeError:
diff --git a/tools/net/ynl/pyynl/lib/nlspec.py b/tools/net/ynl/pyynl/lib/nlspec.py
index 2ffeccf0b99b..c3113952c417 100644
--- a/tools/net/ynl/pyynl/lib/nlspec.py
+++ b/tools/net/ynl/pyynl/lib/nlspec.py
@@ -10,7 +10,7 @@ specifications.
 import collections
 import importlib
 import os
-import yaml
+import yaml as pyyaml
 
 
 # To be loaded dynamically as needed
@@ -313,11 +313,11 @@ class SpecSubMessage(SpecElement):
 
         self.formats = collections.OrderedDict()
         for elem in self.yaml['formats']:
-            format = self.new_format(family, elem)
-            self.formats[format.value] = format
+            msg_format = self.new_format(family, elem)
+            self.formats[msg_format.value] = msg_format
 
-    def new_format(self, family, format):
-        return SpecSubMessageFormat(family, format)
+    def new_format(self, family, msg_format):
+        return SpecSubMessageFormat(family, msg_format)
 
 
 class SpecSubMessageFormat(SpecElement):
@@ -436,7 +436,7 @@ class SpecFamily(SpecElement):
         kernel_family   dict of kernel family attributes
     """
     def __init__(self, spec_path, schema_path=None, exclude_ops=None):
-        with open(spec_path, "r") as stream:
+        with open(spec_path, "r", encoding='utf-8') as stream:
             prefix = '# SPDX-License-Identifier: '
             first = stream.readline().strip()
             if not first.startswith(prefix):
@@ -444,7 +444,7 @@ class SpecFamily(SpecElement):
             self.license = first[len(prefix):]
 
             stream.seek(0)
-            spec = yaml.safe_load(stream)
+            spec = pyyaml.safe_load(stream)
 
         self._resolution_list = []
 
@@ -460,8 +460,8 @@ class SpecFamily(SpecElement):
         if schema_path:
             global jsonschema
 
-            with open(schema_path, "r") as stream:
-                schema = yaml.safe_load(stream)
+            with open(schema_path, "r", encoding='utf-8') as stream:
+                schema = pyyaml.safe_load(stream)
 
             if jsonschema is None:
                 jsonschema = importlib.import_module("jsonschema")
diff --git a/tools/net/ynl/pyynl/lib/ynl.py b/tools/net/ynl/pyynl/lib/ynl.py
index 27169ff8dafc..78579e495351 100644
--- a/tools/net/ynl/pyynl/lib/ynl.py
+++ b/tools/net/ynl/pyynl/lib/ynl.py
@@ -155,22 +155,22 @@ class NlAttr:
 
     @classmethod
     def get_format(cls, attr_type, byte_order=None):
-        format = cls.type_formats[attr_type]
+        format_ = cls.type_formats[attr_type]
         if byte_order:
-            return format.big if byte_order == "big-endian" \
-                else format.little
-        return format.native
+            return format_.big if byte_order == "big-endian" \
+                else format_.little
+        return format_.native
 
     def as_scalar(self, attr_type, byte_order=None):
-        format = self.get_format(attr_type, byte_order)
-        return format.unpack(self.raw)[0]
+        format_ = self.get_format(attr_type, byte_order)
+        return format_.unpack(self.raw)[0]
 
     def as_auto_scalar(self, attr_type, byte_order=None):
         if len(self.raw) != 4 and len(self.raw) != 8:
             raise Exception(f"Auto-scalar len payload be 4 or 8 bytes, got {len(self.raw)}")
         real_type = attr_type[0] + str(len(self.raw) * 8)
-        format = self.get_format(real_type, byte_order)
-        return format.unpack(self.raw)[0]
+        format_ = self.get_format(real_type, byte_order)
+        return format_.unpack(self.raw)[0]
 
     def as_strz(self):
         return self.raw.decode('ascii')[:-1]
@@ -178,9 +178,9 @@ class NlAttr:
     def as_bin(self):
         return self.raw
 
-    def as_c_array(self, type):
-        format = self.get_format(type)
-        return [ x[0] for x in format.iter_unpack(self.raw) ]
+    def as_c_array(self, c_type):
+        format_ = self.get_format(c_type)
+        return [ x[0] for x in format_.iter_unpack(self.raw) ]
 
     def __repr__(self):
         return f"[type:{self.type} len:{self._len}] {self.raw}"
@@ -256,8 +256,8 @@ class NlMsg:
         policy = {}
         for attr in NlAttrs(raw):
             if attr.type == Netlink.NL_POLICY_TYPE_ATTR_TYPE:
-                type = attr.as_scalar('u32')
-                policy['type'] = Netlink.AttrType(type).name
+                type_ = attr.as_scalar('u32')
+                policy['type'] = Netlink.AttrType(type_).name
             elif attr.type == Netlink.NL_POLICY_TYPE_ATTR_MIN_VALUE_S:
                 policy['min-value'] = attr.as_scalar('s64')
             elif attr.type == Netlink.NL_POLICY_TYPE_ATTR_MAX_VALUE_S:
@@ -612,8 +612,8 @@ class YnlFamily(SpecFamily):
             elif isinstance(value, dict) and attr.struct_name:
                 attr_payload = self._encode_struct(attr.struct_name, value)
             elif isinstance(value, list) and attr.sub_type in NlAttr.type_formats:
-                format = NlAttr.get_format(attr.sub_type)
-                attr_payload = b''.join([format.pack(x) for x in value])
+                format_ = NlAttr.get_format(attr.sub_type)
+                attr_payload = b''.join([format_.pack(x) for x in value])
             else:
                 raise Exception(f'Unknown type for binary attribute, value: {value}')
         elif attr['type'] in NlAttr.type_formats or attr.is_auto_scalar:
@@ -622,8 +622,8 @@ class YnlFamily(SpecFamily):
                 attr_type = attr["type"][0] + ('32' if scalar.bit_length() <= 32 else '64')
             else:
                 attr_type = attr["type"]
-            format = NlAttr.get_format(attr_type, attr.byte_order)
-            attr_payload = format.pack(scalar)
+            format_ = NlAttr.get_format(attr_type, attr.byte_order)
+            attr_payload = format_.pack(scalar)
         elif attr['type'] in "bitfield32":
             scalar_value = self._get_scalar(attr, value["value"])
             scalar_selector = self._get_scalar(attr, value["selector"])
@@ -915,8 +915,8 @@ class YnlFamily(SpecFamily):
                     else:
                         size += m.len
                 else:
-                    format = NlAttr.get_format(m.type, m.byte_order)
-                    size += format.size
+                    format_ = NlAttr.get_format(m.type, m.byte_order)
+                    size += format_.size
             return size
         else:
             return 0
@@ -931,17 +931,17 @@ class YnlFamily(SpecFamily):
                 offset += m.len
             elif m.type == 'binary':
                 if m.struct:
-                    len = self._struct_size(m.struct)
-                    value = self._decode_struct(data[offset : offset + len],
+                    len_ = self._struct_size(m.struct)
+                    value = self._decode_struct(data[offset : offset + len_],
                                                 m.struct)
-                    offset += len
+                    offset += len_
                 else:
                     value = data[offset : offset + m.len]
                     offset += m.len
             else:
-                format = NlAttr.get_format(m.type, m.byte_order)
-                [ value ] = format.unpack_from(data, offset)
-                offset += format.size
+                format_ = NlAttr.get_format(m.type, m.byte_order)
+                [ value ] = format_.unpack_from(data, offset)
+                offset += format_.size
             if value is not None:
                 if m.enum:
                     value = self._decode_enum(value, m)
@@ -970,8 +970,8 @@ class YnlFamily(SpecFamily):
             else:
                 if value is None:
                     value = 0
-                format = NlAttr.get_format(m.type, m.byte_order)
-                attr_payload += format.pack(value)
+                format_ = NlAttr.get_format(m.type, m.byte_order)
+                attr_payload += format_.pack(value)
         return attr_payload
 
     def _formatted_string(self, raw, display_hint):
-- 
cgit v1.2.3


From b6270a10b0f8727fea3005eecd9907ddaf38dc3f Mon Sep 17 00:00:00 2001
From: Donald Hunter <donald.hunter@gmail.com>
Date: Thu, 8 Jan 2026 16:13:29 +0000
Subject: tools: ynl: fix pylint exception warnings

Fix pylint warnings for:

- broad-exception-raised
- broad-exception-caught
- raise-missing-from

Signed-off-by: Donald Hunter <donald.hunter@gmail.com>
Link: https://patch.msgid.link/20260108161339.29166-4-donald.hunter@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/net/ynl/pyynl/cli.py          | 10 +++----
 tools/net/ynl/pyynl/lib/__init__.py |  8 +++--
 tools/net/ynl/pyynl/lib/nlspec.py   | 11 +++++--
 tools/net/ynl/pyynl/lib/ynl.py      | 59 +++++++++++++++++++++----------------
 4 files changed, 52 insertions(+), 36 deletions(-)

(limited to 'tools')

diff --git a/tools/net/ynl/pyynl/cli.py b/tools/net/ynl/pyynl/cli.py
index 37efa8c4f0e2..5fee45e48bbf 100755
--- a/tools/net/ynl/pyynl/cli.py
+++ b/tools/net/ynl/pyynl/cli.py
@@ -15,7 +15,7 @@ import textwrap
 
 # pylint: disable=no-name-in-module,wrong-import-position
 sys.path.append(pathlib.Path(__file__).resolve().parent.as_posix())
-from lib import YnlFamily, Netlink, NlError, SpecFamily
+from lib import YnlFamily, Netlink, NlError, SpecFamily, SpecException, YnlException
 
 SYS_SCHEMA_DIR='/usr/share/ynl'
 RELATIVE_SCHEMA_DIR='../../../../Documentation/netlink'
@@ -30,7 +30,7 @@ def schema_dir():
     if not os.path.isdir(schema_dir_):
         schema_dir_ = SYS_SCHEMA_DIR
     if not os.path.isdir(schema_dir_):
-        raise Exception(f"Schema directory {schema_dir_} does not exist")
+        raise YnlException(f"Schema directory {schema_dir_} does not exist")
     return schema_dir_
 
 def spec_dir():
@@ -40,7 +40,7 @@ def spec_dir():
     """
     spec_dir_ = schema_dir() + '/specs'
     if not os.path.isdir(spec_dir_):
-        raise Exception(f"Spec directory {spec_dir_} does not exist")
+        raise YnlException(f"Spec directory {spec_dir_} does not exist")
     return spec_dir_
 
 
@@ -189,12 +189,12 @@ def main():
     else:
         spec = args.spec
     if not os.path.isfile(spec):
-        raise Exception(f"Spec file {spec} does not exist")
+        raise YnlException(f"Spec file {spec} does not exist")
 
     if args.validate:
         try:
             SpecFamily(spec, args.schema)
-        except Exception as error:
+        except SpecException as error:
             print(error)
             sys.exit(1)
         return
diff --git a/tools/net/ynl/pyynl/lib/__init__.py b/tools/net/ynl/pyynl/lib/__init__.py
index c40dd788fe8a..33a96155fb3b 100644
--- a/tools/net/ynl/pyynl/lib/__init__.py
+++ b/tools/net/ynl/pyynl/lib/__init__.py
@@ -3,11 +3,13 @@
 """ YNL library """
 
 from .nlspec import SpecAttr, SpecAttrSet, SpecEnumEntry, SpecEnumSet, \
-    SpecFamily, SpecOperation, SpecSubMessage, SpecSubMessageFormat
-from .ynl import YnlFamily, Netlink, NlError
+    SpecFamily, SpecOperation, SpecSubMessage, SpecSubMessageFormat, \
+    SpecException
+from .ynl import YnlFamily, Netlink, NlError, YnlException
 
 from .doc_generator import YnlDocGenerator
 
 __all__ = ["SpecAttr", "SpecAttrSet", "SpecEnumEntry", "SpecEnumSet",
            "SpecFamily", "SpecOperation", "SpecSubMessage", "SpecSubMessageFormat",
-           "YnlFamily", "Netlink", "NlError", "YnlDocGenerator"]
+           "SpecException",
+           "YnlFamily", "Netlink", "NlError", "YnlDocGenerator", "YnlException"]
diff --git a/tools/net/ynl/pyynl/lib/nlspec.py b/tools/net/ynl/pyynl/lib/nlspec.py
index c3113952c417..a35f827f09e3 100644
--- a/tools/net/ynl/pyynl/lib/nlspec.py
+++ b/tools/net/ynl/pyynl/lib/nlspec.py
@@ -17,6 +17,11 @@ import yaml as pyyaml
 jsonschema = None
 
 
+class SpecException(Exception):
+    """Netlink spec exception.
+    """
+
+
 class SpecElement:
     """Netlink spec element.
 
@@ -385,7 +390,7 @@ class SpecOperation(SpecElement):
         elif self.is_resv:
             attr_set_name = ''
         else:
-            raise Exception(f"Can't resolve attribute set for op '{self.name}'")
+            raise SpecException(f"Can't resolve attribute set for op '{self.name}'")
         if attr_set_name:
             self.attr_set = self.family.attr_sets[attr_set_name]
 
@@ -440,7 +445,7 @@ class SpecFamily(SpecElement):
             prefix = '# SPDX-License-Identifier: '
             first = stream.readline().strip()
             if not first.startswith(prefix):
-                raise Exception('SPDX license tag required in the spec')
+                raise SpecException('SPDX license tag required in the spec')
             self.license = first[len(prefix):]
 
             stream.seek(0)
@@ -555,7 +560,7 @@ class SpecFamily(SpecElement):
                 req_val_next = req_val + 1
                 rsp_val_next = rsp_val + rsp_inc
             else:
-                raise Exception("Can't parse directional ops")
+                raise SpecException("Can't parse directional ops")
 
             if req_val == req_val_next:
                 req_val = None
diff --git a/tools/net/ynl/pyynl/lib/ynl.py b/tools/net/ynl/pyynl/lib/ynl.py
index 78579e495351..6e39618e5598 100644
--- a/tools/net/ynl/pyynl/lib/ynl.py
+++ b/tools/net/ynl/pyynl/lib/ynl.py
@@ -32,6 +32,10 @@ from .nlspec import SpecFamily
 #
 
 
+class YnlException(Exception):
+    pass
+
+
 # pylint: disable=too-few-public-methods
 class Netlink:
     # Netlink socket
@@ -167,7 +171,7 @@ class NlAttr:
 
     def as_auto_scalar(self, attr_type, byte_order=None):
         if len(self.raw) != 4 and len(self.raw) != 8:
-            raise Exception(f"Auto-scalar len payload be 4 or 8 bytes, got {len(self.raw)}")
+            raise YnlException(f"Auto-scalar len payload be 4 or 8 bytes, got {len(self.raw)}")
         real_type = attr_type[0] + str(len(self.raw) * 8)
         format_ = self.get_format(real_type, byte_order)
         return format_.unpack(self.raw)[0]
@@ -425,7 +429,7 @@ class NetlinkProtocol:
 
     def get_mcast_id(self, mcast_name, mcast_groups):
         if mcast_name not in mcast_groups:
-            raise Exception(f'Multicast group "{mcast_name}" not present in the spec')
+            raise YnlException(f'Multicast group "{mcast_name}" not present in the spec')
         return mcast_groups[mcast_name].value
 
     def msghdr_size(self):
@@ -453,7 +457,7 @@ class GenlProtocol(NetlinkProtocol):
 
     def get_mcast_id(self, mcast_name, mcast_groups):
         if mcast_name not in self.genl_family['mcast']:
-            raise Exception(f'Multicast group "{mcast_name}" not present in the family')
+            raise YnlException(f'Multicast group "{mcast_name}" not present in the family')
         return self.genl_family['mcast'][mcast_name]
 
     def msghdr_size(self):
@@ -475,9 +479,9 @@ class SpaceAttrs:
                 if name in scope.values:
                     return scope.values[name]
                 spec_name = scope.spec.yaml['name']
-                raise Exception(
+                raise YnlException(
                     f"No value for '{name}' in attribute space '{spec_name}'")
-        raise Exception(f"Attribute '{name}' not defined in any attribute-set")
+        raise YnlException(f"Attribute '{name}' not defined in any attribute-set")
 
 
 #
@@ -499,8 +503,8 @@ class YnlFamily(SpecFamily):
                                                self.yaml['protonum'])
             else:
                 self.nlproto = GenlProtocol(self.yaml['name'])
-        except KeyError:
-            raise Exception(f"Family '{self.yaml['name']}' not supported by the kernel")
+        except KeyError as err:
+            raise YnlException(f"Family '{self.yaml['name']}' not supported by the kernel") from err
 
         self._recv_dbg = False
         # Note that netlink will use conservative (min) message size for
@@ -573,8 +577,8 @@ class YnlFamily(SpecFamily):
     def _add_attr(self, space, name, value, search_attrs):
         try:
             attr = self.attr_sets[space][name]
-        except KeyError:
-            raise Exception(f"Space '{space}' has no attribute '{name}'")
+        except KeyError as err:
+            raise YnlException(f"Space '{space}' has no attribute '{name}'") from err
         nl_type = attr.value
 
         if attr.is_multi and isinstance(value, list):
@@ -615,7 +619,7 @@ class YnlFamily(SpecFamily):
                 format_ = NlAttr.get_format(attr.sub_type)
                 attr_payload = b''.join([format_.pack(x) for x in value])
             else:
-                raise Exception(f'Unknown type for binary attribute, value: {value}')
+                raise YnlException(f'Unknown type for binary attribute, value: {value}')
         elif attr['type'] in NlAttr.type_formats or attr.is_auto_scalar:
             scalar = self._get_scalar(attr, value)
             if attr.is_auto_scalar:
@@ -641,9 +645,9 @@ class YnlFamily(SpecFamily):
                         attr_payload += self._add_attr(msg_format.attr_set,
                                                        subname, subvalue, sub_attrs)
                 else:
-                    raise Exception(f"Unknown attribute-set '{msg_format.attr_set}'")
+                    raise YnlException(f"Unknown attribute-set '{msg_format.attr_set}'")
         else:
-            raise Exception(f'Unknown type at {space} {name} {value} {attr["type"]}')
+            raise YnlException(f'Unknown type at {space} {name} {value} {attr["type"]}')
 
         return self._add_attr_raw(nl_type, attr_payload)
 
@@ -730,7 +734,7 @@ class YnlFamily(SpecFamily):
                     subattr = self._formatted_string(subattr, attr_spec.display_hint)
                 decoded.append(subattr)
             else:
-                raise Exception(f'Unknown {attr_spec["sub-type"]} with name {attr_spec["name"]}')
+                raise YnlException(f'Unknown {attr_spec["sub-type"]} with name {attr_spec["name"]}')
         return decoded
 
     def _decode_nest_type_value(self, attr, attr_spec):
@@ -767,13 +771,13 @@ class YnlFamily(SpecFamily):
     def _resolve_selector(self, attr_spec, search_attrs):
         sub_msg = attr_spec.sub_message
         if sub_msg not in self.sub_msgs:
-            raise Exception(f"No sub-message spec named {sub_msg} for {attr_spec.name}")
+            raise YnlException(f"No sub-message spec named {sub_msg} for {attr_spec.name}")
         sub_msg_spec = self.sub_msgs[sub_msg]
 
         selector = attr_spec.selector
         value = search_attrs.lookup(selector)
         if value not in sub_msg_spec.formats:
-            raise Exception(f"No message format for '{value}' in sub-message spec '{sub_msg}'")
+            raise YnlException(f"No message format for '{value}' in sub-message spec '{sub_msg}'")
 
         spec = sub_msg_spec.formats[value]
         return spec, value
@@ -790,7 +794,8 @@ class YnlFamily(SpecFamily):
                 subdict = self._decode(NlAttrs(attr.raw, offset), msg_format.attr_set)
                 decoded.update(subdict)
             else:
-                raise Exception(f"Unknown attribute-set '{msg_format.attr_set}' when decoding '{attr_spec.name}'")
+                raise YnlException(f"Unknown attribute-set '{msg_format.attr_set}' "
+                                   f"when decoding '{attr_spec.name}'")
         return decoded
 
     # pylint: disable=too-many-statements
@@ -803,9 +808,10 @@ class YnlFamily(SpecFamily):
         for attr in attrs:
             try:
                 attr_spec = attr_space.attrs_by_val[attr.type]
-            except (KeyError, UnboundLocalError):
+            except (KeyError, UnboundLocalError) as err:
                 if not self.process_unknown:
-                    raise Exception(f"Space '{space}' has no attribute with value '{attr.type}'")
+                    raise YnlException(f"Space '{space}' has no attribute "
+                                       f"with value '{attr.type}'") from err
                 attr_name = f"UnknownAttr({attr.type})"
                 self._rsp_add(rsp, attr_name, None, self._decode_unknown(attr))
                 continue
@@ -844,7 +850,8 @@ class YnlFamily(SpecFamily):
                     decoded = self._decode_nest_type_value(attr, attr_spec)
                 else:
                     if not self.process_unknown:
-                        raise Exception(f'Unknown {attr_spec["type"]} with name {attr_spec["name"]}')
+                        raise YnlException(f'Unknown {attr_spec["type"]} '
+                                           f'with name {attr_spec["name"]}')
                     decoded = self._decode_unknown(attr)
 
                 self._rsp_add(rsp, attr_spec["name"], attr_spec.is_multi, decoded)
@@ -859,8 +866,9 @@ class YnlFamily(SpecFamily):
         for attr in attrs:
             try:
                 attr_spec = attr_set.attrs_by_val[attr.type]
-            except KeyError:
-                raise Exception(f"Space '{attr_set.name}' has no attribute with value '{attr.type}'")
+            except KeyError as err:
+                raise YnlException(
+                    f"Space '{attr_set.name}' has no attribute with value '{attr.type}'") from err
             if offset > target:
                 break
             if offset == target:
@@ -877,11 +885,12 @@ class YnlFamily(SpecFamily):
             elif attr_spec['type'] == 'sub-message':
                 msg_format, value = self._resolve_selector(attr_spec, search_attrs)
                 if msg_format is None:
-                    raise Exception(f"Can't resolve sub-message of {attr_spec['name']} for extack")
+                    raise YnlException(f"Can't resolve sub-message of "
+                                       f"{attr_spec['name']} for extack")
                 sub_attrs = self.attr_sets[msg_format.attr_set]
                 pathname += f"({value})"
             else:
-                raise Exception(f"Can't dive into {attr.type} ({attr_spec['name']}) for extack")
+                raise YnlException(f"Can't dive into {attr.type} ({attr_spec['name']}) for extack")
             offset += 4
             subpath = self._decode_extack_path(NlAttrs(attr.raw), sub_attrs,
                                                offset, target, search_attrs)
@@ -1008,11 +1017,11 @@ class YnlFamily(SpecFamily):
                 mac_bytes = [int(x, 16) for x in string.split(':')]
             else:
                 if len(string) % 2 != 0:
-                    raise Exception(f"Invalid MAC address format: {string}")
+                    raise YnlException(f"Invalid MAC address format: {string}")
                 mac_bytes = [int(string[i:i+2], 16) for i in range(0, len(string), 2)]
             raw = bytes(mac_bytes)
         else:
-            raise Exception(f"Display hint '{attr_spec.display_hint}' not implemented"
+            raise YnlException(f"Display hint '{attr_spec.display_hint}' not implemented"
                             f" when parsing '{attr_spec['name']}'")
         return raw
 
-- 
cgit v1.2.3


From 04b0b64e86b7ee9923099d141f8e2ed74389c435 Mon Sep 17 00:00:00 2001
From: Donald Hunter <donald.hunter@gmail.com>
Date: Thu, 8 Jan 2026 16:13:30 +0000
Subject: tools: ynl: fix pylint dict, indentation, long lines, uninitialised

Fix pylint warnings for:

- use-dict-literal
- bad-indentation
- line-too-long
- possibly-used-before-assignment

Signed-off-by: Donald Hunter <donald.hunter@gmail.com>
Link: https://patch.msgid.link/20260108161339.29166-5-donald.hunter@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/net/ynl/pyynl/lib/nlspec.py | 19 ++++++++++---------
 tools/net/ynl/pyynl/lib/ynl.py    | 37 +++++++++++++++++++------------------
 2 files changed, 29 insertions(+), 27 deletions(-)

(limited to 'tools')

diff --git a/tools/net/ynl/pyynl/lib/nlspec.py b/tools/net/ynl/pyynl/lib/nlspec.py
index a35f827f09e3..fcd4106d0cfa 100644
--- a/tools/net/ynl/pyynl/lib/nlspec.py
+++ b/tools/net/ynl/pyynl/lib/nlspec.py
@@ -129,8 +129,8 @@ class SpecEnumSet(SpecElement):
 
         prev_entry = None
         value_start = self.yaml.get('value-start', 0)
-        self.entries = dict()
-        self.entries_by_val = dict()
+        self.entries = {}
+        self.entries_by_val = {}
         for entry in self.yaml['entries']:
             e = self.new_entry(entry, prev_entry, value_start)
             self.entries[e.name] = e
@@ -451,6 +451,7 @@ class SpecFamily(SpecElement):
             stream.seek(0)
             spec = pyyaml.safe_load(stream)
 
+        self.fixed_header = None
         self._resolution_list = []
 
         super().__init__(self, spec)
@@ -579,13 +580,13 @@ class SpecFamily(SpecElement):
             self.msgs[op.name] = op
 
     def find_operation(self, name):
-      """
-      For a given operation name, find and return operation spec.
-      """
-      for op in self.yaml['operations']['list']:
-        if name == op['name']:
-          return op
-      return None
+        """
+        For a given operation name, find and return operation spec.
+        """
+        for op in self.yaml['operations']['list']:
+            if name == op['name']:
+                return op
+        return None
 
     def resolve(self):
         self.resolve_up(super())
diff --git a/tools/net/ynl/pyynl/lib/ynl.py b/tools/net/ynl/pyynl/lib/ynl.py
index 6e39618e5598..040ff3b87c17 100644
--- a/tools/net/ynl/pyynl/lib/ynl.py
+++ b/tools/net/ynl/pyynl/lib/ynl.py
@@ -235,7 +235,7 @@ class NlMsg:
 
         self.extack = None
         if self.nl_flags & Netlink.NLM_F_ACK_TLVS and extack_off:
-            self.extack = dict()
+            self.extack = {}
             extack_attrs = NlAttrs(self.raw[extack_off:])
             for extack in extack_attrs:
                 if extack.type == Netlink.NLMSGERR_ATTR_MSG:
@@ -296,7 +296,8 @@ class NlMsg:
         return self.nl_type
 
     def __repr__(self):
-        msg = f"nl_len = {self.nl_len} ({len(self.raw)}) nl_flags = 0x{self.nl_flags:x} nl_type = {self.nl_type}"
+        msg = (f"nl_len = {self.nl_len} ({len(self.raw)}) "
+               f"nl_flags = 0x{self.nl_flags:x} nl_type = {self.nl_type}")
         if self.error:
             msg += '\n\terror: ' + str(self.error)
         if self.extack:
@@ -361,7 +362,7 @@ def _genl_load_families():
                     return
 
                 gm = GenlMsg(nl_msg)
-                fam = dict()
+                fam = {}
                 for attr in NlAttrs(gm.raw):
                     if attr.type == Netlink.CTRL_ATTR_FAMILY_ID:
                         fam['id'] = attr.as_scalar('u16')
@@ -370,7 +371,7 @@ def _genl_load_families():
                     elif attr.type == Netlink.CTRL_ATTR_MAXATTR:
                         fam['maxattr'] = attr.as_scalar('u32')
                     elif attr.type == Netlink.CTRL_ATTR_MCAST_GROUPS:
-                        fam['mcast'] = dict()
+                        fam['mcast'] = {}
                         for entry in NlAttrs(attr.raw):
                             mcast_name = None
                             mcast_id = None
@@ -390,6 +391,7 @@ class GenlMsg:
         self.nl = nl_msg
         self.genl_cmd, self.genl_version, _ = struct.unpack_from("BBH", nl_msg.raw, 0)
         self.raw = nl_msg.raw[4:]
+        self.raw_attrs = []
 
     def cmd(self):
         return self.genl_cmd
@@ -560,8 +562,7 @@ class YnlFamily(SpecFamily):
             for single_value in value:
                 scalar += enum.entries[single_value].user_value(as_flags = True)
             return scalar
-        else:
-            return enum.entries[value].user_value()
+        return enum.entries[value].user_value()
 
     def _get_scalar(self, attr_spec, value):
         try:
@@ -750,8 +751,7 @@ class YnlFamily(SpecFamily):
     def _decode_unknown(self, attr):
         if attr.is_nest:
             return self._decode(NlAttrs(attr.raw), None)
-        else:
-            return attr.as_bin()
+        return attr.as_bin()
 
     def _rsp_add(self, rsp, name, is_multi, decoded):
         if is_multi is None:
@@ -800,7 +800,8 @@ class YnlFamily(SpecFamily):
 
     # pylint: disable=too-many-statements
     def _decode(self, attrs, space, outer_attrs = None):
-        rsp = dict()
+        rsp = {}
+        search_attrs = {}
         if space:
             attr_space = self.attr_sets[space]
             search_attrs = SpaceAttrs(attr_space, rsp, outer_attrs)
@@ -818,7 +819,9 @@ class YnlFamily(SpecFamily):
 
             try:
                 if attr_spec["type"] == 'nest':
-                    subdict = self._decode(NlAttrs(attr.raw), attr_spec['nested-attributes'], search_attrs)
+                    subdict = self._decode(NlAttrs(attr.raw),
+                                           attr_spec['nested-attributes'],
+                                           search_attrs)
                     decoded = subdict
                 elif attr_spec["type"] == 'string':
                     decoded = attr.as_strz()
@@ -927,12 +930,11 @@ class YnlFamily(SpecFamily):
                     format_ = NlAttr.get_format(m.type, m.byte_order)
                     size += format_.size
             return size
-        else:
-            return 0
+        return 0
 
     def _decode_struct(self, data, name):
         members = self.consts[name].members
-        attrs = dict()
+        attrs = {}
         offset = 0
         for m in members:
             value = None
@@ -969,7 +971,7 @@ class YnlFamily(SpecFamily):
             elif m.type == 'binary':
                 if m.struct:
                     if value is None:
-                        value = dict()
+                        value = {}
                     attr_payload += self._encode_struct(m.struct, value)
                 else:
                     if value is None:
@@ -1026,7 +1028,7 @@ class YnlFamily(SpecFamily):
         return raw
 
     def handle_ntf(self, decoded):
-        msg = dict()
+        msg = {}
         if self.include_raw:
             msg['raw'] = decoded
         op = self.rsp_by_value[decoded.cmd()]
@@ -1166,9 +1168,8 @@ class YnlFamily(SpecFamily):
                     if decoded.cmd() in self.async_msg_ids:
                         self.handle_ntf(decoded)
                         continue
-                    else:
-                        print('Unexpected message: ' + repr(decoded))
-                        continue
+                    print('Unexpected message: ' + repr(decoded))
+                    continue
 
                 rsp_msg = self._decode(decoded.raw_attrs, op.attr_set.name)
                 if op.fixed_header:
-- 
cgit v1.2.3


From 542ba2de32fb7ad6b8c51a05a4f0d6ca3cc66d67 Mon Sep 17 00:00:00 2001
From: Donald Hunter <donald.hunter@gmail.com>
Date: Thu, 8 Jan 2026 16:13:31 +0000
Subject: tools: ynl: fix pylint misc warnings

Fix pylint warnings for:

- unused-argument
- consider-using-in
- consider-using-get
- consider-using-f-string
- protected-access
- unidiomatic-typecheck
- no-else-return

Signed-off-by: Donald Hunter <donald.hunter@gmail.com>
Link: https://patch.msgid.link/20260108161339.29166-6-donald.hunter@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/net/ynl/pyynl/lib/doc_generator.py |  3 +--
 tools/net/ynl/pyynl/lib/nlspec.py        |  5 ++---
 tools/net/ynl/pyynl/lib/ynl.py           | 18 +++++++++---------
 3 files changed, 12 insertions(+), 14 deletions(-)

(limited to 'tools')

diff --git a/tools/net/ynl/pyynl/lib/doc_generator.py b/tools/net/ynl/pyynl/lib/doc_generator.py
index 3a16b8eb01ca..d1afff9d9956 100644
--- a/tools/net/ynl/pyynl/lib/doc_generator.py
+++ b/tools/net/ynl/pyynl/lib/doc_generator.py
@@ -109,8 +109,7 @@ class RstFormatters:
                     'fixed-header': 'definition',
                     'nested-attributes': 'attribute-set',
                     'struct': 'definition'}
-        if prefix in mappings:
-            prefix = mappings[prefix]
+        prefix = mappings.get(prefix, prefix)
         return f":ref:`{namespace}-{prefix}-{name}`"
 
     def rst_header(self) -> str:
diff --git a/tools/net/ynl/pyynl/lib/nlspec.py b/tools/net/ynl/pyynl/lib/nlspec.py
index fcd4106d0cfa..f3173146b64b 100644
--- a/tools/net/ynl/pyynl/lib/nlspec.py
+++ b/tools/net/ynl/pyynl/lib/nlspec.py
@@ -105,8 +105,7 @@ class SpecEnumEntry(SpecElement):
     def user_value(self, as_flags=None):
         if self.enum_set['type'] == 'flags' or as_flags:
             return 1 << self.value
-        else:
-            return self.value
+        return self.value
 
 
 class SpecEnumSet(SpecElement):
@@ -194,7 +193,7 @@ class SpecAttr(SpecElement):
         self.sub_message = yaml.get('sub-message')
         self.selector = yaml.get('selector')
 
-        self.is_auto_scalar = self.type == "sint" or self.type == "uint"
+        self.is_auto_scalar = self.type in ("sint", "uint")
 
 
 class SpecAttrSet(SpecElement):
diff --git a/tools/net/ynl/pyynl/lib/ynl.py b/tools/net/ynl/pyynl/lib/ynl.py
index 040ff3b87c17..4bc8e58cb621 100644
--- a/tools/net/ynl/pyynl/lib/ynl.py
+++ b/tools/net/ynl/pyynl/lib/ynl.py
@@ -415,7 +415,7 @@ class NetlinkProtocol:
         nlmsg = struct.pack("HHII", nl_type, nl_flags, seq, 0)
         return nlmsg
 
-    def message(self, flags, command, version, seq=None):
+    def message(self, flags, command, _version, seq=None):
         return self._message(command, flags, seq)
 
     def _decode(self, nl_msg):
@@ -425,7 +425,7 @@ class NetlinkProtocol:
         msg = self._decode(nl_msg)
         if op is None:
             op = ynl.rsp_by_value[msg.cmd()]
-        fixed_header_size = ynl._struct_size(op.fixed_header)
+        fixed_header_size = ynl.struct_size(op.fixed_header)
         msg.raw_attrs = NlAttrs(msg.raw, fixed_header_size)
         return msg
 
@@ -755,7 +755,7 @@ class YnlFamily(SpecFamily):
 
     def _rsp_add(self, rsp, name, is_multi, decoded):
         if is_multi is None:
-            if name in rsp and type(rsp[name]) is not list:
+            if name in rsp and not isinstance(rsp[name], list):
                 rsp[name] = [rsp[name]]
                 is_multi = True
             else:
@@ -788,7 +788,7 @@ class YnlFamily(SpecFamily):
         offset = 0
         if msg_format.fixed_header:
             decoded.update(self._decode_struct(attr.raw, msg_format.fixed_header))
-            offset = self._struct_size(msg_format.fixed_header)
+            offset = self.struct_size(msg_format.fixed_header)
         if msg_format.attr_set:
             if msg_format.attr_set in self.attr_sets:
                 subdict = self._decode(NlAttrs(attr.raw, offset), msg_format.attr_set)
@@ -908,7 +908,7 @@ class YnlFamily(SpecFamily):
             return
 
         msg = self.nlproto.decode(self, NlMsg(request, 0, op.attr_set), op)
-        offset = self.nlproto.msghdr_size() + self._struct_size(op.fixed_header)
+        offset = self.nlproto.msghdr_size() + self.struct_size(op.fixed_header)
         search_attrs = SpaceAttrs(op.attr_set, vals)
         path = self._decode_extack_path(msg.raw_attrs, op.attr_set, offset,
                                         extack['bad-attr-offs'], search_attrs)
@@ -916,14 +916,14 @@ class YnlFamily(SpecFamily):
             del extack['bad-attr-offs']
             extack['bad-attr'] = path
 
-    def _struct_size(self, name):
+    def struct_size(self, name):
         if name:
             members = self.consts[name].members
             size = 0
             for m in members:
                 if m.type in ['pad', 'binary']:
                     if m.struct:
-                        size += self._struct_size(m.struct)
+                        size += self.struct_size(m.struct)
                     else:
                         size += m.len
                 else:
@@ -942,7 +942,7 @@ class YnlFamily(SpecFamily):
                 offset += m.len
             elif m.type == 'binary':
                 if m.struct:
-                    len_ = self._struct_size(m.struct)
+                    len_ = self.struct_size(m.struct)
                     value = self._decode_struct(data[offset : offset + len_],
                                                 m.struct)
                     offset += len_
@@ -987,7 +987,7 @@ class YnlFamily(SpecFamily):
 
     def _formatted_string(self, raw, display_hint):
         if display_hint == 'mac':
-            formatted = ':'.join('%02x' % b for b in raw)
+            formatted = ':'.join(f'{b:02x}' for b in raw)
         elif display_hint == 'hex':
             if isinstance(raw, int):
                 formatted = hex(raw)
-- 
cgit v1.2.3


From 00ef9f153ed899e26382fc7918c1d087b20ef2c5 Mon Sep 17 00:00:00 2001
From: Donald Hunter <donald.hunter@gmail.com>
Date: Thu, 8 Jan 2026 16:13:32 +0000
Subject: tools: ynl: fix pylint global variable related warnings

Refactor to avoid using global variables to fix the following pylint
issues:

- invalid-name
- global-statement
- global-variable-not-assigned

Signed-off-by: Donald Hunter <donald.hunter@gmail.com>
Link: https://patch.msgid.link/20260108161339.29166-7-donald.hunter@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/net/ynl/pyynl/lib/nlspec.py | 16 +++++++---------
 tools/net/ynl/pyynl/lib/ynl.py    | 24 ++++++++++--------------
 2 files changed, 17 insertions(+), 23 deletions(-)

(limited to 'tools')

diff --git a/tools/net/ynl/pyynl/lib/nlspec.py b/tools/net/ynl/pyynl/lib/nlspec.py
index f3173146b64b..0b5277082b38 100644
--- a/tools/net/ynl/pyynl/lib/nlspec.py
+++ b/tools/net/ynl/pyynl/lib/nlspec.py
@@ -13,10 +13,6 @@ import os
 import yaml as pyyaml
 
 
-# To be loaded dynamically as needed
-jsonschema = None
-
-
 class SpecException(Exception):
     """Netlink spec exception.
     """
@@ -439,6 +435,10 @@ class SpecFamily(SpecElement):
         mcast_groups  dict of all multicast groups (index by name)
         kernel_family   dict of kernel family attributes
     """
+
+    # To be loaded dynamically as needed
+    jsonschema = None
+
     def __init__(self, spec_path, schema_path=None, exclude_ops=None):
         with open(spec_path, "r", encoding='utf-8') as stream:
             prefix = '# SPDX-License-Identifier: '
@@ -463,15 +463,13 @@ class SpecFamily(SpecElement):
         if schema_path is None:
             schema_path = os.path.dirname(os.path.dirname(spec_path)) + f'/{self.proto}.yaml'
         if schema_path:
-            global jsonschema
-
             with open(schema_path, "r", encoding='utf-8') as stream:
                 schema = pyyaml.safe_load(stream)
 
-            if jsonschema is None:
-                jsonschema = importlib.import_module("jsonschema")
+            if SpecFamily.jsonschema is None:
+                SpecFamily.jsonschema = importlib.import_module("jsonschema")
 
-            jsonschema.validate(self.yaml, schema)
+            SpecFamily.jsonschema.validate(self.yaml, schema)
 
         self.attr_sets = collections.OrderedDict()
         self.sub_msgs = collections.OrderedDict()
diff --git a/tools/net/ynl/pyynl/lib/ynl.py b/tools/net/ynl/pyynl/lib/ynl.py
index 4bc8e58cb621..9774005e7ad1 100644
--- a/tools/net/ynl/pyynl/lib/ynl.py
+++ b/tools/net/ynl/pyynl/lib/ynl.py
@@ -320,9 +320,6 @@ class NlMsgs:
         yield from self.msgs
 
 
-genl_family_name_to_id = None
-
-
 def _genl_msg(nl_type, nl_flags, genl_cmd, genl_version, seq=None):
     # we prepend length in _genl_msg_finalize()
     if seq is None:
@@ -338,6 +335,8 @@ def _genl_msg_finalize(msg):
 
 # pylint: disable=too-many-nested-blocks
 def _genl_load_families():
+    genl_family_name_to_id = {}
+
     with socket.socket(socket.AF_NETLINK, socket.SOCK_RAW, Netlink.NETLINK_GENERIC) as sock:
         sock.setsockopt(Netlink.SOL_NETLINK, Netlink.NETLINK_CAP_ACK, 1)
 
@@ -348,18 +347,14 @@ def _genl_load_families():
 
         sock.send(msg, 0)
 
-        global genl_family_name_to_id
-        genl_family_name_to_id = dict()
-
         while True:
             reply = sock.recv(128 * 1024)
             nms = NlMsgs(reply)
             for nl_msg in nms:
                 if nl_msg.error:
-                    print("Netlink error:", nl_msg.error)
-                    return
+                    raise YnlException(f"Netlink error: {nl_msg.error}")
                 if nl_msg.done:
-                    return
+                    return genl_family_name_to_id
 
                 gm = GenlMsg(nl_msg)
                 fam = {}
@@ -439,15 +434,16 @@ class NetlinkProtocol:
 
 
 class GenlProtocol(NetlinkProtocol):
+    genl_family_name_to_id = {}
+
     def __init__(self, family_name):
         super().__init__(family_name, Netlink.NETLINK_GENERIC)
 
-        global genl_family_name_to_id
-        if genl_family_name_to_id is None:
-            _genl_load_families()
+        if not GenlProtocol.genl_family_name_to_id:
+            GenlProtocol.genl_family_name_to_id = _genl_load_families()
 
-        self.genl_family = genl_family_name_to_id[family_name]
-        self.family_id = genl_family_name_to_id[family_name]['id']
+        self.genl_family = GenlProtocol.genl_family_name_to_id[family_name]
+        self.family_id = GenlProtocol.genl_family_name_to_id[family_name]['id']
 
     def message(self, flags, command, version, seq=None):
         nlmsg = self._message(self.family_id, flags, seq)
-- 
cgit v1.2.3


From 9b6b016df4c2902cd6acd2673bffea6c1e8f643d Mon Sep 17 00:00:00 2001
From: Donald Hunter <donald.hunter@gmail.com>
Date: Thu, 8 Jan 2026 16:13:33 +0000
Subject: tools: ynl: fix logic errors reported by pylint

Fix the following logic errors:

tools/net/ynl/pyynl/lib/nlspec.py:299:15: E1101: Instance of 'list' has no
'items' member (no-member)

tools/net/ynl/pyynl/lib/nlspec.py:580:22: E0606: Possibly using variable 'op'
before assignment (possibly-used-before-assignment)

Signed-off-by: Donald Hunter <donald.hunter@gmail.com>
Link: https://patch.msgid.link/20260108161339.29166-8-donald.hunter@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/net/ynl/pyynl/lib/nlspec.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/net/ynl/pyynl/lib/nlspec.py b/tools/net/ynl/pyynl/lib/nlspec.py
index 0b5277082b38..fcffeb5b7ba3 100644
--- a/tools/net/ynl/pyynl/lib/nlspec.py
+++ b/tools/net/ynl/pyynl/lib/nlspec.py
@@ -295,7 +295,7 @@ class SpecStruct(SpecElement):
         yield from self.members
 
     def items(self):
-        return self.members.items()
+        return self.members
 
 
 class SpecSubMessage(SpecElement):
@@ -570,12 +570,11 @@ class SpecFamily(SpecElement):
                 skip |= bool(exclude.match(elem['name']))
             if not skip:
                 op = self.new_operation(elem, req_val, rsp_val)
+                self.msgs[op.name] = op
 
             req_val = req_val_next
             rsp_val = rsp_val_next
 
-            self.msgs[op.name] = op
-
     def find_operation(self, name):
         """
         For a given operation name, find and return operation spec.
-- 
cgit v1.2.3


From 301da4cfea5fef6dfc82a37031b84c89e06c5c7b Mon Sep 17 00:00:00 2001
From: Donald Hunter <donald.hunter@gmail.com>
Date: Thu, 8 Jan 2026 16:13:34 +0000
Subject: tools: ynl: ethtool: fix pylint issues

Fix or suppress all the pylint issues in ethtool.py, except for
TODO (fixme) items.

Suppress:

- too-many-locals
- too-many-branches
- too-many-statements
- too-many-return-statements
- import-error

Fix:

- missing-module-docstring
- redefined-outer-name
- dangerous-default-value
- use-dict-literal
- missing-function-docstring
- global-variable-undefined
- expression-not-assigned
- inconsistent-return-statements
- wrong-import-order

Signed-off-by: Donald Hunter <donald.hunter@gmail.com>
Link: https://patch.msgid.link/20260108161339.29166-9-donald.hunter@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/net/ynl/pyynl/ethtool.py | 46 ++++++++++++++++++++++++++++--------------
 1 file changed, 31 insertions(+), 15 deletions(-)

(limited to 'tools')

diff --git a/tools/net/ynl/pyynl/ethtool.py b/tools/net/ynl/pyynl/ethtool.py
index 40a8ba8d296f..f1a2a2a89985 100755
--- a/tools/net/ynl/pyynl/ethtool.py
+++ b/tools/net/ynl/pyynl/ethtool.py
@@ -1,5 +1,10 @@
 #!/usr/bin/env python3
 # SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
+#
+# pylint: disable=too-many-locals, too-many-branches, too-many-statements
+# pylint: disable=too-many-return-statements
+
+""" YNL ethtool utility """
 
 import argparse
 import pathlib
@@ -10,8 +15,10 @@ import os
 
 # pylint: disable=no-name-in-module,wrong-import-position
 sys.path.append(pathlib.Path(__file__).resolve().parent.as_posix())
-from lib import YnlFamily
+# pylint: disable=import-error
 from cli import schema_dir, spec_dir
+from lib import YnlFamily
+
 
 def args_to_req(ynl, op_name, args, req):
     """
@@ -49,7 +56,8 @@ def print_field(reply, *desc):
         return
 
     if len(desc) == 0:
-        return print_field(reply, *zip(reply.keys(), reply.keys()))
+        print_field(reply, *zip(reply.keys(), reply.keys()))
+        return
 
     for spec in desc:
         try:
@@ -89,11 +97,12 @@ def doit(ynl, args, op_name):
     args_to_req(ynl, op_name, args.args, req)
     ynl.do(op_name, req)
 
-def dumpit(ynl, args, op_name, extra = {}):
+def dumpit(ynl, args, op_name, extra=None):
     """
     Prepare request header, parse arguments and dumpit (filtering out the
     devices we're not interested in).
     """
+    extra = extra or {}
     reply = ynl.dump(op_name, { 'header': {} } | extra)
     if not reply:
         return {}
@@ -115,9 +124,9 @@ def bits_to_dict(attr):
     """
     ret = {}
     if 'bits' not in attr:
-        return dict()
+        return {}
     if 'bit' not in attr['bits']:
-        return dict()
+        return {}
     for bit in attr['bits']['bit']:
         if bit['name'] == '':
             continue
@@ -127,6 +136,8 @@ def bits_to_dict(attr):
     return ret
 
 def main():
+    """ YNL ethtool utility """
+
     parser = argparse.ArgumentParser(description='ethtool wannabe')
     parser.add_argument('--json', action=argparse.BooleanOptionalAction)
     parser.add_argument('--show-priv-flags', action=argparse.BooleanOptionalAction)
@@ -156,7 +167,7 @@ def main():
     # TODO:                       rss-get
     parser.add_argument('device', metavar='device', type=str)
     parser.add_argument('args', metavar='args', type=str, nargs='*')
-    global args
+
     args = parser.parse_args()
 
     spec = os.path.join(spec_dir(), 'ethtool.yaml')
@@ -170,13 +181,16 @@ def main():
         return
 
     if args.set_eee:
-        return doit(ynl, args, 'eee-set')
+        doit(ynl, args, 'eee-set')
+        return
 
     if args.set_pause:
-        return doit(ynl, args, 'pause-set')
+        doit(ynl, args, 'pause-set')
+        return
 
     if args.set_coalesce:
-        return doit(ynl, args, 'coalesce-set')
+        doit(ynl, args, 'coalesce-set')
+        return
 
     if args.set_features:
         # TODO: parse the bitmask
@@ -184,10 +198,12 @@ def main():
         return
 
     if args.set_channels:
-        return doit(ynl, args, 'channels-set')
+        doit(ynl, args, 'channels-set')
+        return
 
     if args.set_ring:
-        return doit(ynl, args, 'rings-set')
+        doit(ynl, args, 'rings-set')
+        return
 
     if args.show_priv_flags:
         flags = bits_to_dict(dumpit(ynl, args, 'privflags-get')['flags'])
@@ -338,25 +354,25 @@ def main():
         print(f'Time stamping parameters for {args.device}:')
 
         print('Capabilities:')
-        [print(f'\t{v}') for v in bits_to_dict(tsinfo['timestamping'])]
+        _ = [print(f'\t{v}') for v in bits_to_dict(tsinfo['timestamping'])]
 
         print(f'PTP Hardware Clock: {tsinfo.get("phc-index", "none")}')
 
         if 'tx-types' in tsinfo:
             print('Hardware Transmit Timestamp Modes:')
-            [print(f'\t{v}') for v in bits_to_dict(tsinfo['tx-types'])]
+            _ = [print(f'\t{v}') for v in bits_to_dict(tsinfo['tx-types'])]
         else:
             print('Hardware Transmit Timestamp Modes: none')
 
         if 'rx-filters' in tsinfo:
             print('Hardware Receive Filter Modes:')
-            [print(f'\t{v}') for v in bits_to_dict(tsinfo['rx-filters'])]
+            _ = [print(f'\t{v}') for v in bits_to_dict(tsinfo['rx-filters'])]
         else:
             print('Hardware Receive Filter Modes: none')
 
         if 'stats' in tsinfo and tsinfo['stats']:
             print('Statistics:')
-            [print(f'\t{k}: {v}') for k, v in tsinfo['stats'].items()]
+            _ = [print(f'\t{k}: {v}') for k, v in tsinfo['stats'].items()]
 
         return
 
-- 
cgit v1.2.3


From 9a130471f854aa2146e74a8dec47478d9e6d0654 Mon Sep 17 00:00:00 2001
From: Donald Hunter <donald.hunter@gmail.com>
Date: Thu, 8 Jan 2026 16:13:35 +0000
Subject: tools: ynl: fix pylint issues in ynl_gen_rst

Add a couple of pylint suppressions to ynl_gen_rst.py:

- no-name-in-module,wrong-import-position
- broad-exception-caught

Signed-off-by: Donald Hunter <donald.hunter@gmail.com>
Link: https://patch.msgid.link/20260108161339.29166-10-donald.hunter@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/net/ynl/pyynl/ynl_gen_rst.py | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'tools')

diff --git a/tools/net/ynl/pyynl/ynl_gen_rst.py b/tools/net/ynl/pyynl/ynl_gen_rst.py
index 90ae19aac89d..30324e2fd682 100755
--- a/tools/net/ynl/pyynl/ynl_gen_rst.py
+++ b/tools/net/ynl/pyynl/ynl_gen_rst.py
@@ -19,6 +19,7 @@ import sys
 import argparse
 import logging
 
+# pylint: disable=no-name-in-module,wrong-import-position
 sys.path.append(pathlib.Path(__file__).resolve().parent.as_posix())
 from lib import YnlDocGenerator    # pylint: disable=C0413
 
@@ -60,6 +61,7 @@ def write_to_rstfile(content: str, filename: str) -> None:
         rst_file.write(content)
 
 
+# pylint: disable=broad-exception-caught
 def main() -> None:
     """Main function that reads the YAML files and generates the RST files"""
 
-- 
cgit v1.2.3


From c2fa97c509ec5d0d9b3134132b90c117a961f2b6 Mon Sep 17 00:00:00 2001
From: Donald Hunter <donald.hunter@gmail.com>
Date: Thu, 8 Jan 2026 16:13:36 +0000
Subject: tools: ynl-gen-c: suppress unhelpful pylint messages

Disable pylint messages for too-many-*, too-few-*, docstrings,
broad-exception-* and messages for specific code that won't get changed.

Signed-off-by: Donald Hunter <donald.hunter@gmail.com>
Link: https://patch.msgid.link/20260108161339.29166-11-donald.hunter@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/net/ynl/pyynl/ynl_gen_c.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'tools')

diff --git a/tools/net/ynl/pyynl/ynl_gen_c.py b/tools/net/ynl/pyynl/ynl_gen_c.py
index b517d0c605ad..14d16024fe11 100755
--- a/tools/net/ynl/pyynl/ynl_gen_c.py
+++ b/tools/net/ynl/pyynl/ynl_gen_c.py
@@ -1,5 +1,11 @@
 #!/usr/bin/env python3
 # SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause)
+#
+# pylint: disable=line-too-long, missing-class-docstring, missing-function-docstring
+# pylint: disable=too-many-positional-arguments, too-many-arguments, too-many-statements
+# pylint: disable=too-many-branches, too-many-locals, too-many-instance-attributes
+# pylint: disable=too-many-nested-blocks, too-many-lines, too-few-public-methods
+# pylint: disable=broad-exception-raised, broad-exception-caught, protected-access
 
 import argparse
 import filecmp
@@ -11,6 +17,7 @@ import sys
 import tempfile
 import yaml
 
+# pylint: disable=no-name-in-module,wrong-import-position
 sys.path.append(pathlib.Path(__file__).resolve().parent.as_posix())
 from lib import SpecFamily, SpecAttrSet, SpecAttr, SpecOperation, SpecEnumSet, SpecEnumEntry
 from lib import SpecSubMessage
@@ -183,6 +190,7 @@ class Type(SpecAttr):
         for line in lines:
             ri.cw.p(line)
 
+    # pylint: disable=assignment-from-none
     def arg_member(self, ri):
         member = self._complex_member_type(ri)
         if member:
@@ -280,6 +288,7 @@ class Type(SpecAttr):
 
         code = []
         presence = ''
+        # pylint: disable=consider-using-enumerate
         for i in range(0, len(ref)):
             presence = f"{var}->{'.'.join(ref[:i] + [''])}_present.{ref[i]}"
             # Every layer below last is a nest, so we know it uses bit presence
@@ -414,6 +423,7 @@ class TypeScalar(Type):
         if low < -32768 or high > 32767:
             self.checks['full-range'] = True
 
+    # pylint: disable=too-many-return-statements
     def _attr_policy(self, policy):
         if 'flags-mask' in self.checks or self.is_bitfield:
             if self.is_bitfield:
@@ -1650,6 +1660,7 @@ class CodeWriter:
         if out_file is None:
             self._out = os.sys.stdout
         else:
+            # pylint: disable=consider-using-with
             self._out = tempfile.NamedTemporaryFile('w+')
             self._out_file = out_file
 
-- 
cgit v1.2.3


From 93ef84292959e14fac8aa49079ae951e0078dc6e Mon Sep 17 00:00:00 2001
From: Donald Hunter <donald.hunter@gmail.com>
Date: Thu, 8 Jan 2026 16:13:37 +0000
Subject: tools: ynl-gen-c: fix pylint warnings for returns, unused, redefined

Fix the following pylint warnings:

- unused-argument
- unused-variable
- no-else-return
- inconsistent-return-statements
- redefined-outer-name
- unreachable

Signed-off-by: Donald Hunter <donald.hunter@gmail.com>
Link: https://patch.msgid.link/20260108161339.29166-12-donald.hunter@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/net/ynl/pyynl/ynl_gen_c.py | 100 ++++++++++++++++++++-------------------
 1 file changed, 52 insertions(+), 48 deletions(-)

(limited to 'tools')

diff --git a/tools/net/ynl/pyynl/ynl_gen_c.py b/tools/net/ynl/pyynl/ynl_gen_c.py
index 14d16024fe11..900896779e61 100755
--- a/tools/net/ynl/pyynl/ynl_gen_c.py
+++ b/tools/net/ynl/pyynl/ynl_gen_c.py
@@ -7,6 +7,12 @@
 # pylint: disable=too-many-nested-blocks, too-many-lines, too-few-public-methods
 # pylint: disable=broad-exception-raised, broad-exception-caught, protected-access
 
+"""
+ynl_gen_c
+
+A YNL to C code generator for both kernel and userspace protocol stubs.
+"""
+
 import argparse
 import filecmp
 import pathlib
@@ -15,7 +21,7 @@ import re
 import shutil
 import sys
 import tempfile
-import yaml
+import yaml as pyyaml
 
 # pylint: disable=no-name-in-module,wrong-import-position
 sys.path.append(pathlib.Path(__file__).resolve().parent.as_posix())
@@ -164,7 +170,7 @@ class Type(SpecAttr):
 
     def presence_member(self, space, type_filter):
         if self.presence_type() != type_filter:
-            return
+            return ''
 
         if self.presence_type() == 'present':
             pfx = '__' if space == 'user' else ''
@@ -173,14 +179,15 @@ class Type(SpecAttr):
         if self.presence_type() in {'len', 'count'}:
             pfx = '__' if space == 'user' else ''
             return f"{pfx}u32 {self.c_name};"
+        return ''
 
-    def _complex_member_type(self, ri):
+    def _complex_member_type(self, _ri):
         return None
 
     def free_needs_iter(self):
         return False
 
-    def _free_lines(self, ri, var, ref):
+    def _free_lines(self, _ri, var, ref):
         if self.is_multi_val() or self.presence_type() in {'count', 'len'}:
             return [f'free({var}->{ref}{self.c_name});']
         return []
@@ -278,7 +285,7 @@ class Type(SpecAttr):
     def _setter_lines(self, ri, member, presence):
         raise Exception(f"Setter not implemented for class type {self.type}")
 
-    def setter(self, ri, space, direction, deref=False, ref=None, var="req"):
+    def setter(self, ri, _space, direction, deref=False, ref=None, var="req"):
         ref = (ref if ref else []) + [self.c_name]
         member = f"{var}->{'.'.join(ref)}"
 
@@ -434,15 +441,15 @@ class TypeScalar(Type):
                 flag_cnt = len(flags['entries'])
                 mask = (1 << flag_cnt) - 1
             return f"NLA_POLICY_MASK({policy}, 0x{mask:x})"
-        elif 'full-range' in self.checks:
+        if 'full-range' in self.checks:
             return f"NLA_POLICY_FULL_RANGE({policy}, &{c_lower(self.enum_name)}_range)"
-        elif 'range' in self.checks:
+        if 'range' in self.checks:
             return f"NLA_POLICY_RANGE({policy}, {self.get_limit_str('min')}, {self.get_limit_str('max')})"
-        elif 'min' in self.checks:
+        if 'min' in self.checks:
             return f"NLA_POLICY_MIN({policy}, {self.get_limit_str('min')})"
-        elif 'max' in self.checks:
+        if 'max' in self.checks:
             return f"NLA_POLICY_MAX({policy}, {self.get_limit_str('max')})"
-        elif 'sparse' in self.checks:
+        if 'sparse' in self.checks:
             return f"NLA_POLICY_VALIDATE_FN({policy}, &{c_lower(self.enum_name)}_validate)"
         return super()._attr_policy(policy)
 
@@ -637,7 +644,7 @@ class TypeBinaryScalarArray(TypeBinary):
 
 
 class TypeBitfield32(Type):
-    def _complex_member_type(self, ri):
+    def _complex_member_type(self, _ri):
         return "struct nla_bitfield32"
 
     def _attr_typol(self):
@@ -665,7 +672,7 @@ class TypeNest(Type):
     def is_recursive(self):
         return self.family.pure_nested_structs[self.nested_attrs].recursive
 
-    def _complex_member_type(self, ri):
+    def _complex_member_type(self, _ri):
         return self.nested_struct_type
 
     def _free_lines(self, ri, var, ref):
@@ -699,7 +706,7 @@ class TypeNest(Type):
                       f"parg.data = &{var}->{self.c_name};"]
         return get_lines, init_lines, None
 
-    def setter(self, ri, space, direction, deref=False, ref=None, var="req"):
+    def setter(self, ri, _space, direction, deref=False, ref=None, var="req"):
         ref = (ref if ref else []) + [self.c_name]
 
         for _, attr in ri.family.pure_nested_structs[self.nested_attrs].member_list():
@@ -724,19 +731,18 @@ class TypeMultiAttr(Type):
     def _complex_member_type(self, ri):
         if 'type' not in self.attr or self.attr['type'] == 'nest':
             return self.nested_struct_type
-        elif self.attr['type'] == 'binary' and 'struct' in self.attr:
+        if self.attr['type'] == 'binary' and 'struct' in self.attr:
             return None  # use arg_member()
-        elif self.attr['type'] == 'string':
+        if self.attr['type'] == 'string':
             return 'struct ynl_string *'
-        elif self.attr['type'] in scalars:
+        if self.attr['type'] in scalars:
             scalar_pfx = '__' if ri.ku_space == 'user' else ''
             if self.is_auto_scalar:
                 name = self.type[0] + '64'
             else:
                 name = self.attr['type']
             return scalar_pfx + name
-        else:
-            raise Exception(f"Sub-type {self.attr['type']} not supported yet")
+        raise Exception(f"Sub-type {self.attr['type']} not supported yet")
 
     def arg_member(self, ri):
         if self.type == 'binary' and 'struct' in self.attr:
@@ -747,7 +753,7 @@ class TypeMultiAttr(Type):
     def free_needs_iter(self):
         return self.attr['type'] in {'nest', 'string'}
 
-    def _free_lines(self, ri, var, ref):
+    def _free_lines(self, _ri, var, ref):
         lines = []
         if self.attr['type'] in scalars:
             lines += [f"free({var}->{ref}{self.c_name});"]
@@ -811,13 +817,12 @@ class TypeIndexedArray(Type):
     def _complex_member_type(self, ri):
         if 'sub-type' not in self.attr or self.attr['sub-type'] == 'nest':
             return self.nested_struct_type
-        elif self.attr['sub-type'] in scalars:
+        if self.attr['sub-type'] in scalars:
             scalar_pfx = '__' if ri.ku_space == 'user' else ''
             return scalar_pfx + self.attr['sub-type']
-        elif self.attr['sub-type'] == 'binary' and 'exact-len' in self.checks:
+        if self.attr['sub-type'] == 'binary' and 'exact-len' in self.checks:
             return None  # use arg_member()
-        else:
-            raise Exception(f"Sub-type {self.attr['sub-type']} not supported yet")
+        raise Exception(f"Sub-type {self.attr['sub-type']} not supported yet")
 
     def arg_member(self, ri):
         if self.sub_type == 'binary' and 'exact-len' in self.checks:
@@ -833,12 +838,11 @@ class TypeIndexedArray(Type):
     def _attr_typol(self):
         if self.attr['sub-type'] in scalars:
             return f'.type = YNL_PT_U{c_upper(self.sub_type[1:])}, '
-        elif self.attr['sub-type'] == 'binary' and 'exact-len' in self.checks:
+        if self.attr['sub-type'] == 'binary' and 'exact-len' in self.checks:
             return f'.type = YNL_PT_BINARY, .len = {self.checks["exact-len"]}, '
-        elif self.attr['sub-type'] == 'nest':
+        if self.attr['sub-type'] == 'nest':
             return f'.type = YNL_PT_NEST, .nest = &{self.nested_render_name}_nest, '
-        else:
-            raise Exception(f"Typol for IndexedArray sub-type {self.attr['sub-type']} not supported, yet")
+        raise Exception(f"Typol for IndexedArray sub-type {self.attr['sub-type']} not supported, yet")
 
     def _attr_get(self, ri, var):
         local_vars = ['const struct nlattr *attr2;']
@@ -874,7 +878,7 @@ class TypeIndexedArray(Type):
     def free_needs_iter(self):
         return self.sub_type == 'nest'
 
-    def _free_lines(self, ri, var, ref):
+    def _free_lines(self, _ri, var, ref):
         lines = []
         if self.sub_type == 'nest':
             lines += [
@@ -885,7 +889,7 @@ class TypeIndexedArray(Type):
         return lines
 
 class TypeNestTypeValue(Type):
-    def _complex_member_type(self, ri):
+    def _complex_member_type(self, _ri):
         return self.nested_struct_type
 
     def _attr_typol(self):
@@ -1030,7 +1034,7 @@ class Struct:
 
     def external_selectors(self):
         sels = []
-        for name, attr in self.attr_list:
+        for _name, attr in self.attr_list:
             if isinstance(attr, TypeSubMessage) and attr.selector.is_external():
                 sels.append(attr.selector)
         return sels
@@ -1047,9 +1051,9 @@ class EnumEntry(SpecEnumEntry):
         super().__init__(enum_set, yaml, prev, value_start)
 
         if prev:
-            self.value_change = (self.value != prev.value + 1)
+            self.value_change = self.value != prev.value + 1
         else:
-            self.value_change = (self.value != 0)
+            self.value_change = self.value != 0
         self.value_change = self.value_change or self.enum_set['type'] == 'flags'
 
         # Added by resolve:
@@ -1321,7 +1325,7 @@ class Family(SpecFamily):
                 }
 
     def _load_root_sets(self):
-        for op_name, op in self.msgs.items():
+        for _op_name, op in self.msgs.items():
             if 'attribute-set' not in op:
                 continue
 
@@ -1520,7 +1524,7 @@ class Family(SpecFamily):
             for k, _ in self.root_sets.items():
                 yield k, None  # we don't have a struct, but it must be terminal
 
-        for attr_set, struct in all_structs():
+        for attr_set, _struct in all_structs():
             for _, spec in self.attr_sets[attr_set].items():
                 if 'nested-attributes' in spec:
                     child_name = spec['nested-attributes']
@@ -1540,7 +1544,7 @@ class Family(SpecFamily):
     def _load_global_policy(self):
         global_set = set()
         attr_set_name = None
-        for op_name, op in self.ops.items():
+        for _op_name, op in self.ops.items():
             if not op:
                 continue
             if 'attribute-set' not in op:
@@ -2049,12 +2053,12 @@ def put_op_name(family, cw):
     _put_enum_to_str_helper(cw, family.c_name + '_op', map_name, 'op')
 
 
-def put_enum_to_str_fwd(family, cw, enum):
+def put_enum_to_str_fwd(_family, cw, enum):
     args = [enum.user_type + ' value']
     cw.write_func_prot('const char *', f'{enum.render_name}_str', args, suffix=';')
 
 
-def put_enum_to_str(family, cw, enum):
+def put_enum_to_str(_family, cw, enum):
     map_name = f'{enum.render_name}_strmap'
     cw.block_start(line=f"static const char * const {map_name}[] =")
     for entry in enum.entries.values():
@@ -2335,7 +2339,8 @@ def parse_rsp_nested_prototype(ri, struct, suffix=';'):
 
 def parse_rsp_nested(ri, struct):
     if struct.submsg:
-        return parse_rsp_submsg(ri, struct)
+        parse_rsp_submsg(ri, struct)
+        return
 
     parse_rsp_nested_prototype(ri, struct, suffix='')
 
@@ -2715,7 +2720,7 @@ def _free_type(ri, direction, struct):
 
 
 def free_rsp_nested_prototype(ri):
-        print_free_prototype(ri, "")
+    print_free_prototype(ri, "")
 
 
 def free_rsp_nested(ri, struct):
@@ -3357,7 +3362,7 @@ def render_user_family(family, cw, prototype):
             else:
                 raise Exception('Invalid notification ' + ntf_op_name)
             _render_user_ntf_entry(ri, ntf_op)
-        for op_name, op in family.ops.items():
+        for _op_name, op in family.ops.items():
             if 'event' not in op:
                 continue
             ri = RenderInfo(cw, family, "user", op, "event")
@@ -3429,10 +3434,9 @@ def main():
             print('Spec license:', parsed.license)
             print('License must be: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause)')
             os.sys.exit(1)
-    except yaml.YAMLError as exc:
+    except pyyaml.YAMLError as exc:
         print(exc)
         os.sys.exit(1)
-        return
 
     cw = CodeWriter(BaseNlLib(), args.out_file, overwrite=(not args.cmp_out))
 
@@ -3535,7 +3539,7 @@ def main():
                 cw.nl()
 
             if parsed.kernel_policy in {'per-op', 'split'}:
-                for op_name, op in parsed.ops.items():
+                for _op_name, op in parsed.ops.items():
                     if 'do' in op and 'event' not in op:
                         ri = RenderInfo(cw, parsed, args.mode, op, "do")
                         print_req_policy_fwd(cw, ri.struct['request'], ri=ri)
@@ -3564,7 +3568,7 @@ def main():
                 print_req_policy(cw, struct)
                 cw.nl()
 
-            for op_name, op in parsed.ops.items():
+            for _op_name, op in parsed.ops.items():
                 if parsed.kernel_policy in {'per-op', 'split'}:
                     for op_mode in ['do', 'dump']:
                         if op_mode in op and 'request' in op[op_mode]:
@@ -3592,7 +3596,7 @@ def main():
                 ri = RenderInfo(cw, parsed, args.mode, "", "", attr_set)
                 print_type_full(ri, struct)
 
-            for op_name, op in parsed.ops.items():
+            for _op_name, op in parsed.ops.items():
                 cw.p(f"/* ============== {op.enum_name} ============== */")
 
                 if 'do' in op and 'event' not in op:
@@ -3625,7 +3629,7 @@ def main():
                         raise Exception(f'Only notifications with consistent types supported ({op.name})')
                     print_wrapped_type(ri)
 
-            for op_name, op in parsed.ntfs.items():
+            for _op_name, op in parsed.ntfs.items():
                 if 'event' in op:
                     ri = RenderInfo(cw, parsed, args.mode, op, 'event')
                     cw.p(f"/* {op.enum_name} - event */")
@@ -3675,7 +3679,7 @@ def main():
                 if struct.reply:
                     parse_rsp_nested(ri, struct)
 
-            for op_name, op in parsed.ops.items():
+            for _op_name, op in parsed.ops.items():
                 cw.p(f"/* ============== {op.enum_name} ============== */")
                 if 'do' in op and 'event' not in op:
                     cw.p(f"/* {op.enum_name} - do */")
@@ -3703,7 +3707,7 @@ def main():
                         raise Exception(f'Only notifications with consistent types supported ({op.name})')
                     print_ntf_type_free(ri)
 
-            for op_name, op in parsed.ntfs.items():
+            for _op_name, op in parsed.ntfs.items():
                 if 'event' in op:
                     cw.p(f"/* {op.enum_name} - event */")
 
-- 
cgit v1.2.3


From a587f592d6c49903457c1d06876ddb071907850d Mon Sep 17 00:00:00 2001
From: Donald Hunter <donald.hunter@gmail.com>
Date: Thu, 8 Jan 2026 16:13:38 +0000
Subject: tools: ynl-gen-c: fix pylint None, type, dict, generators, init

Fix the following pylint warnings that are trivial one-liners:

- unsubscriptable-object
- unidiomatic-typecheck
- use-dict-literal
- attribute-defined-outside-init
- consider-using-in
- consider-using-generator

Signed-off-by: Donald Hunter <donald.hunter@gmail.com>
Link: https://patch.msgid.link/20260108161339.29166-13-donald.hunter@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/net/ynl/pyynl/ynl_gen_c.py | 49 ++++++++++++++++++++++------------------
 1 file changed, 27 insertions(+), 22 deletions(-)

(limited to 'tools')

diff --git a/tools/net/ynl/pyynl/ynl_gen_c.py b/tools/net/ynl/pyynl/ynl_gen_c.py
index 900896779e61..5f079a74c8d1 100755
--- a/tools/net/ynl/pyynl/ynl_gen_c.py
+++ b/tools/net/ynl/pyynl/ynl_gen_c.py
@@ -200,7 +200,7 @@ class Type(SpecAttr):
     # pylint: disable=assignment-from-none
     def arg_member(self, ri):
         member = self._complex_member_type(ri)
-        if member:
+        if member is not None:
             spc = ' ' if member[-1] != '*' else ''
             arg = [member + spc + '*' + self.c_name]
             if self.presence_type() == 'count':
@@ -210,7 +210,7 @@ class Type(SpecAttr):
 
     def struct_member(self, ri):
         member = self._complex_member_type(ri)
-        if member:
+        if member is not None:
             ptr = '*' if self.is_multi_val() else ''
             if self.is_recursive_for_op(ri):
                 ptr = '*'
@@ -258,9 +258,9 @@ class Type(SpecAttr):
 
     def attr_get(self, ri, var, first):
         lines, init_lines, _ = self._attr_get(ri, var)
-        if type(lines) is str:
+        if isinstance(lines, str):
             lines = [lines]
-        if type(init_lines) is str:
+        if isinstance(init_lines, str):
             init_lines = [init_lines]
 
         kw = 'if' if first else 'else if'
@@ -1002,7 +1002,7 @@ class Struct:
         self.in_multi_val = False  # used by a MultiAttr or and legacy arrays
 
         self.attr_list = []
-        self.attrs = dict()
+        self.attrs = {}
         if type_list is not None:
             for t in type_list:
                 self.attr_list.append((t, self.attr_set[t]),)
@@ -1094,8 +1094,8 @@ class EnumSet(SpecEnumSet):
         return EnumEntry(self, entry, prev_entry, value_start)
 
     def value_range(self):
-        low = min([x.value for x in self.entries.values()])
-        high = max([x.value for x in self.entries.values()])
+        low = min(x.value for x in self.entries.values())
+        high = max(x.value for x in self.entries.values())
 
         if high - low + 1 != len(self.entries):
             return None, None
@@ -1234,6 +1234,12 @@ class Family(SpecFamily):
         self.hooks = None
         delattr(self, "hooks")
 
+        self.root_sets = {}
+        self.pure_nested_structs = {}
+        self.kernel_policy = None
+        self.global_policy = None
+        self.global_policy_set = None
+
         super().__init__(file_name, exclude_ops=exclude_ops)
 
         self.fam_key = c_upper(self.yaml.get('c-family-name', self.yaml["name"] + '_FAMILY_NAME'))
@@ -1268,18 +1274,18 @@ class Family(SpecFamily):
 
         self.mcgrps = self.yaml.get('mcast-groups', {'list': []})
 
-        self.hooks = dict()
+        self.hooks = {}
         for when in ['pre', 'post']:
-            self.hooks[when] = dict()
+            self.hooks[when] = {}
             for op_mode in ['do', 'dump']:
-                self.hooks[when][op_mode] = dict()
+                self.hooks[when][op_mode] = {}
                 self.hooks[when][op_mode]['set'] = set()
                 self.hooks[when][op_mode]['list'] = []
 
         # dict space-name -> 'request': set(attrs), 'reply': set(attrs)
-        self.root_sets = dict()
+        self.root_sets = {}
         # dict space-name -> Struct
-        self.pure_nested_structs = dict()
+        self.pure_nested_structs = {}
 
         self._mark_notify()
         self._mock_up_events()
@@ -1627,7 +1633,7 @@ class RenderInfo:
 
         self.cw = cw
 
-        self.struct = dict()
+        self.struct = {}
         if op_mode == 'notify':
             op_mode = 'do' if 'do' in op else 'dump'
         for op_dir in ['request', 'reply']:
@@ -1794,7 +1800,7 @@ class CodeWriter:
         if not local_vars:
             return
 
-        if type(local_vars) is str:
+        if isinstance(local_vars, str):
             local_vars = [local_vars]
 
         local_vars.sort(key=len, reverse=True)
@@ -1814,20 +1820,19 @@ class CodeWriter:
     def writes_defines(self, defines):
         longest = 0
         for define in defines:
-            if len(define[0]) > longest:
-                longest = len(define[0])
+            longest = max(len(define[0]), longest)
         longest = ((longest + 8) // 8) * 8
         for define in defines:
             line = '#define ' + define[0]
             line += '\t' * ((longest - len(define[0]) + 7) // 8)
-            if type(define[1]) is int:
+            if isinstance(define[1], int):
                 line += str(define[1])
-            elif type(define[1]) is str:
+            elif isinstance(define[1], str):
                 line += '"' + define[1] + '"'
             self.p(line)
 
     def write_struct_init(self, members):
-        longest = max([len(x[0]) for x in members])
+        longest = max(len(x[0]) for x in members)
         longest += 1  # because we prepend a .
         longest = ((longest + 8) // 8) * 8
         for one in members:
@@ -2670,7 +2675,7 @@ def print_req_free(ri):
 
 
 def print_rsp_type(ri):
-    if (ri.op_mode == 'do' or ri.op_mode == 'dump') and 'reply' in ri.op[ri.op_mode]:
+    if ri.op_mode in ('do', 'dump') and 'reply' in ri.op[ri.op_mode]:
         direction = 'reply'
     elif ri.op_mode == 'event':
         direction = 'reply'
@@ -2683,7 +2688,7 @@ def print_wrapped_type(ri):
     ri.cw.block_start(line=f"{type_name(ri, 'reply')}")
     if ri.op_mode == 'dump':
         ri.cw.p(f"{type_name(ri, 'reply')} *next;")
-    elif ri.op_mode == 'notify' or ri.op_mode == 'event':
+    elif ri.op_mode in ('notify', 'event'):
         ri.cw.p('__u16 family;')
         ri.cw.p('__u8 cmd;')
         ri.cw.p('struct ynl_ntf_base_type *next;')
@@ -2946,7 +2951,7 @@ def print_kernel_op_table_hdr(family, cw):
 
 def print_kernel_op_table(family, cw):
     print_kernel_op_table_fwd(family, cw, terminate=False)
-    if family.kernel_policy == 'global' or family.kernel_policy == 'per-op':
+    if family.kernel_policy in ('global', 'per-op'):
         for op_name, op in family.ops.items():
             if op.is_async:
                 continue
-- 
cgit v1.2.3


From 1ecc8ae876c41befc4d4f4f85c7abd42387d06e0 Mon Sep 17 00:00:00 2001
From: Donald Hunter <donald.hunter@gmail.com>
Date: Thu, 8 Jan 2026 16:13:39 +0000
Subject: tools: ynl-gen-c: Fix remaining pylint warnings

Fix the following pylint warning instances:

ynl_gen_c.py:575:15: E0606: Possibly using variable 'mem' before
assignment (possibly-used-before-assignment)

ynl_gen_c.py:888:0: R1707: Disallow trailing comma tuple
(trailing-comma-tuple)

ynl_gen_c.py:944:21: C0209: Formatting a regular string which could be an
f-string (consider-using-f-string)

ynl_gen_c.py:1450:14: C1802: Do not use `len(SEQUENCE)` without comparison
to determine if a sequence is empty (use-implicit-booleaness-not-len)

ynl_gen_c.py:1688:13: W1514: Using open without explicitly specifying an
encoding (unspecified-encoding)

ynl_gen_c.py:3446:0: C0325: Unnecessary parens after '=' keyword
(superfluous-parens)

Signed-off-by: Donald Hunter <donald.hunter@gmail.com>
Link: https://patch.msgid.link/20260108161339.29166-14-donald.hunter@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/net/ynl/pyynl/ynl_gen_c.py | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

(limited to 'tools')

diff --git a/tools/net/ynl/pyynl/ynl_gen_c.py b/tools/net/ynl/pyynl/ynl_gen_c.py
index 5f079a74c8d1..0e1e486c1185 100755
--- a/tools/net/ynl/pyynl/ynl_gen_c.py
+++ b/tools/net/ynl/pyynl/ynl_gen_c.py
@@ -571,6 +571,8 @@ class TypeBinary(Type):
             mem = 'NLA_POLICY_MIN_LEN(' + self.get_limit_str('min-len') + ')'
         elif 'max-len' in self.checks:
             mem = 'NLA_POLICY_MAX_LEN(' + self.get_limit_str('max-len') + ')'
+        else:
+            raise Exception('Failed to process policy check for binary type')
 
         return mem
 
@@ -885,7 +887,7 @@ class TypeIndexedArray(Type):
                 f"for (i = 0; i < {var}->{ref}_count.{self.c_name}; i++)",
                 f'{self.nested_render_name}_free(&{var}->{ref}{self.c_name}[i]);',
             ]
-        lines += f"free({var}->{ref}{self.c_name});",
+        lines += (f"free({var}->{ref}{self.c_name});",)
         return lines
 
 class TypeNestTypeValue(Type):
@@ -935,15 +937,15 @@ class TypeSubMessage(TypeNest):
         return typol
 
     def _attr_get(self, ri, var):
-        sel = c_lower(self['selector'])
+        selector = self['selector']
+        sel = c_lower(selector)
         if self.selector.is_external():
             sel_var = f"_sel_{sel}"
         else:
             sel_var = f"{var}->{sel}"
         get_lines = [f'if (!{sel_var})',
-                     'return ynl_submsg_failed(yarg, "%s", "%s");' %
-                        (self.name, self['selector']),
-                    f"if ({self.nested_render_name}_parse(&parg, {sel_var}, attr))",
+                     f'return ynl_submsg_failed(yarg, "{self.name}", "{selector}");',
+                     f"if ({self.nested_render_name}_parse(&parg, {sel_var}, attr))",
                      "return YNL_PARSE_CB_ERROR;"]
         init_lines = [f"parg.rsp_policy = &{self.nested_render_name}_nest;",
                       f"parg.data = &{var}->{self.c_name};"]
@@ -1447,7 +1449,7 @@ class Family(SpecFamily):
         attr_set_queue = list(self.root_sets.keys())
         attr_set_seen = set(self.root_sets.keys())
 
-        while len(attr_set_queue):
+        while attr_set_queue:
             a_set = attr_set_queue.pop(0)
             for attr, spec in self.attr_sets[a_set].items():
                 if 'nested-attributes' in spec:
@@ -1685,7 +1687,7 @@ class CodeWriter:
         if not self._overwrite and os.path.isfile(self._out_file):
             if filecmp.cmp(self._out.name, self._out_file, shallow=False):
                 return
-        with open(self._out_file, 'w+') as out_file:
+        with open(self._out_file, 'w+', encoding='utf-8') as out_file:
             self._out.seek(0)
             shutil.copyfileobj(self._out, out_file)
             self._out.close()
@@ -3443,7 +3445,7 @@ def main():
         print(exc)
         os.sys.exit(1)
 
-    cw = CodeWriter(BaseNlLib(), args.out_file, overwrite=(not args.cmp_out))
+    cw = CodeWriter(BaseNlLib(), args.out_file, overwrite=not args.cmp_out)
 
     _, spec_kernel = find_kernel_root(args.spec)
     if args.mode == 'uapi' or args.header:
-- 
cgit v1.2.3


From 4effccde0a0521b220c3585c9a0d8e677d345209 Mon Sep 17 00:00:00 2001
From: WanLi Niu <niuwl1@chinatelecom.cn>
Date: Tue, 6 Jan 2026 10:31:23 +0800
Subject: bpftool: Make skeleton C++ compatible with explicit casts

Fix C++ compilation errors in generated skeleton by adding explicit
pointer casts and use char * subtraction for offset calculation

error: invalid conversion from 'void*' to '<obj_name>*' [-fpermissive]
      |         skel = skel_alloc(sizeof(*skel));
      |                ~~~~~~~~~~^~~~~~~~~~~~~~~
      |                          |
      |                          void*

error: arithmetic on pointers to void
      |         skel->ctx.sz = (void *)&skel->links - (void *)skel;
      |                        ~~~~~~~~~~~~~~~~~~~~ ^ ~~~~~~~~~~~~

error: assigning to 'struct <obj_name>__<ident> *' from incompatible type 'void *'
      |                 skel-><ident> = skel_prep_map_data((void *)data, 4096,
      |                             ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
      |                                                 sizeof(data) - 1);
      |                                                 ~~~~~~~~~~~~~~~~~

error: assigning to 'struct <obj_name>__<ident> *' from incompatible type 'void *'
      |         skel-><ident> = skel_finalize_map_data(&skel->maps.<ident>.initial_value,
      |                     ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
      |                                         4096, PROT_READ | PROT_WRITE, skel->maps.<ident>.map_fd);
      |                                         ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Minimum reproducer:

	$ cat test.bpf.c
	int val; // placed in .bss section

	#include "vmlinux.h"
	#include <bpf/bpf_helpers.h>

	SEC("raw_tracepoint/sched_wakeup_new") int handle(void *ctx) { return 0; }

	$ cat test.cpp
	#include <cerrno>

	extern "C" {
	#include "test.bpf.skel.h"
	}

	$ bpftool btf dump file /sys/kernel/btf/vmlinux format c > vmlinux.h
	$ clang -g -O2 -target bpf -c test.bpf.c -o test.bpf.o
	$ bpftool gen skeleton test.bpf.o -L  > test.bpf.skel.h
	$ g++ -c test.cpp -I.

Co-developed-by: Menglong Dong <dongml2@chinatelecom.cn>
Signed-off-by: WanLi Niu <niuwl1@chinatelecom.cn>
Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20260106023123.2928-1-kiraskyler@163.com
---
 tools/bpf/bpftool/gen.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/bpf/bpftool/gen.c b/tools/bpf/bpftool/gen.c
index 993c7d9484a4..2f9e10752e28 100644
--- a/tools/bpf/bpftool/gen.c
+++ b/tools/bpf/bpftool/gen.c
@@ -731,10 +731,10 @@ static int gen_trace(struct bpf_object *obj, const char *obj_name, const char *h
 		{							    \n\
 			struct %1$s *skel;				    \n\
 									    \n\
-			skel = skel_alloc(sizeof(*skel));		    \n\
+			skel = (struct %1$s *)skel_alloc(sizeof(*skel));    \n\
 			if (!skel)					    \n\
 				goto cleanup;				    \n\
-			skel->ctx.sz = (void *)&skel->links - (void *)skel; \n\
+			skel->ctx.sz = (char *)&skel->links - (char *)skel; \n\
 		",
 		obj_name, opts.data_sz);
 	bpf_object__for_each_map(map, obj) {
@@ -755,7 +755,7 @@ static int gen_trace(struct bpf_object *obj, const char *obj_name, const char *h
 		\n\
 		\";							    \n\
 									    \n\
-				skel->%1$s = skel_prep_map_data((void *)data, %2$zd,\n\
+				skel->%1$s = (__typeof__(skel->%1$s))skel_prep_map_data((void *)data, %2$zd,\n\
 								sizeof(data) - 1);\n\
 				if (!skel->%1$s)			    \n\
 					goto cleanup;			    \n\
@@ -857,7 +857,7 @@ static int gen_trace(struct bpf_object *obj, const char *obj_name, const char *h
 
 		codegen("\
 		\n\
-			skel->%1$s = skel_finalize_map_data(&skel->maps.%1$s.initial_value,  \n\
+			skel->%1$s = (__typeof__(skel->%1$s))skel_finalize_map_data(&skel->maps.%1$s.initial_value,\n\
 							%2$zd, %3$s, skel->maps.%1$s.map_fd);\n\
 			if (!skel->%1$s)				    \n\
 				return -ENOMEM;				    \n\
-- 
cgit v1.2.3


From 671ef08d9455f5754d1fc96f5a14e357d6b80936 Mon Sep 17 00:00:00 2001
From: Xiaochen Shen <shenxiaochen@open-hieco.net>
Date: Wed, 17 Dec 2025 11:04:53 +0800
Subject: selftests/resctrl: Fix a division by zero error on Hygon

Change to adjust effective L3 cache size with SNC enabled change
introduced the snc_nodes_per_l3_cache() function to detect the Intel
Sub-NUMA Clustering (SNC) feature by comparing #CPUs in node0 with #CPUs
sharing LLC with CPU0. The function was designed to return:
  (1) >1: SNC mode is enabled.
  (2)  1: SNC mode is not enabled or not supported.

However, on certain Hygon CPUs, #CPUs sharing LLC with CPU0 is actually
less than #CPUs in node0. This results in snc_nodes_per_l3_cache()
returning 0 (calculated as cache_cpus / node_cpus).

This leads to a division by zero error in get_cache_size():
  *cache_size /= snc_nodes_per_l3_cache();

Causing the resctrl selftest to fail with:
  "Floating point exception (core dumped)"

Fix the issue by ensuring snc_nodes_per_l3_cache() returns 1 when SNC
mode is not supported on the platform.

Updated commit log to fix commit has issues:
Shuah Khan <skhan@linuxfoundation.org>

Link: https://lore.kernel.org/r/20251217030456.3834956-2-shenxiaochen@open-hieco.net
Fixes: a1cd99e700ec ("selftests/resctrl: Adjust effective L3 cache size with SNC enabled")
Signed-off-by: Xiaochen Shen <shenxiaochen@open-hieco.net>
Reviewed-by: Reinette Chatre <reinette.chatre@intel.com>
Reviewed-by: Fenghua Yu <fenghuay@nvidia.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/resctrl/resctrlfs.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/resctrl/resctrlfs.c b/tools/testing/selftests/resctrl/resctrlfs.c
index 195f04c4d158..b9c1bfb6cc02 100644
--- a/tools/testing/selftests/resctrl/resctrlfs.c
+++ b/tools/testing/selftests/resctrl/resctrlfs.c
@@ -243,6 +243,16 @@ int snc_nodes_per_l3_cache(void)
 		}
 		snc_mode = cache_cpus / node_cpus;
 
+		/*
+		 * On some platforms (e.g. Hygon),
+		 * cache_cpus < node_cpus, the calculated snc_mode is 0.
+		 *
+		 * Set snc_mode = 1 to indicate that SNC mode is not
+		 * supported on the platform.
+		 */
+		if (!snc_mode)
+			snc_mode = 1;
+
 		if (snc_mode > 1)
 			ksft_print_msg("SNC-%d mode discovered.\n", snc_mode);
 	}
-- 
cgit v1.2.3


From 4f4f01cc333e97b0e63b61ed1a65c928aa662f99 Mon Sep 17 00:00:00 2001
From: Xiaochen Shen <shenxiaochen@open-hieco.net>
Date: Wed, 17 Dec 2025 11:04:54 +0800
Subject: selftests/resctrl: Define CPU vendor IDs as bits to match usage

The CPU vendor IDs are required to be unique bits because they're used
for vendor_specific bitmask in the struct resctrl_test.
Consider for example their usage in test_vendor_specific_check():
	return get_vendor() & test->vendor_specific

However, the definitions of CPU vendor IDs in file resctrl.h is quite
subtle as a bitmask value:
  #define ARCH_INTEL     1
  #define ARCH_AMD       2

A clearer and more maintainable approach is to define these CPU vendor
IDs using BIT(). This ensures each vendor corresponds to a distinct bit
and makes it obvious when adding new vendor IDs.

Accordingly, update the return types of detect_vendor() and get_vendor()
from 'int' to 'unsigned int' to align with their usage as bitmask values
and to prevent potentially risky type conversions.

Furthermore, introduce a bool flag 'initialized' to simplify the
get_vendor() -> detect_vendor() logic. This ensures the vendor ID is
detected only once and resolves the ambiguity of using the same variable
'vendor' both as a value and as a state.

Link: https://lore.kernel.org/r/20251217030456.3834956-3-shenxiaochen@open-hieco.net
Suggested-by: Reinette Chatre <reinette.chatre@intel.com>
Suggested-by: Fenghua Yu <fenghuay@nvidia.com>
Signed-off-by: Xiaochen Shen <shenxiaochen@open-hieco.net>
Reviewed-by: Reinette Chatre <reinette.chatre@intel.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/resctrl/resctrl.h       |  7 ++++---
 tools/testing/selftests/resctrl/resctrl_tests.c | 26 +++++++++++++++++--------
 2 files changed, 22 insertions(+), 11 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/resctrl/resctrl.h b/tools/testing/selftests/resctrl/resctrl.h
index 3c51bdac2dfa..4f9c7d04c98d 100644
--- a/tools/testing/selftests/resctrl/resctrl.h
+++ b/tools/testing/selftests/resctrl/resctrl.h
@@ -23,6 +23,7 @@
 #include <asm/unistd.h>
 #include <linux/perf_event.h>
 #include <linux/compiler.h>
+#include <linux/bits.h>
 #include "kselftest.h"
 
 #define MB			(1024 * 1024)
@@ -36,8 +37,8 @@
  * Define as bits because they're used for vendor_specific bitmask in
  * the struct resctrl_test.
  */
-#define ARCH_INTEL     1
-#define ARCH_AMD       2
+#define ARCH_INTEL	BIT(0)
+#define ARCH_AMD	BIT(1)
 
 #define END_OF_TESTS	1
 
@@ -163,7 +164,7 @@ extern int snc_unreliable;
 extern char llc_occup_path[1024];
 
 int snc_nodes_per_l3_cache(void);
-int get_vendor(void);
+unsigned int get_vendor(void);
 bool check_resctrlfs_support(void);
 int filter_dmesg(void);
 int get_domain_id(const char *resource, int cpu_no, int *domain_id);
diff --git a/tools/testing/selftests/resctrl/resctrl_tests.c b/tools/testing/selftests/resctrl/resctrl_tests.c
index 5154ffd821c4..42605e2a3b66 100644
--- a/tools/testing/selftests/resctrl/resctrl_tests.c
+++ b/tools/testing/selftests/resctrl/resctrl_tests.c
@@ -23,16 +23,24 @@ static struct resctrl_test *resctrl_tests[] = {
 	&l2_noncont_cat_test,
 };
 
-static int detect_vendor(void)
+static unsigned int detect_vendor(void)
 {
-	FILE *inf = fopen("/proc/cpuinfo", "r");
-	int vendor_id = 0;
+	static unsigned int vendor_id;
+	static bool initialized;
 	char *s = NULL;
+	FILE *inf;
 	char *res;
 
-	if (!inf)
+	if (initialized)
 		return vendor_id;
 
+	inf = fopen("/proc/cpuinfo", "r");
+	if (!inf) {
+		vendor_id = 0;
+		initialized = true;
+		return vendor_id;
+	}
+
 	res = fgrep(inf, "vendor_id");
 
 	if (res)
@@ -45,15 +53,17 @@ static int detect_vendor(void)
 
 	fclose(inf);
 	free(res);
+
+	initialized = true;
 	return vendor_id;
 }
 
-int get_vendor(void)
+unsigned int get_vendor(void)
 {
-	static int vendor = -1;
+	unsigned int vendor;
+
+	vendor = detect_vendor();
 
-	if (vendor == -1)
-		vendor = detect_vendor();
 	if (vendor == 0)
 		ksft_print_msg("Can not get vendor info...\n");
 
-- 
cgit v1.2.3


From 367f931e6476747edbde4e7c7b95fc5d5b724934 Mon Sep 17 00:00:00 2001
From: Xiaochen Shen <shenxiaochen@open-hieco.net>
Date: Wed, 17 Dec 2025 11:04:55 +0800
Subject: selftests/resctrl: Add CPU vendor detection for Hygon

The resctrl selftest currently fails on Hygon CPUs that support Platform
QoS features, printing the error:

  "# Can not get vendor info..."

This occurs because vendor detection is missing for Hygon CPUs.

Fix this by extending the CPU vendor detection logic to include
Hygon's vendor ID.

Link: https://lore.kernel.org/r/20251217030456.3834956-4-shenxiaochen@open-hieco.net
Signed-off-by: Xiaochen Shen <shenxiaochen@open-hieco.net>
Reviewed-by: Reinette Chatre <reinette.chatre@intel.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/resctrl/resctrl.h       | 1 +
 tools/testing/selftests/resctrl/resctrl_tests.c | 2 ++
 2 files changed, 3 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/resctrl/resctrl.h b/tools/testing/selftests/resctrl/resctrl.h
index 4f9c7d04c98d..afe635b6e48d 100644
--- a/tools/testing/selftests/resctrl/resctrl.h
+++ b/tools/testing/selftests/resctrl/resctrl.h
@@ -39,6 +39,7 @@
  */
 #define ARCH_INTEL	BIT(0)
 #define ARCH_AMD	BIT(1)
+#define ARCH_HYGON	BIT(2)
 
 #define END_OF_TESTS	1
 
diff --git a/tools/testing/selftests/resctrl/resctrl_tests.c b/tools/testing/selftests/resctrl/resctrl_tests.c
index 42605e2a3b66..dbcd5eea9fbc 100644
--- a/tools/testing/selftests/resctrl/resctrl_tests.c
+++ b/tools/testing/selftests/resctrl/resctrl_tests.c
@@ -50,6 +50,8 @@ static unsigned int detect_vendor(void)
 		vendor_id = ARCH_INTEL;
 	else if (s && !strcmp(s, ": AuthenticAMD\n"))
 		vendor_id = ARCH_AMD;
+	else if (s && !strcmp(s, ": HygonGenuine\n"))
+		vendor_id = ARCH_HYGON;
 
 	fclose(inf);
 	free(res);
-- 
cgit v1.2.3


From 86063a2568b8f2eeb68da1411b320c0ff778f852 Mon Sep 17 00:00:00 2001
From: Xiaochen Shen <shenxiaochen@open-hieco.net>
Date: Wed, 17 Dec 2025 11:04:56 +0800
Subject: selftests/resctrl: Fix non-contiguous CBM check for Hygon

The resctrl selftest currently fails on Hygon CPUs that always supports
non-contiguous CBM, printing the error:

  "# Hardware and kernel differ on non-contiguous CBM support!"

This occurs because the arch_supports_noncont_cat() function lacks
vendor detection for Hygon CPUs, preventing proper identification of
their non-contiguous CBM capability.

Fix this by adding Hygon vendor ID detection to
arch_supports_noncont_cat().

Link: https://lore.kernel.org/r/20251217030456.3834956-5-shenxiaochen@open-hieco.net
Signed-off-by: Xiaochen Shen <shenxiaochen@open-hieco.net>
Reviewed-by: Reinette Chatre <reinette.chatre@intel.com>
Reviewed-by: Fenghua Yu <fenghuay@nvidia.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/resctrl/cat_test.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/resctrl/cat_test.c b/tools/testing/selftests/resctrl/cat_test.c
index 94cfdba5308d..f00b622c1460 100644
--- a/tools/testing/selftests/resctrl/cat_test.c
+++ b/tools/testing/selftests/resctrl/cat_test.c
@@ -290,8 +290,10 @@ static int cat_run_test(const struct resctrl_test *test, const struct user_param
 
 static bool arch_supports_noncont_cat(const struct resctrl_test *test)
 {
-	/* AMD always supports non-contiguous CBM. */
-	if (get_vendor() == ARCH_AMD)
+	unsigned int vendor_id = get_vendor();
+
+	/* AMD and Hygon always support non-contiguous CBM. */
+	if (vendor_id == ARCH_AMD || vendor_id == ARCH_HYGON)
 		return true;
 
 #if defined(__i386__) || defined(__x86_64__) /* arch */
-- 
cgit v1.2.3


From 5714ca8cba5ed736f3733663c446cbee63a10a64 Mon Sep 17 00:00:00 2001
From: Varun R Mallya <varunrmallya@gmail.com>
Date: Wed, 7 Jan 2026 05:05:27 +0530
Subject: libbpf: Fix OOB read in btf_dump_get_bitfield_value

When dumping bitfield data, btf_dump_get_bitfield_value() reads data
based on the underlying type's size (t->size). However, it does not
verify that the provided data buffer (data_sz) is large enough to
contain these bytes.

If btf_dump__dump_type_data() is called with a buffer smaller than
the type's size, this leads to an out-of-bounds read. This was
confirmed by AddressSanitizer in the linked issue.

Fix this by ensuring we do not read past the provided data_sz limit.

Fixes: a1d3cc3c5eca ("libbpf: Avoid use of __int128 in typed dump display")
Reported-by: Harrison Green <harrisonmichaelgreen@gmail.com>
Suggested-by: Alan Maguire <alan.maguire@oracle.com>
Signed-off-by: Varun R Mallya <varunrmallya@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20260106233527.163487-1-varunrmallya@gmail.com

Closes: https://github.com/libbpf/libbpf/issues/928
---
 tools/lib/bpf/btf_dump.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'tools')

diff --git a/tools/lib/bpf/btf_dump.c b/tools/lib/bpf/btf_dump.c
index 6388392f49a0..53c6624161d7 100644
--- a/tools/lib/bpf/btf_dump.c
+++ b/tools/lib/bpf/btf_dump.c
@@ -1762,9 +1762,18 @@ static int btf_dump_get_bitfield_value(struct btf_dump *d,
 	__u16 left_shift_bits, right_shift_bits;
 	const __u8 *bytes = data;
 	__u8 nr_copy_bits;
+	__u8 start_bit, nr_bytes;
 	__u64 num = 0;
 	int i;
 
+	/* Calculate how many bytes cover the bitfield */
+	start_bit = bits_offset % 8;
+	nr_bytes = (start_bit + bit_sz + 7) / 8;
+
+	/* Bound check */
+	if (data + nr_bytes > d->typed_dump->data_end)
+		return -E2BIG;
+
 	/* Maximum supported bitfield size is 64 bits */
 	if (t->size > 8) {
 		pr_warn("unexpected bitfield size %d\n", t->size);
-- 
cgit v1.2.3


From 96ea4fa60c4528d95bdbce7f4212c015ab3e8113 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Tue, 6 Jan 2026 12:02:05 -0800
Subject: selftests: tls: avoid flakiness in data_steal

We see the following failure a few times a week:

  #  RUN           global.data_steal ...
  # tls.c:3280:data_steal:Expected recv(cfd, buf2, sizeof(buf2), MSG_DONTWAIT) (10000) == -1 (-1)
  # data_steal: Test failed
  #          FAIL  global.data_steal
  not ok 8 global.data_steal

The 10000 bytes read suggests that the child process did a recv()
of half of the data using the TLS ULP and we're now getting the
remaining half. The intent of the test is to get the child to
enter _TCP_ recvmsg handler, so it needs to enter the syscall before
parent installed the TLS recvmsg with setsockopt(SOL_TLS).

Instead of the 10msec sleep send 1 byte of data and wait for the
child to consume it.

Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Link: https://patch.msgid.link/20260106200205.1593915-1-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/tls.c | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/tls.c b/tools/testing/selftests/net/tls.c
index a4d16a460fbe..9e2ccea13d70 100644
--- a/tools/testing/selftests/net/tls.c
+++ b/tools/testing/selftests/net/tls.c
@@ -3260,17 +3260,25 @@ TEST(data_steal) {
 	ASSERT_EQ(setsockopt(cfd, IPPROTO_TCP, TCP_ULP, "tls", sizeof("tls")), 0);
 
 	/* Spawn a child and get it into the read wait path of the underlying
-	 * TCP socket.
+	 * TCP socket (before kernel .recvmsg is replaced with the TLS one).
 	 */
 	pid = fork();
 	ASSERT_GE(pid, 0);
 	if (!pid) {
-		EXPECT_EQ(recv(cfd, buf, sizeof(buf) / 2, MSG_WAITALL),
-			  sizeof(buf) / 2);
+		EXPECT_EQ(recv(cfd, buf, sizeof(buf) / 2 + 1, MSG_WAITALL),
+			  sizeof(buf) / 2 + 1);
 		exit(!__test_passed(_metadata));
 	}
 
-	usleep(10000);
+	/* Send a sync byte and poll until it's consumed to ensure
+	 * the child is in recv() before we proceed to install TLS.
+	 */
+	ASSERT_EQ(send(fd, buf, 1, 0), 1);
+	do {
+		usleep(500);
+	} while (recv(cfd, buf, 1, MSG_PEEK | MSG_DONTWAIT) == 1);
+	EXPECT_EQ(errno, EAGAIN);
+
 	ASSERT_EQ(setsockopt(fd, SOL_TLS, TLS_TX, &tls, tls.len), 0);
 	ASSERT_EQ(setsockopt(cfd, SOL_TLS, TLS_RX, &tls, tls.len), 0);
 
-- 
cgit v1.2.3


From a0ac0ff382767a5dbde266fb5f7997a5d6b70e10 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Wed, 7 Jan 2026 15:25:57 -0800
Subject: selftests: drv-net: gro: increase the rcvbuf size

The gro.py test (testing software GRO) is slightly flaky when
running against fbnic. We see one flake per roughly 20 runs in NIPA,
mostly in ipip.large, and always including some EAGAIN:

  # Shouldn't coalesce if exceed IP max pkt size: Test succeeded
  # Expected {65475 899 }, Total 2 packets
  # Received {65475 899 }, Total 2 packets.
  # Expected {64576 900 900 }, Total 3 packets
  # Received {64576 /home/virtme/testing/wt-24/tools/testing/selftests/drivers/net/gro: could not receive: Resource temporarily unavailable

The test sends 2 large frames (64k + change). Looks like the default
packet socket rcvbuf (~200kB) may not be large enough to hold them.
Bump the rcvbuf to 1MB.

Add a debug print showing socket statistics to make debugging this
issue easier in the future. Without the rcvbuf increase we see:

  # Shouldn't coalesce if exceed IP max pkt size: Test succeeded
  # Expected {65475 899 }, Total 2 packets
  # Received {65475 899 }, Total 2 packets.
  # Expected {64576 900 900 }, Total 3 packets
  # Received {64576 Socket stats: packets=7, drops=3
                    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  # /home/virtme/testing/wt-24/tools/testing/selftests/drivers/net/gro: could not receive: Resource temporarily unavailable

Reviewed-by: Willem de Bruijn <willemb@google.com>
Link: https://patch.msgid.link/20260107232557.2147760-1-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/drivers/net/gro.c | 25 ++++++++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/drivers/net/gro.c b/tools/testing/selftests/drivers/net/gro.c
index e894037d2e3e..751a8103f408 100644
--- a/tools/testing/selftests/drivers/net/gro.c
+++ b/tools/testing/selftests/drivers/net/gro.c
@@ -926,6 +926,28 @@ static void set_timeout(int fd)
 		error(1, errno, "cannot set timeout, setsockopt failed");
 }
 
+static void set_rcvbuf(int fd)
+{
+	int bufsize = 1 * 1024 * 1024; /* 1 MB */
+
+	if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &bufsize, sizeof(bufsize)))
+		error(1, errno, "cannot set rcvbuf size, setsockopt failed");
+}
+
+static void recv_error(int fd, int rcv_errno)
+{
+	struct tpacket_stats stats;
+	socklen_t len;
+
+	len = sizeof(stats);
+	if (getsockopt(fd, SOL_PACKET, PACKET_STATISTICS, &stats, &len))
+		error(1, errno, "can't get stats");
+
+	fprintf(stderr, "Socket stats: packets=%u, drops=%u\n",
+		stats.tp_packets, stats.tp_drops);
+	error(1, rcv_errno, "could not receive");
+}
+
 static void check_recv_pkts(int fd, int *correct_payload,
 			    int correct_num_pkts)
 {
@@ -950,7 +972,7 @@ static void check_recv_pkts(int fd, int *correct_payload,
 		ip_ext_len = 0;
 		pkt_size = recv(fd, buffer, IP_MAXPACKET + ETH_HLEN + 1, 0);
 		if (pkt_size < 0)
-			error(1, errno, "could not receive");
+			recv_error(fd, errno);
 
 		if (iph->version == 4)
 			ip_ext_len = (iph->ihl - 5) * 4;
@@ -1126,6 +1148,7 @@ static void gro_receiver(void)
 		error(1, 0, "socket creation");
 	setup_sock_filter(rxfd);
 	set_timeout(rxfd);
+	set_rcvbuf(rxfd);
 	bind_packetsocket(rxfd);
 
 	ksft_ready();
-- 
cgit v1.2.3


From 68ec2b9fc59e8053f17ee1d1a5b1959c43a19202 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Wed, 7 Jan 2026 06:53:19 -0800
Subject: selftests: forwarding: update PTP tcpdump patterns

Recent version of tcpdump (tcpdump-4.99.6-1.fc43.x86_64) seems to have
removed the spurious space after msg type in PTP info, e.g.:

 before:  PTPv2, majorSdoId: 0x0, msg type : sync msg, length: 44
 after:   PTPv2, majorSdoId: 0x0, msg type: sync msg, length: 44

Update our patterns to match both.

Reviewed-by: Alexander Sverdlin <alexander.sverdlin@gmail.com>
Link: https://patch.msgid.link/20260107145320.1837464-1-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../selftests/net/forwarding/local_termination.sh      | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/forwarding/local_termination.sh b/tools/testing/selftests/net/forwarding/local_termination.sh
index 892895659c7e..1f2bf6e81847 100755
--- a/tools/testing/selftests/net/forwarding/local_termination.sh
+++ b/tools/testing/selftests/net/forwarding/local_termination.sh
@@ -306,39 +306,39 @@ run_test()
 
 	if [ $skip_ptp = false ]; then
 		check_rcv $rcv_if_name "1588v2 over L2 transport, Sync" \
-			"ethertype PTP (0x88f7).* PTPv2.* msg type : sync msg" \
+			"ethertype PTP (0x88f7).* PTPv2.* msg type *: sync msg" \
 			true "$test_name"
 
 		check_rcv $rcv_if_name "1588v2 over L2 transport, Follow-Up" \
-			"ethertype PTP (0x88f7).* PTPv2.* msg type : follow up msg" \
+			"ethertype PTP (0x88f7).* PTPv2.* msg type *: follow up msg" \
 			true "$test_name"
 
 		check_rcv $rcv_if_name "1588v2 over L2 transport, Peer Delay Request" \
-			"ethertype PTP (0x88f7).* PTPv2.* msg type : peer delay req msg" \
+			"ethertype PTP (0x88f7).* PTPv2.* msg type *: peer delay req msg" \
 			true "$test_name"
 
 		check_rcv $rcv_if_name "1588v2 over IPv4, Sync" \
-			"ethertype IPv4 (0x0800).* PTPv2.* msg type : sync msg" \
+			"ethertype IPv4 (0x0800).* PTPv2.* msg type *: sync msg" \
 			true "$test_name"
 
 		check_rcv $rcv_if_name "1588v2 over IPv4, Follow-Up" \
-			"ethertype IPv4 (0x0800).* PTPv2.* msg type : follow up msg" \
+			"ethertype IPv4 (0x0800).* PTPv2.* msg type *: follow up msg" \
 			true "$test_name"
 
 		check_rcv $rcv_if_name "1588v2 over IPv4, Peer Delay Request" \
-			"ethertype IPv4 (0x0800).* PTPv2.* msg type : peer delay req msg" \
+			"ethertype IPv4 (0x0800).* PTPv2.* msg type *: peer delay req msg" \
 			true "$test_name"
 
 		check_rcv $rcv_if_name "1588v2 over IPv6, Sync" \
-			"ethertype IPv6 (0x86dd).* PTPv2.* msg type : sync msg" \
+			"ethertype IPv6 (0x86dd).* PTPv2.* msg type *: sync msg" \
 			true "$test_name"
 
 		check_rcv $rcv_if_name "1588v2 over IPv6, Follow-Up" \
-			"ethertype IPv6 (0x86dd).* PTPv2.* msg type : follow up msg" \
+			"ethertype IPv6 (0x86dd).* PTPv2.* msg type *: follow up msg" \
 			true "$test_name"
 
 		check_rcv $rcv_if_name "1588v2 over IPv6, Peer Delay Request" \
-			"ethertype IPv6 (0x86dd).* PTPv2.* msg type : peer delay req msg" \
+			"ethertype IPv6 (0x86dd).* PTPv2.* msg type *: peer delay req msg" \
 			true "$test_name"
 	fi
 
-- 
cgit v1.2.3


From a1025dcd377ef92d9a09af03b70ce80be281ee22 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Wed, 24 Dec 2025 00:44:49 +0100
Subject: selftests: kvm: replace numbered sync points with actions

Rework the guest=>host syncs in the AMX test to use named actions instead
of arbitrary, incrementing numbers.  The "stage" of the test has no real
meaning, what matters is what action the test wants the host to perform.
The incrementing numbers are somewhat helpful for triaging failures, but
fully debugging failures almost always requires a much deeper dive into
the test (and KVM).

Using named actions not only makes it easier to extend the test without
having to shift all sync point numbers, it makes the code easier to read.

[Commit message by Sean Christopherson]

Cc: stable@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 tools/testing/selftests/kvm/x86/amx_test.c | 88 +++++++++++++++---------------
 1 file changed, 43 insertions(+), 45 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/x86/amx_test.c b/tools/testing/selftests/kvm/x86/amx_test.c
index f4ce5a185a7d..3de4402ac17d 100644
--- a/tools/testing/selftests/kvm/x86/amx_test.c
+++ b/tools/testing/selftests/kvm/x86/amx_test.c
@@ -124,6 +124,14 @@ static void set_tilecfg(struct tile_config *cfg)
 	}
 }
 
+enum {
+	/* Check TMM0 against tiledata */
+	TEST_COMPARE_TILEDATA = 1,
+
+	/* Full VM save/restore */
+	TEST_SAVE_RESTORE = 2,
+};
+
 static void __attribute__((__flatten__)) guest_code(struct tile_config *amx_cfg,
 						    struct tile_data *tiledata,
 						    struct xstate *xstate)
@@ -131,20 +139,20 @@ static void __attribute__((__flatten__)) guest_code(struct tile_config *amx_cfg,
 	GUEST_ASSERT(this_cpu_has(X86_FEATURE_XSAVE) &&
 		     this_cpu_has(X86_FEATURE_OSXSAVE));
 	check_xtile_info();
-	GUEST_SYNC(1);
+	GUEST_SYNC(TEST_SAVE_RESTORE);
 
 	/* xfd=0, enable amx */
 	wrmsr(MSR_IA32_XFD, 0);
-	GUEST_SYNC(2);
+	GUEST_SYNC(TEST_SAVE_RESTORE);
 	GUEST_ASSERT(rdmsr(MSR_IA32_XFD) == 0);
 	set_tilecfg(amx_cfg);
 	__ldtilecfg(amx_cfg);
-	GUEST_SYNC(3);
+	GUEST_SYNC(TEST_SAVE_RESTORE);
 	/* Check save/restore when trap to userspace */
 	__tileloadd(tiledata);
-	GUEST_SYNC(4);
+	GUEST_SYNC(TEST_COMPARE_TILEDATA | TEST_SAVE_RESTORE);
 	__tilerelease();
-	GUEST_SYNC(5);
+	GUEST_SYNC(TEST_SAVE_RESTORE);
 	/*
 	 * After XSAVEC, XTILEDATA is cleared in the xstate_bv but is set in
 	 * the xcomp_bv.
@@ -154,6 +162,8 @@ static void __attribute__((__flatten__)) guest_code(struct tile_config *amx_cfg,
 	GUEST_ASSERT(!(xstate->header.xstate_bv & XFEATURE_MASK_XTILE_DATA));
 	GUEST_ASSERT(xstate->header.xcomp_bv & XFEATURE_MASK_XTILE_DATA);
 
+	/* #NM test */
+
 	/* xfd=0x40000, disable amx tiledata */
 	wrmsr(MSR_IA32_XFD, XFEATURE_MASK_XTILE_DATA);
 
@@ -166,13 +176,13 @@ static void __attribute__((__flatten__)) guest_code(struct tile_config *amx_cfg,
 	GUEST_ASSERT(!(xstate->header.xstate_bv & XFEATURE_MASK_XTILE_DATA));
 	GUEST_ASSERT((xstate->header.xcomp_bv & XFEATURE_MASK_XTILE_DATA));
 
-	GUEST_SYNC(6);
+	GUEST_SYNC(TEST_SAVE_RESTORE);
 	GUEST_ASSERT(rdmsr(MSR_IA32_XFD) == XFEATURE_MASK_XTILE_DATA);
 	set_tilecfg(amx_cfg);
 	__ldtilecfg(amx_cfg);
 	/* Trigger #NM exception */
 	__tileloadd(tiledata);
-	GUEST_SYNC(10);
+	GUEST_SYNC(TEST_COMPARE_TILEDATA | TEST_SAVE_RESTORE);
 
 	GUEST_DONE();
 }
@@ -180,18 +190,18 @@ static void __attribute__((__flatten__)) guest_code(struct tile_config *amx_cfg,
 void guest_nm_handler(struct ex_regs *regs)
 {
 	/* Check if #NM is triggered by XFEATURE_MASK_XTILE_DATA */
-	GUEST_SYNC(7);
+	GUEST_SYNC(TEST_SAVE_RESTORE);
 	GUEST_ASSERT(!(get_cr0() & X86_CR0_TS));
 	GUEST_ASSERT(rdmsr(MSR_IA32_XFD_ERR) == XFEATURE_MASK_XTILE_DATA);
 	GUEST_ASSERT(rdmsr(MSR_IA32_XFD) == XFEATURE_MASK_XTILE_DATA);
-	GUEST_SYNC(8);
+	GUEST_SYNC(TEST_SAVE_RESTORE);
 	GUEST_ASSERT(rdmsr(MSR_IA32_XFD_ERR) == XFEATURE_MASK_XTILE_DATA);
 	GUEST_ASSERT(rdmsr(MSR_IA32_XFD) == XFEATURE_MASK_XTILE_DATA);
 	/* Clear xfd_err */
 	wrmsr(MSR_IA32_XFD_ERR, 0);
 	/* xfd=0, enable amx */
 	wrmsr(MSR_IA32_XFD, 0);
-	GUEST_SYNC(9);
+	GUEST_SYNC(TEST_SAVE_RESTORE);
 }
 
 int main(int argc, char *argv[])
@@ -244,6 +254,7 @@ int main(int argc, char *argv[])
 	memset(addr_gva2hva(vm, xstate), 0, PAGE_SIZE * DIV_ROUND_UP(XSAVE_SIZE, PAGE_SIZE));
 	vcpu_args_set(vcpu, 3, amx_cfg, tiledata, xstate);
 
+	int iter = 0;
 	for (;;) {
 		vcpu_run(vcpu);
 		TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
@@ -253,20 +264,9 @@ int main(int argc, char *argv[])
 			REPORT_GUEST_ASSERT(uc);
 			/* NOT REACHED */
 		case UCALL_SYNC:
-			switch (uc.args[1]) {
-			case 1:
-			case 2:
-			case 3:
-			case 5:
-			case 6:
-			case 7:
-			case 8:
-				fprintf(stderr, "GUEST_SYNC(%ld)\n", uc.args[1]);
-				break;
-			case 4:
-			case 10:
-				fprintf(stderr,
-				"GUEST_SYNC(%ld), check save/restore status\n", uc.args[1]);
+			++iter;
+			if (uc.args[1] & TEST_COMPARE_TILEDATA) {
+				fprintf(stderr, "GUEST_SYNC #%d, check TMM0 contents\n", iter);
 
 				/* Compacted mode, get amx offset by xsave area
 				 * size subtract 8K amx size.
@@ -279,11 +279,25 @@ int main(int argc, char *argv[])
 				ret = memcmp(amx_start, tiles_data, TILE_SIZE);
 				TEST_ASSERT(ret == 0, "memcmp failed, ret=%d", ret);
 				kvm_x86_state_cleanup(state);
-				break;
-			case 9:
-				fprintf(stderr,
-				"GUEST_SYNC(%ld), #NM exception and enable amx\n", uc.args[1]);
-				break;
+			}
+			if (uc.args[1] & TEST_SAVE_RESTORE) {
+				fprintf(stderr, "GUEST_SYNC #%d, save/restore VM state\n", iter);
+				state = vcpu_save_state(vcpu);
+				memset(&regs1, 0, sizeof(regs1));
+				vcpu_regs_get(vcpu, &regs1);
+
+				kvm_vm_release(vm);
+
+				/* Restore state in a new VM.  */
+				vcpu = vm_recreate_with_one_vcpu(vm);
+				vcpu_load_state(vcpu, state);
+				kvm_x86_state_cleanup(state);
+
+				memset(&regs2, 0, sizeof(regs2));
+				vcpu_regs_get(vcpu, &regs2);
+				TEST_ASSERT(!memcmp(&regs1, &regs2, sizeof(regs2)),
+					    "Unexpected register values after vcpu_load_state; rdi: %lx rsi: %lx",
+					    (ulong) regs2.rdi, (ulong) regs2.rsi);
 			}
 			break;
 		case UCALL_DONE:
@@ -293,22 +307,6 @@ int main(int argc, char *argv[])
 			TEST_FAIL("Unknown ucall %lu", uc.cmd);
 		}
 
-		state = vcpu_save_state(vcpu);
-		memset(&regs1, 0, sizeof(regs1));
-		vcpu_regs_get(vcpu, &regs1);
-
-		kvm_vm_release(vm);
-
-		/* Restore state in a new VM.  */
-		vcpu = vm_recreate_with_one_vcpu(vm);
-		vcpu_load_state(vcpu, state);
-		kvm_x86_state_cleanup(state);
-
-		memset(&regs2, 0, sizeof(regs2));
-		vcpu_regs_get(vcpu, &regs2);
-		TEST_ASSERT(!memcmp(&regs1, &regs2, sizeof(regs2)),
-			    "Unexpected register values after vcpu_load_state; rdi: %lx rsi: %lx",
-			    (ulong) regs2.rdi, (ulong) regs2.rsi);
 	}
 done:
 	kvm_vm_free(vm);
-- 
cgit v1.2.3


From 0383a8edef396cf0a6884b0be81d62bde60737b0 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Wed, 31 Dec 2025 16:47:26 +0100
Subject: selftests: kvm: try getting XFD and XSAVE state out of sync

The host is allowed to set FPU state that includes a disabled
xstate component.  Check that this does not cause bad effects.

Cc: stable@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 tools/testing/selftests/kvm/x86/amx_test.c | 38 +++++++++++++++++++++++-------
 1 file changed, 30 insertions(+), 8 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/x86/amx_test.c b/tools/testing/selftests/kvm/x86/amx_test.c
index 3de4402ac17d..bee56c1f7833 100644
--- a/tools/testing/selftests/kvm/x86/amx_test.c
+++ b/tools/testing/selftests/kvm/x86/amx_test.c
@@ -125,11 +125,17 @@ static void set_tilecfg(struct tile_config *cfg)
 }
 
 enum {
+	/* Retrieve TMM0 from guest, stash it for TEST_RESTORE_TILEDATA */
+	TEST_SAVE_TILEDATA = 1,
+
 	/* Check TMM0 against tiledata */
-	TEST_COMPARE_TILEDATA = 1,
+	TEST_COMPARE_TILEDATA = 2,
+
+	/* Restore TMM0 from earlier save */
+	TEST_RESTORE_TILEDATA = 4,
 
 	/* Full VM save/restore */
-	TEST_SAVE_RESTORE = 2,
+	TEST_SAVE_RESTORE = 8,
 };
 
 static void __attribute__((__flatten__)) guest_code(struct tile_config *amx_cfg,
@@ -150,7 +156,16 @@ static void __attribute__((__flatten__)) guest_code(struct tile_config *amx_cfg,
 	GUEST_SYNC(TEST_SAVE_RESTORE);
 	/* Check save/restore when trap to userspace */
 	__tileloadd(tiledata);
-	GUEST_SYNC(TEST_COMPARE_TILEDATA | TEST_SAVE_RESTORE);
+	GUEST_SYNC(TEST_SAVE_TILEDATA | TEST_COMPARE_TILEDATA | TEST_SAVE_RESTORE);
+
+	/* xfd=0x40000, disable amx tiledata */
+	wrmsr(MSR_IA32_XFD, XFEATURE_MASK_XTILE_DATA);
+
+	/* host tries setting tiledata while guest XFD is set */
+	GUEST_SYNC(TEST_RESTORE_TILEDATA);
+	GUEST_SYNC(TEST_SAVE_RESTORE);
+
+	wrmsr(MSR_IA32_XFD, 0);
 	__tilerelease();
 	GUEST_SYNC(TEST_SAVE_RESTORE);
 	/*
@@ -210,10 +225,10 @@ int main(int argc, char *argv[])
 	struct kvm_vcpu *vcpu;
 	struct kvm_vm *vm;
 	struct kvm_x86_state *state;
+	struct kvm_x86_state *tile_state = NULL;
 	int xsave_restore_size;
 	vm_vaddr_t amx_cfg, tiledata, xstate;
 	struct ucall uc;
-	u32 amx_offset;
 	int ret;
 
 	/*
@@ -265,20 +280,27 @@ int main(int argc, char *argv[])
 			/* NOT REACHED */
 		case UCALL_SYNC:
 			++iter;
+			if (uc.args[1] & TEST_SAVE_TILEDATA) {
+				fprintf(stderr, "GUEST_SYNC #%d, save tiledata\n", iter);
+				tile_state = vcpu_save_state(vcpu);
+			}
 			if (uc.args[1] & TEST_COMPARE_TILEDATA) {
 				fprintf(stderr, "GUEST_SYNC #%d, check TMM0 contents\n", iter);
 
 				/* Compacted mode, get amx offset by xsave area
 				 * size subtract 8K amx size.
 				 */
-				amx_offset = xsave_restore_size - NUM_TILES*TILE_SIZE;
-				state = vcpu_save_state(vcpu);
-				void *amx_start = (void *)state->xsave + amx_offset;
+				u32 amx_offset = xsave_restore_size - NUM_TILES*TILE_SIZE;
+				void *amx_start = (void *)tile_state->xsave + amx_offset;
 				void *tiles_data = (void *)addr_gva2hva(vm, tiledata);
 				/* Only check TMM0 register, 1 tile */
 				ret = memcmp(amx_start, tiles_data, TILE_SIZE);
 				TEST_ASSERT(ret == 0, "memcmp failed, ret=%d", ret);
-				kvm_x86_state_cleanup(state);
+			}
+			if (uc.args[1] & TEST_RESTORE_TILEDATA) {
+				fprintf(stderr, "GUEST_SYNC #%d, before KVM_SET_XSAVE\n", iter);
+				vcpu_xsave_set(vcpu, tile_state->xsave);
+				fprintf(stderr, "GUEST_SYNC #%d, after KVM_SET_XSAVE\n", iter);
 			}
 			if (uc.args[1] & TEST_SAVE_RESTORE) {
 				fprintf(stderr, "GUEST_SYNC #%d, save/restore VM state\n", iter);
-- 
cgit v1.2.3


From 3611ca7c12b740e250d83f8bbe3554b740c503b0 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Mon, 29 Dec 2025 12:23:30 -0800
Subject: selftests: kvm: Verify TILELOADD actually #NM faults when XFD[18]=1

Rework the AMX test's #NM handling to use kvm_asm_safe() to verify an #NM
actually occurs.  As is, a completely missing #NM could go unnoticed.

Signed-off-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 tools/testing/selftests/kvm/x86/amx_test.c | 30 ++++++++++++++++++------------
 1 file changed, 18 insertions(+), 12 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/x86/amx_test.c b/tools/testing/selftests/kvm/x86/amx_test.c
index bee56c1f7833..37b166260ee3 100644
--- a/tools/testing/selftests/kvm/x86/amx_test.c
+++ b/tools/testing/selftests/kvm/x86/amx_test.c
@@ -69,6 +69,12 @@ static inline void __tileloadd(void *tile)
 		     : : "a"(tile), "d"(0));
 }
 
+static inline int tileloadd_safe(void *tile)
+{
+	return kvm_asm_safe(".byte 0xc4,0xe2,0x7b,0x4b,0x04,0x10",
+			    "a"(tile), "d"(0));
+}
+
 static inline void __tilerelease(void)
 {
 	asm volatile(".byte 0xc4, 0xe2, 0x78, 0x49, 0xc0" ::);
@@ -142,6 +148,8 @@ static void __attribute__((__flatten__)) guest_code(struct tile_config *amx_cfg,
 						    struct tile_data *tiledata,
 						    struct xstate *xstate)
 {
+	int vector;
+
 	GUEST_ASSERT(this_cpu_has(X86_FEATURE_XSAVE) &&
 		     this_cpu_has(X86_FEATURE_OSXSAVE));
 	check_xtile_info();
@@ -195,17 +203,13 @@ static void __attribute__((__flatten__)) guest_code(struct tile_config *amx_cfg,
 	GUEST_ASSERT(rdmsr(MSR_IA32_XFD) == XFEATURE_MASK_XTILE_DATA);
 	set_tilecfg(amx_cfg);
 	__ldtilecfg(amx_cfg);
-	/* Trigger #NM exception */
-	__tileloadd(tiledata);
-	GUEST_SYNC(TEST_COMPARE_TILEDATA | TEST_SAVE_RESTORE);
 
-	GUEST_DONE();
-}
+	/* Trigger #NM exception */
+	vector = tileloadd_safe(tiledata);
+	__GUEST_ASSERT(vector == NM_VECTOR,
+		       "Wanted #NM on tileloadd with XFD[18]=1, got %s",
+		       ex_str(vector));
 
-void guest_nm_handler(struct ex_regs *regs)
-{
-	/* Check if #NM is triggered by XFEATURE_MASK_XTILE_DATA */
-	GUEST_SYNC(TEST_SAVE_RESTORE);
 	GUEST_ASSERT(!(get_cr0() & X86_CR0_TS));
 	GUEST_ASSERT(rdmsr(MSR_IA32_XFD_ERR) == XFEATURE_MASK_XTILE_DATA);
 	GUEST_ASSERT(rdmsr(MSR_IA32_XFD) == XFEATURE_MASK_XTILE_DATA);
@@ -217,6 +221,11 @@ void guest_nm_handler(struct ex_regs *regs)
 	/* xfd=0, enable amx */
 	wrmsr(MSR_IA32_XFD, 0);
 	GUEST_SYNC(TEST_SAVE_RESTORE);
+
+	__tileloadd(tiledata);
+	GUEST_SYNC(TEST_COMPARE_TILEDATA | TEST_SAVE_RESTORE);
+
+	GUEST_DONE();
 }
 
 int main(int argc, char *argv[])
@@ -253,9 +262,6 @@ int main(int argc, char *argv[])
 
 	vcpu_regs_get(vcpu, &regs1);
 
-	/* Register #NM handler */
-	vm_install_exception_handler(vm, NM_VECTOR, guest_nm_handler);
-
 	/* amx cfg for guest_code */
 	amx_cfg = vm_vaddr_alloc_page(vm);
 	memset(addr_gva2hva(vm, amx_cfg), 0x0, getpagesize());
-- 
cgit v1.2.3


From c39a6a277e0e67ffff6a8efcbbf7e7e23ce9e38c Mon Sep 17 00:00:00 2001
From: Stefano Garzarella <sgarzare@redhat.com>
Date: Thu, 8 Jan 2026 12:44:19 +0100
Subject: vsock/test: add a final full barrier after run all tests

If the last test fails, the other side still completes correctly,
which could lead to false positives.

Let's add a final barrier that ensures that the last test has finished
correctly on both sides, but also that the two sides agree on the
number of tests to be performed.

Fixes: 2f65b44e199c ("VSOCK: add full barrier between test cases")
Reviewed-by: Luigi Leonardi <leonardi@redhat.com>
Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
Link: https://patch.msgid.link/20260108114419.52747-1-sgarzare@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/vsock/util.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/vsock/util.c b/tools/testing/vsock/util.c
index d843643ced6b..9430ef5b8bc3 100644
--- a/tools/testing/vsock/util.c
+++ b/tools/testing/vsock/util.c
@@ -511,6 +511,18 @@ void run_tests(const struct test_case *test_cases,
 
 		printf("ok\n");
 	}
+
+	printf("All tests have been executed. Waiting other peer...");
+	fflush(stdout);
+
+	/*
+	 * Final full barrier, to ensure that all tests have been run and
+	 * that even the last one has been successful on both sides.
+	 */
+	control_writeln("COMPLETED");
+	control_expectln("COMPLETED");
+
+	printf("ok\n");
 }
 
 void list_tests(const struct test_case *test_cases)
-- 
cgit v1.2.3


From 9086984ff52e703cd7ce47ae19f12d8d31914396 Mon Sep 17 00:00:00 2001
From: Cosmin Ratiu <cratiu@nvidia.com>
Date: Fri, 9 Jan 2026 13:08:51 +0200
Subject: selftests: drv-net: psp: Better control the used PSP dev

The PSP responder fails when zero or multiple PSP devices are detected.
There's an option to select the device id to use (-d) but it's
currently not used from the PSP self test. It's also hard to use because
the PSP test doesn't dump the PSP devices so can't choose one.
When zero devices are detected, psp_responder fails which will cause the
parent test to fail as well instead of skipping PSP tests.

Fix both of these problems. Change psp_responder to:
- not fail when no PSP devs are detected.
- get an optional -i ifindex argument instead of -d.
- select the correct PSP dev from the dump corresponding to ifindex or
- select the first PSP dev when -i is not given.
- fail when multiple devs are found and -i is not given.
- warn and continue when the requested ifindex is not found.

Also plumb the ifindex from the Python test.

With these, when there are no PSP devs found or the wrong one is chosen,
psp_responder opens the server socket, listens for control connections
normally, and leaves the skipping of the various test cases which
require a PSP device (~most, but not all of them) to the parent test.
This results in output like:

ok 1 psp.test_case # SKIP No PSP devices found
[...]
ok 12 psp.dev_get_device # SKIP No PSP devices found
ok 13 psp.dev_get_device_bad
ok 14 psp.dev_rotate # SKIP No PSP devices found
[...]

Signed-off-by: Cosmin Ratiu <cratiu@nvidia.com>
Reviewed-by: Carolina Jubran <cjubran@nvidia.com>
Link: https://patch.msgid.link/20260109110851.2952906-2-cratiu@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/drivers/net/lib/py/env.py  |  1 +
 tools/testing/selftests/drivers/net/psp.py         |  4 +-
 .../testing/selftests/drivers/net/psp_responder.c  | 50 ++++++++++------------
 3 files changed, 26 insertions(+), 29 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/drivers/net/lib/py/env.py b/tools/testing/selftests/drivers/net/lib/py/env.py
index 8b644fd84ff2..63495376e654 100644
--- a/tools/testing/selftests/drivers/net/lib/py/env.py
+++ b/tools/testing/selftests/drivers/net/lib/py/env.py
@@ -170,6 +170,7 @@ class NetDrvEpEnv(NetDrvEnvBase):
         self.remote_ifname = self.resolve_remote_ifc()
         self.remote_dev = ip("-d link show dev " + self.remote_ifname,
                              host=self.remote, json=True)[0]
+        self.remote_ifindex = self.remote_dev['ifindex']
 
         self._required_cmd = {}
 
diff --git a/tools/testing/selftests/drivers/net/psp.py b/tools/testing/selftests/drivers/net/psp.py
index 52523bdad240..528a421ecf76 100755
--- a/tools/testing/selftests/drivers/net/psp.py
+++ b/tools/testing/selftests/drivers/net/psp.py
@@ -601,8 +601,8 @@ def main() -> None:
         cfg.comm_port = rand_port()
         srv = None
         try:
-            with bkg(responder + f" -p {cfg.comm_port}", host=cfg.remote,
-                     exit_wait=True) as srv:
+            with bkg(responder + f" -p {cfg.comm_port} -i {cfg.remote_ifindex}",
+                     host=cfg.remote, exit_wait=True) as srv:
                 wait_port_listen(cfg.comm_port, host=cfg.remote)
 
                 cfg.comm_sock = socket.create_connection((cfg.remote_addr,
diff --git a/tools/testing/selftests/drivers/net/psp_responder.c b/tools/testing/selftests/drivers/net/psp_responder.c
index f309e0d73cbf..a26e7628bbb1 100644
--- a/tools/testing/selftests/drivers/net/psp_responder.c
+++ b/tools/testing/selftests/drivers/net/psp_responder.c
@@ -22,7 +22,7 @@ static bool should_quit;
 
 struct opts {
 	int port;
-	int devid;
+	int ifindex;
 	bool verbose;
 };
 
@@ -360,7 +360,7 @@ static void usage(const char *name, const char *miss)
 	if (miss)
 		fprintf(stderr, "Missing argument: %s\n", miss);
 
-	fprintf(stderr, "Usage: %s -p port [-v] [-d psp-dev-id]\n", name);
+	fprintf(stderr, "Usage: %s -p port [-v] [-i ifindex]\n", name);
 	exit(EXIT_FAILURE);
 }
 
@@ -368,7 +368,7 @@ static void parse_cmd_opts(int argc, char **argv, struct opts *opts)
 {
 	int opt;
 
-	while ((opt = getopt(argc, argv, "vp:d:")) != -1) {
+	while ((opt = getopt(argc, argv, "vp:i:")) != -1) {
 		switch (opt) {
 		case 'v':
 			opts->verbose = 1;
@@ -376,8 +376,8 @@ static void parse_cmd_opts(int argc, char **argv, struct opts *opts)
 		case 'p':
 			opts->port = atoi(optarg);
 			break;
-		case 'd':
-			opts->devid = atoi(optarg);
+		case 'i':
+			opts->ifindex = atoi(optarg);
 			break;
 		default:
 			usage(argv[0], NULL);
@@ -410,12 +410,11 @@ static int psp_dev_set_ena(struct ynl_sock *ys, __u32 dev_id, __u32 versions)
 int main(int argc, char **argv)
 {
 	struct psp_dev_get_list *dev_list;
-	bool devid_found = false;
 	__u32 ver_ena, ver_cap;
 	struct opts opts = {};
 	struct ynl_error yerr;
 	struct ynl_sock *ys;
-	int first_id = 0;
+	int devid = -1;
 	int ret;
 
 	parse_cmd_opts(argc, argv, &opts);
@@ -429,20 +428,19 @@ int main(int argc, char **argv)
 	}
 
 	dev_list = psp_dev_get_dump(ys);
-	if (ynl_dump_empty(dev_list)) {
-		if (ys->err.code)
-			goto err_close;
-		fprintf(stderr, "No PSP devices\n");
-		goto err_close_silent;
-	}
+	if (ynl_dump_empty(dev_list) && ys->err.code)
+		goto err_close;
 
 	ynl_dump_foreach(dev_list, d) {
-		if (opts.devid) {
-			devid_found = true;
+		if (opts.ifindex) {
+			if (d->ifindex != opts.ifindex)
+				continue;
+			devid = d->id;
 			ver_ena = d->psp_versions_ena;
 			ver_cap = d->psp_versions_cap;
-		} else if (!first_id) {
-			first_id = d->id;
+			break;
+		} else if (devid < 0) {
+			devid = d->id;
 			ver_ena = d->psp_versions_ena;
 			ver_cap = d->psp_versions_cap;
 		} else {
@@ -452,23 +450,21 @@ int main(int argc, char **argv)
 	}
 	psp_dev_get_list_free(dev_list);
 
-	if (opts.devid && !devid_found) {
-		fprintf(stderr, "PSP device %d requested on cmdline, not found\n",
-			opts.devid);
-		goto err_close_silent;
-	} else if (!opts.devid) {
-		opts.devid = first_id;
-	}
+	if (opts.ifindex && devid < 0)
+		fprintf(stderr,
+			"WARN: PSP device with ifindex %d requested on cmdline, not found\n",
+			opts.ifindex);
 
-	if (ver_ena != ver_cap) {
-		ret = psp_dev_set_ena(ys, opts.devid, ver_cap);
+	if (devid >= 0 && ver_ena != ver_cap) {
+		ret = psp_dev_set_ena(ys, devid, ver_cap);
 		if (ret)
 			goto err_close;
 	}
 
 	ret = run_responder(ys, &opts);
 
-	if (ver_ena != ver_cap && psp_dev_set_ena(ys, opts.devid, ver_ena))
+	if (devid >= 0 && ver_ena != ver_cap &&
+	    psp_dev_set_ena(ys, devid, ver_ena))
 		fprintf(stderr, "WARN: failed to set the PSP versions back\n");
 
 	ynl_sock_destroy(ys);
-- 
cgit v1.2.3


From de7c600e2d5b501c0c04bde8ebab89ac5888a69f Mon Sep 17 00:00:00 2001
From: David Wei <dw@davidwei.uk>
Date: Thu, 8 Jan 2026 15:45:21 -0800
Subject: selftests/net: parametrise iou-zcrx.py with ksft_variants

Use ksft_variants to parametrise tests in iou-zcrx.py to either use
single queues or RSS contexts, reducing duplication.

Signed-off-by: David Wei <dw@davidwei.uk>
Link: https://patch.msgid.link/20260108234521.3619621-1-dw@davidwei.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/drivers/net/hw/iou-zcrx.py | 162 ++++++++++-----------
 1 file changed, 73 insertions(+), 89 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/drivers/net/hw/iou-zcrx.py b/tools/testing/selftests/drivers/net/hw/iou-zcrx.py
index 712c806508b5..2c5acfb4f5dc 100755
--- a/tools/testing/selftests/drivers/net/hw/iou-zcrx.py
+++ b/tools/testing/selftests/drivers/net/hw/iou-zcrx.py
@@ -3,132 +3,114 @@
 
 import re
 from os import path
-from lib.py import ksft_run, ksft_exit, KsftSkipEx
+from lib.py import ksft_run, ksft_exit, KsftSkipEx, ksft_variants, KsftNamedVariant
 from lib.py import NetDrvEpEnv
 from lib.py import bkg, cmd, defer, ethtool, rand_port, wait_port_listen
+from lib.py import EthtoolFamily
 
 
-def _get_current_settings(cfg):
-    output = ethtool(f"-g {cfg.ifname}", json=True)[0]
-    return (output['rx'], output['hds-thresh'])
-
-
-def _get_combined_channels(cfg):
-    output = ethtool(f"-l {cfg.ifname}").stdout
-    values = re.findall(r'Combined:\s+(\d+)', output)
-    return int(values[1])
-
-
-def _create_rss_ctx(cfg, chan):
-    output = ethtool(f"-X {cfg.ifname} context new start {chan} equal 1").stdout
+def create_rss_ctx(cfg):
+    output = ethtool(f"-X {cfg.ifname} context new start {cfg.target} equal 1").stdout
     values = re.search(r'New RSS context is (\d+)', output).group(1)
-    ctx_id = int(values)
-    return (ctx_id, defer(ethtool, f"-X {cfg.ifname} delete context {ctx_id}"))
+    return int(values)
 
 
-def _set_flow_rule(cfg, port, chan):
-    output = ethtool(f"-N {cfg.ifname} flow-type tcp6 dst-port {port} action {chan}").stdout
+def set_flow_rule(cfg):
+    output = ethtool(f"-N {cfg.ifname} flow-type tcp6 dst-port {cfg.port} action {cfg.target}").stdout
     values = re.search(r'ID (\d+)', output).group(1)
     return int(values)
 
 
-def _set_flow_rule_rss(cfg, port, ctx_id):
-    output = ethtool(f"-N {cfg.ifname} flow-type tcp6 dst-port {port} context {ctx_id}").stdout
+def set_flow_rule_rss(cfg, rss_ctx_id):
+    output = ethtool(f"-N {cfg.ifname} flow-type tcp6 dst-port {cfg.port} context {rss_ctx_id}").stdout
     values = re.search(r'ID (\d+)', output).group(1)
     return int(values)
 
 
-def test_zcrx(cfg) -> None:
-    cfg.require_ipver('6')
-
-    combined_chans = _get_combined_channels(cfg)
-    if combined_chans < 2:
-        raise KsftSkipEx('at least 2 combined channels required')
-    (rx_ring, hds_thresh) = _get_current_settings(cfg)
-    port = rand_port()
-
-    ethtool(f"-G {cfg.ifname} tcp-data-split on")
-    defer(ethtool, f"-G {cfg.ifname} tcp-data-split auto")
+def single(cfg):
+    channels = cfg.ethnl.channels_get({'header': {'dev-index': cfg.ifindex}})
+    channels = channels['combined-count']
+    if channels < 2:
+        raise KsftSkipEx('Test requires NETIF with at least 2 combined channels')
 
-    ethtool(f"-G {cfg.ifname} hds-thresh 0")
-    defer(ethtool, f"-G {cfg.ifname} hds-thresh {hds_thresh}")
+    rings = cfg.ethnl.rings_get({'header': {'dev-index': cfg.ifindex}})
+    rx_rings = rings['rx']
+    hds_thresh = rings.get('hds-thresh', 0)
 
-    ethtool(f"-G {cfg.ifname} rx 64")
-    defer(ethtool, f"-G {cfg.ifname} rx {rx_ring}")
+    cfg.ethnl.rings_set({'header': {'dev-index': cfg.ifindex},
+                         'tcp-data-split': 'enabled',
+                         'hds-thresh': 0,
+                         'rx': 64})
+    defer(cfg.ethnl.rings_set, {'header': {'dev-index': cfg.ifindex},
+                                'tcp-data-split': 'unknown',
+                                'hds-thresh': hds_thresh,
+                                'rx': rx_rings})
 
-    ethtool(f"-X {cfg.ifname} equal {combined_chans - 1}")
+    cfg.target = channels - 1
+    ethtool(f"-X {cfg.ifname} equal {cfg.target}")
     defer(ethtool, f"-X {cfg.ifname} default")
 
-    flow_rule_id = _set_flow_rule(cfg, port, combined_chans - 1)
+    flow_rule_id = set_flow_rule(cfg)
     defer(ethtool, f"-N {cfg.ifname} delete {flow_rule_id}")
 
-    rx_cmd = f"{cfg.bin_local} -s -p {port} -i {cfg.ifname} -q {combined_chans - 1}"
-    tx_cmd = f"{cfg.bin_remote} -c -h {cfg.addr_v['6']} -p {port} -l 12840"
-    with bkg(rx_cmd, exit_wait=True):
-        wait_port_listen(port, proto="tcp")
-        cmd(tx_cmd, host=cfg.remote)
 
+def rss(cfg):
+    channels = cfg.ethnl.channels_get({'header': {'dev-index': cfg.ifindex}})
+    channels = channels['combined-count']
+    if channels < 2:
+        raise KsftSkipEx('Test requires NETIF with at least 2 combined channels')
 
-def test_zcrx_oneshot(cfg) -> None:
-    cfg.require_ipver('6')
+    rings = cfg.ethnl.rings_get({'header': {'dev-index': cfg.ifindex}})
+    rx_rings = rings['rx']
+    hds_thresh = rings.get('hds-thresh', 0)
 
-    combined_chans = _get_combined_channels(cfg)
-    if combined_chans < 2:
-        raise KsftSkipEx('at least 2 combined channels required')
-    (rx_ring, hds_thresh) = _get_current_settings(cfg)
-    port = rand_port()
+    cfg.ethnl.rings_set({'header': {'dev-index': cfg.ifindex},
+                         'tcp-data-split': 'enabled',
+                         'hds-thresh': 0,
+                         'rx': 64})
+    defer(cfg.ethnl.rings_set, {'header': {'dev-index': cfg.ifindex},
+                                'tcp-data-split': 'unknown',
+                                'hds-thresh': hds_thresh,
+                                'rx': rx_rings})
 
-    ethtool(f"-G {cfg.ifname} tcp-data-split on")
-    defer(ethtool, f"-G {cfg.ifname} tcp-data-split auto")
+    cfg.target = channels - 1
+    ethtool(f"-X {cfg.ifname} equal {cfg.target}")
+    defer(ethtool, f"-X {cfg.ifname} default")
 
-    ethtool(f"-G {cfg.ifname} hds-thresh 0")
-    defer(ethtool, f"-G {cfg.ifname} hds-thresh {hds_thresh}")
+    rss_ctx_id = create_rss_ctx(cfg)
+    defer(ethtool, f"-X {cfg.ifname} delete context {rss_ctx_id}")
 
-    ethtool(f"-G {cfg.ifname} rx 64")
-    defer(ethtool, f"-G {cfg.ifname} rx {rx_ring}")
+    flow_rule_id = set_flow_rule_rss(cfg, rss_ctx_id)
+    defer(ethtool, f"-N {cfg.ifname} delete {flow_rule_id}")
 
-    ethtool(f"-X {cfg.ifname} equal {combined_chans - 1}")
-    defer(ethtool, f"-X {cfg.ifname} default")
 
-    flow_rule_id = _set_flow_rule(cfg, port, combined_chans - 1)
-    defer(ethtool, f"-N {cfg.ifname} delete {flow_rule_id}")
+@ksft_variants([
+    KsftNamedVariant("single", single),
+    KsftNamedVariant("rss", rss),
+])
+def test_zcrx(cfg, setup) -> None:
+    cfg.require_ipver('6')
 
-    rx_cmd = f"{cfg.bin_local} -s -p {port} -i {cfg.ifname} -q {combined_chans - 1} -o 4"
-    tx_cmd = f"{cfg.bin_remote} -c -h {cfg.addr_v['6']} -p {port} -l 4096 -z 16384"
+    setup(cfg)
+    rx_cmd = f"{cfg.bin_local} -s -p {cfg.port} -i {cfg.ifname} -q {cfg.target}"
+    tx_cmd = f"{cfg.bin_remote} -c -h {cfg.addr_v['6']} -p {cfg.port} -l 12840"
     with bkg(rx_cmd, exit_wait=True):
-        wait_port_listen(port, proto="tcp")
+        wait_port_listen(cfg.port, proto="tcp")
         cmd(tx_cmd, host=cfg.remote)
 
 
-def test_zcrx_rss(cfg) -> None:
+@ksft_variants([
+    KsftNamedVariant("single", single),
+    KsftNamedVariant("rss", rss),
+])
+def test_zcrx_oneshot(cfg, setup) -> None:
     cfg.require_ipver('6')
 
-    combined_chans = _get_combined_channels(cfg)
-    if combined_chans < 2:
-        raise KsftSkipEx('at least 2 combined channels required')
-    (rx_ring, hds_thresh) = _get_current_settings(cfg)
-    port = rand_port()
-
-    ethtool(f"-G {cfg.ifname} tcp-data-split on")
-    defer(ethtool, f"-G {cfg.ifname} tcp-data-split auto")
-
-    ethtool(f"-G {cfg.ifname} hds-thresh 0")
-    defer(ethtool, f"-G {cfg.ifname} hds-thresh {hds_thresh}")
-
-    ethtool(f"-G {cfg.ifname} rx 64")
-    defer(ethtool, f"-G {cfg.ifname} rx {rx_ring}")
-
-    ethtool(f"-X {cfg.ifname} equal {combined_chans - 1}")
-    defer(ethtool, f"-X {cfg.ifname} default")
-
-    (ctx_id, delete_ctx) = _create_rss_ctx(cfg, combined_chans - 1)
-    flow_rule_id = _set_flow_rule_rss(cfg, port, ctx_id)
-    defer(ethtool, f"-N {cfg.ifname} delete {flow_rule_id}")
-
-    rx_cmd = f"{cfg.bin_local} -s -p {port} -i {cfg.ifname} -q {combined_chans - 1}"
-    tx_cmd = f"{cfg.bin_remote} -c -h {cfg.addr_v['6']} -p {port} -l 12840"
+    setup(cfg)
+    rx_cmd = f"{cfg.bin_local} -s -p {cfg.port} -i {cfg.ifname} -q {cfg.target} -o 4"
+    tx_cmd = f"{cfg.bin_remote} -c -h {cfg.addr_v['6']} -p {cfg.port} -l 4096 -z 16384"
     with bkg(rx_cmd, exit_wait=True):
-        wait_port_listen(port, proto="tcp")
+        wait_port_listen(cfg.port, proto="tcp")
         cmd(tx_cmd, host=cfg.remote)
 
 
@@ -137,7 +119,9 @@ def main() -> None:
         cfg.bin_local = path.abspath(path.dirname(__file__) + "/../../../drivers/net/hw/iou-zcrx")
         cfg.bin_remote = cfg.remote.deploy(cfg.bin_local)
 
-        ksft_run(globs=globals(), case_pfx={"test_"}, args=(cfg, ))
+        cfg.ethnl = EthtoolFamily()
+        cfg.port = rand_port()
+        ksft_run(globs=globals(), cases=[test_zcrx, test_zcrx_oneshot], args=(cfg, ))
     ksft_exit()
 
 
-- 
cgit v1.2.3


From 799a4912eea74c667da1c8167f93bf2d1508a89e Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Thu, 8 Jan 2026 14:52:56 -0800
Subject: selftests: net: py: capitalize defer queue and improve import

Import utils and refer to the global defer queue that way instead
of importing the queue. This will make it possible to assign value
to the global variable. While at it capitalize the name, to comply
with the Python coding style.

Reviewed-by: Petr Machata <petrm@nvidia.com>
Link: https://patch.msgid.link/20260108225257.2684238-1-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/lib/py/ksft.py  | 8 ++++----
 tools/testing/selftests/net/lib/py/utils.py | 4 ++--
 2 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/lib/py/ksft.py b/tools/testing/selftests/net/lib/py/ksft.py
index 531e7fa1b3ea..248cd1a723a3 100644
--- a/tools/testing/selftests/net/lib/py/ksft.py
+++ b/tools/testing/selftests/net/lib/py/ksft.py
@@ -8,7 +8,7 @@ import time
 import traceback
 from collections import namedtuple
 from .consts import KSFT_MAIN_NAME
-from .utils import global_defer_queue
+from . import utils
 
 KSFT_RESULT = None
 KSFT_RESULT_ALL = True
@@ -157,10 +157,10 @@ def ksft_flush_defer():
     global KSFT_RESULT
 
     i = 0
-    qlen_start = len(global_defer_queue)
-    while global_defer_queue:
+    qlen_start = len(utils.GLOBAL_DEFER_QUEUE)
+    while utils.GLOBAL_DEFER_QUEUE:
         i += 1
-        entry = global_defer_queue.pop()
+        entry = utils.GLOBAL_DEFER_QUEUE.pop()
         try:
             entry.exec_only()
         except Exception:
diff --git a/tools/testing/selftests/net/lib/py/utils.py b/tools/testing/selftests/net/lib/py/utils.py
index 106ee1f2df86..2dde34560d65 100644
--- a/tools/testing/selftests/net/lib/py/utils.py
+++ b/tools/testing/selftests/net/lib/py/utils.py
@@ -141,7 +141,7 @@ class bkg(cmd):
         return self.process(terminate=terminate, fail=self.check_fail)
 
 
-global_defer_queue = []
+GLOBAL_DEFER_QUEUE = []
 
 
 class defer:
@@ -153,7 +153,7 @@ class defer:
         self.args = args
         self.kwargs = kwargs
 
-        self._queue =  global_defer_queue
+        self._queue = GLOBAL_DEFER_QUEUE
         self._queue.append(self)
 
     def __enter__(self):
-- 
cgit v1.2.3


From 7a1ff3545adeec5dc65c3063c2f084500d6f7014 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Thu, 8 Jan 2026 14:52:57 -0800
Subject: selftests: net: py: ensure defer() is only used within a test case

I wasted a couple of hours recently after accidentally adding
a defer() from within a function which itself was called as
part of defer(). This leads to an infinite loop of defer().
Make sure this cannot happen and raise a helpful exception.

I understand that the pair of _ksft_defer_arm() calls may
not be the most Pythonic way to implement this, but it's
easy enough to understand.

Reviewed-by: Petr Machata <petrm@nvidia.com>
Link: https://patch.msgid.link/20260108225257.2684238-2-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/lib/py/ksft.py  | 7 +++++++
 tools/testing/selftests/net/lib/py/utils.py | 3 +++
 2 files changed, 10 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/lib/py/ksft.py b/tools/testing/selftests/net/lib/py/ksft.py
index 248cd1a723a3..0a96f88bb60a 100644
--- a/tools/testing/selftests/net/lib/py/ksft.py
+++ b/tools/testing/selftests/net/lib/py/ksft.py
@@ -153,6 +153,11 @@ def ktap_result(ok, cnt=1, case_name="", comment=""):
     print(res, flush=True)
 
 
+def _ksft_defer_arm(state):
+    """ Allow or disallow the use of defer() """
+    utils.GLOBAL_DEFER_ARMED = state
+
+
 def ksft_flush_defer():
     global KSFT_RESULT
 
@@ -315,6 +320,7 @@ def ksft_run(cases=None, globs=None, case_pfx=None, args=()):
         comment = ""
         cnt_key = ""
 
+        _ksft_defer_arm(True)
         try:
             func(*args)
         except KsftSkipEx as e:
@@ -332,6 +338,7 @@ def ksft_run(cases=None, globs=None, case_pfx=None, args=()):
                 ksft_pr(f"Stopping tests due to {type(e).__name__}.")
             KSFT_RESULT = False
             cnt_key = 'fail'
+        _ksft_defer_arm(False)
 
         try:
             ksft_flush_defer()
diff --git a/tools/testing/selftests/net/lib/py/utils.py b/tools/testing/selftests/net/lib/py/utils.py
index 2dde34560d65..824f039d384c 100644
--- a/tools/testing/selftests/net/lib/py/utils.py
+++ b/tools/testing/selftests/net/lib/py/utils.py
@@ -142,6 +142,7 @@ class bkg(cmd):
 
 
 GLOBAL_DEFER_QUEUE = []
+GLOBAL_DEFER_ARMED = False
 
 
 class defer:
@@ -153,6 +154,8 @@ class defer:
         self.args = args
         self.kwargs = kwargs
 
+        if not GLOBAL_DEFER_ARMED:
+            raise Exception("defer queue not armed, did you use defer() outside of a test case?")
         self._queue = GLOBAL_DEFER_QUEUE
         self._queue.append(self)
 
-- 
cgit v1.2.3


From 4203c6fb5e9d2e4fb9a48b421d92efd4429a4d55 Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <linux@weissschuh.net>
Date: Tue, 6 Jan 2026 12:44:57 +0100
Subject: selftests/nolibc: try to read from stdin in readv_zero test
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When stdout is redirected to a file this test fails.
This happens when running through the kselftest runner since
commit d9e6269e3303 ("selftests/run_kselftest.sh: exit with
error if tests fail").

For consistency with other tests that read from a file descriptor,
switch to stdin over stdout. The tests are still brittle against
a redirected stdin, but at least they are now consistently so.

Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
Acked-by: Willy Tarreau <w@1wt.eu>
Link: https://patch.msgid.link/20260106-nolibc-selftests-v1-1-f82101c2c505@weissschuh.net
---
 tools/testing/selftests/nolibc/nolibc-test.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/nolibc/nolibc-test.c b/tools/testing/selftests/nolibc/nolibc-test.c
index 3986d55a6ff6..e83c1e7e2beb 100644
--- a/tools/testing/selftests/nolibc/nolibc-test.c
+++ b/tools/testing/selftests/nolibc/nolibc-test.c
@@ -1404,7 +1404,7 @@ int run_syscall(int min, int max)
 		CASE_TEST(write_badf);        EXPECT_SYSER(1, write(-1, &tmp, 1), -1, EBADF); break;
 		CASE_TEST(write_zero);        EXPECT_SYSZR(1, write(1, &tmp, 0)); break;
 		CASE_TEST(readv_badf);        EXPECT_SYSER(1, readv(-1, &iov_one, 1), -1, EBADF); break;
-		CASE_TEST(readv_zero);        EXPECT_SYSZR(1, readv(1, NULL, 0)); break;
+		CASE_TEST(readv_zero);        EXPECT_SYSZR(1, readv(0, NULL, 0)); break;
 		CASE_TEST(writev_badf);       EXPECT_SYSER(1, writev(-1, &iov_one, 1), -1, EBADF); break;
 		CASE_TEST(writev_zero);       EXPECT_SYSZR(1, writev(1, NULL, 0)); break;
 		CASE_TEST(ptrace);            EXPECT_SYSER(1, ptrace(PTRACE_CONT, getpid(), NULL, NULL), -1, ESRCH); break;
-- 
cgit v1.2.3


From 20c72de1f8a9e338531579bd784371aba4b7dd2c Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <linux@weissschuh.net>
Date: Tue, 6 Jan 2026 12:44:58 +0100
Subject: selftests/nolibc: scope custom flags to the nolibc-test target
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A new target for 'libc-test' is going to be added which should not be
affected by these options.

Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
Acked-by: Willy Tarreau <w@1wt.eu>
Link: https://patch.msgid.link/20260106-nolibc-selftests-v1-2-f82101c2c505@weissschuh.net
---
 tools/testing/selftests/nolibc/Makefile | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/nolibc/Makefile b/tools/testing/selftests/nolibc/Makefile
index 40f5c2908dda..43f0b608c796 100644
--- a/tools/testing/selftests/nolibc/Makefile
+++ b/tools/testing/selftests/nolibc/Makefile
@@ -9,14 +9,10 @@ cc-option = $(call __cc-option, $(CC),,$(1),$(2))
 
 include Makefile.include
 
-CFLAGS = -nostdlib -nostdinc -static \
+$(OUTPUT)/nolibc-test: CFLAGS = -nostdlib -nostdinc -static \
 	 -isystem $(top_srcdir)/tools/include/nolibc -isystem $(top_srcdir)/usr/include \
 	 $(CFLAGS_NOLIBC_TEST)
-
-ifeq ($(LLVM),)
-LDLIBS := -lgcc
-endif
-
+$(OUTPUT)/nolibc-test: LDLIBS = $(if $(LLVM),,-lgcc)
 $(OUTPUT)/nolibc-test: nolibc-test.c nolibc-test-linkage.c | headers
 
 help:
-- 
cgit v1.2.3


From 6fe8360b16acbfb50c703f52568cad46759be2ed Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <linux@weissschuh.net>
Date: Tue, 6 Jan 2026 12:44:59 +0100
Subject: selftests/nolibc: also test libc-test through regular selftest
 framework
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Hook up libc-test to the regular selftest build to make sure
nolibc-test.c stays compatible with a normal libc.

As the pattern rule from lib.mk does not handle compiling a target from
a differently named source file, add an explicit rule definition.

Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
Acked-by: Willy Tarreau <w@1wt.eu>
Link: https://patch.msgid.link/20260106-nolibc-selftests-v1-3-f82101c2c505@weissschuh.net
---
 tools/testing/selftests/nolibc/Makefile | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/nolibc/Makefile b/tools/testing/selftests/nolibc/Makefile
index 43f0b608c796..0370489d938b 100644
--- a/tools/testing/selftests/nolibc/Makefile
+++ b/tools/testing/selftests/nolibc/Makefile
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0
 
-TEST_GEN_PROGS := nolibc-test
+TEST_GEN_PROGS := nolibc-test libc-test
 
 include ../lib.mk
 include $(top_srcdir)/scripts/Makefile.compiler
@@ -15,6 +15,10 @@ $(OUTPUT)/nolibc-test: CFLAGS = -nostdlib -nostdinc -static \
 $(OUTPUT)/nolibc-test: LDLIBS = $(if $(LLVM),,-lgcc)
 $(OUTPUT)/nolibc-test: nolibc-test.c nolibc-test-linkage.c | headers
 
+$(OUTPUT)/libc-test: nolibc-test.c nolibc-test-linkage.c
+	$(call msg,CC,,$@)
+	$(Q)$(LINK.c) $^ -o $@
+
 help:
 	@echo "For the custom nolibc testsuite use '$(MAKE) -f Makefile.nolibc'; available targets:"
 	@$(MAKE) -f Makefile.nolibc help
-- 
cgit v1.2.3


From edaf30743185f6ed8e29dcb2f1d01e183c0b807b Mon Sep 17 00:00:00 2001
From: Daniel Palmer <daniel@thingy.jp>
Date: Mon, 5 Jan 2026 11:36:27 +0900
Subject: tools/nolibc: Add fread() to stdio.h
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a very basic version of fread() like we already have for fwrite().

Signed-off-by: Daniel Palmer <daniel@thingy.jp>
Link: https://patch.msgid.link/20260105023629.1502801-2-daniel@thingy.jp
Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
---
 tools/include/nolibc/stdio.h | 34 +++++++++++++++++++++++++++++++++-
 1 file changed, 33 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/include/nolibc/stdio.h b/tools/include/nolibc/stdio.h
index 1f16dab2ac88..6904252df97d 100644
--- a/tools/include/nolibc/stdio.h
+++ b/tools/include/nolibc/stdio.h
@@ -170,7 +170,7 @@ int putchar(int c)
 }
 
 
-/* fwrite(), puts(), fputs(). Note that puts() emits '\n' but not fputs(). */
+/* fwrite(), fread(), puts(), fputs(). Note that puts() emits '\n' but not fputs(). */
 
 /* internal fwrite()-like function which only takes a size and returns 0 on
  * success or EOF on error. It automatically retries on short writes.
@@ -204,6 +204,38 @@ size_t fwrite(const void *s, size_t size, size_t nmemb, FILE *stream)
 	return written;
 }
 
+/* internal fread()-like function which only takes a size and returns 0 on
+ * success or EOF on error. It automatically retries on short reads.
+ */
+static __attribute__((unused))
+int _fread(void *buf, size_t size, FILE *stream)
+{
+	int fd = fileno(stream);
+	ssize_t ret;
+
+	while (size) {
+		ret = read(fd, buf, size);
+		if (ret <= 0)
+			return EOF;
+		size -= ret;
+		buf += ret;
+	}
+	return 0;
+}
+
+static __attribute__((unused))
+size_t fread(void *s, size_t size, size_t nmemb, FILE *stream)
+{
+	size_t nread;
+
+	for (nread = 0; nread < nmemb; nread++) {
+		if (_fread(s, size, stream) != 0)
+			break;
+		s += size;
+	}
+	return nread;
+}
+
 static __attribute__((unused))
 int fputs(const char *s, FILE *stream)
 {
-- 
cgit v1.2.3


From 109770cc81680b802ee983b09b61c3979240fd09 Mon Sep 17 00:00:00 2001
From: Daniel Palmer <daniel@thingy.jp>
Date: Mon, 5 Jan 2026 11:36:28 +0900
Subject: tools/nolibc: Add fseek() to stdio.h
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A very basic wrapper around lseek() that implements fseek().

Signed-off-by: Daniel Palmer <daniel@thingy.jp>
Link: https://patch.msgid.link/20260105023629.1502801-3-daniel@thingy.jp
Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
---
 tools/include/nolibc/stdio.h | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

(limited to 'tools')

diff --git a/tools/include/nolibc/stdio.h b/tools/include/nolibc/stdio.h
index 6904252df97d..233318b0d0f0 100644
--- a/tools/include/nolibc/stdio.h
+++ b/tools/include/nolibc/stdio.h
@@ -272,6 +272,25 @@ char *fgets(char *s, int size, FILE *stream)
 }
 
 
+/* fseek */
+static __attribute__((unused))
+int fseek(FILE *stream, long offset, int whence)
+{
+	int fd = fileno(stream);
+	off_t ret;
+
+	ret = lseek(fd, offset, whence);
+
+	/* lseek() and fseek() differ in that lseek returns the new
+	 * position or -1, fseek() returns either 0 or -1.
+	 */
+	if (ret >= 0)
+		return 0;
+
+	return -1;
+}
+
+
 /* minimal printf(). It supports the following formats:
  *  - %[l*]{d,u,c,x,p}
  *  - %s
-- 
cgit v1.2.3


From a5f00be9b3b07d92c6689997403851a32e1874cc Mon Sep 17 00:00:00 2001
From: Daniel Palmer <daniel@thingy.jp>
Date: Mon, 5 Jan 2026 11:36:29 +0900
Subject: tools/nolibc: Add a simple test for writing to a FILE and reading it
 back
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a test that exercises create->write->seek->read to check that using the
stream functions (fwrite() etc) is not totally broken.

The only edge cases this is testing for are:
- Reading the file after writing but without rewinding reads nothing.
- Trying to read more items than the file contains returns the count of
  fully read items.

Signed-off-by: Daniel Palmer <daniel@thingy.jp>
Link: https://patch.msgid.link/20260105023629.1502801-4-daniel@thingy.jp
Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
---
 tools/testing/selftests/nolibc/nolibc-test.c | 53 ++++++++++++++++++++++++++++
 1 file changed, 53 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/nolibc/nolibc-test.c b/tools/testing/selftests/nolibc/nolibc-test.c
index e83c1e7e2beb..1b9d3b2e2491 100644
--- a/tools/testing/selftests/nolibc/nolibc-test.c
+++ b/tools/testing/selftests/nolibc/nolibc-test.c
@@ -878,6 +878,58 @@ int test_file_stream(void)
 	return 0;
 }
 
+int test_file_stream_wsr(void)
+{
+	const char dataout[] = "foo";
+	const size_t datasz = sizeof(dataout);
+	char datain[datasz];
+	int fd, r;
+	FILE *f;
+
+	fd = open("/tmp", O_TMPFILE | O_RDWR, 0644);
+	if (fd == -1)
+		return -1;
+
+	f = fdopen(fd, "w+");
+	if (!f)
+		return -1;
+
+	errno = 0;
+	r = fwrite(dataout, 1, datasz, f);
+	if (r != datasz)
+		return -1;
+
+	/* Attempt to read from the file without rewinding,
+	 * we should read 0 items.
+	 */
+	r = fread(datain, 1, datasz, f);
+	if (r)
+		return -1;
+
+	/* Rewind the file to the start */
+	r = fseek(f, 0, SEEK_SET);
+	if (r)
+		return -1;
+
+	/* Attempt to read back more than was written to
+	 * make sure we handle short reads properly.
+	 * fread() should return the number of complete items.
+	 */
+	r = fread(datain, 1, datasz + 1, f);
+	if (r != datasz)
+		return -1;
+
+	/* Data we read should match the data we just wrote */
+	if (memcmp(datain, dataout, datasz) != 0)
+		return -1;
+
+	r = fclose(f);
+	if (r)
+		return -1;
+
+	return 0;
+}
+
 enum fork_type {
 	FORK_STANDARD,
 	FORK_VFORK,
@@ -1352,6 +1404,7 @@ int run_syscall(int min, int max)
 		CASE_TEST(fchdir_stdin);      EXPECT_SYSER(1, fchdir(STDIN_FILENO), -1, ENOTDIR); break;
 		CASE_TEST(fchdir_badfd);      EXPECT_SYSER(1, fchdir(-1), -1, EBADF); break;
 		CASE_TEST(file_stream);       EXPECT_SYSZR(1, test_file_stream()); break;
+		CASE_TEST(file_stream_wsr);   EXPECT_SYSZR(1, test_file_stream_wsr()); break;
 		CASE_TEST(fork);              EXPECT_SYSZR(1, test_fork(FORK_STANDARD)); break;
 		CASE_TEST(getdents64_root);   EXPECT_SYSNE(1, test_getdents64("/"), -1); break;
 		CASE_TEST(getdents64_null);   EXPECT_SYSER(1, test_getdents64("/dev/null"), -1, ENOTDIR); break;
-- 
cgit v1.2.3


From 531b50e06aa7600f854a90b0f714f4e49ea2c1ac Mon Sep 17 00:00:00 2001
From: Gabriele Monaco <gmonaco@redhat.com>
Date: Wed, 26 Nov 2025 11:42:35 +0100
Subject: verification/rvgen: Adapt dot2k and templates after refactoring
 da_monitor.h

Previous changes refactored the da_monitor header file to avoid using
macros. This implies a few changes in how to import and use da_monitor
helpers:

 DECLARE_DA_MON_<TYPE>(name, type) is substituted by
 #define RV_MON_TYPE RV_MON_<TYPE>

Update the rvgen templates to reflect the changes.

Reviewed-by: Nam Cao <namcao@linutronix.de>
Link: https://lore.kernel.org/r/20251126104241.291258-5-gmonaco@redhat.com
Signed-off-by: Gabriele Monaco <gmonaco@redhat.com>
---
 tools/verification/rvgen/rvgen/dot2k.py            |  6 ++++--
 .../rvgen/rvgen/templates/dot2k/main.c             | 25 ++++++++--------------
 2 files changed, 13 insertions(+), 18 deletions(-)

(limited to 'tools')

diff --git a/tools/verification/rvgen/rvgen/dot2k.py b/tools/verification/rvgen/rvgen/dot2k.py
index ed0a3c901106..d618a842fc52 100644
--- a/tools/verification/rvgen/rvgen/dot2k.py
+++ b/tools/verification/rvgen/rvgen/dot2k.py
@@ -38,9 +38,9 @@ class dot2k(Monitor, Dot2c):
                 handle = "handle_start_run_event"
             if self.monitor_type == "per_task":
                 buff.append("\tstruct task_struct *p = /* XXX: how do I get p? */;");
-                buff.append("\tda_%s_%s(p, %s%s);" % (handle, self.name, event, self.enum_suffix));
+                buff.append("\tda_%s(p, %s%s);" % (handle, event, self.enum_suffix));
             else:
-                buff.append("\tda_%s_%s(%s%s);" % (handle, self.name, event, self.enum_suffix));
+                buff.append("\tda_%s(%s%s);" % (handle, event, self.enum_suffix));
             buff.append("}")
             buff.append("")
         return '\n'.join(buff)
@@ -66,6 +66,8 @@ class dot2k(Monitor, Dot2c):
         buff.append(" *   Documentation/trace/rv/deterministic_automata.rst")
         buff.append(" */")
         buff.append("")
+        buff.append("#define MONITOR_NAME %s" % (self.name))
+        buff.append("")
 
         return buff
 
diff --git a/tools/verification/rvgen/rvgen/templates/dot2k/main.c b/tools/verification/rvgen/rvgen/templates/dot2k/main.c
index e0fd1134bd85..a14e4f0883db 100644
--- a/tools/verification/rvgen/rvgen/templates/dot2k/main.c
+++ b/tools/verification/rvgen/rvgen/templates/dot2k/main.c
@@ -6,7 +6,6 @@
 #include <linux/init.h>
 #include <linux/rv.h>
 #include <rv/instrumentation.h>
-#include <rv/da_monitor.h>
 
 #define MODULE_NAME "%%MODEL_NAME%%"
 
@@ -20,15 +19,9 @@
  * This is the self-generated part of the monitor. Generally, there is no need
  * to touch this section.
  */
+#define RV_MON_TYPE RV_MON_%%MONITOR_TYPE%%
 #include "%%MODEL_NAME%%.h"
-
-/*
- * Declare the deterministic automata monitor.
- *
- * The rv monitor reference is needed for the monitor declaration.
- */
-static struct rv_monitor rv_%%MODEL_NAME%%;
-DECLARE_DA_MON_%%MONITOR_TYPE%%(%%MODEL_NAME%%, %%MIN_TYPE%%);
+#include <rv/da_monitor.h>
 
 /*
  * This is the instrumentation part of the monitor.
@@ -42,7 +35,7 @@ static int enable_%%MODEL_NAME%%(void)
 {
 	int retval;
 
-	retval = da_monitor_init_%%MODEL_NAME%%();
+	retval = da_monitor_init();
 	if (retval)
 		return retval;
 
@@ -53,33 +46,33 @@ static int enable_%%MODEL_NAME%%(void)
 
 static void disable_%%MODEL_NAME%%(void)
 {
-	rv_%%MODEL_NAME%%.enabled = 0;
+	rv_this.enabled = 0;
 
 %%TRACEPOINT_DETACH%%
 
-	da_monitor_destroy_%%MODEL_NAME%%();
+	da_monitor_destroy();
 }
 
 /*
  * This is the monitor register section.
  */
-static struct rv_monitor rv_%%MODEL_NAME%% = {
+static struct rv_monitor rv_this = {
 	.name = "%%MODEL_NAME%%",
 	.description = "%%DESCRIPTION%%",
 	.enable = enable_%%MODEL_NAME%%,
 	.disable = disable_%%MODEL_NAME%%,
-	.reset = da_monitor_reset_all_%%MODEL_NAME%%,
+	.reset = da_monitor_reset_all,
 	.enabled = 0,
 };
 
 static int __init register_%%MODEL_NAME%%(void)
 {
-	return rv_register_monitor(&rv_%%MODEL_NAME%%, %%PARENT%%);
+	return rv_register_monitor(&rv_this, %%PARENT%%);
 }
 
 static void __exit unregister_%%MODEL_NAME%%(void)
 {
-	rv_unregister_monitor(&rv_%%MODEL_NAME%%);
+	rv_unregister_monitor(&rv_this);
 }
 
 module_init(register_%%MODEL_NAME%%);
-- 
cgit v1.2.3


From 3c5720b9ba3ee9b3ae238aeaf0340e4c9666330e Mon Sep 17 00:00:00 2001
From: Gabriele Monaco <gmonaco@redhat.com>
Date: Wed, 26 Nov 2025 11:42:36 +0100
Subject: verification/rvgen: Annotate DA functions with types

Functions in automata.py, dot2c.py and dot2k.py don't have type
annotations and it can get complicated to remember how to use them.

Add minimal type annotations.

Reviewed-by: Nam Cao <namcao@linutronix.de>
Link: https://lore.kernel.org/r/20251126104241.291258-6-gmonaco@redhat.com
Signed-off-by: Gabriele Monaco <gmonaco@redhat.com>
---
 tools/verification/rvgen/rvgen/automata.py | 20 +++++++-------
 tools/verification/rvgen/rvgen/dot2c.py    | 42 +++++++++++++++---------------
 tools/verification/rvgen/rvgen/dot2k.py    | 20 +++++++-------
 3 files changed, 41 insertions(+), 41 deletions(-)

(limited to 'tools')

diff --git a/tools/verification/rvgen/rvgen/automata.py b/tools/verification/rvgen/rvgen/automata.py
index d9a3fe2b74bf..3f06aef8d4fd 100644
--- a/tools/verification/rvgen/rvgen/automata.py
+++ b/tools/verification/rvgen/rvgen/automata.py
@@ -28,7 +28,7 @@ class Automata:
         self.function = self.__create_matrix()
         self.events_start, self.events_start_run = self.__store_init_events()
 
-    def __get_model_name(self):
+    def __get_model_name(self) -> str:
         basename = ntpath.basename(self.__dot_path)
         if not basename.endswith(".dot") and not basename.endswith(".gv"):
             print("not a dot file")
@@ -40,7 +40,7 @@ class Automata:
 
         return model_name
 
-    def __open_dot(self):
+    def __open_dot(self) -> list[str]:
         cursor = 0
         dot_lines = []
         try:
@@ -60,13 +60,13 @@ class Automata:
             cursor += 1
         return dot_lines
 
-    def __get_cursor_begin_states(self):
+    def __get_cursor_begin_states(self) -> int:
         cursor = 0
         while self.__dot_lines[cursor].split()[0] != "{node":
             cursor += 1
         return cursor
 
-    def __get_cursor_begin_events(self):
+    def __get_cursor_begin_events(self) -> int:
         cursor = 0
         while self.__dot_lines[cursor].split()[0] != "{node":
             cursor += 1
@@ -76,7 +76,7 @@ class Automata:
         cursor += 1
         return cursor
 
-    def __get_state_variables(self):
+    def __get_state_variables(self) -> tuple[list[str], str, list[str]]:
         # wait for node declaration
         states = []
         final_states = []
@@ -116,7 +116,7 @@ class Automata:
 
         return states, initial_state, final_states
 
-    def __get_event_variables(self):
+    def __get_event_variables(self) -> list[str]:
         # here we are at the begin of transitions, take a note, we will return later.
         cursor = self.__get_cursor_begin_events()
 
@@ -140,7 +140,7 @@ class Automata:
 
         return sorted(set(events))
 
-    def __create_matrix(self):
+    def __create_matrix(self) -> list[list[str]]:
         # transform the array into a dictionary
         events = self.events
         states = self.states
@@ -174,7 +174,7 @@ class Automata:
 
         return matrix
 
-    def __store_init_events(self):
+    def __store_init_events(self) -> tuple[list[bool], list[bool]]:
         events_start = [False] * len(self.events)
         events_start_run = [False] * len(self.events)
         for i, _ in enumerate(self.events):
@@ -196,10 +196,10 @@ class Automata:
                 events_start_run[i] = True
         return events_start, events_start_run
 
-    def is_start_event(self, event):
+    def is_start_event(self, event: str) -> bool:
         return self.events_start[self.events.index(event)]
 
-    def is_start_run_event(self, event):
+    def is_start_run_event(self, event: str) -> bool:
         # prefer handle_start_event if there
         if any(self.events_start):
             return False
diff --git a/tools/verification/rvgen/rvgen/dot2c.py b/tools/verification/rvgen/rvgen/dot2c.py
index b9b6f14cc536..fd64174fcfad 100644
--- a/tools/verification/rvgen/rvgen/dot2c.py
+++ b/tools/verification/rvgen/rvgen/dot2c.py
@@ -35,7 +35,7 @@ class Dot2c(Automata):
         # cut off the last \n
         return string[:-1]
 
-    def __get_enum_states_content(self):
+    def __get_enum_states_content(self) -> list[str]:
         buff = []
         buff.append("\t%s%s = 0," % (self.initial_state, self.enum_suffix))
         for state in self.states:
@@ -49,7 +49,7 @@ class Dot2c(Automata):
         buff = self.__get_enum_states_content()
         return self.__buff_to_string(buff)
 
-    def format_states_enum(self):
+    def format_states_enum(self) -> list[str]:
         buff = []
         buff.append("enum %s {" % self.enum_states_def)
         buff.append(self.get_enum_states_string())
@@ -57,7 +57,7 @@ class Dot2c(Automata):
 
         return buff
 
-    def __get_enum_events_content(self):
+    def __get_enum_events_content(self) -> list[str]:
         buff = []
         first = True
         for event in self.events:
@@ -75,7 +75,7 @@ class Dot2c(Automata):
         buff = self.__get_enum_events_content()
         return self.__buff_to_string(buff)
 
-    def format_events_enum(self):
+    def format_events_enum(self) -> list[str]:
         buff = []
         buff.append("enum %s {" % self.enum_events_def)
         buff.append(self.get_enum_events_string())
@@ -83,7 +83,7 @@ class Dot2c(Automata):
 
         return buff
 
-    def get_minimun_type(self):
+    def get_minimun_type(self) -> str:
         min_type = "unsigned char"
 
         if self.states.__len__() > 255:
@@ -97,7 +97,7 @@ class Dot2c(Automata):
 
         return min_type
 
-    def format_automaton_definition(self):
+    def format_automaton_definition(self) -> list[str]:
         min_type = self.get_minimun_type()
         buff = []
         buff.append("struct %s {" % self.struct_automaton_def)
@@ -109,12 +109,12 @@ class Dot2c(Automata):
         buff.append("};\n")
         return buff
 
-    def format_aut_init_header(self):
+    def format_aut_init_header(self) -> list[str]:
         buff = []
         buff.append("static const struct %s %s = {" % (self.struct_automaton_def, self.var_automaton_def))
         return buff
 
-    def __get_string_vector_per_line_content(self, buff):
+    def __get_string_vector_per_line_content(self, buff: list[str]) -> str:
         first = True
         string = ""
         for entry in buff:
@@ -133,14 +133,14 @@ class Dot2c(Automata):
     def get_aut_init_states_string(self):
         return self.__get_string_vector_per_line_content(self.states)
 
-    def format_aut_init_events_string(self):
+    def format_aut_init_events_string(self) -> list[str]:
         buff = []
         buff.append("\t.event_names = {")
         buff.append(self.get_aut_init_events_string())
         buff.append("\t},")
         return buff
 
-    def format_aut_init_states_string(self):
+    def format_aut_init_states_string(self) -> list[str]:
         buff = []
         buff.append("\t.state_names = {")
         buff.append(self.get_aut_init_states_string())
@@ -148,11 +148,11 @@ class Dot2c(Automata):
 
         return buff
 
-    def __get_max_strlen_of_states(self):
+    def __get_max_strlen_of_states(self) -> int:
         max_state_name = max(self.states, key = len).__len__()
         return max(max_state_name, self.invalid_state_str.__len__())
 
-    def get_aut_init_function(self):
+    def get_aut_init_function(self) -> str:
         nr_states = self.states.__len__()
         nr_events = self.events.__len__()
         buff = []
@@ -180,7 +180,7 @@ class Dot2c(Automata):
 
         return self.__buff_to_string(buff)
 
-    def format_aut_init_function(self):
+    def format_aut_init_function(self) -> list[str]:
         buff = []
         buff.append("\t.function = {")
         buff.append(self.get_aut_init_function())
@@ -188,17 +188,17 @@ class Dot2c(Automata):
 
         return buff
 
-    def get_aut_init_initial_state(self):
+    def get_aut_init_initial_state(self) -> str:
         return self.initial_state
 
-    def format_aut_init_initial_state(self):
+    def format_aut_init_initial_state(self) -> list[str]:
         buff = []
         initial_state = self.get_aut_init_initial_state()
         buff.append("\t.initial_state = " + initial_state + self.enum_suffix + ",")
 
         return buff
 
-    def get_aut_init_final_states(self):
+    def get_aut_init_final_states(self) -> str:
         line = ""
         first = True
         for state in self.states:
@@ -213,29 +213,29 @@ class Dot2c(Automata):
                 line = line + '0'
         return line
 
-    def format_aut_init_final_states(self):
+    def format_aut_init_final_states(self) -> list[str]:
        buff = []
        buff.append("\t.final_states = { %s }," % self.get_aut_init_final_states())
 
        return buff
 
-    def __get_automaton_initialization_footer_string(self):
+    def __get_automaton_initialization_footer_string(self) -> str:
         footer = "};\n"
         return footer
 
-    def format_aut_init_footer(self):
+    def format_aut_init_footer(self) -> list[str]:
         buff = []
         buff.append(self.__get_automaton_initialization_footer_string())
 
         return buff
 
-    def format_invalid_state(self):
+    def format_invalid_state(self) -> list[str]:
         buff = []
         buff.append("#define %s state_max%s\n" % (self.invalid_state_str, self.enum_suffix))
 
         return buff
 
-    def format_model(self):
+    def format_model(self) -> list[str]:
         buff = []
         buff += self.format_states_enum()
         buff += self.format_invalid_state()
diff --git a/tools/verification/rvgen/rvgen/dot2k.py b/tools/verification/rvgen/rvgen/dot2k.py
index d618a842fc52..6128fe238430 100644
--- a/tools/verification/rvgen/rvgen/dot2k.py
+++ b/tools/verification/rvgen/rvgen/dot2k.py
@@ -21,10 +21,10 @@ class dot2k(Monitor, Dot2c):
         Dot2c.__init__(self, file_path, extra_params.get("model_name"))
         self.enum_suffix = "_%s" % self.name
 
-    def fill_monitor_type(self):
+    def fill_monitor_type(self) -> str:
         return self.monitor_type.upper()
 
-    def fill_tracepoint_handlers_skel(self):
+    def fill_tracepoint_handlers_skel(self) -> str:
         buff = []
         for event in self.events:
             buff.append("static void handle_%s(void *data, /* XXX: fill header */)" % event)
@@ -45,19 +45,19 @@ class dot2k(Monitor, Dot2c):
             buff.append("")
         return '\n'.join(buff)
 
-    def fill_tracepoint_attach_probe(self):
+    def fill_tracepoint_attach_probe(self) -> str:
         buff = []
         for event in self.events:
             buff.append("\trv_attach_trace_probe(\"%s\", /* XXX: tracepoint */, handle_%s);" % (self.name, event))
         return '\n'.join(buff)
 
-    def fill_tracepoint_detach_helper(self):
+    def fill_tracepoint_detach_helper(self) -> str:
         buff = []
         for event in self.events:
             buff.append("\trv_detach_trace_probe(\"%s\", /* XXX: tracepoint */, handle_%s);" % (self.name, event))
         return '\n'.join(buff)
 
-    def fill_model_h_header(self):
+    def fill_model_h_header(self) -> list[str]:
         buff = []
         buff.append("/* SPDX-License-Identifier: GPL-2.0 */")
         buff.append("/*")
@@ -71,7 +71,7 @@ class dot2k(Monitor, Dot2c):
 
         return buff
 
-    def fill_model_h(self):
+    def fill_model_h(self) -> str:
         #
         # Adjust the definition names
         #
@@ -85,17 +85,17 @@ class dot2k(Monitor, Dot2c):
 
         return '\n'.join(buff)
 
-    def fill_monitor_class_type(self):
+    def fill_monitor_class_type(self) -> str:
         if self.monitor_type == "per_task":
             return "DA_MON_EVENTS_ID"
         return "DA_MON_EVENTS_IMPLICIT"
 
-    def fill_monitor_class(self):
+    def fill_monitor_class(self) -> str:
         if self.monitor_type == "per_task":
             return "da_monitor_id"
         return "da_monitor"
 
-    def fill_tracepoint_args_skel(self, tp_type):
+    def fill_tracepoint_args_skel(self, tp_type: str) -> str:
         buff = []
         tp_args_event = [
                 ("char *", "state"),
@@ -117,7 +117,7 @@ class dot2k(Monitor, Dot2c):
         buff.append("	     TP_ARGS(%s)" % tp_args_c)
         return '\n'.join(buff)
 
-    def fill_main_c(self):
+    def fill_main_c(self) -> str:
         main_c = super().fill_main_c()
 
         min_type = self.get_minimun_type()
-- 
cgit v1.2.3


From 0d2405a086a035cce1e0ba1aa0849bd2104a4d6b Mon Sep 17 00:00:00 2001
From: Gabriele Monaco <gmonaco@redhat.com>
Date: Wed, 26 Nov 2025 11:42:37 +0100
Subject: verification/dot2c: Remove __buff_to_string() and cleanup

str.join() can do what __buff_to_string() does. Therefore replace
__buff_to_string() to make the scripts more pythonic.

Also clean and remove some intermediate functions.

Reviewed-by: Nam Cao <namcao@linutronix.de>
Link: https://lore.kernel.org/r/20251126104241.291258-7-gmonaco@redhat.com
Signed-off-by: Gabriele Monaco <gmonaco@redhat.com>
---
 tools/verification/rvgen/rvgen/dot2c.py | 35 ++++++---------------------------
 1 file changed, 6 insertions(+), 29 deletions(-)

(limited to 'tools')

diff --git a/tools/verification/rvgen/rvgen/dot2c.py b/tools/verification/rvgen/rvgen/dot2c.py
index fd64174fcfad..24894411c3cd 100644
--- a/tools/verification/rvgen/rvgen/dot2c.py
+++ b/tools/verification/rvgen/rvgen/dot2c.py
@@ -26,15 +26,6 @@ class Dot2c(Automata):
         super().__init__(file_path, model_name)
         self.line_length = 100
 
-    def __buff_to_string(self, buff):
-        string = ""
-
-        for line in buff:
-            string = string + line + "\n"
-
-        # cut off the last \n
-        return string[:-1]
-
     def __get_enum_states_content(self) -> list[str]:
         buff = []
         buff.append("\t%s%s = 0," % (self.initial_state, self.enum_suffix))
@@ -45,14 +36,10 @@ class Dot2c(Automata):
 
         return buff
 
-    def get_enum_states_string(self):
-        buff = self.__get_enum_states_content()
-        return self.__buff_to_string(buff)
-
     def format_states_enum(self) -> list[str]:
         buff = []
         buff.append("enum %s {" % self.enum_states_def)
-        buff.append(self.get_enum_states_string())
+        buff += self.__get_enum_states_content()
         buff.append("};\n")
 
         return buff
@@ -71,14 +58,10 @@ class Dot2c(Automata):
 
         return buff
 
-    def get_enum_events_string(self):
-        buff = self.__get_enum_events_content()
-        return self.__buff_to_string(buff)
-
     def format_events_enum(self) -> list[str]:
         buff = []
         buff.append("enum %s {" % self.enum_events_def)
-        buff.append(self.get_enum_events_string())
+        buff += self.__get_enum_events_content()
         buff.append("};\n")
 
         return buff
@@ -127,23 +110,17 @@ class Dot2c(Automata):
 
         return string
 
-    def get_aut_init_events_string(self):
-        return self.__get_string_vector_per_line_content(self.events)
-
-    def get_aut_init_states_string(self):
-        return self.__get_string_vector_per_line_content(self.states)
-
     def format_aut_init_events_string(self) -> list[str]:
         buff = []
         buff.append("\t.event_names = {")
-        buff.append(self.get_aut_init_events_string())
+        buff.append(self.__get_string_vector_per_line_content(self.events))
         buff.append("\t},")
         return buff
 
     def format_aut_init_states_string(self) -> list[str]:
         buff = []
         buff.append("\t.state_names = {")
-        buff.append(self.get_aut_init_states_string())
+        buff.append(self.__get_string_vector_per_line_content(self.states))
         buff.append("\t},")
 
         return buff
@@ -178,7 +155,7 @@ class Dot2c(Automata):
                     line += "\n\t\t}," if linetoolong else " },"
             buff.append(line)
 
-        return self.__buff_to_string(buff)
+        return '\n'.join(buff)
 
     def format_aut_init_function(self) -> list[str]:
         buff = []
@@ -253,4 +230,4 @@ class Dot2c(Automata):
 
     def print_model_classic(self):
         buff = self.format_model()
-        print(self.__buff_to_string(buff))
+        print('\n'.join(buff))
-- 
cgit v1.2.3


From 3d2bfeeef340c8494eba80e7a005159cac69c2f7 Mon Sep 17 00:00:00 2001
From: Gabriele Monaco <gmonaco@redhat.com>
Date: Wed, 26 Nov 2025 11:42:38 +0100
Subject: verification/dot2c: Remove superfluous enum assignment and add last
 comma

The header files generated by dot2c currently create enums for states
and events assigning the first element to 0. This is superfluous as it
happens automatically if no value is specified.
Also it doesn't add a comma to the last enum elements, which slightly
complicates the diff if states or events are added.

Remove the assignment to 0 and add a comma to last elements, this
simplifies the logic for the code generator.

Reviewed-by: Nam Cao <namcao@linutronix.de>
Link: https://lore.kernel.org/r/20251126104241.291258-8-gmonaco@redhat.com
Signed-off-by: Gabriele Monaco <gmonaco@redhat.com>
---
 kernel/trace/rv/monitors/nrp/nrp.h      | 20 +++++++++---------
 kernel/trace/rv/monitors/opid/opid.h    | 22 ++++++++++----------
 kernel/trace/rv/monitors/sco/sco.h      | 12 +++++------
 kernel/trace/rv/monitors/scpd/scpd.h    | 12 +++++------
 kernel/trace/rv/monitors/snep/snep.h    | 16 +++++++--------
 kernel/trace/rv/monitors/snroc/snroc.h  | 12 +++++------
 kernel/trace/rv/monitors/sssw/sssw.h    | 20 +++++++++---------
 kernel/trace/rv/monitors/sts/sts.h      | 26 ++++++++++++------------
 kernel/trace/rv/monitors/wip/wip.h      | 12 +++++------
 kernel/trace/rv/monitors/wwnr/wwnr.h    | 12 +++++------
 tools/verification/rvgen/rvgen/dot2c.py | 36 +++++++++++----------------------
 11 files changed, 94 insertions(+), 106 deletions(-)

(limited to 'tools')

diff --git a/kernel/trace/rv/monitors/nrp/nrp.h b/kernel/trace/rv/monitors/nrp/nrp.h
index c2ec83da2124..3270d4c0139f 100644
--- a/kernel/trace/rv/monitors/nrp/nrp.h
+++ b/kernel/trace/rv/monitors/nrp/nrp.h
@@ -8,21 +8,21 @@
 #define MONITOR_NAME nrp
 
 enum states_nrp {
-	preempt_irq_nrp = 0,
+	preempt_irq_nrp,
 	any_thread_running_nrp,
 	nested_preempt_nrp,
 	rescheduling_nrp,
-	state_max_nrp
+	state_max_nrp,
 };
 
 #define INVALID_STATE state_max_nrp
 
 enum events_nrp {
-	irq_entry_nrp = 0,
+	irq_entry_nrp,
 	sched_need_resched_nrp,
 	schedule_entry_nrp,
 	schedule_entry_preempt_nrp,
-	event_max_nrp
+	event_max_nrp,
 };
 
 struct automaton_nrp {
@@ -38,38 +38,38 @@ static const struct automaton_nrp automaton_nrp = {
 		"preempt_irq",
 		"any_thread_running",
 		"nested_preempt",
-		"rescheduling"
+		"rescheduling",
 	},
 	.event_names = {
 		"irq_entry",
 		"sched_need_resched",
 		"schedule_entry",
-		"schedule_entry_preempt"
+		"schedule_entry_preempt",
 	},
 	.function = {
 		{
 			preempt_irq_nrp,
 			preempt_irq_nrp,
 			nested_preempt_nrp,
-			nested_preempt_nrp
+			nested_preempt_nrp,
 		},
 		{
 			any_thread_running_nrp,
 			rescheduling_nrp,
 			any_thread_running_nrp,
-			INVALID_STATE
+			INVALID_STATE,
 		},
 		{
 			nested_preempt_nrp,
 			preempt_irq_nrp,
 			any_thread_running_nrp,
-			any_thread_running_nrp
+			any_thread_running_nrp,
 		},
 		{
 			preempt_irq_nrp,
 			rescheduling_nrp,
 			any_thread_running_nrp,
-			any_thread_running_nrp
+			any_thread_running_nrp,
 		},
 	},
 	.initial_state = preempt_irq_nrp,
diff --git a/kernel/trace/rv/monitors/opid/opid.h b/kernel/trace/rv/monitors/opid/opid.h
index 5014f1b85ecf..092992514970 100644
--- a/kernel/trace/rv/monitors/opid/opid.h
+++ b/kernel/trace/rv/monitors/opid/opid.h
@@ -8,25 +8,25 @@
 #define MONITOR_NAME opid
 
 enum states_opid {
-	disabled_opid = 0,
+	disabled_opid,
 	enabled_opid,
 	in_irq_opid,
 	irq_disabled_opid,
 	preempt_disabled_opid,
-	state_max_opid
+	state_max_opid,
 };
 
 #define INVALID_STATE state_max_opid
 
 enum events_opid {
-	irq_disable_opid = 0,
+	irq_disable_opid,
 	irq_enable_opid,
 	irq_entry_opid,
 	preempt_disable_opid,
 	preempt_enable_opid,
 	sched_need_resched_opid,
 	sched_waking_opid,
-	event_max_opid
+	event_max_opid,
 };
 
 struct automaton_opid {
@@ -43,7 +43,7 @@ static const struct automaton_opid automaton_opid = {
 		"enabled",
 		"in_irq",
 		"irq_disabled",
-		"preempt_disabled"
+		"preempt_disabled",
 	},
 	.event_names = {
 		"irq_disable",
@@ -52,7 +52,7 @@ static const struct automaton_opid automaton_opid = {
 		"preempt_disable",
 		"preempt_enable",
 		"sched_need_resched",
-		"sched_waking"
+		"sched_waking",
 	},
 	.function = {
 		{
@@ -62,7 +62,7 @@ static const struct automaton_opid automaton_opid = {
 			INVALID_STATE,
 			irq_disabled_opid,
 			disabled_opid,
-			disabled_opid
+			disabled_opid,
 		},
 		{
 			irq_disabled_opid,
@@ -71,7 +71,7 @@ static const struct automaton_opid automaton_opid = {
 			preempt_disabled_opid,
 			enabled_opid,
 			INVALID_STATE,
-			INVALID_STATE
+			INVALID_STATE,
 		},
 		{
 			INVALID_STATE,
@@ -80,7 +80,7 @@ static const struct automaton_opid automaton_opid = {
 			INVALID_STATE,
 			INVALID_STATE,
 			in_irq_opid,
-			in_irq_opid
+			in_irq_opid,
 		},
 		{
 			INVALID_STATE,
@@ -89,7 +89,7 @@ static const struct automaton_opid automaton_opid = {
 			disabled_opid,
 			INVALID_STATE,
 			irq_disabled_opid,
-			INVALID_STATE
+			INVALID_STATE,
 		},
 		{
 			disabled_opid,
@@ -98,7 +98,7 @@ static const struct automaton_opid automaton_opid = {
 			INVALID_STATE,
 			enabled_opid,
 			INVALID_STATE,
-			INVALID_STATE
+			INVALID_STATE,
 		},
 	},
 	.initial_state = disabled_opid,
diff --git a/kernel/trace/rv/monitors/sco/sco.h b/kernel/trace/rv/monitors/sco/sco.h
index 06b1c420ce54..bac3beb51e72 100644
--- a/kernel/trace/rv/monitors/sco/sco.h
+++ b/kernel/trace/rv/monitors/sco/sco.h
@@ -8,18 +8,18 @@
 #define MONITOR_NAME sco
 
 enum states_sco {
-	thread_context_sco = 0,
+	thread_context_sco,
 	scheduling_context_sco,
-	state_max_sco
+	state_max_sco,
 };
 
 #define INVALID_STATE state_max_sco
 
 enum events_sco {
-	sched_set_state_sco = 0,
+	sched_set_state_sco,
 	schedule_entry_sco,
 	schedule_exit_sco,
-	event_max_sco
+	event_max_sco,
 };
 
 struct automaton_sco {
@@ -33,12 +33,12 @@ struct automaton_sco {
 static const struct automaton_sco automaton_sco = {
 	.state_names = {
 		"thread_context",
-		"scheduling_context"
+		"scheduling_context",
 	},
 	.event_names = {
 		"sched_set_state",
 		"schedule_entry",
-		"schedule_exit"
+		"schedule_exit",
 	},
 	.function = {
 		{     thread_context_sco, scheduling_context_sco,          INVALID_STATE },
diff --git a/kernel/trace/rv/monitors/scpd/scpd.h b/kernel/trace/rv/monitors/scpd/scpd.h
index 4a725a68085a..d6329da2671b 100644
--- a/kernel/trace/rv/monitors/scpd/scpd.h
+++ b/kernel/trace/rv/monitors/scpd/scpd.h
@@ -8,19 +8,19 @@
 #define MONITOR_NAME scpd
 
 enum states_scpd {
-	cant_sched_scpd = 0,
+	cant_sched_scpd,
 	can_sched_scpd,
-	state_max_scpd
+	state_max_scpd,
 };
 
 #define INVALID_STATE state_max_scpd
 
 enum events_scpd {
-	preempt_disable_scpd = 0,
+	preempt_disable_scpd,
 	preempt_enable_scpd,
 	schedule_entry_scpd,
 	schedule_exit_scpd,
-	event_max_scpd
+	event_max_scpd,
 };
 
 struct automaton_scpd {
@@ -34,13 +34,13 @@ struct automaton_scpd {
 static const struct automaton_scpd automaton_scpd = {
 	.state_names = {
 		"cant_sched",
-		"can_sched"
+		"can_sched",
 	},
 	.event_names = {
 		"preempt_disable",
 		"preempt_enable",
 		"schedule_entry",
-		"schedule_exit"
+		"schedule_exit",
 	},
 	.function = {
 		{     can_sched_scpd,     INVALID_STATE,     INVALID_STATE,     INVALID_STATE },
diff --git a/kernel/trace/rv/monitors/snep/snep.h b/kernel/trace/rv/monitors/snep/snep.h
index 753080dc5fa1..357520a5b3d1 100644
--- a/kernel/trace/rv/monitors/snep/snep.h
+++ b/kernel/trace/rv/monitors/snep/snep.h
@@ -8,19 +8,19 @@
 #define MONITOR_NAME snep
 
 enum states_snep {
-	non_scheduling_context_snep = 0,
+	non_scheduling_context_snep,
 	scheduling_contex_snep,
-	state_max_snep
+	state_max_snep,
 };
 
 #define INVALID_STATE state_max_snep
 
 enum events_snep {
-	preempt_disable_snep = 0,
+	preempt_disable_snep,
 	preempt_enable_snep,
 	schedule_entry_snep,
 	schedule_exit_snep,
-	event_max_snep
+	event_max_snep,
 };
 
 struct automaton_snep {
@@ -34,26 +34,26 @@ struct automaton_snep {
 static const struct automaton_snep automaton_snep = {
 	.state_names = {
 		"non_scheduling_context",
-		"scheduling_contex"
+		"scheduling_contex",
 	},
 	.event_names = {
 		"preempt_disable",
 		"preempt_enable",
 		"schedule_entry",
-		"schedule_exit"
+		"schedule_exit",
 	},
 	.function = {
 		{
 			non_scheduling_context_snep,
 			non_scheduling_context_snep,
 			scheduling_contex_snep,
-			INVALID_STATE
+			INVALID_STATE,
 		},
 		{
 			INVALID_STATE,
 			INVALID_STATE,
 			INVALID_STATE,
-			non_scheduling_context_snep
+			non_scheduling_context_snep,
 		},
 	},
 	.initial_state = non_scheduling_context_snep,
diff --git a/kernel/trace/rv/monitors/snroc/snroc.h b/kernel/trace/rv/monitors/snroc/snroc.h
index ada5ee08bdab..88b7328ad31a 100644
--- a/kernel/trace/rv/monitors/snroc/snroc.h
+++ b/kernel/trace/rv/monitors/snroc/snroc.h
@@ -8,18 +8,18 @@
 #define MONITOR_NAME snroc
 
 enum states_snroc {
-	other_context_snroc = 0,
+	other_context_snroc,
 	own_context_snroc,
-	state_max_snroc
+	state_max_snroc,
 };
 
 #define INVALID_STATE state_max_snroc
 
 enum events_snroc {
-	sched_set_state_snroc = 0,
+	sched_set_state_snroc,
 	sched_switch_in_snroc,
 	sched_switch_out_snroc,
-	event_max_snroc
+	event_max_snroc,
 };
 
 struct automaton_snroc {
@@ -33,12 +33,12 @@ struct automaton_snroc {
 static const struct automaton_snroc automaton_snroc = {
 	.state_names = {
 		"other_context",
-		"own_context"
+		"own_context",
 	},
 	.event_names = {
 		"sched_set_state",
 		"sched_switch_in",
-		"sched_switch_out"
+		"sched_switch_out",
 	},
 	.function = {
 		{      INVALID_STATE,  own_context_snroc,       INVALID_STATE },
diff --git a/kernel/trace/rv/monitors/sssw/sssw.h b/kernel/trace/rv/monitors/sssw/sssw.h
index 8409eaadc7e0..1a4b806061c3 100644
--- a/kernel/trace/rv/monitors/sssw/sssw.h
+++ b/kernel/trace/rv/monitors/sssw/sssw.h
@@ -8,17 +8,17 @@
 #define MONITOR_NAME sssw
 
 enum states_sssw {
-	runnable_sssw = 0,
+	runnable_sssw,
 	signal_wakeup_sssw,
 	sleepable_sssw,
 	sleeping_sssw,
-	state_max_sssw
+	state_max_sssw,
 };
 
 #define INVALID_STATE state_max_sssw
 
 enum events_sssw {
-	sched_set_state_runnable_sssw = 0,
+	sched_set_state_runnable_sssw,
 	sched_set_state_sleepable_sssw,
 	sched_switch_blocking_sssw,
 	sched_switch_in_sssw,
@@ -27,7 +27,7 @@ enum events_sssw {
 	sched_switch_yield_sssw,
 	sched_wakeup_sssw,
 	signal_deliver_sssw,
-	event_max_sssw
+	event_max_sssw,
 };
 
 struct automaton_sssw {
@@ -43,7 +43,7 @@ static const struct automaton_sssw automaton_sssw = {
 		"runnable",
 		"signal_wakeup",
 		"sleepable",
-		"sleeping"
+		"sleeping",
 	},
 	.event_names = {
 		"sched_set_state_runnable",
@@ -54,7 +54,7 @@ static const struct automaton_sssw automaton_sssw = {
 		"sched_switch_suspend",
 		"sched_switch_yield",
 		"sched_wakeup",
-		"signal_deliver"
+		"signal_deliver",
 	},
 	.function = {
 		{
@@ -66,7 +66,7 @@ static const struct automaton_sssw automaton_sssw = {
 			INVALID_STATE,
 			runnable_sssw,
 			runnable_sssw,
-			runnable_sssw
+			runnable_sssw,
 		},
 		{
 			INVALID_STATE,
@@ -77,7 +77,7 @@ static const struct automaton_sssw automaton_sssw = {
 			INVALID_STATE,
 			signal_wakeup_sssw,
 			signal_wakeup_sssw,
-			runnable_sssw
+			runnable_sssw,
 		},
 		{
 			runnable_sssw,
@@ -88,7 +88,7 @@ static const struct automaton_sssw automaton_sssw = {
 			sleeping_sssw,
 			signal_wakeup_sssw,
 			runnable_sssw,
-			sleepable_sssw
+			sleepable_sssw,
 		},
 		{
 			INVALID_STATE,
@@ -99,7 +99,7 @@ static const struct automaton_sssw automaton_sssw = {
 			INVALID_STATE,
 			INVALID_STATE,
 			runnable_sssw,
-			INVALID_STATE
+			INVALID_STATE,
 		},
 	},
 	.initial_state = runnable_sssw,
diff --git a/kernel/trace/rv/monitors/sts/sts.h b/kernel/trace/rv/monitors/sts/sts.h
index 3779d7f99404..6f7b2d9d72e6 100644
--- a/kernel/trace/rv/monitors/sts/sts.h
+++ b/kernel/trace/rv/monitors/sts/sts.h
@@ -8,26 +8,26 @@
 #define MONITOR_NAME sts
 
 enum states_sts {
-	can_sched_sts = 0,
+	can_sched_sts,
 	cant_sched_sts,
 	disable_to_switch_sts,
 	enable_to_exit_sts,
 	in_irq_sts,
 	scheduling_sts,
 	switching_sts,
-	state_max_sts
+	state_max_sts,
 };
 
 #define INVALID_STATE state_max_sts
 
 enum events_sts {
-	irq_disable_sts = 0,
+	irq_disable_sts,
 	irq_enable_sts,
 	irq_entry_sts,
 	sched_switch_sts,
 	schedule_entry_sts,
 	schedule_exit_sts,
-	event_max_sts
+	event_max_sts,
 };
 
 struct automaton_sts {
@@ -46,7 +46,7 @@ static const struct automaton_sts automaton_sts = {
 		"enable_to_exit",
 		"in_irq",
 		"scheduling",
-		"switching"
+		"switching",
 	},
 	.event_names = {
 		"irq_disable",
@@ -54,7 +54,7 @@ static const struct automaton_sts automaton_sts = {
 		"irq_entry",
 		"sched_switch",
 		"schedule_entry",
-		"schedule_exit"
+		"schedule_exit",
 	},
 	.function = {
 		{
@@ -63,7 +63,7 @@ static const struct automaton_sts automaton_sts = {
 			INVALID_STATE,
 			INVALID_STATE,
 			scheduling_sts,
-			INVALID_STATE
+			INVALID_STATE,
 		},
 		{
 			INVALID_STATE,
@@ -71,7 +71,7 @@ static const struct automaton_sts automaton_sts = {
 			cant_sched_sts,
 			INVALID_STATE,
 			INVALID_STATE,
-			INVALID_STATE
+			INVALID_STATE,
 		},
 		{
 			INVALID_STATE,
@@ -79,7 +79,7 @@ static const struct automaton_sts automaton_sts = {
 			in_irq_sts,
 			switching_sts,
 			INVALID_STATE,
-			INVALID_STATE
+			INVALID_STATE,
 		},
 		{
 			enable_to_exit_sts,
@@ -87,7 +87,7 @@ static const struct automaton_sts automaton_sts = {
 			enable_to_exit_sts,
 			INVALID_STATE,
 			INVALID_STATE,
-			can_sched_sts
+			can_sched_sts,
 		},
 		{
 			INVALID_STATE,
@@ -95,7 +95,7 @@ static const struct automaton_sts automaton_sts = {
 			in_irq_sts,
 			INVALID_STATE,
 			INVALID_STATE,
-			INVALID_STATE
+			INVALID_STATE,
 		},
 		{
 			disable_to_switch_sts,
@@ -103,7 +103,7 @@ static const struct automaton_sts automaton_sts = {
 			INVALID_STATE,
 			INVALID_STATE,
 			INVALID_STATE,
-			INVALID_STATE
+			INVALID_STATE,
 		},
 		{
 			INVALID_STATE,
@@ -111,7 +111,7 @@ static const struct automaton_sts automaton_sts = {
 			INVALID_STATE,
 			INVALID_STATE,
 			INVALID_STATE,
-			INVALID_STATE
+			INVALID_STATE,
 		},
 	},
 	.initial_state = can_sched_sts,
diff --git a/kernel/trace/rv/monitors/wip/wip.h b/kernel/trace/rv/monitors/wip/wip.h
index cfdc52975354..b4c3eea94c86 100644
--- a/kernel/trace/rv/monitors/wip/wip.h
+++ b/kernel/trace/rv/monitors/wip/wip.h
@@ -8,18 +8,18 @@
 #define MONITOR_NAME wip
 
 enum states_wip {
-	preemptive_wip = 0,
+	preemptive_wip,
 	non_preemptive_wip,
-	state_max_wip
+	state_max_wip,
 };
 
 #define INVALID_STATE state_max_wip
 
 enum events_wip {
-	preempt_disable_wip = 0,
+	preempt_disable_wip,
 	preempt_enable_wip,
 	sched_waking_wip,
-	event_max_wip
+	event_max_wip,
 };
 
 struct automaton_wip {
@@ -33,12 +33,12 @@ struct automaton_wip {
 static const struct automaton_wip automaton_wip = {
 	.state_names = {
 		"preemptive",
-		"non_preemptive"
+		"non_preemptive",
 	},
 	.event_names = {
 		"preempt_disable",
 		"preempt_enable",
-		"sched_waking"
+		"sched_waking",
 	},
 	.function = {
 		{ non_preemptive_wip,      INVALID_STATE,      INVALID_STATE },
diff --git a/kernel/trace/rv/monitors/wwnr/wwnr.h b/kernel/trace/rv/monitors/wwnr/wwnr.h
index 85d12e42a955..a28006512c9b 100644
--- a/kernel/trace/rv/monitors/wwnr/wwnr.h
+++ b/kernel/trace/rv/monitors/wwnr/wwnr.h
@@ -8,18 +8,18 @@
 #define MONITOR_NAME wwnr
 
 enum states_wwnr {
-	not_running_wwnr = 0,
+	not_running_wwnr,
 	running_wwnr,
-	state_max_wwnr
+	state_max_wwnr,
 };
 
 #define INVALID_STATE state_max_wwnr
 
 enum events_wwnr {
-	switch_in_wwnr = 0,
+	switch_in_wwnr,
 	switch_out_wwnr,
 	wakeup_wwnr,
-	event_max_wwnr
+	event_max_wwnr,
 };
 
 struct automaton_wwnr {
@@ -33,12 +33,12 @@ struct automaton_wwnr {
 static const struct automaton_wwnr automaton_wwnr = {
 	.state_names = {
 		"not_running",
-		"running"
+		"running",
 	},
 	.event_names = {
 		"switch_in",
 		"switch_out",
-		"wakeup"
+		"wakeup",
 	},
 	.function = {
 		{       running_wwnr,      INVALID_STATE,   not_running_wwnr },
diff --git a/tools/verification/rvgen/rvgen/dot2c.py b/tools/verification/rvgen/rvgen/dot2c.py
index 24894411c3cd..06a26bf15a7e 100644
--- a/tools/verification/rvgen/rvgen/dot2c.py
+++ b/tools/verification/rvgen/rvgen/dot2c.py
@@ -28,11 +28,11 @@ class Dot2c(Automata):
 
     def __get_enum_states_content(self) -> list[str]:
         buff = []
-        buff.append("\t%s%s = 0," % (self.initial_state, self.enum_suffix))
+        buff.append("\t%s%s," % (self.initial_state, self.enum_suffix))
         for state in self.states:
             if state != self.initial_state:
                 buff.append("\t%s%s," % (state, self.enum_suffix))
-        buff.append("\tstate_max%s" % (self.enum_suffix))
+        buff.append("\tstate_max%s," % (self.enum_suffix))
 
         return buff
 
@@ -46,15 +46,10 @@ class Dot2c(Automata):
 
     def __get_enum_events_content(self) -> list[str]:
         buff = []
-        first = True
         for event in self.events:
-            if first:
-                buff.append("\t%s%s = 0," % (event, self.enum_suffix))
-                first = False
-            else:
-                buff.append("\t%s%s," % (event, self.enum_suffix))
+            buff.append("\t%s%s," % (event, self.enum_suffix))
 
-        buff.append("\tevent_max%s" % self.enum_suffix)
+        buff.append("\tevent_max%s," % self.enum_suffix)
 
         return buff
 
@@ -97,18 +92,11 @@ class Dot2c(Automata):
         buff.append("static const struct %s %s = {" % (self.struct_automaton_def, self.var_automaton_def))
         return buff
 
-    def __get_string_vector_per_line_content(self, buff: list[str]) -> str:
-        first = True
-        string = ""
-        for entry in buff:
-            if first:
-                string = string + "\t\t\"" + entry
-                first = False;
-            else:
-                string = string + "\",\n\t\t\"" + entry
-        string = string + "\""
-
-        return string
+    def __get_string_vector_per_line_content(self, entries: list[str]) -> str:
+        buff = []
+        for entry in entries:
+            buff.append(f"\t\t\"{entry}\",")
+        return "\n".join(buff)
 
     def format_aut_init_events_string(self) -> list[str]:
         buff = []
@@ -152,7 +140,7 @@ class Dot2c(Automata):
                 if y != nr_events-1:
                     line += ",\n" if linetoolong else ", "
                 else:
-                    line += "\n\t\t}," if linetoolong else " },"
+                    line += ",\n\t\t}," if linetoolong else " },"
             buff.append(line)
 
         return '\n'.join(buff)
@@ -179,12 +167,12 @@ class Dot2c(Automata):
         line = ""
         first = True
         for state in self.states:
-            if first == False:
+            if not first:
                 line = line + ', '
             else:
                 first = False
 
-            if self.final_states.__contains__(state):
+            if state in self.final_states:
                 line = line + '1'
             else:
                 line = line + '0'
-- 
cgit v1.2.3


From 3fee5b320c15c8f61e44729a9513347de6a93735 Mon Sep 17 00:00:00 2001
From: Gabriele Monaco <gmonaco@redhat.com>
Date: Wed, 26 Nov 2025 11:42:39 +0100
Subject: verification/rvgen: Remove unused variable declaration from
 containers

The monitor container source files contained a declaration and a
definition for the rv_monitor variable. The former is superfluous and
can be removed.

Remove the variable declaration from the template as well as the
existing monitor containers.

Reviewed-by: Nam Cao <namcao@linutronix.de>
Link: https://lore.kernel.org/r/20251126104241.291258-9-gmonaco@redhat.com
Signed-off-by: Gabriele Monaco <gmonaco@redhat.com>
---
 kernel/trace/rv/monitors/rtapp/rtapp.c                    | 2 --
 kernel/trace/rv/monitors/sched/sched.c                    | 2 --
 tools/verification/rvgen/rvgen/templates/container/main.c | 2 --
 3 files changed, 6 deletions(-)

(limited to 'tools')

diff --git a/kernel/trace/rv/monitors/rtapp/rtapp.c b/kernel/trace/rv/monitors/rtapp/rtapp.c
index fd75fc927d65..17f271231c99 100644
--- a/kernel/trace/rv/monitors/rtapp/rtapp.c
+++ b/kernel/trace/rv/monitors/rtapp/rtapp.c
@@ -8,8 +8,6 @@
 
 #include "rtapp.h"
 
-struct rv_monitor rv_rtapp;
-
 struct rv_monitor rv_rtapp = {
 	.name = "rtapp",
 	.description = "Collection of monitors for detecting problems with real-time applications",
diff --git a/kernel/trace/rv/monitors/sched/sched.c b/kernel/trace/rv/monitors/sched/sched.c
index d04db4b543f9..dd9d96fc6e21 100644
--- a/kernel/trace/rv/monitors/sched/sched.c
+++ b/kernel/trace/rv/monitors/sched/sched.c
@@ -8,8 +8,6 @@
 
 #include "sched.h"
 
-struct rv_monitor rv_sched;
-
 struct rv_monitor rv_sched = {
 	.name = "sched",
 	.description = "container for several scheduler monitor specifications.",
diff --git a/tools/verification/rvgen/rvgen/templates/container/main.c b/tools/verification/rvgen/rvgen/templates/container/main.c
index 7d9b2f95c7e9..5fc89b46f279 100644
--- a/tools/verification/rvgen/rvgen/templates/container/main.c
+++ b/tools/verification/rvgen/rvgen/templates/container/main.c
@@ -8,8 +8,6 @@
 
 #include "%%MODEL_NAME%%.h"
 
-struct rv_monitor rv_%%MODEL_NAME%%;
-
 struct rv_monitor rv_%%MODEL_NAME%% = {
 	.name = "%%MODEL_NAME%%",
 	.description = "%%DESCRIPTION%%",
-- 
cgit v1.2.3


From bbf8c67aa6ae8bd588f097510d887dad071f9f43 Mon Sep 17 00:00:00 2001
From: Changbin Du <changbin.du@huawei.com>
Date: Thu, 8 Jan 2026 19:38:36 +0800
Subject: tools: jobserver: Prevent deadlock caused by incorrect jobserver
 configuration and enhance error reporting

When using GNU Make's jobserver feature in kernel builds, a bug in MAKEFLAGS
propagation caused "--jobserver-auth=r,w" to reference an unintended file
descriptor. This led to infinite loops in jobserver-exec's os.read() calls
due to empty token.

My shell opened /etc/passwd for some reason without closing it, and as a
result, all child processes inherited this fd 3.

$ ls -l /proc/self/fd
total 0
lrwx------ 1 changbin changbin 64 Dec 25 13:03 0 -> /dev/pts/1
lrwx------ 1 changbin changbin 64 Dec 25 13:03 1 -> /dev/pts/1
lrwx------ 1 changbin changbin 64 Dec 25 13:03 2 -> /dev/pts/1
lr-x------ 1 changbin changbin 64 Dec 25 13:03 3 -> /etc/passwd
lr-x------ 1 changbin changbin 64 Dec 25 13:03 4 -> /proc/1421383/fd

In this case, the `make` should open a new file descriptor for jobserver
control, but clearly, it did not do so and instead still passed fd 3 as
"--jobserver-auth=3,4" in MAKEFLAGS. (The version of my gnu make is 4.3)

This update ensures robustness against invalid jobserver configurations,
even when `make` incorrectly pass non-pipe file descriptors.
 * Rejecting empty reads to prevent infinite loops on EOF.
 * Clearing `self.jobs` to avoid writing to incorrect files if invalid tokens
   are detected.
 * Printing detailed error messages to stderr to inform the user.

Cc: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Reviewed-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Signed-off-by: Changbin Du <changbin.du@huawei.com>
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
Message-ID: <20260108113836.2976527-1-changbin.du@huawei.com>
---
 tools/lib/python/jobserver.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/lib/python/jobserver.py b/tools/lib/python/jobserver.py
index a24f30ef4fa8..616411087725 100755
--- a/tools/lib/python/jobserver.py
+++ b/tools/lib/python/jobserver.py
@@ -91,6 +91,10 @@ class JobserverExec:
             while True:
                 try:
                     slot = os.read(self.reader, 8)
+                    if not slot:
+                        # Clear self.jobs to prevent us from probably writing incorrect file.
+                        self.jobs = b""
+                        raise ValueError("unexpected empty token from jobserver fd, invalid '--jobserver-auth=' setting?")
                     self.jobs += slot
                 except (OSError, IOError) as e:
                     if e.errno == errno.EWOULDBLOCK:
@@ -105,7 +109,8 @@ class JobserverExec:
             # to sit here blocked on our child.
             self.claim = len(self.jobs) + 1
 
-        except (KeyError, IndexError, ValueError, OSError, IOError):
+        except (KeyError, IndexError, ValueError, OSError, IOError) as e:
+            print(f"jobserver: warning: {repr(e)}", file=sys.stderr)
             # Any missing environment strings or bad fds should result in just
             # not being parallel.
             self.claim = None
-- 
cgit v1.2.3


From c1d7c0f9cdf6690eff4518f1c17a37d5ee647cd1 Mon Sep 17 00:00:00 2001
From: Caleb Sander Mateos <csander@purestorage.com>
Date: Thu, 8 Jan 2026 02:19:40 -0700
Subject: selftests: ublk: display UBLK_F_INTEGRITY support

Add support for printing the UBLK_F_INTEGRITY feature flag in the
human-readable kublk features output.

Signed-off-by: Caleb Sander Mateos <csander@purestorage.com>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 tools/testing/selftests/ublk/kublk.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c
index 185ba553686a..261095f19c93 100644
--- a/tools/testing/selftests/ublk/kublk.c
+++ b/tools/testing/selftests/ublk/kublk.c
@@ -1454,6 +1454,7 @@ static int cmd_dev_get_features(void)
 		FEAT_NAME(UBLK_F_QUIESCE),
 		FEAT_NAME(UBLK_F_PER_IO_DAEMON),
 		FEAT_NAME(UBLK_F_BUF_REG_OFF_DAEMON),
+		FEAT_NAME(UBLK_F_INTEGRITY),
 	};
 	struct ublk_dev *dev;
 	__u64 features = 0;
-- 
cgit v1.2.3


From 261b67f4e34716e793b0b95d2722b2fe780ed5f4 Mon Sep 17 00:00:00 2001
From: Caleb Sander Mateos <csander@purestorage.com>
Date: Thu, 8 Jan 2026 02:19:41 -0700
Subject: selftests: ublk: add utility to get block device metadata size

Some block device integrity parameters are available in sysfs, but
others are only accessible using the FS_IOC_GETLBMD_CAP ioctl. Add a
metadata_size utility program to print out the logical block metadata
size, PI offset, and PI size within the metadata. Example output:
$ metadata_size /dev/ublkb0
metadata_size: 64
pi_offset: 56
pi_tuple_size: 8

Signed-off-by: Caleb Sander Mateos <csander@purestorage.com>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 tools/testing/selftests/ublk/Makefile        |  5 ++--
 tools/testing/selftests/ublk/metadata_size.c | 36 ++++++++++++++++++++++++++++
 2 files changed, 39 insertions(+), 2 deletions(-)
 create mode 100644 tools/testing/selftests/ublk/metadata_size.c

(limited to 'tools')

diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile
index 06ba6fde098d..351ac6438561 100644
--- a/tools/testing/selftests/ublk/Makefile
+++ b/tools/testing/selftests/ublk/Makefile
@@ -49,12 +49,13 @@ TEST_PROGS += test_stress_05.sh
 TEST_PROGS += test_stress_06.sh
 TEST_PROGS += test_stress_07.sh
 
-TEST_GEN_PROGS_EXTENDED = kublk
+TEST_GEN_PROGS_EXTENDED = kublk metadata_size
+STANDALONE_UTILS := metadata_size.c
 
 LOCAL_HDRS += $(wildcard *.h)
 include ../lib.mk
 
-$(TEST_GEN_PROGS_EXTENDED): $(wildcard *.c)
+$(OUTPUT)/kublk: $(filter-out $(STANDALONE_UTILS),$(wildcard *.c))
 
 check:
 	shellcheck -x -f gcc *.sh
diff --git a/tools/testing/selftests/ublk/metadata_size.c b/tools/testing/selftests/ublk/metadata_size.c
new file mode 100644
index 000000000000..76ecddf04d25
--- /dev/null
+++ b/tools/testing/selftests/ublk/metadata_size.c
@@ -0,0 +1,36 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <fcntl.h>
+#include <linux/fs.h>
+#include <stdio.h>
+#include <sys/ioctl.h>
+
+int main(int argc, char **argv)
+{
+	struct logical_block_metadata_cap cap = {};
+	const char *filename;
+	int fd;
+	int result;
+
+	if (argc != 2) {
+		fprintf(stderr, "Usage: %s BLOCK_DEVICE\n", argv[0]);
+		return 1;
+	}
+
+	filename = argv[1];
+	fd = open(filename, O_RDONLY);
+	if (fd < 0) {
+		perror(filename);
+		return 1;
+	}
+
+	result = ioctl(fd, FS_IOC_GETLBMD_CAP, &cap);
+	if (result < 0) {
+		perror("ioctl");
+		return 1;
+	}
+
+	printf("metadata_size: %u\n", cap.lbmd_size);
+	printf("pi_offset: %u\n", cap.lbmd_pi_offset);
+	printf("pi_tuple_size: %u\n", cap.lbmd_pi_size);
+	return 0;
+}
-- 
cgit v1.2.3


From 6ed6476c4aefa9ee3ba90f39bcc002dd034f6e03 Mon Sep 17 00:00:00 2001
From: Caleb Sander Mateos <csander@purestorage.com>
Date: Thu, 8 Jan 2026 02:19:42 -0700
Subject: selftests: ublk: add kublk support for integrity params

Add integrity param command line arguments to kublk. Plumb these to
struct ublk_params for the null and fault_inject targets, as they don't
need to actually read or write the integrity data. Forbid the integrity
params for loop or stripe until the integrity data copy is implemented.

Signed-off-by: Caleb Sander Mateos <csander@purestorage.com>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 tools/testing/selftests/ublk/fault_inject.c |  1 +
 tools/testing/selftests/ublk/file_backed.c  |  4 +++
 tools/testing/selftests/ublk/kublk.c        | 47 +++++++++++++++++++++++++++++
 tools/testing/selftests/ublk/kublk.h        | 21 +++++++++++++
 tools/testing/selftests/ublk/null.c         |  1 +
 tools/testing/selftests/ublk/stripe.c       |  4 +++
 6 files changed, 78 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/ublk/fault_inject.c b/tools/testing/selftests/ublk/fault_inject.c
index b227bd78b252..3b897f69c014 100644
--- a/tools/testing/selftests/ublk/fault_inject.c
+++ b/tools/testing/selftests/ublk/fault_inject.c
@@ -33,6 +33,7 @@ static int ublk_fault_inject_tgt_init(const struct dev_ctx *ctx,
 			.dev_sectors		= dev_size >> 9,
 		},
 	};
+	ublk_set_integrity_params(ctx, &dev->tgt.params);
 
 	dev->private_data = (void *)(unsigned long)(ctx->fault_inject.delay_us * 1000);
 	return 0;
diff --git a/tools/testing/selftests/ublk/file_backed.c b/tools/testing/selftests/ublk/file_backed.c
index 269d5f124e06..c14ce6608696 100644
--- a/tools/testing/selftests/ublk/file_backed.c
+++ b/tools/testing/selftests/ublk/file_backed.c
@@ -158,6 +158,10 @@ static int ublk_loop_tgt_init(const struct dev_ctx *ctx, struct ublk_dev *dev)
 		ublk_err("%s: not support auto_zc_fallback\n", __func__);
 		return -EINVAL;
 	}
+	if (ctx->metadata_size) {
+		ublk_err("%s: integrity not supported\n", __func__);
+		return -EINVAL;
+	}
 
 	ret = backing_file_tgt_init(dev);
 	if (ret)
diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c
index 261095f19c93..48e1865b4875 100644
--- a/tools/testing/selftests/ublk/kublk.c
+++ b/tools/testing/selftests/ublk/kublk.c
@@ -3,6 +3,7 @@
  * Description: uring_cmd based ublk
  */
 
+#include <linux/fs.h>
 #include "kublk.h"
 
 #define MAX_NR_TGT_ARG 	64
@@ -1550,6 +1551,8 @@ static void __cmd_create_help(char *exe, bool recovery)
 	printf("\t[--foreground] [--quiet] [-z] [--auto_zc] [--auto_zc_fallback] [--debug_mask mask] [-r 0|1] [-g] [-u]\n");
 	printf("\t[-e 0|1 ] [-i 0|1] [--no_ublk_fixed_fd]\n");
 	printf("\t[--nthreads threads] [--per_io_tasks]\n");
+	printf("\t[--integrity_capable] [--integrity_reftag] [--metadata_size SIZE] "
+		 "[--pi_offset OFFSET] [--csum_type ip|t10dif|nvme] [--tag_size SIZE]\n");
 	printf("\t[target options] [backfile1] [backfile2] ...\n");
 	printf("\tdefault: nr_queues=2(max 32), depth=128(max 1024), dev_id=-1(auto allocation)\n");
 	printf("\tdefault: nthreads=nr_queues");
@@ -1613,6 +1616,12 @@ int main(int argc, char *argv[])
 		{ "nthreads",		1,	NULL,  0 },
 		{ "per_io_tasks",	0,	NULL,  0 },
 		{ "no_ublk_fixed_fd",	0,	NULL,  0 },
+		{ "integrity_capable",	0,	NULL,  0 },
+		{ "integrity_reftag",	0,	NULL,  0 },
+		{ "metadata_size",	1,	NULL,  0 },
+		{ "pi_offset",		1,	NULL,  0 },
+		{ "csum_type",		1,	NULL,  0 },
+		{ "tag_size",		1,	NULL,  0 },
 		{ 0, 0, 0, 0 }
 	};
 	const struct ublk_tgt_ops *ops = NULL;
@@ -1623,6 +1632,7 @@ int main(int argc, char *argv[])
 		.nr_hw_queues	=	2,
 		.dev_id		=	-1,
 		.tgt_type	=	"unknown",
+		.csum_type	=	LBMD_PI_CSUM_NONE,
 	};
 	int ret = -EINVAL, i;
 	int tgt_argc = 1;
@@ -1697,6 +1707,28 @@ int main(int argc, char *argv[])
 				ctx.per_io_tasks = 1;
 			if (!strcmp(longopts[option_idx].name, "no_ublk_fixed_fd"))
 				ctx.no_ublk_fixed_fd = 1;
+			if (!strcmp(longopts[option_idx].name, "integrity_capable"))
+				ctx.integrity_flags |= LBMD_PI_CAP_INTEGRITY;
+			if (!strcmp(longopts[option_idx].name, "integrity_reftag"))
+				ctx.integrity_flags |= LBMD_PI_CAP_REFTAG;
+			if (!strcmp(longopts[option_idx].name, "metadata_size"))
+				ctx.metadata_size = strtoul(optarg, NULL, 0);
+			if (!strcmp(longopts[option_idx].name, "pi_offset"))
+				ctx.pi_offset = strtoul(optarg, NULL, 0);
+			if (!strcmp(longopts[option_idx].name, "csum_type")) {
+				if (!strcmp(optarg, "ip")) {
+					ctx.csum_type = LBMD_PI_CSUM_IP;
+				} else if (!strcmp(optarg, "t10dif")) {
+					ctx.csum_type = LBMD_PI_CSUM_CRC16_T10DIF;
+				} else if (!strcmp(optarg, "nvme")) {
+					ctx.csum_type = LBMD_PI_CSUM_CRC64_NVME;
+				} else {
+					ublk_err("invalid csum_type: %s\n", optarg);
+					return -EINVAL;
+				}
+			}
+			if (!strcmp(longopts[option_idx].name, "tag_size"))
+				ctx.tag_size = strtoul(optarg, NULL, 0);
 			break;
 		case '?':
 			/*
@@ -1739,6 +1771,21 @@ int main(int argc, char *argv[])
 		return -EINVAL;
 	}
 
+	if (ctx.metadata_size) {
+		if (!(ctx.flags & UBLK_F_USER_COPY)) {
+			ublk_err("integrity requires user_copy\n");
+			return -EINVAL;
+		}
+
+		ctx.flags |= UBLK_F_INTEGRITY;
+	} else if (ctx.integrity_flags ||
+		   ctx.pi_offset ||
+		   ctx.csum_type != LBMD_PI_CSUM_NONE ||
+		   ctx.tag_size) {
+		ublk_err("integrity parameters require metadata_size\n");
+		return -EINVAL;
+	}
+
 	i = optind;
 	while (i < argc && ctx.nr_files < MAX_BACK_FILES) {
 		ctx.files[ctx.nr_files++] = argv[i++];
diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h
index 8a83b90ec603..d00f2b465cdf 100644
--- a/tools/testing/selftests/ublk/kublk.h
+++ b/tools/testing/selftests/ublk/kublk.h
@@ -78,6 +78,11 @@ struct dev_ctx {
 	unsigned int	auto_zc_fallback:1;
 	unsigned int	per_io_tasks:1;
 	unsigned int	no_ublk_fixed_fd:1;
+	__u32 integrity_flags;
+	__u8 metadata_size;
+	__u8 pi_offset;
+	__u8 csum_type;
+	__u8 tag_size;
 
 	int _evtfd;
 	int _shmid;
@@ -202,6 +207,22 @@ struct ublk_dev {
 
 extern int ublk_queue_io_cmd(struct ublk_thread *t, struct ublk_io *io);
 
+static inline void ublk_set_integrity_params(const struct dev_ctx *ctx,
+					     struct ublk_params *params)
+{
+	if (!ctx->metadata_size)
+		return;
+
+	params->types |= UBLK_PARAM_TYPE_INTEGRITY;
+	params->integrity = (struct ublk_param_integrity) {
+		.flags = ctx->integrity_flags,
+		.interval_exp = params->basic.logical_bs_shift,
+		.metadata_size = ctx->metadata_size,
+		.pi_offset = ctx->pi_offset,
+		.csum_type = ctx->csum_type,
+		.tag_size = ctx->tag_size,
+	};
+}
 
 static inline int ublk_io_auto_zc_fallback(const struct ublksrv_io_desc *iod)
 {
diff --git a/tools/testing/selftests/ublk/null.c b/tools/testing/selftests/ublk/null.c
index 280043f6b689..3aa162f08476 100644
--- a/tools/testing/selftests/ublk/null.c
+++ b/tools/testing/selftests/ublk/null.c
@@ -36,6 +36,7 @@ static int ublk_null_tgt_init(const struct dev_ctx *ctx, struct ublk_dev *dev)
 			.max_segments 		= 32,
 		},
 	};
+	ublk_set_integrity_params(ctx, &dev->tgt.params);
 
 	if (info->flags & UBLK_F_SUPPORT_ZERO_COPY)
 		dev->tgt.sq_depth = dev->tgt.cq_depth = 2 * info->queue_depth;
diff --git a/tools/testing/selftests/ublk/stripe.c b/tools/testing/selftests/ublk/stripe.c
index fd412e1f01c0..d4aaf3351d71 100644
--- a/tools/testing/selftests/ublk/stripe.c
+++ b/tools/testing/selftests/ublk/stripe.c
@@ -298,6 +298,10 @@ static int ublk_stripe_tgt_init(const struct dev_ctx *ctx, struct ublk_dev *dev)
 		ublk_err("%s: not support auto_zc_fallback\n", __func__);
 		return -EINVAL;
 	}
+	if (ctx->metadata_size) {
+		ublk_err("%s: integrity not supported\n", __func__);
+		return -EINVAL;
+	}
 
 	if ((chunk_size & (chunk_size - 1)) || !chunk_size) {
 		ublk_err("invalid chunk size %u\n", chunk_size);
-- 
cgit v1.2.3


From 24f8a44b797f03dfadb455138930523599d3c22a Mon Sep 17 00:00:00 2001
From: Caleb Sander Mateos <csander@purestorage.com>
Date: Thu, 8 Jan 2026 02:19:43 -0700
Subject: selftests: ublk: implement integrity user copy in kublk

If integrity data is enabled for kublk, allocate an integrity buffer for
each I/O. Extend ublk_user_copy() to copy the integrity data between the
ublk request and the integrity buffer if the ublksrv_io_desc indicates
that the request has integrity data.

Signed-off-by: Caleb Sander Mateos <csander@purestorage.com>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 tools/testing/selftests/ublk/kublk.c | 41 +++++++++++++++++++++++++++++++-----
 tools/testing/selftests/ublk/kublk.h | 14 ++++++++++++
 2 files changed, 50 insertions(+), 5 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c
index 48e1865b4875..d95937dd6167 100644
--- a/tools/testing/selftests/ublk/kublk.c
+++ b/tools/testing/selftests/ublk/kublk.c
@@ -416,8 +416,10 @@ static void ublk_queue_deinit(struct ublk_queue *q)
 	if (q->io_cmd_buf)
 		munmap(q->io_cmd_buf, ublk_queue_cmd_buf_sz(q));
 
-	for (i = 0; i < nr_ios; i++)
+	for (i = 0; i < nr_ios; i++) {
 		free(q->ios[i].buf_addr);
+		free(q->ios[i].integrity_buf);
+	}
 }
 
 static void ublk_thread_deinit(struct ublk_thread *t)
@@ -433,12 +435,13 @@ static void ublk_thread_deinit(struct ublk_thread *t)
 	}
 }
 
-static int ublk_queue_init(struct ublk_queue *q, unsigned long long extra_flags)
+static int ublk_queue_init(struct ublk_queue *q, unsigned long long extra_flags,
+			   __u8 metadata_size)
 {
 	struct ublk_dev *dev = q->dev;
 	int depth = dev->dev_info.queue_depth;
 	int i;
-	int cmd_buf_size, io_buf_size;
+	int cmd_buf_size, io_buf_size, integrity_size;
 	unsigned long off;
 
 	q->tgt_ops = dev->tgt.ops;
@@ -446,6 +449,7 @@ static int ublk_queue_init(struct ublk_queue *q, unsigned long long extra_flags)
 	q->q_depth = depth;
 	q->flags = dev->dev_info.flags;
 	q->flags |= extra_flags;
+	q->metadata_size = metadata_size;
 
 	/* Cache fd in queue for fast path access */
 	q->ublk_fd = dev->fds[0];
@@ -461,11 +465,23 @@ static int ublk_queue_init(struct ublk_queue *q, unsigned long long extra_flags)
 	}
 
 	io_buf_size = dev->dev_info.max_io_buf_bytes;
+	integrity_size = ublk_integrity_len(q, io_buf_size);
 	for (i = 0; i < q->q_depth; i++) {
 		q->ios[i].buf_addr = NULL;
 		q->ios[i].flags = UBLKS_IO_NEED_FETCH_RQ | UBLKS_IO_FREE;
 		q->ios[i].tag = i;
 
+		if (integrity_size) {
+			q->ios[i].integrity_buf = malloc(integrity_size);
+			if (!q->ios[i].integrity_buf) {
+				ublk_err("ublk dev %d queue %d io %d malloc(%d) failed: %m\n",
+					 dev->dev_info.dev_id, q->q_id, i,
+					 integrity_size);
+				goto fail;
+			}
+		}
+
+
 		if (ublk_queue_no_buf(q))
 			continue;
 
@@ -608,13 +624,13 @@ static void ublk_user_copy(const struct ublk_io *io, __u8 match_ublk_op)
 	__u8 ublk_op = ublksrv_get_op(iod);
 	__u32 len = iod->nr_sectors << 9;
 	void *addr = io->buf_addr;
+	ssize_t copied;
 
 	if (ublk_op != match_ublk_op)
 		return;
 
 	while (len) {
 		__u32 copy_len = min(len, UBLK_USER_COPY_LEN);
-		ssize_t copied;
 
 		if (ublk_op == UBLK_IO_OP_WRITE)
 			copied = pread(q->ublk_fd, addr, copy_len, off);
@@ -627,6 +643,20 @@ static void ublk_user_copy(const struct ublk_io *io, __u8 match_ublk_op)
 		off += copy_len;
 		len -= copy_len;
 	}
+
+	if (!(iod->op_flags & UBLK_IO_F_INTEGRITY))
+		return;
+
+	len = ublk_integrity_len(q, iod->nr_sectors << 9);
+	off = ublk_user_copy_offset(q->q_id, io->tag);
+	off |= UBLKSRV_IO_INTEGRITY_FLAG;
+	if (ublk_op == UBLK_IO_OP_WRITE)
+		copied = pread(q->ublk_fd, io->integrity_buf, len, off);
+	else if (ublk_op == UBLK_IO_OP_READ)
+		copied = pwrite(q->ublk_fd, io->integrity_buf, len, off);
+	else
+		assert(0);
+	assert(copied == (ssize_t)len);
 }
 
 int ublk_queue_io_cmd(struct ublk_thread *t, struct ublk_io *io)
@@ -1013,7 +1043,8 @@ static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev)
 		dev->q[i].dev = dev;
 		dev->q[i].q_id = i;
 
-		ret = ublk_queue_init(&dev->q[i], extra_flags);
+		ret = ublk_queue_init(&dev->q[i], extra_flags,
+				      ctx->metadata_size);
 		if (ret) {
 			ublk_err("ublk dev %d queue %d init queue failed\n",
 				 dinfo->dev_id, i);
diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h
index d00f2b465cdf..830b49a7716a 100644
--- a/tools/testing/selftests/ublk/kublk.h
+++ b/tools/testing/selftests/ublk/kublk.h
@@ -112,6 +112,7 @@ struct ublk_ctrl_cmd_data {
 
 struct ublk_io {
 	char *buf_addr;
+	void *integrity_buf;
 
 #define UBLKS_IO_NEED_FETCH_RQ		(1UL << 0)
 #define UBLKS_IO_NEED_COMMIT_RQ_COMP	(1UL << 1)
@@ -175,6 +176,7 @@ struct ublk_queue {
 #define UBLKS_Q_NO_UBLK_FIXED_FD	(1ULL << 62)
 	__u64 flags;
 	int ublk_fd;	/* cached ublk char device fd */
+	__u8 metadata_size;
 	struct ublk_io ios[UBLK_QUEUE_DEPTH];
 };
 
@@ -224,6 +226,18 @@ static inline void ublk_set_integrity_params(const struct dev_ctx *ctx,
 	};
 }
 
+static inline size_t ublk_integrity_len(const struct ublk_queue *q, size_t len)
+{
+	/* All targets currently use interval_exp = logical_bs_shift = 9 */
+	return (len >> 9) * q->metadata_size;
+}
+
+static inline size_t
+ublk_integrity_data_len(const struct ublk_queue *q, size_t integrity_len)
+{
+	return (integrity_len / q->metadata_size) << 9;
+}
+
 static inline int ublk_io_auto_zc_fallback(const struct ublksrv_io_desc *iod)
 {
 	return !!(iod->op_flags & UBLK_IO_F_NEED_REG_BUF);
-- 
cgit v1.2.3


From a1805442674b85ff9d626965f828e4fd71a82b28 Mon Sep 17 00:00:00 2001
From: Caleb Sander Mateos <csander@purestorage.com>
Date: Thu, 8 Jan 2026 02:19:44 -0700
Subject: selftests: ublk: support non-O_DIRECT backing files

A subsequent commit will add support for using a backing file to store
integrity data. Since integrity data is accessed in intervals of
metadata_size, which may be much smaller than a logical block on the
backing device, direct I/O cannot be used. Add an argument to
backing_file_tgt_init() to specify the number of files to open for
direct I/O. The remaining files will use buffered I/O. For now, continue
to request direct I/O for all the files.

Signed-off-by: Caleb Sander Mateos <csander@purestorage.com>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 tools/testing/selftests/ublk/common.c      | 4 ++--
 tools/testing/selftests/ublk/file_backed.c | 2 +-
 tools/testing/selftests/ublk/kublk.h       | 2 +-
 tools/testing/selftests/ublk/stripe.c      | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/ublk/common.c b/tools/testing/selftests/ublk/common.c
index 01580a6f8519..d9873d4d50d0 100644
--- a/tools/testing/selftests/ublk/common.c
+++ b/tools/testing/selftests/ublk/common.c
@@ -12,7 +12,7 @@ void backing_file_tgt_deinit(struct ublk_dev *dev)
 	}
 }
 
-int backing_file_tgt_init(struct ublk_dev *dev)
+int backing_file_tgt_init(struct ublk_dev *dev, unsigned int nr_direct)
 {
 	int fd, i;
 
@@ -25,7 +25,7 @@ int backing_file_tgt_init(struct ublk_dev *dev)
 
 		ublk_dbg(UBLK_DBG_DEV, "%s: file %d: %s\n", __func__, i, file);
 
-		fd = open(file, O_RDWR | O_DIRECT);
+		fd = open(file, O_RDWR | (i < nr_direct ? O_DIRECT : 0));
 		if (fd < 0) {
 			ublk_err("%s: backing file %s can't be opened: %s\n",
 					__func__, file, strerror(errno));
diff --git a/tools/testing/selftests/ublk/file_backed.c b/tools/testing/selftests/ublk/file_backed.c
index c14ce6608696..db4c176a4f28 100644
--- a/tools/testing/selftests/ublk/file_backed.c
+++ b/tools/testing/selftests/ublk/file_backed.c
@@ -163,7 +163,7 @@ static int ublk_loop_tgt_init(const struct dev_ctx *ctx, struct ublk_dev *dev)
 		return -EINVAL;
 	}
 
-	ret = backing_file_tgt_init(dev);
+	ret = backing_file_tgt_init(dev, 1);
 	if (ret)
 		return ret;
 
diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h
index 830b49a7716a..96c66b337bc0 100644
--- a/tools/testing/selftests/ublk/kublk.h
+++ b/tools/testing/selftests/ublk/kublk.h
@@ -462,6 +462,6 @@ extern const struct ublk_tgt_ops stripe_tgt_ops;
 extern const struct ublk_tgt_ops fault_inject_tgt_ops;
 
 void backing_file_tgt_deinit(struct ublk_dev *dev);
-int backing_file_tgt_init(struct ublk_dev *dev);
+int backing_file_tgt_init(struct ublk_dev *dev, unsigned int nr_direct);
 
 #endif
diff --git a/tools/testing/selftests/ublk/stripe.c b/tools/testing/selftests/ublk/stripe.c
index d4aaf3351d71..2be1c36438e7 100644
--- a/tools/testing/selftests/ublk/stripe.c
+++ b/tools/testing/selftests/ublk/stripe.c
@@ -315,7 +315,7 @@ static int ublk_stripe_tgt_init(const struct dev_ctx *ctx, struct ublk_dev *dev)
 
 	chunk_shift = ilog2(chunk_size);
 
-	ret = backing_file_tgt_init(dev);
+	ret = backing_file_tgt_init(dev, dev->tgt.nr_backing_files);
 	if (ret)
 		return ret;
 
-- 
cgit v1.2.3


From f48250dc5ba8368ccb587093eb20d1c7baecaacf Mon Sep 17 00:00:00 2001
From: Caleb Sander Mateos <csander@purestorage.com>
Date: Thu, 8 Jan 2026 02:19:45 -0700
Subject: selftests: ublk: add integrity data support to loop target

To perform and end-to-end test of integrity information through a ublk
device, we need to actually store it somewhere and retrieve it. Add this
support to kublk's loop target. It uses a second backing file for the
integrity data corresponding to the data stored in the first file.
The integrity file is initialized with byte 0xFF, which ensures the app
and reference tags are set to the "escape" pattern to disable the
bio-integrity-auto guard and reftag checks until the blocks are written.
The integrity file is opened without O_DIRECT since it will be accessed
at sub-block granularity. Each incoming read/write results in a pair of
reads/writes, one to the data file, and one to the integrity file. If
either backing I/O fails, the error is propagated to the ublk request.
If both backing I/Os read/write some bytes, the ublk request is
completed with the smaller of the number of blocks accessed by each I/O.

Signed-off-by: Caleb Sander Mateos <csander@purestorage.com>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 tools/testing/selftests/ublk/file_backed.c | 92 ++++++++++++++++++++++++------
 1 file changed, 74 insertions(+), 18 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/ublk/file_backed.c b/tools/testing/selftests/ublk/file_backed.c
index db4c176a4f28..c3ce5ff72422 100644
--- a/tools/testing/selftests/ublk/file_backed.c
+++ b/tools/testing/selftests/ublk/file_backed.c
@@ -35,9 +35,23 @@ static int loop_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q,
 	unsigned auto_zc = ublk_queue_use_auto_zc(q);
 	enum io_uring_op op = ublk_to_uring_op(iod, zc | auto_zc);
 	struct ublk_io *io = ublk_get_io(q, tag);
+	__u64 offset = iod->start_sector << 9;
+	__u32 len = iod->nr_sectors << 9;
 	struct io_uring_sqe *sqe[3];
 	void *addr = io->buf_addr;
 
+	if (iod->op_flags & UBLK_IO_F_INTEGRITY) {
+		ublk_io_alloc_sqes(t, sqe, 1);
+		/* Use second backing file for integrity data */
+		io_uring_prep_rw(op, sqe[0], ublk_get_registered_fd(q, 2),
+				 io->integrity_buf,
+				 ublk_integrity_len(q, len),
+				 ublk_integrity_len(q, offset));
+		sqe[0]->flags = IOSQE_FIXED_FILE;
+		/* tgt_data = 1 indicates integrity I/O */
+		sqe[0]->user_data = build_user_data(tag, ublk_op, 1, q->q_id, 1);
+	}
+
 	if (!zc || auto_zc) {
 		ublk_io_alloc_sqes(t, sqe, 1);
 		if (!sqe[0])
@@ -45,14 +59,14 @@ static int loop_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q,
 
 		io_uring_prep_rw(op, sqe[0], ublk_get_registered_fd(q, 1) /*fds[1]*/,
 				addr,
-				iod->nr_sectors << 9,
-				iod->start_sector << 9);
+				len,
+				offset);
 		if (auto_zc)
 			sqe[0]->buf_index = tag;
 		io_uring_sqe_set_flags(sqe[0], IOSQE_FIXED_FILE);
 		/* bit63 marks us as tgt io */
 		sqe[0]->user_data = build_user_data(tag, ublk_op, 0, q->q_id, 1);
-		return 1;
+		return !!(iod->op_flags & UBLK_IO_F_INTEGRITY) + 1;
 	}
 
 	ublk_io_alloc_sqes(t, sqe, 3);
@@ -63,8 +77,8 @@ static int loop_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q,
 			ublk_cmd_op_nr(sqe[0]->cmd_op), 0, q->q_id, 1);
 
 	io_uring_prep_rw(op, sqe[1], ublk_get_registered_fd(q, 1) /*fds[1]*/, 0,
-		iod->nr_sectors << 9,
-		iod->start_sector << 9);
+			len,
+			offset);
 	sqe[1]->buf_index = tag;
 	sqe[1]->flags |= IOSQE_FIXED_FILE | IOSQE_IO_HARDLINK;
 	sqe[1]->user_data = build_user_data(tag, ublk_op, 0, q->q_id, 1);
@@ -72,7 +86,7 @@ static int loop_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q,
 	io_uring_prep_buf_unregister(sqe[2], q, tag, q->q_id, io->buf_index);
 	sqe[2]->user_data = build_user_data(tag, ublk_cmd_op_nr(sqe[2]->cmd_op), 0, q->q_id, 1);
 
-	return 2;
+	return !!(iod->op_flags & UBLK_IO_F_INTEGRITY) + 2;
 }
 
 static int loop_queue_tgt_io(struct ublk_thread *t, struct ublk_queue *q, int tag)
@@ -119,12 +133,17 @@ static void ublk_loop_io_done(struct ublk_thread *t, struct ublk_queue *q,
 	unsigned op = user_data_to_op(cqe->user_data);
 	struct ublk_io *io = ublk_get_io(q, tag);
 
-	if (cqe->res < 0 || op != ublk_cmd_op_nr(UBLK_U_IO_UNREGISTER_IO_BUF)) {
-		if (!io->result)
-			io->result = cqe->res;
-		if (cqe->res < 0)
-			ublk_err("%s: io failed op %x user_data %lx\n",
-					__func__, op, cqe->user_data);
+	if (cqe->res < 0) {
+		io->result = cqe->res;
+		ublk_err("%s: io failed op %x user_data %lx\n",
+				__func__, op, cqe->user_data);
+	} else if (op != ublk_cmd_op_nr(UBLK_U_IO_UNREGISTER_IO_BUF)) {
+		__s32 data_len = user_data_to_tgt_data(cqe->user_data)
+			? ublk_integrity_data_len(q, cqe->res)
+			: cqe->res;
+
+		if (!io->result || data_len < io->result)
+			io->result = data_len;
 	}
 
 	/* buffer register op is IOSQE_CQE_SKIP_SUCCESS */
@@ -135,9 +154,30 @@ static void ublk_loop_io_done(struct ublk_thread *t, struct ublk_queue *q,
 		ublk_complete_io(t, q, tag, io->result);
 }
 
+static int ublk_loop_memset_file(int fd, __u8 byte, size_t len)
+{
+	off_t offset = 0;
+	__u8 buf[4096];
+
+	memset(buf, byte, sizeof(buf));
+	while (len) {
+		int ret = pwrite(fd, buf, min(len, sizeof(buf)), offset);
+
+		if (ret < 0)
+			return -errno;
+		if (!ret)
+			return -EIO;
+
+		len -= ret;
+		offset += ret;
+	}
+	return 0;
+}
+
 static int ublk_loop_tgt_init(const struct dev_ctx *ctx, struct ublk_dev *dev)
 {
 	unsigned long long bytes;
+	unsigned long blocks;
 	int ret;
 	struct ublk_params p = {
 		.types = UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DMA_ALIGN,
@@ -154,23 +194,39 @@ static int ublk_loop_tgt_init(const struct dev_ctx *ctx, struct ublk_dev *dev)
 		},
 	};
 
+	ublk_set_integrity_params(ctx, &p);
 	if (ctx->auto_zc_fallback) {
 		ublk_err("%s: not support auto_zc_fallback\n", __func__);
 		return -EINVAL;
 	}
-	if (ctx->metadata_size) {
-		ublk_err("%s: integrity not supported\n", __func__);
-		return -EINVAL;
-	}
 
+	/* Use O_DIRECT only for data file */
 	ret = backing_file_tgt_init(dev, 1);
 	if (ret)
 		return ret;
 
-	if (dev->tgt.nr_backing_files != 1)
+	/* Expect a second file for integrity data */
+	if (dev->tgt.nr_backing_files != 1 + !!ctx->metadata_size)
 		return -EINVAL;
 
-	bytes = dev->tgt.backing_file_size[0];
+	blocks = dev->tgt.backing_file_size[0] >> p.basic.logical_bs_shift;
+	if (ctx->metadata_size) {
+		unsigned long metadata_blocks =
+			dev->tgt.backing_file_size[1] / ctx->metadata_size;
+		unsigned long integrity_len;
+
+		/* Ensure both data and integrity data fit in backing files */
+		blocks = min(blocks, metadata_blocks);
+		integrity_len = blocks * ctx->metadata_size;
+		/*
+		 * Initialize PI app tag and ref tag to 0xFF
+		 * to disable bio-integrity-auto checks
+		 */
+		ret = ublk_loop_memset_file(dev->fds[2], 0xFF, integrity_len);
+		if (ret)
+			return ret;
+	}
+	bytes = blocks << p.basic.logical_bs_shift;
 	dev->tgt.dev_size = bytes;
 	p.basic.dev_sectors = bytes >> 9;
 	dev->tgt.params = p;
-- 
cgit v1.2.3


From 9e9f635525b12f055558a7cfe2e54d109839d030 Mon Sep 17 00:00:00 2001
From: Caleb Sander Mateos <csander@purestorage.com>
Date: Thu, 8 Jan 2026 02:19:46 -0700
Subject: selftests: ublk: add integrity params test

Add test case null_04 to exercise all the different integrity params. It
creates 4 different ublk devices with different combinations of
integrity arguments and verifies their integrity limits via sysfs and
the metadata_size utility.

Signed-off-by: Caleb Sander Mateos <csander@purestorage.com>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 tools/testing/selftests/ublk/Makefile        |   1 +
 tools/testing/selftests/ublk/test_common.sh  |  10 ++
 tools/testing/selftests/ublk/test_null_04.sh | 166 +++++++++++++++++++++++++++
 3 files changed, 177 insertions(+)
 create mode 100755 tools/testing/selftests/ublk/test_null_04.sh

(limited to 'tools')

diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile
index 351ac6438561..239ad1c741ef 100644
--- a/tools/testing/selftests/ublk/Makefile
+++ b/tools/testing/selftests/ublk/Makefile
@@ -27,6 +27,7 @@ TEST_PROGS += test_generic_15.sh
 TEST_PROGS += test_null_01.sh
 TEST_PROGS += test_null_02.sh
 TEST_PROGS += test_null_03.sh
+TEST_PROGS += test_null_04.sh
 TEST_PROGS += test_loop_01.sh
 TEST_PROGS += test_loop_02.sh
 TEST_PROGS += test_loop_03.sh
diff --git a/tools/testing/selftests/ublk/test_common.sh b/tools/testing/selftests/ublk/test_common.sh
index ea9a5f3eb70a..7ff6ce79d62c 100755
--- a/tools/testing/selftests/ublk/test_common.sh
+++ b/tools/testing/selftests/ublk/test_common.sh
@@ -384,6 +384,16 @@ _ublk_test_top_dir()
 	cd "$(dirname "$0")" && pwd
 }
 
+METADATA_SIZE_PROG="$(_ublk_test_top_dir)/metadata_size"
+
+_get_metadata_size()
+{
+	local dev_id=$1
+	local field=$2
+
+	"$METADATA_SIZE_PROG" "/dev/ublkb$dev_id" | grep "$field" | grep -o "[0-9]*"
+}
+
 UBLK_PROG=$(_ublk_test_top_dir)/kublk
 UBLK_TEST_QUIET=1
 UBLK_TEST_SHOW_RESULT=1
diff --git a/tools/testing/selftests/ublk/test_null_04.sh b/tools/testing/selftests/ublk/test_null_04.sh
new file mode 100755
index 000000000000..0b0719ea33a3
--- /dev/null
+++ b/tools/testing/selftests/ublk/test_null_04.sh
@@ -0,0 +1,166 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
+
+TID=null_04
+
+_prep_test "null" "integrity params"
+
+dev_id=$(_add_ublk_dev -t null -u --metadata_size 8)
+_check_add_dev $TID $?
+metadata_size=$(_get_metadata_size "$dev_id" metadata_size)
+if [ "$metadata_size" != 8 ]; then
+	echo "metadata_size $metadata_size != 8"
+	_show_result $TID 255
+fi
+pi_offset=$(_get_metadata_size "$dev_id" pi_offset)
+if [ "$pi_offset" != 0 ]; then
+	echo "pi_offset $pi_offset != 0"
+	_show_result $TID 255
+fi
+pi_tuple_size=$(_get_metadata_size "$dev_id" pi_tuple_size)
+if [ "$pi_tuple_size" != 0 ]; then
+	echo "pi_tuple_size $pi_tuple_size != 0"
+	_show_result $TID 255
+fi
+capable=$(cat "/sys/block/ublkb$dev_id/integrity/device_is_integrity_capable")
+if [ "$capable" != 0 ]; then
+	echo "device_is_integrity_capable $capable != 0"
+	_show_result $TID 255
+fi
+format=$(cat "/sys/block/ublkb$dev_id/integrity/format")
+if [ "$format" != nop ]; then
+	echo "format $format != nop"
+	_show_result $TID 255
+fi
+protection_interval_bytes=$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes")
+if [ "$protection_interval_bytes" != 512 ]; then
+	echo "protection_interval_bytes $protection_interval_bytes != 512"
+	_show_result $TID 255
+fi
+tag_size=$(cat "/sys/block/ublkb$dev_id/integrity/tag_size")
+if [ "$tag_size" != 0 ]; then
+	echo "tag_size $tag_size != 0"
+	_show_result $TID 255
+fi
+_cleanup_test
+
+dev_id=$(_add_ublk_dev -t null -u --integrity_capable --metadata_size 64 --pi_offset 56 --csum_type ip)
+_check_add_dev $TID $?
+metadata_size=$(_get_metadata_size "$dev_id" metadata_size)
+if [ "$metadata_size" != 64 ]; then
+	echo "metadata_size $metadata_size != 64"
+	_show_result $TID 255
+fi
+pi_offset=$(_get_metadata_size "$dev_id" pi_offset)
+if [ "$pi_offset" != 56 ]; then
+	echo "pi_offset $pi_offset != 56"
+	_show_result $TID 255
+fi
+pi_tuple_size=$(_get_metadata_size "$dev_id" pi_tuple_size)
+if [ "$pi_tuple_size" != 8 ]; then
+	echo "pi_tuple_size $pi_tuple_size != 8"
+	_show_result $TID 255
+fi
+capable=$(cat "/sys/block/ublkb$dev_id/integrity/device_is_integrity_capable")
+if [ "$capable" != 1 ]; then
+	echo "device_is_integrity_capable $capable != 1"
+	_show_result $TID 255
+fi
+format=$(cat "/sys/block/ublkb$dev_id/integrity/format")
+if [ "$format" != T10-DIF-TYPE3-IP ]; then
+	echo "format $format != T10-DIF-TYPE3-IP"
+	_show_result $TID 255
+fi
+protection_interval_bytes=$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes")
+if [ "$protection_interval_bytes" != 512 ]; then
+	echo "protection_interval_bytes $protection_interval_bytes != 512"
+	_show_result $TID 255
+fi
+tag_size=$(cat "/sys/block/ublkb$dev_id/integrity/tag_size")
+if [ "$tag_size" != 0 ]; then
+	echo "tag_size $tag_size != 0"
+	_show_result $TID 255
+fi
+_cleanup_test
+
+dev_id=$(_add_ublk_dev -t null -u --integrity_reftag --metadata_size 8 --csum_type t10dif)
+_check_add_dev $TID $?
+metadata_size=$(_get_metadata_size "$dev_id" metadata_size)
+if [ "$metadata_size" != 8 ]; then
+	echo "metadata_size $metadata_size != 8"
+	_show_result $TID 255
+fi
+pi_offset=$(_get_metadata_size "$dev_id" pi_offset)
+if [ "$pi_offset" != 0 ]; then
+	echo "pi_offset $pi_offset != 0"
+	_show_result $TID 255
+fi
+pi_tuple_size=$(_get_metadata_size "$dev_id" pi_tuple_size)
+if [ "$pi_tuple_size" != 8 ]; then
+	echo "pi_tuple_size $pi_tuple_size != 8"
+	_show_result $TID 255
+fi
+capable=$(cat "/sys/block/ublkb$dev_id/integrity/device_is_integrity_capable")
+if [ "$capable" != 0 ]; then
+	echo "device_is_integrity_capable $capable != 0"
+	_show_result $TID 255
+fi
+format=$(cat "/sys/block/ublkb$dev_id/integrity/format")
+if [ "$format" != T10-DIF-TYPE1-CRC ]; then
+	echo "format $format != T10-DIF-TYPE1-CRC"
+	_show_result $TID 255
+fi
+protection_interval_bytes=$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes")
+if [ "$protection_interval_bytes" != 512 ]; then
+	echo "protection_interval_bytes $protection_interval_bytes != 512"
+	_show_result $TID 255
+fi
+tag_size=$(cat "/sys/block/ublkb$dev_id/integrity/tag_size")
+if [ "$tag_size" != 0 ]; then
+	echo "tag_size $tag_size != 0"
+	_show_result $TID 255
+fi
+_cleanup_test
+
+dev_id=$(_add_ublk_dev -t null -u --metadata_size 16 --csum_type nvme --tag_size 8)
+_check_add_dev $TID $?
+metadata_size=$(_get_metadata_size "$dev_id" metadata_size)
+if [ "$metadata_size" != 16 ]; then
+	echo "metadata_size $metadata_size != 16"
+	_show_result $TID 255
+fi
+pi_offset=$(_get_metadata_size "$dev_id" pi_offset)
+if [ "$pi_offset" != 0 ]; then
+	echo "pi_offset $pi_offset != 0"
+	_show_result $TID 255
+fi
+pi_tuple_size=$(_get_metadata_size "$dev_id" pi_tuple_size)
+if [ "$pi_tuple_size" != 16 ]; then
+	echo "pi_tuple_size $pi_tuple_size != 16"
+	_show_result $TID 255
+fi
+capable=$(cat "/sys/block/ublkb$dev_id/integrity/device_is_integrity_capable")
+if [ "$capable" != 0 ]; then
+	echo "device_is_integrity_capable $capable != 0"
+	_show_result $TID 255
+fi
+format=$(cat "/sys/block/ublkb$dev_id/integrity/format")
+if [ "$format" != EXT-DIF-TYPE3-CRC64 ]; then
+	echo "format $format != EXT-DIF-TYPE3-CRC64"
+	_show_result $TID 255
+fi
+protection_interval_bytes=$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes")
+if [ "$protection_interval_bytes" != 512 ]; then
+	echo "protection_interval_bytes $protection_interval_bytes != 512"
+	_show_result $TID 255
+fi
+tag_size=$(cat "/sys/block/ublkb$dev_id/integrity/tag_size")
+if [ "$tag_size" != 8 ]; then
+	echo "tag_size $tag_size != 8"
+	_show_result $TID 255
+fi
+_cleanup_test
+
+_show_result $TID 0
-- 
cgit v1.2.3


From 78796b6bae8684b753b658f431b5b1ee24300d64 Mon Sep 17 00:00:00 2001
From: Caleb Sander Mateos <csander@purestorage.com>
Date: Thu, 8 Jan 2026 02:19:47 -0700
Subject: selftests: ublk: add end-to-end integrity test

Add test case loop_08 to verify the ublk integrity data flow. It uses
the kublk loop target to create a ublk device with integrity on top of
backing data and integrity files. It then writes to the whole device
with fio configured to generate integrity data. Then it reads back the
whole device with fio configured to verify the integrity data.
It also verifies that injected guard, reftag, and apptag corruptions are
correctly detected.

Signed-off-by: Caleb Sander Mateos <csander@purestorage.com>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 tools/testing/selftests/ublk/Makefile        |   1 +
 tools/testing/selftests/ublk/test_loop_08.sh | 111 +++++++++++++++++++++++++++
 2 files changed, 112 insertions(+)
 create mode 100755 tools/testing/selftests/ublk/test_loop_08.sh

(limited to 'tools')

diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile
index 239ad1c741ef..036a9f01b464 100644
--- a/tools/testing/selftests/ublk/Makefile
+++ b/tools/testing/selftests/ublk/Makefile
@@ -35,6 +35,7 @@ TEST_PROGS += test_loop_04.sh
 TEST_PROGS += test_loop_05.sh
 TEST_PROGS += test_loop_06.sh
 TEST_PROGS += test_loop_07.sh
+TEST_PROGS += test_loop_08.sh
 TEST_PROGS += test_stripe_01.sh
 TEST_PROGS += test_stripe_02.sh
 TEST_PROGS += test_stripe_03.sh
diff --git a/tools/testing/selftests/ublk/test_loop_08.sh b/tools/testing/selftests/ublk/test_loop_08.sh
new file mode 100755
index 000000000000..ca289cfb2ad4
--- /dev/null
+++ b/tools/testing/selftests/ublk/test_loop_08.sh
@@ -0,0 +1,111 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
+
+if ! _have_program fio; then
+	exit $UBLK_SKIP_CODE
+fi
+
+fio_version=$(fio --version)
+if [[ "$fio_version" =~ fio-[0-9]+\.[0-9]+$ ]]; then
+	echo "Requires development fio version with https://github.com/axboe/fio/pull/1992"
+	exit $UBLK_SKIP_CODE
+fi
+
+TID=loop_08
+
+_prep_test "loop" "end-to-end integrity"
+
+_create_backfile 0 256M
+_create_backfile 1 32M # 256M * (64 integrity bytes / 512 data bytes)
+integrity_params="--integrity_capable --integrity_reftag
+                  --metadata_size 64 --pi_offset 56 --csum_type t10dif"
+dev_id=$(_add_ublk_dev -t loop -u $integrity_params "${UBLK_BACKFILES[@]}")
+_check_add_dev $TID $?
+
+# 1M * (64 integrity bytes / 512 data bytes) = 128K
+fio_args="--ioengine io_uring --direct 1 --bsrange 512-1M --iodepth 32
+          --md_per_io_size 128K --pi_act 0 --pi_chk GUARD,REFTAG,APPTAG
+          --filename /dev/ublkb$dev_id"
+fio --name fill --rw randwrite $fio_args > /dev/null
+err=$?
+if [ $err != 0 ]; then
+	echo "fio fill failed"
+	_show_result $TID $err
+fi
+
+fio --name verify --rw randread $fio_args > /dev/null
+err=$?
+if [ $err != 0 ]; then
+	echo "fio verify failed"
+	_show_result $TID $err
+fi
+
+fio_err=$(mktemp fio_err_XXXXX)
+
+# Overwrite 4-byte reftag at offset 56 + 4 = 60
+dd_reftag_args="bs=1 seek=60 count=4 oflag=dsync conv=notrunc status=none"
+dd if=/dev/urandom "of=${UBLK_BACKFILES[1]}" $dd_reftag_args
+err=$?
+if [ $err != 0 ]; then
+	echo "dd corrupted_reftag failed"
+	rm -f "$fio_err"
+	_show_result $TID $err
+fi
+if fio --name corrupted_reftag --rw randread $fio_args > /dev/null 2> "$fio_err"; then
+	echo "fio corrupted_reftag unexpectedly succeeded"
+	rm -f "$fio_err"
+	_show_result $TID 255
+fi
+expected_err="REFTAG compare error: LBA: 0 Expected=0, Actual="
+if ! grep -q "$expected_err" "$fio_err"; then
+	echo "fio corrupted_reftag message not found: $expected_err"
+	rm -f "$fio_err"
+	_show_result $TID 255
+fi
+# Reset to 0
+dd if=/dev/zero "of=${UBLK_BACKFILES[1]}" $dd_reftag_args
+err=$?
+if [ $err != 0 ]; then
+	echo "dd restore corrupted_reftag failed"
+	rm -f "$fio_err"
+	_show_result $TID $err
+fi
+
+dd_data_args="bs=512 count=1 oflag=direct,dsync conv=notrunc status=none"
+dd if=/dev/zero "of=${UBLK_BACKFILES[0]}" $dd_data_args
+err=$?
+if [ $err != 0 ]; then
+	echo "dd corrupted_data failed"
+	rm -f "$fio_err"
+	_show_result $TID $err
+fi
+if fio --name corrupted_data --rw randread $fio_args > /dev/null 2> "$fio_err"; then
+	echo "fio corrupted_data unexpectedly succeeded"
+	rm -f "$fio_err"
+	_show_result $TID 255
+fi
+expected_err="Guard compare error: LBA: 0 Expected=0, Actual="
+if ! grep -q "$expected_err" "$fio_err"; then
+	echo "fio corrupted_data message not found: $expected_err"
+	rm -f "$fio_err"
+	_show_result $TID 255
+fi
+
+if fio --name bad_apptag --rw randread $fio_args --apptag 0x4321 > /dev/null 2> "$fio_err"; then
+	echo "fio bad_apptag unexpectedly succeeded"
+	rm -f "$fio_err"
+	_show_result $TID 255
+fi
+expected_err="APPTAG compare error: LBA: [0-9]* Expected=4321, Actual=1234"
+if ! grep -q "$expected_err" "$fio_err"; then
+	echo "fio bad_apptag message not found: $expected_err"
+	rm -f "$fio_err"
+	_show_result $TID 255
+fi
+
+rm -f "$fio_err"
+
+_cleanup_test
+_show_result $TID 0
-- 
cgit v1.2.3


From bc5e8e2fa2e28ef6c2a55ae294d04100d4b1bffe Mon Sep 17 00:00:00 2001
From: Juergen Gross <jgross@suse.com>
Date: Mon, 5 Jan 2026 12:05:14 +0100
Subject: x86/xen: Drop xen_irq_ops

Instead of having a pre-filled array xen_irq_ops for Xen PV paravirt
functions, drop the array and assign each element individually.

This is in preparation of reducing the paravirt include hell by
splitting paravirt.h into multiple more fine grained header files,
which will in turn require to split up the pv_ops vector as well.
Dropping the pre-filled array makes life easier for objtool to
detect missing initializers in multiple pv_ops_ arrays.

Signed-off-by: Juergen Gross <jgross@suse.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Link: https://patch.msgid.link/20260105110520.21356-16-jgross@suse.com
---
 arch/x86/xen/irq.c    | 20 +++++++-------------
 tools/objtool/check.c |  1 -
 2 files changed, 7 insertions(+), 14 deletions(-)

(limited to 'tools')

diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c
index 39982f955cfe..d8678c3d3971 100644
--- a/arch/x86/xen/irq.c
+++ b/arch/x86/xen/irq.c
@@ -40,20 +40,14 @@ static void xen_halt(void)
 		xen_safe_halt();
 }
 
-static const typeof(pv_ops) xen_irq_ops __initconst = {
-	.irq = {
-		/* Initial interrupt flag handling only called while interrupts off. */
-		.save_fl = __PV_IS_CALLEE_SAVE(paravirt_ret0),
-		.irq_disable = __PV_IS_CALLEE_SAVE(paravirt_nop),
-		.irq_enable = __PV_IS_CALLEE_SAVE(BUG_func),
-
-		.safe_halt = xen_safe_halt,
-		.halt = xen_halt,
-	},
-};
-
 void __init xen_init_irq_ops(void)
 {
-	pv_ops.irq = xen_irq_ops.irq;
+	/* Initial interrupt flag handling only called while interrupts off. */
+	pv_ops.irq.save_fl = __PV_IS_CALLEE_SAVE(paravirt_ret0);
+	pv_ops.irq.irq_disable = __PV_IS_CALLEE_SAVE(paravirt_nop);
+	pv_ops.irq.irq_enable = __PV_IS_CALLEE_SAVE(BUG_func);
+	pv_ops.irq.safe_halt = xen_safe_halt;
+	pv_ops.irq.halt = xen_halt;
+
 	x86_init.irqs.intr_init = xen_init_IRQ;
 }
diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index 3f7999317f4d..0c32a92dc693 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -571,7 +571,6 @@ static int init_pv_ops(struct objtool_file *file)
 	static const char *pv_ops_tables[] = {
 		"pv_ops",
 		"xen_cpu_ops",
-		"xen_irq_ops",
 		"xen_mmu_ops",
 		NULL,
 	};
-- 
cgit v1.2.3


From 2f8d489897ae7183b535b1881478b2c6b66d520b Mon Sep 17 00:00:00 2001
From: George Guo <guodongtai@kylinos.cn>
Date: Sat, 10 Jan 2026 00:12:14 +0800
Subject: sched_ext: Add error logging for dsq creation failures

Add scx_bpf_error() calls when scx_bpf_create_dsq() fails in multiple
schedulers to improve debuggability:

- scx_central.bpf.c: central_init()
- scx_flatcg.bpf.c: fcg_cgroup_init() and fcg_init()
- scx_qmap.bpf.c: qmap_init()

Signed-off-by: George Guo <guodongtai@kylinos.cn>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 tools/sched_ext/scx_central.bpf.c | 4 +++-
 tools/sched_ext/scx_flatcg.bpf.c  | 4 +++-
 tools/sched_ext/scx_qmap.bpf.c    | 8 ++++++--
 3 files changed, 12 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/sched_ext/scx_central.bpf.c b/tools/sched_ext/scx_central.bpf.c
index 55df8b798865..1c2376b75b5d 100644
--- a/tools/sched_ext/scx_central.bpf.c
+++ b/tools/sched_ext/scx_central.bpf.c
@@ -301,8 +301,10 @@ int BPF_STRUCT_OPS_SLEEPABLE(central_init)
 	int ret;
 
 	ret = scx_bpf_create_dsq(FALLBACK_DSQ_ID, -1);
-	if (ret)
+	if (ret) {
+		scx_bpf_error("scx_bpf_create_dsq failed (%d)", ret);
 		return ret;
+	}
 
 	timer = bpf_map_lookup_elem(&central_timer, &key);
 	if (!timer)
diff --git a/tools/sched_ext/scx_flatcg.bpf.c b/tools/sched_ext/scx_flatcg.bpf.c
index 43126858b8e4..c216480c3ee0 100644
--- a/tools/sched_ext/scx_flatcg.bpf.c
+++ b/tools/sched_ext/scx_flatcg.bpf.c
@@ -842,8 +842,10 @@ int BPF_STRUCT_OPS_SLEEPABLE(fcg_cgroup_init, struct cgroup *cgrp,
 	 * unlikely case that it breaks.
 	 */
 	ret = scx_bpf_create_dsq(cgid, -1);
-	if (ret)
+	if (ret) {
+		scx_bpf_error("scx_bpf_create_dsq failed (%d)", ret);
 		return ret;
+	}
 
 	cgc = bpf_cgrp_storage_get(&cgrp_ctx, cgrp, 0,
 				   BPF_LOCAL_STORAGE_GET_F_CREATE);
diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c
index df21fad0c438..d51d8c38f1cf 100644
--- a/tools/sched_ext/scx_qmap.bpf.c
+++ b/tools/sched_ext/scx_qmap.bpf.c
@@ -866,12 +866,16 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init)
 		print_cpus();
 
 	ret = scx_bpf_create_dsq(SHARED_DSQ, -1);
-	if (ret)
+	if (ret) {
+		scx_bpf_error("failed to create DSQ %d (%d)", SHARED_DSQ, ret);
 		return ret;
+	}
 
 	ret = scx_bpf_create_dsq(HIGHPRI_DSQ, -1);
-	if (ret)
+	if (ret) {
+		scx_bpf_error("failed to create DSQ %d (%d)", HIGHPRI_DSQ, ret);
 		return ret;
+	}
 
 	timer = bpf_map_lookup_elem(&monitor_timer, &key);
 	if (!timer)
-- 
cgit v1.2.3


From e272628902c1c96731e2d9f62a7fc77767686eb0 Mon Sep 17 00:00:00 2001
From: Thomas Richter <tmricht@linux.ibm.com>
Date: Wed, 7 Jan 2026 14:32:16 +0100
Subject: perf test stat tests: Fix for virtualized machines

On s390 'perf test's 'perf stat tests', subtest test_hybrid fails for
z/VM systems.  The root cause is this statement:

  $(perf stat -a -- sleep 0.1 2>&1 |\
                  grep -E "/cpu-cycles/[uH]*|  cpu-cycles[:uH]* -c)

The 'perf stat' output on a s390 z/VM system is

  # perf stat -a -- sleep 0.1 2>&1
  Performance counter stats for 'system wide':

        56      context-switches  #     46.3 cs/sec  cs_per_second
  1,210.41 msec cpu-clock         #     11.9 CPUs  CPUs_utilized
        12      cpu-migrations    #      9.9 migrations/sec ...
        81      page-faults       #     66.9 faults/sec ...

        0.100891009 seconds time elapsed

The grep command does not match any single line and exits with error
code 1.

As the bash script is executed with 'set -e', it aborts with the first
error code being non-zero.

Fix this and use 'wc -l' to count matching lines instead of 'grep ... -c'.

Output before:

  # perf test 102
  102: perf stat tests                      : FAILED!
  #

Output after:

  # perf test 102
  102: perf stat tests                      : Ok
  #

Fixes: bb6e7cb11d97ce19 ("perf tools: Add fallback for exclude_guest")
Reviewed-by: Ian Rogers <irogers@google.com>
Reviewed-by: James Clark <james.clark@linaro.org>
Signed-off-by: Thomas Richter <tmricht@linux.ibm.com>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Jan Polensky <japo@linux.ibm.com>
Cc: linux-s390@vger.kernel.org
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Sumanth Korikkar <sumanthk@linux.ibm.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/tests/shell/stat.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/perf/tests/shell/stat.sh b/tools/perf/tests/shell/stat.sh
index 0b2f0f88ca16..792a0b79f6b8 100755
--- a/tools/perf/tests/shell/stat.sh
+++ b/tools/perf/tests/shell/stat.sh
@@ -233,7 +233,7 @@ test_hybrid() {
   fi
 
   # Run default Perf stat
-  cycles_events=$(perf stat -a -- sleep 0.1 2>&1 | grep -E "/cpu-cycles/[uH]*|  cpu-cycles[:uH]*  " -c)
+  cycles_events=$(perf stat -a -- sleep 0.1 2>&1 | grep -E "/cpu-cycles/[uH]*|  cpu-cycles[:uH]*  "  | wc -l)
 
   # The expectation is that default output will have a cycles events on each
   # hybrid PMU. In situations with no cycles PMU events, like virtualized, this
-- 
cgit v1.2.3


From 383f8e26e2c483e25453f8c3d0839877708ac701 Mon Sep 17 00:00:00 2001
From: Nicolas Schier <n.schier@avm.de>
Date: Thu, 8 Jan 2026 12:29:10 +0100
Subject: perf build: Raise minimum shellcheck version to 0.7.2

Raise the minimum shellcheck version for perf builds to 0.7.2, so that
systems with shellcheck versions below 0.7.2 will automatically skip the
shell script checking, even if NO_SHELLCHECK is unset.

Since commit 241f21be7d0fdf3c ("perf test perftool_testsuite: Use
absolute paths"), shellcheck versions before 0.7.2 break the perf build
with several SC1090 [2] warnings due to its too strict dynamic source
handling [1], e.g.:

  In tests/shell/base_probe/test_line_semantics.sh line 20:
  . "$DIR_PATH/../common/init.sh"
    ^---------------------------^ SC1090: Can't follow non-constant source. Use a directive to specify location.

Fixes: 241f21be7d0fdf3c ("perf test perftool_testsuite: Use absolute paths")
Signed-off-by: Nicolas Schier <n.schier@avm.de>
Acked-by: Namhyung Kim <namhyung@kernel.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jakub Brnak <jbrnak@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Michael Petlan <mpetlan@redhat.com>
Cc: Nicolas Schier <nsc@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Philipp Hahn <p.hahn@avm.de>
Cc: Veronika Molnarova <vmolnaro@redhat.com>
Link: https://github.com/koalaman/shellcheck/issues/1998 # [1]
Link: https://www.shellcheck.net/wiki/SC1090
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/Makefile.perf | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf
index b3f481a626af..e6895626c187 100644
--- a/tools/perf/Makefile.perf
+++ b/tools/perf/Makefile.perf
@@ -251,11 +251,12 @@ else
 endif
 
 # shellcheck is using in tools/perf/tests/Build with option -a/--check-sourced (
-# introduced in v0.4.7) and -S/--severity (introduced in v0.6.0). So make the
-# minimal shellcheck version as v0.6.0.
+# introduced in v0.4.7) and -S/--severity (introduced in v0.6.0) as well as
+# dynamic source inclusions (properly handled since v0.7.2).
+# So make the minimal shellcheck version as v0.7.2.
 ifneq ($(SHELLCHECK),)
   ifeq ($(shell expr $(shell $(SHELLCHECK) --version | grep version: | \
-        sed -e 's/.\+ \([0-9]\+\).\([0-9]\+\).\([0-9]\+\)/\1\2\3/g') \< 060), 1)
+        sed -e 's/.\+ \([0-9]\+\).\([0-9]\+\).\([0-9]\+\)/\1\2\3/g') \< 072), 1)
     SHELLCHECK :=
   else
     SHELLCHECK := $(SHELLCHECK) -s bash -a -S warning
-- 
cgit v1.2.3


From 817f66e39e39b914aac25065a34f4462ab45ed26 Mon Sep 17 00:00:00 2001
From: Juergen Gross <jgross@suse.com>
Date: Mon, 5 Jan 2026 12:05:15 +0100
Subject: x86/xen: Drop xen_cpu_ops

Instead of having a pre-filled array xen_cpu_ops for Xen PV paravirt
functions, drop the array and assign each element individually.

This is in preparation of reducing the paravirt include hell by
splitting paravirt.h into multiple more fine grained header files,
which will in turn require to split up the pv_ops vector as well.
Dropping the pre-filled array makes life easier for objtool to
detect missing initializers in multiple pv_ops_ arrays.

Signed-off-by: Juergen Gross <jgross@suse.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Link: https://patch.msgid.link/20260105110520.21356-17-jgross@suse.com
---
 arch/x86/xen/enlighten_pv.c | 82 ++++++++++++++++++---------------------------
 tools/objtool/check.c       |  1 -
 2 files changed, 33 insertions(+), 50 deletions(-)

(limited to 'tools')

diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index b74ff8bc7f2a..8a19a88190ee 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -1212,54 +1212,6 @@ static const struct pv_info xen_info __initconst = {
 	.name = "Xen",
 };
 
-static const typeof(pv_ops) xen_cpu_ops __initconst = {
-	.cpu = {
-		.cpuid = xen_cpuid,
-
-		.set_debugreg = xen_set_debugreg,
-		.get_debugreg = xen_get_debugreg,
-
-		.read_cr0 = xen_read_cr0,
-		.write_cr0 = xen_write_cr0,
-
-		.write_cr4 = xen_write_cr4,
-
-		.read_msr = xen_read_msr,
-		.write_msr = xen_write_msr,
-
-		.read_msr_safe = xen_read_msr_safe,
-		.write_msr_safe = xen_write_msr_safe,
-
-		.read_pmc = xen_read_pmc,
-
-		.load_tr_desc = paravirt_nop,
-		.set_ldt = xen_set_ldt,
-		.load_gdt = xen_load_gdt,
-		.load_idt = xen_load_idt,
-		.load_tls = xen_load_tls,
-		.load_gs_index = xen_load_gs_index,
-
-		.alloc_ldt = xen_alloc_ldt,
-		.free_ldt = xen_free_ldt,
-
-		.store_tr = xen_store_tr,
-
-		.write_ldt_entry = xen_write_ldt_entry,
-		.write_gdt_entry = xen_write_gdt_entry,
-		.write_idt_entry = xen_write_idt_entry,
-		.load_sp0 = xen_load_sp0,
-
-#ifdef CONFIG_X86_IOPL_IOPERM
-		.invalidate_io_bitmap = xen_invalidate_io_bitmap,
-		.update_io_bitmap = xen_update_io_bitmap,
-#endif
-		.io_delay = xen_io_delay,
-
-		.start_context_switch = xen_start_context_switch,
-		.end_context_switch = xen_end_context_switch,
-	},
-};
-
 static void xen_restart(char *msg)
 {
 	xen_reboot(SHUTDOWN_reboot);
@@ -1411,7 +1363,39 @@ asmlinkage __visible void __init xen_start_kernel(struct start_info *si)
 
 	/* Install Xen paravirt ops */
 	pv_info = xen_info;
-	pv_ops.cpu = xen_cpu_ops.cpu;
+
+	pv_ops.cpu.cpuid = xen_cpuid;
+	pv_ops.cpu.set_debugreg = xen_set_debugreg;
+	pv_ops.cpu.get_debugreg = xen_get_debugreg;
+	pv_ops.cpu.read_cr0 = xen_read_cr0;
+	pv_ops.cpu.write_cr0 = xen_write_cr0;
+	pv_ops.cpu.write_cr4 = xen_write_cr4;
+	pv_ops.cpu.read_msr = xen_read_msr;
+	pv_ops.cpu.write_msr = xen_write_msr;
+	pv_ops.cpu.read_msr_safe = xen_read_msr_safe;
+	pv_ops.cpu.write_msr_safe = xen_write_msr_safe;
+	pv_ops.cpu.read_pmc = xen_read_pmc;
+	pv_ops.cpu.load_tr_desc = paravirt_nop;
+	pv_ops.cpu.set_ldt = xen_set_ldt;
+	pv_ops.cpu.load_gdt = xen_load_gdt;
+	pv_ops.cpu.load_idt = xen_load_idt;
+	pv_ops.cpu.load_tls = xen_load_tls;
+	pv_ops.cpu.load_gs_index = xen_load_gs_index;
+	pv_ops.cpu.alloc_ldt = xen_alloc_ldt;
+	pv_ops.cpu.free_ldt = xen_free_ldt;
+	pv_ops.cpu.store_tr = xen_store_tr;
+	pv_ops.cpu.write_ldt_entry = xen_write_ldt_entry;
+	pv_ops.cpu.write_gdt_entry = xen_write_gdt_entry;
+	pv_ops.cpu.write_idt_entry = xen_write_idt_entry;
+	pv_ops.cpu.load_sp0 = xen_load_sp0;
+#ifdef CONFIG_X86_IOPL_IOPERM
+	pv_ops.cpu.invalidate_io_bitmap = xen_invalidate_io_bitmap;
+	pv_ops.cpu.update_io_bitmap = xen_update_io_bitmap;
+#endif
+	pv_ops.cpu.io_delay = xen_io_delay;
+	pv_ops.cpu.start_context_switch = xen_start_context_switch;
+	pv_ops.cpu.end_context_switch = xen_end_context_switch;
+
 	xen_init_irq_ops();
 
 	/*
diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index 0c32a92dc693..8ab88f2b2c1b 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -570,7 +570,6 @@ static int init_pv_ops(struct objtool_file *file)
 {
 	static const char *pv_ops_tables[] = {
 		"pv_ops",
-		"xen_cpu_ops",
 		"xen_mmu_ops",
 		NULL,
 	};
-- 
cgit v1.2.3


From 6e5f2ad6bb74fd743c2162e32ac15e9061591ab1 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 9 Dec 2025 09:36:10 -0800
Subject: perf stat display: Make %f precision consistent

Commit bc22de9bcdb22491 ("perf stat: Display time in precision based on
std deviation") added multirun workload elapsed time. There was an
effort to make the precision in the output most useful for the user,
however, when gathering over runs it means the formatting varies. This
change just makes the output format fixed.

Before:
```
$ while :; do perf stat --null --repeat 3 sleep 0.1 2>&1 | grep elapsed; done
          0.101140 +- 0.000149 seconds time elapsed  ( +-  0.15% )
         0.1011396 +- 0.0000218 seconds time elapsed  ( +-  0.02% )
          0.101331 +- 0.000124 seconds time elapsed  ( +-  0.12% )
^C
$ while :; do perf stat --null --repeat 3 sleep 1 2>&1 | grep elapsed; done
          1.001317 +- 0.000146 seconds time elapsed  ( +-  0.01% )
          1.001377 +- 0.000172 seconds time elapsed  ( +-  0.02% )
           1.00253 +- 0.00131 seconds time elapsed  ( +-  0.13% )
```

After:
```
$ while :; do perf stat --null --repeat 3 sleep 0.1 2>&1 | grep elapsed; done
       0.101406408 +- 0.000064778 seconds time elapsed  ( +-  0.06% )
       0.101367315 +- 0.000027253 seconds time elapsed  ( +-  0.03% )
       0.101434164 +- 0.000084750 seconds time elapsed  ( +-  0.08% )
^C
$ while :; do perf stat --null --repeat 3 sleep 1 2>&1 | grep elapsed; done
       1.001525467 +- 0.000051703 seconds time elapsed  ( +-  0.01% )
       1.001375093 +- 0.000116200 seconds time elapsed  ( +-  0.01% )
       1.001141025 +- 0.000046361 seconds time elapsed  ( +-  0.00% )
```

Closes: https://lore.kernel.org/lkml/aTQRgAOpKyI53TEq@gmail.com/
Suggested-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Ian Rogers <irogers@google.com>
Tested-by: Ingo Molnar <mingo@kernel.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Chun-Tse Shao <ctshao@google.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/stat-display.c | 26 +++++---------------------
 1 file changed, 5 insertions(+), 21 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/stat-display.c b/tools/perf/util/stat-display.c
index 6d02f84c5691..2ce0602974a1 100644
--- a/tools/perf/util/stat-display.c
+++ b/tools/perf/util/stat-display.c
@@ -1397,21 +1397,12 @@ static void print_header(struct perf_stat_config *config,
 		num_print_iv = 0;
 }
 
-static int get_precision(double num)
-{
-	if (num > 1)
-		return 0;
-
-	return lround(ceil(-log10(num)));
-}
-
-static void print_table(struct perf_stat_config *config,
-			FILE *output, int precision, double avg)
+static void print_table(struct perf_stat_config *config, FILE *output, double avg)
 {
 	char tmp[64];
 	int idx, indent = 0;
 
-	scnprintf(tmp, 64, " %17.*f", precision, avg);
+	scnprintf(tmp, 64, " %17.9f", avg);
 	while (tmp[indent] == ' ')
 		indent++;
 
@@ -1421,8 +1412,7 @@ static void print_table(struct perf_stat_config *config,
 		double run = (double) config->walltime_run[idx] / NSEC_PER_SEC;
 		int h, n = 1 + abs((int) (100.0 * (run - avg)/run) / 5);
 
-		fprintf(output, " %17.*f (%+.*f) ",
-			precision, run, precision, run - avg);
+		fprintf(output, " %17.9f (%+.9f) ", run, run - avg);
 
 		for (h = 0; h < n; h++)
 			fprintf(output, "#");
@@ -1462,17 +1452,11 @@ static void print_footer(struct perf_stat_config *config)
 		}
 	} else {
 		double sd = stddev_stats(config->walltime_nsecs_stats) / NSEC_PER_SEC;
-		/*
-		 * Display at most 2 more significant
-		 * digits than the stddev inaccuracy.
-		 */
-		int precision = get_precision(sd) + 2;
 
 		if (config->walltime_run_table)
-			print_table(config, output, precision, avg);
+			print_table(config, output, avg);
 
-		fprintf(output, " %17.*f +- %.*f seconds time elapsed",
-			precision, avg, precision, sd);
+		fprintf(output, " %17.9f +- %.9f seconds time elapsed", avg, sd);
 
 		print_noise_pct(config, NULL, sd, avg, /*before_metric=*/false);
 	}
-- 
cgit v1.2.3


From ef92c4351ec75bcfb8a1cc3a88109b5339f296ef Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Mon, 8 Dec 2025 09:23:39 -0800
Subject: perf test subcmd help: Add exclude disjoint subcmd names

The test is based on an error/fix posted to linux-perf-users.

Reported-by: Sri Jayaramappa <sjayaram@akamai.com>
Reviewed-by: Sri Jayaramappa <sjayaram@akamai.com>
Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Guilherme Amadio <amadio@gentoo.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Closes: https://lore.kernel.org/linux-perf-users/20251202213632.2873731-1-sjayaram@akamai.com/
Closes: https://urldefense.com/v3/__https://lore.kernel.org/linux-perf-users/20251202213632.2873731-1-sjayaram@akamai.com/__;!!GjvTz_vk!XehekKNUE4Ib_tvqIH6PMIIhly4X3BZ-Y40RC1HKMQ-6OdYEFvUPQhyWv_gk9vsRRN4_RcOLS2Bh0CQ$
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/tests/subcmd-help.c | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

(limited to 'tools')

diff --git a/tools/perf/tests/subcmd-help.c b/tools/perf/tests/subcmd-help.c
index 2280b4c0e5e7..9da96a16fd20 100644
--- a/tools/perf/tests/subcmd-help.c
+++ b/tools/perf/tests/subcmd-help.c
@@ -95,10 +95,36 @@ static int test__exclude_cmdnames(struct test_suite *test __maybe_unused,
 	return TEST_OK;
 }
 
+static int test__exclude_cmdnames_no_overlap(struct test_suite *test __maybe_unused,
+					     int subtest __maybe_unused)
+{
+	struct cmdnames cmds1 = {};
+	struct cmdnames cmds2 = {};
+
+	add_cmdname(&cmds1, "read-vdso32", 11);
+	add_cmdname(&cmds2, "archive", 7);
+
+	TEST_ASSERT_VAL("invalid original size", cmds1.cnt == 1);
+	TEST_ASSERT_VAL("invalid original size", cmds2.cnt == 1);
+
+	exclude_cmds(&cmds1, &cmds2);
+
+	TEST_ASSERT_VAL("invalid excluded size", cmds1.cnt == 1);
+	TEST_ASSERT_VAL("invalid excluded size", cmds2.cnt == 1);
+
+	TEST_ASSERT_VAL("cannot find cmd", is_in_cmdlist(&cmds1, "read-vdso32") == 1);
+	TEST_ASSERT_VAL("wrong cmd", is_in_cmdlist(&cmds1, "archive") == 0);
+
+	clean_cmdnames(&cmds1);
+	clean_cmdnames(&cmds2);
+	return TEST_OK;
+}
+
 static struct test_case tests__subcmd_help[] = {
 	TEST_CASE("Load subcmd names", load_cmdnames),
 	TEST_CASE("Uniquify subcmd names", uniq_cmdnames),
 	TEST_CASE("Exclude duplicate subcmd names", exclude_cmdnames),
+	TEST_CASE("Exclude disjoint subcmd names", exclude_cmdnames_no_overlap),
 	{	.name = NULL, }
 };
 
-- 
cgit v1.2.3


From 2a3602030d800b6600ef55c31e21bc54611f7770 Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Mon, 12 Jan 2026 11:00:20 -0500
Subject: cgroup/cpuset: Don't invalidate sibling partitions on cpuset.cpus
 conflict

Currently, when setting a cpuset's cpuset.cpus to a value that conflicts
with the cpuset.cpus/cpuset.cpus.exclusive of a sibling partition,
the sibling's partition state becomes invalid. This is overly harsh and
is probably not necessary.

The cpuset.cpus.exclusive control file, if set, will override the
cpuset.cpus of the same cpuset when creating a cpuset partition.
So cpuset.cpus has less priority than cpuset.cpus.exclusive in setting up
a partition.  However, it cannot override a conflicting cpuset.cpus file
in a sibling cpuset and the partition creation process will fail. This
is inconsistent.  That will also make using cpuset.cpus.exclusive less
valuable as a tool to set up cpuset partitions as the users have to
check if such a cpuset.cpus conflict exists or not.

Fix these problems by making sure that once a cpuset.cpus.exclusive
is set without failure, it will always be allowed to form a valid
partition as long as at least one CPU can be granted from its parent
irrespective of the state of the siblings' cpuset.cpus values. Of
course, setting cpuset.cpus.exclusive will fail if it conflicts with
the cpuset.cpus.exclusive or the cpuset.cpus.exclusive.effective value
of a sibling.

Partition can still be created by setting only cpuset.cpus without
setting cpuset.cpus.exclusive. However, any conflicting CPUs in sibling's
cpuset.cpus.exclusive.effective and cpuset.cpus.exclusive values will
be removed from its cpuset.cpus.exclusive.effective as long as there
is still one or more CPUs left and can be granted from its parent. This
CPU stripping is currently done in rm_siblings_excl_cpus().

The new code will now try its best to enable the creation of new
partitions with only cpuset.cpus set without invalidating existing ones.
However it is not guaranteed that all the CPUs requested in cpuset.cpus
will be used in the new partition even when all these CPUs can be
granted from the parent.

This is similar to the fact that cpuset.cpus.effective may not be
able to include all the CPUs requested in cpuset.cpus. In this case,
the parent may not able to grant all the exclusive CPUs requested in
cpuset.cpus to cpuset.cpus.exclusive.effective if some of them have
already been granted to other partitions earlier.

With the creation of multiple sibling partitions by setting
only cpuset.cpus, this does have the side effect that their exact
cpuset.cpus.exclusive.effective settings will depend on the order of
partition creation if there are conflicts. Due to the exclusive nature
of the CPUs in a partition, it is not easy to make it fair other than
the old behavior of invalidating all the conflicting partitions.

For example,
  # echo "0-2" > A1/cpuset.cpus
  # echo "root" > A1/cpuset.cpus.partition
  # cat A1/cpuset.cpus.partition
  root
  # cat A1/cpuset.cpus.exclusive.effective
  0-2
  # echo "2-4" > B1/cpuset.cpus
  # echo "root" > B1/cpuset.cpus.partition
  # cat B1/cpuset.cpus.partition
  root
  # cat B1/cpuset.cpus.exclusive.effective
  3-4
  # cat B1/cpuset.cpus.effective
  3-4

For users who want to be sure that they can get most of the CPUs they
want, cpuset.cpus.exclusive should be used instead if they can set
it successfully without failure. Setting cpuset.cpus.exclusive will
guarantee that sibling conflicts from then onward is no longer possible.

To make this change, we have to separate out the is_cpu_exclusive()
check in cpus_excl_conflict() into a cgroup v1 only
cpuset1_cpus_excl_conflict() helper. The cpus_allowed_validate_change()
helper is now no longer needed and can be removed.

Some existing tests in test_cpuset_prs.sh are updated and new ones are
added to reflect the new behavior. The cgroup-v2.rst doc file is also
updated the clarify what exclusive CPUs will be used when a partition
is created.

Reported-by: Sun Shaojie <sunshaojie@kylinos.cn>
Closes: https://lore.kernel.org/lkml/20251117015708.977585-1-sunshaojie@kylinos.cn/
Signed-off-by: Waiman Long <longman@redhat.com>
Reviewed-by: Chen Ridong <chenridong@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 Documentation/admin-guide/cgroup-v2.rst           | 33 +++++++---
 kernel/cgroup/cpuset-internal.h                   |  3 +
 kernel/cgroup/cpuset-v1.c                         | 19 ++++++
 kernel/cgroup/cpuset.c                            | 80 ++++++++---------------
 tools/testing/selftests/cgroup/test_cpuset_prs.sh | 26 ++++++--
 5 files changed, 90 insertions(+), 71 deletions(-)

(limited to 'tools')

diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 510df2461aff..28613c0e1c90 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -2584,9 +2584,9 @@ Cpuset Interface Files
 	of this file will always be a subset of its parent's
 	"cpuset.cpus.exclusive.effective" if its parent is not the root
 	cgroup.  It will also be a subset of "cpuset.cpus.exclusive"
-	if it is set.  If "cpuset.cpus.exclusive" is not set, it is
-	treated to have an implicit value of "cpuset.cpus" in the
-	formation of local partition.
+	if it is set.  This file should only be non-empty if either
+	"cpuset.cpus.exclusive" is set or when the current cpuset is
+	a valid partition root.
 
   cpuset.cpus.isolated
 	A read-only and root cgroup only multiple values file.
@@ -2618,13 +2618,22 @@ Cpuset Interface Files
 	There are two types of partitions - local and remote.  A local
 	partition is one whose parent cgroup is also a valid partition
 	root.  A remote partition is one whose parent cgroup is not a
-	valid partition root itself.  Writing to "cpuset.cpus.exclusive"
-	is optional for the creation of a local partition as its
-	"cpuset.cpus.exclusive" file will assume an implicit value that
-	is the same as "cpuset.cpus" if it is not set.	Writing the
-	proper "cpuset.cpus.exclusive" values down the cgroup hierarchy
-	before the target partition root is mandatory for the creation
-	of a remote partition.
+	valid partition root itself.
+
+	Writing to "cpuset.cpus.exclusive" is optional for the creation
+	of a local partition as its "cpuset.cpus.exclusive" file will
+	assume an implicit value that is the same as "cpuset.cpus" if it
+	is not set.  Writing the proper "cpuset.cpus.exclusive" values
+	down the cgroup hierarchy before the target partition root is
+	mandatory for the creation of a remote partition.
+
+	Not all the CPUs requested in "cpuset.cpus.exclusive" can be
+	used to form a new partition.  Only those that were present
+	in its parent's "cpuset.cpus.exclusive.effective" control
+	file can be used.  For partitions created without setting
+	"cpuset.cpus.exclusive", exclusive CPUs specified in sibling's
+	"cpuset.cpus.exclusive" or "cpuset.cpus.exclusive.effective"
+	also cannot be used.
 
 	Currently, a remote partition cannot be created under a local
 	partition.  All the ancestors of a remote partition root except
@@ -2632,6 +2641,10 @@ Cpuset Interface Files
 
 	The root cgroup is always a partition root and its state cannot
 	be changed.  All other non-root cgroups start out as "member".
+	Even though the "cpuset.cpus.exclusive*" and "cpuset.cpus"
+	control files are not present in the root cgroup, they are
+	implicitly the same as the "/sys/devices/system/cpu/possible"
+	sysfs file.
 
 	When set to "root", the current cgroup is the root of a new
 	partition or scheduling domain.  The set of exclusive CPUs is
diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h
index e718a4f54360..e8e2683cb067 100644
--- a/kernel/cgroup/cpuset-internal.h
+++ b/kernel/cgroup/cpuset-internal.h
@@ -312,6 +312,7 @@ void cpuset1_hotplug_update_tasks(struct cpuset *cs,
 			    struct cpumask *new_cpus, nodemask_t *new_mems,
 			    bool cpus_updated, bool mems_updated);
 int cpuset1_validate_change(struct cpuset *cur, struct cpuset *trial);
+bool cpuset1_cpus_excl_conflict(struct cpuset *cs1, struct cpuset *cs2);
 void cpuset1_init(struct cpuset *cs);
 void cpuset1_online_css(struct cgroup_subsys_state *css);
 int cpuset1_generate_sched_domains(cpumask_var_t **domains,
@@ -326,6 +327,8 @@ static inline void cpuset1_hotplug_update_tasks(struct cpuset *cs,
 			    bool cpus_updated, bool mems_updated) {}
 static inline int cpuset1_validate_change(struct cpuset *cur,
 				struct cpuset *trial) { return 0; }
+static inline bool cpuset1_cpus_excl_conflict(struct cpuset *cs1,
+					struct cpuset *cs2) { return false; }
 static inline void cpuset1_init(struct cpuset *cs) {}
 static inline void cpuset1_online_css(struct cgroup_subsys_state *css) {}
 static inline int cpuset1_generate_sched_domains(cpumask_var_t **domains,
diff --git a/kernel/cgroup/cpuset-v1.c b/kernel/cgroup/cpuset-v1.c
index ecfea7800f0d..04124c38a774 100644
--- a/kernel/cgroup/cpuset-v1.c
+++ b/kernel/cgroup/cpuset-v1.c
@@ -373,6 +373,25 @@ out:
 	return ret;
 }
 
+/*
+ * cpuset1_cpus_excl_conflict() - Check if two cpusets have exclusive CPU conflicts
+ *                                to legacy (v1)
+ * @cs1: first cpuset to check
+ * @cs2: second cpuset to check
+ *
+ * Returns: true if CPU exclusivity conflict exists, false otherwise
+ *
+ * If either cpuset is CPU exclusive, their allowed CPUs cannot intersect.
+ */
+bool cpuset1_cpus_excl_conflict(struct cpuset *cs1, struct cpuset *cs2)
+{
+	if (is_cpu_exclusive(cs1) || is_cpu_exclusive(cs2))
+		return cpumask_intersects(cs1->cpus_allowed,
+					  cs2->cpus_allowed);
+
+	return false;
+}
+
 #ifdef CONFIG_PROC_PID_CPUSET
 /*
  * proc_cpuset_show()
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 4819ab429771..83fb83a86b4b 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -129,6 +129,17 @@ static bool force_sd_rebuild;
  *  For simplicity, a local partition can be created under a local or remote
  *  partition but a remote partition cannot have any partition root in its
  *  ancestor chain except the cgroup root.
+ *
+ *  A valid partition can be formed by setting exclusive_cpus or cpus_allowed
+ *  if exclusive_cpus is not set. In the case of partition with empty
+ *  exclusive_cpus, all the conflicting exclusive CPUs specified in the
+ *  following cpumasks of sibling cpusets will be removed from its
+ *  cpus_allowed in determining its effective_xcpus.
+ *  - effective_xcpus
+ *  - exclusive_cpus
+ *
+ *  The "cpuset.cpus.exclusive" control file should be used for setting up
+ *  partition if the users want to get as many CPUs as possible.
  */
 #define PRS_MEMBER		0
 #define PRS_ROOT		1
@@ -616,27 +627,25 @@ static inline bool cpusets_are_exclusive(struct cpuset *cs1, struct cpuset *cs2)
  * Returns: true if CPU exclusivity conflict exists, false otherwise
  *
  * Conflict detection rules:
- * 1. If either cpuset is CPU exclusive, they must be mutually exclusive
- * 2. exclusive_cpus masks cannot intersect between cpusets
- * 3. The allowed CPUs of a sibling cpuset cannot be a subset of the new exclusive CPUs
+ *  o cgroup v1
+ *    See cpuset1_cpus_excl_conflict()
+ *  o cgroup v2
+ *    - The exclusive_cpus values cannot overlap.
+ *    - New exclusive_cpus cannot be a superset of a sibling's cpus_allowed.
  */
 static inline bool cpus_excl_conflict(struct cpuset *trial, struct cpuset *sibling,
 				      bool xcpus_changed)
 {
-	/* If either cpuset is exclusive, check if they are mutually exclusive */
-	if (is_cpu_exclusive(trial) || is_cpu_exclusive(sibling))
-		return !cpusets_are_exclusive(trial, sibling);
-
-	/* Exclusive_cpus cannot intersect */
-	if (cpumask_intersects(trial->exclusive_cpus, sibling->exclusive_cpus))
-		return true;
+	if (!cpuset_v2())
+		return cpuset1_cpus_excl_conflict(trial, sibling);
 
 	/* The cpus_allowed of a sibling cpuset cannot be a subset of the new exclusive_cpus */
 	if (xcpus_changed && !cpumask_empty(sibling->cpus_allowed) &&
 	    cpumask_subset(sibling->cpus_allowed, trial->exclusive_cpus))
 		return true;
 
-	return false;
+	/* Exclusive_cpus cannot intersect */
+	return cpumask_intersects(trial->exclusive_cpus, sibling->exclusive_cpus);
 }
 
 static inline bool mems_excl_conflict(struct cpuset *cs1, struct cpuset *cs2)
@@ -2312,43 +2321,6 @@ static enum prs_errcode validate_partition(struct cpuset *cs, struct cpuset *tri
 	return PERR_NONE;
 }
 
-static int cpus_allowed_validate_change(struct cpuset *cs, struct cpuset *trialcs,
-					struct tmpmasks *tmp)
-{
-	int retval;
-	struct cpuset *parent = parent_cs(cs);
-
-	retval = validate_change(cs, trialcs);
-
-	if ((retval == -EINVAL) && cpuset_v2()) {
-		struct cgroup_subsys_state *css;
-		struct cpuset *cp;
-
-		/*
-		 * The -EINVAL error code indicates that partition sibling
-		 * CPU exclusivity rule has been violated. We still allow
-		 * the cpumask change to proceed while invalidating the
-		 * partition. However, any conflicting sibling partitions
-		 * have to be marked as invalid too.
-		 */
-		trialcs->prs_err = PERR_NOTEXCL;
-		rcu_read_lock();
-		cpuset_for_each_child(cp, css, parent) {
-			struct cpumask *xcpus = user_xcpus(trialcs);
-
-			if (is_partition_valid(cp) &&
-			    cpumask_intersects(xcpus, cp->effective_xcpus)) {
-				rcu_read_unlock();
-				update_parent_effective_cpumask(cp, partcmd_invalidate, NULL, tmp);
-				rcu_read_lock();
-			}
-		}
-		rcu_read_unlock();
-		retval = 0;
-	}
-	return retval;
-}
-
 /**
  * partition_cpus_change - Handle partition state changes due to CPU mask updates
  * @cs: The target cpuset being modified
@@ -2408,15 +2380,15 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 	if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
 		return 0;
 
-	if (alloc_tmpmasks(&tmp))
-		return -ENOMEM;
-
 	compute_trialcs_excpus(trialcs, cs);
 	trialcs->prs_err = PERR_NONE;
 
-	retval = cpus_allowed_validate_change(cs, trialcs, &tmp);
+	retval = validate_change(cs, trialcs);
 	if (retval < 0)
-		goto out_free;
+		return retval;
+
+	if (alloc_tmpmasks(&tmp))
+		return -ENOMEM;
 
 	/*
 	 * Check all the descendants in update_cpumasks_hier() if
@@ -2439,7 +2411,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 	/* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */
 	if (cs->partition_root_state)
 		update_partition_sd_lb(cs, old_prs);
-out_free:
+
 	free_tmpmasks(&tmp);
 	return retval;
 }
diff --git a/tools/testing/selftests/cgroup/test_cpuset_prs.sh b/tools/testing/selftests/cgroup/test_cpuset_prs.sh
index a17256d9f88a..ff4540b0490e 100755
--- a/tools/testing/selftests/cgroup/test_cpuset_prs.sh
+++ b/tools/testing/selftests/cgroup/test_cpuset_prs.sh
@@ -269,7 +269,7 @@ TEST_MATRIX=(
 	" C0-3:S+ C1-3:S+ C2-3     .    X2-3   X3:P2    .      .     0 A1:0-2|A2:3|A3:3 A1:P0|A2:P2 3"
 	" C0-3:S+ C1-3:S+ C2-3     .    X2-3   X2-3  X2-3:P2   .     0 A1:0-1|A2:1|A3:2-3 A1:P0|A3:P2 2-3"
 	" C0-3:S+ C1-3:S+ C2-3     .    X2-3   X2-3 X2-3:P2:C3 .     0 A1:0-1|A2:1|A3:2-3 A1:P0|A3:P2 2-3"
-	" C0-3:S+ C1-3:S+ C2-3   C2-3     .      .      .      P2    0 A1:0-3|A2:1-3|A3:2-3|B1:2-3 A1:P0|A3:P0|B1:P-2"
+	" C0-3:S+ C1-3:S+ C2-3   C2-3     .      .      .      P2    0 A1:0-1|A2:1|A3:1|B1:2-3 A1:P0|A3:P0|B1:P2"
 	" C0-3:S+ C1-3:S+ C2-3   C4-5     .      .      .      P2    0 B1:4-5 B1:P2 4-5"
 	" C0-3:S+ C1-3:S+ C2-3    C4    X2-3   X2-3  X2-3:P2   P2    0 A3:2-3|B1:4 A3:P2|B1:P2 2-4"
 	" C0-3:S+ C1-3:S+ C2-3    C4    X2-3   X2-3 X2-3:P2:C1-3 P2  0 A3:2-3|B1:4 A3:P2|B1:P2 2-4"
@@ -318,7 +318,7 @@ TEST_MATRIX=(
 	# Invalid to valid local partition direct transition tests
 	" C1-3:S+:P2 X4:P2  .      .      .      .      .      .     0 A1:1-3|XA1:1-3|A2:1-3:XA2: A1:P2|A2:P-2 1-3"
 	" C1-3:S+:P2 X4:P2  .      .      .    X3:P2    .      .     0 A1:1-2|XA1:1-3|A2:3:XA2:3 A1:P2|A2:P2 1-3"
-	"  C0-3:P2   .      .    C4-6   C0-4     .      .      .     0 A1:0-4|B1:4-6 A1:P-2|B1:P0"
+	"  C0-3:P2   .      .    C4-6   C0-4     .      .      .     0 A1:0-4|B1:5-6 A1:P2|B1:P0"
 	"  C0-3:P2   .      .    C4-6 C0-4:C0-3  .      .      .     0 A1:0-3|B1:4-6 A1:P2|B1:P0 0-3"
 
 	# Local partition invalidation tests
@@ -388,10 +388,10 @@ TEST_MATRIX=(
 	"  C0-1:S+  C1      .    C2-3     .      P2     .      .     0 A1:0-1|A2:1 A1:P0|A2:P-2"
 	"  C0-1:S+ C1:P2    .    C2-3     P1     .      .      .     0 A1:0|A2:1 A1:P1|A2:P2 0-1|1"
 
-	# A non-exclusive cpuset.cpus change will invalidate partition and its siblings
-	"  C0-1:P1   .      .    C2-3   C0-2     .      .      .     0 A1:0-2|B1:2-3 A1:P-1|B1:P0"
-	"  C0-1:P1   .      .  P1:C2-3  C0-2     .      .      .     0 A1:0-2|B1:2-3 A1:P-1|B1:P-1"
-	"   C0-1     .      .  P1:C2-3  C0-2     .      .      .     0 A1:0-2|B1:2-3 A1:P0|B1:P-1"
+	# A non-exclusive cpuset.cpus change will not invalidate its siblings partition.
+	"  C0-1:P1   .      .    C2-3   C0-2     .      .      .     0 A1:0-2|B1:3 A1:P1|B1:P0"
+	"  C0-1:P1   .      .  P1:C2-3  C0-2     .      .      .     0 A1:0-1|XA1:0-1|B1:2-3 A1:P1|B1:P1"
+	"   C0-1     .      .  P1:C2-3  C0-2     .      .      .     0 A1:0-1|B1:2-3 A1:P0|B1:P1"
 
 	# cpuset.cpus can overlap with sibling cpuset.cpus.exclusive but not subsumed by it
 	"   C0-3     .      .    C4-5     X5     .      .      .     0 A1:0-3|B1:4-5"
@@ -417,6 +417,14 @@ TEST_MATRIX=(
 	" CX1-4:S+ CX2-4:P2 .    C5-6      .     .      .   P1:C3-6  0 A1:1|A2:2-4|B1:5-6 \
 								       A1:P0|A2:P2:B1:P-1 2-4"
 
+	# When multiple partitions with conflicting cpuset.cpus are created, the
+	# latter created ones will only get what are left of the available exclusive
+	# CPUs.
+	"  C1-3:P1   .      .      .       .     .      .   C3-5:P1  0 A1:1-3|B1:4-5:XB1:4-5 A1:P1|B1:P1"
+
+	# cpuset.cpus can be set to a subset of sibling's cpuset.cpus.exclusive
+	" C1-3:X1-3  .      .    C4-5      .     .      .     C1-2   0 A1:1-3|B1:1-2"
+
 	#  old-A1 old-A2 old-A3 old-B1 new-A1 new-A2 new-A3 new-B1 fail ECPUs Pstate ISOLCPUS
 	#  ------ ------ ------ ------ ------ ------ ------ ------ ---- ----- ------ --------
 	# Failure cases:
@@ -427,7 +435,7 @@ TEST_MATRIX=(
 	# Changes to cpuset.cpus.exclusive that violate exclusivity rule is rejected
 	"   C0-3     .      .    C4-5   X0-3     .      .     X3-5   1 A1:0-3|B1:4-5"
 
-	# cpuset.cpus cannot be a subset of sibling cpuset.cpus.exclusive
+	# cpuset.cpus.exclusive cannot be set to a superset of sibling's cpuset.cpus
 	"   C0-3     .      .    C4-5   X3-5     .      .      .     1 A1:0-3|B1:4-5"
 )
 
@@ -477,6 +485,10 @@ REMOTE_TEST_MATRIX=(
 	      .      .   X1-2:P2  X4-5:P1  .     X1-7:P2 p1:3|c11:1-2|c12:4:c22:5-6 \
 							 p1:P0|p2:P1|c11:P2|c12:P1|c22:P2 \
 							 1-2,4-6|1-2,5-6"
+	# c12 whose cpuset.cpus CPUs are all granted to c11 will become invalid partition
+	" C1-5:P1:S+ .  C1-4:P1   C2-3     .       .  \
+	      .      .     .       P1      .       .     p1:5|c11:1-4|c12:5 \
+							 p1:P1|c11:P1|c12:P-1"
 )
 
 #
-- 
cgit v1.2.3


From 272bd8183376a9e20fe08bacbaa44003d7c8acaa Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Mon, 12 Jan 2026 11:00:21 -0500
Subject: cgroup/cpuset: Move the v1 empty cpus/mems check to
 cpuset1_validate_change()

As stated in commit 1c09b195d37f ("cpuset: fix a regression in validating
config change"), it is not allowed to clear masks of a cpuset if
there're tasks in it. This is specific to v1 since empty "cpuset.cpus"
or "cpuset.mems" will cause the v2 cpuset to inherit the effective CPUs
or memory nodes from its parent. So it is OK to have empty cpus or mems
even if there are tasks in the cpuset.

Move this empty cpus/mems check in validate_change() to
cpuset1_validate_change() to allow more flexibility in setting
cpus or mems in v2. cpuset_is_populated() needs to be moved into
cpuset-internal.h as it is needed by the empty cpus/mems checking code.

Also add a test case to test_cpuset_prs.sh to verify that.

Reported-by: Chen Ridong <chenridong@huaweicloud.com>
Closes: https://lore.kernel.org/lkml/7a3ec392-2e86-4693-aa9f-1e668a668b9c@huaweicloud.com/
Signed-off-by: Waiman Long <longman@redhat.com>
Reviewed-by: Chen Ridong <chenridong@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup/cpuset-internal.h                   |  9 +++++++++
 kernel/cgroup/cpuset-v1.c                         | 14 ++++++++++++++
 kernel/cgroup/cpuset.c                            | 23 -----------------------
 tools/testing/selftests/cgroup/test_cpuset_prs.sh |  3 +++
 4 files changed, 26 insertions(+), 23 deletions(-)

(limited to 'tools')

diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h
index e8e2683cb067..fd7d19842ded 100644
--- a/kernel/cgroup/cpuset-internal.h
+++ b/kernel/cgroup/cpuset-internal.h
@@ -260,6 +260,15 @@ static inline int nr_cpusets(void)
 	return static_key_count(&cpusets_enabled_key.key) + 1;
 }
 
+static inline bool cpuset_is_populated(struct cpuset *cs)
+{
+	lockdep_assert_cpuset_lock_held();
+
+	/* Cpusets in the process of attaching should be considered as populated */
+	return cgroup_is_populated(cs->css.cgroup) ||
+		cs->attach_in_progress;
+}
+
 /**
  * cpuset_for_each_child - traverse online children of a cpuset
  * @child_cs: loop cursor pointing to the current child
diff --git a/kernel/cgroup/cpuset-v1.c b/kernel/cgroup/cpuset-v1.c
index 04124c38a774..7a23b9e8778f 100644
--- a/kernel/cgroup/cpuset-v1.c
+++ b/kernel/cgroup/cpuset-v1.c
@@ -368,6 +368,20 @@ int cpuset1_validate_change(struct cpuset *cur, struct cpuset *trial)
 	if (par && !is_cpuset_subset(trial, par))
 		goto out;
 
+	/*
+	 * Cpusets with tasks - existing or newly being attached - can't
+	 * be changed to have empty cpus_allowed or mems_allowed.
+	 */
+	ret = -ENOSPC;
+	if (cpuset_is_populated(cur)) {
+		if (!cpumask_empty(cur->cpus_allowed) &&
+		    cpumask_empty(trial->cpus_allowed))
+			goto out;
+		if (!nodes_empty(cur->mems_allowed) &&
+		    nodes_empty(trial->mems_allowed))
+			goto out;
+	}
+
 	ret = 0;
 out:
 	return ret;
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 83fb83a86b4b..a3dbca125588 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -370,15 +370,6 @@ static inline bool is_in_v2_mode(void)
 	      (cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE);
 }
 
-static inline bool cpuset_is_populated(struct cpuset *cs)
-{
-	lockdep_assert_held(&cpuset_mutex);
-
-	/* Cpusets in the process of attaching should be considered as populated */
-	return cgroup_is_populated(cs->css.cgroup) ||
-		cs->attach_in_progress;
-}
-
 /**
  * partition_is_populated - check if partition has tasks
  * @cs: partition root to be checked
@@ -695,20 +686,6 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
 
 	par = parent_cs(cur);
 
-	/*
-	 * Cpusets with tasks - existing or newly being attached - can't
-	 * be changed to have empty cpus_allowed or mems_allowed.
-	 */
-	ret = -ENOSPC;
-	if (cpuset_is_populated(cur)) {
-		if (!cpumask_empty(cur->cpus_allowed) &&
-		    cpumask_empty(trial->cpus_allowed))
-			goto out;
-		if (!nodes_empty(cur->mems_allowed) &&
-		    nodes_empty(trial->mems_allowed))
-			goto out;
-	}
-
 	/*
 	 * We can't shrink if we won't have enough room for SCHED_DEADLINE
 	 * tasks. This check is not done when scheduling is disabled as the
diff --git a/tools/testing/selftests/cgroup/test_cpuset_prs.sh b/tools/testing/selftests/cgroup/test_cpuset_prs.sh
index ff4540b0490e..5dff3ad53867 100755
--- a/tools/testing/selftests/cgroup/test_cpuset_prs.sh
+++ b/tools/testing/selftests/cgroup/test_cpuset_prs.sh
@@ -425,6 +425,9 @@ TEST_MATRIX=(
 	# cpuset.cpus can be set to a subset of sibling's cpuset.cpus.exclusive
 	" C1-3:X1-3  .      .    C4-5      .     .      .     C1-2   0 A1:1-3|B1:1-2"
 
+	# cpuset.cpus can become empty with task in it as it inherits parent's effective CPUs
+	" C1-3:S+   C2      .      .       .    T:C     .      .     0 A1:1-3|A2:1-3"
+
 	#  old-A1 old-A2 old-A3 old-B1 new-A1 new-A2 new-A3 new-B1 fail ECPUs Pstate ISOLCPUS
 	#  ------ ------ ------ ------ ------ ------ ------ ------ ---- ----- ------ --------
 	# Failure cases:
-- 
cgit v1.2.3


From f815fc0c66e777c727689666cfb46b8d461c2f99 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Sat, 10 Jan 2026 20:13:32 -0800
Subject: perf unwind-libdw: Fix invalid reference counts

The addition of addr_location__exit() causes use-after put on the maps
and map references in the unwind info. Add the gets and then add the
map_symbol__exit() calls.

Fixes: 0dd5041c9a0eaf8c ("perf addr_location: Add init/exit/copy functions")
Reviewed-by: James Clark <james.clark@linaro.org>
Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Howard Chu <howardchu95@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephen Brennan <stephen.s.brennan@oracle.com>
Cc: Tony Jones <tonyj@suse.de>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/unwind-libdw.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/unwind-libdw.c b/tools/perf/util/unwind-libdw.c
index ae70fb56a057..3ff427a49e4c 100644
--- a/tools/perf/util/unwind-libdw.c
+++ b/tools/perf/util/unwind-libdw.c
@@ -136,8 +136,8 @@ static int entry(u64 ip, struct unwind_info *ui)
 	}
 
 	e->ip	  = ip;
-	e->ms.maps = al.maps;
-	e->ms.map = al.map;
+	e->ms.maps = maps__get(al.maps);
+	e->ms.map = map__get(al.map);
 	e->ms.sym = al.sym;
 
 	pr_debug("unwind: %s:ip = 0x%" PRIx64 " (0x%" PRIx64 ")\n",
@@ -325,6 +325,9 @@ int unwind__get_entries(unwind_entry_cb_t cb, void *arg,
 	if (err)
 		pr_debug("unwind: failed with '%s'\n", dwfl_errmsg(-1));
 
+	for (i = 0; i < ui->idx; i++)
+		map_symbol__exit(&ui->entries[i].ms);
+
 	dwfl_end(ui->dwfl);
 	free(ui);
 	return 0;
-- 
cgit v1.2.3


From 27fc6f565d06837e71001368c84ee71e5221ce48 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Sat, 10 Jan 2026 20:13:37 -0800
Subject: perf test workload: Add inlineloop test workload

The purpose of this workload is to gather samples in an inlined
function. This can be used to test whether inlined addr2line works
correctly.

Committer testing:

  $ perf record perf test -w inlineloop 1
  [ perf record: Woken up 2 times to write data ]
  [ perf record: Captured and wrote 0.161 MB perf.data (4005 samples) ]
  $ perf report --stdio --dso perf -s srcfile,srcline
  #
  # Total Lost Samples: 0
  #
  # Samples: 4K of event 'cpu/cycles/Pu'
  # Event count (approx.): 5535180842
  #
  # Overhead  Source File   Source:Line
  # ........  ............  ...............
  #
      99.04%  inlineloop.c  inlineloop.c:21
       0.46%  inlineloop.c  inlineloop.c:20
  #
  $

Reviewed-by: James Clark <james.clark@linaro.org>
Signed-off-by: Ian Rogers <irogers@google.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Howard Chu <howardchu95@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephen Brennan <stephen.s.brennan@oracle.com>
Cc: Tony Jones <tonyj@suse.de>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/tests/builtin-test.c         |  1 +
 tools/perf/tests/tests.h                |  1 +
 tools/perf/tests/workloads/Build        |  2 ++
 tools/perf/tests/workloads/inlineloop.c | 52 +++++++++++++++++++++++++++++++++
 4 files changed, 56 insertions(+)
 create mode 100644 tools/perf/tests/workloads/inlineloop.c

(limited to 'tools')

diff --git a/tools/perf/tests/builtin-test.c b/tools/perf/tests/builtin-test.c
index bd6ffa8e4578..e2490652f030 100644
--- a/tools/perf/tests/builtin-test.c
+++ b/tools/perf/tests/builtin-test.c
@@ -153,6 +153,7 @@ static struct test_workload *workloads[] = {
 	&workload__datasym,
 	&workload__landlock,
 	&workload__traploop,
+	&workload__inlineloop,
 };
 
 #define workloads__for_each(workload) \
diff --git a/tools/perf/tests/tests.h b/tools/perf/tests/tests.h
index cb67ddbd0375..1f0f8b267fb1 100644
--- a/tools/perf/tests/tests.h
+++ b/tools/perf/tests/tests.h
@@ -240,6 +240,7 @@ DECLARE_WORKLOAD(brstack);
 DECLARE_WORKLOAD(datasym);
 DECLARE_WORKLOAD(landlock);
 DECLARE_WORKLOAD(traploop);
+DECLARE_WORKLOAD(inlineloop);
 
 extern const char *dso_to_test;
 extern const char *test_objdump_path;
diff --git a/tools/perf/tests/workloads/Build b/tools/perf/tests/workloads/Build
index fb1012cc4fc3..866a00bd14a0 100644
--- a/tools/perf/tests/workloads/Build
+++ b/tools/perf/tests/workloads/Build
@@ -8,9 +8,11 @@ perf-test-y += brstack.o
 perf-test-y += datasym.o
 perf-test-y += landlock.o
 perf-test-y += traploop.o
+perf-test-y += inlineloop.o
 
 CFLAGS_sqrtloop.o         = -g -O0 -fno-inline -U_FORTIFY_SOURCE
 CFLAGS_leafloop.o         = -g -O0 -fno-inline -fno-omit-frame-pointer -U_FORTIFY_SOURCE
 CFLAGS_brstack.o          = -g -O0 -fno-inline -U_FORTIFY_SOURCE
 CFLAGS_datasym.o          = -g -O0 -fno-inline -U_FORTIFY_SOURCE
 CFLAGS_traploop.o         = -g -O0 -fno-inline -U_FORTIFY_SOURCE
+CFLAGS_inlineloop.o       = -g -O2
diff --git a/tools/perf/tests/workloads/inlineloop.c b/tools/perf/tests/workloads/inlineloop.c
new file mode 100644
index 000000000000..bc82dfc7c410
--- /dev/null
+++ b/tools/perf/tests/workloads/inlineloop.c
@@ -0,0 +1,52 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <pthread.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <unistd.h>
+#include <linux/compiler.h>
+#include "../tests.h"
+
+static volatile int a;
+static volatile sig_atomic_t done;
+
+static void sighandler(int sig __maybe_unused)
+{
+	done = 1;
+}
+
+static inline void __attribute__((always_inline)) leaf(int b)
+{
+again:
+	a += b;
+	if (!done)
+		goto again;
+}
+
+static inline void __attribute__((always_inline)) middle(int b)
+{
+	leaf(b);
+}
+
+static noinline void parent(int b)
+{
+	middle(b);
+}
+
+static int inlineloop(int argc, const char **argv)
+{
+	int sec = 1;
+
+	pthread_setname_np(pthread_self(), "perf-inlineloop");
+	if (argc > 0)
+		sec = atoi(argv[0]);
+
+	signal(SIGINT, sighandler);
+	signal(SIGALRM, sighandler);
+	alarm(sec);
+
+	parent(sec);
+
+	return 0;
+}
+
+DEFINE_WORKLOAD(inlineloop);
-- 
cgit v1.2.3


From 88c51002d06f9a68a2b666f7e2c262b6e198f566 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Sat, 10 Jan 2026 20:13:33 -0800
Subject: perf addr2line: Add a libdw implementation

Add an implementation of addr2line that uses libdw.

Other addr2line implementations are slow, particularly in the case of
forking addr2line.

Add an implementation that caches the libdw information in the dso and
uses it to find the file and line number information.

Inline information is supported but because cu_walk_functions_at visits
the leaf function last add a inline_list__append_tail to reverse the
lists order.

Committer testing:

  # perf probe -x ~/bin/perf libdw__addr2line
  Added new event:
    probe_perf:libdw_addr2line (on libdw__addr2line in /home/acme/bin/perf)

  You can now use it in all perf tools, such as:

  	perf record -e probe_perf:libdw_addr2line -aR sleep 1

  #
  # perf stat -e probe_perf:libdw_addr2line perf report -f --dso perf --stdio -s srcfile,srcline
  # To display the perf.data header info, please use --header/--header-only options.
  #
  #
  # Total Lost Samples: 0
  #
  # Samples: 4K of event 'cpu/cycles/Pu'
  # Event count (approx.): 5535180842
  #
  # Overhead  Source File   Source:Line
  # ........  ............  ...............
  #
      99.04%  inlineloop.c  inlineloop.c:21
       0.46%  inlineloop.c  inlineloop.c:20

  #
  # (Tip: For tracepoint events, try: perf report -s trace_fields)
  #

   Performance counter stats for 'perf report -f --dso perf --stdio -s srcfile,srcline':

                  44      probe_perf:libdw_addr2line

         0.037260744 seconds time elapsed

         0.025299000 seconds user
         0.011918000 seconds sys
  #

Adding probes to the other addr2line implementations (llvm__addr2line,
libbfd__addr2line and cmd__addr2line) I noticed some fallbacks to the
llvm one:

 Performance counter stats for 'perf report -f --dso perf --stdio -s srcfile,srcline':

                44      probe_perf:libdw_addr2line
                23      probe_perf:llvm_addr2line
                 0	probe_perf:libbfd_addr2line
                 0      probe_perf:cmd_addr2line

Something to investigate further, but at least we don't fallback to the
cmd based one :-)

Reviewed-by: James Clark <james.clark@linaro.org>
Signed-off-by: Ian Rogers <irogers@google.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Howard Chu <howardchu95@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephen Brennan <stephen.s.brennan@oracle.com>
Cc: Tony Jones <tonyj@suse.de>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/Build     |   1 +
 tools/perf/util/dso.c     |   2 +
 tools/perf/util/dso.h     |  11 ++++
 tools/perf/util/libdw.c   | 153 ++++++++++++++++++++++++++++++++++++++++++++++
 tools/perf/util/libdw.h   |  60 ++++++++++++++++++
 tools/perf/util/srcline.c |  24 ++++++++
 tools/perf/util/srcline.h |   1 +
 7 files changed, 252 insertions(+)
 create mode 100644 tools/perf/util/libdw.c
 create mode 100644 tools/perf/util/libdw.h

(limited to 'tools')

diff --git a/tools/perf/util/Build b/tools/perf/util/Build
index 1c2a43e1dc68..2bed6274e248 100644
--- a/tools/perf/util/Build
+++ b/tools/perf/util/Build
@@ -224,6 +224,7 @@ perf-util-$(CONFIG_LIBDW) += dwarf-regs-powerpc.o
 perf-util-$(CONFIG_LIBDW) += dwarf-regs-x86.o
 perf-util-$(CONFIG_LIBDW) += debuginfo.o
 perf-util-$(CONFIG_LIBDW) += annotate-data.o
+perf-util-$(CONFIG_LIBDW) += libdw.o
 
 perf-util-$(CONFIG_LIBDW_DWARF_UNWIND) += unwind-libdw.o
 perf-util-$(CONFIG_LOCAL_LIBUNWIND)    += unwind-libunwind-local.o
diff --git a/tools/perf/util/dso.c b/tools/perf/util/dso.c
index 344e689567ee..06980844c014 100644
--- a/tools/perf/util/dso.c
+++ b/tools/perf/util/dso.c
@@ -32,6 +32,7 @@
 #include "string2.h"
 #include "vdso.h"
 #include "annotate-data.h"
+#include "libdw.h"
 
 static const char * const debuglink_paths[] = {
 	"%.0s%s",
@@ -1605,6 +1606,7 @@ void dso__delete(struct dso *dso)
 	auxtrace_cache__free(RC_CHK_ACCESS(dso)->auxtrace_cache);
 	dso_cache__free(dso);
 	dso__free_a2l(dso);
+	dso__free_a2l_libdw(dso);
 	dso__free_symsrc_filename(dso);
 	nsinfo__zput(RC_CHK_ACCESS(dso)->nsinfo);
 	mutex_destroy(dso__lock(dso));
diff --git a/tools/perf/util/dso.h b/tools/perf/util/dso.h
index f8ccb9816b89..4aee23775054 100644
--- a/tools/perf/util/dso.h
+++ b/tools/perf/util/dso.h
@@ -268,6 +268,7 @@ DECLARE_RC_STRUCT(dso) {
 	const char	 *short_name;
 	const char	 *long_name;
 	void		 *a2l;
+	void		 *a2l_libdw;
 	char		 *symsrc_filename;
 #if defined(__powerpc__)
 	void		*dwfl;			/* DWARF debug info */
@@ -334,6 +335,16 @@ static inline void dso__set_a2l(struct dso *dso, void *val)
 	RC_CHK_ACCESS(dso)->a2l = val;
 }
 
+static inline void *dso__a2l_libdw(const struct dso *dso)
+{
+	return RC_CHK_ACCESS(dso)->a2l_libdw;
+}
+
+static inline void dso__set_a2l_libdw(struct dso *dso, void *val)
+{
+	RC_CHK_ACCESS(dso)->a2l_libdw = val;
+}
+
 static inline unsigned int dso__a2l_fails(const struct dso *dso)
 {
 	return RC_CHK_ACCESS(dso)->a2l_fails;
diff --git a/tools/perf/util/libdw.c b/tools/perf/util/libdw.c
new file mode 100644
index 000000000000..e4bfd52bd172
--- /dev/null
+++ b/tools/perf/util/libdw.c
@@ -0,0 +1,153 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "dso.h"
+#include "libdw.h"
+#include "srcline.h"
+#include "symbol.h"
+#include "dwarf-aux.h"
+#include <fcntl.h>
+#include <unistd.h>
+#include <elfutils/libdwfl.h>
+
+void dso__free_a2l_libdw(struct dso *dso)
+{
+	Dwfl *dwfl = dso__a2l_libdw(dso);
+
+	if (dwfl) {
+		dwfl_end(dwfl);
+		dso__set_a2l_libdw(dso, NULL);
+	}
+}
+
+struct libdw_a2l_cb_args {
+	struct dso *dso;
+	struct symbol *sym;
+	struct inline_node *node;
+	char *leaf_srcline;
+	bool leaf_srcline_used;
+};
+
+static int libdw_a2l_cb(Dwarf_Die *die, void *_args)
+{
+	struct libdw_a2l_cb_args *args  = _args;
+	struct symbol *inline_sym = new_inline_sym(args->dso, args->sym, dwarf_diename(die));
+	const char *call_fname = die_get_call_file(die);
+	char *call_srcline = srcline__unknown;
+	struct inline_list *ilist;
+
+	if (!inline_sym)
+		return -ENOMEM;
+
+	/* Assign caller information to the parent. */
+	if (call_fname)
+		call_srcline = srcline_from_fileline(call_fname, die_get_call_lineno(die));
+
+	list_for_each_entry(ilist, &args->node->val, list) {
+		ilist->srcline =  call_srcline;
+		call_srcline = NULL;
+		break;
+	}
+	if (call_srcline && call_fname)
+		free(call_srcline);
+
+	/* Add this symbol to the chain as the leaf. */
+	inline_list__append_tail(inline_sym, args->leaf_srcline, args->node);
+	args->leaf_srcline_used = true;
+	return 0;
+}
+
+int libdw__addr2line(const char *dso_name, u64 addr,
+		     char **file, unsigned int *line_nr,
+		     struct dso *dso, bool unwind_inlines,
+		     struct inline_node *node, struct symbol *sym)
+{
+	static const Dwfl_Callbacks offline_callbacks = {
+		.find_debuginfo = dwfl_standard_find_debuginfo,
+		.section_address = dwfl_offline_section_address,
+		.find_elf = dwfl_build_id_find_elf,
+	};
+	Dwfl *dwfl = dso__a2l_libdw(dso);
+	Dwfl_Module *mod;
+	Dwfl_Line *dwline;
+	Dwarf_Addr bias;
+	const char *src;
+	int lineno = 0;
+
+	if (!dwfl) {
+		/*
+		 * Initialize Dwfl session.
+		 * We need to open the DSO file to report it to libdw.
+		 */
+		int fd;
+
+		fd = open(dso_name, O_RDONLY);
+		if (fd < 0)
+			return 0;
+
+		dwfl = dwfl_begin(&offline_callbacks);
+		if (!dwfl) {
+			close(fd);
+			return 0;
+		}
+
+		/*
+		 * If the report is successful, the file descriptor fd is consumed
+		 * and closed by the Dwfl. If not, it is not closed.
+		 */
+		mod = dwfl_report_offline(dwfl, dso_name, dso_name, fd);
+		if (!mod) {
+			dwfl_end(dwfl);
+			close(fd);
+			return 0;
+		}
+
+		dwfl_report_end(dwfl, /*removed=*/NULL, /*arg=*/NULL);
+		dso__set_a2l_libdw(dso, dwfl);
+	} else {
+		/* Dwfl session already initialized, get module for address. */
+		mod = dwfl_addrmodule(dwfl, addr);
+	}
+
+	if (!mod)
+		return 0;
+
+	/*
+	 * Get/ignore the dwarf information. Determine the bias, difference
+	 * between the regular ELF addr2line addresses and those to use with
+	 * libdw.
+	 */
+	if (!dwfl_module_getdwarf(mod, &bias))
+		return 0;
+
+	/* Find source line information for the address. */
+	dwline = dwfl_module_getsrc(mod, addr + bias);
+	if (!dwline)
+		return 0;
+
+	/* Get line information. */
+	src = dwfl_lineinfo(dwline, /*addr=*/NULL, &lineno, /*col=*/NULL, /*mtime=*/NULL,
+			    /*length=*/NULL);
+
+	if (file)
+		*file = src ? strdup(src) : NULL;
+	if (line_nr)
+		*line_nr = lineno;
+
+	/* Optionally unwind inline function call chain. */
+	if (unwind_inlines && node) {
+		Dwarf_Addr unused_bias;
+		Dwarf_Die *cudie = dwfl_module_addrdie(mod, addr + bias, &unused_bias);
+		struct libdw_a2l_cb_args args = {
+			.dso = dso,
+			.sym = sym,
+			.node = node,
+			.leaf_srcline = srcline_from_fileline(src ?: "<unknown>", lineno),
+		};
+
+		/* Walk from the parent down to the leaf. */
+		cu_walk_functions_at(cudie, addr, libdw_a2l_cb, &args);
+
+		if (!args.leaf_srcline_used)
+			free(args.leaf_srcline);
+	}
+	return 1;
+}
diff --git a/tools/perf/util/libdw.h b/tools/perf/util/libdw.h
new file mode 100644
index 000000000000..0f8d7b4a11a5
--- /dev/null
+++ b/tools/perf/util/libdw.h
@@ -0,0 +1,60 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef PERF_LIBDW_H
+#define PERF_LIBDW_H
+
+#include <linux/types.h>
+
+struct dso;
+struct inline_node;
+struct symbol;
+
+#ifdef HAVE_LIBDW_SUPPORT
+/*
+ * libdw__addr2line - Convert address to source location using libdw
+ * @dso_name: Name of the DSO
+ * @addr: Address to resolve
+ * @file: Pointer to return filename (caller must free)
+ * @line_nr: Pointer to return line number
+ * @dso: The dso struct
+ * @unwind_inlines: Whether to unwind inline function calls
+ * @node: Inline node list to append to
+ * @sym: The symbol associated with the address
+ *
+ * This function initializes a Dwfl context for the DSO if not already present,
+ * finds the source line information for the given address, and optionally
+ * resolves inline function call chains.
+ *
+ * Returns 1 on success (found), 0 on failure (not found).
+ */
+int libdw__addr2line(const char *dso_name, u64 addr, char **file,
+		     unsigned int *line_nr, struct dso *dso,
+		     bool unwind_inlines, struct inline_node *node,
+		     struct symbol *sym);
+
+/*
+ * dso__free_a2l_libdw - Free libdw resources associated with the DSO
+ * @dso: The dso to free resources for
+ *
+ * This function cleans up the Dwfl context used for addr2line lookups.
+ */
+void dso__free_a2l_libdw(struct dso *dso);
+
+#else /* HAVE_LIBDW_SUPPORT */
+
+static inline int libdw__addr2line(const char *dso_name __maybe_unused,
+				   u64 addr __maybe_unused, char **file __maybe_unused,
+				   unsigned int *line_nr __maybe_unused,
+				   struct dso *dso __maybe_unused,
+				   bool unwind_inlines __maybe_unused,
+				   struct inline_node *node __maybe_unused,
+				   struct symbol *sym __maybe_unused)
+{
+	return 0;
+}
+
+static inline void dso__free_a2l_libdw(struct dso *dso __maybe_unused)
+{
+}
+#endif /* HAVE_LIBDW_SUPPORT */
+
+#endif /* PERF_LIBDW_H */
diff --git a/tools/perf/util/srcline.c b/tools/perf/util/srcline.c
index 27c0966611ab..e2d280678b02 100644
--- a/tools/perf/util/srcline.c
+++ b/tools/perf/util/srcline.c
@@ -6,6 +6,7 @@
 #include "libbfd.h"
 #include "llvm.h"
 #include "symbol.h"
+#include "libdw.h"
 
 #include <inttypes.h>
 #include <string.h>
@@ -51,6 +52,25 @@ int inline_list__append(struct symbol *symbol, char *srcline, struct inline_node
 	return 0;
 }
 
+int inline_list__append_tail(struct symbol *symbol, char *srcline, struct inline_node *node)
+{
+	struct inline_list *ilist;
+
+	ilist = zalloc(sizeof(*ilist));
+	if (ilist == NULL)
+		return -1;
+
+	ilist->symbol = symbol;
+	ilist->srcline = srcline;
+
+	if (callchain_param.order == ORDER_CALLEE)
+		list_add(&ilist->list, &node->val);
+	else
+		list_add_tail(&ilist->list, &node->val);
+
+	return 0;
+}
+
 /* basename version that takes a const input string */
 static const char *gnu_basename(const char *path)
 {
@@ -120,6 +140,10 @@ static int addr2line(const char *dso_name, u64 addr, char **file, unsigned int *
 {
 	int ret;
 
+	ret = libdw__addr2line(dso_name, addr, file, line_nr, dso, unwind_inlines, node, sym);
+	if (ret > 0)
+		return ret;
+
 	ret = llvm__addr2line(dso_name, addr, file, line_nr, dso, unwind_inlines, node, sym);
 	if (ret > 0)
 		return ret;
diff --git a/tools/perf/util/srcline.h b/tools/perf/util/srcline.h
index c36f573cd339..be9f002bf234 100644
--- a/tools/perf/util/srcline.h
+++ b/tools/perf/util/srcline.h
@@ -57,6 +57,7 @@ struct inline_node *inlines__tree_find(struct rb_root_cached *tree, u64 addr);
 void inlines__tree_delete(struct rb_root_cached *tree);
 
 int inline_list__append(struct symbol *symbol, char *srcline, struct inline_node *node);
+int inline_list__append_tail(struct symbol *symbol, char *srcline, struct inline_node *node);
 char *srcline_from_fileline(const char *file, unsigned int line);
 struct symbol *new_inline_sym(struct dso *dso,
 			      struct symbol *base_sym,
-- 
cgit v1.2.3


From ec9426655dcee3e337735935dcc2dea7684a5bf8 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Sat, 10 Jan 2026 20:13:34 -0800
Subject: perf addr2line.c: Rename a2l_style to cmd_a2l_style

The a2l_style is only relevant to the command line version, so rename
to make this clearer.

Reviewed-by: James Clark <james.clark@linaro.org>
Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Howard Chu <howardchu95@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephen Brennan <stephen.s.brennan@oracle.com>
Cc: Tony Jones <tonyj@suse.de>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/addr2line.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/addr2line.c b/tools/perf/util/addr2line.c
index f2d94a3272d7..0f1499350d47 100644
--- a/tools/perf/util/addr2line.c
+++ b/tools/perf/util/addr2line.c
@@ -90,16 +90,16 @@ static struct child_process *addr2line_subprocess_init(const char *addr2line_pat
 	return a2l;
 }
 
-enum a2l_style {
+enum cmd_a2l_style {
 	BROKEN,
 	GNU_BINUTILS,
 	LLVM,
 };
 
-static enum a2l_style addr2line_configure(struct child_process *a2l, const char *dso_name)
+static enum cmd_a2l_style cmd_addr2line_configure(struct child_process *a2l, const char *dso_name)
 {
 	static bool cached;
-	static enum a2l_style style;
+	static enum cmd_a2l_style style;
 
 	if (!cached) {
 		char buf[128];
@@ -149,7 +149,7 @@ static enum a2l_style addr2line_configure(struct child_process *a2l, const char
 }
 
 static int read_addr2line_record(struct io *io,
-				 enum a2l_style style,
+				 enum cmd_a2l_style style,
 				 const char *dso_name,
 				 u64 addr,
 				 bool first,
@@ -298,7 +298,7 @@ int cmd__addr2line(const char *dso_name, u64 addr,
 	char buf[128];
 	ssize_t written;
 	struct io io = { .eof = false };
-	enum a2l_style a2l_style;
+	enum cmd_a2l_style cmd_a2l_style;
 
 	if (!a2l) {
 		if (!filename__has_section(dso_name, ".debug_line"))
@@ -314,8 +314,8 @@ int cmd__addr2line(const char *dso_name, u64 addr,
 			pr_warning("%s %s: addr2line_subprocess_init failed\n", __func__, dso_name);
 		goto out;
 	}
-	a2l_style = addr2line_configure(a2l, dso_name);
-	if (a2l_style == BROKEN)
+	cmd_a2l_style = cmd_addr2line_configure(a2l, dso_name);
+	if (cmd_a2l_style == BROKEN)
 		goto out;
 
 	/*
@@ -336,7 +336,7 @@ int cmd__addr2line(const char *dso_name, u64 addr,
 	}
 	io__init(&io, a2l->out, buf, sizeof(buf));
 	io.timeout_ms = addr2line_timeout_ms;
-	switch (read_addr2line_record(&io, a2l_style, dso_name, addr, /*first=*/true,
+	switch (read_addr2line_record(&io, cmd_a2l_style, dso_name, addr, /*first=*/true,
 				      &record_function, &record_filename, &record_line_nr)) {
 	case -1:
 		if (!symbol_conf.disable_add2line_warn)
@@ -351,7 +351,7 @@ int cmd__addr2line(const char *dso_name, u64 addr,
 		 * binutils, also force a non-zero address as we're no longer
 		 * reading that record.
 		 */
-		switch (read_addr2line_record(&io, a2l_style, dso_name,
+		switch (read_addr2line_record(&io, cmd_a2l_style, dso_name,
 					      /*addr=*/1, /*first=*/true,
 					      NULL, NULL, NULL)) {
 		case -1:
@@ -397,7 +397,7 @@ int cmd__addr2line(const char *dso_name, u64 addr,
 	 * as we're reading records beyond the first.
 	 */
 	while ((record_status = read_addr2line_record(&io,
-						      a2l_style,
+						      cmd_a2l_style,
 						      dso_name,
 						      /*addr=*/1,
 						      /*first=*/false,
-- 
cgit v1.2.3


From 7aef17f367c94d6cef00f45b193e37d30ff4a3b5 Mon Sep 17 00:00:00 2001
From: Juergen Gross <jgross@suse.com>
Date: Mon, 5 Jan 2026 12:05:16 +0100
Subject: x86/xen: Drop xen_mmu_ops

Instead of having a pre-filled array xen_mmu_ops for Xen PV paravirt
functions, drop the array and assign each element individually.

This is in preparation of reducing the paravirt include hell by
splitting paravirt.h into multiple more fine grained header files,
which will in turn require to split up the pv_ops vector as well.
Dropping the pre-filled array makes life easier for objtool to
detect missing initializers in multiple pv_ops_ arrays.

Signed-off-by: Juergen Gross <jgross@suse.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Link: https://patch.msgid.link/20260105110520.21356-18-jgross@suse.com
---
 arch/x86/xen/mmu_pv.c | 100 +++++++++++++++++++-------------------------------
 tools/objtool/check.c |   1 -
 2 files changed, 38 insertions(+), 63 deletions(-)

(limited to 'tools')

diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index 2a4a8deaf612..9fa00c4a8858 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -2175,73 +2175,49 @@ static void xen_leave_lazy_mmu(void)
 	preempt_enable();
 }
 
-static const typeof(pv_ops) xen_mmu_ops __initconst = {
-	.mmu = {
-		.read_cr2 = __PV_IS_CALLEE_SAVE(xen_read_cr2),
-		.write_cr2 = xen_write_cr2,
-
-		.read_cr3 = xen_read_cr3,
-		.write_cr3 = xen_write_cr3_init,
-
-		.flush_tlb_user = xen_flush_tlb,
-		.flush_tlb_kernel = xen_flush_tlb,
-		.flush_tlb_one_user = xen_flush_tlb_one_user,
-		.flush_tlb_multi = xen_flush_tlb_multi,
-
-		.pgd_alloc = xen_pgd_alloc,
-		.pgd_free = xen_pgd_free,
-
-		.alloc_pte = xen_alloc_pte_init,
-		.release_pte = xen_release_pte_init,
-		.alloc_pmd = xen_alloc_pmd_init,
-		.release_pmd = xen_release_pmd_init,
-
-		.set_pte = xen_set_pte_init,
-		.set_pmd = xen_set_pmd_hyper,
-
-		.ptep_modify_prot_start = xen_ptep_modify_prot_start,
-		.ptep_modify_prot_commit = xen_ptep_modify_prot_commit,
-
-		.pte_val = PV_CALLEE_SAVE(xen_pte_val),
-		.pgd_val = PV_CALLEE_SAVE(xen_pgd_val),
-
-		.make_pte = PV_CALLEE_SAVE(xen_make_pte_init),
-		.make_pgd = PV_CALLEE_SAVE(xen_make_pgd),
-
-		.set_pud = xen_set_pud_hyper,
-
-		.make_pmd = PV_CALLEE_SAVE(xen_make_pmd),
-		.pmd_val = PV_CALLEE_SAVE(xen_pmd_val),
-
-		.pud_val = PV_CALLEE_SAVE(xen_pud_val),
-		.make_pud = PV_CALLEE_SAVE(xen_make_pud),
-		.set_p4d = xen_set_p4d_hyper,
-
-		.alloc_pud = xen_alloc_pmd_init,
-		.release_pud = xen_release_pmd_init,
-
-		.p4d_val = PV_CALLEE_SAVE(xen_p4d_val),
-		.make_p4d = PV_CALLEE_SAVE(xen_make_p4d),
-
-		.enter_mmap = xen_enter_mmap,
-		.exit_mmap = xen_exit_mmap,
-
-		.lazy_mode = {
-			.enter = xen_enter_lazy_mmu,
-			.leave = xen_leave_lazy_mmu,
-			.flush = xen_flush_lazy_mmu,
-		},
-
-		.set_fixmap = xen_set_fixmap,
-	},
-};
-
 void __init xen_init_mmu_ops(void)
 {
 	x86_init.paging.pagetable_init = xen_pagetable_init;
 	x86_init.hyper.init_after_bootmem = xen_after_bootmem;
 
-	pv_ops.mmu = xen_mmu_ops.mmu;
+	pv_ops.mmu.read_cr2 = __PV_IS_CALLEE_SAVE(xen_read_cr2);
+	pv_ops.mmu.write_cr2 = xen_write_cr2;
+	pv_ops.mmu.read_cr3 = xen_read_cr3;
+	pv_ops.mmu.write_cr3 = xen_write_cr3_init;
+	pv_ops.mmu.flush_tlb_user = xen_flush_tlb;
+	pv_ops.mmu.flush_tlb_kernel = xen_flush_tlb;
+	pv_ops.mmu.flush_tlb_one_user = xen_flush_tlb_one_user;
+	pv_ops.mmu.flush_tlb_multi = xen_flush_tlb_multi;
+	pv_ops.mmu.pgd_alloc = xen_pgd_alloc;
+	pv_ops.mmu.pgd_free = xen_pgd_free;
+	pv_ops.mmu.alloc_pte = xen_alloc_pte_init;
+	pv_ops.mmu.release_pte = xen_release_pte_init;
+	pv_ops.mmu.alloc_pmd = xen_alloc_pmd_init;
+	pv_ops.mmu.release_pmd = xen_release_pmd_init;
+	pv_ops.mmu.set_pte = xen_set_pte_init;
+	pv_ops.mmu.set_pmd = xen_set_pmd_hyper;
+	pv_ops.mmu.ptep_modify_prot_start = xen_ptep_modify_prot_start;
+	pv_ops.mmu.ptep_modify_prot_commit = xen_ptep_modify_prot_commit;
+	pv_ops.mmu.pte_val = PV_CALLEE_SAVE(xen_pte_val);
+	pv_ops.mmu.pgd_val = PV_CALLEE_SAVE(xen_pgd_val);
+	pv_ops.mmu.make_pte = PV_CALLEE_SAVE(xen_make_pte_init);
+	pv_ops.mmu.make_pgd = PV_CALLEE_SAVE(xen_make_pgd);
+	pv_ops.mmu.set_pud = xen_set_pud_hyper;
+	pv_ops.mmu.make_pmd = PV_CALLEE_SAVE(xen_make_pmd);
+	pv_ops.mmu.pmd_val = PV_CALLEE_SAVE(xen_pmd_val);
+	pv_ops.mmu.pud_val = PV_CALLEE_SAVE(xen_pud_val);
+	pv_ops.mmu.make_pud = PV_CALLEE_SAVE(xen_make_pud);
+	pv_ops.mmu.set_p4d = xen_set_p4d_hyper;
+	pv_ops.mmu.alloc_pud = xen_alloc_pmd_init;
+	pv_ops.mmu.release_pud = xen_release_pmd_init;
+	pv_ops.mmu.p4d_val = PV_CALLEE_SAVE(xen_p4d_val);
+	pv_ops.mmu.make_p4d = PV_CALLEE_SAVE(xen_make_p4d);
+	pv_ops.mmu.enter_mmap = xen_enter_mmap;
+	pv_ops.mmu.exit_mmap = xen_exit_mmap;
+	pv_ops.mmu.lazy_mode.enter = xen_enter_lazy_mmu;
+	pv_ops.mmu.lazy_mode.leave = xen_leave_lazy_mmu;
+	pv_ops.mmu.lazy_mode.flush = xen_flush_lazy_mmu;
+	pv_ops.mmu.set_fixmap = xen_set_fixmap;
 
 	memset(dummy_mapping, 0xff, PAGE_SIZE);
 }
diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index 8ab88f2b2c1b..61f3e0c48fcc 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -570,7 +570,6 @@ static int init_pv_ops(struct objtool_file *file)
 {
 	static const char *pv_ops_tables[] = {
 		"pv_ops",
-		"xen_mmu_ops",
 		NULL,
 	};
 	const char *pv_ops;
-- 
cgit v1.2.3


From 21eb90fb5fbc939eb68262efc0e916293d8299d2 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Sat, 10 Jan 2026 15:31:36 -0800
Subject: tools: ynl: cli: introduce formatting for attr names in --list-attrs

It's a little hard to make sense of the output of --list-attrs,
it looks like a wall of text. Sprinkle a little bit of formatting -
make op and attr names bold, and Enum: / Flags: keywords italics.

Tested-by: Gal Pressman <gal@nvidia.com>
Acked-by: Stanislav Fomichev <sdf@fomichev.me>
Reviewed-by: Donald Hunter <donald.hunter@gmail.com>
Link: https://patch.msgid.link/20260110233142.3921386-2-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/net/ynl/pyynl/cli.py | 30 +++++++++++++++++++++++++++---
 1 file changed, 27 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/net/ynl/pyynl/cli.py b/tools/net/ynl/pyynl/cli.py
index 5fee45e48bbf..aa50d42e35ac 100755
--- a/tools/net/ynl/pyynl/cli.py
+++ b/tools/net/ynl/pyynl/cli.py
@@ -20,6 +20,29 @@ from lib import YnlFamily, Netlink, NlError, SpecFamily, SpecException, YnlExcep
 SYS_SCHEMA_DIR='/usr/share/ynl'
 RELATIVE_SCHEMA_DIR='../../../../Documentation/netlink'
 
+# pylint: disable=too-few-public-methods,too-many-locals
+class Colors:
+    """ANSI color and font modifier codes"""
+    RESET = '\033[0m'
+
+    BOLD = '\033[1m'
+    ITALICS = '\033[3m'
+    UNDERLINE = '\033[4m'
+    INVERT = '\033[7m'
+
+
+def color(text, modifiers):
+    """Add color to text if output is a TTY
+
+    Returns:
+        Colored text if stdout is a TTY, otherwise plain text
+    """
+    if sys.stdout.isatty():
+        # Join the colors if they are a list, if it's a string this a noop
+        modifiers = "".join(modifiers)
+        return f"{modifiers}{text}{Colors.RESET}"
+    return text
+
 def schema_dir():
     """
     Return the effective schema directory, preferring in-tree before
@@ -60,7 +83,7 @@ def print_attr_list(ynl, attr_names, attr_set, indent=2):
     for attr_name in attr_names:
         if attr_name in attr_set.attrs:
             attr = attr_set.attrs[attr_name]
-            attr_info = f'{prefix}- {attr_name}: {attr.type}'
+            attr_info = f'{prefix}- {color(attr_name, Colors.BOLD)}: {attr.type}'
             if 'enum' in attr.yaml:
                 enum_name = attr.yaml['enum']
                 attr_info += f" (enum: {enum_name})"
@@ -68,7 +91,8 @@ def print_attr_list(ynl, attr_names, attr_set, indent=2):
                 if enum_name in ynl.consts:
                     const = ynl.consts[enum_name]
                     enum_values = list(const.entries.keys())
-                    attr_info += f"\n{prefix}  {const.type.capitalize()}: {', '.join(enum_values)}"
+                    type_fmted = color(const.type.capitalize(), Colors.ITALICS)
+                    attr_info += f"\n{prefix}  {type_fmted}: {', '.join(enum_values)}"
 
             # Show nested attributes reference and recursively display them
             nested_set_name = None
@@ -226,7 +250,7 @@ def main():
             print(f'Operation {args.list_attrs} not found')
             sys.exit(1)
 
-        print(f'Operation: {op.name}')
+        print(f'Operation: {color(op.name, Colors.BOLD)}')
         print(op.yaml['doc'])
 
         for mode in ['do', 'dump', 'event']:
-- 
cgit v1.2.3


From 101a7d57d518c0c9e9eefc3768909ae02b96b3ef Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Sat, 10 Jan 2026 15:31:37 -0800
Subject: tools: ynl: cli: wrap the doc text if it's long

We already use textwrap when printing "doc" section about an attribute,
but only to indent the text. Switch to using fill() to split and indent
all the lines. While at it indent the text by 2 more spaces, so that it
doesn't align with the name of the attribute.

Before (I'm drawing a "box" at ~60 cols here, in an attempt for clarity):

 |  - irq-suspend-timeout: uint                              |
 |    The timeout, in nanoseconds, of how long to suspend irq|
 |processing, if event polling finds events                  |

After:

 |  - irq-suspend-timeout: uint                              |
 |      The timeout, in nanoseconds, of how long to suspend  |
 |      irq processing, if event polling finds events        |

Tested-by: Gal Pressman <gal@nvidia.com>
Acked-by: Stanislav Fomichev <sdf@fomichev.me>
Reviewed-by: Donald Hunter <donald.hunter@gmail.com>
Link: https://patch.msgid.link/20260110233142.3921386-3-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/net/ynl/pyynl/cli.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/net/ynl/pyynl/cli.py b/tools/net/ynl/pyynl/cli.py
index aa50d42e35ac..dc84619e5518 100755
--- a/tools/net/ynl/pyynl/cli.py
+++ b/tools/net/ynl/pyynl/cli.py
@@ -10,6 +10,7 @@ import json
 import os
 import pathlib
 import pprint
+import shutil
 import sys
 import textwrap
 
@@ -101,7 +102,11 @@ def print_attr_list(ynl, attr_names, attr_set, indent=2):
                 attr_info += f" -> {nested_set_name}"
 
             if attr.yaml.get('doc'):
-                doc_text = textwrap.indent(attr.yaml['doc'], prefix + '  ')
+                doc_prefix = prefix + ' ' * 4
+                term_width = shutil.get_terminal_size().columns
+                doc_text = textwrap.fill(attr.yaml['doc'], width=term_width,
+                                         initial_indent=doc_prefix,
+                                         subsequent_indent=doc_prefix)
                 attr_info += f"\n{doc_text}"
             print(attr_info)
 
-- 
cgit v1.2.3


From 1b7fbf62ad8b404e55d195021724067b5122b630 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Sat, 10 Jan 2026 15:31:38 -0800
Subject: tools: ynl: cli: improve --help

Improve the clarity of --help. Reorder, provide some grouping and
add help messages to most of the options.

No functional changes intended.

Tested-by: Gal Pressman <gal@nvidia.com>
Acked-by: Stanislav Fomichev <sdf@fomichev.me>
Reviewed-by: Donald Hunter <donald.hunter@gmail.com>
Link: https://patch.msgid.link/20260110233142.3921386-4-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/net/ynl/pyynl/cli.py | 113 +++++++++++++++++++++++++++++----------------
 1 file changed, 72 insertions(+), 41 deletions(-)

(limited to 'tools')

diff --git a/tools/net/ynl/pyynl/cli.py b/tools/net/ynl/pyynl/cli.py
index dc84619e5518..3aa1f1e816bf 100755
--- a/tools/net/ynl/pyynl/cli.py
+++ b/tools/net/ynl/pyynl/cli.py
@@ -151,47 +151,78 @@ def main():
     """
 
     parser = argparse.ArgumentParser(description=description,
-                                     epilog=epilog)
-    spec_group = parser.add_mutually_exclusive_group(required=True)
-    spec_group.add_argument('--family', dest='family', type=str,
-                            help='name of the netlink FAMILY')
-    spec_group.add_argument('--list-families', action='store_true',
-                            help='list all netlink families supported by YNL (has spec)')
-    spec_group.add_argument('--spec', dest='spec', type=str,
-                            help='choose the family by SPEC file path')
-
-    parser.add_argument('--schema', dest='schema', type=str)
-    parser.add_argument('--no-schema', action='store_true')
-    parser.add_argument('--json', dest='json_text', type=str)
-
-    group = parser.add_mutually_exclusive_group()
-    group.add_argument('--do', dest='do', metavar='DO-OPERATION', type=str)
-    group.add_argument('--multi', dest='multi', nargs=2, action='append',
-                       metavar=('DO-OPERATION', 'JSON_TEXT'), type=str)
-    group.add_argument('--dump', dest='dump', metavar='DUMP-OPERATION', type=str)
-    group.add_argument('--list-ops', action='store_true')
-    group.add_argument('--list-msgs', action='store_true')
-    group.add_argument('--list-attrs', dest='list_attrs', metavar='OPERATION', type=str,
-                       help='List attributes for an operation')
-    group.add_argument('--validate', action='store_true')
-
-    parser.add_argument('--duration', dest='duration', type=int,
-                        help='when subscribed, watch for DURATION seconds')
-    parser.add_argument('--sleep', dest='duration', type=int,
-                        help='alias for duration')
-    parser.add_argument('--subscribe', dest='ntf', type=str)
-    parser.add_argument('--replace', dest='flags', action='append_const',
-                        const=Netlink.NLM_F_REPLACE)
-    parser.add_argument('--excl', dest='flags', action='append_const',
-                        const=Netlink.NLM_F_EXCL)
-    parser.add_argument('--create', dest='flags', action='append_const',
-                        const=Netlink.NLM_F_CREATE)
-    parser.add_argument('--append', dest='flags', action='append_const',
-                        const=Netlink.NLM_F_APPEND)
-    parser.add_argument('--process-unknown', action=argparse.BooleanOptionalAction)
-    parser.add_argument('--output-json', action='store_true')
-    parser.add_argument('--dbg-small-recv', default=0, const=4000,
-                        action='store', nargs='?', type=int)
+                                     epilog=epilog, add_help=False)
+
+    gen_group = parser.add_argument_group('General options')
+    gen_group.add_argument('-h', '--help', action='help',
+                           help='show this help message and exit')
+
+    spec_group = parser.add_argument_group('Netlink family selection')
+    spec_sel = spec_group.add_mutually_exclusive_group(required=True)
+    spec_sel.add_argument('--list-families', action='store_true',
+                          help=('list Netlink families supported by YNL '
+                                '(which have a spec available in the standard '
+                                'system path)'))
+    spec_sel.add_argument('--family', dest='family', type=str,
+                          help='name of the Netlink FAMILY to use')
+    spec_sel.add_argument('--spec', dest='spec', type=str,
+                          help='full file path to the YAML spec file')
+
+    ops_group = parser.add_argument_group('Operations')
+    ops = ops_group.add_mutually_exclusive_group()
+    ops.add_argument('--do', dest='do', metavar='DO-OPERATION', type=str)
+    ops.add_argument('--dump', dest='dump', metavar='DUMP-OPERATION', type=str)
+    ops.add_argument('--multi', dest='multi', nargs=2, action='append',
+                     metavar=('DO-OPERATION', 'JSON_TEXT'), type=str,
+                     help="Multi-message operation sequence (for nftables)")
+    ops.add_argument('--list-ops', action='store_true',
+                     help="List available --do and --dump operations")
+    ops.add_argument('--list-msgs', action='store_true',
+                     help="List all messages of the family (incl. notifications)")
+    ops.add_argument('--list-attrs', dest='list_attrs', metavar='MSG',
+                     type=str, help='List attributes for a message / operation')
+    ops.add_argument('--validate', action='store_true',
+                     help="Validate the spec against schema and exit")
+
+    io_group = parser.add_argument_group('Input / Output')
+    io_group.add_argument('--json', dest='json_text', type=str,
+                          help=('Specify attributes of the message to send '
+                                'to the kernel in JSON format. Can be left out '
+                                'if the message is expected to be empty.'))
+    io_group.add_argument('--output-json', action='store_true',
+                          help='Format output as JSON')
+
+    ntf_group = parser.add_argument_group('Notifications')
+    ntf_group.add_argument('--subscribe', dest='ntf', type=str)
+    ntf_group.add_argument('--duration', dest='duration', type=int,
+                           help='when subscribed, watch for DURATION seconds')
+    ntf_group.add_argument('--sleep', dest='duration', type=int,
+                           help='alias for duration')
+
+    nlflags = parser.add_argument_group('Netlink message flags (NLM_F_*)',
+                                        ('Extra flags to set in nlmsg_flags of '
+                                         'the request, used mostly by older '
+                                         'Classic Netlink families.'))
+    nlflags.add_argument('--replace', dest='flags', action='append_const',
+                         const=Netlink.NLM_F_REPLACE)
+    nlflags.add_argument('--excl', dest='flags', action='append_const',
+                         const=Netlink.NLM_F_EXCL)
+    nlflags.add_argument('--create', dest='flags', action='append_const',
+                         const=Netlink.NLM_F_CREATE)
+    nlflags.add_argument('--append', dest='flags', action='append_const',
+                         const=Netlink.NLM_F_APPEND)
+
+    schema_group = parser.add_argument_group('Development options')
+    schema_group.add_argument('--schema', dest='schema', type=str,
+                              help="JSON schema to validate the spec")
+    schema_group.add_argument('--no-schema', action='store_true')
+
+    dbg_group = parser.add_argument_group('Debug options')
+    dbg_group.add_argument('--dbg-small-recv', default=0, const=4000,
+                           action='store', nargs='?', type=int, metavar='INT',
+                           help="Length of buffers used for recv()")
+    dbg_group.add_argument('--process-unknown', action=argparse.BooleanOptionalAction)
+
     args = parser.parse_args()
 
     def output(msg):
-- 
cgit v1.2.3


From aca1fe235c10f7d06e9ebab4534852f109e6a8e9 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Sat, 10 Jan 2026 15:31:39 -0800
Subject: tools: ynl: cli: add --doc as alias to --list-attrs

--list-attrs also provides information about the operation itself.
So --doc seems more appropriate. Add an alias.

Tested-by: Gal Pressman <gal@nvidia.com>
Acked-by: Stanislav Fomichev <sdf@fomichev.me>
Reviewed-by: Donald Hunter <donald.hunter@gmail.com>
Link: https://patch.msgid.link/20260110233142.3921386-5-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/net/ynl/pyynl/cli.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/net/ynl/pyynl/cli.py b/tools/net/ynl/pyynl/cli.py
index 3aa1f1e816bf..4147c498b479 100755
--- a/tools/net/ynl/pyynl/cli.py
+++ b/tools/net/ynl/pyynl/cli.py
@@ -179,7 +179,7 @@ def main():
                      help="List available --do and --dump operations")
     ops.add_argument('--list-msgs', action='store_true',
                      help="List all messages of the family (incl. notifications)")
-    ops.add_argument('--list-attrs', dest='list_attrs', metavar='MSG',
+    ops.add_argument('--list-attrs', '--doc', dest='list_attrs', metavar='MSG',
                      type=str, help='List attributes for a message / operation')
     ops.add_argument('--validate', action='store_true',
                      help="Validate the spec against schema and exit")
-- 
cgit v1.2.3


From 45b99bb464eb62da555ecbef31583d9701881d43 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Sat, 10 Jan 2026 15:31:40 -0800
Subject: tools: ynl: cli: factor out --list-attrs / --doc handling

We'll soon add more code to the --doc handling. Factor it out
to avoid making main() too long.

Tested-by: Gal Pressman <gal@nvidia.com>
Acked-by: Stanislav Fomichev <sdf@fomichev.me>
Reviewed-by: Donald Hunter <donald.hunter@gmail.com>
Link: https://patch.msgid.link/20260110233142.3921386-6-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/net/ynl/pyynl/cli.py | 35 ++++++++++++++++++++---------------
 1 file changed, 20 insertions(+), 15 deletions(-)

(limited to 'tools')

diff --git a/tools/net/ynl/pyynl/cli.py b/tools/net/ynl/pyynl/cli.py
index 4147c498b479..6975efa7874f 100755
--- a/tools/net/ynl/pyynl/cli.py
+++ b/tools/net/ynl/pyynl/cli.py
@@ -137,6 +137,25 @@ def print_mode_attrs(ynl, mode, mode_spec, attr_set, print_request=True):
         print_attr_list(ynl, mode_spec['attributes'], attr_set)
 
 
+def do_doc(ynl, op):
+    """Handle --list-attrs $op, print the attr information to stdout"""
+    print(f'Operation: {color(op.name, Colors.BOLD)}')
+    print(op.yaml['doc'])
+
+    for mode in ['do', 'dump', 'event']:
+        if mode in op.yaml:
+            print_mode_attrs(ynl, mode, op.yaml[mode], op.attr_set, True)
+
+    if 'notify' in op.yaml:
+        mode_spec = op.yaml['notify']
+        ref_spec = ynl.msgs.get(mode_spec).yaml.get('do')
+        if ref_spec:
+            print_mode_attrs(ynl, 'notify', ref_spec, op.attr_set, False)
+
+    if 'mcgrp' in op.yaml:
+        print(f"\nMulticast group: {op.yaml['mcgrp']}")
+
+
 # pylint: disable=too-many-locals,too-many-branches,too-many-statements
 def main():
     """YNL cli tool"""
@@ -286,21 +305,7 @@ def main():
             print(f'Operation {args.list_attrs} not found')
             sys.exit(1)
 
-        print(f'Operation: {color(op.name, Colors.BOLD)}')
-        print(op.yaml['doc'])
-
-        for mode in ['do', 'dump', 'event']:
-            if mode in op.yaml:
-                print_mode_attrs(ynl, mode, op.yaml[mode], op.attr_set, True)
-
-        if 'notify' in op.yaml:
-            mode_spec = op.yaml['notify']
-            ref_spec = ynl.msgs.get(mode_spec).yaml.get('do')
-            if ref_spec:
-                print_mode_attrs(ynl, 'notify', ref_spec, op.attr_set, False)
-
-        if 'mcgrp' in op.yaml:
-            print(f"\nMulticast group: {op.yaml['mcgrp']}")
+        do_doc(ynl, op)
 
     try:
         if args.do:
-- 
cgit v1.2.3


From 6ccc421b14613aac32b2647462ae4d40f5dd43b8 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Sat, 10 Jan 2026 15:31:41 -0800
Subject: tools: ynl: cli: extract the event/notify handling in --list-attrs

Event and notify handling is quite different from do / dump
handling. Forcing it into print_mode_attrs() doesn't really
buy us anything as events and notifications do not have requests.
Call print_attr_list() directly. Apart form subjective code
clarity this also removes the word "reply" from the output:

Before:

  Event reply attributes:

Now:

  Event attributes:

Tested-by: Gal Pressman <gal@nvidia.com>
Acked-by: Stanislav Fomichev <sdf@fomichev.me>
Reviewed-by: Donald Hunter <donald.hunter@gmail.com>
Link: https://patch.msgid.link/20260110233142.3921386-7-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/net/ynl/pyynl/cli.py | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

(limited to 'tools')

diff --git a/tools/net/ynl/pyynl/cli.py b/tools/net/ynl/pyynl/cli.py
index 6975efa7874f..6d2f0412dde0 100755
--- a/tools/net/ynl/pyynl/cli.py
+++ b/tools/net/ynl/pyynl/cli.py
@@ -120,11 +120,11 @@ def print_attr_list(ynl, attr_names, attr_set, indent=2):
                     print_attr_list(ynl, nested_names, nested_set, indent + 4)
 
 
-def print_mode_attrs(ynl, mode, mode_spec, attr_set, print_request=True):
+def print_mode_attrs(ynl, mode, mode_spec, attr_set):
     """Print a given mode (do/dump/event/notify)."""
     mode_title = mode.capitalize()
 
-    if print_request and 'request' in mode_spec and 'attributes' in mode_spec['request']:
+    if 'request' in mode_spec and 'attributes' in mode_spec['request']:
         print(f'\n{mode_title} request attributes:')
         print_attr_list(ynl, mode_spec['request']['attributes'], attr_set)
 
@@ -132,25 +132,28 @@ def print_mode_attrs(ynl, mode, mode_spec, attr_set, print_request=True):
         print(f'\n{mode_title} reply attributes:')
         print_attr_list(ynl, mode_spec['reply']['attributes'], attr_set)
 
-    if 'attributes' in mode_spec:
-        print(f'\n{mode_title} attributes:')
-        print_attr_list(ynl, mode_spec['attributes'], attr_set)
-
 
 def do_doc(ynl, op):
     """Handle --list-attrs $op, print the attr information to stdout"""
     print(f'Operation: {color(op.name, Colors.BOLD)}')
     print(op.yaml['doc'])
 
-    for mode in ['do', 'dump', 'event']:
+    for mode in ['do', 'dump']:
         if mode in op.yaml:
-            print_mode_attrs(ynl, mode, op.yaml[mode], op.attr_set, True)
+            print_mode_attrs(ynl, mode, op.yaml[mode], op.attr_set)
+
+    if 'attributes' in op.yaml.get('event', {}):
+        print('\nEvent attributes:')
+        print_attr_list(ynl, op.yaml['event']['attributes'], op.attr_set)
 
     if 'notify' in op.yaml:
         mode_spec = op.yaml['notify']
         ref_spec = ynl.msgs.get(mode_spec).yaml.get('do')
+        if not ref_spec:
+            ref_spec = ynl.msgs.get(mode_spec).yaml.get('dump')
         if ref_spec:
-            print_mode_attrs(ynl, 'notify', ref_spec, op.attr_set, False)
+            print('\nNotification attributes:')
+            print_attr_list(ynl, ref_spec['reply']['attributes'], op.attr_set)
 
     if 'mcgrp' in op.yaml:
         print(f"\nMulticast group: {op.yaml['mcgrp']}")
-- 
cgit v1.2.3


From 60411adedf70abec0ac221ec3d88f6453b031dd2 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Sat, 10 Jan 2026 15:31:42 -0800
Subject: tools: ynl: cli: print reply in combined format if possible

As pointed out during review of the --list-attrs support the GET
ops very often return the same attrs from do and dump. Make the
output more readable by combining the reply information, from:

  Do request attributes:
    - ifindex: u32
      netdev ifindex

  Do reply attributes:
    - ifindex: u32
      netdev ifindex
    [ .. other attrs .. ]

  Dump reply attributes:
    - ifindex: u32
      netdev ifindex
    [ .. other attrs .. ]

To, after:

  Do request attributes:
    - ifindex: u32
      netdev ifindex

  Do and Dump reply attributes:
    - ifindex: u32
      netdev ifindex
    [ .. other attrs .. ]

Tested-by: Gal Pressman <gal@nvidia.com>
Acked-by: Stanislav Fomichev <sdf@fomichev.me>
Reviewed-by: Donald Hunter <donald.hunter@gmail.com>
Link: https://patch.msgid.link/20260110233142.3921386-8-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/net/ynl/pyynl/cli.py | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/net/ynl/pyynl/cli.py b/tools/net/ynl/pyynl/cli.py
index 6d2f0412dde0..fdac1ab10a40 100755
--- a/tools/net/ynl/pyynl/cli.py
+++ b/tools/net/ynl/pyynl/cli.py
@@ -120,7 +120,7 @@ def print_attr_list(ynl, attr_names, attr_set, indent=2):
                     print_attr_list(ynl, nested_names, nested_set, indent + 4)
 
 
-def print_mode_attrs(ynl, mode, mode_spec, attr_set):
+def print_mode_attrs(ynl, mode, mode_spec, attr_set, consistent_dd_reply=None):
     """Print a given mode (do/dump/event/notify)."""
     mode_title = mode.capitalize()
 
@@ -129,8 +129,15 @@ def print_mode_attrs(ynl, mode, mode_spec, attr_set):
         print_attr_list(ynl, mode_spec['request']['attributes'], attr_set)
 
     if 'reply' in mode_spec and 'attributes' in mode_spec['reply']:
-        print(f'\n{mode_title} reply attributes:')
-        print_attr_list(ynl, mode_spec['reply']['attributes'], attr_set)
+        if consistent_dd_reply and mode == "do":
+            title = None  # Dump handling will print in combined format
+        elif consistent_dd_reply and mode == "dump":
+            title = 'Do and Dump'
+        else:
+            title = f'{mode_title}'
+        if title:
+            print(f'\n{title} reply attributes:')
+            print_attr_list(ynl, mode_spec['reply']['attributes'], attr_set)
 
 
 def do_doc(ynl, op):
@@ -138,9 +145,15 @@ def do_doc(ynl, op):
     print(f'Operation: {color(op.name, Colors.BOLD)}')
     print(op.yaml['doc'])
 
+    consistent_dd_reply = False
+    if 'do' in op.yaml and 'dump' in op.yaml and 'reply' in op.yaml['do'] and \
+       op.yaml['do']['reply'] == op.yaml['dump'].get('reply'):
+        consistent_dd_reply = True
+
     for mode in ['do', 'dump']:
         if mode in op.yaml:
-            print_mode_attrs(ynl, mode, op.yaml[mode], op.attr_set)
+            print_mode_attrs(ynl, mode, op.yaml[mode], op.attr_set,
+                             consistent_dd_reply=consistent_dd_reply)
 
     if 'attributes' in op.yaml.get('event', {}):
         print('\nEvent attributes:')
-- 
cgit v1.2.3


From 65955a0993a0a9536263fea2eaae8aed496dcc9c Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Tue, 13 Jan 2026 00:05:02 +0200
Subject: selftests: ublk: add stop command with --safe option

Add 'stop' subcommand to kublk utility that uses the new
UBLK_CMD_TRY_STOP_DEV command when --safe option is specified.
This allows stopping a device only if it has no active openers,
returning -EBUSY otherwise.

Also add test_generic_16.sh to test the new functionality.

Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 tools/testing/selftests/ublk/Makefile           |  1 +
 tools/testing/selftests/ublk/kublk.c            | 53 +++++++++++++++++++++++
 tools/testing/selftests/ublk/kublk.h            |  1 +
 tools/testing/selftests/ublk/test_generic_16.sh | 57 +++++++++++++++++++++++++
 4 files changed, 112 insertions(+)
 create mode 100755 tools/testing/selftests/ublk/test_generic_16.sh

(limited to 'tools')

diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile
index 036a9f01b464..3a2498089b15 100644
--- a/tools/testing/selftests/ublk/Makefile
+++ b/tools/testing/selftests/ublk/Makefile
@@ -23,6 +23,7 @@ TEST_PROGS += test_generic_12.sh
 TEST_PROGS += test_generic_13.sh
 TEST_PROGS += test_generic_14.sh
 TEST_PROGS += test_generic_15.sh
+TEST_PROGS += test_generic_16.sh
 
 TEST_PROGS += test_null_01.sh
 TEST_PROGS += test_null_02.sh
diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c
index d95937dd6167..3472ce7426ba 100644
--- a/tools/testing/selftests/ublk/kublk.c
+++ b/tools/testing/selftests/ublk/kublk.c
@@ -108,6 +108,15 @@ static int ublk_ctrl_stop_dev(struct ublk_dev *dev)
 	return __ublk_ctrl_cmd(dev, &data);
 }
 
+static int ublk_ctrl_try_stop_dev(struct ublk_dev *dev)
+{
+	struct ublk_ctrl_cmd_data data = {
+		.cmd_op	= UBLK_U_CMD_TRY_STOP_DEV,
+	};
+
+	return __ublk_ctrl_cmd(dev, &data);
+}
+
 static int ublk_ctrl_start_dev(struct ublk_dev *dev,
 		int daemon_pid)
 {
@@ -1424,6 +1433,42 @@ static int cmd_dev_del(struct dev_ctx *ctx)
 	return 0;
 }
 
+static int cmd_dev_stop(struct dev_ctx *ctx)
+{
+	int number = ctx->dev_id;
+	struct ublk_dev *dev;
+	int ret;
+
+	if (number < 0) {
+		ublk_err("%s: device id is required\n", __func__);
+		return -EINVAL;
+	}
+
+	dev = ublk_ctrl_init();
+	dev->dev_info.dev_id = number;
+
+	ret = ublk_ctrl_get_info(dev);
+	if (ret < 0)
+		goto fail;
+
+	if (ctx->safe_stop) {
+		ret = ublk_ctrl_try_stop_dev(dev);
+		if (ret < 0)
+			ublk_err("%s: try_stop dev %d failed ret %d\n",
+					__func__, number, ret);
+	} else {
+		ret = ublk_ctrl_stop_dev(dev);
+		if (ret < 0)
+			ublk_err("%s: stop dev %d failed ret %d\n",
+					__func__, number, ret);
+	}
+
+fail:
+	ublk_ctrl_deinit(dev);
+
+	return ret;
+}
+
 static int __cmd_dev_list(struct dev_ctx *ctx)
 {
 	struct ublk_dev *dev = ublk_ctrl_init();
@@ -1487,6 +1532,7 @@ static int cmd_dev_get_features(void)
 		FEAT_NAME(UBLK_F_PER_IO_DAEMON),
 		FEAT_NAME(UBLK_F_BUF_REG_OFF_DAEMON),
 		FEAT_NAME(UBLK_F_INTEGRITY),
+		FEAT_NAME(UBLK_F_SAFE_STOP_DEV)
 	};
 	struct ublk_dev *dev;
 	__u64 features = 0;
@@ -1616,6 +1662,8 @@ static int cmd_dev_help(char *exe)
 
 	printf("%s del [-n dev_id] -a \n", exe);
 	printf("\t -a delete all devices -n delete specified device\n\n");
+	printf("%s stop -n dev_id [--safe]\n", exe);
+	printf("\t --safe only stop if device has no active openers\n\n");
 	printf("%s list [-n dev_id] -a \n", exe);
 	printf("\t -a list all devices, -n list specified device, default -a \n\n");
 	printf("%s features\n", exe);
@@ -1653,6 +1701,7 @@ int main(int argc, char *argv[])
 		{ "pi_offset",		1,	NULL,  0 },
 		{ "csum_type",		1,	NULL,  0 },
 		{ "tag_size",		1,	NULL,  0 },
+		{ "safe",		0,	NULL,  0 },
 		{ 0, 0, 0, 0 }
 	};
 	const struct ublk_tgt_ops *ops = NULL;
@@ -1760,6 +1809,8 @@ int main(int argc, char *argv[])
 			}
 			if (!strcmp(longopts[option_idx].name, "tag_size"))
 				ctx.tag_size = strtoul(optarg, NULL, 0);
+			if (!strcmp(longopts[option_idx].name, "safe"))
+				ctx.safe_stop = 1;
 			break;
 		case '?':
 			/*
@@ -1842,6 +1893,8 @@ int main(int argc, char *argv[])
 		}
 	} else if (!strcmp(cmd, "del"))
 		ret = cmd_dev_del(&ctx);
+	else if (!strcmp(cmd, "stop"))
+		ret = cmd_dev_stop(&ctx);
 	else if (!strcmp(cmd, "list")) {
 		ctx.all = 1;
 		ret = cmd_dev_list(&ctx);
diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h
index 96c66b337bc0..cb757fd9bf9d 100644
--- a/tools/testing/selftests/ublk/kublk.h
+++ b/tools/testing/selftests/ublk/kublk.h
@@ -83,6 +83,7 @@ struct dev_ctx {
 	__u8 pi_offset;
 	__u8 csum_type;
 	__u8 tag_size;
+	unsigned int	safe_stop:1;
 
 	int _evtfd;
 	int _shmid;
diff --git a/tools/testing/selftests/ublk/test_generic_16.sh b/tools/testing/selftests/ublk/test_generic_16.sh
new file mode 100755
index 000000000000..e08af7b685c9
--- /dev/null
+++ b/tools/testing/selftests/ublk/test_generic_16.sh
@@ -0,0 +1,57 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
+
+TID="generic_16"
+ERR_CODE=0
+
+_prep_test "null" "stop --safe command"
+
+# Check if SAFE_STOP_DEV feature is supported
+if ! _have_feature "SAFE_STOP_DEV"; then
+	_cleanup_test "null"
+	exit "$UBLK_SKIP_CODE"
+fi
+
+# Test 1: stop --safe on idle device should succeed
+dev_id=$(_add_ublk_dev -t null -q 2 -d 32)
+_check_add_dev $TID $?
+
+# Device is idle (no openers), stop --safe should succeed
+if ! ${UBLK_PROG} stop -n "${dev_id}" --safe; then
+	echo "stop --safe on idle device failed unexpectedly!"
+	ERR_CODE=255
+fi
+
+# Clean up device
+${UBLK_PROG} del -n "${dev_id}" > /dev/null 2>&1
+udevadm settle
+
+# Test 2: stop --safe on device with active opener should fail
+dev_id=$(_add_ublk_dev -t null -q 2 -d 32)
+_check_add_dev $TID $?
+
+# Open device in background (dd reads indefinitely)
+dd if=/dev/ublkb${dev_id} of=/dev/null bs=4k iflag=direct > /dev/null 2>&1 &
+dd_pid=$!
+
+# Give dd time to start
+sleep 0.2
+
+# Device has active opener, stop --safe should fail with -EBUSY
+if ${UBLK_PROG} stop -n "${dev_id}" --safe 2>/dev/null; then
+	echo "stop --safe on busy device succeeded unexpectedly!"
+	ERR_CODE=255
+fi
+
+# Kill dd and clean up
+kill $dd_pid 2>/dev/null
+wait $dd_pid 2>/dev/null
+
+# Now device should be idle, regular delete should work
+${UBLK_PROG} del -n "${dev_id}"
+udevadm settle
+
+_cleanup_test "null"
+_show_result $TID $ERR_CODE
-- 
cgit v1.2.3


From ba7f1024a1024f219a18c40b4ab2d7d900fd2d15 Mon Sep 17 00:00:00 2001
From: Sami Tolvanen <samitolvanen@google.com>
Date: Sat, 10 Jan 2026 08:25:52 +0000
Subject: selftests/bpf: Use the correct destructor kfunc type

With CONFIG_CFI enabled, the kernel strictly enforces that indirect
function calls use a function pointer type that matches the target
function. As bpf_testmod_ctx_release() signature differs from the
btf_dtor_kfunc_t pointer type used for the destructor calls in
bpf_obj_free_fields(), add a stub function with the correct type to
fix the type mismatch.

Signed-off-by: Sami Tolvanen <samitolvanen@google.com>
Acked-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/r/20260110082548.113748-9-samitolvanen@google.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/test_kmods/bpf_testmod.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c
index 1c41d03bd5a1..bc07ce9d5477 100644
--- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c
+++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c
@@ -285,6 +285,12 @@ __bpf_kfunc void bpf_testmod_ctx_release(struct bpf_testmod_ctx *ctx)
 		call_rcu(&ctx->rcu, testmod_free_cb);
 }
 
+__bpf_kfunc void bpf_testmod_ctx_release_dtor(void *ctx)
+{
+	bpf_testmod_ctx_release(ctx);
+}
+CFI_NOSEAL(bpf_testmod_ctx_release_dtor);
+
 static struct bpf_testmod_ops3 *st_ops3;
 
 static int bpf_testmod_test_3(void)
@@ -707,7 +713,7 @@ BTF_KFUNCS_END(bpf_testmod_common_kfunc_ids)
 
 BTF_ID_LIST(bpf_testmod_dtor_ids)
 BTF_ID(struct, bpf_testmod_ctx)
-BTF_ID(func, bpf_testmod_ctx_release)
+BTF_ID(func, bpf_testmod_ctx_release_dtor)
 
 static const struct btf_kfunc_id_set bpf_testmod_common_kfunc_set = {
 	.owner = THIS_MODULE,
-- 
cgit v1.2.3


From 088f35ab9fd4a03b8c6ccdda7b92461d92bf7b8b Mon Sep 17 00:00:00 2001
From: Ankit Khushwaha <ankitkhushwaha.linux@gmail.com>
Date: Fri, 9 Jan 2026 20:52:01 +0530
Subject: selftests/net/ipsec: Fix variable size type not at the end of struct

The "struct alg" object contains a union of 3 xfrm structures:

	union {
		struct xfrm_algo;
		struct xfrm_algo_aead;
		struct xfrm_algo_auth;
	}

All of them end with a flexible array member used to store key material,
but the flexible array appears at *different offsets* in each struct.
bcz of this, union itself is of variable-sized & Placing it above
char buf[...] triggers:

ipsec.c:835:5: warning: field 'u' with variable sized type 'union
(unnamed union at ipsec.c:831:3)' not at the end of a struct or class
is a GNU extension [-Wgnu-variable-sized-type-not-at-end]
  835 |                 } u;
      |                   ^

one fix is to use "TRAILING_OVERLAP()" which works with one flexible
array member only.

But In "struct alg" flexible array member exists in all union members,
but not at the same offset, so TRAILING_OVERLAP cannot be applied.

so the fix is to explicitly overlay the key buffer at the correct offset
for the largest union member (xfrm_algo_auth). This ensures that the
flexible-array region and the fixed buffer line up.

No functional change.

Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Ankit Khushwaha <ankitkhushwaha.linux@gmail.com>
Link: https://patch.msgid.link/20260109152201.15668-1-ankitkhushwaha.linux@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/ipsec.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/ipsec.c b/tools/testing/selftests/net/ipsec.c
index 0ccf484b1d9d..f4afef51b930 100644
--- a/tools/testing/selftests/net/ipsec.c
+++ b/tools/testing/selftests/net/ipsec.c
@@ -43,6 +43,10 @@
 
 #define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)]))
 
+#ifndef offsetof
+#define offsetof(TYPE, MEMBER)	__builtin_offsetof(TYPE, MEMBER)
+#endif
+
 #define IPV4_STR_SZ	16	/* xxx.xxx.xxx.xxx is longest + \0 */
 #define MAX_PAYLOAD	2048
 #define XFRM_ALGO_KEY_BUF_SIZE	512
@@ -827,13 +831,16 @@ static int xfrm_fill_key(char *name, char *buf,
 static int xfrm_state_pack_algo(struct nlmsghdr *nh, size_t req_sz,
 		struct xfrm_desc *desc)
 {
-	struct {
+	union {
 		union {
 			struct xfrm_algo	alg;
 			struct xfrm_algo_aead	aead;
 			struct xfrm_algo_auth	auth;
 		} u;
-		char buf[XFRM_ALGO_KEY_BUF_SIZE];
+		struct {
+			unsigned char __offset_to_FAM[offsetof(struct xfrm_algo_auth, alg_key)];
+			char buf[XFRM_ALGO_KEY_BUF_SIZE];
+		};
 	} alg = {};
 	size_t alen, elen, clen, aelen;
 	unsigned short type;
-- 
cgit v1.2.3


From 6ea8a206108fe8b5940c2797afc54ae9f5a7bbdd Mon Sep 17 00:00:00 2001
From: Costa Shulyupin <costa.shul@redhat.com>
Date: Mon, 12 Jan 2026 21:26:41 +0200
Subject: rtla: Fix parse_cpu_set() bug introduced by strtoi()

The patch 'Replace atoi() with a robust strtoi()' introduced a bug
in parse_cpu_set(), which relies on partial parsing of the input string.

The function parses CPU specifications like '0-3,5' by incrementing
a pointer through the string. strtoi() rejects strings with trailing
characters, causing parse_cpu_set() to fail on any CPU list with
multiple entries.

Restore the original use of atoi() in parse_cpu_set().

Fixes: 7e9dfccf8f11 ("rtla: Replace atoi() with a robust strtoi()")
Signed-off-by: Costa Shulyupin <costa.shul@redhat.com>
Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Link: https://lore.kernel.org/r/20260112192642.212848-2-costa.shul@redhat.com
Signed-off-by: Tomas Glozar <tglozar@redhat.com>
---
 tools/tracing/rtla/src/utils.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

(limited to 'tools')

diff --git a/tools/tracing/rtla/src/utils.c b/tools/tracing/rtla/src/utils.c
index 18986a5aed3c..0da3b2470c31 100644
--- a/tools/tracing/rtla/src/utils.c
+++ b/tools/tracing/rtla/src/utils.c
@@ -128,18 +128,16 @@ int parse_cpu_set(char *cpu_list, cpu_set_t *set)
 	nr_cpus = sysconf(_SC_NPROCESSORS_CONF);
 
 	for (p = cpu_list; *p; ) {
-		if (strtoi(p, &cpu))
-			goto err;
-		if (cpu < 0 || cpu >= nr_cpus)
+		cpu = atoi(p);
+		if (cpu < 0 || (!cpu && *p != '0') || cpu >= nr_cpus)
 			goto err;
 
 		while (isdigit(*p))
 			p++;
 		if (*p == '-') {
 			p++;
-			if (strtoi(p, &end_cpu))
-				goto err;
-			if (end_cpu < cpu || end_cpu >= nr_cpus)
+			end_cpu = atoi(p);
+			if (end_cpu < cpu || (!end_cpu && *p != '0') || end_cpu >= nr_cpus)
 				goto err;
 			while (isdigit(*p))
 				p++;
-- 
cgit v1.2.3


From 602544773763da411ffa67567fa1d146f3a40231 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <djwong@kernel.org>
Date: Mon, 12 Jan 2026 16:31:09 -0800
Subject: uapi: promote EFSCORRUPTED and EUCLEAN to errno.h

Stop definining these privately and instead move them to the uapi
errno.h so that they become canonical instead of copy pasta.

Cc: linux-api@vger.kernel.org
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Link: https://patch.msgid.link/176826402587.3490369.17659117524205214600.stgit@frogsfrogsfrogs
Reviewed-by: Gao Xiang <hsiangkao@linux.alibaba.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 arch/alpha/include/uapi/asm/errno.h        | 2 ++
 arch/mips/include/uapi/asm/errno.h         | 2 ++
 arch/parisc/include/uapi/asm/errno.h       | 2 ++
 arch/sparc/include/uapi/asm/errno.h        | 2 ++
 fs/erofs/internal.h                        | 2 --
 fs/ext2/ext2.h                             | 1 -
 fs/ext4/ext4.h                             | 3 ---
 fs/f2fs/f2fs.h                             | 3 ---
 fs/minix/minix.h                           | 2 --
 fs/udf/udf_sb.h                            | 2 --
 fs/xfs/xfs_linux.h                         | 2 --
 include/linux/jbd2.h                       | 3 ---
 include/uapi/asm-generic/errno.h           | 2 ++
 tools/arch/alpha/include/uapi/asm/errno.h  | 2 ++
 tools/arch/mips/include/uapi/asm/errno.h   | 2 ++
 tools/arch/parisc/include/uapi/asm/errno.h | 2 ++
 tools/arch/sparc/include/uapi/asm/errno.h  | 2 ++
 tools/include/uapi/asm-generic/errno.h     | 2 ++
 18 files changed, 20 insertions(+), 18 deletions(-)

(limited to 'tools')

diff --git a/arch/alpha/include/uapi/asm/errno.h b/arch/alpha/include/uapi/asm/errno.h
index 3d265f6babaf..6791f6508632 100644
--- a/arch/alpha/include/uapi/asm/errno.h
+++ b/arch/alpha/include/uapi/asm/errno.h
@@ -55,6 +55,7 @@
 #define	ENOSR		82	/* Out of streams resources */
 #define	ETIME		83	/* Timer expired */
 #define	EBADMSG		84	/* Not a data message */
+#define	EFSBADCRC	EBADMSG	/* Bad CRC detected */
 #define	EPROTO		85	/* Protocol error */
 #define	ENODATA		86	/* No data available */
 #define	ENOSTR		87	/* Device not a stream */
@@ -96,6 +97,7 @@
 #define	EREMCHG		115	/* Remote address changed */
 
 #define	EUCLEAN		117	/* Structure needs cleaning */
+#define	EFSCORRUPTED	EUCLEAN	/* Filesystem is corrupted */
 #define	ENOTNAM		118	/* Not a XENIX named type file */
 #define	ENAVAIL		119	/* No XENIX semaphores available */
 #define	EISNAM		120	/* Is a named type file */
diff --git a/arch/mips/include/uapi/asm/errno.h b/arch/mips/include/uapi/asm/errno.h
index 2fb714e2d6d8..c01ed91b1ef4 100644
--- a/arch/mips/include/uapi/asm/errno.h
+++ b/arch/mips/include/uapi/asm/errno.h
@@ -50,6 +50,7 @@
 #define EDOTDOT		73	/* RFS specific error */
 #define EMULTIHOP	74	/* Multihop attempted */
 #define EBADMSG		77	/* Not a data message */
+#define EFSBADCRC	EBADMSG	/* Bad CRC detected */
 #define ENAMETOOLONG	78	/* File name too long */
 #define EOVERFLOW	79	/* Value too large for defined data type */
 #define ENOTUNIQ	80	/* Name not unique on network */
@@ -88,6 +89,7 @@
 #define EISCONN		133	/* Transport endpoint is already connected */
 #define ENOTCONN	134	/* Transport endpoint is not connected */
 #define EUCLEAN		135	/* Structure needs cleaning */
+#define EFSCORRUPTED	EUCLEAN	/* Filesystem is corrupted */
 #define ENOTNAM		137	/* Not a XENIX named type file */
 #define ENAVAIL		138	/* No XENIX semaphores available */
 #define EISNAM		139	/* Is a named type file */
diff --git a/arch/parisc/include/uapi/asm/errno.h b/arch/parisc/include/uapi/asm/errno.h
index 8d94739d75c6..8cbc07c1903e 100644
--- a/arch/parisc/include/uapi/asm/errno.h
+++ b/arch/parisc/include/uapi/asm/errno.h
@@ -36,6 +36,7 @@
 
 #define	EDOTDOT		66	/* RFS specific error */
 #define	EBADMSG		67	/* Not a data message */
+#define	EFSBADCRC	EBADMSG	/* Bad CRC detected */
 #define	EUSERS		68	/* Too many users */
 #define	EDQUOT		69	/* Quota exceeded */
 #define	ESTALE		70	/* Stale file handle */
@@ -62,6 +63,7 @@
 #define	ERESTART	175	/* Interrupted system call should be restarted */
 #define	ESTRPIPE	176	/* Streams pipe error */
 #define	EUCLEAN		177	/* Structure needs cleaning */
+#define	EFSCORRUPTED	EUCLEAN	/* Filesystem is corrupted */
 #define	ENOTNAM		178	/* Not a XENIX named type file */
 #define	ENAVAIL		179	/* No XENIX semaphores available */
 #define	EISNAM		180	/* Is a named type file */
diff --git a/arch/sparc/include/uapi/asm/errno.h b/arch/sparc/include/uapi/asm/errno.h
index 81a732b902ee..4a41e7835fd5 100644
--- a/arch/sparc/include/uapi/asm/errno.h
+++ b/arch/sparc/include/uapi/asm/errno.h
@@ -48,6 +48,7 @@
 #define	ENOSR		74	/* Out of streams resources */
 #define	ENOMSG		75	/* No message of desired type */
 #define	EBADMSG		76	/* Not a data message */
+#define	EFSBADCRC	EBADMSG	/* Bad CRC detected */
 #define	EIDRM		77	/* Identifier removed */
 #define	EDEADLK		78	/* Resource deadlock would occur */
 #define	ENOLCK		79	/* No record locks available */
@@ -91,6 +92,7 @@
 #define	ENOTUNIQ	115	/* Name not unique on network */
 #define	ERESTART	116	/* Interrupted syscall should be restarted */
 #define	EUCLEAN		117	/* Structure needs cleaning */
+#define	EFSCORRUPTED	EUCLEAN	/* Filesystem is corrupted */
 #define	ENOTNAM		118	/* Not a XENIX named type file */
 #define	ENAVAIL		119	/* No XENIX semaphores available */
 #define	EISNAM		120	/* Is a named type file */
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index f7f622836198..d06e99baf5d5 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -541,6 +541,4 @@ long erofs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
 long erofs_compat_ioctl(struct file *filp, unsigned int cmd,
 			unsigned long arg);
 
-#define EFSCORRUPTED    EUCLEAN         /* Filesystem is corrupted */
-
 #endif	/* __EROFS_INTERNAL_H */
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index cf97b76e9fd3..5e0c6c5fcb6c 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -357,7 +357,6 @@ struct ext2_inode {
  */
 #define	EXT2_VALID_FS			0x0001	/* Unmounted cleanly */
 #define	EXT2_ERROR_FS			0x0002	/* Errors detected */
-#define	EFSCORRUPTED			EUCLEAN	/* Filesystem is corrupted */
 
 /*
  * Mount flags
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 56112f201cac..62c091b52bac 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -3938,7 +3938,4 @@ extern int ext4_block_write_begin(handle_t *handle, struct folio *folio,
 				  get_block_t *get_block);
 #endif	/* __KERNEL__ */
 
-#define EFSBADCRC	EBADMSG		/* Bad CRC detected */
-#define EFSCORRUPTED	EUCLEAN		/* Filesystem is corrupted */
-
 #endif	/* _EXT4_H */
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 20edbb99b814..9f3aa3c7f126 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -5004,7 +5004,4 @@ static inline void f2fs_invalidate_internal_cache(struct f2fs_sb_info *sbi,
 	f2fs_invalidate_compress_pages_range(sbi, blkaddr, len);
 }
 
-#define EFSBADCRC	EBADMSG		/* Bad CRC detected */
-#define EFSCORRUPTED	EUCLEAN		/* Filesystem is corrupted */
-
 #endif /* _LINUX_F2FS_H */
diff --git a/fs/minix/minix.h b/fs/minix/minix.h
index 2bfaf377f208..7e1f652f16d3 100644
--- a/fs/minix/minix.h
+++ b/fs/minix/minix.h
@@ -175,6 +175,4 @@ static inline int minix_test_bit(int nr, const void *vaddr)
 	__minix_error_inode((inode), __func__, __LINE__,	\
 			    (fmt), ##__VA_ARGS__)
 
-#define EFSCORRUPTED	EUCLEAN		/* Filesystem is corrupted */
-
 #endif /* FS_MINIX_H */
diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h
index 08ec8756b948..8399accc788d 100644
--- a/fs/udf/udf_sb.h
+++ b/fs/udf/udf_sb.h
@@ -55,8 +55,6 @@
 #define MF_DUPLICATE_MD		0x01
 #define MF_MIRROR_FE_LOADED	0x02
 
-#define EFSCORRUPTED EUCLEAN
-
 struct udf_meta_data {
 	__u32	s_meta_file_loc;
 	__u32	s_mirror_file_loc;
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index 4dd747bdbcca..55064228c4d5 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -121,8 +121,6 @@ typedef __u32			xfs_nlink_t;
 
 #define ENOATTR		ENODATA		/* Attribute not found */
 #define EWRONGFS	EINVAL		/* Mount with wrong filesystem type */
-#define EFSCORRUPTED	EUCLEAN		/* Filesystem is corrupted */
-#define EFSBADCRC	EBADMSG		/* Bad CRC detected */
 
 #define __return_address __builtin_return_address(0)
 
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index f5eaf76198f3..a53a00d36228 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -1815,7 +1815,4 @@ static inline int jbd2_handle_buffer_credits(handle_t *handle)
 
 #endif	/* __KERNEL__ */
 
-#define EFSBADCRC	EBADMSG		/* Bad CRC detected */
-#define EFSCORRUPTED	EUCLEAN		/* Filesystem is corrupted */
-
 #endif	/* _LINUX_JBD2_H */
diff --git a/include/uapi/asm-generic/errno.h b/include/uapi/asm-generic/errno.h
index cf9c51ac49f9..92e7ae493ee3 100644
--- a/include/uapi/asm-generic/errno.h
+++ b/include/uapi/asm-generic/errno.h
@@ -55,6 +55,7 @@
 #define	EMULTIHOP	72	/* Multihop attempted */
 #define	EDOTDOT		73	/* RFS specific error */
 #define	EBADMSG		74	/* Not a data message */
+#define	EFSBADCRC	EBADMSG	/* Bad CRC detected */
 #define	EOVERFLOW	75	/* Value too large for defined data type */
 #define	ENOTUNIQ	76	/* Name not unique on network */
 #define	EBADFD		77	/* File descriptor in bad state */
@@ -98,6 +99,7 @@
 #define	EINPROGRESS	115	/* Operation now in progress */
 #define	ESTALE		116	/* Stale file handle */
 #define	EUCLEAN		117	/* Structure needs cleaning */
+#define	EFSCORRUPTED	EUCLEAN	/* Filesystem is corrupted */
 #define	ENOTNAM		118	/* Not a XENIX named type file */
 #define	ENAVAIL		119	/* No XENIX semaphores available */
 #define	EISNAM		120	/* Is a named type file */
diff --git a/tools/arch/alpha/include/uapi/asm/errno.h b/tools/arch/alpha/include/uapi/asm/errno.h
index 3d265f6babaf..6791f6508632 100644
--- a/tools/arch/alpha/include/uapi/asm/errno.h
+++ b/tools/arch/alpha/include/uapi/asm/errno.h
@@ -55,6 +55,7 @@
 #define	ENOSR		82	/* Out of streams resources */
 #define	ETIME		83	/* Timer expired */
 #define	EBADMSG		84	/* Not a data message */
+#define	EFSBADCRC	EBADMSG	/* Bad CRC detected */
 #define	EPROTO		85	/* Protocol error */
 #define	ENODATA		86	/* No data available */
 #define	ENOSTR		87	/* Device not a stream */
@@ -96,6 +97,7 @@
 #define	EREMCHG		115	/* Remote address changed */
 
 #define	EUCLEAN		117	/* Structure needs cleaning */
+#define	EFSCORRUPTED	EUCLEAN	/* Filesystem is corrupted */
 #define	ENOTNAM		118	/* Not a XENIX named type file */
 #define	ENAVAIL		119	/* No XENIX semaphores available */
 #define	EISNAM		120	/* Is a named type file */
diff --git a/tools/arch/mips/include/uapi/asm/errno.h b/tools/arch/mips/include/uapi/asm/errno.h
index 2fb714e2d6d8..c01ed91b1ef4 100644
--- a/tools/arch/mips/include/uapi/asm/errno.h
+++ b/tools/arch/mips/include/uapi/asm/errno.h
@@ -50,6 +50,7 @@
 #define EDOTDOT		73	/* RFS specific error */
 #define EMULTIHOP	74	/* Multihop attempted */
 #define EBADMSG		77	/* Not a data message */
+#define EFSBADCRC	EBADMSG	/* Bad CRC detected */
 #define ENAMETOOLONG	78	/* File name too long */
 #define EOVERFLOW	79	/* Value too large for defined data type */
 #define ENOTUNIQ	80	/* Name not unique on network */
@@ -88,6 +89,7 @@
 #define EISCONN		133	/* Transport endpoint is already connected */
 #define ENOTCONN	134	/* Transport endpoint is not connected */
 #define EUCLEAN		135	/* Structure needs cleaning */
+#define EFSCORRUPTED	EUCLEAN	/* Filesystem is corrupted */
 #define ENOTNAM		137	/* Not a XENIX named type file */
 #define ENAVAIL		138	/* No XENIX semaphores available */
 #define EISNAM		139	/* Is a named type file */
diff --git a/tools/arch/parisc/include/uapi/asm/errno.h b/tools/arch/parisc/include/uapi/asm/errno.h
index 8d94739d75c6..8cbc07c1903e 100644
--- a/tools/arch/parisc/include/uapi/asm/errno.h
+++ b/tools/arch/parisc/include/uapi/asm/errno.h
@@ -36,6 +36,7 @@
 
 #define	EDOTDOT		66	/* RFS specific error */
 #define	EBADMSG		67	/* Not a data message */
+#define	EFSBADCRC	EBADMSG	/* Bad CRC detected */
 #define	EUSERS		68	/* Too many users */
 #define	EDQUOT		69	/* Quota exceeded */
 #define	ESTALE		70	/* Stale file handle */
@@ -62,6 +63,7 @@
 #define	ERESTART	175	/* Interrupted system call should be restarted */
 #define	ESTRPIPE	176	/* Streams pipe error */
 #define	EUCLEAN		177	/* Structure needs cleaning */
+#define	EFSCORRUPTED	EUCLEAN	/* Filesystem is corrupted */
 #define	ENOTNAM		178	/* Not a XENIX named type file */
 #define	ENAVAIL		179	/* No XENIX semaphores available */
 #define	EISNAM		180	/* Is a named type file */
diff --git a/tools/arch/sparc/include/uapi/asm/errno.h b/tools/arch/sparc/include/uapi/asm/errno.h
index 81a732b902ee..4a41e7835fd5 100644
--- a/tools/arch/sparc/include/uapi/asm/errno.h
+++ b/tools/arch/sparc/include/uapi/asm/errno.h
@@ -48,6 +48,7 @@
 #define	ENOSR		74	/* Out of streams resources */
 #define	ENOMSG		75	/* No message of desired type */
 #define	EBADMSG		76	/* Not a data message */
+#define	EFSBADCRC	EBADMSG	/* Bad CRC detected */
 #define	EIDRM		77	/* Identifier removed */
 #define	EDEADLK		78	/* Resource deadlock would occur */
 #define	ENOLCK		79	/* No record locks available */
@@ -91,6 +92,7 @@
 #define	ENOTUNIQ	115	/* Name not unique on network */
 #define	ERESTART	116	/* Interrupted syscall should be restarted */
 #define	EUCLEAN		117	/* Structure needs cleaning */
+#define	EFSCORRUPTED	EUCLEAN	/* Filesystem is corrupted */
 #define	ENOTNAM		118	/* Not a XENIX named type file */
 #define	ENAVAIL		119	/* No XENIX semaphores available */
 #define	EISNAM		120	/* Is a named type file */
diff --git a/tools/include/uapi/asm-generic/errno.h b/tools/include/uapi/asm-generic/errno.h
index cf9c51ac49f9..92e7ae493ee3 100644
--- a/tools/include/uapi/asm-generic/errno.h
+++ b/tools/include/uapi/asm-generic/errno.h
@@ -55,6 +55,7 @@
 #define	EMULTIHOP	72	/* Multihop attempted */
 #define	EDOTDOT		73	/* RFS specific error */
 #define	EBADMSG		74	/* Not a data message */
+#define	EFSBADCRC	EBADMSG	/* Bad CRC detected */
 #define	EOVERFLOW	75	/* Value too large for defined data type */
 #define	ENOTUNIQ	76	/* Name not unique on network */
 #define	EBADFD		77	/* File descriptor in bad state */
@@ -98,6 +99,7 @@
 #define	EINPROGRESS	115	/* Operation now in progress */
 #define	ESTALE		116	/* Stale file handle */
 #define	EUCLEAN		117	/* Structure needs cleaning */
+#define	EFSCORRUPTED	EUCLEAN	/* Filesystem is corrupted */
 #define	ENOTNAM		118	/* Not a XENIX named type file */
 #define	ENAVAIL		119	/* No XENIX semaphores available */
 #define	EISNAM		120	/* Is a named type file */
-- 
cgit v1.2.3


From 26bea10450afe5ad4dd0e0bbb797c44e1df110fe Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Tue, 6 Jan 2026 12:13:15 +0100
Subject: objtool: fix compilation failure with the x32 toolchain

When using the x32 toolchain, compilation fails because the printf
specifier "%lx" (long), doesn't match the type of the "checksum" variable
(long long). Fix this by changing the printf specifier to "%llx" and
casting "checksum" to unsigned long long.

Fixes: a3493b33384a ("objtool/klp: Add --debug-checksum=<funcs> to show per-instruction checksums")

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/a1158c99-fe0e-a218-4b5b-ffac212489f6@redhat.com
---
 tools/objtool/include/objtool/warn.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/objtool/include/objtool/warn.h b/tools/objtool/include/objtool/warn.h
index 25ff7942b4d5..2b27b54096b8 100644
--- a/tools/objtool/include/objtool/warn.h
+++ b/tools/objtool/include/objtool/warn.h
@@ -152,8 +152,8 @@ static inline void unindent(int *unused) { indent--; }
 	if (unlikely(insn->sym && insn->sym->pfunc &&			\
 		     insn->sym->pfunc->debug_checksum)) {		\
 		char *insn_off = offstr(insn->sec, insn->offset);	\
-		__dbg("checksum: %s %s %016lx",				\
-		      func->name, insn_off, checksum);			\
+		__dbg("checksum: %s %s %016llx",			\
+		      func->name, insn_off, (unsigned long long)checksum);\
 		free(insn_off);						\
 	}								\
 })
-- 
cgit v1.2.3


From 436326bc525d467e38db1da576139ec5f28268c5 Mon Sep 17 00:00:00 2001
From: Sasha Levin <sashal@kernel.org>
Date: Tue, 23 Dec 2025 07:03:57 -0500
Subject: objtool: fix build failure due to missing libopcodes check

Commit 59953303827e ("objtool: Disassemble code with libopcodes instead
of running objdump") added support for using libopcodes for disassembly.
However, the feature detection checks for libbfd availability but then
unconditionally links against libopcodes:

  ifeq ($(feature-libbfd),1)
      OBJTOOL_LDFLAGS += -lopcodes
  endif

This causes build failures in environments where libbfd is installed but
libopcodes is not, since the test-libbfd.c feature test only links
against -lbfd and -ldl, not -lopcodes:

  /usr/bin/ld: cannot find -lopcodes: No such file or directory
  collect2: error: ld returned 1 exit status
  make[4]: *** [Makefile:109: objtool] Error 1

Additionally, the shared feature framework uses $(CC) which is the
cross-compiler in cross-compilation builds. Since objtool is a host tool
that links with $(HOSTCC) against host libraries, the feature detection
can falsely report libopcodes as available when the cross-compiler's
sysroot has it but the host system doesn't.

Fix this by replacing the feature framework check with a direct inline
test that uses $(HOSTCC) to compile and link a test program against
libopcodes, similar to how xxhash availability is detected.

Fixes: 59953303827e ("objtool: Disassemble code with libopcodes instead of running objdump")
Assisted-by: claude-opus-4-5-20251101
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20251223120357.2492008-1-sashal@kernel.org
---
 tools/objtool/Makefile | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

(limited to 'tools')

diff --git a/tools/objtool/Makefile b/tools/objtool/Makefile
index ad6e1ec706ce..9b4503113ce5 100644
--- a/tools/objtool/Makefile
+++ b/tools/objtool/Makefile
@@ -72,23 +72,27 @@ HOST_OVERRIDES := CC="$(HOSTCC)" LD="$(HOSTLD)" AR="$(HOSTAR)"
 
 #
 # To support disassembly, objtool needs libopcodes which is provided
-# with libbdf (binutils-dev or binutils-devel package).
+# with libbfd (binutils-dev or binutils-devel package).
 #
-FEATURE_USER = .objtool
-FEATURE_TESTS = libbfd disassembler-init-styled
-FEATURE_DISPLAY =
-include $(srctree)/tools/build/Makefile.feature
+# We check using HOSTCC directly rather than the shared feature framework
+# because objtool is a host tool that links against host libraries.
+#
+HAVE_LIBOPCODES := $(shell echo 'int main(void) { return 0; }' | \
+			$(HOSTCC) -xc - -o /dev/null -lopcodes 2>/dev/null && echo y)
 
-ifeq ($(feature-disassembler-init-styled), 1)
-	OBJTOOL_CFLAGS += -DDISASM_INIT_STYLED
-endif
+# Styled disassembler support requires binutils >= 2.39
+HAVE_DISASM_STYLED := $(shell echo '$(pound)include <dis-asm.h>' | \
+			$(HOSTCC) -E -xc - 2>/dev/null | grep -q disassembler_style && echo y)
 
 BUILD_DISAS := n
 
-ifeq ($(feature-libbfd),1)
+ifeq ($(HAVE_LIBOPCODES),y)
 	BUILD_DISAS := y
-	OBJTOOL_CFLAGS += -DDISAS -DPACKAGE="objtool"
+	OBJTOOL_CFLAGS += -DDISAS -DPACKAGE='"objtool"'
 	OBJTOOL_LDFLAGS += -lopcodes
+ifeq ($(HAVE_DISASM_STYLED),y)
+	OBJTOOL_CFLAGS += -DDISASM_INIT_STYLED
+endif
 endif
 
 export BUILD_DISAS
-- 
cgit v1.2.3


From 8d61f1a9f2541c6ef51d4997e6a4c5a1c0d8b27c Mon Sep 17 00:00:00 2001
From: Jonas Köppeler <j.koeppeler@tu-berlin.de>
Date: Fri, 9 Jan 2026 14:15:35 +0100
Subject: selftests/tc-testing: add selftests for cake_mq qdisc
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Test 684b: Create CAKE_MQ with default setting (4 queues)
Test 7ee8: Create CAKE_MQ with bandwidth limit (4 queues)
Test 1f87: Create CAKE_MQ with rtt time (4 queues)
Test e9cf: Create CAKE_MQ with besteffort flag (4 queues)
Test 7c05: Create CAKE_MQ with diffserv8 flag (4 queues)
Test 5a77: Create CAKE_MQ with diffserv4 flag (4 queues)
Test 8f7a: Create CAKE_MQ with flowblind flag (4 queues)
Test 7ef7: Create CAKE_MQ with dsthost and nat flag (4 queues)
Test 2e4d: Create CAKE_MQ with wash flag (4 queues)
Test b3e6: Create CAKE_MQ with flowblind and no-split-gso flag (4 queues)
Test 62cd: Create CAKE_MQ with dual-srchost and ack-filter flag (4 queues)
Test 0df3: Create CAKE_MQ with dual-dsthost and ack-filter-aggressive flag (4 queues)
Test 9a75: Create CAKE_MQ with memlimit and ptm flag (4 queues)
Test cdef: Create CAKE_MQ with fwmark and atm flag (4 queues)
Test 93dd: Create CAKE_MQ with overhead 0 and mpu (4 queues)
Test 1475: Create CAKE_MQ with conservative and ingress flag (4 queues)
Test 7bf1: Delete CAKE_MQ with conservative and ingress flag (4 queues)
Test ee55: Replace CAKE_MQ with mpu (4 queues)
Test 6df9: Change CAKE_MQ with mpu (4 queues)
Test 67e2: Show CAKE_MQ class (4 queues)
Test 2de4: Change bandwidth of CAKE_MQ (4 queues)
Test 5f62: Fail to create CAKE_MQ with autorate-ingress flag (4 queues)
Test 038e: Fail to change setting of sub-qdisc under CAKE_MQ
Test 7bdc: Fail to replace sub-qdisc under CAKE_MQ
Test 18e0: Fail to install CAKE_MQ on single queue device

Reviewed-by: Victor Nogueira <victor@mojatatu.com>
Signed-off-by: Jonas Köppeler <j.koeppeler@tu-berlin.de>
Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
Link: https://patch.msgid.link/20260109-mq-cake-sub-qdisc-v8-6-8d613fece5d8@redhat.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 .../tc-testing/tc-tests/qdiscs/cake_mq.json        | 559 +++++++++++++++++++++
 1 file changed, 559 insertions(+)
 create mode 100644 tools/testing/selftests/tc-testing/tc-tests/qdiscs/cake_mq.json

(limited to 'tools')

diff --git a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/cake_mq.json b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/cake_mq.json
new file mode 100644
index 000000000000..0efe229fb86e
--- /dev/null
+++ b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/cake_mq.json
@@ -0,0 +1,559 @@
+[
+    {
+        "id": "684b",
+        "name": "Create CAKE_MQ with default setting (4 queues)",
+        "category": [
+            "qdisc",
+            "cake_mq"
+        ],
+        "plugins": {
+            "requires": "nsPlugin"
+        },
+        "setup": [
+            "echo \"1\" > /sys/bus/netdevsim/del_device || true",
+            "echo \"1 1 4\" > /sys/bus/netdevsim/new_device"
+        ],
+        "cmdUnderTest": "$TC qdisc add dev $ETH handle 1: root cake_mq",
+        "expExitCode": "0",
+        "verifyCmd": "$TC qdisc show dev $ETH",
+        "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth unlimited diffserv3 triple-isolate nonat nowash no-ack-filter split-gso rtt 100ms raw overhead 0 ",
+        "matchCount": "5",
+        "teardown": [
+            "echo \"1\" > /sys/bus/netdevsim/del_device"
+        ]
+    },
+    {
+        "id": "7ee8",
+        "name": "Create CAKE_MQ with bandwidth limit (4 queues)",
+        "category": [
+            "qdisc",
+            "cake_mq"
+        ],
+        "plugins": {
+            "requires": "nsPlugin"
+        },
+        "setup": [
+            "echo \"1 1 4\" > /sys/bus/netdevsim/new_device"
+        ],
+        "cmdUnderTest": "$TC qdisc add dev $ETH handle 1: root cake_mq bandwidth 1000",
+        "expExitCode": "0",
+        "verifyCmd": "$TC qdisc show dev $ETH",
+        "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth 1Kbit diffserv3 triple-isolate nonat nowash no-ack-filter split-gso rtt 100ms raw overhead 0 ",
+        "matchCount": "5",
+        "teardown": [
+            "echo \"1\" > /sys/bus/netdevsim/del_device"
+        ]
+    },
+    {
+        "id": "1f87",
+        "name": "Create CAKE_MQ with rtt time (4 queues)",
+        "category": [
+            "qdisc",
+            "cake_mq"
+        ],
+        "plugins": {
+            "requires": "nsPlugin"
+        },
+        "setup": [
+            "echo \"1 1 4\" > /sys/bus/netdevsim/new_device"
+        ],
+        "cmdUnderTest": "$TC qdisc add dev $ETH handle 1: root cake_mq rtt 200",
+        "expExitCode": "0",
+        "verifyCmd": "$TC qdisc show dev $ETH",
+        "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth unlimited diffserv3 triple-isolate nonat nowash no-ack-filter split-gso rtt 200us raw overhead 0 ",
+        "matchCount": "5",
+        "teardown": [
+            "echo \"1\" > /sys/bus/netdevsim/del_device"
+        ]
+    },
+    {
+        "id": "e9cf",
+        "name": "Create CAKE_MQ with besteffort flag (4 queues)",
+        "category": [
+            "qdisc",
+            "cake_mq"
+        ],
+        "plugins": {
+            "requires": "nsPlugin"
+        },
+        "setup": [
+            "echo \"1 1 4\" > /sys/bus/netdevsim/new_device"
+        ],
+        "cmdUnderTest": "$TC qdisc add dev $ETH handle 1: root cake_mq besteffort",
+        "expExitCode": "0",
+        "verifyCmd": "$TC qdisc show dev $ETH",
+        "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth unlimited besteffort triple-isolate nonat nowash no-ack-filter split-gso rtt 100ms raw overhead 0 ",
+        "matchCount": "5",
+        "teardown": [
+            "echo \"1\" > /sys/bus/netdevsim/del_device"
+        ]
+    },
+    {
+        "id": "7c05",
+        "name": "Create CAKE_MQ with diffserv8 flag (4 queues)",
+        "category": [
+            "qdisc",
+            "cake_mq"
+        ],
+        "plugins": {
+            "requires": "nsPlugin"
+        },
+        "setup": [
+            "echo \"1 1 4\" > /sys/bus/netdevsim/new_device"
+        ],
+        "cmdUnderTest": "$TC qdisc add dev $ETH handle 1: root cake_mq diffserv8",
+        "expExitCode": "0",
+        "verifyCmd": "$TC qdisc show dev $ETH",
+        "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth unlimited diffserv8 triple-isolate nonat nowash no-ack-filter split-gso rtt 100ms raw overhead 0 ",
+        "matchCount": "5",
+        "teardown": [
+            "echo \"1\" > /sys/bus/netdevsim/del_device"
+        ]
+    },
+    {
+        "id": "5a77",
+        "name": "Create CAKE_MQ with diffserv4 flag (4 queues)",
+        "category": [
+            "qdisc",
+            "cake_mq"
+        ],
+        "plugins": {
+            "requires": "nsPlugin"
+        },
+        "setup": [
+            "echo \"1 1 4\" > /sys/bus/netdevsim/new_device"
+        ],
+        "cmdUnderTest": "$TC qdisc add dev $ETH handle 1: root cake_mq diffserv4",
+        "expExitCode": "0",
+        "verifyCmd": "$TC qdisc show dev $ETH",
+        "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth unlimited diffserv4 triple-isolate nonat nowash no-ack-filter split-gso rtt 100ms raw overhead 0 ",
+        "matchCount": "5",
+        "teardown": [
+            "echo \"1\" > /sys/bus/netdevsim/del_device"
+        ]
+    },
+    {
+        "id": "8f7a",
+        "name": "Create CAKE_MQ with flowblind flag (4 queues)",
+        "category": [
+            "qdisc",
+            "cake_mq"
+        ],
+        "plugins": {
+            "requires": "nsPlugin"
+        },
+        "setup": [
+            "echo \"1 1 4\" > /sys/bus/netdevsim/new_device"
+        ],
+        "cmdUnderTest": "$TC qdisc add dev $ETH handle 1: root cake_mq flowblind",
+        "expExitCode": "0",
+        "verifyCmd": "$TC qdisc show dev $ETH",
+        "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth unlimited diffserv3 flowblind nonat nowash no-ack-filter split-gso rtt 100ms raw overhead 0 ",
+        "matchCount": "5",
+        "teardown": [
+            "echo \"1\" > /sys/bus/netdevsim/del_device"
+        ]
+    },
+    {
+        "id": "7ef7",
+        "name": "Create CAKE_MQ with dsthost and nat flag (4 queues)",
+        "category": [
+            "qdisc",
+            "cake_mq"
+        ],
+        "plugins": {
+            "requires": "nsPlugin"
+        },
+        "setup": [
+            "echo \"1 1 4\" > /sys/bus/netdevsim/new_device"
+        ],
+        "cmdUnderTest": "$TC qdisc add dev $ETH handle 1: root cake_mq dsthost nat",
+        "expExitCode": "0",
+        "verifyCmd": "$TC qdisc show dev $ETH",
+        "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth unlimited diffserv3 dsthost nat nowash no-ack-filter split-gso rtt 100ms raw overhead 0 ",
+        "matchCount": "5",
+        "teardown": [
+            "echo \"1\" > /sys/bus/netdevsim/del_device"
+        ]
+    },
+    {
+        "id": "2e4d",
+        "name": "Create CAKE_MQ with wash flag (4 queues)",
+        "category": [
+            "qdisc",
+            "cake_mq"
+        ],
+        "plugins": {
+            "requires": "nsPlugin"
+        },
+        "setup": [
+            "echo \"1 1 4\" > /sys/bus/netdevsim/new_device"
+        ],
+        "cmdUnderTest": "$TC qdisc add dev $ETH handle 1: root cake_mq hosts wash",
+        "expExitCode": "0",
+        "verifyCmd": "$TC qdisc show dev $ETH",
+        "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth unlimited diffserv3 hosts nonat wash no-ack-filter split-gso rtt 100ms raw overhead 0 ",
+        "matchCount": "5",
+        "teardown": [
+            "echo \"1\" > /sys/bus/netdevsim/del_device"
+        ]
+    },
+    {
+        "id": "b3e6",
+        "name": "Create CAKE_MQ with flowblind and no-split-gso flag (4 queues)",
+        "category": [
+            "qdisc",
+            "cake_mq"
+        ],
+        "plugins": {
+            "requires": "nsPlugin"
+        },
+        "setup": [
+            "echo \"1 1 4\" > /sys/bus/netdevsim/new_device"
+        ],
+        "cmdUnderTest": "$TC qdisc add dev $ETH handle 1: root cake_mq flowblind no-split-gso",
+        "expExitCode": "0",
+        "verifyCmd": "$TC qdisc show dev $ETH",
+        "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth unlimited diffserv3 flowblind nonat nowash no-ack-filter no-split-gso rtt 100ms raw overhead 0 ",
+        "matchCount": "5",
+        "teardown": [
+            "echo \"1\" > /sys/bus/netdevsim/del_device"
+        ]
+    },
+    {
+        "id": "62cd",
+        "name": "Create CAKE_MQ with dual-srchost and ack-filter flag (4 queues)",
+        "category": [
+            "qdisc",
+            "cake_mq"
+        ],
+        "plugins": {
+            "requires": "nsPlugin"
+        },
+        "setup": [
+            "echo \"1 1 4\" > /sys/bus/netdevsim/new_device"
+        ],
+        "cmdUnderTest": "$TC qdisc add dev $ETH handle 1: root cake_mq dual-srchost ack-filter",
+        "expExitCode": "0",
+        "verifyCmd": "$TC qdisc show dev $ETH",
+        "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth unlimited diffserv3 dual-srchost nonat nowash ack-filter split-gso rtt 100ms raw overhead 0 ",
+        "matchCount": "5",
+        "teardown": [
+            "echo \"1\" > /sys/bus/netdevsim/del_device"
+        ]
+    },
+    {
+        "id": "0df3",
+        "name": "Create CAKE_MQ with dual-dsthost and ack-filter-aggressive flag (4 queues)",
+        "category": [
+            "qdisc",
+            "cake_mq"
+        ],
+        "plugins": {
+            "requires": "nsPlugin"
+        },
+        "setup": [
+            "echo \"1 1 4\" > /sys/bus/netdevsim/new_device"
+        ],
+        "cmdUnderTest": "$TC qdisc add dev $ETH handle 1: root cake_mq dual-dsthost ack-filter-aggressive",
+        "expExitCode": "0",
+        "verifyCmd": "$TC qdisc show dev $ETH",
+        "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth unlimited diffserv3 dual-dsthost nonat nowash ack-filter-aggressive split-gso rtt 100ms raw overhead 0 ",
+        "matchCount": "5",
+        "teardown": [
+            "echo \"1\" > /sys/bus/netdevsim/del_device"
+        ]
+    },
+    {
+        "id": "9a75",
+        "name": "Create CAKE_MQ with memlimit and ptm flag (4 queues)",
+        "category": [
+            "qdisc",
+            "cake_mq"
+        ],
+        "plugins": {
+            "requires": "nsPlugin"
+        },
+        "setup": [
+            "echo \"1 1 4\" > /sys/bus/netdevsim/new_device"
+        ],
+        "cmdUnderTest": "$TC qdisc add dev $ETH handle 1: root cake_mq memlimit 10000 ptm",
+        "expExitCode": "0",
+        "verifyCmd": "$TC qdisc show dev $ETH",
+        "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth unlimited diffserv3 triple-isolate nonat nowash no-ack-filter split-gso rtt 100ms raw ptm overhead 0 memlimit 10000b ",
+        "matchCount": "5",
+        "teardown": [
+            "echo \"1\" > /sys/bus/netdevsim/del_device"
+        ]
+    },
+    {
+        "id": "cdef",
+        "name": "Create CAKE_MQ with fwmark and atm flag (4 queues)",
+        "category": [
+            "qdisc",
+            "cake_mq"
+        ],
+        "plugins": {
+            "requires": "nsPlugin"
+        },
+        "setup": [
+            "echo \"1 1 4\" > /sys/bus/netdevsim/new_device"
+        ],
+        "cmdUnderTest": "$TC qdisc add dev $ETH handle 1: root cake_mq fwmark 8 atm",
+        "expExitCode": "0",
+        "verifyCmd": "$TC qdisc show dev $ETH",
+        "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth unlimited diffserv3 triple-isolate nonat nowash no-ack-filter split-gso rtt 100ms raw atm overhead 0 fwmark 0x8 ",
+        "matchCount": "5",
+        "teardown": [
+            "echo \"1\" > /sys/bus/netdevsim/del_device"
+        ]
+    },
+    {
+        "id": "93dd",
+        "name": "Create CAKE_MQ with overhead 0 and mpu (4 queues)",
+        "category": [
+            "qdisc",
+            "cake_mq"
+        ],
+        "plugins": {
+            "requires": "nsPlugin"
+        },
+        "setup": [
+            "echo \"1 1 4\" > /sys/bus/netdevsim/new_device"
+        ],
+        "cmdUnderTest": "$TC qdisc add dev $ETH handle 1: root cake_mq overhead 128 mpu 256",
+        "expExitCode": "0",
+        "verifyCmd": "$TC qdisc show dev $ETH",
+        "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth unlimited diffserv3 triple-isolate nonat nowash no-ack-filter split-gso rtt 100ms noatm overhead 128 mpu 256 ",
+        "matchCount": "5",
+        "teardown": [
+            "echo \"1\" > /sys/bus/netdevsim/del_device"
+        ]
+    },
+    {
+        "id": "1475",
+        "name": "Create CAKE_MQ with conservative and ingress flag (4 queues)",
+        "category": [
+            "qdisc",
+            "cake_mq"
+        ],
+        "plugins": {
+            "requires": "nsPlugin"
+        },
+        "setup": [
+            "echo \"1 1 4\" > /sys/bus/netdevsim/new_device"
+        ],
+        "cmdUnderTest": "$TC qdisc add dev $ETH handle 1: root cake_mq conservative ingress",
+        "expExitCode": "0",
+        "verifyCmd": "$TC qdisc show dev $ETH",
+        "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth unlimited diffserv3 triple-isolate nonat nowash ingress no-ack-filter split-gso rtt 100ms atm overhead 48 ",
+        "matchCount": "5",
+        "teardown": [
+            "echo \"1\" > /sys/bus/netdevsim/del_device"
+        ]
+    },
+    {
+        "id": "7bf1",
+        "name": "Delete CAKE_MQ with conservative and ingress flag (4 queues)",
+        "category": [
+            "qdisc",
+            "cake_mq"
+        ],
+        "plugins": {
+            "requires": "nsPlugin"
+        },
+        "setup": [
+            "echo \"1 1 4\" > /sys/bus/netdevsim/new_device",
+            "$TC qdisc add dev $ETH handle 1: root cake_mq conservative ingress"
+        ],
+        "cmdUnderTest": "$TC qdisc del dev $ETH handle 1: root",
+        "expExitCode": "0",
+        "verifyCmd": "$TC qdisc show dev $ETH",
+        "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth unlimited diffserv3 triple-isolate nonat nowash ingress no-ack-filter split-gso rtt 100ms atm overhead 48 ",
+        "matchCount": "0",
+        "teardown": [
+            "echo \"1\" > /sys/bus/netdevsim/del_device"
+        ]
+    },
+    {
+        "id": "ee55",
+        "name": "Replace CAKE_MQ with mpu (4 queues)",
+        "category": [
+            "qdisc",
+            "cake_mq"
+        ],
+        "plugins": {
+            "requires": "nsPlugin"
+        },
+        "setup": [
+            "echo \"1 1 4\" > /sys/bus/netdevsim/new_device",
+            "$TC qdisc add dev $ETH handle 1: root cake_mq overhead 128 mpu 256"
+        ],
+        "cmdUnderTest": "$TC qdisc replace dev $ETH handle 1: root cake_mq mpu 128",
+        "expExitCode": "0",
+        "verifyCmd": "$TC qdisc show dev $ETH",
+        "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth unlimited diffserv3 triple-isolate nonat nowash no-ack-filter split-gso rtt 100ms noatm overhead 128 mpu 128 ",
+        "matchCount": "5",
+        "teardown": [
+            "echo \"1\" > /sys/bus/netdevsim/del_device"
+        ]
+    },
+    {
+        "id": "6df9",
+        "name": "Change CAKE_MQ with mpu (4 queues)",
+        "category": [
+            "qdisc",
+            "cake_mq"
+        ],
+        "plugins": {
+            "requires": "nsPlugin"
+        },
+        "setup": [
+            "echo \"1 1 4\" > /sys/bus/netdevsim/new_device",
+            "$TC qdisc add dev $ETH handle 1: root cake_mq overhead 128 mpu 256"
+        ],
+        "cmdUnderTest": "$TC qdisc change dev $ETH handle 1: root cake_mq mpu 128",
+        "expExitCode": "0",
+        "verifyCmd": "$TC qdisc show dev $ETH",
+        "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth unlimited diffserv3 triple-isolate nonat nowash no-ack-filter split-gso rtt 100ms noatm overhead 128 mpu 128 ",
+        "matchCount": "5",
+        "teardown": [
+            "echo \"1\" > /sys/bus/netdevsim/del_device"
+        ]
+    },
+    {
+        "id": "67e2",
+        "name": "Show CAKE_MQ class (4 queues)",
+        "category": [
+            "qdisc",
+            "cake_mq"
+        ],
+        "plugins": {
+            "requires": "nsPlugin"
+        },
+        "setup": [
+            "echo \"1 1 4\" > /sys/bus/netdevsim/new_device"
+        ],
+        "cmdUnderTest": "$TC qdisc add dev $ETH handle 1: root cake_mq",
+        "expExitCode": "0",
+        "verifyCmd": "$TC class show dev $ETH",
+        "matchPattern": "class cake_mq",
+        "matchCount": "4",
+        "teardown": [
+            "echo \"1\" > /sys/bus/netdevsim/del_device"
+        ]
+    },
+    {
+        "id": "2de4",
+        "name": "Change bandwidth of CAKE_MQ (4 queues)",
+        "category": [
+            "qdisc",
+            "cake_mq"
+        ],
+        "plugins": {
+            "requires": "nsPlugin"
+        },
+        "setup": [
+            "echo \"1 1 4\" > /sys/bus/netdevsim/new_device",
+            "$TC qdisc add dev $ETH handle 1: root cake_mq"
+        ],
+        "cmdUnderTest": "$TC qdisc replace dev $ETH handle 1: root cake_mq bandwidth 1000",
+        "expExitCode": "0",
+        "verifyCmd": "$TC qdisc show dev $ETH",
+        "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth 1Kbit diffserv3 triple-isolate nonat nowash no-ack-filter split-gso rtt 100ms raw overhead 0 ",
+        "matchCount": "5",
+        "teardown": [
+            "echo \"1\" > /sys/bus/netdevsim/del_device"
+        ]
+    },
+    {
+        "id": "5f62",
+        "name": "Fail to create CAKE_MQ with autorate-ingress flag (4 queues)",
+        "category": [
+            "qdisc",
+            "cake_mq"
+        ],
+        "plugins": {
+            "requires": "nsPlugin"
+        },
+        "setup": [
+            "echo \"1 1 4\" > /sys/bus/netdevsim/new_device"
+        ],
+        "cmdUnderTest": "$TC qdisc add dev $ETH handle 1: root cake_mq autorate-ingress",
+        "expExitCode": "2",
+        "verifyCmd": "$TC qdisc show dev $ETH",
+        "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth unlimited autorate-ingress diffserv3 triple-isolate nonat nowash no-ack-filter split-gso rtt 100ms raw overhead 0 ",
+        "matchCount": "0",
+        "teardown": [
+            "echo \"1\" > /sys/bus/netdevsim/del_device"
+        ]
+    },
+    {
+        "id": "038e",
+        "name": "Fail to change setting of sub-qdisc under CAKE_MQ",
+        "category": [
+            "qdisc",
+            "cake_mq"
+        ],
+        "plugins": {
+            "requires": "nsPlugin"
+        },
+        "setup": [
+            "echo \"1 1 4\" > /sys/bus/netdevsim/new_device",
+            "$TC qdisc add dev $ETH handle 1: root cake_mq"
+        ],
+        "cmdUnderTest": "$TC qdisc add dev $ETH parent 1:1 cake besteffort flows",
+        "expExitCode": "2",
+        "verifyCmd": "$TC qdisc show dev $ETH",
+        "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth unlimited diffserv3 triple-isolate nonat nowash no-ack-filter split-gso rtt 100ms raw overhead 0 ",
+        "matchCount": "5",
+        "teardown": [
+            "echo \"1\" > /sys/bus/netdevsim/del_device"
+        ]
+    },
+    {
+        "id": "7bdc",
+        "name": "Fail to replace sub-qdisc under CAKE_MQ",
+        "category": [
+            "qdisc",
+            "cake_mq"
+        ],
+        "plugins": {
+            "requires": "nsPlugin"
+        },
+        "setup": [
+            "echo \"1 1 4\" > /sys/bus/netdevsim/new_device",
+            "$TC qdisc add dev $ETH handle 1: root cake_mq"
+        ],
+        "cmdUnderTest": "$TC qdisc add dev $ETH parent 1:1 fq",
+        "expExitCode": "2",
+        "verifyCmd": "$TC qdisc show dev $ETH",
+        "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth unlimited diffserv3 triple-isolate nonat nowash no-ack-filter split-gso rtt 100ms raw overhead 0 ",
+        "matchCount": "5",
+        "teardown": [
+            "echo \"1\" > /sys/bus/netdevsim/del_device"
+        ]
+    },
+    {
+        "id": "18e0",
+        "name": "Fail to install CAKE_MQ on single queue device",
+        "category": [
+            "qdisc",
+            "cake_mq"
+        ],
+        "plugins": {
+            "requires": "nsPlugin"
+        },
+        "setup": [
+            "echo \"1 1 1\" > /sys/bus/netdevsim/new_device"
+        ],
+        "cmdUnderTest": "$TC qdisc add dev $ETH handle 1: root cake_mq",
+        "expExitCode": "2",
+        "verifyCmd": "$TC qdisc show dev $ETH",
+        "matchPattern": "qdisc (cake_mq 1: root|cake 0: parent 1:[1-4]) bandwidth unlimited diffserv3 triple-isolate nonat nowash no-ack-filter split-gso rtt 100ms raw overhead 0 ",
+        "matchCount": "0",
+        "teardown": [
+            "echo \"1\" > /sys/bus/netdevsim/del_device"
+        ]
+    }
+]
-- 
cgit v1.2.3


From f88dc319fcb6d6a155e94469a355ce456dd85441 Mon Sep 17 00:00:00 2001
From: Juergen Gross <jgross@suse.com>
Date: Mon, 5 Jan 2026 12:05:17 +0100
Subject: objtool: Allow multiple pv_ops arrays

Having a single large pv_ops array has the main disadvantage of needing all
prototypes of the single array members in one header file. This is adding up
to the need to include lots of otherwise unrelated headers.

In order to allow multiple smaller pv_ops arrays dedicated to one area of the
kernel each, allow multiple arrays in objtool.

For better performance limit the possible names of the arrays to start with
"pv_ops".

Signed-off-by: Juergen Gross <jgross@suse.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Link: https://patch.msgid.link/20260105110520.21356-19-jgross@suse.com
---
 tools/objtool/arch/x86/decode.c       |  8 +++-
 tools/objtool/check.c                 | 74 +++++++++++++++++++++++++++--------
 tools/objtool/include/objtool/check.h |  1 +
 3 files changed, 65 insertions(+), 18 deletions(-)

(limited to 'tools')

diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c
index f4af82508228..73bfea220d1b 100644
--- a/tools/objtool/arch/x86/decode.c
+++ b/tools/objtool/arch/x86/decode.c
@@ -711,10 +711,14 @@ int arch_decode_instruction(struct objtool_file *file, const struct section *sec
 			immr = find_reloc_by_dest(elf, (void *)sec, offset+3);
 			disp = find_reloc_by_dest(elf, (void *)sec, offset+7);
 
-			if (!immr || strcmp(immr->sym->name, "pv_ops"))
+			if (!immr || strncmp(immr->sym->name, "pv_ops", 6))
 				break;
 
-			idx = (reloc_addend(immr) + 8) / sizeof(void *);
+			idx = pv_ops_idx_off(immr->sym->name);
+			if (idx < 0)
+				break;
+
+			idx += (reloc_addend(immr) + 8) / sizeof(void *);
 
 			func = disp->sym;
 			if (disp->sym->type == STT_SECTION)
diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index 61f3e0c48fcc..b3fec88d5bd3 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -520,21 +520,57 @@ static int decode_instructions(struct objtool_file *file)
 }
 
 /*
- * Read the pv_ops[] .data table to find the static initialized values.
+ * Known pv_ops*[] arrays.
  */
-static int add_pv_ops(struct objtool_file *file, const char *symname)
+static struct {
+	const char *name;
+	int idx_off;
+} pv_ops_tables[] = {
+	{ .name = "pv_ops", },
+	{ .name = NULL, .idx_off = -1 }
+};
+
+/*
+ * Get index offset for a pv_ops* array.
+ */
+int pv_ops_idx_off(const char *symname)
+{
+	int idx;
+
+	for (idx = 0; pv_ops_tables[idx].name; idx++) {
+		if (!strcmp(symname, pv_ops_tables[idx].name))
+			break;
+	}
+
+	return pv_ops_tables[idx].idx_off;
+}
+
+/*
+ * Read a pv_ops*[] .data table to find the static initialized values.
+ */
+static int add_pv_ops(struct objtool_file *file, int pv_ops_idx)
 {
 	struct symbol *sym, *func;
 	unsigned long off, end;
 	struct reloc *reloc;
-	int idx;
+	int idx, idx_off;
+	const char *symname;
 
+	symname = pv_ops_tables[pv_ops_idx].name;
 	sym = find_symbol_by_name(file->elf, symname);
-	if (!sym)
-		return 0;
+	if (!sym) {
+		ERROR("Unknown pv_ops array %s", symname);
+		return -1;
+	}
 
 	off = sym->offset;
 	end = off + sym->len;
+	idx_off = pv_ops_tables[pv_ops_idx].idx_off;
+	if (idx_off < 0) {
+		ERROR("pv_ops array %s has unknown index offset", symname);
+		return -1;
+	}
+
 	for (;;) {
 		reloc = find_reloc_by_dest_range(file->elf, sym->sec, off, end - off);
 		if (!reloc)
@@ -552,7 +588,7 @@ static int add_pv_ops(struct objtool_file *file, const char *symname)
 			return -1;
 		}
 
-		if (objtool_pv_add(file, idx, func))
+		if (objtool_pv_add(file, idx + idx_off, func))
 			return -1;
 
 		off = reloc_offset(reloc) + 1;
@@ -568,11 +604,6 @@ static int add_pv_ops(struct objtool_file *file, const char *symname)
  */
 static int init_pv_ops(struct objtool_file *file)
 {
-	static const char *pv_ops_tables[] = {
-		"pv_ops",
-		NULL,
-	};
-	const char *pv_ops;
 	struct symbol *sym;
 	int idx, nr;
 
@@ -581,11 +612,20 @@ static int init_pv_ops(struct objtool_file *file)
 
 	file->pv_ops = NULL;
 
-	sym = find_symbol_by_name(file->elf, "pv_ops");
-	if (!sym)
+	nr = 0;
+	for (idx = 0; pv_ops_tables[idx].name; idx++) {
+		sym = find_symbol_by_name(file->elf, pv_ops_tables[idx].name);
+		if (!sym) {
+			pv_ops_tables[idx].idx_off = -1;
+			continue;
+		}
+		pv_ops_tables[idx].idx_off = nr;
+		nr += sym->len / sizeof(unsigned long);
+	}
+
+	if (nr == 0)
 		return 0;
 
-	nr = sym->len / sizeof(unsigned long);
 	file->pv_ops = calloc(nr, sizeof(struct pv_state));
 	if (!file->pv_ops) {
 		ERROR_GLIBC("calloc");
@@ -595,8 +635,10 @@ static int init_pv_ops(struct objtool_file *file)
 	for (idx = 0; idx < nr; idx++)
 		INIT_LIST_HEAD(&file->pv_ops[idx].targets);
 
-	for (idx = 0; (pv_ops = pv_ops_tables[idx]); idx++) {
-		if (add_pv_ops(file, pv_ops))
+	for (idx = 0; pv_ops_tables[idx].name; idx++) {
+		if (pv_ops_tables[idx].idx_off < 0)
+			continue;
+		if (add_pv_ops(file, idx))
 			return -1;
 	}
 
diff --git a/tools/objtool/include/objtool/check.h b/tools/objtool/include/objtool/check.h
index 2e1346ad5e92..5f2f77bd9b41 100644
--- a/tools/objtool/include/objtool/check.h
+++ b/tools/objtool/include/objtool/check.h
@@ -159,5 +159,6 @@ const char *objtool_disas_insn(struct instruction *insn);
 
 extern size_t sym_name_max_len;
 extern struct disas_context *objtool_disas_ctx;
+int pv_ops_idx_off(const char *symname);
 
 #endif /* _CHECK_H */
-- 
cgit v1.2.3


From 609e359ab904698f1e5aa0ab2fee2f4c29ee0886 Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
Date: Tue, 23 Dec 2025 07:59:13 +0100
Subject: selftests: vDSO: vdso_config: Add configurations for
 clock_getres_time64()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Some architectures will start to implement this function.
Make sure that tests can be written for it.

Signed-off-by: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@kernel.org>
Link: https://patch.msgid.link/20251223-vdso-compat-time32-v1-2-97ea7a06a543@linutronix.de
---
 tools/testing/selftests/vDSO/vdso_config.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/vDSO/vdso_config.h b/tools/testing/selftests/vDSO/vdso_config.h
index 50c261005111..5da223731b81 100644
--- a/tools/testing/selftests/vDSO/vdso_config.h
+++ b/tools/testing/selftests/vDSO/vdso_config.h
@@ -66,7 +66,7 @@ static const char *versions[7] = {
 };
 
 __attribute__((unused))
-static const char *names[2][7] = {
+static const char *names[2][8] = {
 	{
 		"__kernel_gettimeofday",
 		"__kernel_clock_gettime",
@@ -75,6 +75,7 @@ static const char *names[2][7] = {
 		"__kernel_getcpu",
 		"__kernel_clock_gettime64",
 		"__kernel_getrandom",
+		"__kernel_clock_getres_time64",
 	},
 	{
 		"__vdso_gettimeofday",
@@ -84,6 +85,7 @@ static const char *names[2][7] = {
 		"__vdso_getcpu",
 		"__vdso_clock_gettime64",
 		"__vdso_getrandom",
+		"__vdso_clock_getres_time64",
 	},
 };
 
-- 
cgit v1.2.3


From 1dcd1273add368c2b7c65135e22b416e1b374781 Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
Date: Tue, 23 Dec 2025 07:59:14 +0100
Subject: selftests: vDSO: vdso_test_abi: Use UAPI system call numbers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

SYS_clock_getres might have been redirected by libc to some other system
call than the actual clock_getres. For testing it is required to use
exactly this system call.

Use the system call number exported by the UAPI headers which is always
correct.

Signed-off-by: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@kernel.org>
Link: https://patch.msgid.link/20251223-vdso-compat-time32-v1-3-97ea7a06a543@linutronix.de
---
 tools/testing/selftests/vDSO/vdso_test_abi.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/vDSO/vdso_test_abi.c b/tools/testing/selftests/vDSO/vdso_test_abi.c
index c620317eaeea..a75c12dcb0f1 100644
--- a/tools/testing/selftests/vDSO/vdso_test_abi.c
+++ b/tools/testing/selftests/vDSO/vdso_test_abi.c
@@ -179,7 +179,7 @@ static void vdso_test_clock_getres(clockid_t clk_id)
 		clock_getres_fail++;
 	}
 
-	ret = syscall(SYS_clock_getres, clk_id, &sys_ts);
+	ret = syscall(__NR_clock_getres, clk_id, &sys_ts);
 
 	ksft_print_msg("The syscall resolution is %lld %lld\n",
 			(long long)sys_ts.tv_sec, (long long)sys_ts.tv_nsec);
-- 
cgit v1.2.3


From 4e6a2312986d437cc22805b9e08f86b15fee0318 Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
Date: Tue, 23 Dec 2025 07:59:15 +0100
Subject: selftests: vDSO: vdso_test_abi: Add test for clock_getres_time64()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Some architectures will start to implement this function.
Make sure it works correctly.

Signed-off-by: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@kernel.org>
Link: https://patch.msgid.link/20251223-vdso-compat-time32-v1-4-97ea7a06a543@linutronix.de
---
 tools/testing/selftests/vDSO/vdso_test_abi.c | 53 +++++++++++++++++++++++++++-
 1 file changed, 52 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/vDSO/vdso_test_abi.c b/tools/testing/selftests/vDSO/vdso_test_abi.c
index a75c12dcb0f1..b162a4ba9c4f 100644
--- a/tools/testing/selftests/vDSO/vdso_test_abi.c
+++ b/tools/testing/selftests/vDSO/vdso_test_abi.c
@@ -36,6 +36,7 @@ typedef long (*vdso_gettimeofday_t)(struct timeval *tv, struct timezone *tz);
 typedef long (*vdso_clock_gettime_t)(clockid_t clk_id, struct timespec *ts);
 typedef long (*vdso_clock_gettime64_t)(clockid_t clk_id, struct vdso_timespec64 *ts);
 typedef long (*vdso_clock_getres_t)(clockid_t clk_id, struct timespec *ts);
+typedef long (*vdso_clock_getres_time64_t)(clockid_t clk_id, struct vdso_timespec64 *ts);
 typedef time_t (*vdso_time_t)(time_t *t);
 
 static const char * const vdso_clock_name[] = {
@@ -196,6 +197,55 @@ static void vdso_test_clock_getres(clockid_t clk_id)
 	}
 }
 
+#ifdef __NR_clock_getres_time64
+static void vdso_test_clock_getres_time64(clockid_t clk_id)
+{
+	int clock_getres_fail = 0;
+
+	/* Find clock_getres. */
+	vdso_clock_getres_time64_t vdso_clock_getres_time64 =
+		(vdso_clock_getres_time64_t)vdso_sym(version, name[7]);
+
+	if (!vdso_clock_getres_time64) {
+		ksft_print_msg("Couldn't find %s\n", name[7]);
+		ksft_test_result_skip("%s %s\n", name[7],
+				      vdso_clock_name[clk_id]);
+		return;
+	}
+
+	struct vdso_timespec64 ts, sys_ts;
+	long ret = VDSO_CALL(vdso_clock_getres_time64, 2, clk_id, &ts);
+
+	if (ret == 0) {
+		ksft_print_msg("The vdso resolution is %lld %lld\n",
+			       (long long)ts.tv_sec, (long long)ts.tv_nsec);
+	} else {
+		clock_getres_fail++;
+	}
+
+	ret = syscall(__NR_clock_getres_time64, clk_id, &sys_ts);
+
+	ksft_print_msg("The syscall resolution is %lld %lld\n",
+			(long long)sys_ts.tv_sec, (long long)sys_ts.tv_nsec);
+
+	if ((sys_ts.tv_sec != ts.tv_sec) || (sys_ts.tv_nsec != ts.tv_nsec))
+		clock_getres_fail++;
+
+	if (clock_getres_fail > 0) {
+		ksft_test_result_fail("%s %s\n", name[7],
+				      vdso_clock_name[clk_id]);
+	} else {
+		ksft_test_result_pass("%s %s\n", name[7],
+				      vdso_clock_name[clk_id]);
+	}
+}
+#else /* !__NR_clock_getres_time64 */
+static void vdso_test_clock_getres_time64(clockid_t clk_id)
+{
+	ksft_test_result_skip("%s %s\n", name[7], vdso_clock_name[clk_id]);
+}
+#endif /* __NR_clock_getres_time64 */
+
 /*
  * This function calls vdso_test_clock_gettime and vdso_test_clock_getres
  * with different values for clock_id.
@@ -208,9 +258,10 @@ static inline void vdso_test_clock(clockid_t clock_id)
 	vdso_test_clock_gettime64(clock_id);
 
 	vdso_test_clock_getres(clock_id);
+	vdso_test_clock_getres_time64(clock_id);
 }
 
-#define VDSO_TEST_PLAN	29
+#define VDSO_TEST_PLAN	38
 
 int main(int argc, char **argv)
 {
-- 
cgit v1.2.3


From b0b449e6fec4cd182bd4384f7eb9002596079f68 Mon Sep 17 00:00:00 2001
From: Juergen Gross <jgross@suse.com>
Date: Mon, 5 Jan 2026 12:05:20 +0100
Subject: x86/pvlocks: Move paravirt spinlock functions into own header

Instead of having the pv spinlock function definitions in paravirt.h,
move them into the new header paravirt-spinlock.h.

Signed-off-by: Juergen Gross <jgross@suse.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Link: https://patch.msgid.link/20260105110520.21356-22-jgross@suse.com
---
 arch/x86/hyperv/hv_spinlock.c            |  10 +--
 arch/x86/include/asm/paravirt-base.h     |   6 ++
 arch/x86/include/asm/paravirt-spinlock.h | 145 +++++++++++++++++++++++++++++++
 arch/x86/include/asm/paravirt.h          |  61 -------------
 arch/x86/include/asm/paravirt_types.h    |  17 ----
 arch/x86/include/asm/qspinlock.h         |  87 ++-----------------
 arch/x86/kernel/Makefile                 |   2 +-
 arch/x86/kernel/kvm.c                    |  12 +--
 arch/x86/kernel/paravirt-spinlocks.c     |  26 +++++-
 arch/x86/kernel/paravirt.c               |  21 -----
 arch/x86/xen/spinlock.c                  |  10 +--
 tools/objtool/check.c                    |   1 +
 12 files changed, 198 insertions(+), 200 deletions(-)
 create mode 100644 arch/x86/include/asm/paravirt-spinlock.h

(limited to 'tools')

diff --git a/arch/x86/hyperv/hv_spinlock.c b/arch/x86/hyperv/hv_spinlock.c
index 2a3c2afb0154..210b494e4de0 100644
--- a/arch/x86/hyperv/hv_spinlock.c
+++ b/arch/x86/hyperv/hv_spinlock.c
@@ -78,11 +78,11 @@ void __init hv_init_spinlocks(void)
 	pr_info("PV spinlocks enabled\n");
 
 	__pv_init_lock_hash();
-	pv_ops.lock.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath;
-	pv_ops.lock.queued_spin_unlock = PV_CALLEE_SAVE(__pv_queued_spin_unlock);
-	pv_ops.lock.wait = hv_qlock_wait;
-	pv_ops.lock.kick = hv_qlock_kick;
-	pv_ops.lock.vcpu_is_preempted = PV_CALLEE_SAVE(hv_vcpu_is_preempted);
+	pv_ops_lock.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath;
+	pv_ops_lock.queued_spin_unlock = PV_CALLEE_SAVE(__pv_queued_spin_unlock);
+	pv_ops_lock.wait = hv_qlock_wait;
+	pv_ops_lock.kick = hv_qlock_kick;
+	pv_ops_lock.vcpu_is_preempted = PV_CALLEE_SAVE(hv_vcpu_is_preempted);
 }
 
 static __init int hv_parse_nopvspin(char *arg)
diff --git a/arch/x86/include/asm/paravirt-base.h b/arch/x86/include/asm/paravirt-base.h
index 3827ea20de18..982a0b93bc76 100644
--- a/arch/x86/include/asm/paravirt-base.h
+++ b/arch/x86/include/asm/paravirt-base.h
@@ -26,4 +26,10 @@ u64 _paravirt_ident_64(u64);
 #endif
 #define paravirt_nop	((void *)nop_func)
 
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+void paravirt_set_cap(void);
+#else
+static inline void paravirt_set_cap(void) { }
+#endif
+
 #endif /* _ASM_X86_PARAVIRT_BASE_H */
diff --git a/arch/x86/include/asm/paravirt-spinlock.h b/arch/x86/include/asm/paravirt-spinlock.h
new file mode 100644
index 000000000000..a5011ef3a6cc
--- /dev/null
+++ b/arch/x86/include/asm/paravirt-spinlock.h
@@ -0,0 +1,145 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef _ASM_X86_PARAVIRT_SPINLOCK_H
+#define _ASM_X86_PARAVIRT_SPINLOCK_H
+
+#include <asm/paravirt_types.h>
+
+#ifdef CONFIG_SMP
+#include <asm/spinlock_types.h>
+#endif
+
+struct qspinlock;
+
+struct pv_lock_ops {
+	void (*queued_spin_lock_slowpath)(struct qspinlock *lock, u32 val);
+	struct paravirt_callee_save queued_spin_unlock;
+
+	void (*wait)(u8 *ptr, u8 val);
+	void (*kick)(int cpu);
+
+	struct paravirt_callee_save vcpu_is_preempted;
+} __no_randomize_layout;
+
+extern struct pv_lock_ops pv_ops_lock;
+
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+extern void native_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
+extern void __pv_init_lock_hash(void);
+extern void __pv_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
+extern void __raw_callee_save___pv_queued_spin_unlock(struct qspinlock *lock);
+extern bool nopvspin;
+
+static __always_inline void pv_queued_spin_lock_slowpath(struct qspinlock *lock,
+							 u32 val)
+{
+	PVOP_VCALL2(pv_ops_lock, queued_spin_lock_slowpath, lock, val);
+}
+
+static __always_inline void pv_queued_spin_unlock(struct qspinlock *lock)
+{
+	PVOP_ALT_VCALLEE1(pv_ops_lock, queued_spin_unlock, lock,
+			  "movb $0, (%%" _ASM_ARG1 ");",
+			  ALT_NOT(X86_FEATURE_PVUNLOCK));
+}
+
+static __always_inline bool pv_vcpu_is_preempted(long cpu)
+{
+	return PVOP_ALT_CALLEE1(bool, pv_ops_lock, vcpu_is_preempted, cpu,
+				"xor %%" _ASM_AX ", %%" _ASM_AX ";",
+				ALT_NOT(X86_FEATURE_VCPUPREEMPT));
+}
+
+#define queued_spin_unlock queued_spin_unlock
+/**
+ * queued_spin_unlock - release a queued spinlock
+ * @lock : Pointer to queued spinlock structure
+ *
+ * A smp_store_release() on the least-significant byte.
+ */
+static inline void native_queued_spin_unlock(struct qspinlock *lock)
+{
+	smp_store_release(&lock->locked, 0);
+}
+
+static inline void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
+{
+	pv_queued_spin_lock_slowpath(lock, val);
+}
+
+static inline void queued_spin_unlock(struct qspinlock *lock)
+{
+	kcsan_release();
+	pv_queued_spin_unlock(lock);
+}
+
+#define vcpu_is_preempted vcpu_is_preempted
+static inline bool vcpu_is_preempted(long cpu)
+{
+	return pv_vcpu_is_preempted(cpu);
+}
+
+static __always_inline void pv_wait(u8 *ptr, u8 val)
+{
+	PVOP_VCALL2(pv_ops_lock, wait, ptr, val);
+}
+
+static __always_inline void pv_kick(int cpu)
+{
+	PVOP_VCALL1(pv_ops_lock, kick, cpu);
+}
+
+void __raw_callee_save___native_queued_spin_unlock(struct qspinlock *lock);
+bool __raw_callee_save___native_vcpu_is_preempted(long cpu);
+#endif /* CONFIG_PARAVIRT_SPINLOCKS */
+
+void __init native_pv_lock_init(void);
+__visible void __native_queued_spin_unlock(struct qspinlock *lock);
+bool pv_is_native_spin_unlock(void);
+__visible bool __native_vcpu_is_preempted(long cpu);
+bool pv_is_native_vcpu_is_preempted(void);
+
+/*
+ * virt_spin_lock_key - disables by default the virt_spin_lock() hijack.
+ *
+ * Native (and PV wanting native due to vCPU pinning) should keep this key
+ * disabled. Native does not touch the key.
+ *
+ * When in a guest then native_pv_lock_init() enables the key first and
+ * KVM/XEN might conditionally disable it later in the boot process again.
+ */
+DECLARE_STATIC_KEY_FALSE(virt_spin_lock_key);
+
+/*
+ * Shortcut for the queued_spin_lock_slowpath() function that allows
+ * virt to hijack it.
+ *
+ * Returns:
+ *   true - lock has been negotiated, all done;
+ *   false - queued_spin_lock_slowpath() will do its thing.
+ */
+#define virt_spin_lock virt_spin_lock
+static inline bool virt_spin_lock(struct qspinlock *lock)
+{
+	int val;
+
+	if (!static_branch_likely(&virt_spin_lock_key))
+		return false;
+
+	/*
+	 * On hypervisors without PARAVIRT_SPINLOCKS support we fall
+	 * back to a Test-and-Set spinlock, because fair locks have
+	 * horrible lock 'holder' preemption issues.
+	 */
+
+ __retry:
+	val = atomic_read(&lock->val);
+
+	if (val || !atomic_try_cmpxchg(&lock->val, &val, _Q_LOCKED_VAL)) {
+		cpu_relax();
+		goto __retry;
+	}
+
+	return true;
+}
+
+#endif /* _ASM_X86_PARAVIRT_SPINLOCK_H */
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index ec274d13bae0..b21072af731d 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -19,15 +19,6 @@
 #include <linux/cpumask.h>
 #include <asm/frame.h>
 
-__visible void __native_queued_spin_unlock(struct qspinlock *lock);
-bool pv_is_native_spin_unlock(void);
-__visible bool __native_vcpu_is_preempted(long cpu);
-bool pv_is_native_vcpu_is_preempted(void);
-
-#ifdef CONFIG_PARAVIRT_SPINLOCKS
-void __init paravirt_set_cap(void);
-#endif
-
 /* The paravirtualized I/O functions */
 static inline void slow_down_io(void)
 {
@@ -522,46 +513,7 @@ static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx,
 {
 	pv_ops.mmu.set_fixmap(idx, phys, flags);
 }
-#endif
-
-#if defined(CONFIG_SMP) && defined(CONFIG_PARAVIRT_SPINLOCKS)
-
-static __always_inline void pv_queued_spin_lock_slowpath(struct qspinlock *lock,
-							u32 val)
-{
-	PVOP_VCALL2(pv_ops, lock.queued_spin_lock_slowpath, lock, val);
-}
-
-static __always_inline void pv_queued_spin_unlock(struct qspinlock *lock)
-{
-	PVOP_ALT_VCALLEE1(pv_ops, lock.queued_spin_unlock, lock,
-			  "movb $0, (%%" _ASM_ARG1 ");",
-			  ALT_NOT(X86_FEATURE_PVUNLOCK));
-}
-
-static __always_inline void pv_wait(u8 *ptr, u8 val)
-{
-	PVOP_VCALL2(pv_ops, lock.wait, ptr, val);
-}
-
-static __always_inline void pv_kick(int cpu)
-{
-	PVOP_VCALL1(pv_ops, lock.kick, cpu);
-}
-
-static __always_inline bool pv_vcpu_is_preempted(long cpu)
-{
-	return PVOP_ALT_CALLEE1(bool, pv_ops, lock.vcpu_is_preempted, cpu,
-				"xor %%" _ASM_AX ", %%" _ASM_AX ";",
-				ALT_NOT(X86_FEATURE_VCPUPREEMPT));
-}
 
-void __raw_callee_save___native_queued_spin_unlock(struct qspinlock *lock);
-bool __raw_callee_save___native_vcpu_is_preempted(long cpu);
-
-#endif /* SMP && PARAVIRT_SPINLOCKS */
-
-#ifdef CONFIG_PARAVIRT_XXL
 static __always_inline unsigned long arch_local_save_flags(void)
 {
 	return PVOP_ALT_CALLEE0(unsigned long, pv_ops, irq.save_fl, "pushf; pop %%rax;",
@@ -588,8 +540,6 @@ static __always_inline unsigned long arch_local_irq_save(void)
 }
 #endif
 
-void native_pv_lock_init(void) __init;
-
 #else  /* __ASSEMBLER__ */
 
 #ifdef CONFIG_X86_64
@@ -613,12 +563,6 @@ void native_pv_lock_init(void) __init;
 #endif /* __ASSEMBLER__ */
 #else  /* CONFIG_PARAVIRT */
 # define default_banner x86_init_noop
-
-#ifndef __ASSEMBLER__
-static inline void native_pv_lock_init(void)
-{
-}
-#endif
 #endif /* !CONFIG_PARAVIRT */
 
 #ifndef __ASSEMBLER__
@@ -634,10 +578,5 @@ static inline void paravirt_arch_exit_mmap(struct mm_struct *mm)
 }
 #endif
 
-#ifndef CONFIG_PARAVIRT_SPINLOCKS
-static inline void paravirt_set_cap(void)
-{
-}
-#endif
 #endif /* __ASSEMBLER__ */
 #endif /* _ASM_X86_PARAVIRT_H */
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index b36d425d099b..7ccd41628d36 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -184,22 +184,6 @@ struct pv_mmu_ops {
 #endif
 } __no_randomize_layout;
 
-#ifdef CONFIG_SMP
-#include <asm/spinlock_types.h>
-#endif
-
-struct qspinlock;
-
-struct pv_lock_ops {
-	void (*queued_spin_lock_slowpath)(struct qspinlock *lock, u32 val);
-	struct paravirt_callee_save queued_spin_unlock;
-
-	void (*wait)(u8 *ptr, u8 val);
-	void (*kick)(int cpu);
-
-	struct paravirt_callee_save vcpu_is_preempted;
-} __no_randomize_layout;
-
 /* This contains all the paravirt structures: we get a convenient
  * number for each function using the offset which we use to indicate
  * what to patch. */
@@ -207,7 +191,6 @@ struct paravirt_patch_template {
 	struct pv_cpu_ops	cpu;
 	struct pv_irq_ops	irq;
 	struct pv_mmu_ops	mmu;
-	struct pv_lock_ops	lock;
 } __no_randomize_layout;
 
 extern struct paravirt_patch_template pv_ops;
diff --git a/arch/x86/include/asm/qspinlock.h b/arch/x86/include/asm/qspinlock.h
index 68da67df304d..25a1919542d9 100644
--- a/arch/x86/include/asm/qspinlock.h
+++ b/arch/x86/include/asm/qspinlock.h
@@ -7,6 +7,9 @@
 #include <asm-generic/qspinlock_types.h>
 #include <asm/paravirt.h>
 #include <asm/rmwcc.h>
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt-spinlock.h>
+#endif
 
 #define _Q_PENDING_LOOPS	(1 << 9)
 
@@ -27,90 +30,10 @@ static __always_inline u32 queued_fetch_set_pending_acquire(struct qspinlock *lo
 	return val;
 }
 
-#ifdef CONFIG_PARAVIRT_SPINLOCKS
-extern void native_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
-extern void __pv_init_lock_hash(void);
-extern void __pv_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
-extern void __raw_callee_save___pv_queued_spin_unlock(struct qspinlock *lock);
-extern bool nopvspin;
-
-#define	queued_spin_unlock queued_spin_unlock
-/**
- * queued_spin_unlock - release a queued spinlock
- * @lock : Pointer to queued spinlock structure
- *
- * A smp_store_release() on the least-significant byte.
- */
-static inline void native_queued_spin_unlock(struct qspinlock *lock)
-{
-	smp_store_release(&lock->locked, 0);
-}
-
-static inline void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
-{
-	pv_queued_spin_lock_slowpath(lock, val);
-}
-
-static inline void queued_spin_unlock(struct qspinlock *lock)
-{
-	kcsan_release();
-	pv_queued_spin_unlock(lock);
-}
-
-#define vcpu_is_preempted vcpu_is_preempted
-static inline bool vcpu_is_preempted(long cpu)
-{
-	return pv_vcpu_is_preempted(cpu);
-}
+#ifndef CONFIG_PARAVIRT
+static inline void native_pv_lock_init(void) { }
 #endif
 
-#ifdef CONFIG_PARAVIRT
-/*
- * virt_spin_lock_key - disables by default the virt_spin_lock() hijack.
- *
- * Native (and PV wanting native due to vCPU pinning) should keep this key
- * disabled. Native does not touch the key.
- *
- * When in a guest then native_pv_lock_init() enables the key first and
- * KVM/XEN might conditionally disable it later in the boot process again.
- */
-DECLARE_STATIC_KEY_FALSE(virt_spin_lock_key);
-
-/*
- * Shortcut for the queued_spin_lock_slowpath() function that allows
- * virt to hijack it.
- *
- * Returns:
- *   true - lock has been negotiated, all done;
- *   false - queued_spin_lock_slowpath() will do its thing.
- */
-#define virt_spin_lock virt_spin_lock
-static inline bool virt_spin_lock(struct qspinlock *lock)
-{
-	int val;
-
-	if (!static_branch_likely(&virt_spin_lock_key))
-		return false;
-
-	/*
-	 * On hypervisors without PARAVIRT_SPINLOCKS support we fall
-	 * back to a Test-and-Set spinlock, because fair locks have
-	 * horrible lock 'holder' preemption issues.
-	 */
-
- __retry:
-	val = atomic_read(&lock->val);
-
-	if (val || !atomic_try_cmpxchg(&lock->val, &val, _Q_LOCKED_VAL)) {
-		cpu_relax();
-		goto __retry;
-	}
-
-	return true;
-}
-
-#endif /* CONFIG_PARAVIRT */
-
 #include <asm-generic/qspinlock.h>
 
 #endif /* _ASM_X86_QSPINLOCK_H */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index bc184dd38d99..e9aeeeafad17 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -126,7 +126,7 @@ obj-$(CONFIG_DEBUG_NMI_SELFTEST) += nmi_selftest.o
 
 obj-$(CONFIG_KVM_GUEST)		+= kvm.o kvmclock.o
 obj-$(CONFIG_PARAVIRT)		+= paravirt.o
-obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o
+obj-$(CONFIG_PARAVIRT)		+= paravirt-spinlocks.o
 obj-$(CONFIG_PARAVIRT_CLOCK)	+= pvclock.o
 obj-$(CONFIG_X86_PMEM_LEGACY_DEVICE) += pmem.o
 
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 21b4de55f823..de550b12d9ab 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -829,8 +829,10 @@ static void __init kvm_guest_init(void)
 		has_steal_clock = 1;
 		static_call_update(pv_steal_clock, kvm_steal_clock);
 
-		pv_ops.lock.vcpu_is_preempted =
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+		pv_ops_lock.vcpu_is_preempted =
 			PV_CALLEE_SAVE(__kvm_vcpu_is_preempted);
+#endif
 	}
 
 	if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
@@ -1126,11 +1128,11 @@ void __init kvm_spinlock_init(void)
 	pr_info("PV spinlocks enabled\n");
 
 	__pv_init_lock_hash();
-	pv_ops.lock.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath;
-	pv_ops.lock.queued_spin_unlock =
+	pv_ops_lock.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath;
+	pv_ops_lock.queued_spin_unlock =
 		PV_CALLEE_SAVE(__pv_queued_spin_unlock);
-	pv_ops.lock.wait = kvm_wait;
-	pv_ops.lock.kick = kvm_kick_cpu;
+	pv_ops_lock.wait = kvm_wait;
+	pv_ops_lock.kick = kvm_kick_cpu;
 
 	/*
 	 * When PV spinlock is enabled which is preferred over
diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c
index 9e1ea99ad9df..95452444868f 100644
--- a/arch/x86/kernel/paravirt-spinlocks.c
+++ b/arch/x86/kernel/paravirt-spinlocks.c
@@ -3,12 +3,22 @@
  * Split spinlock implementation out into its own file, so it can be
  * compiled in a FTRACE-compatible way.
  */
+#include <linux/static_call.h>
 #include <linux/spinlock.h>
 #include <linux/export.h>
 #include <linux/jump_label.h>
 
-#include <asm/paravirt.h>
+DEFINE_STATIC_KEY_FALSE(virt_spin_lock_key);
 
+#ifdef CONFIG_SMP
+void __init native_pv_lock_init(void)
+{
+	if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
+		static_branch_enable(&virt_spin_lock_key);
+}
+#endif
+
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
 __visible void __native_queued_spin_unlock(struct qspinlock *lock)
 {
 	native_queued_spin_unlock(lock);
@@ -17,7 +27,7 @@ PV_CALLEE_SAVE_REGS_THUNK(__native_queued_spin_unlock);
 
 bool pv_is_native_spin_unlock(void)
 {
-	return pv_ops.lock.queued_spin_unlock.func ==
+	return pv_ops_lock.queued_spin_unlock.func ==
 		__raw_callee_save___native_queued_spin_unlock;
 }
 
@@ -29,7 +39,7 @@ PV_CALLEE_SAVE_REGS_THUNK(__native_vcpu_is_preempted);
 
 bool pv_is_native_vcpu_is_preempted(void)
 {
-	return pv_ops.lock.vcpu_is_preempted.func ==
+	return pv_ops_lock.vcpu_is_preempted.func ==
 		__raw_callee_save___native_vcpu_is_preempted;
 }
 
@@ -41,3 +51,13 @@ void __init paravirt_set_cap(void)
 	if (!pv_is_native_vcpu_is_preempted())
 		setup_force_cpu_cap(X86_FEATURE_VCPUPREEMPT);
 }
+
+struct pv_lock_ops pv_ops_lock = {
+	.queued_spin_lock_slowpath	= native_queued_spin_lock_slowpath,
+	.queued_spin_unlock		= PV_CALLEE_SAVE(__native_queued_spin_unlock),
+	.wait				= paravirt_nop,
+	.kick				= paravirt_nop,
+	.vcpu_is_preempted		= PV_CALLEE_SAVE(__native_vcpu_is_preempted),
+};
+EXPORT_SYMBOL(pv_ops_lock);
+#endif
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 5dfbd3f55792..a6ed52cae003 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -57,14 +57,6 @@ DEFINE_ASM_FUNC(pv_native_irq_enable, "sti", .noinstr.text);
 DEFINE_ASM_FUNC(pv_native_read_cr2, "mov %cr2, %rax", .noinstr.text);
 #endif
 
-DEFINE_STATIC_KEY_FALSE(virt_spin_lock_key);
-
-void __init native_pv_lock_init(void)
-{
-	if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
-		static_branch_enable(&virt_spin_lock_key);
-}
-
 static noinstr void pv_native_safe_halt(void)
 {
 	native_safe_halt();
@@ -221,19 +213,6 @@ struct paravirt_patch_template pv_ops = {
 
 	.mmu.set_fixmap		= native_set_fixmap,
 #endif /* CONFIG_PARAVIRT_XXL */
-
-#if defined(CONFIG_PARAVIRT_SPINLOCKS)
-	/* Lock ops. */
-#ifdef CONFIG_SMP
-	.lock.queued_spin_lock_slowpath	= native_queued_spin_lock_slowpath,
-	.lock.queued_spin_unlock	=
-				PV_CALLEE_SAVE(__native_queued_spin_unlock),
-	.lock.wait			= paravirt_nop,
-	.lock.kick			= paravirt_nop,
-	.lock.vcpu_is_preempted		=
-				PV_CALLEE_SAVE(__native_vcpu_is_preempted),
-#endif /* SMP */
-#endif
 };
 
 #ifdef CONFIG_PARAVIRT_XXL
diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
index fe56646d6919..83ac24ead289 100644
--- a/arch/x86/xen/spinlock.c
+++ b/arch/x86/xen/spinlock.c
@@ -134,10 +134,10 @@ void __init xen_init_spinlocks(void)
 	printk(KERN_DEBUG "xen: PV spinlocks enabled\n");
 
 	__pv_init_lock_hash();
-	pv_ops.lock.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath;
-	pv_ops.lock.queued_spin_unlock =
+	pv_ops_lock.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath;
+	pv_ops_lock.queued_spin_unlock =
 		PV_CALLEE_SAVE(__pv_queued_spin_unlock);
-	pv_ops.lock.wait = xen_qlock_wait;
-	pv_ops.lock.kick = xen_qlock_kick;
-	pv_ops.lock.vcpu_is_preempted = PV_CALLEE_SAVE(xen_vcpu_stolen);
+	pv_ops_lock.wait = xen_qlock_wait;
+	pv_ops_lock.kick = xen_qlock_kick;
+	pv_ops_lock.vcpu_is_preempted = PV_CALLEE_SAVE(xen_vcpu_stolen);
 }
diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index b3fec88d5bd3..c2952df6842c 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -527,6 +527,7 @@ static struct {
 	int idx_off;
 } pv_ops_tables[] = {
 	{ .name = "pv_ops", },
+	{ .name = "pv_ops_lock", },
 	{ .name = NULL, .idx_off = -1 }
 };
 
-- 
cgit v1.2.3


From 8441c7d3bd6c5a52ab2ecf77e43a5bf262004f5c Mon Sep 17 00:00:00 2001
From: Robert Richter <rrichter@amd.com>
Date: Wed, 7 Jan 2026 13:05:43 +0100
Subject: cxl: Check for invalid addresses returned from translation functions
 on errors

Translation functions may return an invalid address in case of errors.
If the address is not checked the further use of the invalid value
will cause an address corruption.

Consistently check for a valid address returned by translation
functions. Use RESOURCE_SIZE_MAX to indicate an invalid address for
type resource_size_t. Depending on the type either RESOURCE_SIZE_MAX
or ULLONG_MAX is used to indicate an address error.

Propagating an invalid address from a failed translation may cause
userspace to think it has received a valid SPA, when in fact it is
wrong. The CXL userspace API, using trace events, expects ULLONG_MAX
to indicate a translation failure. If ULLONG_MAX is not returned
immediately, subsequent calculations can transform that bad address
into a different value (!ULLONG_MAX), and an invalid SPA may be
returned to userspace. This can lead to incorrect diagnostics and
erroneous corrective actions.

[ dj: Added user impact statement from Alison. ]
[ dj: Fixed checkpatch tab alignment issue. ]

Reviewed-by: Dave Jiang <dave.jiang@intel.com>
Signed-off-by: Robert Richter <rrichter@amd.com>
Fixes: c3dd67681c70 ("cxl/region: Add inject and clear poison by region offset")
Fixes: b78b9e7b7979 ("cxl/region: Refactor address translation funcs for testing")
Reviewed-by: Alison Schofield <alison.schofield@intel.com>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Link: https://patch.msgid.link/20260107120544.410993-1-rrichter@amd.com
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
---
 drivers/cxl/core/hdm.c                 |  2 +-
 drivers/cxl/core/region.c              | 34 ++++++++++++++++++++++++++--------
 tools/testing/cxl/test/cxl_translate.c | 30 ++++++++++++++++++------------
 3 files changed, 45 insertions(+), 21 deletions(-)

(limited to 'tools')

diff --git a/drivers/cxl/core/hdm.c b/drivers/cxl/core/hdm.c
index a470099a69f1..eb5a3a7640c6 100644
--- a/drivers/cxl/core/hdm.c
+++ b/drivers/cxl/core/hdm.c
@@ -530,7 +530,7 @@ resource_size_t cxl_dpa_size(struct cxl_endpoint_decoder *cxled)
 
 resource_size_t cxl_dpa_resource_start(struct cxl_endpoint_decoder *cxled)
 {
-	resource_size_t base = -1;
+	resource_size_t base = RESOURCE_SIZE_MAX;
 
 	lockdep_assert_held(&cxl_rwsem.dpa);
 	if (cxled->dpa_res)
diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
index fc36a5413d3f..5bd1213737fa 100644
--- a/drivers/cxl/core/region.c
+++ b/drivers/cxl/core/region.c
@@ -3118,7 +3118,7 @@ u64 cxl_dpa_to_hpa(struct cxl_region *cxlr, const struct cxl_memdev *cxlmd,
 	struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent);
 	struct cxl_region_params *p = &cxlr->params;
 	struct cxl_endpoint_decoder *cxled = NULL;
-	u64 dpa_offset, hpa_offset, hpa;
+	u64 base, dpa_offset, hpa_offset, hpa;
 	u16 eig = 0;
 	u8 eiw = 0;
 	int pos;
@@ -3136,8 +3136,14 @@ u64 cxl_dpa_to_hpa(struct cxl_region *cxlr, const struct cxl_memdev *cxlmd,
 	ways_to_eiw(p->interleave_ways, &eiw);
 	granularity_to_eig(p->interleave_granularity, &eig);
 
-	dpa_offset = dpa - cxl_dpa_resource_start(cxled);
+	base = cxl_dpa_resource_start(cxled);
+	if (base == RESOURCE_SIZE_MAX)
+		return ULLONG_MAX;
+
+	dpa_offset = dpa - base;
 	hpa_offset = cxl_calculate_hpa_offset(dpa_offset, pos, eiw, eig);
+	if (hpa_offset == ULLONG_MAX)
+		return ULLONG_MAX;
 
 	/* Apply the hpa_offset to the region base address */
 	hpa = hpa_offset + p->res->start + p->cache_size;
@@ -3146,6 +3152,9 @@ u64 cxl_dpa_to_hpa(struct cxl_region *cxlr, const struct cxl_memdev *cxlmd,
 	if (cxlrd->ops.hpa_to_spa)
 		hpa = cxlrd->ops.hpa_to_spa(cxlrd, hpa);
 
+	if (hpa == ULLONG_MAX)
+		return ULLONG_MAX;
+
 	if (!cxl_resource_contains_addr(p->res, hpa)) {
 		dev_dbg(&cxlr->dev,
 			"Addr trans fail: hpa 0x%llx not in region\n", hpa);
@@ -3170,7 +3179,8 @@ static int region_offset_to_dpa_result(struct cxl_region *cxlr, u64 offset,
 	struct cxl_region_params *p = &cxlr->params;
 	struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent);
 	struct cxl_endpoint_decoder *cxled;
-	u64 hpa, hpa_offset, dpa_offset;
+	u64 hpa_offset = offset;
+	u64 dpa, dpa_offset;
 	u16 eig = 0;
 	u8 eiw = 0;
 	int pos;
@@ -3187,10 +3197,13 @@ static int region_offset_to_dpa_result(struct cxl_region *cxlr, u64 offset,
 	 * CXL HPA is assumed to equal SPA.
 	 */
 	if (cxlrd->ops.spa_to_hpa) {
-		hpa = cxlrd->ops.spa_to_hpa(cxlrd, p->res->start + offset);
-		hpa_offset = hpa - p->res->start;
-	} else {
-		hpa_offset = offset;
+		hpa_offset = cxlrd->ops.spa_to_hpa(cxlrd, p->res->start + offset);
+		if (hpa_offset == ULLONG_MAX) {
+			dev_dbg(&cxlr->dev, "HPA not found for %pr offset %#llx\n",
+				p->res, offset);
+			return -ENXIO;
+		}
+		hpa_offset -= p->res->start;
 	}
 
 	pos = cxl_calculate_position(hpa_offset, eiw, eig);
@@ -3207,8 +3220,13 @@ static int region_offset_to_dpa_result(struct cxl_region *cxlr, u64 offset,
 		cxled = p->targets[i];
 		if (cxled->pos != pos)
 			continue;
+
+		dpa = cxl_dpa_resource_start(cxled);
+		if (dpa != RESOURCE_SIZE_MAX)
+			dpa += dpa_offset;
+
 		result->cxlmd = cxled_to_memdev(cxled);
-		result->dpa = cxl_dpa_resource_start(cxled) + dpa_offset;
+		result->dpa = dpa;
 
 		return 0;
 	}
diff --git a/tools/testing/cxl/test/cxl_translate.c b/tools/testing/cxl/test/cxl_translate.c
index 2200ae21795c..16328b2112b2 100644
--- a/tools/testing/cxl/test/cxl_translate.c
+++ b/tools/testing/cxl/test/cxl_translate.c
@@ -68,6 +68,8 @@ static u64 to_hpa(u64 dpa_offset, int pos, u8 r_eiw, u16 r_eig, u8 hb_ways,
 
 	/* Calculate base HPA offset from DPA and position */
 	hpa_offset = cxl_calculate_hpa_offset(dpa_offset, pos, r_eiw, r_eig);
+	if (hpa_offset == ULLONG_MAX)
+		return ULLONG_MAX;
 
 	if (math == XOR_MATH) {
 		cximsd->nr_maps = hbiw_to_nr_maps[hb_ways];
@@ -258,19 +260,23 @@ static int test_random_params(void)
 		pos = get_random_u32() % ways;
 		dpa = get_random_u64() >> 12;
 
+		reverse_dpa = ULLONG_MAX;
+		reverse_pos = -1;
+
 		hpa = cxl_calculate_hpa_offset(dpa, pos, eiw, eig);
-		reverse_dpa = cxl_calculate_dpa_offset(hpa, eiw, eig);
-		reverse_pos = cxl_calculate_position(hpa, eiw, eig);
-
-		if (reverse_dpa != dpa || reverse_pos != pos) {
-			pr_err("test random iter %d FAIL hpa=%llu, dpa=%llu reverse_dpa=%llu, pos=%d reverse_pos=%d eiw=%u eig=%u\n",
-			       i, hpa, dpa, reverse_dpa, pos, reverse_pos, eiw,
-			       eig);
-
-			if (failures++ > 10) {
-				pr_err("test random too many failures, stop\n");
-				break;
-			}
+		if (hpa != ULLONG_MAX) {
+			reverse_dpa = cxl_calculate_dpa_offset(hpa, eiw, eig);
+			reverse_pos = cxl_calculate_position(hpa, eiw, eig);
+			if (reverse_dpa == dpa && reverse_pos == pos)
+				continue;
+		}
+
+		pr_err("test random iter %d FAIL hpa=%llu, dpa=%llu reverse_dpa=%llu, pos=%d reverse_pos=%d eiw=%u eig=%u\n",
+		       i, hpa, dpa, reverse_dpa, pos, reverse_pos, eiw, eig);
+
+		if (failures++ > 10) {
+			pr_err("test random too many failures, stop\n");
+			break;
 		}
 	}
 	pr_info("..... test random: PASS %d FAIL %d\n", i - failures, failures);
-- 
cgit v1.2.3


From 2465a08d433dd4ae0c4eecdb8e79c54b7c5e5a55 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yonghong.song@linux.dev>
Date: Mon, 12 Jan 2026 22:10:23 -0800
Subject: selftests/bpf: Fix dmabuf_iter/lots_of_buffers failure with 64K page

On arm64 with 64K page , I observed the following test failure:
  ...
  subtest_dmabuf_iter_check_lots_of_buffers:FAIL:total_bytes_read unexpected total_bytes_read:
      actual 4696 <= expected 65536
  #97/3    dmabuf_iter/lots_of_buffers:FAIL

With 4K page on x86, the total_bytes_read is 4593.
With 64K page on arm64, the total_byte_read is 4696.

In progs/dmabuf_iter.c, for each iteration, the output is
  BPF_SEQ_PRINTF(seq, "%lu\n%llu\n%s\n%s\n", inode, size, name, exporter);

The only difference between 4K and 64K page is 'size' in
the above BPF_SEQ_PRINTF. The 4K page will output '4096' and
the 64K page will output '65536'. So the total_bytes_read with 64K page
is slighter greater than 4K page.

Adjusting the total_bytes_read from 65536 to 4096 fixed the issue.

Cc: T.J. Mercier <tjmercier@google.com>
Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/r/20260113061023.3798085-1-yonghong.song@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/prog_tests/dmabuf_iter.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/dmabuf_iter.c b/tools/testing/selftests/bpf/prog_tests/dmabuf_iter.c
index e442be9dde7e..fb2cea710db3 100644
--- a/tools/testing/selftests/bpf/prog_tests/dmabuf_iter.c
+++ b/tools/testing/selftests/bpf/prog_tests/dmabuf_iter.c
@@ -233,7 +233,7 @@ static void subtest_dmabuf_iter_check_lots_of_buffers(struct dmabuf_iter *skel)
 	while ((bytes_read = read(iter_fd, buf, sizeof(buf))) > 0)
 		total_bytes_read += bytes_read;
 
-	ASSERT_GT(total_bytes_read, getpagesize(), "total_bytes_read");
+	ASSERT_GT(total_bytes_read, 4096, "total_bytes_read");
 
 	close(iter_fd);
 }
-- 
cgit v1.2.3


From d2f7cd20a7c7742b83d2c899044d4f2a851a4a7d Mon Sep 17 00:00:00 2001
From: Yonghong Song <yonghong.song@linux.dev>
Date: Mon, 12 Jan 2026 22:10:28 -0800
Subject: selftests/bpf: Fix sk_bypass_prot_mem failure with 64K page

The current selftest sk_bypass_prot_mem only supports 4K page.
When running with 64K page on arm64, the following failure happens:
  ...
  check_bypass:FAIL:no bypass unexpected no bypass: actual 3 <= expected 32
  ...
  #385/1   sk_bypass_prot_mem/TCP  :FAIL
  ...
  check_bypass:FAIL:no bypass unexpected no bypass: actual 4 <= expected 32
  ...
  #385/2   sk_bypass_prot_mem/UDP  :FAIL
  ...

Adding support to 64K page as well fixed the failure.

Cc: Kuniyuki Iwashima <kuniyu@google.com>
Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/r/20260113061028.3798326-1-yonghong.song@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/prog_tests/sk_bypass_prot_mem.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/sk_bypass_prot_mem.c b/tools/testing/selftests/bpf/prog_tests/sk_bypass_prot_mem.c
index e4940583924b..e2c867fd5244 100644
--- a/tools/testing/selftests/bpf/prog_tests/sk_bypass_prot_mem.c
+++ b/tools/testing/selftests/bpf/prog_tests/sk_bypass_prot_mem.c
@@ -5,9 +5,14 @@
 #include "sk_bypass_prot_mem.skel.h"
 #include "network_helpers.h"
 
+#ifndef PAGE_SIZE
+#include <unistd.h>
+#define PAGE_SIZE getpagesize()
+#endif
+
 #define NR_PAGES	32
 #define NR_SOCKETS	2
-#define BUF_TOTAL	(NR_PAGES * 4096 / NR_SOCKETS)
+#define BUF_TOTAL	(NR_PAGES * PAGE_SIZE / NR_SOCKETS)
 #define BUF_SINGLE	1024
 #define NR_SEND		(BUF_TOTAL / BUF_SINGLE)
 
-- 
cgit v1.2.3


From 951d79017e8a0af6db4a167a89746b4a0924626b Mon Sep 17 00:00:00 2001
From: Yonghong Song <yonghong.song@linux.dev>
Date: Mon, 12 Jan 2026 22:10:33 -0800
Subject: selftests/bpf: Fix verifier_arena_globals1 failure with 64K page

With 64K page on arm64, verifier_arena_globals1 failed like below:
  ...
  libbpf: map 'arena': failed to create: -E2BIG
  ...
  #509/1   verifier_arena_globals1/check_reserve1:FAIL
  ...

For 64K page, if the number of arena pages is (1UL << 20), the total
memory will exceed 4G and this will cause map creation failure.
Adjusting ARENA_PAGES based on the actual page size fixed the problem.

Cc: Emil Tsalapatis <emil@etsalapatis.com>
Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
Reviewed-by: Emil Tsalapatis <emil@etsalapatis.com>
Link: https://lore.kernel.org/r/20260113061033.3798549-1-yonghong.song@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/progs/verifier_arena_globals1.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/progs/verifier_arena_globals1.c b/tools/testing/selftests/bpf/progs/verifier_arena_globals1.c
index 14afef3d6442..83182ddbfb95 100644
--- a/tools/testing/selftests/bpf/progs/verifier_arena_globals1.c
+++ b/tools/testing/selftests/bpf/progs/verifier_arena_globals1.c
@@ -9,7 +9,7 @@
 #include "bpf_arena_common.h"
 #include "bpf_misc.h"
 
-#define ARENA_PAGES (1UL<< (32 - 12))
+#define ARENA_PAGES (1UL<< (32 - __builtin_ffs(__PAGE_SIZE) + 1))
 #define GLOBAL_PAGES (16)
 
 struct {
-- 
cgit v1.2.3


From 9160335317cb404f54ad2f509546c666ddd4d0eb Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Mon, 12 Jan 2026 12:13:58 -0800
Subject: selftests/bpf: Add tests for s>>=31 and s>>=63

Add tests for special arithmetic shift right.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Co-developed-by: Puranjay Mohan <puranjay@kernel.org>
Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
Link: https://lore.kernel.org/r/20260112201424.816836-3-puranjay@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 .../testing/selftests/bpf/progs/verifier_subreg.c  | 85 ++++++++++++++++++++++
 1 file changed, 85 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/progs/verifier_subreg.c b/tools/testing/selftests/bpf/progs/verifier_subreg.c
index b3e1c3eef9ae..be328100ba53 100644
--- a/tools/testing/selftests/bpf/progs/verifier_subreg.c
+++ b/tools/testing/selftests/bpf/progs/verifier_subreg.c
@@ -738,4 +738,89 @@ __naked void ldx_w_zero_extend_check(void)
 	: __clobber_all);
 }
 
+SEC("socket")
+__success __success_unpriv __retval(0)
+__naked void arsh_31_and(void)
+{
+	/* Below is what LLVM generates in cilium's bpf_wiregard.o */
+	asm volatile ("					\
+	call %[bpf_get_prandom_u32];			\
+	w2 = w0;					\
+	w2 s>>= 31;					\
+	w2 &= -134; /* w2 becomes 0 or -134 */		\
+	if w2 s> -1 goto +2;				\
+	/* Branch always taken because w2 = -134 */	\
+	if w2 != -136 goto +1;				\
+	w0 /= 0;					\
+	w0 = 0;						\
+	exit;						\
+"	:
+	: __imm(bpf_get_prandom_u32)
+	: __clobber_all);
+}
+
+SEC("socket")
+__success __success_unpriv __retval(0)
+__naked void arsh_63_and(void)
+{
+	/* Copy of arsh_31 with s/w/r/ */
+	asm volatile ("					\
+	call %[bpf_get_prandom_u32];			\
+	r2 = r0;					\
+	r2 <<= 32;					\
+	r2 s>>= 63;					\
+	r2 &= -134;					\
+	if r2 s> -1 goto +2;				\
+	/* Branch always taken because w2 = -134 */	\
+	if r2 != -136 goto +1;				\
+	r0 /= 0;					\
+	r0 = 0;						\
+	exit;						\
+"	:
+	: __imm(bpf_get_prandom_u32)
+	: __clobber_all);
+}
+
+SEC("socket")
+__success __success_unpriv __retval(0)
+__naked void arsh_31_or(void)
+{
+	asm volatile ("					\
+	call %[bpf_get_prandom_u32];			\
+	w2 = w0;					\
+	w2 s>>= 31;					\
+	w2 |= 134; /* w2 becomes -1 or 134 */		\
+	if w2 s> -1 goto +2;				\
+	/* Branch always taken because w2 = -1 */	\
+	if w2 == -1 goto +1;				\
+	w0 /= 0;					\
+	w0 = 0;						\
+	exit;						\
+"	:
+	: __imm(bpf_get_prandom_u32)
+	: __clobber_all);
+}
+
+SEC("socket")
+__success __success_unpriv __retval(0)
+__naked void arsh_63_or(void)
+{
+	/* Copy of arsh_31 with s/w/r/ */
+	asm volatile ("					\
+	call %[bpf_get_prandom_u32];			\
+	r2 = r0;					\
+	r2 <<= 32;					\
+	r2 s>>= 63;					\
+	r2 |= 134; /* r2 becomes -1 or 134 */		\
+	if r2 s> -1 goto +2;				\
+	/* Branch always taken because w2 = -1 */	\
+	if r2 == -1 goto +1;				\
+	r0 /= 0;					\
+	r0 = 0;						\
+	exit;						\
+"	:
+	: __imm(bpf_get_prandom_u32)
+	: __clobber_all);
+}
+
 char _license[] SEC("license") = "GPL";
-- 
cgit v1.2.3


From a2297e74a07d21eb498d8549ae6fddc35cf26ec6 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Sat, 10 Jan 2026 20:13:35 -0800
Subject: perf srcline: Add configuration support for the addr2line style

Allow the addr2line style to be specified on the `perf report` command
line or in the .perfconfig file.

Committer testing:

The methods:

  # perf probe -x ~/bin/perf -F *__addr2line
  cmd__addr2line
  libbfd__addr2line
  libdw__addr2line
  llvm__addr2line
  #

So if we configure one of them, say 'addr2line':

  # perf config addr2line.style=addr2line
  # perf config addr2line.style
  addr2line.style=addr2line
  #

And have probes on all of them:

  # perf probe -x ~/bin/perf *__addr2line
  Added new events:
    probe_perf:cmd__addr2line (on *__addr2line in /home/acme/bin/perf)
    probe_perf:llvm__addr2line (on *__addr2line in /home/acme/bin/perf)
    probe_perf:libbfd__addr2line (on *__addr2line in /home/acme/bin/perf)
    probe_perf:libdw__addr2line (on *__addr2line in /home/acme/bin/perf)

  You can now use it in all perf tools, such as:

  	perf record -e probe_perf:libdw__addr2line -aR sleep 1

  #

Only the selected method should be used:

  # perf stat -e probe_perf:*_addr2line perf report -f --dso perf --stdio -s srcfile,srcline
  # Total Lost Samples: 0
  #
  # Samples: 4K of event 'cpu/cycles/Pu'
  # Event count (approx.): 5535180842
  #
  # Overhead  Source File   Source:Line
  # ........  ............  ...............
  #
      99.04%  inlineloop.c  inlineloop.c:21
       0.46%  inlineloop.c  inlineloop.c:20

  #
  # (Tip: For hierarchical output, try: perf report --hierarchy)
  #

   Performance counter stats for 'perf report -f --dso perf --stdio -s srcfile,srcline':

                  44      probe_perf:cmd__addr2line
                   0      probe_perf:llvm__addr2line
                   0      probe_perf:libbfd__addr2line
                   0      probe_perf:libdw__addr2line

         0.035915611 seconds time elapsed

         0.028008000 seconds user
         0.009051000 seconds sys
  #

I checked and that is the case for the other methods.

Also when using:

  # perf config addr2line.style=libdw,llvm

 Performance counter stats for 'perf report -f --dso perf --stdio -s srcfile,srcline':

                 0      probe_perf:cmd__addr2line
                23      probe_perf:llvm__addr2line
                 0      probe_perf:libbfd__addr2line
                44      probe_perf:libdw__addr2line

Reviewed-by: James Clark <james.clark@linaro.org>
Signed-off-by: Ian Rogers <irogers@google.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Howard Chu <howardchu95@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephen Brennan <stephen.s.brennan@oracle.com>
Cc: Tony Jones <tonyj@suse.de>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-report.c   | 10 +++++
 tools/perf/util/config.c      |  4 ++
 tools/perf/util/srcline.c     | 98 ++++++++++++++++++++++++++++++++++++++-----
 tools/perf/util/srcline.h     |  2 +
 tools/perf/util/symbol_conf.h | 10 +++++
 5 files changed, 113 insertions(+), 11 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 6c2b4f93ec78..2e936928e8c0 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -1271,6 +1271,13 @@ parse_percent_limit(const struct option *opt, const char *str,
 	return 0;
 }
 
+static int
+report_parse_addr2line_config(const struct option *opt __maybe_unused,
+			      const char *arg, int unset __maybe_unused)
+{
+	return addr2line_configure("addr2line.style", arg, NULL);
+}
+
 static int process_attr(const struct perf_tool *tool __maybe_unused,
 			union perf_event *event,
 			struct evlist **pevlist)
@@ -1447,6 +1454,9 @@ int cmd_report(int argc, const char **argv)
 		   "objdump binary to use for disassembly and annotations"),
 	OPT_STRING(0, "addr2line", &addr2line_path, "path",
 		   "addr2line binary to use for line numbers"),
+	OPT_CALLBACK(0, "addr2line-style", NULL, "addr2line style",
+		     "addr2line styles (libdw,llvm,libbfd,addr2line)",
+		     report_parse_addr2line_config),
 	OPT_BOOLEAN(0, "demangle", &symbol_conf.demangle,
 		    "Symbol demangling. Enabled by default, use --no-demangle to disable."),
 	OPT_BOOLEAN(0, "demangle-kernel", &symbol_conf.demangle_kernel,
diff --git a/tools/perf/util/config.c b/tools/perf/util/config.c
index e0219bc6330a..0452fbc6c085 100644
--- a/tools/perf/util/config.c
+++ b/tools/perf/util/config.c
@@ -20,6 +20,7 @@
 #include "util/stat.h"  /* perf_stat__set_big_num */
 #include "util/evsel.h"  /* evsel__hw_names, evsel__use_bpf_counters */
 #include "util/addr2line.h"  /* addr2line_timeout_ms */
+#include "srcline.h"
 #include "build-id.h"
 #include "debug.h"
 #include "config.h"
@@ -519,6 +520,9 @@ int perf_default_config(const char *var, const char *value,
 	if (strstarts(var, "stat."))
 		return perf_stat_config(var, value);
 
+	if (strstarts(var, "addr2line."))
+		return addr2line_configure(var, value, dummy);
+
 	/* Add other config variables here. */
 	return 0;
 }
diff --git a/tools/perf/util/srcline.c b/tools/perf/util/srcline.c
index e2d280678b02..28fa1abd1fd3 100644
--- a/tools/perf/util/srcline.c
+++ b/tools/perf/util/srcline.c
@@ -7,9 +7,11 @@
 #include "llvm.h"
 #include "symbol.h"
 #include "libdw.h"
+#include "debug.h"
 
 #include <inttypes.h>
 #include <string.h>
+#include <linux/string.h>
 
 bool srcline_full_filename;
 
@@ -138,21 +140,95 @@ static int addr2line(const char *dso_name, u64 addr, char **file, unsigned int *
 		     struct dso *dso, bool unwind_inlines, struct inline_node *node,
 		     struct symbol *sym)
 {
-	int ret;
+	int ret = 0;
+
+	if (symbol_conf.addr2line_style[0] == A2L_STYLE_UNKNOWN) {
+		int i = 0;
+
+		/* Default addr2line fallback order. */
+#ifdef HAVE_LIBDW_SUPPORT
+		symbol_conf.addr2line_style[i++] = A2L_STYLE_LIBDW;
+#endif
+#ifdef HAVE_LIBLLVM_SUPPORT
+		symbol_conf.addr2line_style[i++] = A2L_STYLE_LLVM;
+#endif
+#ifdef HAVE_LIBBFD_SUPPORT
+		symbol_conf.addr2line_style[i++] = A2L_STYLE_LIBBFD;
+#endif
+		symbol_conf.addr2line_style[i++] = A2L_STYLE_CMD;
+	}
+
+	for (size_t i = 0; i < ARRAY_SIZE(symbol_conf.addr2line_style); i++) {
+		switch (symbol_conf.addr2line_style[i]) {
+		case A2L_STYLE_LIBDW:
+			ret = libdw__addr2line(dso_name, addr, file, line_nr, dso, unwind_inlines,
+					       node, sym);
+			break;
+		case A2L_STYLE_LLVM:
+			ret = llvm__addr2line(dso_name, addr, file, line_nr, dso, unwind_inlines,
+					      node, sym);
+			break;
+		case A2L_STYLE_LIBBFD:
+			ret = libbfd__addr2line(dso_name, addr, file, line_nr, dso, unwind_inlines,
+						node, sym);
+			break;
+		case A2L_STYLE_CMD:
+			ret = cmd__addr2line(dso_name, addr, file, line_nr, dso, unwind_inlines,
+					     node, sym);
+			break;
+		case A2L_STYLE_UNKNOWN:
+		default:
+			break;
+		}
+		if (ret > 0)
+			return ret;
+	}
+
+	return 0;
+}
+
+int addr2line_configure(const char *var, const char *value, void *cb __maybe_unused)
+{
+	static const char * const a2l_style_names[] = {
+		[A2L_STYLE_LIBDW] = "libdw",
+		[A2L_STYLE_LLVM] = "llvm",
+		[A2L_STYLE_LIBBFD] = "libbfd",
+		[A2L_STYLE_CMD] = "addr2line",
+		NULL
+	};
+
+	char *s, *p, *saveptr;
+	size_t i = 0;
 
-	ret = libdw__addr2line(dso_name, addr, file, line_nr, dso, unwind_inlines, node, sym);
-	if (ret > 0)
-		return ret;
+	if (strcmp(var, "addr2line.style"))
+		return 0;
+
+	if (!value)
+		return -1;
 
-	ret = llvm__addr2line(dso_name, addr, file, line_nr, dso, unwind_inlines, node, sym);
-	if (ret > 0)
-		return ret;
+	s = strdup(value);
+	if (!s)
+		return -1;
 
-	ret = libbfd__addr2line(dso_name, addr, file, line_nr, dso, unwind_inlines, node, sym);
-	if (ret > 0)
-		return ret;
+	p = strtok_r(s, ",", &saveptr);
+	while (p && i < ARRAY_SIZE(symbol_conf.addr2line_style)) {
+		bool found = false;
+		char *q = strim(p);
+
+		for (size_t j = A2L_STYLE_LIBDW; j < MAX_A2L_STYLE; j++) {
+			if (!strcasecmp(q, a2l_style_names[j])) {
+				symbol_conf.addr2line_style[i++] = j;
+				found = true;
+				break;
+			}
+		}
+		if (!found)
+			pr_warning("Unknown addr2line style: %s\n", q);
+		p = strtok_r(NULL, ",", &saveptr);
+	}
 
-	return cmd__addr2line(dso_name, addr, file, line_nr, dso, unwind_inlines, node, sym);
+	free(s);
+	return 0;
 }
 
 static struct inline_node *addr2inlines(const char *dso_name, u64 addr,
diff --git a/tools/perf/util/srcline.h b/tools/perf/util/srcline.h
index be9f002bf234..7c37b3bf9ce7 100644
--- a/tools/perf/util/srcline.h
+++ b/tools/perf/util/srcline.h
@@ -63,4 +63,6 @@ struct symbol *new_inline_sym(struct dso *dso,
 			      struct symbol *base_sym,
 			      const char *funcname);
 
+int addr2line_configure(const char *var, const char *value, void *cb);
+
 #endif /* PERF_SRCLINE_H */
diff --git a/tools/perf/util/symbol_conf.h b/tools/perf/util/symbol_conf.h
index 7a80d2c14d9b..71bb17372a6c 100644
--- a/tools/perf/util/symbol_conf.h
+++ b/tools/perf/util/symbol_conf.h
@@ -9,6 +9,15 @@
 struct strlist;
 struct intlist;
 
+enum a2l_style {
+	A2L_STYLE_UNKNOWN = 0,
+	A2L_STYLE_LIBDW,
+	A2L_STYLE_LLVM,
+	A2L_STYLE_LIBBFD,
+	A2L_STYLE_CMD,
+};
+#define MAX_A2L_STYLE (A2L_STYLE_CMD + 1)
+
 struct symbol_conf {
 	bool		nanosecs;
 	unsigned short	priv_size;
@@ -70,6 +79,7 @@ struct symbol_conf {
 			*col_width_list_str,
 			*bt_stop_list_str;
 	const char		*addr2line_path;
+	enum a2l_style	addr2line_style[MAX_A2L_STYLE];
 	unsigned long	time_quantum;
        struct strlist	*dso_list,
 			*comm_list,
-- 
cgit v1.2.3


From abec464767b5d26f0612250d511c18f420826ca1 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Sat, 10 Jan 2026 20:13:36 -0800
Subject: perf callchain: Fix srcline printing with inlines

sample__fprintf_callchain() was using map__fprintf_srcline() which won't
report inline line numbers.

Fix by using the srcline from the callchain and falling back to the map
variant.

Fixes: 25da4fab5f66e659 ("perf evsel: Move fprintf methods to separate source file")
Reviewed-by: James Clark <james.clark@linaro.org>
Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Howard Chu <howardchu95@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephen Brennan <stephen.s.brennan@oracle.com>
Cc: Tony Jones <tonyj@suse.de>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/evsel_fprintf.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/evsel_fprintf.c b/tools/perf/util/evsel_fprintf.c
index 10f1a03c2860..5521d00bff2c 100644
--- a/tools/perf/util/evsel_fprintf.c
+++ b/tools/perf/util/evsel_fprintf.c
@@ -185,8 +185,12 @@ int sample__fprintf_callchain(struct perf_sample *sample, int left_alignment,
 			if (print_dso && (!sym || !sym->inlined))
 				printed += map__fprintf_dsoname_dsoff(map, print_dsoff, addr, fp);
 
-			if (print_srcline)
-				printed += map__fprintf_srcline(map, addr, "\n  ", fp);
+			if (print_srcline) {
+				if (node->srcline)
+					printed += fprintf(fp, "\n  %s", node->srcline);
+				else
+					printed += map__fprintf_srcline(map, addr, "\n  ", fp);
+			}
 
 			if (sym && sym->inlined)
 				printed += fprintf(fp, " (inlined)");
-- 
cgit v1.2.3


From 54a23bff770961e024e2c61cd1f46888190c3e79 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Sat, 10 Jan 2026 20:13:38 -0800
Subject: perf test: Test addr2line unwinding works with inline functions

Add a test that seeks to see inline functions correctly displayed in
'perf script' from the inlineloop workload.

Committer testing:

  # perf test 'addr2line inline unwinding'
   76: test addr2line inline unwinding                                 : Ok
  # perf test -vv 'addr2line inline unwinding'
   76: test addr2line inline unwinding:
  --- start ---
  test child forked, pid 1508628
  Inline unwinding verification test
  [ perf record: Woken up 129 times to write data ]
  [ perf record: Captured and wrote 32.282 MB /tmp/perf-test-inline-addr2line.L4Sz8QtADJ/perf.data (4014 samples) ]
  Inline unwinding verification test [Success]
  ---- end(0) ----
   76: test addr2line inline unwinding                                 : Ok
  #

Reviewed-by: James Clark <james.clark@linaro.org>
Signed-off-by: Ian Rogers <irogers@google.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Howard Chu <howardchu95@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephen Brennan <stephen.s.brennan@oracle.com>
Cc: Tony Jones <tonyj@suse.de>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/tests/shell/addr2line_inlines.sh | 47 +++++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)
 create mode 100755 tools/perf/tests/shell/addr2line_inlines.sh

(limited to 'tools')

diff --git a/tools/perf/tests/shell/addr2line_inlines.sh b/tools/perf/tests/shell/addr2line_inlines.sh
new file mode 100755
index 000000000000..4a5b6f5be23d
--- /dev/null
+++ b/tools/perf/tests/shell/addr2line_inlines.sh
@@ -0,0 +1,47 @@
+#!/bin/bash
+# test addr2line inline unwinding
+# SPDX-License-Identifier: GPL-2.0
+
+set -e
+
+err=0
+test_dir=$(mktemp -d /tmp/perf-test-inline-addr2line.XXXXXXXXXX)
+perf_data="${test_dir}/perf.data"
+perf_script_txt="${test_dir}/perf_script.txt"
+
+cleanup() {
+    rm -rf "${test_dir}"
+    trap - EXIT TERM INT
+}
+
+trap_cleanup() {
+    echo "Unexpected signal in ${FUNCNAME[1]}"
+    cleanup
+    exit 1
+}
+trap trap_cleanup EXIT TERM INT
+
+test_inlinedloop() {
+    echo "Inline unwinding verification test"
+    # Record data. Currently only dwarf callchains support inlined functions.
+    perf record --call-graph dwarf -e task-clock:u -o "${perf_data}" -- perf test -w inlineloop 1
+
+    # Check output with inline (default) and srcline
+    perf script -i "${perf_data}" --fields +srcline > "${perf_script_txt}"
+
+    # Expect the leaf and middle functions to occur on lines in the 20s, with
+    # the non-inlined parent function on a line in the 30s.
+    if grep -q "inlineloop.c:2. (inlined)" "${perf_script_txt}" &&
+       grep -q "inlineloop.c:3.$" "${perf_script_txt}"
+    then
+        echo "Inline unwinding verification test [Success]"
+    else
+        echo "Inline unwinding verification test [Failed missing inlined functions]"
+        err=1
+    fi
+}
+
+test_inlinedloop
+
+cleanup
+exit $err
-- 
cgit v1.2.3


From b6ee9b6e206b288921c14c906eebf4b32fe0c0d8 Mon Sep 17 00:00:00 2001
From: Sri Jayaramappa <sjayaram@akamai.com>
Date: Tue, 2 Dec 2025 16:36:32 -0500
Subject: libsubcmd: Fix null intersection case in exclude_cmds()

When there is no exclusion occurring from the cmds list - for example -
cmds contains ["read-vdso32"] and excludes contains ["archive"] - the
main loop completes with ci == cj == 0. In the original code the loop
processing the remaining elements in the list was conditional:

    if (ci != cj) { ...}

So we end up in the assertion loop since ci < cmds->cnt and we
incorrectly try to assert the list elements to be NULL and fail with
the following error

   help.c:104: exclude_cmds: Assertion `cmds->names[ci] == NULL' failed.

Fix this by moving the if (ci != cj) check inside of a broader loop.
If ci != cj, left shift the list elements, as before, and then
unconditionally advance the ci and cj indicies which also covers the
ci == cj case.

Fixes: 1fdf938168c4d26f ("perf tools: Fix use-after-free in help_unknown_cmd()")
Reviewed-by: Guilherme Amadio <amadio@gentoo.org>
Signed-off-by: Sri Jayaramappa <sjayaram@akamai.com>
Tested-by: Guilherme Amadio <amadio@gentoo.org>
Tested-by: Ian Rogers <irogers@google.com>
Cc: Joshua Hunt <johunt@akamai.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: https://lore.kernel.org/r/20251202213632.2873731-1-sjayaram@akamai.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/lib/subcmd/help.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/subcmd/help.c b/tools/lib/subcmd/help.c
index ddaeb4eb3e24..db94aa685b73 100644
--- a/tools/lib/subcmd/help.c
+++ b/tools/lib/subcmd/help.c
@@ -97,11 +97,13 @@ void exclude_cmds(struct cmdnames *cmds, struct cmdnames *excludes)
 			ei++;
 		}
 	}
-	if (ci != cj) {
-		while (ci < cmds->cnt) {
-			cmds->names[cj++] = cmds->names[ci];
-			cmds->names[ci++] = NULL;
+	while (ci < cmds->cnt) {
+		if (ci != cj) {
+			cmds->names[cj] = cmds->names[ci];
+			cmds->names[ci] = NULL;
 		}
+		ci++;
+		cj++;
 	}
 	for (ci = cj; ci < cmds->cnt; ci++)
 		assert(cmds->names[ci] == NULL);
-- 
cgit v1.2.3


From c65182ef9df6bb96fd85b56a2bcdd18d64c4d3b5 Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Mon, 12 Jan 2026 11:33:39 -0500
Subject: selftests: net: reduce txtimestamp deschedule flakes

This test occasionally fails due to exceeding timing bounds, as
run in continuous testing on netdev.bots:

  https://netdev.bots.linux.dev/contest.html?test=txtimestamp-sh

A common pattern is a single elevated delay between USR and SND.

    # 8.36 [+0.00] test SND
    # 8.36 [+0.00]     USR: 1767864384 s 240994 us (seq=0, len=0)
    # 8.44 [+0.08] ERROR: 18461 us expected between 10000 and 18000
    # 8.44 [+0.00]     SND: 1767864384 s 259455 us (seq=42, len=10)  (USR +18460 us)
    # 8.52 [+0.07]     SND: 1767864384 s 339523 us (seq=42, len=10)  (USR +10005 us)
    # 8.52 [+0.00]     USR: 1767864384 s 409580 us (seq=0, len=0)
    # 8.60 [+0.08]     SND: 1767864384 s 419586 us (seq=42, len=10)  (USR +10005 us)
    # 8.60 [+0.00]     USR: 1767864384 s 489645 us (seq=0, len=0)
    # 8.68 [+0.08]     SND: 1767864384 s 499651 us (seq=42, len=10)  (USR +10005 us)
    # 8.68 [+0.00]     USR-SND: count=4, avg=12119 us, min=10005 us, max=18460 us

(Note that other delays are nowhere near the large 8ms tolerance.)

One hypothesis is that the task is descheduled between taking the USR
timestamp and sending the packet. Possibly in printing.

Delay taking the timestamp closer to sendmsg, and delay printing until
after sendmsg.

With this change, failure rate is significantly lower in current runs.

Link: https://lore.kernel.org/netdev/20260107110521.1aab55e9@kernel.org/
Suggested-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Willem de Bruijn <willemb@google.com>
Link: https://patch.msgid.link/20260112163355.3510150-1-willemdebruijn.kernel@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/txtimestamp.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/txtimestamp.c b/tools/testing/selftests/net/txtimestamp.c
index bcc14688661d..170be192f5c7 100644
--- a/tools/testing/selftests/net/txtimestamp.c
+++ b/tools/testing/selftests/net/txtimestamp.c
@@ -206,12 +206,10 @@ static void __print_timestamp(const char *name, struct timespec *cur,
 	fprintf(stderr, "\n");
 }
 
-static void print_timestamp_usr(void)
+static void record_timestamp_usr(void)
 {
 	if (clock_gettime(CLOCK_REALTIME, &ts_usr))
 		error(1, errno, "clock_gettime");
-
-	__print_timestamp("  USR", &ts_usr, 0, 0);
 }
 
 static void print_timestamp(struct scm_timestamping *tss, int tstype,
@@ -599,8 +597,6 @@ static void do_test(int family, unsigned int report_opt)
 			fill_header_udp(buf + off, family == PF_INET);
 		}
 
-		print_timestamp_usr();
-
 		iov.iov_base = buf;
 		iov.iov_len = total_len;
 
@@ -655,10 +651,14 @@ static void do_test(int family, unsigned int report_opt)
 
 		}
 
+		record_timestamp_usr();
+
 		val = sendmsg(fd, &msg, 0);
 		if (val != total_len)
 			error(1, errno, "send");
 
+		__print_timestamp("  USR", &ts_usr, 0, 0);
+
 		/* wait for all errors to be queued, else ACKs arrive OOO */
 		if (cfg_sleep_usec)
 			usleep(cfg_sleep_usec);
-- 
cgit v1.2.3


From fa5726692e4ca0d4e56d7cbd1b33126efd3f849e Mon Sep 17 00:00:00 2001
From: Donald Hunter <donald.hunter@gmail.com>
Date: Mon, 12 Jan 2026 15:34:36 +0000
Subject: tools: ynl: render event op docs correctly
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The docs for YNL event ops currently render raw python structs. For
example in:

https://docs.kernel.org/netlink/specs/ethtool.html#cable-test-ntf

  event: {‘attributes’: [‘header’, ‘status’, ‘nest’], ‘__lineno__’: 2385}

Handle event ops correctly and render their op attributes:

  event: attributes: [header, status]

Signed-off-by: Donald Hunter <donald.hunter@gmail.com>
Link: https://patch.msgid.link/20260112153436.75495-1-donald.hunter@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/net/ynl/pyynl/lib/doc_generator.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/net/ynl/pyynl/lib/doc_generator.py b/tools/net/ynl/pyynl/lib/doc_generator.py
index 3a16b8eb01ca..8b922d8f89e8 100644
--- a/tools/net/ynl/pyynl/lib/doc_generator.py
+++ b/tools/net/ynl/pyynl/lib/doc_generator.py
@@ -166,13 +166,13 @@ class YnlDocGenerator:
                 continue
             lines.append(self.fmt.rst_paragraph(self.fmt.bold(key), level + 1))
             if key in ['request', 'reply']:
-                lines.append(self.parse_do_attributes(do_dict[key], level + 1) + "\n")
+                lines.append(self.parse_op_attributes(do_dict[key], level + 1) + "\n")
             else:
                 lines.append(self.fmt.headroom(level + 2) + do_dict[key] + "\n")
 
         return "\n".join(lines)
 
-    def parse_do_attributes(self, attrs: Dict[str, Any], level: int = 0) -> str:
+    def parse_op_attributes(self, attrs: Dict[str, Any], level: int = 0) -> str:
         """Parse 'attributes' section"""
         if "attributes" not in attrs:
             return ""
@@ -184,7 +184,7 @@ class YnlDocGenerator:
 
     def parse_operations(self, operations: List[Dict[str, Any]], namespace: str) -> str:
         """Parse operations block"""
-        preprocessed = ["name", "doc", "title", "do", "dump", "flags"]
+        preprocessed = ["name", "doc", "title", "do", "dump", "flags", "event"]
         linkable = ["fixed-header", "attribute-set"]
         lines = []
 
@@ -217,6 +217,9 @@ class YnlDocGenerator:
             if "dump" in operation:
                 lines.append(self.fmt.rst_paragraph(":dump:", 0))
                 lines.append(self.parse_do(operation["dump"], 0))
+            if "event" in operation:
+                lines.append(self.fmt.rst_paragraph(":event:", 0))
+                lines.append(self.parse_op_attributes(operation["event"], 0))
 
             # New line after fields
             lines.append("\n")
-- 
cgit v1.2.3


From 8d3b6649499edd85e88b763e77bbce2ab016eb47 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@kernel.org>
Date: Mon, 8 Dec 2025 17:57:27 -0800
Subject: perf util: Add BLAKE2s support

Add BLAKE2s support to the perf utility library.  The code is borrowed
from the kernel.  This will replace the use of SHA-1 in genelf.c.

Signed-off-by: Eric Biggers <ebiggers@kernel.org>
Tested-by: Ian Rogers <irogers@google.com>
Acked-by: Namhyung Kim <namhyung@kernel.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Fangrui Song <maskray@sourceware.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jason A. Donenfeld <Jason@zx2c4.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Pablo Galindo <pablogsal@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/tests/util.c   |  78 +++++++++++++++++++++++-
 tools/perf/util/Build     |   1 +
 tools/perf/util/blake2s.c | 151 ++++++++++++++++++++++++++++++++++++++++++++++
 tools/perf/util/blake2s.h |  73 ++++++++++++++++++++++
 4 files changed, 302 insertions(+), 1 deletion(-)
 create mode 100644 tools/perf/util/blake2s.c
 create mode 100644 tools/perf/util/blake2s.h

(limited to 'tools')

diff --git a/tools/perf/tests/util.c b/tools/perf/tests/util.c
index b273d287e164..efc3e4e4c6fa 100644
--- a/tools/perf/tests/util.c
+++ b/tools/perf/tests/util.c
@@ -1,5 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 #include "tests.h"
+#include "util/blake2s.h"
 #include "util/debug.h"
 #include "util/sha1.h"
 
@@ -59,8 +60,79 @@ static int test_sha1(void)
 	return 0;
 }
 
+/* Maximum data length tested by test_blake2s() */
+#define MAX_DATA_LEN 512
+
+/*
+ * Hash length tested by test_blake2s().  BLAKE2s supports variable-length
+ * hashes.  However, the only user of BLAKE2s in 'perf' uses 20-byte hashes,
+ * matching the length of the ELF build ID field.  So that's the length we test.
+ */
+#define HASH_LEN 20
+
+/* Test the implementation of the BLAKE2s hash algorithm. */
+static int test_blake2s(void)
+{
+	u8 data[MAX_DATA_LEN];
+	u8 hash[HASH_LEN];
+	u8 hash2[HASH_LEN];
+	struct blake2s_ctx main_ctx;
+	/*
+	 * This value was generated by the following Python code:
+	 *
+	 * import hashlib
+	 *
+	 * data = bytes(i % 256 for i in range(513))
+	 * h = hashlib.blake2s(digest_size=20)
+	 * for i in range(513):
+	 *     h.update(hashlib.blake2s(data=data[:i], digest_size=20).digest())
+	 * print(h.hexdigest())
+	 */
+	static const u8 expected_hash_of_hashes[20] = {
+		0xef, 0x9b, 0x13, 0x98, 0x78, 0x8e, 0x74, 0x59, 0x9c, 0xd5,
+		0x0c, 0xf0, 0x33, 0x97, 0x79, 0x3d, 0x3e, 0xd0, 0x95, 0xa6
+	};
+	size_t i;
+
+	/* Generate MAX_DATA_LEN bytes of data. */
+	for (i = 0; i < MAX_DATA_LEN; i++)
+		data[i] = i;
+
+	blake2s_init(&main_ctx, sizeof(hash));
+	for (i = 0; i <= MAX_DATA_LEN; i++) {
+		struct blake2s_ctx ctx;
+
+		/* Compute the BLAKE2s hash of 'i' data bytes. */
+		blake2s_init(&ctx, HASH_LEN);
+		blake2s_update(&ctx, data, i);
+		blake2s_final(&ctx, hash);
+
+		/* Verify that multiple updates produce the same result. */
+		blake2s_init(&ctx, HASH_LEN);
+		blake2s_update(&ctx, data, i / 2);
+		blake2s_update(&ctx, &data[i / 2], i - (i / 2));
+		blake2s_final(&ctx, hash2);
+		TEST_ASSERT_VAL("inconsistent BLAKE2s hashes",
+				memcmp(hash, hash2, HASH_LEN) == 0);
+
+		/*
+		 * Pass the hash to another BLAKE2s context, so that we
+		 * incrementally compute the hash of all the hashes.
+		 */
+		blake2s_update(&main_ctx, hash, HASH_LEN);
+	}
+
+	/* Verify the hash of all the hashes. */
+	blake2s_final(&main_ctx, hash);
+	TEST_ASSERT_VAL("wrong BLAKE2s hashes",
+			memcmp(hash, expected_hash_of_hashes, HASH_LEN) == 0);
+	return 0;
+}
+
 static int test__util(struct test_suite *t __maybe_unused, int subtest __maybe_unused)
 {
+	int ret;
+
 	TEST_ASSERT_VAL("empty string", test_strreplace(' ', "", "123", ""));
 	TEST_ASSERT_VAL("no match", test_strreplace('5', "123", "4", "123"));
 	TEST_ASSERT_VAL("replace 1", test_strreplace('3', "123", "4", "124"));
@@ -68,7 +140,11 @@ static int test__util(struct test_suite *t __maybe_unused, int subtest __maybe_u
 	TEST_ASSERT_VAL("replace long", test_strreplace('a', "abcabc", "longlong",
 							"longlongbclonglongbc"));
 
-	return test_sha1();
+	ret = test_sha1();
+	if (ret != TEST_OK)
+		return ret;
+
+	return test_blake2s();
 }
 
 DEFINE_SUITE("util", util);
diff --git a/tools/perf/util/Build b/tools/perf/util/Build
index 2bed6274e248..0c1cfcbed815 100644
--- a/tools/perf/util/Build
+++ b/tools/perf/util/Build
@@ -5,6 +5,7 @@ perf-util-y += arm64-frame-pointer-unwind-support.o
 perf-util-y += addr2line.o
 perf-util-y += addr_location.o
 perf-util-y += annotate.o
+perf-util-y += blake2s.o
 perf-util-y += block-info.o
 perf-util-y += block-range.o
 perf-util-y += build-id.o
diff --git a/tools/perf/util/blake2s.c b/tools/perf/util/blake2s.c
new file mode 100644
index 000000000000..ce5d89a19376
--- /dev/null
+++ b/tools/perf/util/blake2s.c
@@ -0,0 +1,151 @@
+// SPDX-License-Identifier: GPL-2.0 OR MIT
+/*
+ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ *
+ * This is an implementation of the BLAKE2s hash and PRF functions.
+ *
+ * Information: https://blake2.net/
+ */
+
+#include "blake2s.h"
+#include <linux/kernel.h>
+
+static inline u32 ror32(u32 v, int n)
+{
+	return (v >> n) | (v << (32 - n));
+}
+
+static inline void le32_to_cpu_array(u32 a[], size_t n)
+{
+	for (size_t i = 0; i < n; i++)
+		a[i] = le32_to_cpu((__force __le32)a[i]);
+}
+
+static inline void cpu_to_le32_array(u32 a[], size_t n)
+{
+	for (size_t i = 0; i < n; i++)
+		a[i] = (__force u32)cpu_to_le32(a[i]);
+}
+
+static const u8 blake2s_sigma[10][16] = {
+	{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+	{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
+	{ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
+	{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
+	{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
+	{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
+	{ 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
+	{ 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
+	{ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
+	{ 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
+};
+
+static inline void blake2s_increment_counter(struct blake2s_ctx *ctx, u32 inc)
+{
+	ctx->t[0] += inc;
+	ctx->t[1] += (ctx->t[0] < inc);
+}
+
+static void blake2s_compress(struct blake2s_ctx *ctx,
+			     const u8 *data, size_t nblocks, u32 inc)
+{
+	u32 m[16];
+	u32 v[16];
+	int i;
+
+	while (nblocks > 0) {
+		blake2s_increment_counter(ctx, inc);
+		memcpy(m, data, BLAKE2S_BLOCK_SIZE);
+		le32_to_cpu_array(m, ARRAY_SIZE(m));
+		memcpy(v, ctx->h, 32);
+		v[ 8] = BLAKE2S_IV0;
+		v[ 9] = BLAKE2S_IV1;
+		v[10] = BLAKE2S_IV2;
+		v[11] = BLAKE2S_IV3;
+		v[12] = BLAKE2S_IV4 ^ ctx->t[0];
+		v[13] = BLAKE2S_IV5 ^ ctx->t[1];
+		v[14] = BLAKE2S_IV6 ^ ctx->f[0];
+		v[15] = BLAKE2S_IV7 ^ ctx->f[1];
+
+#define G(r, i, a, b, c, d) do { \
+	a += b + m[blake2s_sigma[r][2 * i + 0]]; \
+	d = ror32(d ^ a, 16); \
+	c += d; \
+	b = ror32(b ^ c, 12); \
+	a += b + m[blake2s_sigma[r][2 * i + 1]]; \
+	d = ror32(d ^ a, 8); \
+	c += d; \
+	b = ror32(b ^ c, 7); \
+} while (0)
+
+#define ROUND(r) do { \
+	G(r, 0, v[0], v[ 4], v[ 8], v[12]); \
+	G(r, 1, v[1], v[ 5], v[ 9], v[13]); \
+	G(r, 2, v[2], v[ 6], v[10], v[14]); \
+	G(r, 3, v[3], v[ 7], v[11], v[15]); \
+	G(r, 4, v[0], v[ 5], v[10], v[15]); \
+	G(r, 5, v[1], v[ 6], v[11], v[12]); \
+	G(r, 6, v[2], v[ 7], v[ 8], v[13]); \
+	G(r, 7, v[3], v[ 4], v[ 9], v[14]); \
+} while (0)
+		ROUND(0);
+		ROUND(1);
+		ROUND(2);
+		ROUND(3);
+		ROUND(4);
+		ROUND(5);
+		ROUND(6);
+		ROUND(7);
+		ROUND(8);
+		ROUND(9);
+
+#undef G
+#undef ROUND
+
+		for (i = 0; i < 8; ++i)
+			ctx->h[i] ^= v[i] ^ v[i + 8];
+
+		data += BLAKE2S_BLOCK_SIZE;
+		--nblocks;
+	}
+}
+
+static inline void blake2s_set_lastblock(struct blake2s_ctx *ctx)
+{
+	ctx->f[0] = -1;
+}
+
+void blake2s_update(struct blake2s_ctx *ctx, const u8 *in, size_t inlen)
+{
+	const size_t fill = BLAKE2S_BLOCK_SIZE - ctx->buflen;
+
+	if (unlikely(!inlen))
+		return;
+	if (inlen > fill) {
+		memcpy(ctx->buf + ctx->buflen, in, fill);
+		blake2s_compress(ctx, ctx->buf, 1, BLAKE2S_BLOCK_SIZE);
+		ctx->buflen = 0;
+		in += fill;
+		inlen -= fill;
+	}
+	if (inlen > BLAKE2S_BLOCK_SIZE) {
+		const size_t nblocks = DIV_ROUND_UP(inlen, BLAKE2S_BLOCK_SIZE);
+
+		blake2s_compress(ctx, in, nblocks - 1, BLAKE2S_BLOCK_SIZE);
+		in += BLAKE2S_BLOCK_SIZE * (nblocks - 1);
+		inlen -= BLAKE2S_BLOCK_SIZE * (nblocks - 1);
+	}
+	memcpy(ctx->buf + ctx->buflen, in, inlen);
+	ctx->buflen += inlen;
+}
+
+void blake2s_final(struct blake2s_ctx *ctx, u8 *out)
+{
+	blake2s_set_lastblock(ctx);
+	memset(ctx->buf + ctx->buflen, 0,
+	       BLAKE2S_BLOCK_SIZE - ctx->buflen); /* Padding */
+	blake2s_compress(ctx, ctx->buf, 1, ctx->buflen);
+	cpu_to_le32_array(ctx->h, ARRAY_SIZE(ctx->h));
+	memcpy(out, ctx->h, ctx->outlen);
+	memset(ctx, 0, sizeof(*ctx));
+}
diff --git a/tools/perf/util/blake2s.h b/tools/perf/util/blake2s.h
new file mode 100644
index 000000000000..a1fe81a4bea8
--- /dev/null
+++ b/tools/perf/util/blake2s.h
@@ -0,0 +1,73 @@
+/* SPDX-License-Identifier: GPL-2.0 OR MIT */
+/*
+ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#ifndef _CRYPTO_BLAKE2S_H
+#define _CRYPTO_BLAKE2S_H
+
+#include <string.h>
+#include <linux/types.h>
+
+#define BLAKE2S_BLOCK_SIZE 64
+
+struct blake2s_ctx {
+	u32 h[8];
+	u32 t[2];
+	u32 f[2];
+	u8 buf[BLAKE2S_BLOCK_SIZE];
+	unsigned int buflen;
+	unsigned int outlen;
+};
+
+enum blake2s_iv {
+	BLAKE2S_IV0 = 0x6A09E667UL,
+	BLAKE2S_IV1 = 0xBB67AE85UL,
+	BLAKE2S_IV2 = 0x3C6EF372UL,
+	BLAKE2S_IV3 = 0xA54FF53AUL,
+	BLAKE2S_IV4 = 0x510E527FUL,
+	BLAKE2S_IV5 = 0x9B05688CUL,
+	BLAKE2S_IV6 = 0x1F83D9ABUL,
+	BLAKE2S_IV7 = 0x5BE0CD19UL,
+};
+
+static inline void __blake2s_init(struct blake2s_ctx *ctx, size_t outlen,
+				  const void *key, size_t keylen)
+{
+	ctx->h[0] = BLAKE2S_IV0 ^ (0x01010000 | keylen << 8 | outlen);
+	ctx->h[1] = BLAKE2S_IV1;
+	ctx->h[2] = BLAKE2S_IV2;
+	ctx->h[3] = BLAKE2S_IV3;
+	ctx->h[4] = BLAKE2S_IV4;
+	ctx->h[5] = BLAKE2S_IV5;
+	ctx->h[6] = BLAKE2S_IV6;
+	ctx->h[7] = BLAKE2S_IV7;
+	ctx->t[0] = 0;
+	ctx->t[1] = 0;
+	ctx->f[0] = 0;
+	ctx->f[1] = 0;
+	ctx->buflen = 0;
+	ctx->outlen = outlen;
+	if (keylen) {
+		memcpy(ctx->buf, key, keylen);
+		memset(&ctx->buf[keylen], 0, BLAKE2S_BLOCK_SIZE - keylen);
+		ctx->buflen = BLAKE2S_BLOCK_SIZE;
+	}
+}
+
+static inline void blake2s_init(struct blake2s_ctx *ctx, size_t outlen)
+{
+	__blake2s_init(ctx, outlen, NULL, 0);
+}
+
+static inline void blake2s_init_key(struct blake2s_ctx *ctx, size_t outlen,
+				    const void *key, size_t keylen)
+{
+	__blake2s_init(ctx, outlen, key, keylen);
+}
+
+void blake2s_update(struct blake2s_ctx *ctx, const u8 *in, size_t inlen);
+
+void blake2s_final(struct blake2s_ctx *ctx, u8 *out);
+
+#endif /* _CRYPTO_BLAKE2S_H */
-- 
cgit v1.2.3


From f136fc491b2a48dbfcb98cac372303dc0e18f0c1 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@kernel.org>
Date: Mon, 8 Dec 2025 17:57:28 -0800
Subject: perf genelf: Switch from SHA-1 to BLAKE2s for build ID generation

Recent patches [1] [2] added an implementation of SHA-1 to perf and made
it be used for build ID generation.

I had understood the choice of SHA-1, which is a legacy algorithm, to be
for backwards compatibility.

It turns out, though, that there's no backwards compatibility
requirement here other than the size of the build ID field, which is
fixed at 20 bytes.  Not only did the hash algorithm already change (from
MD5 to SHA-1), but the inputs to the hash changed too: from 'load_addr
|| code' to just 'code', and now again to 'code || symtab || strsym'
[3].  Different linkers generate different build IDs, with the LLVM
linker using BLAKE3 hashes for example [4].

Therefore, we might as well switch to a more modern algorithm.  Let's go
with BLAKE2s.  It's faster than SHA-1, isn't cryptographically broken,
is easier to implement than BLAKE3, and the kernel's implementation in
lib/crypto/blake2s.c is easily borrowed.  It also natively supports
variable-length hashes, so it can directly produce the needed 20 bytes.

Also make the following additional improvements:

- Hash the three inputs incrementally, so they don't all have to be
  concatenated into one buffer.

- Add tag/length prefixes to each of the three inputs, so that distinct
  input tuples reliably result in distinct hashes.

[1] https://lore.kernel.org/linux-perf-users/20250521225307.743726-1-yuzhuo@google.com/
[2] https://lore.kernel.org/linux-perf-users/20250625202311.23244-1-ebiggers@kernel.org/
[3] https://lore.kernel.org/linux-perf-users/20251125080748.461014-1-namhyung@kernel.org/
[4] https://github.com/llvm/llvm-project/commit/d3e5b6f7539b86995aef6e2075c1edb3059385ce

Signed-off-by: Eric Biggers <ebiggers@kernel.org>
Tested-by: Ian Rogers <irogers@google.com>
Acked-by: Namhyung Kim <namhyung@kernel.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Fangrui Song <maskray@sourceware.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jason A. Donenfeld <Jason@zx2c4.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Pablo Galindo <pablogsal@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/genelf.c | 58 +++++++++++++++++++++++-------------------------
 1 file changed, 28 insertions(+), 30 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/genelf.c b/tools/perf/util/genelf.c
index a1cd5196f4ec..14882def9704 100644
--- a/tools/perf/util/genelf.c
+++ b/tools/perf/util/genelf.c
@@ -18,8 +18,8 @@
 #include <dwarf.h>
 #endif
 
+#include "blake2s.h"
 #include "genelf.h"
-#include "sha1.h"
 #include "../util/jitdump.h"
 #include <linux/compiler.h>
 
@@ -51,7 +51,7 @@ static char shd_string_table[] = {
 static struct buildid_note {
 	Elf_Note desc;		/* descsz: size of build-id, must be multiple of 4 */
 	char	 name[4];	/* GNU\0 */
-	u8	 build_id[SHA1_DIGEST_SIZE];
+	u8	 build_id[20];
 } bnote;
 
 static Elf_Sym symtab[]={
@@ -152,9 +152,28 @@ jit_add_eh_frame_info(Elf *e, void* unwinding, uint64_t unwinding_header_size,
 	return 0;
 }
 
+enum {
+	TAG_CODE = 0,
+	TAG_SYMTAB = 1,
+	TAG_STRSYM = 2,
+};
+
+/*
+ * Update the hash using the given data, also prepending a (tag, len) prefix to
+ * ensure that distinct input tuples reliably result in distinct hashes.
+ */
+static void blake2s_update_tagged(struct blake2s_ctx *ctx, int tag,
+				  const void *data, size_t len)
+{
+	u64 prefix = ((u64)tag << 56) | len;
+
+	blake2s_update(ctx, (const u8 *)&prefix, sizeof(prefix));
+	blake2s_update(ctx, data, len);
+}
+
 /*
  * fd: file descriptor open for writing for the output file
- * load_addr: code load address (could be zero, just used for buildid)
+ * load_addr: code load address (could be zero)
  * sym: function name (for native code - used as the symbol)
  * code: the native code
  * csize: the code size in bytes
@@ -173,8 +192,7 @@ jit_write_elf(int fd, uint64_t load_addr __maybe_unused, const char *sym,
 	Elf_Shdr *shdr;
 	uint64_t eh_frame_base_offset;
 	char *strsym = NULL;
-	void *build_id_data = NULL, *tmp;
-	int build_id_data_len;
+	struct blake2s_ctx ctx;
 	int symlen;
 	int retval = -1;
 
@@ -253,13 +271,8 @@ jit_write_elf(int fd, uint64_t load_addr __maybe_unused, const char *sym,
 	shdr->sh_flags = SHF_EXECINSTR | SHF_ALLOC;
 	shdr->sh_entsize = 0;
 
-	build_id_data = malloc(csize);
-	if (build_id_data == NULL) {
-		warnx("cannot allocate build-id data");
-		goto error;
-	}
-	memcpy(build_id_data, code, csize);
-	build_id_data_len = csize;
+	blake2s_init(&ctx, sizeof(bnote.build_id));
+	blake2s_update_tagged(&ctx, TAG_CODE, code, csize);
 
 	/*
 	 * Setup .eh_frame_hdr and .eh_frame
@@ -344,14 +357,7 @@ jit_write_elf(int fd, uint64_t load_addr __maybe_unused, const char *sym,
 	shdr->sh_entsize = sizeof(Elf_Sym);
 	shdr->sh_link = unwinding ? 6 : 4; /* index of .strtab section */
 
-	tmp = realloc(build_id_data, build_id_data_len + sizeof(symtab));
-	if (tmp == NULL) {
-		warnx("cannot allocate build-id data");
-		goto error;
-	}
-	memcpy(tmp + build_id_data_len, symtab, sizeof(symtab));
-	build_id_data = tmp;
-	build_id_data_len += sizeof(symtab);
+	blake2s_update_tagged(&ctx, TAG_SYMTAB, symtab, sizeof(symtab));
 
 	/*
 	 * setup symbols string table
@@ -395,14 +401,7 @@ jit_write_elf(int fd, uint64_t load_addr __maybe_unused, const char *sym,
 	shdr->sh_flags = 0;
 	shdr->sh_entsize = 0;
 
-	tmp = realloc(build_id_data, build_id_data_len + symlen);
-	if (tmp == NULL) {
-		warnx("cannot allocate build-id data");
-		goto error;
-	}
-	memcpy(tmp + build_id_data_len, strsym, symlen);
-	build_id_data = tmp;
-	build_id_data_len += symlen;
+	blake2s_update_tagged(&ctx, TAG_STRSYM, strsym, symlen);
 
 	/*
 	 * setup build-id section
@@ -422,7 +421,7 @@ jit_write_elf(int fd, uint64_t load_addr __maybe_unused, const char *sym,
 	/*
 	 * build-id generation
 	 */
-	sha1(build_id_data, build_id_data_len, bnote.build_id);
+	blake2s_final(&ctx, bnote.build_id);
 	bnote.desc.namesz = sizeof(bnote.name); /* must include 0 termination */
 	bnote.desc.descsz = sizeof(bnote.build_id);
 	bnote.desc.type   = NT_GNU_BUILD_ID;
@@ -467,7 +466,6 @@ error:
 	(void)elf_end(e);
 
 	free(strsym);
-	free(build_id_data);
 
 	return retval;
 }
-- 
cgit v1.2.3


From e35dd81017011be0fb0cbb2ae80a6bc24962f0f7 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@kernel.org>
Date: Mon, 8 Dec 2025 17:57:29 -0800
Subject: perf util: Remove SHA-1 code

Now that the SHA-1 code is no longer used, remove it.

Signed-off-by: Eric Biggers <ebiggers@kernel.org>
Tested-by: Ian Rogers <irogers@google.com>
Acked-by: Namhyung Kim <namhyung@kernel.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Fangrui Song <maskray@sourceware.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jason A. Donenfeld <Jason@zx2c4.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Pablo Galindo <pablogsal@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/tests/util.c | 49 -------------------------
 tools/perf/util/Build   |  1 -
 tools/perf/util/sha1.c  | 97 -------------------------------------------------
 tools/perf/util/sha1.h  |  6 ---
 4 files changed, 153 deletions(-)
 delete mode 100644 tools/perf/util/sha1.c
 delete mode 100644 tools/perf/util/sha1.h

(limited to 'tools')

diff --git a/tools/perf/tests/util.c b/tools/perf/tests/util.c
index efc3e4e4c6fa..bf2c5b133884 100644
--- a/tools/perf/tests/util.c
+++ b/tools/perf/tests/util.c
@@ -2,7 +2,6 @@
 #include "tests.h"
 #include "util/blake2s.h"
 #include "util/debug.h"
-#include "util/sha1.h"
 
 #include <linux/compiler.h>
 #include <stdlib.h>
@@ -18,48 +17,6 @@ static int test_strreplace(char needle, const char *haystack,
 	return ret == 0;
 }
 
-#define MAX_LEN 512
-
-/* Test sha1() for all lengths from 0 to MAX_LEN inclusively. */
-static int test_sha1(void)
-{
-	u8 data[MAX_LEN];
-	size_t digests_size = (MAX_LEN + 1) * SHA1_DIGEST_SIZE;
-	u8 *digests;
-	u8 digest_of_digests[SHA1_DIGEST_SIZE];
-	/*
-	 * The correctness of this value was verified by running this test with
-	 * sha1() replaced by OpenSSL's SHA1().
-	 */
-	static const u8 expected_digest_of_digests[SHA1_DIGEST_SIZE] = {
-		0x74, 0xcd, 0x4c, 0xb9, 0xd8, 0xa6, 0xd5, 0x95, 0x22, 0x8b,
-		0x7e, 0xd6, 0x8b, 0x7e, 0x46, 0x95, 0x31, 0x9b, 0xa2, 0x43,
-	};
-	size_t i;
-
-	digests = malloc(digests_size);
-	TEST_ASSERT_VAL("failed to allocate digests", digests != NULL);
-
-	/* Generate MAX_LEN bytes of data. */
-	for (i = 0; i < MAX_LEN; i++)
-		data[i] = i;
-
-	/* Calculate a SHA-1 for each length 0 through MAX_LEN inclusively. */
-	for (i = 0; i <= MAX_LEN; i++)
-		sha1(data, i, &digests[i * SHA1_DIGEST_SIZE]);
-
-	/* Calculate digest of all digests calculated above. */
-	sha1(digests, digests_size, digest_of_digests);
-
-	free(digests);
-
-	/* Check for the expected result. */
-	TEST_ASSERT_VAL("wrong output from sha1()",
-			memcmp(digest_of_digests, expected_digest_of_digests,
-			       SHA1_DIGEST_SIZE) == 0);
-	return 0;
-}
-
 /* Maximum data length tested by test_blake2s() */
 #define MAX_DATA_LEN 512
 
@@ -131,8 +88,6 @@ static int test_blake2s(void)
 
 static int test__util(struct test_suite *t __maybe_unused, int subtest __maybe_unused)
 {
-	int ret;
-
 	TEST_ASSERT_VAL("empty string", test_strreplace(' ', "", "123", ""));
 	TEST_ASSERT_VAL("no match", test_strreplace('5', "123", "4", "123"));
 	TEST_ASSERT_VAL("replace 1", test_strreplace('3', "123", "4", "124"));
@@ -140,10 +95,6 @@ static int test__util(struct test_suite *t __maybe_unused, int subtest __maybe_u
 	TEST_ASSERT_VAL("replace long", test_strreplace('a', "abcabc", "longlong",
 							"longlongbclonglongbc"));
 
-	ret = test_sha1();
-	if (ret != TEST_OK)
-		return ret;
-
 	return test_blake2s();
 }
 
diff --git a/tools/perf/util/Build b/tools/perf/util/Build
index 0c1cfcbed815..248ad3ac64da 100644
--- a/tools/perf/util/Build
+++ b/tools/perf/util/Build
@@ -44,7 +44,6 @@ perf-util-y += rbtree.o
 perf-util-y += libstring.o
 perf-util-y += bitmap.o
 perf-util-y += hweight.o
-perf-util-y += sha1.o
 perf-util-y += smt.o
 perf-util-y += strbuf.o
 perf-util-y += string.o
diff --git a/tools/perf/util/sha1.c b/tools/perf/util/sha1.c
deleted file mode 100644
index 7032fa4ff3fd..000000000000
--- a/tools/perf/util/sha1.c
+++ /dev/null
@@ -1,97 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * SHA-1 message digest algorithm
- *
- * Copyright 2025 Google LLC
- */
-#include <linux/bitops.h>
-#include <linux/kernel.h>
-#include <linux/unaligned.h>
-#include <string.h>
-
-#include "sha1.h"
-
-#define SHA1_BLOCK_SIZE 64
-
-static const u32 sha1_K[4] = { 0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6 };
-
-#define SHA1_ROUND(i, a, b, c, d, e)                                          \
-	do {                                                                  \
-		if ((i) >= 16)                                                \
-			w[i] = rol32(w[(i) - 16] ^ w[(i) - 14] ^ w[(i) - 8] ^ \
-					     w[(i) - 3],                      \
-				     1);                                      \
-		e += w[i] + rol32(a, 5) + sha1_K[(i) / 20];                   \
-		if ((i) < 20)                                                 \
-			e += (b & (c ^ d)) ^ d;                               \
-		else if ((i) < 40 || (i) >= 60)                               \
-			e += b ^ c ^ d;                                       \
-		else                                                          \
-			e += (c & d) ^ (b & (c ^ d));                         \
-		b = rol32(b, 30);                                             \
-		/* The new (a, b, c, d, e) is the old (e, a, b, c, d). */     \
-	} while (0)
-
-#define SHA1_5ROUNDS(i)                             \
-	do {                                        \
-		SHA1_ROUND((i) + 0, a, b, c, d, e); \
-		SHA1_ROUND((i) + 1, e, a, b, c, d); \
-		SHA1_ROUND((i) + 2, d, e, a, b, c); \
-		SHA1_ROUND((i) + 3, c, d, e, a, b); \
-		SHA1_ROUND((i) + 4, b, c, d, e, a); \
-	} while (0)
-
-#define SHA1_20ROUNDS(i)                \
-	do {                            \
-		SHA1_5ROUNDS((i) + 0);  \
-		SHA1_5ROUNDS((i) + 5);  \
-		SHA1_5ROUNDS((i) + 10); \
-		SHA1_5ROUNDS((i) + 15); \
-	} while (0)
-
-static void sha1_blocks(u32 h[5], const u8 *data, size_t nblocks)
-{
-	while (nblocks--) {
-		u32 a = h[0];
-		u32 b = h[1];
-		u32 c = h[2];
-		u32 d = h[3];
-		u32 e = h[4];
-		u32 w[80];
-
-		for (int i = 0; i < 16; i++)
-			w[i] = get_unaligned_be32(&data[i * 4]);
-		SHA1_20ROUNDS(0);
-		SHA1_20ROUNDS(20);
-		SHA1_20ROUNDS(40);
-		SHA1_20ROUNDS(60);
-
-		h[0] += a;
-		h[1] += b;
-		h[2] += c;
-		h[3] += d;
-		h[4] += e;
-		data += SHA1_BLOCK_SIZE;
-	}
-}
-
-/* Calculate the SHA-1 message digest of the given data. */
-void sha1(const void *data, size_t len, u8 out[SHA1_DIGEST_SIZE])
-{
-	u32 h[5] = { 0x67452301, 0xEFCDAB89, 0x98BADCFE, 0x10325476,
-		     0xC3D2E1F0 };
-	u8 final_data[2 * SHA1_BLOCK_SIZE] = { 0 };
-	size_t final_len = len % SHA1_BLOCK_SIZE;
-
-	sha1_blocks(h, data, len / SHA1_BLOCK_SIZE);
-
-	memcpy(final_data, data + len - final_len, final_len);
-	final_data[final_len] = 0x80;
-	final_len = round_up(final_len + 9, SHA1_BLOCK_SIZE);
-	put_unaligned_be64((u64)len * 8, &final_data[final_len - 8]);
-
-	sha1_blocks(h, final_data, final_len / SHA1_BLOCK_SIZE);
-
-	for (int i = 0; i < 5; i++)
-		put_unaligned_be32(h[i], &out[i * 4]);
-}
diff --git a/tools/perf/util/sha1.h b/tools/perf/util/sha1.h
deleted file mode 100644
index e92c9966e1d5..000000000000
--- a/tools/perf/util/sha1.h
+++ /dev/null
@@ -1,6 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-#include <linux/types.h>
-
-#define SHA1_DIGEST_SIZE 20
-
-void sha1(const void *data, size_t len, u8 out[SHA1_DIGEST_SIZE]);
-- 
cgit v1.2.3


From 9f8f5edc79b6f22d0b4510d08b6a9c6e7f2c96e5 Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@kernel.org>
Date: Wed, 17 Dec 2025 10:39:27 -0800
Subject: perf inject: Keep build-ID data if no option is used

The keep_feat() determines which header features will be kept or
discarded.  Usually 'perf inject' will add build-IDs based on -b, -B or
other related options.  But it lose build-ID when none of those options
are used.  This is meaningful only when --buildid-mmap is not used.

The following example shows the impact of this change.

  $ perf record --no-buildid-mmap true
  [ perf record: Woken up 1 times to write data ]
  [ perf record: Captured and wrote 0.037 MB perf.data (5 samples) ]

  $ perf inject -i perf.data -o perf.data.inject

  $ perf buildid-list -i perf.data
  08cccc2a9388d5247ccb3e864f3063b975b0a15d /usr/lib/x86_64-linux-gnu/ld-linux-x86-64.so.2
  fd5c4d5673256cd6bda51725dba048dabb0f854e [kernel.kallsyms]
  97a36ce1140071be5c36b147fa0bed173e05a602 [vdso]

  $ perf buildid-list -i perf.data.inject
  97a36ce1140071be5c36b147fa0bed173e05a602 [vdso]

With this change, perf.data.inject would show the same list (of course,
you need to run perf inject again).

Reported-by: Gabriel Marin <gmx@google.com>
Reviewed-by: Ian Rogers <irogers@google.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-inject.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c
index aa7be4fb5838..6080afec537d 100644
--- a/tools/perf/builtin-inject.c
+++ b/tools/perf/builtin-inject.c
@@ -2022,7 +2022,7 @@ static int save_section_info(struct perf_inject *inject)
 	return perf_header__process_sections(header, fd, inject, save_section_info_cb);
 }
 
-static bool keep_feat(int feat)
+static bool keep_feat(struct perf_inject *inject, int feat)
 {
 	switch (feat) {
 	/* Keep original information that describes the machine or software */
@@ -2050,6 +2050,7 @@ static bool keep_feat(int feat)
 		return true;
 	/* Information that can be updated */
 	case HEADER_BUILD_ID:
+		return inject->build_id_style == BID_RWS__NONE;
 	case HEADER_CMDLINE:
 	case HEADER_EVENT_DESC:
 	case HEADER_BRANCH_STACK:
@@ -2108,7 +2109,7 @@ static int feat_copy_cb(struct feat_copier *fc, int feat, struct feat_writer *fw
 	int ret;
 
 	if (!inject->secs[feat].offset ||
-	    !keep_feat(feat))
+	    !keep_feat(inject, feat))
 		return 0;
 
 	ret = feat_copy(inject, feat, fw);
-- 
cgit v1.2.3


From b2629e7846e35dbf12de6d7b8e81f0049f6a50ea Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@kernel.org>
Date: Thu, 18 Dec 2025 17:18:17 -0800
Subject: perf test: Skip dlfilter test for build failures

For some reason, it may fail to build the dlfilter.  Let's skip the test
as it's not an error in the perf.  This can happen when you run the perf
test without source code or in a different directory.

Reviewed-by: Ian Rogers <irogers@google.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/tests/shell/script_dlfilter.sh | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/tests/shell/script_dlfilter.sh b/tools/perf/tests/shell/script_dlfilter.sh
index 45c97d4a7d5f..7895ab0309b2 100755
--- a/tools/perf/tests/shell/script_dlfilter.sh
+++ b/tools/perf/tests/shell/script_dlfilter.sh
@@ -70,15 +70,15 @@ test_dlfilter() {
 	# Build the dlfilter
 	if ! cc -c -I tools/perf/include -fpic -x c "${dlfilter_c}" -o "${dlfilter_so}.o"
 	then
-		echo "Basic --dlfilter test [Failed to build dlfilter object]"
-		err=1
+		echo "Basic --dlfilter test [Skip - failed to build dlfilter object]"
+		err=2
 		return
 	fi
 
 	if ! cc -shared -o "${dlfilter_so}" "${dlfilter_so}.o"
 	then
-		echo "Basic --dlfilter test [Failed to link dlfilter shared object]"
-		err=1
+		echo "Basic --dlfilter test [Skip - failed to link dlfilter shared object]"
+		err=2
 		return
 	fi
 
-- 
cgit v1.2.3


From f552878a720bf765cc1616ee4a4e243cc03e4b27 Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@kernel.org>
Date: Thu, 18 Dec 2025 17:18:18 -0800
Subject: perf test: Use shelldir to refer perf source location

It uses tools/perf/include which assumes it's running from the root of
the linux kernel source tree.  But you can run perf from other places
like tools/perf, then the include path won't match.  We can use the
shelldir variable to locate the test script in the tree.

  $ cd tools/perf

  $ ./perf test dlfilter
   63: dlfilter C API                                                  : Ok
  101: perf script --dlfilter tests                                    : Ok

Reviewed-by: Ian Rogers <irogers@google.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/tests/shell/script_dlfilter.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/perf/tests/shell/script_dlfilter.sh b/tools/perf/tests/shell/script_dlfilter.sh
index 7895ab0309b2..aaed92bb7828 100755
--- a/tools/perf/tests/shell/script_dlfilter.sh
+++ b/tools/perf/tests/shell/script_dlfilter.sh
@@ -68,7 +68,7 @@ test_dlfilter() {
 	fi
 
 	# Build the dlfilter
-	if ! cc -c -I tools/perf/include -fpic -x c "${dlfilter_c}" -o "${dlfilter_so}.o"
+	if ! cc -c -I ${shelldir}/../../include -fpic -x c "${dlfilter_c}" -o "${dlfilter_so}.o"
 	then
 		echo "Basic --dlfilter test [Skip - failed to build dlfilter object]"
 		err=2
-- 
cgit v1.2.3


From 1c89bc1b95fa9058f3e7cd37f1142939261417d5 Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@kernel.org>
Date: Thu, 18 Dec 2025 17:18:19 -0800
Subject: perf test: Do not skip when some metrics tests succeeded

I think the return value of SKIP (2) should be used when it skipped the
entire test suite rather than a few of them.  While the FAIL should be
reserved if any of test failed.

  $ perf test -vv 110
  110: perf all metrics test:
  --- start ---
  test child forked, pid 2496399
  Testing tma_core_bound
  Testing tma_info_core_ilp
  Testing tma_info_memory_l2mpki
  Testing tma_memory_bound
  Testing tma_bottleneck_irregular_overhead
  Testing tma_bottleneck_mispredictions
  Testing tma_info_bad_spec_branch_misprediction_cost
  Testing tma_info_bad_spec_ipmisp_cond_ntaken
  Testing tma_info_bad_spec_ipmisp_cond_taken
  Testing tma_info_bad_spec_ipmisp_indirect
  Testing tma_info_bad_spec_ipmisp_ret
  Testing tma_info_bad_spec_ipmispredict
  Testing tma_info_branches_callret
  Testing tma_info_branches_cond_nt
  Testing tma_info_branches_cond_tk
  Testing tma_info_branches_jump
  Testing tma_info_branches_other_branches
  Testing tma_branch_mispredicts
  Testing tma_clears_resteers
  Testing tma_machine_clears
  Testing tma_mispredicts_resteers
  Testing tma_bottleneck_big_code
  Testing tma_icache_misses
  Testing tma_itlb_misses
  Testing tma_unknown_branches
  Testing tma_info_bad_spec_spec_clears_ratio
  Testing tma_other_mispredicts
  Testing tma_branch_instructions
  Testing tma_info_frontend_tbpc
  Testing tma_info_inst_mix_bptkbranch
  Testing tma_info_inst_mix_ipbranch
  Testing tma_info_inst_mix_ipcall
  Testing tma_info_inst_mix_iptb
  Testing tma_info_system_ipfarbranch
  Testing tma_info_thread_uptb
  Testing tma_bottleneck_branching_overhead
  Testing tma_nop_instructions
  Testing tma_bottleneck_compute_bound_est
  Testing tma_divider
  Testing tma_ports_utilized_3m
  Testing tma_bottleneck_instruction_fetch_bw
  Testing tma_frontend_bound
  Testing tma_assists
  Testing tma_other_nukes
  Testing tma_serializing_operation
  Testing tma_bottleneck_data_cache_memory_bandwidth
  Testing tma_fb_full
  Testing tma_mem_bandwidth
  Testing tma_sq_full
  Testing tma_bottleneck_data_cache_memory_latency
  Testing tma_l1_latency_dependency
  Testing tma_l2_bound
  Testing tma_l3_hit_latency
  Testing tma_mem_latency
  Testing tma_store_latency
  Testing tma_bottleneck_memory_synchronization
  Testing tma_contested_accesses
  Testing tma_data_sharing
  Testing tma_false_sharing
  Testing tma_bottleneck_memory_data_tlbs
  Testing tma_dtlb_load
  Testing tma_dtlb_store
  Testing tma_backend_bound
  Testing tma_bottleneck_other_bottlenecks
  Testing tma_bottleneck_useful_work
  Testing tma_retiring
  Testing tma_info_memory_fb_hpki
  Testing tma_info_memory_l1mpki
  Testing tma_info_memory_l1mpki_load
  Testing tma_info_memory_l2hpki_all
  Testing tma_info_memory_l2hpki_load
  Testing tma_info_memory_l2mpki_all
  Testing tma_info_memory_l2mpki_load
  Testing tma_l1_bound
  Testing tma_l3_bound
  Testing tma_info_memory_l2mpki_rfo
  Testing tma_fp_scalar
  Testing tma_fp_vector
  Testing tma_fp_vector_128b
  Testing tma_fp_vector_256b
  Testing tma_fp_vector_512b
  Testing tma_port_0
  Testing tma_x87_use
  Testing tma_info_botlnk_l0_core_bound_likely
  Testing tma_info_core_fp_arith_utilization
  Testing tma_info_pipeline_execute
  Testing tma_info_system_gflops
  Testing tma_info_thread_execute_per_issue
  Testing tma_dsb
  Testing tma_info_botlnk_l2_dsb_bandwidth
  Testing tma_info_frontend_dsb_coverage
  Testing tma_decoder0_alone
  Testing tma_dsb_switches
  Testing tma_info_botlnk_l2_dsb_misses
  Testing tma_info_frontend_dsb_switch_cost
  Testing tma_info_frontend_ipdsb_miss_ret
  Testing tma_mite
  Testing tma_mite_4wide
  Testing CPUs_utilized
  Testing backend_cycles_idle
  [Ignored backend_cycles_idle] failed but as a Default metric this can be expected
  Performance counter stats for 'perf test -w noploop': <not counted> cpu-cycles:u <not supported> stalled-cycles-backend:u 1.014051473 seconds time elapsed 1.005718000 seconds user 0.008013000 seconds sys
  Testing branch_frequency
  Testing branch_miss_rate
  Testing cs_per_second
  Testing cycles_frequency
  Testing frontend_cycles_idle
  [Ignored frontend_cycles_idle] failed but as a Default metric this can be expected
  Performance counter stats for 'perf test -w noploop': <not counted> cpu-cycles:u <not supported> stalled-cycles-frontend:u 1.012813656 seconds time elapsed 1.004603000 seconds user 0.008004000 seconds sys
  Testing insn_per_cycle
  Testing migrations_per_second
  Testing page_faults_per_second
  Testing stalled_cycles_per_instruction
  [Ignored stalled_cycles_per_instruction] failed but as a Default metric this can be expected
  Error: No supported events found. The stalled-cycles-backend:u event is not supported.
  Testing tma_bad_speculation
  Testing l1d_miss_rate
  Testing llc_miss_rate
  Testing dtlb_miss_rate
  Testing itlb_miss_rate
  [Ignored itlb_miss_rate] failed but as a Default metric this can be expected
  Performance counter stats for 'perf test -w noploop': <not supported> iTLB-loads:u 3,097 iTLB-load-misses:u 1.012766732 seconds time elapsed 1.004318000 seconds user 0.008002000 seconds sys
  Testing l1i_miss_rate
  [Ignored l1i_miss_rate] failed but as a Default metric this can be expected
  Performance counter stats for 'perf test -w noploop': <not counted> L1-icache-load-misses:u <not supported> L1-icache-loads:u 1.013606395 seconds time elapsed 1.001371000 seconds user 0.011968000 seconds sys
  Testing l1_prefetch_miss_rate
  [Ignored l1_prefetch_miss_rate] failed but as a Default metric this can be expected
  Error: No supported events found. The L1-dcache-prefetches:u event is not supported.
  Testing tma_info_botlnk_l2_ic_misses
  Testing tma_info_frontend_fetch_upc
  Testing tma_info_frontend_icache_miss_latency
  Testing tma_info_frontend_ipunknown_branch
  Testing tma_info_frontend_lsd_coverage
  Testing tma_info_memory_tlb_code_stlb_mpki
  Testing tma_info_pipeline_fetch_dsb
  Testing tma_info_pipeline_fetch_lsd
  Testing tma_info_pipeline_fetch_mite
  Testing tma_info_pipeline_fetch_ms
  Testing tma_fetch_bandwidth
  Testing tma_lsd
  Testing tma_branch_resteers
  Testing tma_code_l2_hit
  Testing tma_code_l2_miss
  Testing tma_code_stlb_hit
  Testing tma_code_stlb_miss
  Testing tma_code_stlb_miss_2m
  Testing tma_code_stlb_miss_4k
  Testing tma_lcp
  Testing tma_ms_switches
  Testing tma_info_core_flopc
  Testing tma_info_inst_mix_iparith
  Testing tma_info_inst_mix_iparith_avx128
  Testing tma_info_inst_mix_iparith_avx256
  Testing tma_info_inst_mix_iparith_avx512
  Testing tma_info_inst_mix_iparith_scalar_dp
  Testing tma_info_inst_mix_iparith_scalar_sp
  Testing tma_info_inst_mix_ipflop
  Testing tma_info_inst_mix_ippause
  Testing tma_fetch_latency
  Testing tma_fp_arith
  Testing tma_fp_assists
  Testing tma_info_system_cpu_utilization
  Testing tma_info_system_dram_bw_use
  [Skipped tma_info_system_dram_bw_use] Not supported events
  Performance counter stats for 'perf test -w noploop': <not supported> UNC_ARB_TRK_REQUESTS.ALL:u <not supported> UNC_ARB_COH_TRK_REQUESTS.ALL:u 1,013,554,749 duration_time 1.013527265 seconds time elapsed 1.005417000 seconds user 0.008011000 seconds sys
  Testing tma_info_frontend_l2mpki_code
  Testing tma_info_frontend_l2mpki_code_all
  Testing tma_info_inst_mix_ipload
  Testing tma_info_inst_mix_ipstore
  Testing tma_info_memory_latency_load_l2_miss_latency
  Testing tma_lock_latency
  Testing tma_info_memory_core_l1d_cache_fill_bw_2t
  Testing tma_info_memory_core_l2_cache_fill_bw_2t
  Testing tma_info_memory_core_l3_cache_access_bw_2t
  Testing tma_info_memory_core_l3_cache_fill_bw_2t
  Testing tma_info_memory_l1d_cache_fill_bw
  Testing tma_info_memory_l2_cache_fill_bw
  Testing tma_info_memory_l3_cache_access_bw
  Testing tma_info_memory_l3_cache_fill_bw
  Testing tma_info_memory_l3mpki
  Testing tma_info_memory_load_miss_real_latency
  Testing tma_info_memory_mix_bus_lock_pki
  Testing tma_info_memory_mix_uc_load_pki
  Testing tma_info_memory_mlp
  Testing tma_info_memory_tlb_load_stlb_mpki
  Testing tma_info_memory_tlb_page_walks_utilization
  Testing tma_info_memory_tlb_store_stlb_mpki
  Testing tma_info_system_mem_parallel_reads
  [Skipped tma_info_system_mem_parallel_reads] Not supported events
  Performance counter stats for 'perf test -w noploop': <not supported> UNC_ARB_DAT_OCCUPANCY.RD:u <not counted> UNC_ARB_DAT_OCCUPANCY.RD/cmask=1/ 1.013354884 seconds time elapsed 1.009239000 seconds user 0.004004000 seconds sys
  Testing tma_info_system_mem_read_latency
  [Skipped tma_info_system_mem_read_latency] Not supported events
  Performance counter stats for 'perf test -w noploop': <not supported> UNC_ARB_DAT_OCCUPANCY.RD:u <not counted> UNC_ARB_TRK_OCCUPANCY.RD <not counted> UNC_ARB_TRK_REQUESTS.RD 1.012882143 seconds time elapsed 1.004600000 seconds user 0.008036000 seconds sys
  Testing tma_info_thread_cpi
  Testing tma_streaming_stores
  Testing tma_dram_bound
  Testing tma_store_bound
  Testing tma_l2_hit_latency
  Testing tma_load_stlb_hit
  Testing tma_load_stlb_miss
  Testing tma_load_stlb_miss_1g
  Testing tma_load_stlb_miss_2m
  Testing tma_load_stlb_miss_4k
  Testing tma_store_stlb_hit
  Testing tma_store_stlb_miss
  Testing tma_store_stlb_miss_1g
  Testing tma_store_stlb_miss_2m
  Testing tma_store_stlb_miss_4k
  Testing tma_info_memory_latency_data_l2_mlp
  Testing tma_info_memory_latency_load_l2_mlp
  Testing tma_info_pipeline_ipassist
  Testing tma_microcode_sequencer
  Testing tma_ms
  Testing tma_info_system_kernel_cpi
  [Failed tma_info_system_kernel_cpi] Metric contains missing events
  Error: No supported events found. Access to performance monitoring and observability operations is limited. Consider adjusting /proc/sys/kernel/perf_event_paranoid setting to open access to performance monitoring and observability operations for processes without CAP_PERFMON, CAP_SYS_PTRACE or CAP_SYS_ADMIN Linux capability. More information can be found at 'Perf events and tool security' document: https://www.kernel.org/doc/html/latest/admin-guide/perf-security.html perf_event_paranoid setting is 2: -1: Allow use of (almost) all events by all users Ignore mlock limit after perf_event_mlock_kb without CAP_IPC_LOCK >= 0: Disallow raw and ftrace function tracepoint access >= 1: Disallow CPU event access >= 2: Disallow kernel profiling To make the adjusted perf_event_paranoid setting permanent preserve it in /etc/sysctl.conf (e.g. kernel.perf_event_paranoid = <setting>)
  Testing tma_info_system_kernel_utilization
  [Failed tma_info_system_kernel_utilization] Metric contains missing events
  Error: No supported events found. Access to performance monitoring and observability operations is limited. Consider adjusting /proc/sys/kernel/perf_event_paranoid setting to open access to performance monitoring and observability operations for processes without CAP_PERFMON, CAP_SYS_PTRACE or CAP_SYS_ADMIN Linux capability. More information can be found at 'Perf events and tool security' document: https://www.kernel.org/doc/html/latest/admin-guide/perf-security.html perf_event_paranoid setting is 2: -1: Allow use of (almost) all events by all users Ignore mlock limit after perf_event_mlock_kb without CAP_IPC_LOCK >= 0: Disallow raw and ftrace function tracepoint access >= 1: Disallow CPU event access >= 2: Disallow kernel profiling To make the adjusted perf_event_paranoid setting permanent preserve it in /etc/sysctl.conf (e.g. kernel.perf_event_paranoid = <setting>)
  Testing tma_info_pipeline_retire
  Testing tma_info_thread_clks
  Testing tma_info_thread_uoppi
  Testing tma_memory_operations
  Testing tma_other_light_ops
  Testing tma_ports_utilization
  Testing tma_ports_utilized_0
  Testing tma_ports_utilized_1
  Testing tma_ports_utilized_2
  Testing C10_Pkg_Residency
  [Failed C10_Pkg_Residency] Metric contains missing events
  WARNING: grouped events cpus do not match. Events with CPUs not matching the leader will be removed from the group. anon group { cstate_pkg/c10-residency/, msr/tsc/ } Error: No supported events found. Invalid event (cstate_pkg/c10-residency/u) in per-thread mode, enable system wide with '-a'.
  Testing C2_Pkg_Residency
  [Failed C2_Pkg_Residency] Metric contains missing events
  WARNING: grouped events cpus do not match. Events with CPUs not matching the leader will be removed from the group. anon group { cstate_pkg/c2-residency/, msr/tsc/ } Error: No supported events found. Invalid event (cstate_pkg/c2-residency/u) in per-thread mode, enable system wide with '-a'.
  Testing C3_Pkg_Residency
  [Failed C3_Pkg_Residency] Metric contains missing events
  WARNING: grouped events cpus do not match. Events with CPUs not matching the leader will be removed from the group. anon group { msr/tsc/, cstate_pkg/c3-residency/ } Error: No supported events found. Invalid event (msr/tsc/u) in per-thread mode, enable system wide with '-a'.
  Testing C6_Core_Residency
  [Failed C6_Core_Residency] Metric contains missing events
  WARNING: grouped events cpus do not match. Events with CPUs not matching the leader will be removed from the group. anon group { cstate_core/c6-residency/, msr/tsc/ } Error: No supported events found. Invalid event (cstate_core/c6-residency/u) in per-thread mode, enable system wide with '-a'.
  Testing C6_Pkg_Residency
  [Failed C6_Pkg_Residency] Metric contains missing events
  WARNING: grouped events cpus do not match. Events with CPUs not matching the leader will be removed from the group. anon group { cstate_pkg/c6-residency/, msr/tsc/ } Error: No supported events found. Invalid event (cstate_pkg/c6-residency/u) in per-thread mode, enable system wide with '-a'.
  Testing C7_Core_Residency
  [Failed C7_Core_Residency] Metric contains missing events
  WARNING: grouped events cpus do not match. Events with CPUs not matching the leader will be removed from the group. anon group { cstate_core/c7-residency/, msr/tsc/ } Error: No supported events found. Invalid event (cstate_core/c7-residency/u) in per-thread mode, enable system wide with '-a'.
  Testing C7_Pkg_Residency
  [Failed C7_Pkg_Residency] Metric contains missing events
  WARNING: grouped events cpus do not match. Events with CPUs not matching the leader will be removed from the group. anon group { cstate_pkg/c7-residency/, msr/tsc/ } Error: No supported events found. Invalid event (cstate_pkg/c7-residency/u) in per-thread mode, enable system wide with '-a'.
  Testing C8_Pkg_Residency
  [Failed C8_Pkg_Residency] Metric contains missing events
  WARNING: grouped events cpus do not match. Events with CPUs not matching the leader will be removed from the group. anon group { cstate_pkg/c8-residency/, msr/tsc/ } Error: No supported events found. Invalid event (cstate_pkg/c8-residency/u) in per-thread mode, enable system wide with '-a'.
  Testing C9_Pkg_Residency
  [Failed C9_Pkg_Residency] Metric contains missing events
  WARNING: grouped events cpus do not match. Events with CPUs not matching the leader will be removed from the group. anon group { cstate_pkg/c9-residency/, msr/tsc/ } Error: No supported events found. Invalid event (cstate_pkg/c9-residency/u) in per-thread mode, enable system wide with '-a'.
  Testing tma_info_core_epc
  Testing tma_info_system_core_frequency
  Testing tma_info_system_power
  [Skipped tma_info_system_power] Not supported events
  Performance counter stats for 'perf test -w noploop': <not supported> Joules power/energy-pkg/u 1,013,238,256 duration_time 1.013223072 seconds time elapsed 0.995924000 seconds user 0.011903000 seconds sys
  Testing tma_info_system_power_license0_utilization
  Testing tma_info_system_power_license1_utilization
  Testing tma_info_system_power_license2_utilization
  Testing tma_info_system_turbo_utilization
  Testing tma_info_inst_mix_ipswpf
  Testing tma_info_memory_prefetches_useless_hwpf
  Testing tma_info_core_coreipc
  Testing tma_info_thread_ipc
  Testing tma_heavy_operations
  Testing tma_light_operations
  Testing tma_info_core_core_clks
  Testing tma_info_system_smt_2t_utilization
  Testing tma_info_thread_slots_utilization
  Testing UNCORE_FREQ
  [Skipped UNCORE_FREQ] Not supported events
  Performance counter stats for 'perf test -w noploop': <not supported> UNC_CLOCK.SOCKET:u 1,015,993,466 duration_time 1.015949387 seconds time elapsed 1.007676000 seconds user 0.008029000 seconds sys
  Testing tma_info_system_socket_clks
  [Failed tma_info_system_socket_clks] Metric contains missing events
  Error: No supported events found. Invalid event (UNC_CLOCK.SOCKET:u) in per-thread mode, enable system wide with '-a'.
  Testing tma_info_inst_mix_instructions
  Testing tma_info_system_cpus_utilized
  Testing tma_info_system_mux
  Testing tma_info_system_time
  Testing tma_info_thread_slots
  Testing tma_few_uops_instructions
  Testing tma_4k_aliasing
  Testing tma_cisc
  Testing tma_fp_divider
  Testing tma_int_divider
  Testing tma_slow_pause
  Testing tma_split_loads
  Testing tma_split_stores
  Testing tma_store_fwd_blk
  Testing tma_alu_op_utilization
  Testing tma_load_op_utilization
  Testing tma_mixing_vectors
  Testing tma_store_op_utilization
  Testing tma_port_1
  Testing tma_port_5
  Testing tma_port_6
  Testing smi_cycles
  [Skipped smi_cycles] Not supported events
  Performance counter stats for 'perf test -w noploop': <not supported> msr/smi/u <not supported> msr/aperf/u 3,965,789,327 cycles:u 1.012779591 seconds time elapsed 1.004579000 seconds user 0.007972000 seconds sys
  Testing smi_num
  [Failed smi_num] Metric contains missing events
  Error: No supported events found. Invalid event (msr/smi/u) in per-thread mode, enable system wide with '-a'.
  Testing tsx_aborted_cycles
  Testing tsx_cycles_per_elision
  Testing tsx_cycles_per_transaction
  Testing tsx_transactional_cycles
  ---- end(-1) ----
  110: perf all metrics test                                           : FAILED!

Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/tests/shell/stat_all_metrics.sh | 29 ++++++++++++++++++++++-------
 1 file changed, 22 insertions(+), 7 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/tests/shell/stat_all_metrics.sh b/tools/perf/tests/shell/stat_all_metrics.sh
index 3dabb39c7cc8..b582d23f28c9 100755
--- a/tools/perf/tests/shell/stat_all_metrics.sh
+++ b/tools/perf/tests/shell/stat_all_metrics.sh
@@ -15,7 +15,8 @@ then
   test_prog="perf test -w noploop"
 fi
 
-err=0
+skip=0
+err=3
 for m in $(perf list --raw-dump metrics); do
   echo "Testing $m"
   result=$(perf stat -M "$m" $system_wide_flag -- $test_prog 2>&1)
@@ -23,6 +24,10 @@ for m in $(perf list --raw-dump metrics); do
   if [[ $result_err -eq 0 && "$result" =~ ${m:0:50} ]]
   then
     # No error result and metric shown.
+    if [[ "$err" -ne 1 ]]
+    then
+      err=0
+    fi
     continue
   fi
   if [[ "$result" =~ "Cannot resolve IDs for" || "$result" =~ "No supported events found" ]]
@@ -44,7 +49,7 @@ for m in $(perf list --raw-dump metrics); do
     echo $result
     if [[ $err -eq 0 ]]
     then
-      err=2 # Skip
+      skip=1
     fi
     continue
   elif [[ "$result" =~ "in per-thread mode, enable system wide" ]]
@@ -53,7 +58,7 @@ for m in $(perf list --raw-dump metrics); do
     echo $result
     if [[ $err -eq 0 ]]
     then
-      err=2 # Skip
+      skip=1
     fi
     continue
   elif [[ "$result" =~ "<not supported>" ]]
@@ -68,7 +73,7 @@ for m in $(perf list --raw-dump metrics); do
     echo $result
     if [[ $err -eq 0 ]]
     then
-      err=2 # Skip
+      skip=1
     fi
     continue
   elif [[ "$result" =~ "<not counted>" ]]
@@ -77,7 +82,7 @@ for m in $(perf list --raw-dump metrics); do
     echo $result
     if [[ $err -eq 0 ]]
     then
-      err=2 # Skip
+      skip=1
     fi
     continue
   elif [[ "$result" =~ "FP_ARITH" || "$result" =~ "AMX" ]]
@@ -86,7 +91,7 @@ for m in $(perf list --raw-dump metrics); do
     echo $result
     if [[ $err -eq 0 ]]
     then
-      err=2 # Skip
+      skip=1
     fi
     continue
   elif [[ "$result" =~ "PMM" ]]
@@ -95,7 +100,7 @@ for m in $(perf list --raw-dump metrics); do
     echo $result
     if [[ $err -eq 0 ]]
     then
-      err=2 # Skip
+      skip=1
     fi
     continue
   fi
@@ -106,6 +111,10 @@ for m in $(perf list --raw-dump metrics); do
   if [[ $result_err -eq 0 && "$result" =~ ${m:0:50} ]]
   then
     # No error result and metric shown.
+    if [[ "$err" -ne 1 ]]
+    then
+      err=0
+    fi
     continue
   fi
   echo "[Failed $m] has non-zero error '$result_err' or not printed in:"
@@ -113,4 +122,10 @@ for m in $(perf list --raw-dump metrics); do
   err=1
 done
 
+# return SKIP only if no success returned
+if [[ "$err" -eq 3 && "$skip" -eq 1 ]]
+then
+  err=2
+fi
+
 exit "$err"
-- 
cgit v1.2.3


From 84010f9bcf5389717b4ad02b6f2124ff59413bdf Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@kernel.org>
Date: Thu, 18 Dec 2025 17:18:20 -0800
Subject: perf test: Do not skip when some metric-group tests succeed

I think the return value of SKIP (2) should be used when it skipped the
entire test suite rather than a few of them.  While the FAIL should be
reserved if any of test failed.

  $ perf test -vv 109
  109: perf all metricgroups test:
  --- start ---
  test child forked, pid 2493003
  Testing Backend
  Testing Bad
  Testing BadSpec
  Testing BigFootprint
  Testing BrMispredicts
  Testing Branches
  Testing BvBC
  Testing BvBO
  Testing BvCB
  Testing BvFB
  Testing BvIO
  Testing BvMB
  Testing BvML
  Testing BvMP
  Testing BvMS
  Testing BvMT
  Testing BvOB
  Testing BvUW
  Testing CacheHits
  Testing CacheMisses
  Testing CodeGen
  Testing Compute
  Testing Cor
  Testing DSB
  Testing DSBmiss
  Testing DataSharing
  Testing Default
  Testing Default2
  Testing Default3
  Testing Default4
  Ignoring failures in Default4 that may contain unsupported legacy events
  Testing Fed
  Testing FetchBW
  Testing FetchLat
  Testing Flops
  Testing FpScalar
  Testing FpVector
  Testing Frontend
  Testing HPC
  Testing IcMiss
  Testing InsType
  Testing LSD
  Testing LockCont
  Testing MachineClears
  Testing Machine_Clears
  Testing Mem
  Testing MemOffcore
  Testing MemoryBW
  Testing MemoryBound
  Testing MemoryLat
  Testing MemoryTLB
  Testing Memory_BW
  Testing Memory_Lat
  Testing MicroSeq
  Testing OS
  Testing Offcore
  Testing PGO
  Testing Pipeline
  Testing PortsUtil
  Testing Power
  Testing Prefetches
  Testing Ret
  Testing Retire
  Testing SMT
  Testing Snoop
  Testing SoC
  Testing Summary
  Testing TmaL1
  Testing TmaL2
  Testing TmaL3mem
  Testing TopdownL1
  Testing TopdownL2
  Testing TopdownL3
  Testing TopdownL4
  Testing TopdownL5
  Testing TopdownL6
  Testing smi
  Testing tma_L1_group
  Testing tma_L2_group
  Testing tma_L3_group
  Testing tma_L4_group
  Testing tma_L5_group
  Testing tma_L6_group
  Testing tma_alu_op_utilization_group
  Testing tma_assists_group
  Testing tma_backend_bound_group
  Testing tma_bad_speculation_group
  Testing tma_branch_mispredicts_group
  Testing tma_branch_resteers_group
  Testing tma_code_stlb_miss_group
  Testing tma_core_bound_group
  Testing tma_divider_group
  Testing tma_dram_bound_group
  Testing tma_dtlb_load_group
  Testing tma_dtlb_store_group
  Testing tma_fetch_bandwidth_group
  Testing tma_fetch_latency_group
  Testing tma_fp_arith_group
  Testing tma_fp_vector_group
  Testing tma_frontend_bound_group
  Testing tma_heavy_operations_group
  Testing tma_icache_misses_group
  Testing tma_issue2P
  Testing tma_issueBM
  Testing tma_issueBW
  Testing tma_issueComp
  Testing tma_issueD0
  Testing tma_issueFB
  Testing tma_issueFL
  Testing tma_issueL1
  Testing tma_issueLat
  Testing tma_issueMC
  Testing tma_issueMS
  Testing tma_issueMV
  Testing tma_issueRFO
  Testing tma_issueSL
  Testing tma_issueSO
  Testing tma_issueSmSt
  Testing tma_issueSpSt
  Testing tma_issueSyncxn
  Testing tma_issueTLB
  Testing tma_itlb_misses_group
  Testing tma_l1_bound_group
  Testing tma_l2_bound_group
  Testing tma_l3_bound_group
  Testing tma_light_operations_group
  Testing tma_load_stlb_miss_group
  Testing tma_machine_clears_group
  Testing tma_memory_bound_group
  Testing tma_microcode_sequencer_group
  Testing tma_mite_group
  Testing tma_other_light_ops_group
  Testing tma_ports_utilization_group
  Testing tma_ports_utilized_0_group
  Testing tma_ports_utilized_3m_group
  Testing tma_retiring_group
  Testing tma_serializing_operation_group
  Testing tma_store_bound_group
  Testing tma_store_stlb_miss_group
  Testing transaction
  ---- end(0) ----
  109: perf all metricgroups test                                      : Ok

Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/tests/shell/stat_all_metricgroups.sh | 26 +++++++++++++++----------
 1 file changed, 16 insertions(+), 10 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/tests/shell/stat_all_metricgroups.sh b/tools/perf/tests/shell/stat_all_metricgroups.sh
index 1400880ec01f..81bc7070b5ab 100755
--- a/tools/perf/tests/shell/stat_all_metricgroups.sh
+++ b/tools/perf/tests/shell/stat_all_metricgroups.sh
@@ -12,31 +12,32 @@ if ParanoidAndNotRoot 0
 then
   system_wide_flag=""
 fi
-err=0
+
+err=3
+skip=0
 for m in $(perf list --raw-dump metricgroups)
 do
   echo "Testing $m"
   result=$(perf stat -M "$m" $system_wide_flag sleep 0.01 2>&1)
   result_err=$?
-  if [[ $result_err -gt 0 ]]
+  if [[ $result_err -eq 0 ]]
   then
+    if [[ "$err" -ne 1 ]]
+    then
+      err=0
+    fi
+  else
     if [[ "$result" =~ \
           "Access to performance monitoring and observability operations is limited" ]]
     then
       echo "Permission failure"
       echo $result
-      if [[ $err -eq 0 ]]
-      then
-        err=2 # Skip
-      fi
+      skip=1
     elif [[ "$result" =~ "in per-thread mode, enable system wide" ]]
     then
       echo "Permissions - need system wide mode"
       echo $result
-      if [[ $err -eq 0 ]]
-      then
-        err=2 # Skip
-      fi
+      skip=1
     elif [[ "$m" == @(Default2|Default3|Default4) ]]
     then
       echo "Ignoring failures in $m that may contain unsupported legacy events"
@@ -48,4 +49,9 @@ do
   fi
 done
 
+if [[ "$err" -eq 3 && "$skip" -eq 1 ]]
+then
+  err=2
+fi
+
 exit $err
-- 
cgit v1.2.3


From d1f9dc67238e716a4cc0ffd7014f501775d5f3ed Mon Sep 17 00:00:00 2001
From: Dapeng Mi <dapeng1.mi@linux.intel.com>
Date: Tue, 16 Dec 2025 09:39:49 +0800
Subject: perf Documentation: Correct branch stack sampling call-stack option

The correct call-stack option for branch stack sampling should be "stack"
instead of "call_stack". Correct it.

$perf record -e instructions -j call_stack -- sleep 1
unknown branch filter call_stack, check man page

 Usage: perf record [<options>] [<command>]
    or: perf record [<options>] -- <command> [<options>]

    -j, --branch-filter <branch filter mask>
                          branch stack filter modes

Fixes: 955f6def5590ce6c ("perf record: Add remaining branch filters: "no_cycles", "no_flags" & "hw_index"")
Reviewed-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Dapeng Mi <dapeng1.mi@linux.intel.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Falcon <thomas.falcon@intel.com>
Cc: Xudong Hao <xudong.hao@intel.com>
Cc: Zide Chen <zide.chen@intel.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/Documentation/perf-record.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt
index c402e74172f6..178f483140ed 100644
--- a/tools/perf/Documentation/perf-record.txt
+++ b/tools/perf/Documentation/perf-record.txt
@@ -455,7 +455,7 @@ following filters are defined:
 	- no_tx: only when the target is not in a hardware transaction
 	- abort_tx: only when the target is a hardware transaction abort
 	- cond: conditional branches
-	- call_stack: save call stack
+	- stack: save call stack
 	- no_flags: don't save branch flags e.g prediction, misprediction etc
 	- no_cycles: don't save branch cycles
 	- hw_index: save branch hardware index
-- 
cgit v1.2.3


From a66f6242fbf521f8371d6cda5eaee6dc7668683b Mon Sep 17 00:00:00 2001
From: James Clark <james.clark@linaro.org>
Date: Wed, 31 Dec 2025 12:12:28 +0000
Subject: perf vendor events arm64: Remove uncountable events

These events are never countable by the PMU and are only intended to
be used as external inputs to trace. Therefore showing them in 'perf
list' is misleading so remove them.

The generator script doesn't emit these events when used with the new
telemetry-solution input files [1].

'perf list' should only show countable events because there are events
that are sometimes implemented, sometimes countable and sometimes not,
for example TRB_TRIG. If we always include any implemented events
whether they are countable or not then it's not possible to tell whether
they are usable in perf without going to the docs, defeating the point
of 'perf list'.

It's also not useful yet to display implemented events that are not
countable (for help in using trace rather than perf stat), because
PMU_OVFS and PMU_HOVFS are practically always implemented and TRB_TRIG
is always implemented when there is TRBE.

[1]: https://gitlab.arm.com/telemetry-solution/telemetry-solution/-/tree/main/data/pmu/cpu

Reviewed-by: Ian Rogers <irogers@google.com>
Signed-off-by: James Clark <james.clark@linaro.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Akio Kakuno <fj3333bs@aa.jp.fujitsu.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Leo Yan <leo.yan@linux.dev>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mike Leach <mike.leach@linaro.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will@kernel.org>
Cc: Yoshihiro Furudera <fj5100bi@fujitsu.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/pmu-events/arch/arm64/arm/cortex-a510/pmu.json  |  8 --------
 tools/perf/pmu-events/arch/arm64/common-and-microarch.json | 12 ------------
 tools/perf/pmu-events/arch/arm64/fujitsu/monaka/pmu.json   | 10 ----------
 3 files changed, 30 deletions(-)
 delete mode 100644 tools/perf/pmu-events/arch/arm64/arm/cortex-a510/pmu.json
 delete mode 100644 tools/perf/pmu-events/arch/arm64/fujitsu/monaka/pmu.json

(limited to 'tools')

diff --git a/tools/perf/pmu-events/arch/arm64/arm/cortex-a510/pmu.json b/tools/perf/pmu-events/arch/arm64/arm/cortex-a510/pmu.json
deleted file mode 100644
index d8b7b9f9e5fa..000000000000
--- a/tools/perf/pmu-events/arch/arm64/arm/cortex-a510/pmu.json
+++ /dev/null
@@ -1,8 +0,0 @@
-[
-    {
-        "ArchStdEvent": "PMU_OVFS"
-    },
-    {
-        "ArchStdEvent": "PMU_HOVFS"
-    }
-]
diff --git a/tools/perf/pmu-events/arch/arm64/common-and-microarch.json b/tools/perf/pmu-events/arch/arm64/common-and-microarch.json
index 2416d9f8a83d..468cb085d879 100644
--- a/tools/perf/pmu-events/arch/arm64/common-and-microarch.json
+++ b/tools/perf/pmu-events/arch/arm64/common-and-microarch.json
@@ -432,24 +432,12 @@
         "EventName": "TRB_WRAP",
         "BriefDescription": "Trace buffer current write pointer wrapped"
     },
-    {
-        "PublicDescription": "PMU overflow, counters accessible to EL1 and EL0",
-        "EventCode": "0x400D",
-        "EventName": "PMU_OVFS",
-        "BriefDescription": "PMU overflow, counters accessible to EL1 and EL0"
-    },
     {
         "PublicDescription": "Trace buffer Trigger Event",
         "EventCode": "0x400E",
         "EventName": "TRB_TRIG",
         "BriefDescription": "Trace buffer Trigger Event"
     },
-    {
-        "PublicDescription": "PMU overflow, counters reserved for use by EL2",
-        "EventCode": "0x400F",
-        "EventName": "PMU_HOVFS",
-        "BriefDescription": "PMU overflow, counters reserved for use by EL2"
-    },
     {
         "PublicDescription": "PE Trace Unit external output 0",
         "EventCode": "0x4010",
diff --git a/tools/perf/pmu-events/arch/arm64/fujitsu/monaka/pmu.json b/tools/perf/pmu-events/arch/arm64/fujitsu/monaka/pmu.json
deleted file mode 100644
index 65bd6cdd0dd5..000000000000
--- a/tools/perf/pmu-events/arch/arm64/fujitsu/monaka/pmu.json
+++ /dev/null
@@ -1,10 +0,0 @@
-[
-    {
-        "ArchStdEvent": "PMU_OVFS",
-        "BriefDescription": "This event counts the event generated each time one of the condition occurs described in Arm Architecture Reference Manual for A-profile architecture. This event is only for output to the trace unit."
-    },
-    {
-        "ArchStdEvent": "PMU_HOVFS",
-        "BriefDescription": "This event counts the event generated each time an event is counted by an event counter <n> and all of the condition occur described in Arm Architecture Reference Manual for A-profile architecture. This event is only for output to the trace unit."
-    }
-]
-- 
cgit v1.2.3


From 6e052cfe47c7fea0ac7cae271c69c69f0db3ca0e Mon Sep 17 00:00:00 2001
From: James Clark <james.clark@linaro.org>
Date: Fri, 2 Jan 2026 12:15:43 +0000
Subject: perf tools: Dump callchain context marker names

These are hard to interpret in the raw output because they are printed
as hex but are defined in perf_event.h as decimal. Make it much easier
to read the raw callchains by just printing their names.

For example:

  $ perf report -D

  1798195372321 0x4638 [0xb0]: PERF_RECORD_SAMPLE(IP, 0x4002): 44922/44922: 0x7c8046dd3400 period: 120218 addr: 0
  ... FP chain: nr:12
  .....  0: fffffffffffffe00 (PERF_CONTEXT_USER)
  .....  1: 00007c8046dd3400
  .....  2: 00007c8046db86d3

Reviewed-by: Ian Rogers <irogers@google.com>
Signed-off-by: James Clark <james.clark@linaro.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Leo Yan <leo.yan@arm.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
[ Add PERF_CONTEXT_USER_DEFERRED too, as per Namhyung's review comment ]
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/session.c | 27 +++++++++++++++++++++++++--
 1 file changed, 25 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index 4236503c8f6c..65fa9bdff1b8 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -841,6 +841,28 @@ static void callchain__lbr_callstack_printf(struct perf_sample *sample)
 	}
 }
 
+static const char *callchain_context_str(u64 ip)
+{
+	switch (ip) {
+	case PERF_CONTEXT_HV:
+		return " (PERF_CONTEXT_HV)";
+	case PERF_CONTEXT_KERNEL:
+		return " (PERF_CONTEXT_KERNEL)";
+	case PERF_CONTEXT_USER:
+		return " (PERF_CONTEXT_USER)";
+	case PERF_CONTEXT_GUEST:
+		return " (PERF_CONTEXT_GUEST)";
+	case PERF_CONTEXT_GUEST_KERNEL:
+		return " (PERF_CONTEXT_GUEST_KERNEL)";
+	case PERF_CONTEXT_GUEST_USER:
+		return " (PERF_CONTEXT_GUEST_USER)";
+	case PERF_CONTEXT_USER_DEFERRED:
+		return " (PERF_CONTEXT_USER_DEFERRED)";
+	default:
+		return "";
+	}
+}
+
 static void callchain__printf(struct evsel *evsel,
 			      struct perf_sample *sample)
 {
@@ -853,8 +875,9 @@ static void callchain__printf(struct evsel *evsel,
 	printf("... FP chain: nr:%" PRIu64 "\n", callchain->nr);
 
 	for (i = 0; i < callchain->nr; i++)
-		printf("..... %2d: %016" PRIx64 "\n",
-		       i, callchain->ips[i]);
+		printf("..... %2d: %016" PRIx64 "%s\n",
+		       i, callchain->ips[i],
+		       callchain_context_str(callchain->ips[i]));
 
 	if (sample->deferred_callchain)
 		printf("...... (deferred)\n");
-- 
cgit v1.2.3


From c55741148294700115ecacd19cb9c173721c1b6a Mon Sep 17 00:00:00 2001
From: Julia Lawall <Julia.Lawall@inria.fr>
Date: Tue, 30 Dec 2025 17:52:13 +0100
Subject: perf addr_location: Update outdated comment

The function addr_location__put() was renamed addr_location__exit() in
commit 0dd5041c9a0eaf8c ("perf addr_location: Add init/exit/copy
functions").  Make the comment preceding the function consistent with
the function itself.

Reviewed-by: Ian Rogers <irogers@google.com>
Signed-off-by: Julia Lawall <Julia.Lawall@inria.fr>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kexin Sun <kexinsun@smail.nju.edu.cn>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ratnadira Widyasari <ratnadiraw@smu.edu.sg>
Cc: Xutong Ma <xutong.ma@inria.fr>
Cc: Yumbo Lyu <yunbolyu@smu.edu.sg>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/addr_location.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/perf/util/addr_location.c b/tools/perf/util/addr_location.c
index 007a2f5df9a6..81a0b79c5e10 100644
--- a/tools/perf/util/addr_location.c
+++ b/tools/perf/util/addr_location.c
@@ -24,7 +24,7 @@ void addr_location__init(struct addr_location *al)
  * The preprocess_sample method will return with reference counts for the
  * in it, when done using (and perhaps getting ref counts if needing to
  * keep a pointer to one of those entries) it must be paired with
- * addr_location__put(), so that the refcounts can be decremented.
+ * addr_location__exit(), so that the refcounts can be decremented.
  */
 void addr_location__exit(struct addr_location *al)
 {
-- 
cgit v1.2.3


From 6fbf129c49905e9e34801b362c9c30ae383d7a90 Mon Sep 17 00:00:00 2001
From: Donglin Peng <pengdonglin@xiaomi.com>
Date: Fri, 9 Jan 2026 20:59:53 +0800
Subject: libbpf: Add BTF permutation support for type reordering

Introduce btf__permute() API to allow in-place rearrangement of BTF types.
This function reorganizes BTF type order according to a provided array of
type IDs, updating all type references to maintain consistency.

Signed-off-by: Donglin Peng <pengdonglin@xiaomi.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/bpf/20260109130003.3313716-2-dolinux.peng@gmail.com
---
 tools/lib/bpf/btf.c      | 133 +++++++++++++++++++++++++++++++++++++++++++++++
 tools/lib/bpf/btf.h      |  42 +++++++++++++++
 tools/lib/bpf/libbpf.map |   1 +
 3 files changed, 176 insertions(+)

(limited to 'tools')

diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c
index b136572e889a..bf75f770d29a 100644
--- a/tools/lib/bpf/btf.c
+++ b/tools/lib/bpf/btf.c
@@ -5887,3 +5887,136 @@ int btf__relocate(struct btf *btf, const struct btf *base_btf)
 		btf->owns_base = false;
 	return libbpf_err(err);
 }
+
+struct btf_permute {
+	struct btf *btf;
+	__u32 *id_map;
+	__u32 start_offs;
+};
+
+/* Callback function to remap individual type ID references */
+static int btf_permute_remap_type_id(__u32 *type_id, void *ctx)
+{
+	struct btf_permute *p = ctx;
+	__u32 new_id = *type_id;
+
+	/* refer to the base BTF or VOID type */
+	if (new_id < p->btf->start_id)
+		return 0;
+
+	if (new_id >= btf__type_cnt(p->btf))
+		return -EINVAL;
+
+	*type_id = p->id_map[new_id - p->btf->start_id + p->start_offs];
+	return 0;
+}
+
+int btf__permute(struct btf *btf, __u32 *id_map, __u32 id_map_cnt,
+		 const struct btf_permute_opts *opts)
+{
+	struct btf_permute p;
+	struct btf_ext *btf_ext;
+	void *nt, *new_types = NULL;
+	__u32 *order_map = NULL;
+	int err = 0, i;
+	__u32 n, id, start_offs = 0;
+
+	if (!OPTS_VALID(opts, btf_permute_opts))
+		return libbpf_err(-EINVAL);
+
+	if (btf__base_btf(btf)) {
+		n = btf->nr_types;
+	} else {
+		if (id_map[0] != 0)
+			return libbpf_err(-EINVAL);
+		n = btf__type_cnt(btf);
+		start_offs = 1;
+	}
+
+	if (id_map_cnt != n)
+		return libbpf_err(-EINVAL);
+
+	/* record the sequence of types */
+	order_map = calloc(id_map_cnt, sizeof(*id_map));
+	if (!order_map) {
+		err = -ENOMEM;
+		goto done;
+	}
+
+	new_types = calloc(btf->hdr->type_len, 1);
+	if (!new_types) {
+		err = -ENOMEM;
+		goto done;
+	}
+
+	if (btf_ensure_modifiable(btf)) {
+		err = -ENOMEM;
+		goto done;
+	}
+
+	for (i = start_offs; i < id_map_cnt; i++) {
+		id = id_map[i];
+		if (id < btf->start_id || id >= btf__type_cnt(btf)) {
+			err = -EINVAL;
+			goto done;
+		}
+		id -= btf->start_id - start_offs;
+		/* cannot be mapped to the same ID */
+		if (order_map[id]) {
+			err = -EINVAL;
+			goto done;
+		}
+		order_map[id] = i + btf->start_id - start_offs;
+	}
+
+	p.btf = btf;
+	p.id_map = id_map;
+	p.start_offs = start_offs;
+	nt = new_types;
+	for (i = start_offs; i < id_map_cnt; i++) {
+		struct btf_field_iter it;
+		const struct btf_type *t;
+		__u32 *type_id;
+		int type_size;
+
+		id = order_map[i];
+		t = btf__type_by_id(btf, id);
+		type_size = btf_type_size(t);
+		memcpy(nt, t, type_size);
+
+		/* fix up referenced IDs for BTF */
+		err = btf_field_iter_init(&it, nt, BTF_FIELD_ITER_IDS);
+		if (err)
+			goto done;
+		while ((type_id = btf_field_iter_next(&it))) {
+			err = btf_permute_remap_type_id(type_id, &p);
+			if (err)
+				goto done;
+		}
+
+		nt += type_size;
+	}
+
+	/* fix up referenced IDs for btf_ext */
+	btf_ext = OPTS_GET(opts, btf_ext, NULL);
+	if (btf_ext) {
+		err = btf_ext_visit_type_ids(btf_ext, btf_permute_remap_type_id, &p);
+		if (err)
+			goto done;
+	}
+
+	for (nt = new_types, i = 0; i < id_map_cnt - start_offs; i++) {
+		btf->type_offs[i] = nt - new_types;
+		nt += btf_type_size(nt);
+	}
+
+	free(order_map);
+	free(btf->types_data);
+	btf->types_data = new_types;
+	return 0;
+
+done:
+	free(order_map);
+	free(new_types);
+	return libbpf_err(err);
+}
diff --git a/tools/lib/bpf/btf.h b/tools/lib/bpf/btf.h
index cc01494d6210..b30008c267c0 100644
--- a/tools/lib/bpf/btf.h
+++ b/tools/lib/bpf/btf.h
@@ -281,6 +281,48 @@ LIBBPF_API int btf__dedup(struct btf *btf, const struct btf_dedup_opts *opts);
  */
 LIBBPF_API int btf__relocate(struct btf *btf, const struct btf *base_btf);
 
+struct btf_permute_opts {
+	size_t sz;
+	/* optional .BTF.ext info along the main BTF info */
+	struct btf_ext *btf_ext;
+	size_t :0;
+};
+#define btf_permute_opts__last_field btf_ext
+
+/**
+ * @brief **btf__permute()** rearranges BTF types in-place according to a specified ID mapping
+ * @param btf BTF object to permute
+ * @param id_map Array mapping original type IDs to new IDs
+ * @param id_map_cnt Number of elements in @id_map
+ * @param opts Optional parameters, including BTF extension data for reference updates
+ * @return 0 on success, negative error code on failure
+ *
+ * **btf__permute()** reorders BTF types based on the provided @id_map array,
+ * updating all internal type references to maintain consistency. The function
+ * operates in-place, modifying the BTF object directly.
+ *
+ * For **base BTF**:
+ * - @id_map must include all types from ID 0 to `btf__type_cnt(btf) - 1`
+ * - @id_map_cnt must be `btf__type_cnt(btf)`
+ * - Mapping is defined as `id_map[original_id] = new_id`
+ * - `id_map[0]` must be 0 (void type cannot be moved)
+ *
+ * For **split BTF**:
+ * - @id_map must include only split types (types added on top of the base BTF)
+ * - @id_map_cnt must be `btf__type_cnt(btf) - btf__type_cnt(btf__base_btf(btf))`
+ * - Mapping is defined as `id_map[original_id - start_id] = new_id`
+ * - `start_id` equals `btf__type_cnt(btf__base_btf(btf))`
+ *
+ * After permutation, all type references within the BTF data and optional
+ * BTF extension (if provided via @opts) are updated automatically.
+ *
+ * On error, returns a negative error code and sets errno:
+ *   - `-EINVAL`: Invalid parameters or invalid ID mapping
+ *   - `-ENOMEM`: Memory allocation failure
+ */
+LIBBPF_API int btf__permute(struct btf *btf, __u32 *id_map, __u32 id_map_cnt,
+			    const struct btf_permute_opts *opts);
+
 struct btf_dump;
 
 struct btf_dump_opts {
diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map
index 84fb90a016c9..d18fbcea7578 100644
--- a/tools/lib/bpf/libbpf.map
+++ b/tools/lib/bpf/libbpf.map
@@ -453,4 +453,5 @@ LIBBPF_1.7.0 {
 		bpf_map__exclusive_program;
 		bpf_prog_assoc_struct_ops;
 		bpf_program__assoc_struct_ops;
+		btf__permute;
 } LIBBPF_1.6.0;
-- 
cgit v1.2.3


From a3acd7d43462a7f7429301afad3c0059276f427e Mon Sep 17 00:00:00 2001
From: Donglin Peng <pengdonglin@xiaomi.com>
Date: Fri, 9 Jan 2026 20:59:54 +0800
Subject: selftests/bpf: Add test cases for btf__permute functionality

This patch introduces test cases for the btf__permute function to ensure
it works correctly with both base BTF and split BTF scenarios.

The test suite includes:
- test_permute_base: Validates permutation on base BTF
- test_permute_split: Tests permutation on split BTF

Signed-off-by: Donglin Peng <pengdonglin@xiaomi.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/bpf/20260109130003.3313716-3-dolinux.peng@gmail.com
---
 .../testing/selftests/bpf/prog_tests/btf_permute.c | 244 +++++++++++++++++++++
 1 file changed, 244 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/btf_permute.c

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/btf_permute.c b/tools/testing/selftests/bpf/prog_tests/btf_permute.c
new file mode 100644
index 000000000000..04ade5ad77ac
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/btf_permute.c
@@ -0,0 +1,244 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2026 Xiaomi */
+
+#include <test_progs.h>
+#include <bpf/btf.h>
+#include "btf_helpers.h"
+
+static void permute_base_check(struct btf *btf)
+{
+	VALIDATE_RAW_BTF(
+		btf,
+		"[1] STRUCT 's2' size=4 vlen=1\n"
+		"\t'm' type_id=4 bits_offset=0",
+		"[2] FUNC 'f' type_id=6 linkage=static",
+		"[3] PTR '(anon)' type_id=4",
+		"[4] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED",
+		"[5] STRUCT 's1' size=4 vlen=1\n"
+		"\t'm' type_id=4 bits_offset=0",
+		"[6] FUNC_PROTO '(anon)' ret_type_id=4 vlen=1\n"
+		"\t'p' type_id=3");
+}
+
+/* Ensure btf__permute works as expected in the base-BTF scenario */
+static void test_permute_base(void)
+{
+	struct btf *btf;
+	__u32 permute_ids[7];
+	int err;
+
+	btf = btf__new_empty();
+	if (!ASSERT_OK_PTR(btf, "empty_main_btf"))
+		return;
+
+	btf__add_int(btf, "int", 4, BTF_INT_SIGNED);	/* [1] int */
+	btf__add_ptr(btf, 1);				/* [2] ptr to int */
+	btf__add_struct(btf, "s1", 4);			/* [3] struct s1 { */
+	btf__add_field(btf, "m", 1, 0, 0);		/*       int m; */
+							/* } */
+	btf__add_struct(btf, "s2", 4);			/* [4] struct s2 { */
+	btf__add_field(btf, "m", 1, 0, 0);		/*       int m; */
+							/* } */
+	btf__add_func_proto(btf, 1);			/* [5] int (*)(int *p); */
+	btf__add_func_param(btf, "p", 2);
+	btf__add_func(btf, "f", BTF_FUNC_STATIC, 5);	/* [6] int f(int *p); */
+
+	VALIDATE_RAW_BTF(
+		btf,
+		"[1] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED",
+		"[2] PTR '(anon)' type_id=1",
+		"[3] STRUCT 's1' size=4 vlen=1\n"
+		"\t'm' type_id=1 bits_offset=0",
+		"[4] STRUCT 's2' size=4 vlen=1\n"
+		"\t'm' type_id=1 bits_offset=0",
+		"[5] FUNC_PROTO '(anon)' ret_type_id=1 vlen=1\n"
+		"\t'p' type_id=2",
+		"[6] FUNC 'f' type_id=5 linkage=static");
+
+	permute_ids[0] = 0; /* [0] -> [0] */
+	permute_ids[1] = 4; /* [1] -> [4] */
+	permute_ids[2] = 3; /* [2] -> [3] */
+	permute_ids[3] = 5; /* [3] -> [5] */
+	permute_ids[4] = 1; /* [4] -> [1] */
+	permute_ids[5] = 6; /* [5] -> [6] */
+	permute_ids[6] = 2; /* [6] -> [2] */
+	err = btf__permute(btf, permute_ids, ARRAY_SIZE(permute_ids), NULL);
+	if (!ASSERT_OK(err, "btf__permute_base"))
+		goto done;
+	permute_base_check(btf);
+
+	/* ids[0] must be 0 for base BTF */
+	permute_ids[0] = 4; /* [0] -> [0] */
+	permute_ids[1] = 0; /* [1] -> [4] */
+	permute_ids[2] = 3; /* [2] -> [3] */
+	permute_ids[3] = 5; /* [3] -> [5] */
+	permute_ids[4] = 1; /* [4] -> [1] */
+	permute_ids[5] = 6; /* [5] -> [6] */
+	permute_ids[6] = 2; /* [6] -> [2] */
+	err = btf__permute(btf, permute_ids, ARRAY_SIZE(permute_ids), NULL);
+	if (!ASSERT_ERR(err, "btf__permute_base"))
+		goto done;
+	/* BTF is not modified */
+	permute_base_check(btf);
+
+	/* id_map_cnt is invalid */
+	permute_ids[0] = 0; /* [0] -> [0] */
+	permute_ids[1] = 4; /* [1] -> [4] */
+	permute_ids[2] = 3; /* [2] -> [3] */
+	permute_ids[3] = 5; /* [3] -> [5] */
+	permute_ids[4] = 1; /* [4] -> [1] */
+	permute_ids[5] = 6; /* [5] -> [6] */
+	permute_ids[6] = 2; /* [6] -> [2] */
+	err = btf__permute(btf, permute_ids, ARRAY_SIZE(permute_ids) - 1, NULL);
+	if (!ASSERT_ERR(err, "btf__permute_base"))
+		goto done;
+	/* BTF is not modified */
+	permute_base_check(btf);
+
+	/* Multiple types can not be mapped to the same ID */
+	permute_ids[0] = 0;
+	permute_ids[1] = 4;
+	permute_ids[2] = 4;
+	permute_ids[3] = 5;
+	permute_ids[4] = 1;
+	permute_ids[5] = 6;
+	permute_ids[6] = 2;
+	err = btf__permute(btf, permute_ids, ARRAY_SIZE(permute_ids), NULL);
+	if (!ASSERT_ERR(err, "btf__permute_base"))
+		goto done;
+	/* BTF is not modified */
+	permute_base_check(btf);
+
+	/* Type ID must be valid */
+	permute_ids[0] = 0;
+	permute_ids[1] = 4;
+	permute_ids[2] = 3;
+	permute_ids[3] = 5;
+	permute_ids[4] = 1;
+	permute_ids[5] = 7;
+	permute_ids[6] = 2;
+	err = btf__permute(btf, permute_ids, ARRAY_SIZE(permute_ids), NULL);
+	if (!ASSERT_ERR(err, "btf__permute_base"))
+		goto done;
+	/* BTF is not modified */
+	permute_base_check(btf);
+
+done:
+	btf__free(btf);
+}
+
+static void permute_split_check(struct btf *btf)
+{
+	VALIDATE_RAW_BTF(
+		btf,
+		"[1] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED",
+		"[2] PTR '(anon)' type_id=1",
+		"[3] STRUCT 's2' size=4 vlen=1\n"
+		"\t'm' type_id=1 bits_offset=0",
+		"[4] FUNC 'f' type_id=5 linkage=static",
+		"[5] FUNC_PROTO '(anon)' ret_type_id=1 vlen=1\n"
+		"\t'p' type_id=2",
+		"[6] STRUCT 's1' size=4 vlen=1\n"
+		"\t'm' type_id=1 bits_offset=0");
+}
+
+/* Ensure btf__permute works as expected in the split-BTF scenario */
+static void test_permute_split(void)
+{
+	struct btf *split_btf = NULL, *base_btf = NULL;
+	__u32 permute_ids[4];
+	int err, start_id;
+
+	base_btf = btf__new_empty();
+	if (!ASSERT_OK_PTR(base_btf, "empty_main_btf"))
+		return;
+
+	btf__add_int(base_btf, "int", 4, BTF_INT_SIGNED);	/* [1] int */
+	btf__add_ptr(base_btf, 1);				/* [2] ptr to int */
+	VALIDATE_RAW_BTF(
+		base_btf,
+		"[1] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED",
+		"[2] PTR '(anon)' type_id=1");
+	split_btf = btf__new_empty_split(base_btf);
+	if (!ASSERT_OK_PTR(split_btf, "empty_split_btf"))
+		goto cleanup;
+	btf__add_struct(split_btf, "s1", 4);			/* [3] struct s1 { */
+	btf__add_field(split_btf, "m", 1, 0, 0);		/*   int m; */
+								/* } */
+	btf__add_struct(split_btf, "s2", 4);			/* [4] struct s2 { */
+	btf__add_field(split_btf, "m", 1, 0, 0);		/*   int m; */
+								/* } */
+	btf__add_func_proto(split_btf, 1);			/* [5] int (*)(int p); */
+	btf__add_func_param(split_btf, "p", 2);
+	btf__add_func(split_btf, "f", BTF_FUNC_STATIC, 5);	/* [6] int f(int *p); */
+
+	VALIDATE_RAW_BTF(
+		split_btf,
+		"[1] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED",
+		"[2] PTR '(anon)' type_id=1",
+		"[3] STRUCT 's1' size=4 vlen=1\n"
+		"\t'm' type_id=1 bits_offset=0",
+		"[4] STRUCT 's2' size=4 vlen=1\n"
+		"\t'm' type_id=1 bits_offset=0",
+		"[5] FUNC_PROTO '(anon)' ret_type_id=1 vlen=1\n"
+		"\t'p' type_id=2",
+		"[6] FUNC 'f' type_id=5 linkage=static");
+
+	start_id = btf__type_cnt(base_btf);
+	permute_ids[3 - start_id] = 6; /* [3] -> [6] */
+	permute_ids[4 - start_id] = 3; /* [4] -> [3] */
+	permute_ids[5 - start_id] = 5; /* [5] -> [5] */
+	permute_ids[6 - start_id] = 4; /* [6] -> [4] */
+	err = btf__permute(split_btf, permute_ids, ARRAY_SIZE(permute_ids), NULL);
+	if (!ASSERT_OK(err, "btf__permute_split"))
+		goto cleanup;
+	permute_split_check(split_btf);
+
+	/*
+	 * For split BTF, id_map_cnt must equal to the number of types
+	 * added on top of base BTF
+	 */
+	permute_ids[3 - start_id] = 4;
+	permute_ids[4 - start_id] = 3;
+	permute_ids[5 - start_id] = 5;
+	permute_ids[6 - start_id] = 6;
+	err = btf__permute(split_btf, permute_ids, ARRAY_SIZE(permute_ids) - 1, NULL);
+	if (!ASSERT_ERR(err, "btf__permute_split"))
+		goto cleanup;
+	/* BTF is not modified */
+	permute_split_check(split_btf);
+
+	/* Multiple types can not be mapped to the same ID */
+	permute_ids[3 - start_id] = 4;
+	permute_ids[4 - start_id] = 3;
+	permute_ids[5 - start_id] = 3;
+	permute_ids[6 - start_id] = 6;
+	err = btf__permute(split_btf, permute_ids, ARRAY_SIZE(permute_ids), NULL);
+	if (!ASSERT_ERR(err, "btf__permute_split"))
+		goto cleanup;
+	/* BTF is not modified */
+	permute_split_check(split_btf);
+
+	/* Can not map to base ID */
+	permute_ids[3 - start_id] = 4;
+	permute_ids[4 - start_id] = 2;
+	permute_ids[5 - start_id] = 5;
+	permute_ids[6 - start_id] = 6;
+	err = btf__permute(split_btf, permute_ids, ARRAY_SIZE(permute_ids), NULL);
+	if (!ASSERT_ERR(err, "btf__permute_split"))
+		goto cleanup;
+	/* BTF is not modified */
+	permute_split_check(split_btf);
+
+cleanup:
+	btf__free(split_btf);
+	btf__free(base_btf);
+}
+
+void test_btf_permute(void)
+{
+	if (test__start_subtest("permute_base"))
+		test_permute_base();
+	if (test__start_subtest("permute_split"))
+		test_permute_split();
+}
-- 
cgit v1.2.3


From 230e7d7de5a8d2bcda2a5cdcc8c176c09a63e331 Mon Sep 17 00:00:00 2001
From: Donglin Peng <pengdonglin@xiaomi.com>
Date: Fri, 9 Jan 2026 20:59:55 +0800
Subject: tools/resolve_btfids: Support BTF sorting feature

This introduces a new BTF sorting phase that specifically sorts
BTF types by name in ascending order, so that the binary search
can be used to look up types.

Signed-off-by: Donglin Peng <pengdonglin@xiaomi.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/bpf/20260109130003.3313716-4-dolinux.peng@gmail.com
---
 tools/bpf/resolve_btfids/main.c | 64 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 64 insertions(+)

(limited to 'tools')

diff --git a/tools/bpf/resolve_btfids/main.c b/tools/bpf/resolve_btfids/main.c
index df39982f51df..343d08050116 100644
--- a/tools/bpf/resolve_btfids/main.c
+++ b/tools/bpf/resolve_btfids/main.c
@@ -850,6 +850,67 @@ static int dump_raw_btf(struct btf *btf, const char *out_path)
 	return 0;
 }
 
+/*
+ * Sort types by name in ascending order resulting in all
+ * anonymous types being placed before named types.
+ */
+static int cmp_type_names(const void *a, const void *b, void *priv)
+{
+	struct btf *btf = (struct btf *)priv;
+	const struct btf_type *ta = btf__type_by_id(btf, *(__u32 *)a);
+	const struct btf_type *tb = btf__type_by_id(btf, *(__u32 *)b);
+	const char *na, *nb;
+
+	na = btf__str_by_offset(btf, ta->name_off);
+	nb = btf__str_by_offset(btf, tb->name_off);
+	return strcmp(na, nb);
+}
+
+static int sort_btf_by_name(struct btf *btf)
+{
+	__u32 *permute_ids = NULL, *id_map = NULL;
+	int nr_types, i, err = 0;
+	__u32 start_id = 0, start_offs = 1, id;
+
+	if (btf__base_btf(btf)) {
+		start_id = btf__type_cnt(btf__base_btf(btf));
+		start_offs = 0;
+	}
+	nr_types = btf__type_cnt(btf) - start_id;
+
+	permute_ids = calloc(nr_types, sizeof(*permute_ids));
+	if (!permute_ids) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	id_map = calloc(nr_types, sizeof(*id_map));
+	if (!id_map) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	for (i = 0, id = start_id; i < nr_types; i++, id++)
+		permute_ids[i] = id;
+
+	qsort_r(permute_ids + start_offs, nr_types - start_offs,
+		sizeof(*permute_ids), cmp_type_names, btf);
+
+	for (i = 0; i < nr_types; i++) {
+		id = permute_ids[i] - start_id;
+		id_map[id] = i + start_id;
+	}
+
+	err = btf__permute(btf, id_map, nr_types, NULL);
+	if (err)
+		pr_err("FAILED: btf permute: %s\n", strerror(-err));
+
+out:
+	free(permute_ids);
+	free(id_map);
+	return err;
+}
+
 static inline int make_out_path(char *buf, u32 buf_sz, const char *in_path, const char *suffix)
 {
 	int len = snprintf(buf, buf_sz, "%s%s", in_path, suffix);
@@ -1025,6 +1086,9 @@ int main(int argc, const char **argv)
 	if (load_btf(&obj))
 		goto out;
 
+	if (sort_btf_by_name(obj.btf))
+		goto out;
+
 	if (elf_collect(&obj))
 		goto out;
 
-- 
cgit v1.2.3


From d836e5e64992363b5fa9b121f1ab4a1a1b89162d Mon Sep 17 00:00:00 2001
From: Donglin Peng <pengdonglin@xiaomi.com>
Date: Fri, 9 Jan 2026 20:59:56 +0800
Subject: libbpf: Optimize type lookup with binary search for sorted BTF

This patch introduces binary search optimization for BTF type lookups
when the BTF instance contains sorted types.

The optimization significantly improves performance when searching for
types in large BTF instances with sorted types. For unsorted BTF, the
implementation falls back to the original linear search.

Signed-off-by: Donglin Peng <pengdonglin@xiaomi.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20260109130003.3313716-5-dolinux.peng@gmail.com
---
 tools/lib/bpf/btf.c | 90 ++++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 65 insertions(+), 25 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c
index bf75f770d29a..5a6ac40439e4 100644
--- a/tools/lib/bpf/btf.c
+++ b/tools/lib/bpf/btf.c
@@ -92,6 +92,8 @@ struct btf {
 	 *   - for split BTF counts number of types added on top of base BTF.
 	 */
 	__u32 nr_types;
+	/* the start IDs of named types in sorted BTF */
+	int named_start_id;
 	/* if not NULL, points to the base BTF on top of which the current
 	 * split BTF is based
 	 */
@@ -897,46 +899,81 @@ int btf__resolve_type(const struct btf *btf, __u32 type_id)
 	return type_id;
 }
 
-__s32 btf__find_by_name(const struct btf *btf, const char *type_name)
+static __s32 btf_find_type_by_name_bsearch(const struct btf *btf, const char *name,
+					   __s32 start_id)
 {
-	__u32 i, nr_types = btf__type_cnt(btf);
-
-	if (!strcmp(type_name, "void"))
-		return 0;
-
-	for (i = 1; i < nr_types; i++) {
-		const struct btf_type *t = btf__type_by_id(btf, i);
-		const char *name = btf__name_by_offset(btf, t->name_off);
-
-		if (name && !strcmp(type_name, name))
-			return i;
+	const struct btf_type *t;
+	const char *tname;
+	__s32 l, r, m;
+
+	l = start_id;
+	r = btf__type_cnt(btf) - 1;
+	while (l <= r) {
+		m = l + (r - l) / 2;
+		t = btf_type_by_id(btf, m);
+		tname = btf__str_by_offset(btf, t->name_off);
+		if (strcmp(tname, name) >= 0) {
+			if (l == r)
+				return r;
+			r = m;
+		} else {
+			l = m + 1;
+		}
 	}
 
-	return libbpf_err(-ENOENT);
+	return btf__type_cnt(btf);
 }
 
 static __s32 btf_find_by_name_kind(const struct btf *btf, int start_id,
-				   const char *type_name, __u32 kind)
+				   const char *type_name, __s32 kind)
 {
-	__u32 i, nr_types = btf__type_cnt(btf);
+	__u32 nr_types = btf__type_cnt(btf);
+	const struct btf_type *t;
+	const char *tname;
+	__s32 id;
 
-	if (kind == BTF_KIND_UNKN || !strcmp(type_name, "void"))
-		return 0;
+	if (start_id < btf->start_id) {
+		id = btf_find_by_name_kind(btf->base_btf, start_id,
+					   type_name, kind);
+		if (id >= 0)
+			return id;
+		start_id = btf->start_id;
+	}
 
-	for (i = start_id; i < nr_types; i++) {
-		const struct btf_type *t = btf__type_by_id(btf, i);
-		const char *name;
+	if (kind == BTF_KIND_UNKN || strcmp(type_name, "void") == 0)
+		return 0;
 
-		if (btf_kind(t) != kind)
-			continue;
-		name = btf__name_by_offset(btf, t->name_off);
-		if (name && !strcmp(type_name, name))
-			return i;
+	if (btf->named_start_id > 0 && type_name[0]) {
+		start_id = max(start_id, btf->named_start_id);
+		id = btf_find_type_by_name_bsearch(btf, type_name, start_id);
+		for (; id < nr_types; id++) {
+			t = btf__type_by_id(btf, id);
+			tname = btf__str_by_offset(btf, t->name_off);
+			if (strcmp(tname, type_name) != 0)
+				return libbpf_err(-ENOENT);
+			if (kind < 0 || btf_kind(t) == kind)
+				return id;
+		}
+	} else {
+		for (id = start_id; id < nr_types; id++) {
+			t = btf_type_by_id(btf, id);
+			if (kind > 0 && btf_kind(t) != kind)
+				continue;
+			tname = btf__str_by_offset(btf, t->name_off);
+			if (strcmp(tname, type_name) == 0)
+				return id;
+		}
 	}
 
 	return libbpf_err(-ENOENT);
 }
 
+/* the kind value of -1 indicates that kind matching should be skipped */
+__s32 btf__find_by_name(const struct btf *btf, const char *type_name)
+{
+	return btf_find_by_name_kind(btf, 1, type_name, -1);
+}
+
 __s32 btf__find_by_name_kind_own(const struct btf *btf, const char *type_name,
 				 __u32 kind)
 {
@@ -1006,6 +1043,7 @@ static struct btf *btf_new_empty(struct btf *base_btf)
 	btf->fd = -1;
 	btf->ptr_sz = sizeof(void *);
 	btf->swapped_endian = false;
+	btf->named_start_id = 0;
 
 	if (base_btf) {
 		btf->base_btf = base_btf;
@@ -1057,6 +1095,7 @@ static struct btf *btf_new(const void *data, __u32 size, struct btf *base_btf, b
 	btf->start_id = 1;
 	btf->start_str_off = 0;
 	btf->fd = -1;
+	btf->named_start_id = 0;
 
 	if (base_btf) {
 		btf->base_btf = base_btf;
@@ -1715,6 +1754,7 @@ static void btf_invalidate_raw_data(struct btf *btf)
 		free(btf->raw_data_swapped);
 		btf->raw_data_swapped = NULL;
 	}
+	btf->named_start_id = 0;
 }
 
 /* Ensure BTF is ready to be modified (by splitting into a three memory
-- 
cgit v1.2.3


From 33ecca574f1c27cbf560aee9c1b3045dcb9f8de5 Mon Sep 17 00:00:00 2001
From: Donglin Peng <pengdonglin@xiaomi.com>
Date: Fri, 9 Jan 2026 20:59:57 +0800
Subject: libbpf: Verify BTF sorting

This patch checks whether the BTF is sorted by name in ascending
order. If sorted, binary search will be used when looking up types.

Signed-off-by: Donglin Peng <pengdonglin@xiaomi.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/bpf/20260109130003.3313716-6-dolinux.peng@gmail.com
---
 tools/lib/bpf/btf.c | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

(limited to 'tools')

diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c
index 5a6ac40439e4..808e53961ed6 100644
--- a/tools/lib/bpf/btf.c
+++ b/tools/lib/bpf/btf.c
@@ -899,6 +899,30 @@ int btf__resolve_type(const struct btf *btf, __u32 type_id)
 	return type_id;
 }
 
+static void btf_check_sorted(struct btf *btf)
+{
+	__u32 i, n, named_start_id = 0;
+
+	n = btf__type_cnt(btf);
+	for (i = btf->start_id + 1; i < n; i++) {
+		struct btf_type *ta = btf_type_by_id(btf, i - 1);
+		struct btf_type *tb = btf_type_by_id(btf, i);
+		const char *na = btf__str_by_offset(btf, ta->name_off);
+		const char *nb = btf__str_by_offset(btf, tb->name_off);
+
+		if (strcmp(na, nb) > 0)
+			return;
+
+		if (named_start_id == 0 && na[0] != '\0')
+			named_start_id = i - 1;
+		if (named_start_id == 0 && nb[0] != '\0')
+			named_start_id = i;
+	}
+
+	if (named_start_id)
+		btf->named_start_id = named_start_id;
+}
+
 static __s32 btf_find_type_by_name_bsearch(const struct btf *btf, const char *name,
 					   __s32 start_id)
 {
@@ -1130,6 +1154,7 @@ static struct btf *btf_new(const void *data, __u32 size, struct btf *base_btf, b
 	err = err ?: btf_sanity_check(btf);
 	if (err)
 		goto done;
+	btf_check_sorted(btf);
 
 done:
 	if (err) {
-- 
cgit v1.2.3


From 9282a42a1fe16c61a253293af439d6fecd8b5b6c Mon Sep 17 00:00:00 2001
From: Donglin Peng <pengdonglin@xiaomi.com>
Date: Fri, 9 Jan 2026 21:00:03 +0800
Subject: btf: Refactor the code by calling str_is_empty

Calling the str_is_empty function to clarify the code and
no functional changes are introduced.

Signed-off-by: Donglin Peng <pengdonglin@xiaomi.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/bpf/20260109130003.3313716-12-dolinux.peng@gmail.com
---
 tools/lib/bpf/btf.c    | 34 +++++++++++++++++-----------------
 tools/lib/bpf/libbpf.c |  4 ++--
 2 files changed, 19 insertions(+), 19 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c
index 808e53961ed6..83fe79ffcb8f 100644
--- a/tools/lib/bpf/btf.c
+++ b/tools/lib/bpf/btf.c
@@ -2134,7 +2134,7 @@ int btf__add_int(struct btf *btf, const char *name, size_t byte_sz, int encoding
 	int sz, name_off;
 
 	/* non-empty name */
-	if (!name || !name[0])
+	if (str_is_empty(name))
 		return libbpf_err(-EINVAL);
 	/* byte_sz must be power of 2 */
 	if (!byte_sz || (byte_sz & (byte_sz - 1)) || byte_sz > 16)
@@ -2182,7 +2182,7 @@ int btf__add_float(struct btf *btf, const char *name, size_t byte_sz)
 	int sz, name_off;
 
 	/* non-empty name */
-	if (!name || !name[0])
+	if (str_is_empty(name))
 		return libbpf_err(-EINVAL);
 
 	/* byte_sz must be one of the explicitly allowed values */
@@ -2237,7 +2237,7 @@ static int btf_add_ref_kind(struct btf *btf, int kind, const char *name, int ref
 	if (!t)
 		return libbpf_err(-ENOMEM);
 
-	if (name && name[0]) {
+	if (!str_is_empty(name)) {
 		name_off = btf__add_str(btf, name);
 		if (name_off < 0)
 			return name_off;
@@ -2314,7 +2314,7 @@ static int btf_add_composite(struct btf *btf, int kind, const char *name, __u32
 	if (!t)
 		return libbpf_err(-ENOMEM);
 
-	if (name && name[0]) {
+	if (!str_is_empty(name)) {
 		name_off = btf__add_str(btf, name);
 		if (name_off < 0)
 			return name_off;
@@ -2415,7 +2415,7 @@ int btf__add_field(struct btf *btf, const char *name, int type_id,
 	if (!m)
 		return libbpf_err(-ENOMEM);
 
-	if (name && name[0]) {
+	if (!str_is_empty(name)) {
 		name_off = btf__add_str(btf, name);
 		if (name_off < 0)
 			return name_off;
@@ -2453,7 +2453,7 @@ static int btf_add_enum_common(struct btf *btf, const char *name, __u32 byte_sz,
 	if (!t)
 		return libbpf_err(-ENOMEM);
 
-	if (name && name[0]) {
+	if (!str_is_empty(name)) {
 		name_off = btf__add_str(btf, name);
 		if (name_off < 0)
 			return name_off;
@@ -2511,7 +2511,7 @@ int btf__add_enum_value(struct btf *btf, const char *name, __s64 value)
 		return libbpf_err(-EINVAL);
 
 	/* non-empty name */
-	if (!name || !name[0])
+	if (str_is_empty(name))
 		return libbpf_err(-EINVAL);
 	if (value < INT_MIN || value > UINT_MAX)
 		return libbpf_err(-E2BIG);
@@ -2588,7 +2588,7 @@ int btf__add_enum64_value(struct btf *btf, const char *name, __u64 value)
 		return libbpf_err(-EINVAL);
 
 	/* non-empty name */
-	if (!name || !name[0])
+	if (str_is_empty(name))
 		return libbpf_err(-EINVAL);
 
 	/* decompose and invalidate raw data */
@@ -2628,7 +2628,7 @@ int btf__add_enum64_value(struct btf *btf, const char *name, __u64 value)
  */
 int btf__add_fwd(struct btf *btf, const char *name, enum btf_fwd_kind fwd_kind)
 {
-	if (!name || !name[0])
+	if (str_is_empty(name))
 		return libbpf_err(-EINVAL);
 
 	switch (fwd_kind) {
@@ -2664,7 +2664,7 @@ int btf__add_fwd(struct btf *btf, const char *name, enum btf_fwd_kind fwd_kind)
  */
 int btf__add_typedef(struct btf *btf, const char *name, int ref_type_id)
 {
-	if (!name || !name[0])
+	if (str_is_empty(name))
 		return libbpf_err(-EINVAL);
 
 	return btf_add_ref_kind(btf, BTF_KIND_TYPEDEF, name, ref_type_id, 0);
@@ -2716,7 +2716,7 @@ int btf__add_restrict(struct btf *btf, int ref_type_id)
  */
 int btf__add_type_tag(struct btf *btf, const char *value, int ref_type_id)
 {
-	if (!value || !value[0])
+	if (str_is_empty(value))
 		return libbpf_err(-EINVAL);
 
 	return btf_add_ref_kind(btf, BTF_KIND_TYPE_TAG, value, ref_type_id, 0);
@@ -2733,7 +2733,7 @@ int btf__add_type_tag(struct btf *btf, const char *value, int ref_type_id)
  */
 int btf__add_type_attr(struct btf *btf, const char *value, int ref_type_id)
 {
-	if (!value || !value[0])
+	if (str_is_empty(value))
 		return libbpf_err(-EINVAL);
 
 	return btf_add_ref_kind(btf, BTF_KIND_TYPE_TAG, value, ref_type_id, 1);
@@ -2752,7 +2752,7 @@ int btf__add_func(struct btf *btf, const char *name,
 {
 	int id;
 
-	if (!name || !name[0])
+	if (str_is_empty(name))
 		return libbpf_err(-EINVAL);
 	if (linkage != BTF_FUNC_STATIC && linkage != BTF_FUNC_GLOBAL &&
 	    linkage != BTF_FUNC_EXTERN)
@@ -2838,7 +2838,7 @@ int btf__add_func_param(struct btf *btf, const char *name, int type_id)
 	if (!p)
 		return libbpf_err(-ENOMEM);
 
-	if (name && name[0]) {
+	if (!str_is_empty(name)) {
 		name_off = btf__add_str(btf, name);
 		if (name_off < 0)
 			return name_off;
@@ -2873,7 +2873,7 @@ int btf__add_var(struct btf *btf, const char *name, int linkage, int type_id)
 	int sz, name_off;
 
 	/* non-empty name */
-	if (!name || !name[0])
+	if (str_is_empty(name))
 		return libbpf_err(-EINVAL);
 	if (linkage != BTF_VAR_STATIC && linkage != BTF_VAR_GLOBAL_ALLOCATED &&
 	    linkage != BTF_VAR_GLOBAL_EXTERN)
@@ -2922,7 +2922,7 @@ int btf__add_datasec(struct btf *btf, const char *name, __u32 byte_sz)
 	int sz, name_off;
 
 	/* non-empty name */
-	if (!name || !name[0])
+	if (str_is_empty(name))
 		return libbpf_err(-EINVAL);
 
 	if (btf_ensure_modifiable(btf))
@@ -2999,7 +2999,7 @@ static int btf_add_decl_tag(struct btf *btf, const char *value, int ref_type_id,
 	struct btf_type *t;
 	int sz, value_off;
 
-	if (!value || !value[0] || component_idx < -1)
+	if (str_is_empty(value) || component_idx < -1)
 		return libbpf_err(-EINVAL);
 
 	if (validate_type_id(ref_type_id))
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 6ea81701e274..bbcfd72b07d5 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -2904,7 +2904,7 @@ static int bpf_object__init_user_btf_map(struct bpf_object *obj,
 	var_extra = btf_var(var);
 	map_name = btf__name_by_offset(obj->btf, var->name_off);
 
-	if (map_name == NULL || map_name[0] == '\0') {
+	if (str_is_empty(map_name)) {
 		pr_warn("map #%d: empty name.\n", var_idx);
 		return -EINVAL;
 	}
@@ -4281,7 +4281,7 @@ static int bpf_object__collect_externs(struct bpf_object *obj)
 		if (!sym_is_extern(sym))
 			continue;
 		ext_name = elf_sym_str(obj, sym->st_name);
-		if (!ext_name || !ext_name[0])
+		if (str_is_empty(ext_name))
 			continue;
 
 		ext = obj->externs;
-- 
cgit v1.2.3


From c3a9a27c79e4e5d8bdb20a26d16230111207e98e Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Thu, 8 Jan 2026 19:45:25 -0800
Subject: KVM: selftests: Add a test to verify APICv updates (while L2 is
 active)

Add a test to verify KVM correctly handles a variety of edge cases related
to APICv updates, and in particular updates that are triggered while L2 is
actively running.

Reviewed-by: Chao Gao <chao.gao@intel.com>
Link: https://patch.msgid.link/20260109034532.1012993-2-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 tools/testing/selftests/kvm/Makefile.kvm           |   1 +
 tools/testing/selftests/kvm/include/x86/apic.h     |   4 +
 .../selftests/kvm/x86/vmx_apicv_updates_test.c     | 155 +++++++++++++++++++++
 3 files changed, 160 insertions(+)
 create mode 100644 tools/testing/selftests/kvm/x86/vmx_apicv_updates_test.c

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm
index ba5c2b643efa..6f00bd8271c2 100644
--- a/tools/testing/selftests/kvm/Makefile.kvm
+++ b/tools/testing/selftests/kvm/Makefile.kvm
@@ -115,6 +115,7 @@ TEST_GEN_PROGS_x86 += x86/ucna_injection_test
 TEST_GEN_PROGS_x86 += x86/userspace_io_test
 TEST_GEN_PROGS_x86 += x86/userspace_msr_exit_test
 TEST_GEN_PROGS_x86 += x86/vmx_apic_access_test
+TEST_GEN_PROGS_x86 += x86/vmx_apicv_updates_test
 TEST_GEN_PROGS_x86 += x86/vmx_dirty_log_test
 TEST_GEN_PROGS_x86 += x86/vmx_exception_with_invalid_guest_state
 TEST_GEN_PROGS_x86 += x86/vmx_msrs_test
diff --git a/tools/testing/selftests/kvm/include/x86/apic.h b/tools/testing/selftests/kvm/include/x86/apic.h
index 80fe9f69b38d..d42a0998d868 100644
--- a/tools/testing/selftests/kvm/include/x86/apic.h
+++ b/tools/testing/selftests/kvm/include/x86/apic.h
@@ -32,6 +32,7 @@
 #define	APIC_SPIV	0xF0
 #define		APIC_SPIV_FOCUS_DISABLED	(1 << 9)
 #define		APIC_SPIV_APIC_ENABLED		(1 << 8)
+#define	APIC_ISR	0x100
 #define APIC_IRR	0x200
 #define	APIC_ICR	0x300
 #define	APIC_LVTCMCI	0x2f0
@@ -68,6 +69,9 @@
 #define	APIC_TMCCT	0x390
 #define	APIC_TDCR	0x3E0
 
+#define APIC_VECTOR_TO_BIT_NUMBER(v) ((unsigned int)(v) % 32)
+#define APIC_VECTOR_TO_REG_OFFSET(v) ((unsigned int)(v) / 32 * 0x10)
+
 void apic_disable(void);
 void xapic_enable(void);
 void x2apic_enable(void);
diff --git a/tools/testing/selftests/kvm/x86/vmx_apicv_updates_test.c b/tools/testing/selftests/kvm/x86/vmx_apicv_updates_test.c
new file mode 100644
index 000000000000..337c53fddeff
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86/vmx_apicv_updates_test.c
@@ -0,0 +1,155 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+#include "vmx.h"
+
+#define GOOD_IPI_VECTOR 0xe0
+#define BAD_IPI_VECTOR 0xf0
+
+static volatile int good_ipis_received;
+
+static void good_ipi_handler(struct ex_regs *regs)
+{
+	good_ipis_received++;
+}
+
+static void bad_ipi_handler(struct ex_regs *regs)
+{
+	GUEST_FAIL("Received \"bad\" IPI; ICR MMIO write should have been ignored");
+}
+
+static void l2_guest_code(void)
+{
+	x2apic_enable();
+	vmcall();
+
+	xapic_enable();
+	xapic_write_reg(APIC_ID, 1 << 24);
+	vmcall();
+}
+
+static void l1_guest_code(struct vmx_pages *vmx_pages)
+{
+#define L2_GUEST_STACK_SIZE 64
+	unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
+	uint32_t control;
+
+	GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages));
+	GUEST_ASSERT(load_vmcs(vmx_pages));
+
+	/* Prepare the VMCS for L2 execution. */
+	prepare_vmcs(vmx_pages, l2_guest_code, &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+	control = vmreadz(CPU_BASED_VM_EXEC_CONTROL);
+	control |= CPU_BASED_USE_MSR_BITMAPS;
+	vmwrite(CPU_BASED_VM_EXEC_CONTROL, control);
+
+	/* Modify APIC ID to coerce KVM into inhibiting APICv. */
+	xapic_enable();
+	xapic_write_reg(APIC_ID, 1 << 24);
+
+	/*
+	 * Generate+receive an IRQ without doing EOI to get an IRQ set in vISR
+	 * but not SVI.  APICv should be inhibited due to running with a
+	 * modified APIC ID.
+	 */
+	xapic_write_reg(APIC_ICR, APIC_DEST_SELF | APIC_DM_FIXED | GOOD_IPI_VECTOR);
+	GUEST_ASSERT_EQ(xapic_read_reg(APIC_ID), 1 << 24);
+
+	/* Enable IRQs and verify the IRQ was received. */
+	sti_nop();
+	GUEST_ASSERT_EQ(good_ipis_received, 1);
+
+	/*
+	 * Run L2 to switch to x2APIC mode, which in turn will uninhibit APICv,
+	 * as KVM should force the APIC ID back to its default.
+	 */
+	GUEST_ASSERT(!vmlaunch());
+	GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL);
+	vmwrite(GUEST_RIP, vmreadz(GUEST_RIP) + vmreadz(VM_EXIT_INSTRUCTION_LEN));
+	GUEST_ASSERT(rdmsr(MSR_IA32_APICBASE) & MSR_IA32_APICBASE_EXTD);
+
+	/*
+	 * Scribble the APIC access page to verify KVM disabled xAPIC
+	 * virtualization in vmcs01, and to verify that KVM flushes L1's TLB
+	 * when L2 switches back to accelerated xAPIC mode.
+	 */
+	xapic_write_reg(APIC_ICR2, 0xdeadbeefu);
+	xapic_write_reg(APIC_ICR, APIC_DEST_SELF | APIC_DM_FIXED | BAD_IPI_VECTOR);
+
+	/*
+	 * Verify the IRQ is still in-service and emit an EOI to verify KVM
+	 * propagates the highest vISR vector to SVI when APICv is activated
+	 * (and does so even if APICv was uninhibited while L2 was active).
+	 */
+	GUEST_ASSERT_EQ(x2apic_read_reg(APIC_ISR + APIC_VECTOR_TO_REG_OFFSET(GOOD_IPI_VECTOR)),
+			BIT(APIC_VECTOR_TO_BIT_NUMBER(GOOD_IPI_VECTOR)));
+	x2apic_write_reg(APIC_EOI, 0);
+	GUEST_ASSERT_EQ(x2apic_read_reg(APIC_ISR + APIC_VECTOR_TO_REG_OFFSET(GOOD_IPI_VECTOR)), 0);
+
+	/*
+	 * Run L2 one more time to switch back to xAPIC mode to verify that KVM
+	 * handles the x2APIC => xAPIC transition and inhibits APICv while L2
+	 * is active.
+	 */
+	GUEST_ASSERT(!vmresume());
+	GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL);
+	GUEST_ASSERT(!(rdmsr(MSR_IA32_APICBASE) & MSR_IA32_APICBASE_EXTD));
+
+	xapic_write_reg(APIC_ICR, APIC_DEST_SELF | APIC_DM_FIXED | GOOD_IPI_VECTOR);
+	/* Re-enable IRQs, as VM-Exit clears RFLAGS.IF. */
+	sti_nop();
+	GUEST_ASSERT_EQ(good_ipis_received, 2);
+
+	GUEST_ASSERT_EQ(xapic_read_reg(APIC_ISR + APIC_VECTOR_TO_REG_OFFSET(GOOD_IPI_VECTOR)),
+			BIT(APIC_VECTOR_TO_BIT_NUMBER(GOOD_IPI_VECTOR)));
+	xapic_write_reg(APIC_EOI, 0);
+	GUEST_ASSERT_EQ(xapic_read_reg(APIC_ISR + APIC_VECTOR_TO_REG_OFFSET(GOOD_IPI_VECTOR)), 0);
+	GUEST_DONE();
+}
+
+int main(int argc, char *argv[])
+{
+	vm_vaddr_t vmx_pages_gva;
+	struct vmx_pages *vmx;
+	struct kvm_vcpu *vcpu;
+	struct kvm_vm *vm;
+	struct ucall uc;
+
+	TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX));
+
+	vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code);
+
+	vmx = vcpu_alloc_vmx(vm, &vmx_pages_gva);
+	prepare_virtualize_apic_accesses(vmx, vm);
+	vcpu_args_set(vcpu, 1, vmx_pages_gva);
+
+	virt_pg_map(vm, APIC_DEFAULT_GPA, APIC_DEFAULT_GPA);
+	vm_install_exception_handler(vm, BAD_IPI_VECTOR, bad_ipi_handler);
+	vm_install_exception_handler(vm, GOOD_IPI_VECTOR, good_ipi_handler);
+
+	vcpu_run(vcpu);
+	TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
+
+	switch (get_ucall(vcpu, &uc)) {
+	case UCALL_ABORT:
+		REPORT_GUEST_ASSERT(uc);
+		/* NOT REACHED */
+	case UCALL_DONE:
+		break;
+	default:
+		TEST_FAIL("Unexpected ucall %lu", uc.cmd);
+	}
+
+	/*
+	 * Verify at least two IRQs were injected.  Unfortunately, KVM counts
+	 * re-injected IRQs (e.g. if delivering the IRQ hits an EPT violation),
+	 * so being more precise isn't possible given the current stats.
+	 */
+	TEST_ASSERT(vcpu_get_stat(vcpu, irq_injections) >= 2,
+		    "Wanted at least 2 IRQ injections, got %lu\n",
+		    vcpu_get_stat(vcpu, irq_injections));
+
+	kvm_vm_free(vm);
+	return 0;
+}
-- 
cgit v1.2.3


From d7507a94a07202234236d7f94bed6015ca645ae6 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Tue, 30 Dec 2025 13:13:44 -0800
Subject: KVM: SVM: Treat exit_code as an unsigned 64-bit value through all of
 KVM

Fix KVM's long-standing buggy handling of SVM's exit_code as a 32-bit
value.  Per the APM and Xen commit d1bd157fbc ("Big merge the HVM
full-virtualisation abstractions.") (which is arguably more trustworthy
than KVM), offset 0x70 is a single 64-bit value:

  070h 63:0 EXITCODE

Track exit_code as a single u64 to prevent reintroducing bugs where KVM
neglects to correctly set bits 63:32.

Fixes: 6aa8b732ca01 ("[PATCH] kvm: userspace interface")
Cc: Jim Mattson <jmattson@google.com>
Cc: Yosry Ahmed <yosry.ahmed@linux.dev>
Reviewed-by: Yosry Ahmed <yosry.ahmed@linux.dev>
Link: https://patch.msgid.link/20251230211347.4099600-6-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/include/asm/svm.h                         |  3 +-
 arch/x86/include/uapi/asm/svm.h                    | 32 +++++++++----------
 arch/x86/kvm/svm/hyperv.c                          |  1 -
 arch/x86/kvm/svm/nested.c                          | 13 ++------
 arch/x86/kvm/svm/sev.c                             | 36 ++++++++--------------
 arch/x86/kvm/svm/svm.c                             |  7 ++---
 arch/x86/kvm/svm/svm.h                             |  4 +--
 arch/x86/kvm/trace.h                               |  6 ++--
 include/hyperv/hvgdk.h                             |  2 +-
 tools/testing/selftests/kvm/include/x86/svm.h      |  3 +-
 .../kvm/x86/svm_nested_soft_inject_test.c          |  4 +--
 11 files changed, 42 insertions(+), 69 deletions(-)

(limited to 'tools')

diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 50ece197c98a..edde36097ddc 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -137,8 +137,7 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
 	u32 int_vector;
 	u32 int_state;
 	u8 reserved_3[4];
-	u32 exit_code;
-	u32 exit_code_hi;
+	u64 exit_code;
 	u64 exit_info_1;
 	u64 exit_info_2;
 	u32 exit_int_info;
diff --git a/arch/x86/include/uapi/asm/svm.h b/arch/x86/include/uapi/asm/svm.h
index 650e3256ea7d..010a45c9f614 100644
--- a/arch/x86/include/uapi/asm/svm.h
+++ b/arch/x86/include/uapi/asm/svm.h
@@ -103,38 +103,38 @@
 #define SVM_EXIT_VMGEXIT       0x403
 
 /* SEV-ES software-defined VMGEXIT events */
-#define SVM_VMGEXIT_MMIO_READ			0x80000001
-#define SVM_VMGEXIT_MMIO_WRITE			0x80000002
-#define SVM_VMGEXIT_NMI_COMPLETE		0x80000003
-#define SVM_VMGEXIT_AP_HLT_LOOP			0x80000004
-#define SVM_VMGEXIT_AP_JUMP_TABLE		0x80000005
+#define SVM_VMGEXIT_MMIO_READ			0x80000001ull
+#define SVM_VMGEXIT_MMIO_WRITE			0x80000002ull
+#define SVM_VMGEXIT_NMI_COMPLETE		0x80000003ull
+#define SVM_VMGEXIT_AP_HLT_LOOP			0x80000004ull
+#define SVM_VMGEXIT_AP_JUMP_TABLE		0x80000005ull
 #define SVM_VMGEXIT_SET_AP_JUMP_TABLE		0
 #define SVM_VMGEXIT_GET_AP_JUMP_TABLE		1
-#define SVM_VMGEXIT_PSC				0x80000010
-#define SVM_VMGEXIT_GUEST_REQUEST		0x80000011
-#define SVM_VMGEXIT_EXT_GUEST_REQUEST		0x80000012
-#define SVM_VMGEXIT_AP_CREATION			0x80000013
+#define SVM_VMGEXIT_PSC				0x80000010ull
+#define SVM_VMGEXIT_GUEST_REQUEST		0x80000011ull
+#define SVM_VMGEXIT_EXT_GUEST_REQUEST		0x80000012ull
+#define SVM_VMGEXIT_AP_CREATION			0x80000013ull
 #define SVM_VMGEXIT_AP_CREATE_ON_INIT		0
 #define SVM_VMGEXIT_AP_CREATE			1
 #define SVM_VMGEXIT_AP_DESTROY			2
-#define SVM_VMGEXIT_SNP_RUN_VMPL		0x80000018
-#define SVM_VMGEXIT_SAVIC			0x8000001a
+#define SVM_VMGEXIT_SNP_RUN_VMPL		0x80000018ull
+#define SVM_VMGEXIT_SAVIC			0x8000001aull
 #define SVM_VMGEXIT_SAVIC_REGISTER_GPA		0
 #define SVM_VMGEXIT_SAVIC_UNREGISTER_GPA	1
 #define SVM_VMGEXIT_SAVIC_SELF_GPA		~0ULL
-#define SVM_VMGEXIT_HV_FEATURES			0x8000fffd
-#define SVM_VMGEXIT_TERM_REQUEST		0x8000fffe
+#define SVM_VMGEXIT_HV_FEATURES			0x8000fffdull
+#define SVM_VMGEXIT_TERM_REQUEST		0x8000fffeull
 #define SVM_VMGEXIT_TERM_REASON(reason_set, reason_code)	\
 	/* SW_EXITINFO1[3:0] */					\
 	(((((u64)reason_set) & 0xf)) |				\
 	/* SW_EXITINFO1[11:4] */				\
 	((((u64)reason_code) & 0xff) << 4))
-#define SVM_VMGEXIT_UNSUPPORTED_EVENT		0x8000ffff
+#define SVM_VMGEXIT_UNSUPPORTED_EVENT		0x8000ffffull
 
 /* Exit code reserved for hypervisor/software use */
-#define SVM_EXIT_SW				0xf0000000
+#define SVM_EXIT_SW				0xf0000000ull
 
-#define SVM_EXIT_ERR           -1
+#define SVM_EXIT_ERR           -1ull
 
 #define SVM_EXIT_REASONS \
 	{ SVM_EXIT_READ_CR0,    "read_cr0" }, \
diff --git a/arch/x86/kvm/svm/hyperv.c b/arch/x86/kvm/svm/hyperv.c
index 088f6429b24c..3ec580d687f5 100644
--- a/arch/x86/kvm/svm/hyperv.c
+++ b/arch/x86/kvm/svm/hyperv.c
@@ -11,7 +11,6 @@ void svm_hv_inject_synthetic_vmexit_post_tlb_flush(struct kvm_vcpu *vcpu)
 	struct vcpu_svm *svm = to_svm(vcpu);
 
 	svm->vmcb->control.exit_code = HV_SVM_EXITCODE_ENL;
-	svm->vmcb->control.exit_code_hi = 0;
 	svm->vmcb->control.exit_info_1 = HV_SVM_ENL_EXITCODE_TRAP_AFTER_FLUSH;
 	svm->vmcb->control.exit_info_2 = 0;
 	nested_svm_vmexit(svm);
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 666b5a36c15d..5aa0512e09c9 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -45,7 +45,6 @@ static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
 		 * correctly fill in the high bits of exit_info_1.
 		 */
 		vmcb->control.exit_code = SVM_EXIT_NPF;
-		vmcb->control.exit_code_hi = 0;
 		vmcb->control.exit_info_1 = (1ULL << 32);
 		vmcb->control.exit_info_2 = fault->address;
 	}
@@ -441,7 +440,6 @@ void __nested_copy_vmcb_control_to_cache(struct kvm_vcpu *vcpu,
 	to->int_vector          = from->int_vector;
 	to->int_state           = from->int_state;
 	to->exit_code           = from->exit_code;
-	to->exit_code_hi        = from->exit_code_hi;
 	to->exit_info_1         = from->exit_info_1;
 	to->exit_info_2         = from->exit_info_2;
 	to->exit_int_info       = from->exit_int_info;
@@ -747,8 +745,8 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm,
 	enter_guest_mode(vcpu);
 
 	/*
-	 * Filled at exit: exit_code, exit_code_hi, exit_info_1, exit_info_2,
-	 * exit_int_info, exit_int_info_err, next_rip, insn_len, insn_bytes.
+	 * Filled at exit: exit_code, exit_info_1, exit_info_2, exit_int_info,
+	 * exit_int_info_err, next_rip, insn_len, insn_bytes.
 	 */
 
 	if (guest_cpu_cap_has(vcpu, X86_FEATURE_VGIF) &&
@@ -1018,7 +1016,6 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu)
 	if (!nested_vmcb_check_save(vcpu) ||
 	    !nested_vmcb_check_controls(vcpu)) {
 		vmcb12->control.exit_code    = SVM_EXIT_ERR;
-		vmcb12->control.exit_code_hi = -1u;
 		vmcb12->control.exit_info_1  = 0;
 		vmcb12->control.exit_info_2  = 0;
 		goto out;
@@ -1051,7 +1048,6 @@ out_exit_err:
 	svm->soft_int_injected = false;
 
 	svm->vmcb->control.exit_code    = SVM_EXIT_ERR;
-	svm->vmcb->control.exit_code_hi = -1u;
 	svm->vmcb->control.exit_info_1  = 0;
 	svm->vmcb->control.exit_info_2  = 0;
 
@@ -1163,7 +1159,6 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
 
 	vmcb12->control.int_state         = vmcb02->control.int_state;
 	vmcb12->control.exit_code         = vmcb02->control.exit_code;
-	vmcb12->control.exit_code_hi      = vmcb02->control.exit_code_hi;
 	vmcb12->control.exit_info_1       = vmcb02->control.exit_info_1;
 	vmcb12->control.exit_info_2       = vmcb02->control.exit_info_2;
 
@@ -1460,7 +1455,7 @@ static int nested_svm_intercept_ioio(struct vcpu_svm *svm)
 
 static int nested_svm_intercept(struct vcpu_svm *svm)
 {
-	u32 exit_code = svm->vmcb->control.exit_code;
+	u64 exit_code = svm->vmcb->control.exit_code;
 	int vmexit = NESTED_EXIT_HOST;
 
 	if (svm_is_vmrun_failure(exit_code))
@@ -1532,7 +1527,6 @@ static void nested_svm_inject_exception_vmexit(struct kvm_vcpu *vcpu)
 	struct vmcb *vmcb = svm->vmcb;
 
 	vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + ex->vector;
-	vmcb->control.exit_code_hi = 0;
 
 	if (ex->has_error_code)
 		vmcb->control.exit_info_1 = ex->error_code;
@@ -1708,7 +1702,6 @@ static void nested_copy_vmcb_cache_to_control(struct vmcb_control_area *dst,
 	dst->int_vector           = from->int_vector;
 	dst->int_state            = from->int_state;
 	dst->exit_code            = from->exit_code;
-	dst->exit_code_hi         = from->exit_code_hi;
 	dst->exit_info_1          = from->exit_info_1;
 	dst->exit_info_2          = from->exit_info_2;
 	dst->exit_int_info        = from->exit_int_info;
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 28150506b18c..f67525007089 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -3270,11 +3270,6 @@ skip_vmsa_free:
 		kvfree(svm->sev_es.ghcb_sa);
 }
 
-static u64 kvm_get_cached_sw_exit_code(struct vmcb_control_area *control)
-{
-	return (((u64)control->exit_code_hi) << 32) | control->exit_code;
-}
-
 static void dump_ghcb(struct vcpu_svm *svm)
 {
 	struct vmcb_control_area *control = &svm->vmcb->control;
@@ -3296,7 +3291,7 @@ static void dump_ghcb(struct vcpu_svm *svm)
 	 */
 	pr_err("GHCB (GPA=%016llx) snapshot:\n", svm->vmcb->control.ghcb_gpa);
 	pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_code",
-	       kvm_get_cached_sw_exit_code(control), kvm_ghcb_sw_exit_code_is_valid(svm));
+	       control->exit_code, kvm_ghcb_sw_exit_code_is_valid(svm));
 	pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_info_1",
 	       control->exit_info_1, kvm_ghcb_sw_exit_info_1_is_valid(svm));
 	pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_info_2",
@@ -3330,7 +3325,6 @@ static void sev_es_sync_from_ghcb(struct vcpu_svm *svm)
 	struct vmcb_control_area *control = &svm->vmcb->control;
 	struct kvm_vcpu *vcpu = &svm->vcpu;
 	struct ghcb *ghcb = svm->sev_es.ghcb;
-	u64 exit_code;
 
 	/*
 	 * The GHCB protocol so far allows for the following data
@@ -3364,9 +3358,7 @@ static void sev_es_sync_from_ghcb(struct vcpu_svm *svm)
 		__kvm_emulate_msr_write(vcpu, MSR_IA32_XSS, kvm_ghcb_get_xss(svm));
 
 	/* Copy the GHCB exit information into the VMCB fields */
-	exit_code = kvm_ghcb_get_sw_exit_code(svm);
-	control->exit_code = lower_32_bits(exit_code);
-	control->exit_code_hi = upper_32_bits(exit_code);
+	control->exit_code = kvm_ghcb_get_sw_exit_code(svm);
 	control->exit_info_1 = kvm_ghcb_get_sw_exit_info_1(svm);
 	control->exit_info_2 = kvm_ghcb_get_sw_exit_info_2(svm);
 	svm->sev_es.sw_scratch = kvm_ghcb_get_sw_scratch_if_valid(svm);
@@ -3379,15 +3371,8 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm)
 {
 	struct vmcb_control_area *control = &svm->vmcb->control;
 	struct kvm_vcpu *vcpu = &svm->vcpu;
-	u64 exit_code;
 	u64 reason;
 
-	/*
-	 * Retrieve the exit code now even though it may not be marked valid
-	 * as it could help with debugging.
-	 */
-	exit_code = kvm_get_cached_sw_exit_code(control);
-
 	/* Only GHCB Usage code 0 is supported */
 	if (svm->sev_es.ghcb->ghcb_usage) {
 		reason = GHCB_ERR_INVALID_USAGE;
@@ -3401,7 +3386,7 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm)
 	    !kvm_ghcb_sw_exit_info_2_is_valid(svm))
 		goto vmgexit_err;
 
-	switch (exit_code) {
+	switch (control->exit_code) {
 	case SVM_EXIT_READ_DR7:
 		break;
 	case SVM_EXIT_WRITE_DR7:
@@ -3502,15 +3487,19 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm)
 	return 0;
 
 vmgexit_err:
+	/*
+	 * Print the exit code even though it may not be marked valid as it
+	 * could help with debugging.
+	 */
 	if (reason == GHCB_ERR_INVALID_USAGE) {
 		vcpu_unimpl(vcpu, "vmgexit: ghcb usage %#x is not valid\n",
 			    svm->sev_es.ghcb->ghcb_usage);
 	} else if (reason == GHCB_ERR_INVALID_EVENT) {
 		vcpu_unimpl(vcpu, "vmgexit: exit code %#llx is not valid\n",
-			    exit_code);
+			    control->exit_code);
 	} else {
 		vcpu_unimpl(vcpu, "vmgexit: exit code %#llx input is not valid\n",
-			    exit_code);
+			    control->exit_code);
 		dump_ghcb(svm);
 	}
 
@@ -4349,7 +4338,7 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 	struct vmcb_control_area *control = &svm->vmcb->control;
-	u64 ghcb_gpa, exit_code;
+	u64 ghcb_gpa;
 	int ret;
 
 	/* Validate the GHCB */
@@ -4391,8 +4380,7 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu)
 
 	svm_vmgexit_success(svm, 0);
 
-	exit_code = kvm_get_cached_sw_exit_code(control);
-	switch (exit_code) {
+	switch (control->exit_code) {
 	case SVM_VMGEXIT_MMIO_READ:
 		ret = setup_vmgexit_scratch(svm, true, control->exit_info_2);
 		if (ret)
@@ -4484,7 +4472,7 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu)
 		ret = -EINVAL;
 		break;
 	default:
-		ret = svm_invoke_exit_handler(vcpu, exit_code);
+		ret = svm_invoke_exit_handler(vcpu, control->exit_code);
 	}
 
 	return ret;
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 3caf7a21679f..a28cd61d87ea 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -2466,7 +2466,6 @@ static bool check_selective_cr0_intercepted(struct kvm_vcpu *vcpu,
 
 	if (cr0 ^ val) {
 		svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE;
-		svm->vmcb->control.exit_code_hi = 0;
 		ret = (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE);
 	}
 
@@ -3299,7 +3298,7 @@ static void dump_vmcb(struct kvm_vcpu *vcpu)
 	pr_err("%-20s%08x\n", "int_ctl:", control->int_ctl);
 	pr_err("%-20s%08x\n", "int_vector:", control->int_vector);
 	pr_err("%-20s%08x\n", "int_state:", control->int_state);
-	pr_err("%-20s%08x\n", "exit_code:", control->exit_code);
+	pr_err("%-20s%016llx\n", "exit_code:", control->exit_code);
 	pr_err("%-20s%016llx\n", "exit_info1:", control->exit_info_1);
 	pr_err("%-20s%016llx\n", "exit_info2:", control->exit_info_2);
 	pr_err("%-20s%08x\n", "exit_int_info:", control->exit_int_info);
@@ -3549,7 +3548,6 @@ static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 	struct kvm_run *kvm_run = vcpu->run;
-	u32 exit_code = svm->vmcb->control.exit_code;
 
 	/* SEV-ES guests must use the CR write traps to track CR registers. */
 	if (!sev_es_guest(vcpu->kvm)) {
@@ -3585,7 +3583,7 @@ static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
 	if (exit_fastpath != EXIT_FASTPATH_NONE)
 		return 1;
 
-	return svm_invoke_exit_handler(vcpu, exit_code);
+	return svm_invoke_exit_handler(vcpu, svm->vmcb->control.exit_code);
 }
 
 static int pre_svm_run(struct kvm_vcpu *vcpu)
@@ -4670,7 +4668,6 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu,
 	if (static_cpu_has(X86_FEATURE_NRIPS))
 		vmcb->control.next_rip  = info->next_rip;
 	vmcb->control.exit_code = icpt_info.exit_code;
-	vmcb->control.exit_code_hi = 0;
 	vmexit = nested_svm_exit_handled(svm);
 
 	ret = (vmexit == NESTED_EXIT_DONE) ? X86EMUL_INTERCEPTED
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 3360ac36e071..a22433680c73 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -160,8 +160,7 @@ struct vmcb_ctrl_area_cached {
 	u32 int_ctl;
 	u32 int_vector;
 	u32 int_state;
-	u32 exit_code;
-	u32 exit_code_hi;
+	u64 exit_code;
 	u64 exit_info_1;
 	u64 exit_info_2;
 	u32 exit_int_info;
@@ -787,7 +786,6 @@ int nested_svm_vmexit(struct vcpu_svm *svm);
 static inline int nested_svm_simple_vmexit(struct vcpu_svm *svm, u32 exit_code)
 {
 	svm->vmcb->control.exit_code	= exit_code;
-	svm->vmcb->control.exit_code_hi	= 0;
 	svm->vmcb->control.exit_info_1	= 0;
 	svm->vmcb->control.exit_info_2	= 0;
 	return nested_svm_vmexit(svm);
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index e79bc9cb7162..e7fdbe9efc90 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -383,10 +383,10 @@ TRACE_EVENT(kvm_apic,
 #define kvm_print_exit_reason(exit_reason, isa)				\
 	(isa == KVM_ISA_VMX) ?						\
 	__print_symbolic(exit_reason & 0xffff, VMX_EXIT_REASONS) :	\
-	__print_symbolic(exit_reason, SVM_EXIT_REASONS),		\
+	__print_symbolic_u64(exit_reason, SVM_EXIT_REASONS),		\
 	(isa == KVM_ISA_VMX && exit_reason & ~0xffff) ? " " : "",	\
 	(isa == KVM_ISA_VMX) ?						\
-	__print_flags(exit_reason & ~0xffff, " ", VMX_EXIT_REASON_FLAGS) : ""
+	__print_flags_u64(exit_reason & ~0xffff, " ", VMX_EXIT_REASON_FLAGS) : ""
 
 #define TRACE_EVENT_KVM_EXIT(name)					     \
 TRACE_EVENT(name,							     \
@@ -781,7 +781,7 @@ TRACE_EVENT_KVM_EXIT(kvm_nested_vmexit);
  * Tracepoint for #VMEXIT reinjected to the guest
  */
 TRACE_EVENT(kvm_nested_vmexit_inject,
-	    TP_PROTO(__u32 exit_code,
+	    TP_PROTO(__u64 exit_code,
 		     __u64 exit_info1, __u64 exit_info2,
 		     __u32 exit_int_info, __u32 exit_int_info_err, __u32 isa),
 	    TP_ARGS(exit_code, exit_info1, exit_info2,
diff --git a/include/hyperv/hvgdk.h b/include/hyperv/hvgdk.h
index dd6d4939ea29..384c3f3ff4a5 100644
--- a/include/hyperv/hvgdk.h
+++ b/include/hyperv/hvgdk.h
@@ -281,7 +281,7 @@ struct hv_vmcb_enlightenments {
 #define HV_VMCB_NESTED_ENLIGHTENMENTS		31
 
 /* Synthetic VM-Exit */
-#define HV_SVM_EXITCODE_ENL			0xf0000000
+#define HV_SVM_EXITCODE_ENL			0xf0000000ull
 #define HV_SVM_ENL_EXITCODE_TRAP_AFTER_FLUSH	(1)
 
 /* VM_PARTITION_ASSIST_PAGE */
diff --git a/tools/testing/selftests/kvm/include/x86/svm.h b/tools/testing/selftests/kvm/include/x86/svm.h
index 29cffd0a9181..10b30b38bb3f 100644
--- a/tools/testing/selftests/kvm/include/x86/svm.h
+++ b/tools/testing/selftests/kvm/include/x86/svm.h
@@ -92,8 +92,7 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
 	u32 int_vector;
 	u32 int_state;
 	u8 reserved_3[4];
-	u32 exit_code;
-	u32 exit_code_hi;
+	u64 exit_code;
 	u64 exit_info_1;
 	u64 exit_info_2;
 	u32 exit_int_info;
diff --git a/tools/testing/selftests/kvm/x86/svm_nested_soft_inject_test.c b/tools/testing/selftests/kvm/x86/svm_nested_soft_inject_test.c
index 7b6481d6c0d3..4bd1655f9e6d 100644
--- a/tools/testing/selftests/kvm/x86/svm_nested_soft_inject_test.c
+++ b/tools/testing/selftests/kvm/x86/svm_nested_soft_inject_test.c
@@ -103,7 +103,7 @@ static void l1_guest_code(struct svm_test_data *svm, uint64_t is_nmi, uint64_t i
 
 	run_guest(vmcb, svm->vmcb_gpa);
 	__GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_VMMCALL,
-		       "Expected VMMCAL #VMEXIT, got '0x%x', info1 = '0x%lx, info2 = '0x%lx'",
+		       "Expected VMMCAL #VMEXIT, got '0x%lx', info1 = '0x%lx, info2 = '0x%lx'",
 		       vmcb->control.exit_code,
 		       vmcb->control.exit_info_1, vmcb->control.exit_info_2);
 
@@ -133,7 +133,7 @@ static void l1_guest_code(struct svm_test_data *svm, uint64_t is_nmi, uint64_t i
 
 	run_guest(vmcb, svm->vmcb_gpa);
 	__GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_HLT,
-		       "Expected HLT #VMEXIT, got '0x%x', info1 = '0x%lx, info2 = '0x%lx'",
+		       "Expected HLT #VMEXIT, got '0x%lx', info1 = '0x%lx, info2 = '0x%lx'",
 		       vmcb->control.exit_code,
 		       vmcb->control.exit_info_1, vmcb->control.exit_info_2);
 
-- 
cgit v1.2.3


From 69cb6ca52da095d300cf42666c903ae787a762dd Mon Sep 17 00:00:00 2001
From: Bobby Eshleman <bobbyeshleman@meta.com>
Date: Mon, 12 Jan 2026 19:56:31 -0800
Subject: tools/net/ynl: suppress jobserver warning in ynltool version
 detection

When building ynltool with parallel make (-jN), a warning is emitted:

  make[1]: warning: jobserver unavailable: using -j1.
  Add '+' to parent make rule.

The warning trips up local runs of NIPA's ingest_mdir.py, which
correctly fails on make warnings.

This occurs because SRC_VERSION uses $(shell make ...) to make
kernelversion. The $(shell) function inherits make's MAKEFLAGS env var
which specifies "--jobserver-auth=R,W" pointing to file descriptors that
the invoked make sub-shell does not have access to.

Observed with:

$ make --version | head -1
GNU Make 4.3

Instead of suppressing MAKEFLAGS and foregoing all future MAKEFLAGS
(some of which may be desirable, such as variable overrides) or
introducing a new make target, we instead just ignore the warning by
piping stderr to /dev/null. If 'make kernelversion' fails, the ' || echo
"unknown"' phrase will catch the failure.

Before:
	NIPA ingest_mdir.py:

	ynl
	 Full series FAIL   (1)
	   Generated files up to date; build has 1 warnings/errors; no diff in
	   generated;

After:
	NIPA ingest_mdir.py:

	Series level tests:
	 ynl                             OKAY

Validated output:
	$ ./ynltool/ynltool --version
	ynltool 6.19.0-rc4

Signed-off-by: Bobby Eshleman <bobbyeshleman@meta.com>
Link: https://patch.msgid.link/20260112-ynl-make-fix-v1-1-c399e76925ad@meta.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/net/ynl/ynltool/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/net/ynl/ynltool/Makefile b/tools/net/ynl/ynltool/Makefile
index f5b1de32daa5..48b0f32050f0 100644
--- a/tools/net/ynl/ynltool/Makefile
+++ b/tools/net/ynl/ynltool/Makefile
@@ -13,7 +13,7 @@ endif
 CFLAGS += -I../lib -I../generated -I../../../include/uapi/
 
 SRC_VERSION := \
-	$(shell make --no-print-directory -sC ../../../.. kernelversion || \
+	$(shell make --no-print-directory -sC ../../../.. kernelversion 2>/dev/null || \
 		echo "unknown")
 
 CFLAGS += -DSRC_VERSION='"$(SRC_VERSION)"'
-- 
cgit v1.2.3


From b324192e36ecea472077f9c9e32bcac8bbafeed6 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Mon, 12 Jan 2026 16:07:35 -0800
Subject: selftests: net: py: teach ksft_pr() multi-line safety

Make printing multi-line logs easier by automatically prefixing
each line in ksft_pr(). Make use of this when formatting exceptions.

Reviewed-by: Petr Machata <petrm@nvidia.com>
Link: https://patch.msgid.link/20260113000740.255360-2-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/lib/py/ksft.py | 29 +++++++++++++++++++----------
 1 file changed, 19 insertions(+), 10 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/lib/py/ksft.py b/tools/testing/selftests/net/lib/py/ksft.py
index 0a96f88bb60a..6cdfb8afccb5 100644
--- a/tools/testing/selftests/net/lib/py/ksft.py
+++ b/tools/testing/selftests/net/lib/py/ksft.py
@@ -32,8 +32,23 @@ class KsftTerminate(KeyboardInterrupt):
 
 
 def ksft_pr(*objs, **kwargs):
+    """
+    Print logs to stdout.
+
+    Behaves like print() but log lines will be prefixed
+    with # to prevent breaking the TAP output formatting.
+
+    Extra arguments (on top of what print() supports):
+      line_pfx - add extra string before each line
+    """
+    sep = kwargs.pop("sep", " ")
+    pfx = kwargs.pop("line_pfx", "")
+    pfx = "#" + (" " + pfx if pfx else "")
     kwargs["flush"] = True
-    print("#", *objs, **kwargs)
+
+    text = sep.join(str(obj) for obj in objs)
+    prefixed = f"\n{pfx} ".join(text.split('\n'))
+    print(pfx, prefixed, **kwargs)
 
 
 def _fail(*args):
@@ -170,9 +185,7 @@ def ksft_flush_defer():
             entry.exec_only()
         except Exception:
             ksft_pr(f"Exception while handling defer / cleanup (callback {i} of {qlen_start})!")
-            tb = traceback.format_exc()
-            for line in tb.strip().split('\n'):
-                ksft_pr("Defer Exception|", line)
+            ksft_pr(traceback.format_exc(), line_pfx="Defer Exception|")
             KSFT_RESULT = False
 
 
@@ -331,9 +344,7 @@ def ksft_run(cases=None, globs=None, case_pfx=None, args=()):
             cnt_key = 'xfail'
         except BaseException as e:
             stop |= isinstance(e, KeyboardInterrupt)
-            tb = traceback.format_exc()
-            for line in tb.strip().split('\n'):
-                ksft_pr("Exception|", line)
+            ksft_pr(traceback.format_exc(), line_pfx="Exception|")
             if stop:
                 ksft_pr(f"Stopping tests due to {type(e).__name__}.")
             KSFT_RESULT = False
@@ -343,9 +354,7 @@ def ksft_run(cases=None, globs=None, case_pfx=None, args=()):
         try:
             ksft_flush_defer()
         except BaseException as e:
-            tb = traceback.format_exc()
-            for line in tb.strip().split('\n'):
-                ksft_pr("Exception|", line)
+            ksft_pr(traceback.format_exc(), line_pfx="Exception|")
             if isinstance(e, KeyboardInterrupt):
                 ksft_pr()
                 ksft_pr("WARN: defer() interrupted, cleanup may be incomplete.")
-- 
cgit v1.2.3


From ce0f92dc737c65d705d83ea9529d8fb9a2194241 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Mon, 12 Jan 2026 16:07:36 -0800
Subject: selftests: net: py: teach cmd() how to print itself

Teach cmd() how to print itself, to make debug prints easier.
Example output (leading # due to ksft_pr()):

  # CMD: /root/ksft-net-drv/drivers/net/gro
  #   EXIT: 1
  #   STDOUT: ipv6 with ext header does coalesce:
  #   STDERR: Expected {200 }, Total 1 packets
  #           Received {100 [!=200]100 [!=0]}, Total 2 packets.

Reviewed-by: Petr Machata <petrm@nvidia.com>
Link: https://patch.msgid.link/20260113000740.255360-3-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/lib/py/utils.py | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/lib/py/utils.py b/tools/testing/selftests/net/lib/py/utils.py
index 824f039d384c..37243103aee3 100644
--- a/tools/testing/selftests/net/lib/py/utils.py
+++ b/tools/testing/selftests/net/lib/py/utils.py
@@ -41,7 +41,9 @@ class cmd:
         self.ret = None
         self.ksft_term_fd = None
 
+        self.host = host
         self.comm = comm
+
         if host:
             self.proc = host.cmd(comm)
         else:
@@ -99,6 +101,27 @@ class cmd:
             raise CmdExitFailure("Command failed: %s\nSTDOUT: %s\nSTDERR: %s" %
                                  (self.proc.args, stdout, stderr), self)
 
+    def __repr__(self):
+        def str_fmt(name, s):
+            name += ': '
+            return (name + s.strip().replace('\n', '\n' + ' ' * len(name)))
+
+        ret = "CMD"
+        if self.host:
+            ret += "[remote]"
+        if self.ret is None:
+            ret += f" (unterminated): {self.comm}\n"
+        elif self.ret == 0:
+            ret += f" (success): {self.comm}\n"
+        else:
+            ret += f": {self.comm}\n"
+            ret += f"  EXIT: {self.ret}\n"
+        if self.stdout:
+            ret += str_fmt("  STDOUT", self.stdout) + "\n"
+        if self.stderr:
+            ret += str_fmt("  STDERR", self.stderr) + "\n"
+        return ret.strip()
+
 
 class bkg(cmd):
     """
-- 
cgit v1.2.3


From d131da6d7282829c775cf70f5f1db1576e5c1273 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Mon, 12 Jan 2026 16:07:37 -0800
Subject: selftests: drv-net: gro: use cmd print

Now that cmd() can be printed directly remove the old formatting.

Before:

  # fragmented ip6 doesn't coalesce:
  # Expected {200 100 100 }, Total 3 packets
  # Received {200 100 }, Total 2 packets.
  # /root/ksft-net-drv/drivers/net/gro: incorrect number of packets

Now:

  # CMD: drivers/net/gro --ipv6 --dmac 9e:[...]
  #   EXIT: 1
  #   STDOUT: fragmented ip6 doesn't coalesce:
  #   STDERR: Expected {200 100 100 }, Total 3 packets
  #           Received {200 100 }, Total 2 packets.
  #           /root/ksft-net-drv/drivers/net/gro: incorrect number of packets

Reviewed-by: Petr Machata <petrm@nvidia.com>
Link: https://patch.msgid.link/20260113000740.255360-4-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/drivers/net/gro.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/drivers/net/gro.py b/tools/testing/selftests/drivers/net/gro.py
index ba83713bf7b5..4e0fb19d1527 100755
--- a/tools/testing/selftests/drivers/net/gro.py
+++ b/tools/testing/selftests/drivers/net/gro.py
@@ -142,8 +142,7 @@ def test(cfg, protocol, test_name):
         if rx_proc.ret == 0:
             return
 
-        ksft_pr(rx_proc.stdout.strip().replace('\n', '\n# '))
-        ksft_pr(rx_proc.stderr.strip().replace('\n', '\n# '))
+        ksft_pr(rx_proc)
 
         if test_name == "large" and os.environ.get("KSFT_MACHINE_SLOW"):
             ksft_pr(f"Ignoring {protocol}/{test_name} failure due to slow environment")
-- 
cgit v1.2.3


From 8171f6a76b2250d93a550566af6b915dc92edc75 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Mon, 12 Jan 2026 16:07:38 -0800
Subject: selftests: drv-net: gro: improve feature config

We'll need to do a lot more feature handling to test HW-GRO and LRO.
Clean up the feature handling for SW GRO a bit to let the next commit
focus on the new test cases, only.

Make sure HW GRO-like features are not enabled for the SW tests.
Be more careful about changing features as "nothing changed"
situations may result in non-zero error code from ethtool.

Don't disable TSO on the local interface (receiver) when running over
netdevsim, we just want GSO to break up the segments on the sender.

Reviewed-by: Willem de Bruijn <willemb@google.com>
Link: https://patch.msgid.link/20260113000740.255360-5-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/drivers/net/gro.py | 39 +++++++++++++++++++++++++++---
 1 file changed, 36 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/drivers/net/gro.py b/tools/testing/selftests/drivers/net/gro.py
index 4e0fb19d1527..3c30749ead39 100755
--- a/tools/testing/selftests/drivers/net/gro.py
+++ b/tools/testing/selftests/drivers/net/gro.py
@@ -20,7 +20,7 @@ Test cases:
 import os
 from lib.py import ksft_run, ksft_exit, ksft_pr
 from lib.py import NetDrvEpEnv, KsftXfailEx
-from lib.py import cmd, defer, bkg, ip
+from lib.py import bkg, cmd, defer, ethtool, ip
 from lib.py import ksft_variants
 
 
@@ -70,6 +70,27 @@ def _set_mtu_restore(dev, mtu, host):
         defer(ip, f"link set dev {dev['ifname']} mtu {dev['mtu']}", host=host)
 
 
+def _set_ethtool_feat(dev, current, feats, host=None):
+    s2n = {True: "on", False: "off"}
+
+    new = ["-K", dev]
+    old = ["-K", dev]
+    no_change = True
+    for name, state in feats.items():
+        new += [name, s2n[state]]
+        old += [name, s2n[current[name]["active"]]]
+
+        if current[name]["active"] != state:
+            no_change = False
+            if current[name]["fixed"]:
+                raise KsftXfailEx(f"Device does not support {name}")
+    if no_change:
+        return
+
+    ethtool(" ".join(new), host=host)
+    defer(ethtool, " ".join(old), host=host)
+
+
 def _setup(cfg, test_name):
     """ Setup hardware loopback mode for GRO testing. """
 
@@ -77,6 +98,11 @@ def _setup(cfg, test_name):
         cfg.bin_local = cfg.test_dir / "gro"
         cfg.bin_remote = cfg.remote.deploy(cfg.bin_local)
 
+    if not hasattr(cfg, "feat"):
+        cfg.feat = ethtool(f"-k {cfg.ifname}", json=True)[0]
+        cfg.remote_feat = ethtool(f"-k {cfg.remote_ifname}",
+                                  host=cfg.remote, json=True)[0]
+
     # "large" test needs at least 4k MTU
     if test_name == "large":
         _set_mtu_restore(cfg.dev, 4096, None)
@@ -88,15 +114,22 @@ def _setup(cfg, test_name):
     _write_defer_restore(cfg, flush_path, "200000", defer_undo=True)
     _write_defer_restore(cfg, irq_path, "10", defer_undo=True)
 
+    _set_ethtool_feat(cfg.ifname, cfg.feat,
+                      {"generic-receive-offload": True,
+                       "rx-gro-hw": False,
+                       "large-receive-offload": False})
+
     try:
         # Disable TSO for local tests
         cfg.require_nsim()  # will raise KsftXfailEx if not running on nsim
 
-        cmd(f"ethtool -K {cfg.ifname} gro on tso off")
-        cmd(f"ethtool -K {cfg.remote_ifname} gro on tso off", host=cfg.remote)
+        _set_ethtool_feat(cfg.remote_ifname, cfg.remote_feat,
+                          {"tcp-segmentation-offload": False},
+                          host=cfg.remote)
     except KsftXfailEx:
         pass
 
+
 def _gro_variants():
     """Generator that yields all combinations of protocol and test types."""
 
-- 
cgit v1.2.3


From d3b35898de024796c43415f9535fd0bc69cb8f1b Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Mon, 12 Jan 2026 16:07:39 -0800
Subject: selftests: drv-net: gro: run the test against HW GRO and LRO

Run the test against HW GRO and LRO. NICs I have pass the base cases.
Interestingly all are happy to build GROs larger than 64k.

Reviewed-by: Willem de Bruijn <willemb@google.com>
Link: https://patch.msgid.link/20260113000740.255360-6-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/drivers/net/gro.py        | 72 +++++++++++++++++------
 tools/testing/selftests/drivers/net/lib/py/env.py |  7 ++-
 2 files changed, 60 insertions(+), 19 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/drivers/net/gro.py b/tools/testing/selftests/drivers/net/gro.py
index 3c30749ead39..1ab85590c439 100755
--- a/tools/testing/selftests/drivers/net/gro.py
+++ b/tools/testing/selftests/drivers/net/gro.py
@@ -87,11 +87,15 @@ def _set_ethtool_feat(dev, current, feats, host=None):
     if no_change:
         return
 
-    ethtool(" ".join(new), host=host)
+    eth_cmd = ethtool(" ".join(new), host=host)
     defer(ethtool, " ".join(old), host=host)
 
+    # If ethtool printed something kernel must have modified some features
+    if eth_cmd.stdout:
+        ksft_pr(eth_cmd)
 
-def _setup(cfg, test_name):
+
+def _setup(cfg, mode, test_name):
     """ Setup hardware loopback mode for GRO testing. """
 
     if not hasattr(cfg, "bin_remote"):
@@ -108,16 +112,49 @@ def _setup(cfg, test_name):
         _set_mtu_restore(cfg.dev, 4096, None)
         _set_mtu_restore(cfg.remote_dev, 4096, cfg.remote)
 
-    flush_path = f"/sys/class/net/{cfg.ifname}/gro_flush_timeout"
-    irq_path = f"/sys/class/net/{cfg.ifname}/napi_defer_hard_irqs"
-
-    _write_defer_restore(cfg, flush_path, "200000", defer_undo=True)
-    _write_defer_restore(cfg, irq_path, "10", defer_undo=True)
-
-    _set_ethtool_feat(cfg.ifname, cfg.feat,
-                      {"generic-receive-offload": True,
-                       "rx-gro-hw": False,
-                       "large-receive-offload": False})
+    if mode == "sw":
+        flush_path = f"/sys/class/net/{cfg.ifname}/gro_flush_timeout"
+        irq_path = f"/sys/class/net/{cfg.ifname}/napi_defer_hard_irqs"
+
+        _write_defer_restore(cfg, flush_path, "200000", defer_undo=True)
+        _write_defer_restore(cfg, irq_path, "10", defer_undo=True)
+
+        _set_ethtool_feat(cfg.ifname, cfg.feat,
+                          {"generic-receive-offload": True,
+                           "rx-gro-hw": False,
+                           "large-receive-offload": False})
+    elif mode == "hw":
+        _set_ethtool_feat(cfg.ifname, cfg.feat,
+                          {"generic-receive-offload": False,
+                           "rx-gro-hw": True,
+                           "large-receive-offload": False})
+
+        # Some NICs treat HW GRO as a GRO sub-feature so disabling GRO
+        # will also clear HW GRO. Use a hack of installing XDP generic
+        # to skip SW GRO, even when enabled.
+        feat = ethtool(f"-k {cfg.ifname}", json=True)[0]
+        if not feat["rx-gro-hw"]["active"]:
+            ksft_pr("Driver clears HW GRO and SW GRO is cleared, using generic XDP workaround")
+            prog = cfg.net_lib_dir / "xdp_dummy.bpf.o"
+            ip(f"link set dev {cfg.ifname} xdpgeneric obj {prog} sec xdp")
+            defer(ip, f"link set dev {cfg.ifname} xdpgeneric off")
+
+            # Attaching XDP may change features, fetch the latest state
+            feat = ethtool(f"-k {cfg.ifname}", json=True)[0]
+
+            _set_ethtool_feat(cfg.ifname, feat,
+                              {"generic-receive-offload": True,
+                               "rx-gro-hw": True,
+                               "large-receive-offload": False})
+    elif mode == "lro":
+        # netdevsim advertises LRO for feature inheritance testing with
+        # bonding/team tests but it doesn't actually perform the offload
+        cfg.require_nsim(nsim_test=False)
+
+        _set_ethtool_feat(cfg.ifname, cfg.feat,
+                          {"generic-receive-offload": False,
+                           "rx-gro-hw": False,
+                           "large-receive-offload": True})
 
     try:
         # Disable TSO for local tests
@@ -133,19 +170,20 @@ def _setup(cfg, test_name):
 def _gro_variants():
     """Generator that yields all combinations of protocol and test types."""
 
-    for protocol in ["ipv4", "ipv6", "ipip"]:
-        for test_name in ["data", "ack", "flags", "tcp", "ip", "large"]:
-            yield protocol, test_name
+    for mode in ["sw", "hw", "lro"]:
+        for protocol in ["ipv4", "ipv6", "ipip"]:
+            for test_name in ["data", "ack", "flags", "tcp", "ip", "large"]:
+                yield mode, protocol, test_name
 
 
 @ksft_variants(_gro_variants())
-def test(cfg, protocol, test_name):
+def test(cfg, mode, protocol, test_name):
     """Run a single GRO test with retries."""
 
     ipver = "6" if protocol[-1] == "6" else "4"
     cfg.require_ipver(ipver)
 
-    _setup(cfg, test_name)
+    _setup(cfg, mode, test_name)
 
     base_cmd_args = [
         f"--{protocol}",
diff --git a/tools/testing/selftests/drivers/net/lib/py/env.py b/tools/testing/selftests/drivers/net/lib/py/env.py
index 63495376e654..41cc248ac848 100644
--- a/tools/testing/selftests/drivers/net/lib/py/env.py
+++ b/tools/testing/selftests/drivers/net/lib/py/env.py
@@ -248,9 +248,12 @@ class NetDrvEpEnv(NetDrvEnvBase):
         if not self.addr_v[ipver] or not self.remote_addr_v[ipver]:
             raise KsftSkipEx(f"Test requires IPv{ipver} connectivity")
 
-    def require_nsim(self):
-        if self._ns is None:
+    def require_nsim(self, nsim_test=True):
+        """Require or exclude netdevsim for this test"""
+        if nsim_test and self._ns is None:
             raise KsftXfailEx("Test only works on netdevsim")
+        if nsim_test is False and self._ns is not None:
+            raise KsftXfailEx("Test does not work on netdevsim")
 
     def _require_cmd(self, comm, key, host=None):
         cached = self._required_cmd.get(comm, {})
-- 
cgit v1.2.3


From fe074aaa5329246706c652ccba6602eecbed80a8 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Mon, 12 Jan 2026 16:07:40 -0800
Subject: selftests: drv-net: gro: break out all individual test cases

GRO test groups the cases into categories, e.g. "tcp" case
checks coalescing in presence of:
 - packets with bad csum,
 - sequence number mismatch,
 - timestamp option value mismatch,
 - different TCP options.

Since we now have TAP support grouping the cases like that
lowers our reporting granularity. This matters even more for
NICs performing HW GRO and LRO since it appears that most
implementation have _some_ bugs. Flagging the whole group
of tests as failed prevents us from catching regressions
in the things that work today.

Reviewed-by: Willem de Bruijn <willemb@google.com>
Link: https://patch.msgid.link/20260113000740.255360-7-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/drivers/net/gro.c  | 441 +++++++++++++++++------------
 tools/testing/selftests/drivers/net/gro.py |  65 ++++-
 2 files changed, 312 insertions(+), 194 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/drivers/net/gro.c b/tools/testing/selftests/drivers/net/gro.c
index 751a8103f408..e76c618704cf 100644
--- a/tools/testing/selftests/drivers/net/gro.c
+++ b/tools/testing/selftests/drivers/net/gro.c
@@ -3,26 +3,45 @@
  * This testsuite provides conformance testing for GRO coalescing.
  *
  * Test cases:
- * 1.data
+ *
+ * data_*:
  *  Data packets of the same size and same header setup with correct
  *  sequence numbers coalesce. The one exception being the last data
  *  packet coalesced: it can be smaller than the rest and coalesced
  *  as long as it is in the same flow.
- * 2.ack
+ *   - data_same:    same size packets coalesce
+ *   - data_lrg_sml: large then small coalesces
+ *   - data_sml_lrg: small then large doesn't coalesce
+ *
+ * ack:
  *  Pure ACK does not coalesce.
- * 3.flags
- *  Specific test cases: no packets with PSH, SYN, URG, RST set will
- *  be coalesced.
- * 4.tcp
+ *
+ * flags_*:
+ *  No packets with PSH, SYN, URG, RST set will be coalesced.
+ *   - flags_psh, flags_syn, flags_rst, flags_urg
+ *
+ * tcp_*:
  *  Packets with incorrect checksum, non-consecutive seqno and
  *  different TCP header options shouldn't coalesce. Nit: given that
  *  some extension headers have paddings, such as timestamp, headers
- *  that are padding differently would not be coalesced.
- * 5.ip:
- *  Packets with different (ECN, TTL, TOS) header, ip options or
- *  ip fragments (ipv6) shouldn't coalesce.
- * 6.large:
+ *  that are padded differently would not be coalesced.
+ *   - tcp_csum: incorrect checksum
+ *   - tcp_seq:  non-consecutive sequence numbers
+ *   - tcp_ts:   different timestamps
+ *   - tcp_opt:  different TCP options
+ *
+ * ip_*:
+ *  Packets with different (ECN, TTL, TOS) header, IP options or
+ *  IP fragments shouldn't coalesce.
+ *   - ip_ecn, ip_tos:            shared between IPv4/IPv6
+ *   - ip_ttl, ip_opt, ip_frag4:  IPv4 only
+ *   - ip_id_df*:                 IPv4 IP ID field coalescing tests
+ *   - ip_frag6, ip_v6ext_*:      IPv6 only
+ *
+ * large_*:
  *  Packets larger than GRO_MAX_SIZE packets shouldn't coalesce.
+ *   - large_max: exceeding max size
+ *   - large_rem: remainder handling
  *
  * MSS is defined as 4096 - header because if it is too small
  * (i.e. 1500 MTU - header), it will result in many packets,
@@ -79,6 +98,15 @@
 #define ipv6_optlen(p)  (((p)->hdrlen+1) << 3) /* calculate IPv6 extension header len */
 #define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)]))
 
+enum flush_id_case {
+	FLUSH_ID_DF1_INC,
+	FLUSH_ID_DF1_FIXED,
+	FLUSH_ID_DF0_INC,
+	FLUSH_ID_DF0_FIXED,
+	FLUSH_ID_DF1_INC_FIXED,
+	FLUSH_ID_DF1_FIXED_INC,
+};
+
 static const char *addr6_src = "fdaa::2";
 static const char *addr6_dst = "fdaa::1";
 static const char *addr4_src = "192.168.1.200";
@@ -95,7 +123,6 @@ static int tcp_offset = -1;
 static int total_hdr_len = -1;
 static int ethhdr_proto = -1;
 static bool ipip;
-static const int num_flush_id_cases = 6;
 
 static void vlog(const char *fmt, ...)
 {
@@ -127,19 +154,19 @@ static void setup_sock_filter(int fd)
 	/* Overridden later if exthdrs are used: */
 	opt_ipproto_off = ipproto_off;
 
-	if (strcmp(testname, "ip") == 0) {
-		if (proto == PF_INET)
-			optlen = sizeof(struct ip_timestamp);
-		else {
-			BUILD_BUG_ON(sizeof(struct ip6_hbh) > MIN_EXTHDR_SIZE);
-			BUILD_BUG_ON(sizeof(struct ip6_dest) > MIN_EXTHDR_SIZE);
-			BUILD_BUG_ON(sizeof(struct ip6_frag) > MIN_EXTHDR_SIZE);
-
-			/* same size for HBH and Fragment extension header types */
-			optlen = MIN_EXTHDR_SIZE;
-			opt_ipproto_off = ETH_HLEN + sizeof(struct ipv6hdr)
-				+ offsetof(struct ip6_ext, ip6e_nxt);
-		}
+	if (strcmp(testname, "ip_opt") == 0) {
+		optlen = sizeof(struct ip_timestamp);
+	} else if (strcmp(testname, "ip_frag6") == 0 ||
+		   strcmp(testname, "ip_v6ext_same") == 0 ||
+		   strcmp(testname, "ip_v6ext_diff") == 0) {
+		BUILD_BUG_ON(sizeof(struct ip6_hbh) > MIN_EXTHDR_SIZE);
+		BUILD_BUG_ON(sizeof(struct ip6_dest) > MIN_EXTHDR_SIZE);
+		BUILD_BUG_ON(sizeof(struct ip6_frag) > MIN_EXTHDR_SIZE);
+
+		/* same size for HBH and Fragment extension header types */
+		optlen = MIN_EXTHDR_SIZE;
+		opt_ipproto_off = ETH_HLEN + sizeof(struct ipv6hdr)
+			+ offsetof(struct ip6_ext, ip6e_nxt);
 	}
 
 	/* this filter validates the following:
@@ -648,7 +675,8 @@ static void fix_ip4_checksum(struct iphdr *iph)
 	iph->check = checksum_fold(iph, sizeof(struct iphdr), 0);
 }
 
-static void send_flush_id_case(int fd, struct sockaddr_ll *daddr, int tcase)
+static void send_flush_id_case(int fd, struct sockaddr_ll *daddr,
+			       enum flush_id_case tcase)
 {
 	static char buf1[MAX_HDR_LEN + PAYLOAD_LEN];
 	static char buf2[MAX_HDR_LEN + PAYLOAD_LEN];
@@ -667,7 +695,7 @@ static void send_flush_id_case(int fd, struct sockaddr_ll *daddr, int tcase)
 	create_packet(buf3, PAYLOAD_LEN * 2, 0, PAYLOAD_LEN, 0);
 
 	switch (tcase) {
-	case 0: /* DF=1, Incrementing - should coalesce */
+	case FLUSH_ID_DF1_INC: /* DF=1, Incrementing - should coalesce */
 		iph1->frag_off |= htons(IP_DF);
 		iph1->id = htons(8);
 
@@ -675,7 +703,7 @@ static void send_flush_id_case(int fd, struct sockaddr_ll *daddr, int tcase)
 		iph2->id = htons(9);
 		break;
 
-	case 1: /* DF=1, Fixed - should coalesce */
+	case FLUSH_ID_DF1_FIXED: /* DF=1, Fixed - should coalesce */
 		iph1->frag_off |= htons(IP_DF);
 		iph1->id = htons(8);
 
@@ -683,7 +711,7 @@ static void send_flush_id_case(int fd, struct sockaddr_ll *daddr, int tcase)
 		iph2->id = htons(8);
 		break;
 
-	case 2: /* DF=0, Incrementing - should coalesce */
+	case FLUSH_ID_DF0_INC: /* DF=0, Incrementing - should coalesce */
 		iph1->frag_off &= ~htons(IP_DF);
 		iph1->id = htons(8);
 
@@ -691,7 +719,7 @@ static void send_flush_id_case(int fd, struct sockaddr_ll *daddr, int tcase)
 		iph2->id = htons(9);
 		break;
 
-	case 3: /* DF=0, Fixed - should coalesce */
+	case FLUSH_ID_DF0_FIXED: /* DF=0, Fixed - should coalesce */
 		iph1->frag_off &= ~htons(IP_DF);
 		iph1->id = htons(8);
 
@@ -699,9 +727,10 @@ static void send_flush_id_case(int fd, struct sockaddr_ll *daddr, int tcase)
 		iph2->id = htons(8);
 		break;
 
-	case 4: /* DF=1, two packets incrementing, and one fixed - should
-		 * coalesce only the first two packets
-		 */
+	case FLUSH_ID_DF1_INC_FIXED: /* DF=1, two packets incrementing, and
+				      * one fixed - should coalesce only the
+				      * first two packets
+				      */
 		iph1->frag_off |= htons(IP_DF);
 		iph1->id = htons(8);
 
@@ -713,9 +742,10 @@ static void send_flush_id_case(int fd, struct sockaddr_ll *daddr, int tcase)
 		send_three = true;
 		break;
 
-	case 5: /* DF=1, two packets fixed, and one incrementing - should
-		 * coalesce only the first two packets
-		 */
+	case FLUSH_ID_DF1_FIXED_INC: /* DF=1, two packets fixed, and one
+				      * incrementing - should coalesce only
+				      * the first two packets
+				      */
 		iph1->frag_off |= htons(IP_DF);
 		iph1->id = htons(8);
 
@@ -739,16 +769,6 @@ static void send_flush_id_case(int fd, struct sockaddr_ll *daddr, int tcase)
 	}
 }
 
-static void test_flush_id(int fd, struct sockaddr_ll *daddr, char *fin_pkt)
-{
-	for (int i = 0; i < num_flush_id_cases; i++) {
-		sleep(1);
-		send_flush_id_case(fd, daddr, i);
-		sleep(1);
-		write_packet(fd, fin_pkt, total_hdr_len, daddr);
-	}
-}
-
 static void send_ipv6_exthdr(int fd, struct sockaddr_ll *daddr, char *ext_data1, char *ext_data2)
 {
 	static char buf[MAX_HDR_LEN + PAYLOAD_LEN];
@@ -1030,108 +1050,128 @@ static void gro_sender(void)
 	daddr.sll_halen = ETH_ALEN;
 	create_packet(fin_pkt, PAYLOAD_LEN * 2, 0, 0, 1);
 
-	if (strcmp(testname, "data") == 0) {
+	/* data sub-tests */
+	if (strcmp(testname, "data_same") == 0) {
 		send_data_pkts(txfd, &daddr, PAYLOAD_LEN, PAYLOAD_LEN);
 		write_packet(txfd, fin_pkt, total_hdr_len, &daddr);
-
+	} else if (strcmp(testname, "data_lrg_sml") == 0) {
 		send_data_pkts(txfd, &daddr, PAYLOAD_LEN, PAYLOAD_LEN / 2);
 		write_packet(txfd, fin_pkt, total_hdr_len, &daddr);
-
+	} else if (strcmp(testname, "data_sml_lrg") == 0) {
 		send_data_pkts(txfd, &daddr, PAYLOAD_LEN / 2, PAYLOAD_LEN);
 		write_packet(txfd, fin_pkt, total_hdr_len, &daddr);
+
+	/* ack test */
 	} else if (strcmp(testname, "ack") == 0) {
 		send_ack(txfd, &daddr);
 		write_packet(txfd, fin_pkt, total_hdr_len, &daddr);
-	} else if (strcmp(testname, "flags") == 0) {
+
+	/* flags sub-tests */
+	} else if (strcmp(testname, "flags_psh") == 0) {
 		send_flags(txfd, &daddr, 1, 0, 0, 0);
 		write_packet(txfd, fin_pkt, total_hdr_len, &daddr);
-
+	} else if (strcmp(testname, "flags_syn") == 0) {
 		send_flags(txfd, &daddr, 0, 1, 0, 0);
 		write_packet(txfd, fin_pkt, total_hdr_len, &daddr);
-
+	} else if (strcmp(testname, "flags_rst") == 0) {
 		send_flags(txfd, &daddr, 0, 0, 1, 0);
 		write_packet(txfd, fin_pkt, total_hdr_len, &daddr);
-
+	} else if (strcmp(testname, "flags_urg") == 0) {
 		send_flags(txfd, &daddr, 0, 0, 0, 1);
 		write_packet(txfd, fin_pkt, total_hdr_len, &daddr);
-	} else if (strcmp(testname, "tcp") == 0) {
+
+	/* tcp sub-tests */
+	} else if (strcmp(testname, "tcp_csum") == 0) {
 		send_changed_checksum(txfd, &daddr);
-		/* Adding sleep before sending FIN so that it is not
-		 * received prior to other packets.
-		 */
 		usleep(fin_delay_us);
 		write_packet(txfd, fin_pkt, total_hdr_len, &daddr);
-
+	} else if (strcmp(testname, "tcp_seq") == 0) {
 		send_changed_seq(txfd, &daddr);
 		usleep(fin_delay_us);
 		write_packet(txfd, fin_pkt, total_hdr_len, &daddr);
-
+	} else if (strcmp(testname, "tcp_ts") == 0) {
 		send_changed_ts(txfd, &daddr);
 		usleep(fin_delay_us);
 		write_packet(txfd, fin_pkt, total_hdr_len, &daddr);
-
+	} else if (strcmp(testname, "tcp_opt") == 0) {
 		send_diff_opt(txfd, &daddr);
 		usleep(fin_delay_us);
 		write_packet(txfd, fin_pkt, total_hdr_len, &daddr);
-	} else if (strcmp(testname, "ip") == 0) {
+
+	/* ip sub-tests - shared between IPv4 and IPv6 */
+	} else if (strcmp(testname, "ip_ecn") == 0) {
 		send_changed_ECN(txfd, &daddr);
 		write_packet(txfd, fin_pkt, total_hdr_len, &daddr);
-
+	} else if (strcmp(testname, "ip_tos") == 0) {
 		send_changed_tos(txfd, &daddr);
 		write_packet(txfd, fin_pkt, total_hdr_len, &daddr);
-		if (proto == PF_INET) {
-			/* Modified packets may be received out of order.
-			 * Sleep function added to enforce test boundaries
-			 * so that fin pkts are not received prior to other pkts.
-			 */
-			sleep(1);
-			send_changed_ttl(txfd, &daddr);
-			write_packet(txfd, fin_pkt, total_hdr_len, &daddr);
-
-			sleep(1);
-			send_ip_options(txfd, &daddr);
-			sleep(1);
-			write_packet(txfd, fin_pkt, total_hdr_len, &daddr);
-
-			sleep(1);
-			send_fragment4(txfd, &daddr);
-			sleep(1);
-			write_packet(txfd, fin_pkt, total_hdr_len, &daddr);
-
-			test_flush_id(txfd, &daddr, fin_pkt);
-		} else if (proto == PF_INET6) {
-			sleep(1);
-			send_fragment6(txfd, &daddr);
-			sleep(1);
-			write_packet(txfd, fin_pkt, total_hdr_len, &daddr);
-
-			sleep(1);
-			/* send IPv6 packets with ext header with same payload */
-			send_ipv6_exthdr(txfd, &daddr, EXT_PAYLOAD_1, EXT_PAYLOAD_1);
-			sleep(1);
-			write_packet(txfd, fin_pkt, total_hdr_len, &daddr);
-
-			sleep(1);
-			/* send IPv6 packets with ext header with different payload */
-			send_ipv6_exthdr(txfd, &daddr, EXT_PAYLOAD_1, EXT_PAYLOAD_2);
-			sleep(1);
-			write_packet(txfd, fin_pkt, total_hdr_len, &daddr);
-		}
-	} else if (strcmp(testname, "large") == 0) {
-		/* 20 is the difference between min iphdr size
-		 * and min ipv6hdr size. Like MAX_HDR_SIZE,
-		 * MAX_PAYLOAD is defined with the larger header of the two.
-		 */
+
+	/* ip sub-tests - IPv4 only */
+	} else if (strcmp(testname, "ip_ttl") == 0) {
+		send_changed_ttl(txfd, &daddr);
+		write_packet(txfd, fin_pkt, total_hdr_len, &daddr);
+	} else if (strcmp(testname, "ip_opt") == 0) {
+		send_ip_options(txfd, &daddr);
+		usleep(fin_delay_us);
+		write_packet(txfd, fin_pkt, total_hdr_len, &daddr);
+	} else if (strcmp(testname, "ip_frag4") == 0) {
+		send_fragment4(txfd, &daddr);
+		usleep(fin_delay_us);
+		write_packet(txfd, fin_pkt, total_hdr_len, &daddr);
+	} else if (strcmp(testname, "ip_id_df1_inc") == 0) {
+		send_flush_id_case(txfd, &daddr, FLUSH_ID_DF1_INC);
+		usleep(fin_delay_us);
+		write_packet(txfd, fin_pkt, total_hdr_len, &daddr);
+	} else if (strcmp(testname, "ip_id_df1_fixed") == 0) {
+		send_flush_id_case(txfd, &daddr, FLUSH_ID_DF1_FIXED);
+		usleep(fin_delay_us);
+		write_packet(txfd, fin_pkt, total_hdr_len, &daddr);
+	} else if (strcmp(testname, "ip_id_df0_inc") == 0) {
+		send_flush_id_case(txfd, &daddr, FLUSH_ID_DF0_INC);
+		usleep(fin_delay_us);
+		write_packet(txfd, fin_pkt, total_hdr_len, &daddr);
+	} else if (strcmp(testname, "ip_id_df0_fixed") == 0) {
+		send_flush_id_case(txfd, &daddr, FLUSH_ID_DF0_FIXED);
+		usleep(fin_delay_us);
+		write_packet(txfd, fin_pkt, total_hdr_len, &daddr);
+	} else if (strcmp(testname, "ip_id_df1_inc_fixed") == 0) {
+		send_flush_id_case(txfd, &daddr, FLUSH_ID_DF1_INC_FIXED);
+		usleep(fin_delay_us);
+		write_packet(txfd, fin_pkt, total_hdr_len, &daddr);
+	} else if (strcmp(testname, "ip_id_df1_fixed_inc") == 0) {
+		send_flush_id_case(txfd, &daddr, FLUSH_ID_DF1_FIXED_INC);
+		usleep(fin_delay_us);
+		write_packet(txfd, fin_pkt, total_hdr_len, &daddr);
+
+	/* ip sub-tests - IPv6 only */
+	} else if (strcmp(testname, "ip_frag6") == 0) {
+		send_fragment6(txfd, &daddr);
+		usleep(fin_delay_us);
+		write_packet(txfd, fin_pkt, total_hdr_len, &daddr);
+	} else if (strcmp(testname, "ip_v6ext_same") == 0) {
+		send_ipv6_exthdr(txfd, &daddr, EXT_PAYLOAD_1, EXT_PAYLOAD_1);
+		usleep(fin_delay_us);
+		write_packet(txfd, fin_pkt, total_hdr_len, &daddr);
+	} else if (strcmp(testname, "ip_v6ext_diff") == 0) {
+		send_ipv6_exthdr(txfd, &daddr, EXT_PAYLOAD_1, EXT_PAYLOAD_2);
+		usleep(fin_delay_us);
+		write_packet(txfd, fin_pkt, total_hdr_len, &daddr);
+
+	/* large sub-tests */
+	} else if (strcmp(testname, "large_max") == 0) {
 		int offset = (proto == PF_INET && !ipip) ? 20 : 0;
 		int remainder = (MAX_PAYLOAD + offset) % MSS;
 
 		send_large(txfd, &daddr, remainder);
 		write_packet(txfd, fin_pkt, total_hdr_len, &daddr);
+	} else if (strcmp(testname, "large_rem") == 0) {
+		int offset = (proto == PF_INET && !ipip) ? 20 : 0;
+		int remainder = (MAX_PAYLOAD + offset) % MSS;
 
 		send_large(txfd, &daddr, remainder + 1);
 		write_packet(txfd, fin_pkt, total_hdr_len, &daddr);
 	} else {
-		error(1, 0, "Unknown testcase");
+		error(1, 0, "Unknown testcase: %s", testname);
 	}
 
 	if (close(txfd))
@@ -1155,126 +1195,153 @@ static void gro_receiver(void)
 
 	memset(correct_payload, 0, sizeof(correct_payload));
 
-	if (strcmp(testname, "data") == 0) {
+	/* data sub-tests */
+	if (strcmp(testname, "data_same") == 0) {
 		printf("pure data packet of same size: ");
 		correct_payload[0] = PAYLOAD_LEN * 2;
 		check_recv_pkts(rxfd, correct_payload, 1);
-
+	} else if (strcmp(testname, "data_lrg_sml") == 0) {
 		printf("large data packets followed by a smaller one: ");
 		correct_payload[0] = PAYLOAD_LEN * 1.5;
 		check_recv_pkts(rxfd, correct_payload, 1);
-
+	} else if (strcmp(testname, "data_sml_lrg") == 0) {
 		printf("small data packets followed by a larger one: ");
 		correct_payload[0] = PAYLOAD_LEN / 2;
 		correct_payload[1] = PAYLOAD_LEN;
 		check_recv_pkts(rxfd, correct_payload, 2);
+
+	/* ack test */
 	} else if (strcmp(testname, "ack") == 0) {
 		printf("duplicate ack and pure ack: ");
 		check_recv_pkts(rxfd, correct_payload, 3);
-	} else if (strcmp(testname, "flags") == 0) {
+
+	/* flags sub-tests */
+	} else if (strcmp(testname, "flags_psh") == 0) {
 		correct_payload[0] = PAYLOAD_LEN * 3;
 		correct_payload[1] = PAYLOAD_LEN * 2;
-
 		printf("psh flag ends coalescing: ");
 		check_recv_pkts(rxfd, correct_payload, 2);
-
+	} else if (strcmp(testname, "flags_syn") == 0) {
 		correct_payload[0] = PAYLOAD_LEN * 2;
 		correct_payload[1] = 0;
 		correct_payload[2] = PAYLOAD_LEN * 2;
 		printf("syn flag ends coalescing: ");
 		check_recv_pkts(rxfd, correct_payload, 3);
-
+	} else if (strcmp(testname, "flags_rst") == 0) {
+		correct_payload[0] = PAYLOAD_LEN * 2;
+		correct_payload[1] = 0;
+		correct_payload[2] = PAYLOAD_LEN * 2;
 		printf("rst flag ends coalescing: ");
 		check_recv_pkts(rxfd, correct_payload, 3);
-
+	} else if (strcmp(testname, "flags_urg") == 0) {
+		correct_payload[0] = PAYLOAD_LEN * 2;
+		correct_payload[1] = 0;
+		correct_payload[2] = PAYLOAD_LEN * 2;
 		printf("urg flag ends coalescing: ");
 		check_recv_pkts(rxfd, correct_payload, 3);
-	} else if (strcmp(testname, "tcp") == 0) {
+
+	/* tcp sub-tests */
+	} else if (strcmp(testname, "tcp_csum") == 0) {
 		correct_payload[0] = PAYLOAD_LEN;
 		correct_payload[1] = PAYLOAD_LEN;
-		correct_payload[2] = PAYLOAD_LEN;
-		correct_payload[3] = PAYLOAD_LEN;
-
 		printf("changed checksum does not coalesce: ");
 		check_recv_pkts(rxfd, correct_payload, 2);
-
+	} else if (strcmp(testname, "tcp_seq") == 0) {
+		correct_payload[0] = PAYLOAD_LEN;
+		correct_payload[1] = PAYLOAD_LEN;
 		printf("Wrong Seq number doesn't coalesce: ");
 		check_recv_pkts(rxfd, correct_payload, 2);
-
-		printf("Different timestamp doesn't coalesce: ");
+	} else if (strcmp(testname, "tcp_ts") == 0) {
 		correct_payload[0] = PAYLOAD_LEN * 2;
+		correct_payload[1] = PAYLOAD_LEN;
+		correct_payload[2] = PAYLOAD_LEN;
+		correct_payload[3] = PAYLOAD_LEN;
+		printf("Different timestamp doesn't coalesce: ");
 		check_recv_pkts(rxfd, correct_payload, 4);
-
-		printf("Different options doesn't coalesce: ");
+	} else if (strcmp(testname, "tcp_opt") == 0) {
 		correct_payload[0] = PAYLOAD_LEN * 2;
+		correct_payload[1] = PAYLOAD_LEN;
+		printf("Different options doesn't coalesce: ");
 		check_recv_pkts(rxfd, correct_payload, 2);
-	} else if (strcmp(testname, "ip") == 0) {
+
+	/* ip sub-tests - shared between IPv4 and IPv6 */
+	} else if (strcmp(testname, "ip_ecn") == 0) {
 		correct_payload[0] = PAYLOAD_LEN;
 		correct_payload[1] = PAYLOAD_LEN;
-
 		printf("different ECN doesn't coalesce: ");
 		check_recv_pkts(rxfd, correct_payload, 2);
-
+	} else if (strcmp(testname, "ip_tos") == 0) {
+		correct_payload[0] = PAYLOAD_LEN;
+		correct_payload[1] = PAYLOAD_LEN;
 		printf("different tos doesn't coalesce: ");
 		check_recv_pkts(rxfd, correct_payload, 2);
 
-		if (proto == PF_INET) {
-			printf("different ttl doesn't coalesce: ");
-			check_recv_pkts(rxfd, correct_payload, 2);
-
-			printf("ip options doesn't coalesce: ");
-			correct_payload[2] = PAYLOAD_LEN;
-			check_recv_pkts(rxfd, correct_payload, 3);
-
-			printf("fragmented ip4 doesn't coalesce: ");
-			check_recv_pkts(rxfd, correct_payload, 2);
-
-			/* is_atomic checks */
-			printf("DF=1, Incrementing - should coalesce: ");
-			correct_payload[0] = PAYLOAD_LEN * 2;
-			check_recv_pkts(rxfd, correct_payload, 1);
-
-			printf("DF=1, Fixed - should coalesce: ");
-			correct_payload[0] = PAYLOAD_LEN * 2;
-			check_recv_pkts(rxfd, correct_payload, 1);
-
-			printf("DF=0, Incrementing - should coalesce: ");
-			correct_payload[0] = PAYLOAD_LEN * 2;
-			check_recv_pkts(rxfd, correct_payload, 1);
-
-			printf("DF=0, Fixed - should coalesce: ");
-			correct_payload[0] = PAYLOAD_LEN * 2;
-			check_recv_pkts(rxfd, correct_payload, 1);
-
-			printf("DF=1, 2 Incrementing and one fixed - should coalesce only first 2 packets: ");
-			correct_payload[0] = PAYLOAD_LEN * 2;
-			correct_payload[1] = PAYLOAD_LEN;
-			check_recv_pkts(rxfd, correct_payload, 2);
-
-			printf("DF=1, 2 Fixed and one incrementing - should coalesce only first 2 packets: ");
-			correct_payload[0] = PAYLOAD_LEN * 2;
-			correct_payload[1] = PAYLOAD_LEN;
-			check_recv_pkts(rxfd, correct_payload, 2);
-		} else if (proto == PF_INET6) {
-			/* GRO doesn't check for ipv6 hop limit when flushing.
-			 * Hence no corresponding test to the ipv4 case.
-			 */
-			printf("fragmented ip6 doesn't coalesce: ");
-			correct_payload[0] = PAYLOAD_LEN * 2;
-			correct_payload[1] = PAYLOAD_LEN;
-			correct_payload[2] = PAYLOAD_LEN;
-			check_recv_pkts(rxfd, correct_payload, 3);
-
-			printf("ipv6 with ext header does coalesce: ");
-			correct_payload[0] = PAYLOAD_LEN * 2;
-			check_recv_pkts(rxfd, correct_payload, 1);
-
-			printf("ipv6 with ext header with different payloads doesn't coalesce: ");
-			correct_payload[0] = PAYLOAD_LEN;
-			correct_payload[1] = PAYLOAD_LEN;
-			check_recv_pkts(rxfd, correct_payload, 2);
-		}
-	} else if (strcmp(testname, "large") == 0) {
+	/* ip sub-tests - IPv4 only */
+	} else if (strcmp(testname, "ip_ttl") == 0) {
+		correct_payload[0] = PAYLOAD_LEN;
+		correct_payload[1] = PAYLOAD_LEN;
+		printf("different ttl doesn't coalesce: ");
+		check_recv_pkts(rxfd, correct_payload, 2);
+	} else if (strcmp(testname, "ip_opt") == 0) {
+		correct_payload[0] = PAYLOAD_LEN;
+		correct_payload[1] = PAYLOAD_LEN;
+		correct_payload[2] = PAYLOAD_LEN;
+		printf("ip options doesn't coalesce: ");
+		check_recv_pkts(rxfd, correct_payload, 3);
+	} else if (strcmp(testname, "ip_frag4") == 0) {
+		correct_payload[0] = PAYLOAD_LEN;
+		correct_payload[1] = PAYLOAD_LEN;
+		printf("fragmented ip4 doesn't coalesce: ");
+		check_recv_pkts(rxfd, correct_payload, 2);
+	} else if (strcmp(testname, "ip_id_df1_inc") == 0) {
+		printf("DF=1, Incrementing - should coalesce: ");
+		correct_payload[0] = PAYLOAD_LEN * 2;
+		check_recv_pkts(rxfd, correct_payload, 1);
+	} else if (strcmp(testname, "ip_id_df1_fixed") == 0) {
+		printf("DF=1, Fixed - should coalesce: ");
+		correct_payload[0] = PAYLOAD_LEN * 2;
+		check_recv_pkts(rxfd, correct_payload, 1);
+	} else if (strcmp(testname, "ip_id_df0_inc") == 0) {
+		printf("DF=0, Incrementing - should coalesce: ");
+		correct_payload[0] = PAYLOAD_LEN * 2;
+		check_recv_pkts(rxfd, correct_payload, 1);
+	} else if (strcmp(testname, "ip_id_df0_fixed") == 0) {
+		printf("DF=0, Fixed - should coalesce: ");
+		correct_payload[0] = PAYLOAD_LEN * 2;
+		check_recv_pkts(rxfd, correct_payload, 1);
+	} else if (strcmp(testname, "ip_id_df1_inc_fixed") == 0) {
+		printf("DF=1, 2 Incrementing and one fixed - should coalesce only first 2 packets: ");
+		correct_payload[0] = PAYLOAD_LEN * 2;
+		correct_payload[1] = PAYLOAD_LEN;
+		check_recv_pkts(rxfd, correct_payload, 2);
+	} else if (strcmp(testname, "ip_id_df1_fixed_inc") == 0) {
+		printf("DF=1, 2 Fixed and one incrementing - should coalesce only first 2 packets: ");
+		correct_payload[0] = PAYLOAD_LEN * 2;
+		correct_payload[1] = PAYLOAD_LEN;
+		check_recv_pkts(rxfd, correct_payload, 2);
+
+	/* ip sub-tests - IPv6 only */
+	} else if (strcmp(testname, "ip_frag6") == 0) {
+		/* GRO doesn't check for ipv6 hop limit when flushing.
+		 * Hence no corresponding test to the ipv4 case.
+		 */
+		printf("fragmented ip6 doesn't coalesce: ");
+		correct_payload[0] = PAYLOAD_LEN * 2;
+		correct_payload[1] = PAYLOAD_LEN;
+		correct_payload[2] = PAYLOAD_LEN;
+		check_recv_pkts(rxfd, correct_payload, 3);
+	} else if (strcmp(testname, "ip_v6ext_same") == 0) {
+		printf("ipv6 with ext header does coalesce: ");
+		correct_payload[0] = PAYLOAD_LEN * 2;
+		check_recv_pkts(rxfd, correct_payload, 1);
+	} else if (strcmp(testname, "ip_v6ext_diff") == 0) {
+		printf("ipv6 with ext header with different payloads doesn't coalesce: ");
+		correct_payload[0] = PAYLOAD_LEN;
+		correct_payload[1] = PAYLOAD_LEN;
+		check_recv_pkts(rxfd, correct_payload, 2);
+
+	/* large sub-tests */
+	} else if (strcmp(testname, "large_max") == 0) {
 		int offset = (proto == PF_INET && !ipip) ? 20 : 0;
 		int remainder = (MAX_PAYLOAD + offset) % MSS;
 
@@ -1282,14 +1349,18 @@ static void gro_receiver(void)
 		correct_payload[1] = remainder;
 		printf("Shouldn't coalesce if exceed IP max pkt size: ");
 		check_recv_pkts(rxfd, correct_payload, 2);
+	} else if (strcmp(testname, "large_rem") == 0) {
+		int offset = (proto == PF_INET && !ipip) ? 20 : 0;
+		int remainder = (MAX_PAYLOAD + offset) % MSS;
 
 		/* last segment sent individually, doesn't start new segment */
-		correct_payload[0] = correct_payload[0] - remainder;
+		correct_payload[0] = (MAX_PAYLOAD + offset) - remainder;
 		correct_payload[1] = remainder + 1;
 		correct_payload[2] = remainder + 1;
+		printf("last segment sent individually: ");
 		check_recv_pkts(rxfd, correct_payload, 3);
 	} else {
-		error(1, 0, "Test case error, should never trigger");
+		error(1, 0, "Test case error: unknown testname %s", testname);
 	}
 
 	if (close(rxfd))
diff --git a/tools/testing/selftests/drivers/net/gro.py b/tools/testing/selftests/drivers/net/gro.py
index 1ab85590c439..1bb8af571456 100755
--- a/tools/testing/selftests/drivers/net/gro.py
+++ b/tools/testing/selftests/drivers/net/gro.py
@@ -9,12 +9,29 @@ binary in different configurations and checking for correct packet
 coalescing behavior.
 
 Test cases:
-  - data: Data packets with same size/headers and correct seq numbers coalesce
+  - data_same: Same size data packets coalesce
+  - data_lrg_sml: Large packet followed by smaller one coalesces
+  - data_sml_lrg: Small packet followed by larger one doesn't coalesce
   - ack: Pure ACK packets do not coalesce
-  - flags: Packets with PSH, SYN, URG, RST flags do not coalesce
-  - tcp: Packets with incorrect checksum, non-consecutive seqno don't coalesce
-  - ip: Packets with different ECN, TTL, TOS, or IP options don't coalesce
-  - large: Packets larger than GRO_MAX_SIZE don't coalesce
+  - flags_psh: Packets with PSH flag don't coalesce
+  - flags_syn: Packets with SYN flag don't coalesce
+  - flags_rst: Packets with RST flag don't coalesce
+  - flags_urg: Packets with URG flag don't coalesce
+  - tcp_csum: Packets with incorrect checksum don't coalesce
+  - tcp_seq: Packets with non-consecutive seqno don't coalesce
+  - tcp_ts: Packets with different timestamp options don't coalesce
+  - tcp_opt: Packets with different TCP options don't coalesce
+  - ip_ecn: Packets with different ECN don't coalesce
+  - ip_tos: Packets with different TOS don't coalesce
+  - ip_ttl: (IPv4) Packets with different TTL don't coalesce
+  - ip_opt: (IPv4) Packets with IP options don't coalesce
+  - ip_frag4: (IPv4) IPv4 fragments don't coalesce
+  - ip_id_df*: (IPv4) IP ID field coalescing tests
+  - ip_frag6: (IPv6) IPv6 fragments don't coalesce
+  - ip_v6ext_same: (IPv6) IPv6 ext header with same payload coalesces
+  - ip_v6ext_diff: (IPv6) IPv6 ext header with different payload doesn't coalesce
+  - large_max: Packets exceeding GRO_MAX_SIZE don't coalesce
+  - large_rem: Large packet remainder handling
 """
 
 import os
@@ -107,8 +124,8 @@ def _setup(cfg, mode, test_name):
         cfg.remote_feat = ethtool(f"-k {cfg.remote_ifname}",
                                   host=cfg.remote, json=True)[0]
 
-    # "large" test needs at least 4k MTU
-    if test_name == "large":
+    # "large_*" tests need at least 4k MTU
+    if test_name.startswith("large_"):
         _set_mtu_restore(cfg.dev, 4096, None)
         _set_mtu_restore(cfg.remote_dev, 4096, cfg.remote)
 
@@ -170,11 +187,41 @@ def _setup(cfg, mode, test_name):
 def _gro_variants():
     """Generator that yields all combinations of protocol and test types."""
 
+    # Tests that work for all protocols
+    common_tests = [
+        "data_same", "data_lrg_sml", "data_sml_lrg",
+        "ack",
+        "flags_psh", "flags_syn", "flags_rst", "flags_urg",
+        "tcp_csum", "tcp_seq", "tcp_ts", "tcp_opt",
+        "ip_ecn", "ip_tos",
+        "large_max", "large_rem",
+    ]
+
+    # Tests specific to IPv4
+    ipv4_tests = [
+        "ip_ttl", "ip_opt", "ip_frag4",
+        "ip_id_df1_inc", "ip_id_df1_fixed",
+        "ip_id_df0_inc", "ip_id_df0_fixed",
+        "ip_id_df1_inc_fixed", "ip_id_df1_fixed_inc",
+    ]
+
+    # Tests specific to IPv6
+    ipv6_tests = [
+        "ip_frag6", "ip_v6ext_same", "ip_v6ext_diff",
+    ]
+
     for mode in ["sw", "hw", "lro"]:
         for protocol in ["ipv4", "ipv6", "ipip"]:
-            for test_name in ["data", "ack", "flags", "tcp", "ip", "large"]:
+            for test_name in common_tests:
                 yield mode, protocol, test_name
 
+            if protocol in ["ipv4", "ipip"]:
+                for test_name in ipv4_tests:
+                    yield mode, protocol, test_name
+            elif protocol == "ipv6":
+                for test_name in ipv6_tests:
+                    yield mode, protocol, test_name
+
 
 @ksft_variants(_gro_variants())
 def test(cfg, mode, protocol, test_name):
@@ -215,7 +262,7 @@ def test(cfg, mode, protocol, test_name):
 
         ksft_pr(rx_proc)
 
-        if test_name == "large" and os.environ.get("KSFT_MACHINE_SLOW"):
+        if test_name.startswith("large_") and os.environ.get("KSFT_MACHINE_SLOW"):
             ksft_pr(f"Ignoring {protocol}/{test_name} failure due to slow environment")
             return
 
-- 
cgit v1.2.3


From a32bb32d019332394aee9e2befea4fec05a672e4 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Wed, 19 Nov 2025 12:15:20 +0000
Subject: selftests: iou-zcrx: test large chunk sizes

Add a test using large chunks for zcrx memory area.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 tools/testing/selftests/drivers/net/hw/iou-zcrx.c  | 72 ++++++++++++++++++----
 tools/testing/selftests/drivers/net/hw/iou-zcrx.py | 39 ++++++++++++
 2 files changed, 99 insertions(+), 12 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/drivers/net/hw/iou-zcrx.c b/tools/testing/selftests/drivers/net/hw/iou-zcrx.c
index 62456df947bc..240d13dbc54e 100644
--- a/tools/testing/selftests/drivers/net/hw/iou-zcrx.c
+++ b/tools/testing/selftests/drivers/net/hw/iou-zcrx.c
@@ -12,6 +12,7 @@
 #include <unistd.h>
 
 #include <arpa/inet.h>
+#include <linux/mman.h>
 #include <linux/errqueue.h>
 #include <linux/if_packet.h>
 #include <linux/ipv6.h>
@@ -37,6 +38,23 @@
 
 #include <liburing.h>
 
+#define SKIP_CODE	42
+
+struct t_io_uring_zcrx_ifq_reg {
+	__u32	if_idx;
+	__u32	if_rxq;
+	__u32	rq_entries;
+	__u32	flags;
+
+	__u64	area_ptr; /* pointer to struct io_uring_zcrx_area_reg */
+	__u64	region_ptr; /* struct io_uring_region_desc * */
+
+	struct io_uring_zcrx_offsets offsets;
+	__u32	zcrx_id;
+	__u32	rx_buf_len;
+	__u64	__resv[3];
+};
+
 static long page_size;
 #define AREA_SIZE (8192 * page_size)
 #define SEND_SIZE (512 * 4096)
@@ -65,6 +83,8 @@ static bool cfg_oneshot;
 static int cfg_oneshot_recvs;
 static int cfg_send_size = SEND_SIZE;
 static struct sockaddr_in6 cfg_addr;
+static unsigned int cfg_rx_buf_len;
+static bool cfg_dry_run;
 
 static char *payload;
 static void *area_ptr;
@@ -128,14 +148,28 @@ static void setup_zcrx(struct io_uring *ring)
 	if (!ifindex)
 		error(1, 0, "bad interface name: %s", cfg_ifname);
 
-	area_ptr = mmap(NULL,
-			AREA_SIZE,
-			PROT_READ | PROT_WRITE,
-			MAP_ANONYMOUS | MAP_PRIVATE,
-			0,
-			0);
-	if (area_ptr == MAP_FAILED)
-		error(1, 0, "mmap(): zero copy area");
+	if (cfg_rx_buf_len && cfg_rx_buf_len != page_size) {
+		area_ptr = mmap(NULL,
+				AREA_SIZE,
+				PROT_READ | PROT_WRITE,
+				MAP_ANONYMOUS | MAP_PRIVATE |
+				MAP_HUGETLB | MAP_HUGE_2MB,
+				-1,
+				0);
+		if (area_ptr == MAP_FAILED) {
+			printf("Can't allocate huge pages\n");
+			exit(SKIP_CODE);
+		}
+	} else {
+		area_ptr = mmap(NULL,
+				AREA_SIZE,
+				PROT_READ | PROT_WRITE,
+				MAP_ANONYMOUS | MAP_PRIVATE,
+				0,
+				0);
+		if (area_ptr == MAP_FAILED)
+			error(1, 0, "mmap(): zero copy area");
+	}
 
 	ring_size = get_refill_ring_size(rq_entries);
 	ring_ptr = mmap(NULL,
@@ -157,17 +191,23 @@ static void setup_zcrx(struct io_uring *ring)
 		.flags = 0,
 	};
 
-	struct io_uring_zcrx_ifq_reg reg = {
+	struct t_io_uring_zcrx_ifq_reg reg = {
 		.if_idx = ifindex,
 		.if_rxq = cfg_queue_id,
 		.rq_entries = rq_entries,
 		.area_ptr = (__u64)(unsigned long)&area_reg,
 		.region_ptr = (__u64)(unsigned long)&region_reg,
+		.rx_buf_len = cfg_rx_buf_len,
 	};
 
-	ret = io_uring_register_ifq(ring, &reg);
-	if (ret)
+	ret = io_uring_register_ifq(ring, (void *)&reg);
+	if (cfg_rx_buf_len && (ret == -EINVAL || ret == -EOPNOTSUPP ||
+			       ret == -ERANGE)) {
+		printf("Large chunks are not supported %i\n", ret);
+		exit(SKIP_CODE);
+	} else if (ret) {
 		error(1, 0, "io_uring_register_ifq(): %d", ret);
+	}
 
 	rq_ring.khead = (unsigned int *)((char *)ring_ptr + reg.offsets.head);
 	rq_ring.ktail = (unsigned int *)((char *)ring_ptr + reg.offsets.tail);
@@ -323,6 +363,8 @@ static void run_server(void)
 	io_uring_queue_init(512, &ring, flags);
 
 	setup_zcrx(&ring);
+	if (cfg_dry_run)
+		return;
 
 	add_accept(&ring, fd);
 
@@ -383,7 +425,7 @@ static void parse_opts(int argc, char **argv)
 		usage(argv[0]);
 	cfg_payload_len = max_payload_len;
 
-	while ((c = getopt(argc, argv, "sch:p:l:i:q:o:z:")) != -1) {
+	while ((c = getopt(argc, argv, "sch:p:l:i:q:o:z:x:d")) != -1) {
 		switch (c) {
 		case 's':
 			if (cfg_client)
@@ -418,6 +460,12 @@ static void parse_opts(int argc, char **argv)
 		case 'z':
 			cfg_send_size = strtoul(optarg, NULL, 0);
 			break;
+		case 'x':
+			cfg_rx_buf_len = page_size * strtoul(optarg, NULL, 0);
+			break;
+		case 'd':
+			cfg_dry_run = true;
+			break;
 		}
 	}
 
diff --git a/tools/testing/selftests/drivers/net/hw/iou-zcrx.py b/tools/testing/selftests/drivers/net/hw/iou-zcrx.py
index 712c806508b5..7f596a33eb2b 100755
--- a/tools/testing/selftests/drivers/net/hw/iou-zcrx.py
+++ b/tools/testing/selftests/drivers/net/hw/iou-zcrx.py
@@ -7,6 +7,7 @@ from lib.py import ksft_run, ksft_exit, KsftSkipEx
 from lib.py import NetDrvEpEnv
 from lib.py import bkg, cmd, defer, ethtool, rand_port, wait_port_listen
 
+SKIP_CODE = 42
 
 def _get_current_settings(cfg):
     output = ethtool(f"-g {cfg.ifname}", json=True)[0]
@@ -132,6 +133,44 @@ def test_zcrx_rss(cfg) -> None:
         cmd(tx_cmd, host=cfg.remote)
 
 
+def test_zcrx_large_chunks(cfg) -> None:
+    """Test zcrx with large buffer chunks."""
+
+    cfg.require_ipver('6')
+
+    combined_chans = _get_combined_channels(cfg)
+    if combined_chans < 2:
+        raise KsftSkipEx('at least 2 combined channels required')
+    (rx_ring, hds_thresh) = _get_current_settings(cfg)
+    port = rand_port()
+
+    ethtool(f"-G {cfg.ifname} tcp-data-split on")
+    defer(ethtool, f"-G {cfg.ifname} tcp-data-split auto")
+
+    ethtool(f"-G {cfg.ifname} hds-thresh 0")
+    defer(ethtool, f"-G {cfg.ifname} hds-thresh {hds_thresh}")
+
+    ethtool(f"-G {cfg.ifname} rx 64")
+    defer(ethtool, f"-G {cfg.ifname} rx {rx_ring}")
+
+    ethtool(f"-X {cfg.ifname} equal {combined_chans - 1}")
+    defer(ethtool, f"-X {cfg.ifname} default")
+
+    flow_rule_id = _set_flow_rule(cfg, port, combined_chans - 1)
+    defer(ethtool, f"-N {cfg.ifname} delete {flow_rule_id}")
+
+    rx_cmd = f"{cfg.bin_local} -s -p {port} -i {cfg.ifname} -q {combined_chans - 1} -x 2"
+    tx_cmd = f"{cfg.bin_remote} -c -h {cfg.addr_v['6']} -p {port} -l 12840"
+
+    probe = cmd(rx_cmd + " -d", fail=False)
+    if probe.ret == SKIP_CODE:
+        raise KsftSkipEx(probe.stdout)
+
+    with bkg(rx_cmd, exit_wait=True):
+        wait_port_listen(port, proto="tcp")
+        cmd(tx_cmd, host=cfg.remote)
+
+
 def main() -> None:
     with NetDrvEpEnv(__file__) as cfg:
         cfg.bin_local = path.abspath(path.dirname(__file__) + "/../../../drivers/net/hw/iou-zcrx")
-- 
cgit v1.2.3


From e5566f6b1d13e9bc0a458babb880916e212c45fb Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@nvidia.com>
Date: Sun, 11 Jan 2026 14:08:09 +0200
Subject: selftests: fib-onlink: Remove "wrong nexthop device" IPv4 tests

According to the test description, these tests fail because of a wrong
nexthop device:

 # ./fib-onlink-tests.sh -v
 [...]
 COMMAND: ip ro add table 254 169.254.101.102/32 via 169.254.3.1 dev veth1 onlink
 Error: Nexthop has invalid gateway.

 TEST: Gateway resolves to wrong nexthop device            [ OK ]
 COMMAND: ip ro add table 1101 169.254.102.103/32 via 169.254.7.1 dev veth5 onlink
 Error: Nexthop has invalid gateway.

 TEST: Gateway resolves to wrong nexthop device - VRF      [ OK ]
 [...]

But this is incorrect. They fail because the gateway addresses are local
addresses:

 # ip -4 address show
 [...]
 28: veth3@if27: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000 link-netns peer_ns-Urqh3o
     inet 169.254.3.1/24 scope global veth3
 [...]
 32: veth7@if31: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue master lisa state UP group default qlen 1000 link-netns peer_ns-Urqh3o
     inet 169.254.7.1/24 scope global veth7

Therefore, using a local address that matches the nexthop device fails
as well:

 # ip ro add table 254 169.254.101.102/32 via 169.254.3.1 dev veth3 onlink
 Error: Nexthop has invalid gateway.

Using a gateway address with a "wrong" nexthop device is actually valid
and allowed:

 # ip route get 169.254.1.2
 169.254.1.2 dev veth1 src 169.254.1.1 uid 0
 # ip ro add table 254 169.254.101.102/32 via 169.254.1.2 dev veth3 onlink
 # echo $?
 0

Remove these tests given that their output is confusing and that the
scenario that they are testing is already covered by other tests.

A subsequent patch will add tests for the nexthop device mismatch
scenario.

Reviewed-by: Petr Machata <petrm@nvidia.com>
Signed-off-by: Ido Schimmel <idosch@nvidia.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Link: https://patch.msgid.link/20260111120813.159799-2-idosch@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/fib-onlink-tests.sh | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/fib-onlink-tests.sh b/tools/testing/selftests/net/fib-onlink-tests.sh
index ec2d6ceb1f08..1bb1c2289650 100755
--- a/tools/testing/selftests/net/fib-onlink-tests.sh
+++ b/tools/testing/selftests/net/fib-onlink-tests.sh
@@ -315,12 +315,6 @@ invalid_onlink_ipv4()
 		"Invalid gw - local unicast address, VRF"
 
 	run_ip 254 ${TEST_NET4[1]}.101 ${V4ADDRS[p1]} "" 2 "No nexthop device given"
-
-	run_ip 254 ${TEST_NET4[1]}.102 ${V4ADDRS[p3]} ${NETIFS[p1]} 2 \
-		"Gateway resolves to wrong nexthop device"
-
-	run_ip ${VRF_TABLE} ${TEST_NET4[2]}.103 ${V4ADDRS[p7]} ${NETIFS[p5]} 2 \
-		"Gateway resolves to wrong nexthop device - VRF"
 }
 
 ################################################################################
-- 
cgit v1.2.3


From 0a3419f4ba407b9624c315bfd6f0056caf536898 Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@nvidia.com>
Date: Sun, 11 Jan 2026 14:08:10 +0200
Subject: selftests: fib-onlink: Remove "wrong nexthop device" IPv6 tests

The command in the test fails as expected because IPv6 forbids a nexthop
device mismatch:

 # ./fib-onlink-tests.sh -v
 [...]
 COMMAND: ip -6 ro add table 1101 2001:db8:102::103/128 via 2001:db8:701::64 dev veth5 onlink
 Error: Nexthop has invalid gateway or device mismatch.

 TEST: Gateway resolves to wrong nexthop device - VRF      [ OK ]
 [...]

Where:

 # ip route get 2001:db8:701::64 vrf lisa
 2001:db8:701::64 dev veth7 table 1101 proto kernel src 2001:db8:701::1 metric 256 pref medium

This is in contrast to IPv4 where a nexthop device mismatch is allowed
when "onlink" is specified:

 # ip route get 169.254.7.2 vrf lisa
 169.254.7.2 dev veth7 table 1101 src 169.254.7.1 uid 0
 # ip ro add table 1101 169.254.102.103/32 via 169.254.7.2 dev veth5 onlink
 # echo $?
 0

Remove these tests in preparation for aligning IPv6 with IPv4 and
allowing nexthop device mismatch when "onlink" is specified.

A subsequent patch will add tests that verify that both address families
allow a nexthop device mismatch with "onlink".

Reviewed-by: Petr Machata <petrm@nvidia.com>
Signed-off-by: Ido Schimmel <idosch@nvidia.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Link: https://patch.msgid.link/20260111120813.159799-3-idosch@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/fib-onlink-tests.sh | 7 -------
 1 file changed, 7 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/fib-onlink-tests.sh b/tools/testing/selftests/net/fib-onlink-tests.sh
index 1bb1c2289650..63477be859e3 100755
--- a/tools/testing/selftests/net/fib-onlink-tests.sh
+++ b/tools/testing/selftests/net/fib-onlink-tests.sh
@@ -432,13 +432,6 @@ invalid_onlink_ipv6()
 
 	run_ip6 254 ${TEST_NET6[1]}::101 ${V6ADDRS[p1]} "" 2 \
 		"No nexthop device given"
-
-	# default VRF validation is done against LOCAL table
-	# run_ip6 254 ${TEST_NET6[1]}::102 ${V6ADDRS[p3]/::[0-9]/::64} ${NETIFS[p1]} 2 \
-	#	"Gateway resolves to wrong nexthop device"
-
-	run_ip6 ${VRF_TABLE} ${TEST_NET6[2]}::103 ${V6ADDRS[p7]/::[0-9]/::64} ${NETIFS[p5]} 2 \
-		"Gateway resolves to wrong nexthop device - VRF"
 }
 
 run_onlink_tests()
-- 
cgit v1.2.3


From 9bf8345fb38ab9bd771bd430073cbd3e912fcf75 Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@nvidia.com>
Date: Sun, 11 Jan 2026 14:08:11 +0200
Subject: selftests: fib-onlink: Add a test case for IPv4 multicast gateway

A multicast gateway address should be rejected when "onlink" is
specified, but it is only tested as part of the IPv6 tests. Add an
equivalent IPv4 test.

 # ./fib-onlink-tests.sh -v
 [...]
 COMMAND: ip ro add table 254 169.254.101.12/32 via 233.252.0.1 dev veth1 onlink
 Error: Nexthop has invalid gateway.

 TEST: Invalid gw - multicast address                      [ OK ]
 [...]
 COMMAND: ip ro add table 1101 169.254.102.12/32 via 233.252.0.1 dev veth5 onlink
 Error: Nexthop has invalid gateway.

 TEST: Invalid gw - multicast address, VRF                 [ OK ]
 [...]
 Tests passed:  37
 Tests failed:   0

Reviewed-by: Petr Machata <petrm@nvidia.com>
Signed-off-by: Ido Schimmel <idosch@nvidia.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Link: https://patch.msgid.link/20260111120813.159799-4-idosch@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/fib-onlink-tests.sh | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/fib-onlink-tests.sh b/tools/testing/selftests/net/fib-onlink-tests.sh
index 63477be859e3..7a0fd7a91e4e 100755
--- a/tools/testing/selftests/net/fib-onlink-tests.sh
+++ b/tools/testing/selftests/net/fib-onlink-tests.sh
@@ -72,7 +72,8 @@ declare -A TEST_NET4IN6IN6
 TEST_NET4IN6[1]=10.1.1.254
 TEST_NET4IN6[2]=10.2.1.254
 
-# mcast address
+# mcast addresses
+MCAST4=233.252.0.1
 MCAST6=ff02::1
 
 VRF=lisa
@@ -310,9 +311,13 @@ invalid_onlink_ipv4()
 {
 	run_ip 254 ${TEST_NET4[1]}.11 ${V4ADDRS[p1]} ${NETIFS[p1]} 2 \
 		"Invalid gw - local unicast address"
+	run_ip 254 ${TEST_NET4[1]}.12 ${MCAST4} ${NETIFS[p1]} 2 \
+		"Invalid gw - multicast address"
 
 	run_ip ${VRF_TABLE} ${TEST_NET4[2]}.11 ${V4ADDRS[p5]} ${NETIFS[p5]} 2 \
 		"Invalid gw - local unicast address, VRF"
+	run_ip ${VRF_TABLE} ${TEST_NET4[2]}.12 ${MCAST4} ${NETIFS[p5]} 2 \
+		"Invalid gw - multicast address, VRF"
 
 	run_ip 254 ${TEST_NET4[1]}.101 ${V4ADDRS[p1]} "" 2 "No nexthop device given"
 }
-- 
cgit v1.2.3


From f8f9ee9d8b2ed1f29309399020f2fc30f7f93035 Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@nvidia.com>
Date: Sun, 11 Jan 2026 14:08:13 +0200
Subject: selftests: fib-onlink: Add test cases for nexthop device mismatch

Add test cases that verify that when the "onlink" keyword is specified,
both address families (with and without VRF) accept routes with a
gateway address that is reachable via a different interface than the one
specified.

Output without "ipv6: Allow for nexthop device mismatch with "onlink"":

 # ./fib-onlink-tests.sh | grep mismatch
 TEST: nexthop device mismatch                             [ OK ]
 TEST: nexthop device mismatch                             [ OK ]
 TEST: nexthop device mismatch                             [FAIL]
 TEST: nexthop device mismatch                             [FAIL]

Output with "ipv6: Allow for nexthop device mismatch with "onlink"":

 # ./fib-onlink-tests.sh | grep mismatch
 TEST: nexthop device mismatch                             [ OK ]
 TEST: nexthop device mismatch                             [ OK ]
 TEST: nexthop device mismatch                             [ OK ]
 TEST: nexthop device mismatch                             [ OK ]

That is, the IPv4 tests were always passing, but the IPv6 ones only pass
after the specified patch.

Reviewed-by: Petr Machata <petrm@nvidia.com>
Signed-off-by: Ido Schimmel <idosch@nvidia.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Link: https://patch.msgid.link/20260111120813.159799-6-idosch@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/fib-onlink-tests.sh | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/fib-onlink-tests.sh b/tools/testing/selftests/net/fib-onlink-tests.sh
index 7a0fd7a91e4e..b5773ac8847d 100755
--- a/tools/testing/selftests/net/fib-onlink-tests.sh
+++ b/tools/testing/selftests/net/fib-onlink-tests.sh
@@ -271,11 +271,15 @@ valid_onlink_ipv4()
 
 	run_ip 254 ${TEST_NET4[1]}.1 ${CONGW[1]} ${NETIFS[p1]} 0 "unicast connected"
 	run_ip 254 ${TEST_NET4[1]}.2 ${RECGW4[1]} ${NETIFS[p1]} 0 "unicast recursive"
+	run_ip 254 ${TEST_NET4[1]}.9 ${CONGW[1]} ${NETIFS[p3]} 0 \
+		"nexthop device mismatch"
 
 	log_subsection "VRF ${VRF}"
 
 	run_ip ${VRF_TABLE} ${TEST_NET4[2]}.1 ${CONGW[3]} ${NETIFS[p5]} 0 "unicast connected"
 	run_ip ${VRF_TABLE} ${TEST_NET4[2]}.2 ${RECGW4[2]} ${NETIFS[p5]} 0 "unicast recursive"
+	run_ip ${VRF_TABLE} ${TEST_NET4[2]}.10 ${CONGW[3]} ${NETIFS[p7]} 0 \
+		"nexthop device mismatch"
 
 	log_subsection "VRF device, PBR table"
 
@@ -366,12 +370,16 @@ valid_onlink_ipv6()
 	run_ip6 254 ${TEST_NET6[1]}::1 ${V6ADDRS[p1]/::*}::64 ${NETIFS[p1]} 0 "unicast connected"
 	run_ip6 254 ${TEST_NET6[1]}::2 ${RECGW6[1]} ${NETIFS[p1]} 0 "unicast recursive"
 	run_ip6 254 ${TEST_NET6[1]}::3 ::ffff:${TEST_NET4IN6[1]} ${NETIFS[p1]} 0 "v4-mapped"
+	run_ip6 254 ${TEST_NET6[1]}::a ${V6ADDRS[p1]/::*}::64 ${NETIFS[p3]} 0 \
+		"nexthop device mismatch"
 
 	log_subsection "VRF ${VRF}"
 
 	run_ip6 ${VRF_TABLE} ${TEST_NET6[2]}::1 ${V6ADDRS[p5]/::*}::64 ${NETIFS[p5]} 0 "unicast connected"
 	run_ip6 ${VRF_TABLE} ${TEST_NET6[2]}::2 ${RECGW6[2]} ${NETIFS[p5]} 0 "unicast recursive"
 	run_ip6 ${VRF_TABLE} ${TEST_NET6[2]}::3 ::ffff:${TEST_NET4IN6[2]} ${NETIFS[p5]} 0 "v4-mapped"
+	run_ip6 ${VRF_TABLE} ${TEST_NET6[2]}::b ${V6ADDRS[p5]/::*}::64 \
+		${NETIFS[p7]} 0 "nexthop device mismatch"
 
 	log_subsection "VRF device, PBR table"
 
-- 
cgit v1.2.3


From 9d48c62f6b4ed70ebeea70f52ddb1c6d8613bed4 Mon Sep 17 00:00:00 2001
From: Gal Pressman <gal@nvidia.com>
Date: Mon, 12 Jan 2026 19:37:14 +0200
Subject: selftests: drv-net: fix RPS mask handling in toeplitz test

The toeplitz.py test passed the hex mask without "0x" prefix (e.g.,
"300" for CPUs 8,9). The toeplitz.c strtoul() call wrongly parsed this
as decimal 300 (0x12c) instead of hex 0x300.

Pass the prefixed mask to toeplitz.c, and the unprefixed one to sysfs.

Fixes: 9cf9aa77a1f6 ("selftests: drv-net: hw: convert the Toeplitz test to Python")
Reviewed-by: Nimrod Oren <noren@nvidia.com>
Signed-off-by: Gal Pressman <gal@nvidia.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Link: https://patch.msgid.link/20260112173715.384843-2-gal@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/drivers/net/hw/toeplitz.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/drivers/net/hw/toeplitz.py b/tools/testing/selftests/drivers/net/hw/toeplitz.py
index d2db5ee9e358..d288c57894f6 100755
--- a/tools/testing/selftests/drivers/net/hw/toeplitz.py
+++ b/tools/testing/selftests/drivers/net/hw/toeplitz.py
@@ -94,12 +94,14 @@ def _configure_rps(cfg, rps_cpus):
     mask = 0
     for cpu in rps_cpus:
         mask |= (1 << cpu)
-    mask = hex(mask)[2:]
+
+    mask = hex(mask)
 
     # Set RPS bitmap for all rx queues
     for rps_file in glob.glob(f"/sys/class/net/{cfg.ifname}/queues/rx-*/rps_cpus"):
         with open(rps_file, "w", encoding="utf-8") as fp:
-            fp.write(mask)
+            # sysfs expects hex without '0x' prefix, toeplitz.c needs the prefix
+            fp.write(mask[2:])
 
     return mask
 
-- 
cgit v1.2.3


From cf055f8c000445aa688c53a706ef4f580818eedb Mon Sep 17 00:00:00 2001
From: Gal Pressman <gal@nvidia.com>
Date: Mon, 12 Jan 2026 19:37:15 +0200
Subject: selftests: drv-net: fix RPS mask handling for high CPU numbers

The RPS bitmask bounds check uses ~(RPS_MAX_CPUS - 1) which equals ~15 =
0xfff0, only allowing CPUs 0-3.

Change the mask to ~((1UL << RPS_MAX_CPUS) - 1) = ~0xffff to allow CPUs
0-15.

Fixes: 5ebfb4cc3048 ("selftests/net: toeplitz test")
Reviewed-by: Nimrod Oren <noren@nvidia.com>
Signed-off-by: Gal Pressman <gal@nvidia.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Link: https://patch.msgid.link/20260112173715.384843-3-gal@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/drivers/net/hw/toeplitz.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/drivers/net/hw/toeplitz.c b/tools/testing/selftests/drivers/net/hw/toeplitz.c
index d23b3b0c20a3..285bb17df9c2 100644
--- a/tools/testing/selftests/drivers/net/hw/toeplitz.c
+++ b/tools/testing/selftests/drivers/net/hw/toeplitz.c
@@ -485,8 +485,8 @@ static void parse_rps_bitmap(const char *arg)
 
 	bitmap = strtoul(arg, NULL, 0);
 
-	if (bitmap & ~(RPS_MAX_CPUS - 1))
-		error(1, 0, "rps bitmap 0x%lx out of bounds 0..%lu",
+	if (bitmap & ~((1UL << RPS_MAX_CPUS) - 1))
+		error(1, 0, "rps bitmap 0x%lx out of bounds, max cpu %lu",
 		      bitmap, RPS_MAX_CPUS - 1);
 
 	for (i = 0; i < RPS_MAX_CPUS; i++)
-- 
cgit v1.2.3


From f8ade2342e22e7dbc71af496f07c900f8c69dd54 Mon Sep 17 00:00:00 2001
From: Matt Bobrowski <mattbobrowski@google.com>
Date: Tue, 13 Jan 2026 08:39:47 +0000
Subject: bpf: return PTR_TO_BTF_ID | PTR_TRUSTED from BPF kfuncs by default

Teach the BPF verifier to treat pointers to struct types returned from
BPF kfuncs as implicitly trusted (PTR_TO_BTF_ID | PTR_TRUSTED) by
default. Returning untrusted pointers to struct types from BPF kfuncs
should be considered an exception only, and certainly not the norm.

Update existing selftests to reflect the change in register type
printing (e.g. `ptr_` becoming `trusted_ptr_` in verifier error
messages).

Link: https://lore.kernel.org/bpf/aV4nbCaMfIoM0awM@google.com/
Signed-off-by: Matt Bobrowski <mattbobrowski@google.com>
Acked-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20260113083949.2502978-1-mattbobrowski@google.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c                              | 46 ++++++++++++++--------
 tools/testing/selftests/bpf/progs/map_kptr_fail.c  |  4 +-
 .../struct_ops_kptr_return_fail__wrong_type.c      |  2 +-
 .../selftests/bpf/progs/verifier_global_ptr_args.c |  2 +-
 tools/testing/selftests/bpf/verifier/calls.c       |  2 +-
 5 files changed, 34 insertions(+), 22 deletions(-)

(limited to 'tools')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 45733bae271d..faa1ecc1fe9d 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -14212,26 +14212,38 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 			if (is_kfunc_rcu_protected(&meta))
 				regs[BPF_REG_0].type |= MEM_RCU;
 		} else {
-			mark_reg_known_zero(env, regs, BPF_REG_0);
-			regs[BPF_REG_0].btf = desc_btf;
-			regs[BPF_REG_0].type = PTR_TO_BTF_ID;
-			regs[BPF_REG_0].btf_id = ptr_type_id;
+			enum bpf_reg_type type = PTR_TO_BTF_ID;
 
 			if (meta.func_id == special_kfunc_list[KF_bpf_get_kmem_cache])
-				regs[BPF_REG_0].type |= PTR_UNTRUSTED;
-			else if (is_kfunc_rcu_protected(&meta))
-				regs[BPF_REG_0].type |= MEM_RCU;
-
-			if (is_iter_next_kfunc(&meta)) {
-				struct bpf_reg_state *cur_iter;
-
-				cur_iter = get_iter_from_state(env->cur_state, &meta);
-
-				if (cur_iter->type & MEM_RCU) /* KF_RCU_PROTECTED */
-					regs[BPF_REG_0].type |= MEM_RCU;
-				else
-					regs[BPF_REG_0].type |= PTR_TRUSTED;
+				type |= PTR_UNTRUSTED;
+			else if (is_kfunc_rcu_protected(&meta) ||
+				 (is_iter_next_kfunc(&meta) &&
+				  (get_iter_from_state(env->cur_state, &meta)
+					   ->type & MEM_RCU))) {
+				/*
+				 * If the iterator's constructor (the _new
+				 * function e.g., bpf_iter_task_new) has been
+				 * annotated with BPF kfunc flag
+				 * KF_RCU_PROTECTED and was called within a RCU
+				 * read-side critical section, also propagate
+				 * the MEM_RCU flag to the pointer returned from
+				 * the iterator's next function (e.g.,
+				 * bpf_iter_task_next).
+				 */
+				type |= MEM_RCU;
+			} else {
+				/*
+				 * Any PTR_TO_BTF_ID that is returned from a BPF
+				 * kfunc should by default be treated as
+				 * implicitly trusted.
+				 */
+				type |= PTR_TRUSTED;
 			}
+
+			mark_reg_known_zero(env, regs, BPF_REG_0);
+			regs[BPF_REG_0].btf = desc_btf;
+			regs[BPF_REG_0].type = type;
+			regs[BPF_REG_0].btf_id = ptr_type_id;
 		}
 
 		if (is_kfunc_ret_null(&meta)) {
diff --git a/tools/testing/selftests/bpf/progs/map_kptr_fail.c b/tools/testing/selftests/bpf/progs/map_kptr_fail.c
index 4c0ff01f1a96..6443b320c732 100644
--- a/tools/testing/selftests/bpf/progs/map_kptr_fail.c
+++ b/tools/testing/selftests/bpf/progs/map_kptr_fail.c
@@ -272,7 +272,7 @@ int reject_untrusted_xchg(struct __sk_buff *ctx)
 
 SEC("?tc")
 __failure
-__msg("invalid kptr access, R2 type=ptr_prog_test_ref_kfunc expected=ptr_prog_test_member")
+__msg("invalid kptr access, R2 type=trusted_ptr_prog_test_ref_kfunc expected=ptr_prog_test_member")
 int reject_bad_type_xchg(struct __sk_buff *ctx)
 {
 	struct prog_test_ref_kfunc *ref_ptr;
@@ -291,7 +291,7 @@ int reject_bad_type_xchg(struct __sk_buff *ctx)
 }
 
 SEC("?tc")
-__failure __msg("invalid kptr access, R2 type=ptr_prog_test_ref_kfunc")
+__failure __msg("invalid kptr access, R2 type=trusted_ptr_prog_test_ref_kfunc")
 int reject_member_of_ref_xchg(struct __sk_buff *ctx)
 {
 	struct prog_test_ref_kfunc *ref_ptr;
diff --git a/tools/testing/selftests/bpf/progs/struct_ops_kptr_return_fail__wrong_type.c b/tools/testing/selftests/bpf/progs/struct_ops_kptr_return_fail__wrong_type.c
index 6a2dd5367802..c8d217e89eea 100644
--- a/tools/testing/selftests/bpf/progs/struct_ops_kptr_return_fail__wrong_type.c
+++ b/tools/testing/selftests/bpf/progs/struct_ops_kptr_return_fail__wrong_type.c
@@ -12,7 +12,7 @@ void bpf_task_release(struct task_struct *p) __ksym;
  * reject programs returning a referenced kptr of the wrong type.
  */
 SEC("struct_ops/test_return_ref_kptr")
-__failure __msg("At program exit the register R0 is not a known value (ptr_or_null_)")
+__failure __msg("At program exit the register R0 is not a known value (trusted_ptr_or_null_)")
 struct task_struct *BPF_PROG(kptr_return_fail__wrong_type, int dummy,
 			     struct task_struct *task, struct cgroup *cgrp)
 {
diff --git a/tools/testing/selftests/bpf/progs/verifier_global_ptr_args.c b/tools/testing/selftests/bpf/progs/verifier_global_ptr_args.c
index 1204fbc58178..e7dae0cf9c17 100644
--- a/tools/testing/selftests/bpf/progs/verifier_global_ptr_args.c
+++ b/tools/testing/selftests/bpf/progs/verifier_global_ptr_args.c
@@ -72,7 +72,7 @@ int trusted_task_arg_nonnull_fail1(void *ctx)
 
 SEC("?tp_btf/task_newtask")
 __failure __log_level(2)
-__msg("R1 type=ptr_or_null_ expected=ptr_, trusted_ptr_, rcu_ptr_")
+__msg("R1 type=trusted_ptr_or_null_ expected=ptr_, trusted_ptr_, rcu_ptr_")
 __msg("Caller passes invalid args into func#1 ('subprog_trusted_task_nonnull')")
 int trusted_task_arg_nonnull_fail2(void *ctx)
 {
diff --git a/tools/testing/selftests/bpf/verifier/calls.c b/tools/testing/selftests/bpf/verifier/calls.c
index c8d640802cce..9ca83dce100d 100644
--- a/tools/testing/selftests/bpf/verifier/calls.c
+++ b/tools/testing/selftests/bpf/verifier/calls.c
@@ -220,7 +220,7 @@
 	},
 	.result_unpriv = REJECT,
 	.result = REJECT,
-	.errstr = "variable ptr_ access var_off=(0x0; 0x7) disallowed",
+	.errstr = "variable trusted_ptr_ access var_off=(0x0; 0x7) disallowed",
 },
 {
 	"calls: invalid kfunc call: referenced arg needs refcounted PTR_TO_BTF_ID",
-- 
cgit v1.2.3


From bbdbed193bcf57f1e9c0d9d58c3ad3350bfd0bd1 Mon Sep 17 00:00:00 2001
From: Matt Bobrowski <mattbobrowski@google.com>
Date: Tue, 13 Jan 2026 08:39:49 +0000
Subject: selftests/bpf: assert BPF kfunc default trusted pointer semantics

The BPF verifier was recently updated to treat pointers to struct types
returned from BPF kfuncs as implicitly trusted by default. Add a new
test case to exercise this new implicit trust semantic.

The KF_ACQUIRE flag was dropped from the bpf_get_root_mem_cgroup()
kfunc because it returns a global pointer to root_mem_cgroup without
performing any explicit reference counting. This makes it an ideal
candidate to verify the new implicit trusted pointer semantics.

Signed-off-by: Matt Bobrowski <mattbobrowski@google.com>
Acked-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20260113083949.2502978-3-mattbobrowski@google.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/prog_tests/verifier.c  |  2 ++
 .../selftests/bpf/progs/verifier_memcontrol.c      | 32 ++++++++++++++++++++++
 2 files changed, 34 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/progs/verifier_memcontrol.c

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/verifier.c b/tools/testing/selftests/bpf/prog_tests/verifier.c
index 5829ffd70f8f..38c5ba70100c 100644
--- a/tools/testing/selftests/bpf/prog_tests/verifier.c
+++ b/tools/testing/selftests/bpf/prog_tests/verifier.c
@@ -61,6 +61,7 @@
 #include "verifier_masking.skel.h"
 #include "verifier_may_goto_1.skel.h"
 #include "verifier_may_goto_2.skel.h"
+#include "verifier_memcontrol.skel.h"
 #include "verifier_meta_access.skel.h"
 #include "verifier_movsx.skel.h"
 #include "verifier_mtu.skel.h"
@@ -202,6 +203,7 @@ void test_verifier_map_ret_val(void)          { RUN(verifier_map_ret_val); }
 void test_verifier_masking(void)              { RUN(verifier_masking); }
 void test_verifier_may_goto_1(void)           { RUN(verifier_may_goto_1); }
 void test_verifier_may_goto_2(void)           { RUN(verifier_may_goto_2); }
+void test_verifier_memcontrol(void)	      { RUN(verifier_memcontrol); }
 void test_verifier_meta_access(void)          { RUN(verifier_meta_access); }
 void test_verifier_movsx(void)                 { RUN(verifier_movsx); }
 void test_verifier_mul(void)                  { RUN(verifier_mul); }
diff --git a/tools/testing/selftests/bpf/progs/verifier_memcontrol.c b/tools/testing/selftests/bpf/progs/verifier_memcontrol.c
new file mode 100644
index 000000000000..13564956f621
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/verifier_memcontrol.c
@@ -0,0 +1,32 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2026 Google LLC.
+ */
+
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include "bpf_misc.h"
+
+SEC("syscall")
+__success __retval(0)
+int root_mem_cgroup_default_trusted(void *ctx)
+{
+	unsigned long usage;
+	struct mem_cgroup *root_mem_cgroup;
+
+	root_mem_cgroup = bpf_get_root_mem_cgroup();
+	if (!root_mem_cgroup)
+		return 1;
+
+	/*
+	 * BPF kfunc bpf_get_root_mem_cgroup() returns a PTR_TO_BTF_ID |
+	 * PTR_TRUSTED | PTR_MAYBE_NULL, therefore it should be accepted when
+	 * passed to a BPF kfunc only accepting KF_TRUSTED_ARGS.
+	 */
+	usage = bpf_mem_cgroup_usage(root_mem_cgroup);
+	__sink(usage);
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
-- 
cgit v1.2.3


From c656807675e09604af09a4b9f3ea466af91b7b7a Mon Sep 17 00:00:00 2001
From: Anton Protopopov <a.s.protopopov@gmail.com>
Date: Sun, 11 Jan 2026 15:30:47 +0000
Subject: selftests/bpf: Add tests for loading insn array values with offsets

The ldimm64 instruction for map value supports an offset.
For insn array maps it wasn't tested before, as normally
such instructions aren't generated. However, this is still
possible to pass such instructions, so add a few tests to
check that correct offsets work properly and incorrect
offsets are rejected.

Signed-off-by: Anton Protopopov <a.s.protopopov@gmail.com>
Link: https://lore.kernel.org/r/20260111153047.8388-4-a.s.protopopov@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/prog_tests/bpf_gotox.c | 208 +++++++++++++++++++++
 1 file changed, 208 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_gotox.c b/tools/testing/selftests/bpf/prog_tests/bpf_gotox.c
index d138cc7b1bda..75b0cf2467ab 100644
--- a/tools/testing/selftests/bpf/prog_tests/bpf_gotox.c
+++ b/tools/testing/selftests/bpf/prog_tests/bpf_gotox.c
@@ -240,6 +240,208 @@ static void check_nonstatic_global_other_sec(struct bpf_gotox *skel)
 	bpf_link__destroy(link);
 }
 
+/*
+ * The following subtests do not use skeleton rather than to check
+ * if the test should be skipped.
+ */
+
+static int create_jt_map(__u32 max_entries)
+{
+	const char *map_name = "jt";
+	__u32 key_size = 4;
+	__u32 value_size = sizeof(struct bpf_insn_array_value);
+
+	return bpf_map_create(BPF_MAP_TYPE_INSN_ARRAY, map_name,
+			      key_size, value_size, max_entries, NULL);
+}
+
+static int prog_load(struct bpf_insn *insns, __u32 insn_cnt)
+{
+	return bpf_prog_load(BPF_PROG_TYPE_RAW_TRACEPOINT, NULL, "GPL", insns, insn_cnt, NULL);
+}
+
+static int __check_ldimm64_off_prog_load(__u32 max_entries, __u32 off)
+{
+	struct bpf_insn insns[] = {
+		BPF_LD_IMM64_RAW(BPF_REG_1, BPF_PSEUDO_MAP_VALUE, 0),
+		BPF_MOV64_IMM(BPF_REG_0, 0),
+		BPF_EXIT_INSN(),
+	};
+	int map_fd, ret;
+
+	map_fd = create_jt_map(max_entries);
+	if (!ASSERT_GE(map_fd, 0, "create_jt_map"))
+		return -1;
+	if (!ASSERT_EQ(bpf_map_freeze(map_fd), 0, "bpf_map_freeze")) {
+		close(map_fd);
+		return -1;
+	}
+
+	insns[0].imm = map_fd;
+	insns[1].imm = off;
+
+	ret = prog_load(insns, ARRAY_SIZE(insns));
+	close(map_fd);
+	return ret;
+}
+
+/*
+ * Check that loads from an instruction array map are only allowed with offsets
+ * which are multiples of 8 and do not point to outside of the map.
+ */
+static void check_ldimm64_off_load(struct bpf_gotox *skel __always_unused)
+{
+	const __u32 max_entries = 10;
+	int prog_fd;
+	__u32 off;
+
+	for (off = 0; off < max_entries; off++) {
+		prog_fd = __check_ldimm64_off_prog_load(max_entries, off * 8);
+		if (!ASSERT_GE(prog_fd, 0, "__check_ldimm64_off_prog_load"))
+			return;
+		close(prog_fd);
+	}
+
+	prog_fd = __check_ldimm64_off_prog_load(max_entries, 7 /* not a multiple of 8 */);
+	if (!ASSERT_EQ(prog_fd, -EACCES, "__check_ldimm64_off_prog_load: should be -EACCES")) {
+		close(prog_fd);
+		return;
+	}
+
+	prog_fd = __check_ldimm64_off_prog_load(max_entries, max_entries * 8 /* too large */);
+	if (!ASSERT_EQ(prog_fd, -EACCES, "__check_ldimm64_off_prog_load: should be -EACCES")) {
+		close(prog_fd);
+		return;
+	}
+}
+
+static int __check_ldimm64_gotox_prog_load(struct bpf_insn *insns,
+					   __u32 insn_cnt,
+					   __u32 off1, __u32 off2)
+{
+	const __u32 values[] = {5, 7, 9, 11, 13, 15};
+	const __u32 max_entries = ARRAY_SIZE(values);
+	struct bpf_insn_array_value val = {};
+	int map_fd, ret, i;
+
+	map_fd = create_jt_map(max_entries);
+	if (!ASSERT_GE(map_fd, 0, "create_jt_map"))
+		return -1;
+
+	for (i = 0; i < max_entries; i++) {
+		val.orig_off = values[i];
+		if (!ASSERT_EQ(bpf_map_update_elem(map_fd, &i, &val, 0), 0,
+			       "bpf_map_update_elem")) {
+			close(map_fd);
+			return -1;
+		}
+	}
+
+	if (!ASSERT_EQ(bpf_map_freeze(map_fd), 0, "bpf_map_freeze")) {
+		close(map_fd);
+		return -1;
+	}
+
+	/* r1 = &map + offset1 */
+	insns[0].imm = map_fd;
+	insns[1].imm = off1;
+
+	/* r1 += off2 */
+	insns[2].imm = off2;
+
+	ret = prog_load(insns, insn_cnt);
+	close(map_fd);
+	return ret;
+}
+
+static void reject_offsets(struct bpf_insn *insns, __u32 insn_cnt, __u32 off1, __u32 off2)
+{
+	int prog_fd;
+
+	prog_fd = __check_ldimm64_gotox_prog_load(insns, insn_cnt, off1, off2);
+	if (!ASSERT_EQ(prog_fd, -EACCES, "__check_ldimm64_gotox_prog_load"))
+		close(prog_fd);
+}
+
+/*
+ * Verify a bit more complex programs which include indirect jumps
+ * and with jump tables loaded with a non-zero offset
+ */
+static void check_ldimm64_off_gotox(struct bpf_gotox *skel __always_unused)
+{
+	struct bpf_insn insns[] = {
+		/*
+		 * The following instructions perform an indirect jump to
+		 * labels below. Thus valid offsets in the map are {0,...,5}.
+		 * The program rewrites the offsets in the instructions below:
+		 *     r1 = &map + offset1
+		 *     r1 += offset2
+		 *     r1 = *r1
+		 *     gotox r1
+		 */
+		BPF_LD_IMM64_RAW(BPF_REG_1, BPF_PSEUDO_MAP_VALUE, 0),
+		BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 0),
+		BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, 0),
+		BPF_RAW_INSN(BPF_JMP | BPF_JA | BPF_X, BPF_REG_1, 0, 0, 0),
+
+		/* case 0: */
+		BPF_MOV64_IMM(BPF_REG_0, 0),
+		BPF_EXIT_INSN(),
+		/* case 1: */
+		BPF_MOV64_IMM(BPF_REG_0, 1),
+		BPF_EXIT_INSN(),
+		/* case 2: */
+		BPF_MOV64_IMM(BPF_REG_0, 2),
+		BPF_EXIT_INSN(),
+		/* case 3: */
+		BPF_MOV64_IMM(BPF_REG_0, 3),
+		BPF_EXIT_INSN(),
+		/* case 4: */
+		BPF_MOV64_IMM(BPF_REG_0, 4),
+		BPF_EXIT_INSN(),
+		/* default: */
+		BPF_MOV64_IMM(BPF_REG_0, 5),
+		BPF_EXIT_INSN(),
+	};
+	int prog_fd, err;
+	__u32 off1, off2;
+
+	/* allow all combinations off1 + off2 < 6 */
+	for (off1 = 0; off1 < 6; off1++) {
+		for (off2 = 0; off1 + off2 < 6; off2++) {
+			LIBBPF_OPTS(bpf_test_run_opts, topts);
+
+			prog_fd = __check_ldimm64_gotox_prog_load(insns, ARRAY_SIZE(insns),
+								  off1 * 8, off2 * 8);
+			if (!ASSERT_GE(prog_fd, 0, "__check_ldimm64_gotox_prog_load"))
+				return;
+
+			err = bpf_prog_test_run_opts(prog_fd, &topts);
+			if (!ASSERT_OK(err, "test_run_opts err")) {
+				close(prog_fd);
+				return;
+			}
+
+			if (!ASSERT_EQ(topts.retval, off1 + off2, "test_run_opts retval")) {
+				close(prog_fd);
+				return;
+			}
+
+			close(prog_fd);
+		}
+	}
+
+	/* reject off1 + off2 >= 6 */
+	reject_offsets(insns, ARRAY_SIZE(insns), 8 * 3, 8 * 3);
+	reject_offsets(insns, ARRAY_SIZE(insns), 8 * 7, 8 * 0);
+	reject_offsets(insns, ARRAY_SIZE(insns), 8 * 0, 8 * 7);
+
+	/* reject (off1 + off2) % 8 != 0 */
+	reject_offsets(insns, ARRAY_SIZE(insns), 3, 3);
+	reject_offsets(insns, ARRAY_SIZE(insns), 7, 0);
+	reject_offsets(insns, ARRAY_SIZE(insns), 0, 7);
+}
+
 void test_bpf_gotox(void)
 {
 	struct bpf_gotox *skel;
@@ -288,5 +490,11 @@ void test_bpf_gotox(void)
 	if (test__start_subtest("one-map-two-jumps"))
 		__subtest(skel, check_one_map_two_jumps);
 
+	if (test__start_subtest("check-ldimm64-off"))
+		__subtest(skel, check_ldimm64_off_load);
+
+	if (test__start_subtest("check-ldimm64-off-gotox"))
+		__subtest(skel, check_ldimm64_off_gotox);
+
 	bpf_gotox__destroy(skel);
 }
-- 
cgit v1.2.3


From 7158fc54b2c6f124eec0d7cd13bff69da0172e59 Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
Date: Tue, 30 Dec 2025 08:08:44 +0100
Subject: vdso: Remove struct getcpu_cache
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The cache parameter of getcpu() is useless nowadays for various reasons.

  * It is never passed by userspace for either the vDSO or syscalls.
  * It is never used by the kernel.
  * It could not be made to work on the current vDSO architecture.
  * The structure definition is not part of the UAPI headers.
  * vdso_getcpu() is superseded by restartable sequences in any case.

Remove the struct and its header.

As a side-effect this gets rid of an unwanted inclusion of the linux/
header namespace from vDSO code.

[ tglx: Adapt to s390 upstream changes */

Signed-off-by: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@kernel.org>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Heiko Carstens <hca@linux.ibm.com> # s390
Link: https://patch.msgid.link/20251230-getcpu_cache-v3-1-fb9c5f880ebe@linutronix.de
---
 arch/loongarch/vdso/vgetcpu.c                   |  5 ++---
 arch/s390/kernel/vdso/getcpu.c                  |  3 +--
 arch/s390/kernel/vdso/vdso.h                    |  4 +---
 arch/x86/entry/vdso/vgetcpu.c                   |  5 ++---
 arch/x86/include/asm/vdso/processor.h           |  4 +---
 include/linux/getcpu.h                          | 19 -------------------
 include/linux/syscalls.h                        |  3 +--
 kernel/sys.c                                    |  4 +---
 tools/testing/selftests/vDSO/vdso_test_getcpu.c |  4 +---
 9 files changed, 10 insertions(+), 41 deletions(-)
 delete mode 100644 include/linux/getcpu.h

(limited to 'tools')

diff --git a/arch/loongarch/vdso/vgetcpu.c b/arch/loongarch/vdso/vgetcpu.c
index 73af49242ecd..6f054ec898c7 100644
--- a/arch/loongarch/vdso/vgetcpu.c
+++ b/arch/loongarch/vdso/vgetcpu.c
@@ -4,7 +4,6 @@
  */
 
 #include <asm/vdso.h>
-#include <linux/getcpu.h>
 
 static __always_inline int read_cpu_id(void)
 {
@@ -28,8 +27,8 @@ static __always_inline int read_cpu_id(void)
 }
 
 extern
-int __vdso_getcpu(unsigned int *cpu, unsigned int *node, struct getcpu_cache *unused);
-int __vdso_getcpu(unsigned int *cpu, unsigned int *node, struct getcpu_cache *unused)
+int __vdso_getcpu(unsigned int *cpu, unsigned int *node, void *unused);
+int __vdso_getcpu(unsigned int *cpu, unsigned int *node, void *unused)
 {
 	int cpu_id;
 
diff --git a/arch/s390/kernel/vdso/getcpu.c b/arch/s390/kernel/vdso/getcpu.c
index 5c5d4a848b76..1e17665616c5 100644
--- a/arch/s390/kernel/vdso/getcpu.c
+++ b/arch/s390/kernel/vdso/getcpu.c
@@ -2,11 +2,10 @@
 /* Copyright IBM Corp. 2020 */
 
 #include <linux/compiler.h>
-#include <linux/getcpu.h>
 #include <asm/timex.h>
 #include "vdso.h"
 
-int __s390_vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused)
+int __s390_vdso_getcpu(unsigned *cpu, unsigned *node, void *unused)
 {
 	union tod_clock clk;
 
diff --git a/arch/s390/kernel/vdso/vdso.h b/arch/s390/kernel/vdso/vdso.h
index 8cff033dd854..1fe52a6f5a56 100644
--- a/arch/s390/kernel/vdso/vdso.h
+++ b/arch/s390/kernel/vdso/vdso.h
@@ -4,9 +4,7 @@
 
 #include <vdso/datapage.h>
 
-struct getcpu_cache;
-
-int __s390_vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused);
+int __s390_vdso_getcpu(unsigned *cpu, unsigned *node, void *unused);
 int __s390_vdso_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz);
 int __s390_vdso_clock_gettime(clockid_t clock, struct __kernel_timespec *ts);
 int __s390_vdso_clock_getres(clockid_t clock, struct __kernel_timespec *ts);
diff --git a/arch/x86/entry/vdso/vgetcpu.c b/arch/x86/entry/vdso/vgetcpu.c
index e4640306b2e3..6381b472b7c5 100644
--- a/arch/x86/entry/vdso/vgetcpu.c
+++ b/arch/x86/entry/vdso/vgetcpu.c
@@ -6,17 +6,16 @@
  */
 
 #include <linux/kernel.h>
-#include <linux/getcpu.h>
 #include <asm/segment.h>
 #include <vdso/processor.h>
 
 notrace long
-__vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused)
+__vdso_getcpu(unsigned *cpu, unsigned *node, void *unused)
 {
 	vdso_read_cpunode(cpu, node);
 
 	return 0;
 }
 
-long getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
+long getcpu(unsigned *cpu, unsigned *node, void *tcache)
 	__attribute__((weak, alias("__vdso_getcpu")));
diff --git a/arch/x86/include/asm/vdso/processor.h b/arch/x86/include/asm/vdso/processor.h
index 7000aeb59aa2..93e0e24e5cb4 100644
--- a/arch/x86/include/asm/vdso/processor.h
+++ b/arch/x86/include/asm/vdso/processor.h
@@ -18,9 +18,7 @@ static __always_inline void cpu_relax(void)
 	native_pause();
 }
 
-struct getcpu_cache;
-
-notrace long __vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused);
+notrace long __vdso_getcpu(unsigned *cpu, unsigned *node, void *unused);
 
 #endif /* __ASSEMBLER__ */
 
diff --git a/include/linux/getcpu.h b/include/linux/getcpu.h
deleted file mode 100644
index c304dcdb4eac..000000000000
--- a/include/linux/getcpu.h
+++ /dev/null
@@ -1,19 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _LINUX_GETCPU_H
-#define _LINUX_GETCPU_H 1
-
-/* Cache for getcpu() to speed it up. Results might be a short time
-   out of date, but will be faster.
-
-   User programs should not refer to the contents of this structure.
-   I repeat they should not refer to it. If they do they will break
-   in future kernels.
-
-   It is only a private cache for vgetcpu(). It will change in future kernels.
-   The user program must store this information per thread (__thread)
-   If you want 100% accurate information pass NULL instead. */
-struct getcpu_cache {
-	unsigned long blob[128 / sizeof(long)];
-};
-
-#endif
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index cf84d98964b2..23704e006afd 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -59,7 +59,6 @@ struct compat_stat;
 struct old_timeval32;
 struct robust_list_head;
 struct futex_waitv;
-struct getcpu_cache;
 struct old_linux_dirent;
 struct perf_event_attr;
 struct file_handle;
@@ -718,7 +717,7 @@ asmlinkage long sys_getrusage(int who, struct rusage __user *ru);
 asmlinkage long sys_umask(int mask);
 asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
 			unsigned long arg4, unsigned long arg5);
-asmlinkage long sys_getcpu(unsigned __user *cpu, unsigned __user *node, struct getcpu_cache __user *cache);
+asmlinkage long sys_getcpu(unsigned __user *cpu, unsigned __user *node, void __user *cache);
 asmlinkage long sys_gettimeofday(struct __kernel_old_timeval __user *tv,
 				struct timezone __user *tz);
 asmlinkage long sys_settimeofday(struct __kernel_old_timeval __user *tv,
diff --git a/kernel/sys.c b/kernel/sys.c
index 8b58eece4e58..f1780ab132a3 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -31,7 +31,6 @@
 #include <linux/tty.h>
 #include <linux/signal.h>
 #include <linux/cn_proc.h>
-#include <linux/getcpu.h>
 #include <linux/task_io_accounting_ops.h>
 #include <linux/seccomp.h>
 #include <linux/cpu.h>
@@ -2876,8 +2875,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 	return error;
 }
 
-SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep,
-		struct getcpu_cache __user *, unused)
+SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep, void __user *, unused)
 {
 	int err = 0;
 	int cpu = raw_smp_processor_id();
diff --git a/tools/testing/selftests/vDSO/vdso_test_getcpu.c b/tools/testing/selftests/vDSO/vdso_test_getcpu.c
index bea8ad54da11..3fe49cbdae98 100644
--- a/tools/testing/selftests/vDSO/vdso_test_getcpu.c
+++ b/tools/testing/selftests/vDSO/vdso_test_getcpu.c
@@ -16,9 +16,7 @@
 #include "vdso_config.h"
 #include "vdso_call.h"
 
-struct getcpu_cache;
-typedef long (*getcpu_t)(unsigned int *, unsigned int *,
-			 struct getcpu_cache *);
+typedef long (*getcpu_t)(unsigned int *, unsigned int *, void *);
 
 int main(int argc, char **argv)
 {
-- 
cgit v1.2.3


From 1d7cf255eefbb479d0eea9aa3b6372a1e52f8c62 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Thu, 16 Oct 2025 13:51:25 -0700
Subject: tools headers: Update the linux/unaligned.h copy with the kernel
 sources

To pick up the changes in:

  vdso: Switch get/put_unaligned() from packed struct to memcpy

As the code is dependent on __unqual_scalar_typeof, update also the tools
version of compiler_types.h to include this.

Signed-off-by: Ian Rogers <irogers@google.com>
Signed-off-by: Thomas Gleixner <tglx@kernel.org>
Link: https://patch.msgid.link/20251016205126.2882625-4-irogers@google.com
---
 tools/include/linux/compiler_types.h | 22 +++++++++++++++++++
 tools/include/vdso/unaligned.h       | 41 ++++++++++++++++++++++++++++++------
 2 files changed, 57 insertions(+), 6 deletions(-)

(limited to 'tools')

diff --git a/tools/include/linux/compiler_types.h b/tools/include/linux/compiler_types.h
index d09f9dc172a4..890982283a5e 100644
--- a/tools/include/linux/compiler_types.h
+++ b/tools/include/linux/compiler_types.h
@@ -40,4 +40,26 @@
 #define asm_goto_output(x...) asm goto(x)
 #endif
 
+/*
+ * __unqual_scalar_typeof(x) - Declare an unqualified scalar type, leaving
+ *			       non-scalar types unchanged.
+ */
+/*
+ * Prefer C11 _Generic for better compile-times and simpler code. Note: 'char'
+ * is not type-compatible with 'signed char', and we define a separate case.
+ */
+#define __scalar_type_to_expr_cases(type)				\
+		unsigned type:	(unsigned type)0,			\
+		signed type:	(signed type)0
+
+#define __unqual_scalar_typeof(x) typeof(				\
+		_Generic((x),						\
+			 char:	(char)0,				\
+			 __scalar_type_to_expr_cases(char),		\
+			 __scalar_type_to_expr_cases(short),		\
+			 __scalar_type_to_expr_cases(int),		\
+			 __scalar_type_to_expr_cases(long),		\
+			 __scalar_type_to_expr_cases(long long),	\
+			 default: (x)))
+
 #endif /* __LINUX_COMPILER_TYPES_H */
diff --git a/tools/include/vdso/unaligned.h b/tools/include/vdso/unaligned.h
index ff0c06b6513e..9076483c9fbb 100644
--- a/tools/include/vdso/unaligned.h
+++ b/tools/include/vdso/unaligned.h
@@ -2,14 +2,43 @@
 #ifndef __VDSO_UNALIGNED_H
 #define __VDSO_UNALIGNED_H
 
-#define __get_unaligned_t(type, ptr) ({							\
-	const struct { type x; } __packed * __get_pptr = (typeof(__get_pptr))(ptr);	\
-	__get_pptr->x;									\
+#include <linux/compiler_types.h>
+
+/**
+ * __get_unaligned_t - read an unaligned value from memory.
+ * @type:	the type to load from the pointer.
+ * @ptr:	the pointer to load from.
+ *
+ * Use memcpy to affect an unaligned type sized load avoiding undefined behavior
+ * from approaches like type punning that require -fno-strict-aliasing in order
+ * to be correct. As type may be const, use __unqual_scalar_typeof to map to a
+ * non-const type - you can't memcpy into a const type. The
+ * __get_unaligned_ctrl_type gives __unqual_scalar_typeof its required
+ * expression rather than type, a pointer is used to avoid warnings about mixing
+ * the use of 0 and NULL. The void* cast silences ubsan warnings.
+ */
+#define __get_unaligned_t(type, ptr) ({					\
+	type *__get_unaligned_ctrl_type __always_unused = NULL;		\
+	__unqual_scalar_typeof(*__get_unaligned_ctrl_type) __get_unaligned_val; \
+	__builtin_memcpy(&__get_unaligned_val, (void *)(ptr),		\
+			 sizeof(__get_unaligned_val));			\
+	__get_unaligned_val;						\
 })
 
-#define __put_unaligned_t(type, val, ptr) do {						\
-	struct { type x; } __packed * __put_pptr = (typeof(__put_pptr))(ptr);		\
-	__put_pptr->x = (val);								\
+/**
+ * __put_unaligned_t - write an unaligned value to memory.
+ * @type:	the type of the value to store.
+ * @val:	the value to store.
+ * @ptr:	the pointer to store to.
+ *
+ * Use memcpy to affect an unaligned type sized store avoiding undefined
+ * behavior from approaches like type punning that require -fno-strict-aliasing
+ * in order to be correct. The void* cast silences ubsan warnings.
+ */
+#define __put_unaligned_t(type, val, ptr) do {				\
+	type __put_unaligned_val = (val);				\
+	__builtin_memcpy((void *)(ptr), &__put_unaligned_val,		\
+			 sizeof(__put_unaligned_val));			\
 } while (0)
 
 #endif /* __VDSO_UNALIGNED_H */
-- 
cgit v1.2.3


From 10a62a0611f5544d209446acfde5beb7b27773c7 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Thu, 16 Oct 2025 13:51:26 -0700
Subject: tools headers: Remove unneeded ignoring of warnings in unaligned.h

Now that get/put_unaligned() use memcpy() the -Wpacked and -Wattributes
warnings don't need disabling anymore.

Signed-off-by: Ian Rogers <irogers@google.com>
Signed-off-by: Thomas Gleixner <tglx@kernel.org>
Link: https://patch.msgid.link/20251016205126.2882625-5-irogers@google.com
---
 tools/include/linux/unaligned.h | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'tools')

diff --git a/tools/include/linux/unaligned.h b/tools/include/linux/unaligned.h
index 395a4464fe73..d51ddafed138 100644
--- a/tools/include/linux/unaligned.h
+++ b/tools/include/linux/unaligned.h
@@ -6,9 +6,6 @@
  * This is the most generic implementation of unaligned accesses
  * and should work almost anywhere.
  */
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wpacked"
-#pragma GCC diagnostic ignored "-Wattributes"
 #include <vdso/unaligned.h>
 
 #define get_unaligned(ptr)	__get_unaligned_t(typeof(*(ptr)), (ptr))
@@ -143,6 +140,5 @@ static inline u64 get_unaligned_be48(const void *p)
 {
 	return __get_unaligned_be48(p);
 }
-#pragma GCC diagnostic pop
 
 #endif /* __LINUX_UNALIGNED_H */
-- 
cgit v1.2.3


From f756ed82c62aa2725757ac011710492d4cc8c7d8 Mon Sep 17 00:00:00 2001
From: Yosry Ahmed <yosry.ahmed@linux.dev>
Date: Tue, 13 Jan 2026 17:14:56 +0000
Subject: KVM: selftests: Slightly simplify memstress_setup_nested()

Instead of calling memstress_setup_ept_mappings() only in the first
iteration in the loop, move it before the loop.

The call needed to happen within the loop before commit e40e72fec0de
("KVM: selftests: Stop passing VMX metadata to TDP mapping functions"),
as memstress_setup_ept_mappings() used to take in a pointer to vmx_pages
and pass it into tdp_identity_map_1g() (to get the EPT root GPA). This
is no longer the case, as tdp_identity_map_1g() gets the EPT root
through stage2 MMU.

Signed-off-by: Yosry Ahmed <yosry.ahmed@linux.dev>
Link: https://patch.msgid.link/20260113171456.2097312-1-yosry.ahmed@linux.dev
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 tools/testing/selftests/kvm/lib/x86/memstress.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/lib/x86/memstress.c b/tools/testing/selftests/kvm/lib/x86/memstress.c
index 86f4c5e4c430..f53414ba7103 100644
--- a/tools/testing/selftests/kvm/lib/x86/memstress.c
+++ b/tools/testing/selftests/kvm/lib/x86/memstress.c
@@ -110,16 +110,13 @@ void memstress_setup_nested(struct kvm_vm *vm, int nr_vcpus, struct kvm_vcpu *vc
 	TEST_REQUIRE(kvm_cpu_has_tdp());
 
 	vm_enable_tdp(vm);
+	memstress_setup_ept_mappings(vm);
 	for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) {
 		if (kvm_cpu_has(X86_FEATURE_VMX))
 			vcpu_alloc_vmx(vm, &nested_gva);
 		else
 			vcpu_alloc_svm(vm, &nested_gva);
 
-		/* The EPTs are shared across vCPUs, setup the mappings once */
-		if (vcpu_id == 0)
-			memstress_setup_ept_mappings(vm);
-
 		/*
 		 * Override the vCPU to run memstress_l1_guest_code() which will
 		 * bounce it into L2 before calling memstress_guest_code().
-- 
cgit v1.2.3


From 240156b25a397d3b26ef95b3f13bddb2db6038ab Mon Sep 17 00:00:00 2001
From: Manuel Hernández Méndez <manuel.hernandez@openchip.com>
Date: Thu, 4 Dec 2025 16:40:38 +0000
Subject: perf vendor events riscv: Add CVA6 JSON file
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch add the OpenHW Core-V CVA6 Risc-V JSON file.

For more info:

https://openhwfoundation.org/news/2023/11/07/openhw-group-announces-core-v-cva6-platform-project-for-risc-v-software-development-and-testing/

Signed-off-by: Manuel Hernández Méndez <manuel.hernandez@openchip.com>
Reviewed-by: Ian Rogers <irogers@google.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/pmu-events/arch/riscv/mapfile.csv       |  1 +
 .../arch/riscv/openhwgroup/cva6/firmware.json      | 68 ++++++++++++++++++++++
 .../arch/riscv/openhwgroup/cva6/instructions.json  | 47 +++++++++++++++
 .../arch/riscv/openhwgroup/cva6/memory.json        | 42 +++++++++++++
 .../arch/riscv/openhwgroup/cva6/microarch.json     | 27 +++++++++
 5 files changed, 185 insertions(+)
 create mode 100644 tools/perf/pmu-events/arch/riscv/openhwgroup/cva6/firmware.json
 create mode 100644 tools/perf/pmu-events/arch/riscv/openhwgroup/cva6/instructions.json
 create mode 100644 tools/perf/pmu-events/arch/riscv/openhwgroup/cva6/memory.json
 create mode 100644 tools/perf/pmu-events/arch/riscv/openhwgroup/cva6/microarch.json

(limited to 'tools')

diff --git a/tools/perf/pmu-events/arch/riscv/mapfile.csv b/tools/perf/pmu-events/arch/riscv/mapfile.csv
index d5eea7f9aa9a..87cfb0e0849f 100644
--- a/tools/perf/pmu-events/arch/riscv/mapfile.csv
+++ b/tools/perf/pmu-events/arch/riscv/mapfile.csv
@@ -21,5 +21,6 @@
 0x489-0x8000000000000[1-6]08-0x[9b][[:xdigit:]]+,v1,sifive/p650,core
 0x5b7-0x0-0x0,v1,thead/c900-legacy,core
 0x5b7-0x80000000090c0d00-0x2047000,v1,thead/c900-legacy,core
+0x602-0x3-0x0,v1,openhwgroup/cva6,core
 0x67e-0x80000000db0000[89]0-0x[[:xdigit:]]+,v1,starfive/dubhe-80,core
 0x31e-0x8000000000008a45-0x[[:xdigit:]]+,v1,andes/ax45,core
diff --git a/tools/perf/pmu-events/arch/riscv/openhwgroup/cva6/firmware.json b/tools/perf/pmu-events/arch/riscv/openhwgroup/cva6/firmware.json
new file mode 100644
index 000000000000..7149caec4f80
--- /dev/null
+++ b/tools/perf/pmu-events/arch/riscv/openhwgroup/cva6/firmware.json
@@ -0,0 +1,68 @@
+[
+  {
+    "ArchStdEvent": "FW_MISALIGNED_LOAD"
+  },
+  {
+    "ArchStdEvent": "FW_MISALIGNED_STORE"
+  },
+  {
+    "ArchStdEvent": "FW_ACCESS_LOAD"
+  },
+  {
+    "ArchStdEvent": "FW_ACCESS_STORE"
+  },
+  {
+    "ArchStdEvent": "FW_ILLEGAL_INSN"
+  },
+  {
+    "ArchStdEvent": "FW_SET_TIMER"
+  },
+  {
+    "ArchStdEvent": "FW_IPI_SENT"
+  },
+  {
+    "ArchStdEvent": "FW_IPI_RECEIVED"
+  },
+  {
+    "ArchStdEvent": "FW_FENCE_I_SENT"
+  },
+  {
+    "ArchStdEvent": "FW_FENCE_I_RECEIVED"
+  },
+  {
+    "ArchStdEvent": "FW_SFENCE_VMA_SENT"
+  },
+  {
+    "ArchStdEvent": "FW_SFENCE_VMA_RECEIVED"
+  },
+  {
+    "ArchStdEvent": "FW_SFENCE_VMA_ASID_SENT"
+  },
+  {
+    "ArchStdEvent": "FW_SFENCE_VMA_ASID_RECEIVED"
+  },
+  {
+    "ArchStdEvent": "FW_HFENCE_GVMA_SENT"
+  },
+  {
+    "ArchStdEvent": "FW_HFENCE_GVMA_RECEIVED"
+  },
+  {
+    "ArchStdEvent": "FW_HFENCE_GVMA_VMID_SENT"
+  },
+  {
+    "ArchStdEvent": "FW_HFENCE_GVMA_VMID_RECEIVED"
+  },
+  {
+    "ArchStdEvent": "FW_HFENCE_VVMA_SENT"
+  },
+  {
+    "ArchStdEvent": "FW_HFENCE_VVMA_RECEIVED"
+  },
+  {
+    "ArchStdEvent": "FW_HFENCE_VVMA_ASID_SENT"
+  },
+  {
+    "ArchStdEvent": "FW_HFENCE_VVMA_ASID_RECEIVED"
+  }
+]
diff --git a/tools/perf/pmu-events/arch/riscv/openhwgroup/cva6/instructions.json b/tools/perf/pmu-events/arch/riscv/openhwgroup/cva6/instructions.json
new file mode 100644
index 000000000000..c38f6c97cf1f
--- /dev/null
+++ b/tools/perf/pmu-events/arch/riscv/openhwgroup/cva6/instructions.json
@@ -0,0 +1,47 @@
+[
+  {
+    "EventName": "LOAD_INSTRUCTIONS_RETIRED",
+    "EventCode": "0x5",
+    "BriefDescription": "number of data memory load instructions retired"
+  },
+  {
+    "EventName": "STORE_INSTRUCTIONS_RETIRED",
+    "EventCode": "0x6",
+    "BriefDescription": "number of data memory store instructions retired"
+  },
+  {
+    "EventName": "EXCEPTIONS",
+    "EventCode": "0x7",
+    "BriefDescription": "valid exceptions encountered"
+  },
+  {
+    "EventName": "EXCEPTION_HANDLER_RETURNS",
+    "EventCode": "0x8",
+    "BriefDescription": "return from an exception"
+  },
+  {
+    "EventName": "BRANCH_INSTRUCTIONS_RETIRED",
+    "EventCode": "0x9",
+    "BriefDescription": "number of branch instructions encountered retired"
+  },
+  {
+    "EventName": "CALL_INSTRUCTIONS_RETIRED",
+    "EventCode": "0xC",
+    "BriefDescription": "number of call instructions retired"
+  },
+  {
+    "EventName": "RETURN_INSTRUCTIONS_RETIRED",
+    "EventCode": "0xD",
+    "BriefDescription": "number of return instructions retired"
+  },
+  {
+    "EventName": "INTEGER_INSTRUCTIONS_RETIRED",
+    "EventCode": "0x14",
+    "BriefDescription": "number of integer instructions retired"
+  },
+  {
+    "EventName": "FLOATING_POINT_INSTRUCTIONS_RETIRED",
+    "EventCode": "0x15",
+    "BriefDescription": "number of floating point instructions retired"
+  }
+]
diff --git a/tools/perf/pmu-events/arch/riscv/openhwgroup/cva6/memory.json b/tools/perf/pmu-events/arch/riscv/openhwgroup/cva6/memory.json
new file mode 100644
index 000000000000..c4f376a0ee4e
--- /dev/null
+++ b/tools/perf/pmu-events/arch/riscv/openhwgroup/cva6/memory.json
@@ -0,0 +1,42 @@
+[
+  {
+    "EventName": "L1_I_CACHE_MISSES",
+    "EventCode": "0x1",
+    "BriefDescription": "number of misses in L1 I-Cache"
+  },
+  {
+    "EventName": "L1_D_CACHE_MISSES",
+    "EventCode": "0x2",
+    "BriefDescription": "number of misses in L1 D-Cache"
+  },
+  {
+    "EventName": "ITLB_MISSES",
+    "EventCode": "0x3",
+    "BriefDescription": "number of misses in ITLB"
+  },
+  {
+    "EventName": "DTLB_MISSES",
+    "EventCode": "0x4",
+    "BriefDescription": "number of misses in DTLB"
+  },
+  {
+    "EventName": "L1_I_CACHE_ACCESSES",
+    "EventCode": "0x10",
+    "BriefDescription": "number of accesses to instruction cache"
+  },
+  {
+    "EventName": "L1_D_CACHE_ACCESSES",
+    "EventCode": "0x11",
+    "BriefDescription": "number of accesses to data cache"
+  },
+  {
+    "EventName": "L1_CACHE_LINE_EVICTION",
+    "EventCode": "0x12",
+    "BriefDescription": "number of data cache line eviction"
+  },
+  {
+    "EventName": "ITLB_FLUSH",
+    "EventCode": "0x13",
+    "BriefDescription": "number of ITLB flushes"
+  }
+]
diff --git a/tools/perf/pmu-events/arch/riscv/openhwgroup/cva6/microarch.json b/tools/perf/pmu-events/arch/riscv/openhwgroup/cva6/microarch.json
new file mode 100644
index 000000000000..104e6e8197da
--- /dev/null
+++ b/tools/perf/pmu-events/arch/riscv/openhwgroup/cva6/microarch.json
@@ -0,0 +1,27 @@
+[
+  {
+    "EventName": "BRANCH_MISPREDICTS",
+    "EventCode": "0xA",
+    "BriefDescription": "number of branch mispredictions"
+  },
+  {
+    "EventName": "BRANCH_EXCEPTIONS",
+    "EventCode": "0xB",
+    "BriefDescription": "number of valid branch exceptions"
+  },
+  {
+    "EventName": "MSB_FULL",
+    "EventCode": "0xE",
+    "BriefDescription": "scoreboard is full"
+  },
+  {
+    "EventName": "INSTRUCTION_FETCH_EMPTY",
+    "EventCode": "0xF",
+    "BriefDescription": "number of invalid instructions in IF stage"
+  },
+  {
+    "EventName": "PIPELINE_STALL",
+    "EventCode": "0x16",
+    "BriefDescription": "number of cycles the pipeline is stalled during read operands"
+  }
+]
-- 
cgit v1.2.3


From 2c3cd43d27c1148fae05b50870f970ab24464fd5 Mon Sep 17 00:00:00 2001
From: Sandipan Das <sandipan.das@amd.com>
Date: Thu, 8 Jan 2026 13:22:17 +0530
Subject: perf vendor events amd: Add Zen 6 mapping

Add a regular expression in the map file so that appropriate JSON event
files are used for AMD Zen 6 processors. Restrict the regular expression
for AMD Zen 5 processors to known model ranges since they also belong to
Family 1Ah.

Reviewed-by: Ian Rogers <irogers@google.com>
Signed-off-by: Sandipan Das <sandipan.das@amd.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Ananth Narayan <ananth.narayan@amd.com>
Cc: Caleb Biggers <caleb.biggers@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ravi Bangoria <ravi.bangoria@amd.com>
Cc: Stephane Eranian <eranian@google.com>
[ Moved this one to the front of the series to keep the tree bisectable, as per Ian Rogers suggestion ]
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/pmu-events/arch/x86/mapfile.csv | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/perf/pmu-events/arch/x86/mapfile.csv b/tools/perf/pmu-events/arch/x86/mapfile.csv
index 3d0c57198056..149bbe7abaf5 100644
--- a/tools/perf/pmu-events/arch/x86/mapfile.csv
+++ b/tools/perf/pmu-events/arch/x86/mapfile.csv
@@ -43,4 +43,5 @@ AuthenticAMD-23-([12][0-9A-F]|[0-9A-F]),v2,amdzen1,core
 AuthenticAMD-23-[[:xdigit:]]+,v1,amdzen2,core
 AuthenticAMD-25-([245][[:xdigit:]]|[[:xdigit:]]),v1,amdzen3,core
 AuthenticAMD-25-[[:xdigit:]]+,v1,amdzen4,core
-AuthenticAMD-26-[[:xdigit:]]+,v1,amdzen5,core
+AuthenticAMD-26-([12467][[:xdigit:]]|[[:xdigit:]]),v1,amdzen5,core
+AuthenticAMD-26-[[:xdigit:]]+,v1,amdzen6,core
-- 
cgit v1.2.3


From 2f42fb0661d9a979800a506b6a91dc3a7d1fb162 Mon Sep 17 00:00:00 2001
From: Sandipan Das <sandipan.das@amd.com>
Date: Thu, 8 Jan 2026 13:22:14 +0530
Subject: perf vendor events amd: Add Zen 6 core events

Add core events taken from Section 1.5 "Core Performance Monitor
Counters" of the Performance Monitor Counters for AMD Family 1Ah Model
50h-57h Processors document available at the link below.

This constitutes events which capture information on op dispatch,
execution and retirement, branch prediction, L1 and L2 cache activity,
TLB activity, etc.

Reviewed-by: Ian Rogers <irogers@google.com>
Signed-off-by: Sandipan Das <sandipan.das@amd.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Ananth Narayan <ananth.narayan@amd.com>
Cc: Caleb Biggers <caleb.biggers@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ravi Bangoria <ravi.bangoria@amd.com>
Cc: Stephane Eranian <eranian@google.com>
Link: https://bugzilla.kernel.org/attachment.cgi?id=309149
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 .../arch/x86/amdzen6/branch-prediction.json        |   93 ++
 tools/perf/pmu-events/arch/x86/amdzen6/decode.json |  139 +++
 .../pmu-events/arch/x86/amdzen6/execution.json     |  192 ++++
 .../arch/x86/amdzen6/floating-point.json           | 1106 ++++++++++++++++++++
 .../pmu-events/arch/x86/amdzen6/inst-cache.json    |  120 +++
 .../perf/pmu-events/arch/x86/amdzen6/l2-cache.json |  326 ++++++
 .../pmu-events/arch/x86/amdzen6/load-store.json    |  523 +++++++++
 7 files changed, 2499 insertions(+)
 create mode 100644 tools/perf/pmu-events/arch/x86/amdzen6/branch-prediction.json
 create mode 100644 tools/perf/pmu-events/arch/x86/amdzen6/decode.json
 create mode 100644 tools/perf/pmu-events/arch/x86/amdzen6/execution.json
 create mode 100644 tools/perf/pmu-events/arch/x86/amdzen6/floating-point.json
 create mode 100644 tools/perf/pmu-events/arch/x86/amdzen6/inst-cache.json
 create mode 100644 tools/perf/pmu-events/arch/x86/amdzen6/l2-cache.json
 create mode 100644 tools/perf/pmu-events/arch/x86/amdzen6/load-store.json

(limited to 'tools')

diff --git a/tools/perf/pmu-events/arch/x86/amdzen6/branch-prediction.json b/tools/perf/pmu-events/arch/x86/amdzen6/branch-prediction.json
new file mode 100644
index 000000000000..dd70069f68ed
--- /dev/null
+++ b/tools/perf/pmu-events/arch/x86/amdzen6/branch-prediction.json
@@ -0,0 +1,93 @@
+[
+  {
+    "EventName": "bp_l1_tlb_miss_l2_tlb_hit",
+    "EventCode": "0x84",
+    "BriefDescription": "Instruction fetches that miss in the L1 ITLB but hit in the L2 ITLB."
+  },
+  {
+    "EventName": "bp_l1_tlb_miss_l2_tlb_miss.if4k",
+    "EventCode": "0x85",
+    "BriefDescription": "Instruction fetches that miss in both the L1 and L2 ITLBs (page-table walks requested) for 4k pages.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "bp_l1_tlb_miss_l2_tlb_miss.if2m",
+    "EventCode": "0x85",
+    "BriefDescription": "Instruction fetches that miss in both the L1 and L2 ITLBs (page-table walks requested) for 2M pages.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "bp_l1_tlb_miss_l2_tlb_miss.if1g",
+    "EventCode": "0x85",
+    "BriefDescription": "Instruction fetches that miss in both the L1 and L2 ITLBs (page-table walks requested) for 1G pages.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "bp_l1_tlb_miss_l2_tlb_miss.coalesced_4k",
+    "EventCode": "0x85",
+    "BriefDescription": "Instruction fetches that miss in both the L1 and L2 ITLBs (page-table walks requested) for coalesced pages (16k pages created from four adjacent 4k pages).",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "bp_l1_tlb_miss_l2_tlb_miss.all",
+    "EventCode": "0x85",
+    "BriefDescription": "Instruction fetches that miss in both the L1 and L2 ITLBs (page-table walks requested) for all page sizes.",
+    "UMask": "0x0f"
+  },
+  {
+    "EventName": "bp_pipe_correct",
+    "EventCode": "0x8b",
+    "BriefDescription": "Branch predictor pipeline flushes due to internal conditions such as a second level prediction structure."
+  },
+  {
+    "EventName": "bp_var_target_pred",
+    "EventCode": "0x8e",
+    "BriefDescription": "Indirect predictions (branch used the indirect predictor to make a prediction)."
+  },
+  {
+    "EventName": "bp_early_redir",
+    "EventCode": "0x91",
+    "BriefDescription": "Early redirects sent to branch predictor. This happens when either the decoder or dispatch logic is able to detect that the branch predictor needs to be redirected."
+  },
+  {
+    "EventName": "bp_l1_tlb_fetch_hit.if4k",
+    "EventCode": "0x94",
+    "BriefDescription": "Instruction fetches that hit in the L1 ITLB for 4k or coalesced pages (16k pages created from four adjacent 4k pages).",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "bp_l1_tlb_fetch_hit.if2m",
+    "EventCode": "0x94",
+    "BriefDescription": "Instruction fetches that hit in the L1 ITLB for 2M pages.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "bp_l1_tlb_fetch_hit.if1g",
+    "EventCode": "0x94",
+    "BriefDescription": "Instruction fetches that hit in the L1 ITLB for 1G pages.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "bp_l1_tlb_fetch_hit.all",
+    "EventCode": "0x94",
+    "BriefDescription": "Instruction fetches that hit in the L1 ITLB for all page sizes.",
+    "UMask": "0x07"
+  },
+  {
+    "EventName": "bp_fe_redir.resync",
+    "EventCode": "0x9f",
+    "BriefDescription": "Redirects of the pipeline frontend caused by resyncs. These are retire time pipeline restarts.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "bp_fe_redir.ex_redir",
+    "EventCode": "0x9f",
+    "BriefDescription": "Redirects of the pipeline frontend caused by mispredicts. These are used for branch direction correction and handling indirect branch target mispredicts.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "bp_fe_redir.all",
+    "EventCode": "0x9f",
+    "BriefDescription": "Redirects of the pipeline frontend caused by any reason."
+  }
+]
diff --git a/tools/perf/pmu-events/arch/x86/amdzen6/decode.json b/tools/perf/pmu-events/arch/x86/amdzen6/decode.json
new file mode 100644
index 000000000000..c5d37fbac948
--- /dev/null
+++ b/tools/perf/pmu-events/arch/x86/amdzen6/decode.json
@@ -0,0 +1,139 @@
+[
+  {
+    "EventName": "de_op_queue_empty",
+    "EventCode": "0xa9",
+    "BriefDescription": "Cycles where the op queue is empty. Such cycles indicate that the frontend is not delivering instructions fast enough."
+  },
+  {
+    "EventName": "de_src_op_disp.x86_decoder",
+    "EventCode": "0xaa",
+    "BriefDescription": "Ops dispatched from x86 decoder.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "de_src_op_disp.op_cache",
+    "EventCode": "0xaa",
+    "BriefDescription": "Ops dispatched from op cache.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "de_src_op_disp.all",
+    "EventCode": "0xaa",
+    "BriefDescription": "Ops dispatched from any source.",
+    "UMask": "0x07"
+  },
+  {
+    "EventName": "de_dis_ops_from_decoder.any_fp",
+    "EventCode": "0xab",
+    "BriefDescription": "Ops dispatched from the decoder to a floating-point unit.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "de_dis_ops_from_decoder.any_int",
+    "EventCode": "0xab",
+    "BriefDescription": "Ops dispatched from the decoder to an integer unit.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "de_disp_stall_cycles_dynamic_tokens_part1.int_phy_reg_file_rsrc_stall",
+    "EventCode": "0xae",
+    "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to integer physical register file resource stalls.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "de_dispatch_stall_cycle_dynamic_tokens_part1.load_queue_rsrc_stall",
+    "EventCode": "0xae",
+    "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to load queue token stalls.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "de_dispatch_stall_cycle_dynamic_tokens_part1.store_queue_rsrc_stall",
+    "EventCode": "0xae",
+    "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to store queue token stalls.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "de_dispatch_stall_cycle_dynamic_tokens_part1.taken_brnch_buffer_rsrc",
+    "EventCode": "0xae",
+    "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to taken branch buffer resource stalls.",
+    "UMask": "0x10"
+  },
+  {
+    "EventName": "de_dispatch_stall_cycle_dynamic_tokens_part1.fp_sch_rsrc_stall",
+    "EventCode": "0xae",
+    "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to floating-point non-schedulable queue token stalls.",
+    "UMask": "0x40"
+  },
+  {
+    "EventName": "de_dispatch_stall_cycle_dynamic_tokens_part2.int_sq0",
+    "EventCode": "0xaf",
+    "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to unavailability of integer scheduler 0 tokens.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "de_dispatch_stall_cycle_dynamic_tokens_part2.int_sq1",
+    "EventCode": "0xaf",
+    "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to unavailability of integer scheduler 1 tokens.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "de_dispatch_stall_cycle_dynamic_tokens_part2.int_sq2",
+    "EventCode": "0xaf",
+    "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to unavailability of integer scheduler 2 tokens.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "de_dispatch_stall_cycle_dynamic_tokens_part2.int_sq3",
+    "EventCode": "0xaf",
+    "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to unavailability of integer scheduler 3 tokens.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "de_dispatch_stall_cycle_dynamic_tokens_part2.int_sq4",
+    "EventCode": "0xaf",
+    "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to unavailability of integer scheduler 4 tokens.",
+    "UMask": "0x10"
+  },
+  {
+    "EventName": "de_dispatch_stall_cycle_dynamic_tokens_part2.int_sq5",
+    "EventCode": "0xaf",
+    "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to unavailability of integer scheduler 5 tokens.",
+    "UMask": "0x20"
+  },
+  {
+    "EventName": "de_dispatch_stall_cycle_dynamic_tokens_part2.ret_q",
+    "EventCode": "0xaf",
+    "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to unavailability of retire queue tokens.",
+    "UMask": "0x80"
+  },
+  {
+    "EventName": "de_dispatch_stall_cycle_dynamic_tokens_part2.all",
+    "EventCode": "0xaf",
+    "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to any token stalls.",
+    "UMask": "0xbf"
+  },
+  {
+    "EventName": "de_no_dispatch_per_slot.no_ops_from_frontend",
+    "EventCode": "0x1a0",
+    "BriefDescription": "Dispatch slots in each cycle that were empty because the frontend did not supply ops.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "de_no_dispatch_per_slot.backend_stalls",
+    "EventCode": "0x1a0",
+    "BriefDescription": "Dispatch slots in each cycle that were unused because of backend stalls.",
+    "UMask": "0x1e"
+  },
+  {
+    "EventName": "de_no_dispatch_per_slot.smt_contention",
+    "EventCode": "0x1a0",
+    "BriefDescription": "Dispatch slots in each cycle that were unused because the dispatch cycle was granted to the other SMT thread.",
+    "UMask": "0x60"
+  },
+  {
+    "EventName": "de_additional_resource_stalls.dispatch_stalls",
+    "EventCode": "0x1a2",
+    "BriefDescription": "Counts additional cycles where dispatch is stalled due to a lack of dispatch resources.",
+    "UMask": "0x30"
+  }
+]
diff --git a/tools/perf/pmu-events/arch/x86/amdzen6/execution.json b/tools/perf/pmu-events/arch/x86/amdzen6/execution.json
new file mode 100644
index 000000000000..1b80acc89b6f
--- /dev/null
+++ b/tools/perf/pmu-events/arch/x86/amdzen6/execution.json
@@ -0,0 +1,192 @@
+[
+  {
+    "EventName": "ex_ret_instr",
+    "EventCode": "0xc0",
+    "BriefDescription": "Retired instructions."
+  },
+  {
+    "EventName": "ex_ret_ops",
+    "EventCode": "0xc1",
+    "BriefDescription": "Retired macro-ops."
+  },
+  {
+    "EventName": "ex_ret_brn",
+    "EventCode": "0xc2",
+    "BriefDescription": "Retired branch instructions (all types of architectural control flow changes, including exceptions and interrupts)."
+  },
+  {
+    "EventName": "ex_ret_brn_misp",
+    "EventCode": "0xc3",
+    "BriefDescription": "Retired branch instructions that were mispredicted."
+  },
+  {
+    "EventName": "ex_ret_brn_tkn",
+    "EventCode": "0xc4",
+    "BriefDescription": "Retired taken branch instructions (all types of architectural control flow changes, including exceptions and interrupts)."
+  },
+  {
+    "EventName": "ex_ret_brn_tkn_misp",
+    "EventCode": "0xc5",
+    "BriefDescription": "Retired taken branch instructions that were mispredicted."
+  },
+  {
+    "EventName": "ex_ret_brn_far",
+    "EventCode": "0xc6",
+    "BriefDescription": "Retired far control transfers (far call, far jump, far return, IRET, SYSCALL and SYSRET, plus exceptions and interrupts). Far control transfers are not subject to branch prediction."
+  },
+  {
+    "EventName": "ex_ret_near_ret",
+    "EventCode": "0xc8",
+    "BriefDescription": "Retired near returns (RET or RET Iw)."
+  },
+  {
+    "EventName": "ex_ret_near_ret_mispred",
+    "EventCode": "0xc9",
+    "BriefDescription": "Retired near returns that were mispredicted. Each misprediction incurs the same penalty as that of a mispredicted conditional branch instruction."
+  },
+  {
+    "EventName": "ex_ret_brn_ind_misp",
+    "EventCode": "0xca",
+    "BriefDescription": "Retired indirect branch instructions that were mispredicted (only EX mispredicts). Each misprediction incurs the same penalty as that of a mispredicted conditional branch instruction."
+  },
+  {
+    "EventName": "ex_ret_brn_ind",
+    "EventCode": "0xcc",
+    "BriefDescription": "Retired indirect branch instructions."
+  },
+  {
+    "EventName": "ex_ret_brn_cond",
+    "EventCode": "0xd1",
+    "BriefDescription": "Retired conditional branch instructions."
+  },
+  {
+    "EventName": "ex_div_busy",
+    "EventCode": "0xd3",
+    "BriefDescription": "Cycles where the divider is busy."
+  },
+  {
+    "EventName": "ex_div_count",
+    "EventCode": "0xd4",
+    "BriefDescription": "Divide ops executed."
+  },
+  {
+    "EventName": "ex_no_retire.empty",
+    "EventCode": "0xd6",
+    "BriefDescription": "Cycles where the thread does not retire any ops due to a lack of valid ops in the retire queue (may be caused by front-end bottlenecks or pipeline redirects).",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "ex_no_retire.not_complete",
+    "EventCode": "0xd6",
+    "BriefDescription": "Cycles where the thread does not retire any ops as the oldest retire slot is waiting to be marked as completed.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "ex_no_retire.other",
+    "EventCode": "0xd6",
+    "BriefDescription": "Cycles where the thread does not retire any ops due to other reasons (retire breaks, traps, faults, etc.).",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "ex_no_retire.thread_not_selected",
+    "EventCode": "0xd6",
+    "BriefDescription": "Cycles where the thread does not retire any ops as thread arbitration did not select the current thread.",
+    "UMask": "0x10"
+  },
+  {
+    "EventName": "ex_no_retire.load_not_complete",
+    "EventCode": "0xd6",
+    "BriefDescription": "Cycles where the thread does not retire any ops due to missing load completion.",
+    "UMask": "0xa2"
+  },
+  {
+    "EventName": "ex_ret_ucode_instr",
+    "EventCode": "0x1c1",
+    "BriefDescription": "Retired microcoded instructions."
+  },
+  {
+    "EventName": "ex_ret_ucode_ops",
+    "EventCode": "0x1c2",
+    "BriefDescription": "Retired microcode ops."
+  },
+  {
+    "EventName": "ex_ret_brn_cond_misp",
+    "EventCode": "0x1c7",
+    "BriefDescription": "Retired conditional branch instructions that were mispredicted due to direction mismatch."
+  },
+  {
+    "EventName": "ex_ret_brn_uncond_ind_near_misp",
+    "EventCode": "0x1c8",
+    "BriefDescription": "Retired unconditional indirect near branch instructions that were mispredicted."
+  },
+  {
+    "EventName": "ex_ret_brn_uncond",
+    "EventCode": "0x1c9",
+    "BriefDescription": "Retired unconditional branch instructions."
+  },
+  {
+    "EventName": "ex_tagged_ibs_ops.tagged",
+    "EventCode": "0x1cf",
+    "BriefDescription": "Execution IBS tagged ops.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "ex_tagged_ibs_ops.tagged_ret",
+    "EventCode": "0x1cf",
+    "BriefDescription": "Execution IBS tagged ops that retired.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "ex_tagged_ibs_ops.rollovers",
+    "EventCode": "0x1cf",
+    "BriefDescription": "Execution IBS periodic counter rollovers due to a previous tagged op not being IBS complete.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "ex_tagged_ibs_ops.filtered",
+    "EventCode": "0x1cf",
+    "BriefDescription": "Execution IBS tagged ops that retired but were discarded due to IBS filtering.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "ex_tagged_ibs_ops.valid",
+    "EventCode": "0x1cf",
+    "BriefDescription": "Execution IBS tagged ops that resulted in a valid sample and an IBS interrupt.",
+    "UMask": "0x10"
+  },
+  {
+    "EventName": "ex_ret_fused_instr",
+    "EventCode": "0x1d0",
+    "BriefDescription": "Retired fused instructions."
+  },
+  {
+    "EventName": "ex_mprof_ibs_ops.tagged",
+    "EventCode": "0x2c0",
+    "BriefDescription": "Memory Profiler IBS tagged ops.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "ex_mprof_ibs_ops.tagged_ret",
+    "EventCode": "0x2c0",
+    "BriefDescription": "Memory Profiler IBS tagged ops that retired.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "ex_mprof_ibs_ops.rollovers",
+    "EventCode": "0x2c0",
+    "BriefDescription": "Memory Profiler IBS periodic counter rollovers due to a previous tagged op not being IBS complete.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "ex_mprof_ibs_ops.filtered",
+    "EventCode": "0x2c0",
+    "BriefDescription": "Memory Profiler IBS tagged ops that retired but were discarded due to IBS filtering.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "ex_mprof_ibs_ops.valid",
+    "EventCode": "0x2c0",
+    "BriefDescription": "Memory Profiler IBS tagged ops that resulted in a valid sample and an IBS interrupt.",
+    "UMask": "0x10"
+  }
+]
diff --git a/tools/perf/pmu-events/arch/x86/amdzen6/floating-point.json b/tools/perf/pmu-events/arch/x86/amdzen6/floating-point.json
new file mode 100644
index 000000000000..03cb039434de
--- /dev/null
+++ b/tools/perf/pmu-events/arch/x86/amdzen6/floating-point.json
@@ -0,0 +1,1106 @@
+[
+  {
+    "EventName": "fp_ret_x87_fp_ops.add_sub_ops",
+    "EventCode": "0x02",
+    "BriefDescription": "Retired x87 floating-point add and subtract uops.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "fp_ret_x87_fp_ops.mul_ops",
+    "EventCode": "0x02",
+    "BriefDescription": "Retired x87 floating-point multiply uops.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "fp_ret_x87_fp_ops.div_sqrt_ops",
+    "EventCode": "0x02",
+    "BriefDescription": "Retired x87 floating-point divide and square root uops.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "fp_ret_x87_fp_ops.all",
+    "EventCode": "0x02",
+    "BriefDescription": "Retired x87 floating-point uops of all types.",
+    "UMask": "0x07"
+  },
+  {
+    "EventName": "fp_ret_sse_avx_ops.add_sub_flops",
+    "EventCode": "0x03",
+    "BriefDescription": "Retired SSE and AVX add and subtract FLOPs.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "fp_ret_sse_avx_ops.mult_flops",
+    "EventCode": "0x03",
+    "BriefDescription": "Retired SSE and AVX multiply FLOPs.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "fp_ret_sse_avx_ops.div_flops",
+    "EventCode": "0x03",
+    "BriefDescription": "Retired SSE and AVX divide and square root FLOPs.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "fp_ret_sse_avx_ops.mac_flops",
+    "EventCode": "0x03",
+    "BriefDescription": "Retired SSE and AVX multiply-accumulate FLOPs (each operation is counted as 2 FLOPs, bfloat operations are not included).",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "fp_ret_sse_avx_ops.bfloat16_flops",
+    "EventCode": "0x03",
+    "BriefDescription": "Retired SSE and AVX bfloat16 FLOPs.",
+    "UMask": "0x20"
+  },
+  {
+    "EventName": "fp_ret_sse_avx_ops.scalar_single_flops",
+    "EventCode": "0x03",
+    "BriefDescription": "Retired SSE and AVX scalar single-precision (FP32) FLOPs.",
+    "UMask": "0x40"
+  },
+  {
+    "EventName": "fp_ret_sse_avx_ops.packed_single_flops",
+    "EventCode": "0x03",
+    "BriefDescription": "Retired SSE and AVX packed single-precision (FP32) FLOPs.",
+    "UMask": "0x60"
+  },
+  {
+    "EventName": "fp_ret_sse_avx_ops.scalar_double_flops",
+    "EventCode": "0x03",
+    "BriefDescription": "Retired SSE and AVX scalar double-precision (FP64) FLOPs.",
+    "UMask": "0x80"
+  },
+  {
+    "EventName": "fp_ret_sse_avx_ops.packed_double_flops",
+    "EventCode": "0x03",
+    "BriefDescription": "Retired SSE and AVX packed double-precision (FP64) FLOPs.",
+    "UMask": "0xa0"
+  },
+  {
+    "EventName": "fp_ret_sse_avx_ops.scalar_half_flops",
+    "EventCode": "0x03",
+    "BriefDescription": "Retired SSE and AVX scalar half-precision (FP16) FLOPs.",
+    "UMask": "0xa0"
+  },
+  {
+    "EventName": "fp_ret_sse_avx_ops.packed_half_flops",
+    "EventCode": "0x03",
+    "BriefDescription": "Retired SSE and AVX packed half-precision (FP16) FLOPs.",
+    "UMask": "0xa0"
+  },
+  {
+    "EventName": "fp_ret_sse_avx_ops.all",
+    "EventCode": "0x03",
+    "BriefDescription": "Retired SSE and AVX FLOPs of all types.",
+    "UMask": "0x0f"
+  },
+  {
+    "EventName": "fp_ops_ret_by_width.x87",
+    "EventCode": "0x08",
+    "BriefDescription": "Retired x87 floating-point uops.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "fp_ops_ret_by_width.mmx",
+    "EventCode": "0x08",
+    "BriefDescription": "Retired MMX floating-point uops.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "fp_ops_ret_by_width.scalar",
+    "EventCode": "0x08",
+    "BriefDescription": "Retired scalar floating-point uops.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "fp_ops_ret_by_width.pack_128",
+    "EventCode": "0x08",
+    "BriefDescription": "Retired packed 128-bit floating-point uops.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "fp_ops_ret_by_width.pack_256",
+    "EventCode": "0x08",
+    "BriefDescription": "Retired packed 256-bit floating-point uops.",
+    "UMask": "0x10"
+  },
+  {
+    "EventName": "fp_ops_ret_by_width.pack_512",
+    "EventCode": "0x08",
+    "BriefDescription": "Retired packed 512-bit floating-point uops.",
+    "UMask": "0x20"
+  },
+  {
+    "EventName": "fp_ops_ret_by_width.all",
+    "EventCode": "0x08",
+    "BriefDescription": "Retired floating-point uops of all widths.",
+    "UMask": "0x3f"
+  },
+  {
+    "EventName": "fp_ops_ret_by_type.scalar_add",
+    "EventCode": "0x0a",
+    "BriefDescription": "Retired scalar floating-point add uops.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "fp_ops_ret_by_type.scalar_sub",
+    "EventCode": "0x0a",
+    "BriefDescription": "Retired scalar floating-point subtract uops.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "fp_ops_ret_by_type.scalar_mul",
+    "EventCode": "0x0a",
+    "BriefDescription": "Retired scalar floating-point multiply uops.",
+    "UMask": "0x03"
+  },
+  {
+    "EventName": "fp_ops_ret_by_type.scalar_mac",
+    "EventCode": "0x0a",
+    "BriefDescription": "Retired scalar floating-point multiply-accumulate uops.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "fp_ops_ret_by_type.scalar_div",
+    "EventCode": "0x0a",
+    "BriefDescription": "Retired scalar floating-point divide uops.",
+    "UMask": "0x05"
+  },
+  {
+    "EventName": "fp_ops_ret_by_type.scalar_sqrt",
+    "EventCode": "0x0a",
+    "BriefDescription": "Retired scalar floating-point square root uops.",
+    "UMask": "0x06"
+  },
+  {
+    "EventName": "fp_ops_ret_by_type.scalar_cmp",
+    "EventCode": "0x0a",
+    "BriefDescription": "Retired scalar floating-point compare uops.",
+    "UMask": "0x07"
+  },
+  {
+    "EventName": "fp_ops_ret_by_type.scalar_cvt",
+    "EventCode": "0x0a",
+    "BriefDescription": "Retired scalar floating-point convert uops.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "fp_ops_ret_by_type.scalar_blend",
+    "EventCode": "0x0a",
+    "BriefDescription": "Retired scalar floating-point blend uops.",
+    "UMask": "0x09"
+  },
+  {
+    "EventName": "fp_ops_ret_by_type.scalar_move",
+    "EventCode": "0x0a",
+    "BriefDescription": "Retired scalar floating-point move uops.",
+    "UMask": "0x0a"
+  },
+  {
+    "EventName": "fp_ops_ret_by_type.scalar_shuffle",
+    "EventCode": "0x0a",
+    "BriefDescription": "Retired scalar floating-point shuffle uops (may include instructions not necessarily thought of as including shuffles e.g. horizontal add, dot product, and certain MOV instructions).",
+    "UMask": "0x0b"
+  },
+  {
+    "EventName": "fp_ops_ret_by_type.scalar_bfloat",
+    "EventCode": "0x0a",
+    "BriefDescription": "Retired scalar floating-point bfloat uops.",
+    "UMask": "0x0c"
+  },
+  {
+    "EventName": "fp_ops_ret_by_type.scalar_logical",
+    "EventCode": "0x0a",
+    "BriefDescription": "Retired scalar floating-point move uops.",
+    "UMask": "0x0d"
+  },
+  {
+    "EventName": "fp_ops_ret_by_type.scalar_other",
+    "EventCode": "0x0a",
+    "BriefDescription": "Retired scalar floating-point uops of other types.",
+    "UMask": "0x0e"
+  },
+  {
+    "EventName": "fp_ops_ret_by_type.scalar_all",
+    "EventCode": "0x0a",
+    "BriefDescription": "Retired scalar floating-point uops of all types.",
+    "UMask": "0x0f"
+  },
+  {
+    "EventName": "fp_ops_ret_by_type.vector_add",
+    "EventCode": "0x0a",
+    "BriefDescription": "Retired vector floating-point add uops.",
+    "UMask": "0x10"
+  },
+  {
+    "EventName": "fp_ops_ret_by_type.vector_sub",
+    "EventCode": "0x0a",
+    "BriefDescription": "Retired vector floating-point subtract uops.",
+    "UMask": "0x20"
+  },
+  {
+    "EventName": "fp_ops_ret_by_type.vector_mul",
+    "EventCode": "0x0a",
+    "BriefDescription": "Retired vector floating-point multiply uops.",
+    "UMask": "0x30"
+  },
+  {
+    "EventName": "fp_ops_ret_by_type.vector_mac",
+    "EventCode": "0x0a",
+    "BriefDescription": "Retired vector floating-point multiply-accumulate uops.",
+    "UMask": "0x40"
+  },
+  {
+    "EventName": "fp_ops_ret_by_type.vector_div",
+    "EventCode": "0x0a",
+    "BriefDescription": "Retired vector floating-point divide uops.",
+    "UMask": "0x50"
+  },
+  {
+    "EventName": "fp_ops_ret_by_type.vector_sqrt",
+    "EventCode": "0x0a",
+    "BriefDescription": "Retired vector floating-point square root uops.",
+    "UMask": "0x60"
+  },
+  {
+    "EventName": "fp_ops_ret_by_type.vector_cmp",
+    "EventCode": "0x0a",
+    "BriefDescription": "Retired vector floating-point compare uops.",
+    "UMask": "0x70"
+  },
+  {
+    "EventName": "fp_ops_ret_by_type.vector_cvt",
+    "EventCode": "0x0a",
+    "BriefDescription": "Retired vector floating-point convert uops.",
+    "UMask": "0x80"
+  },
+  {
+    "EventName": "fp_ops_ret_by_type.vector_blend",
+    "EventCode": "0x0a",
+    "BriefDescription": "Retired vector floating-point blend uops.",
+    "UMask": "0x90"
+  },
+  {
+    "EventName": "fp_ops_ret_by_type.vector_move",
+    "EventCode": "0x0a",
+    "BriefDescription": "Retired vector floating-point move uops.",
+    "UMask": "0xa0"
+  },
+  {
+    "EventName": "fp_ops_ret_by_type.vector_shuffle",
+    "EventCode": "0x0a",
+    "BriefDescription": "Retired vector floating-point shuffle uops (may include instructions not necessarily thought of as including shuffles e.g. horizontal add, dot product, and certain MOV instructions).",
+    "UMask": "0xb0"
+  },
+  {
+    "EventName": "fp_ops_ret_by_type.vector_bfloat",
+    "EventCode": "0x0a",
+    "BriefDescription": "Retired vector floating-point bfloat uops.",
+    "UMask": "0xc0"
+  },
+  {
+    "EventName": "fp_ops_ret_by_type.vector_logical",
+    "EventCode": "0x0a",
+    "BriefDescription": "Retired vector floating-point logical uops.",
+    "UMask": "0xd0"
+  },
+  {
+    "EventName": "fp_ops_ret_by_type.vector_other",
+    "EventCode": "0x0a",
+    "BriefDescription": "Retired vector floating-point uops of other types.",
+    "UMask": "0xe0"
+  },
+  {
+    "EventName": "fp_ops_ret_by_type.vector_all",
+    "EventCode": "0x0a",
+    "BriefDescription": "Retired vector floating-point uops of all types.",
+    "UMask": "0xf0"
+  },
+  {
+    "EventName": "fp_ops_ret_by_type.all",
+    "EventCode": "0x0a",
+    "BriefDescription": "Retired floating-point uops of all types.",
+    "UMask": "0xff"
+  },
+  {
+    "EventName": "fp_sse_avx_ops_ret.mmx_add",
+    "EventCode": "0x0b",
+    "BriefDescription": "Retired MMX integer add uops.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "fp_sse_avx_ops_ret.mmx_sub",
+    "EventCode": "0x0b",
+    "BriefDescription": "Retired MMX integer subtract uops.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "fp_sse_avx_ops_ret.mmx_mul",
+    "EventCode": "0x0b",
+    "BriefDescription": "Retired MMX integer multiply uops.",
+    "UMask": "0x03"
+  },
+  {
+    "EventName": "fp_sse_avx_ops_ret.mmx_mac",
+    "EventCode": "0x0b",
+    "BriefDescription": "Retired MMX integer multiply-accumulate uops.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "fp_sse_avx_ops_ret.mmx_aes",
+    "EventCode": "0x0b",
+    "BriefDescription": "Retired MMX integer AES uops.",
+    "UMask": "0x05"
+  },
+  {
+    "EventName": "fp_sse_avx_ops_ret.mmx_sha",
+    "EventCode": "0x0b",
+    "BriefDescription": "Retired MMX integer SHA uops.",
+    "UMask": "0x06"
+  },
+  {
+    "EventName": "fp_sse_avx_ops_ret.mmx_cmp",
+    "EventCode": "0x0b",
+    "BriefDescription": "Retired MMX integer compare uops.",
+    "UMask": "0x07"
+  },
+  {
+    "EventName": "fp_sse_avx_ops_ret.mmx_cvt",
+    "EventCode": "0x0b",
+    "BriefDescription": "Retired MMX integer convert or pack uops.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "fp_sse_avx_ops_ret.mmx_shift",
+    "EventCode": "0x0b",
+    "BriefDescription": "Retired MMX integer shift or rotate uops.",
+    "UMask": "0x09"
+  },
+  {
+    "EventName": "fp_sse_avx_ops_ret.mmx_mov",
+    "EventCode": "0x0b",
+    "BriefDescription": "Retired MMX integer move uops.",
+    "UMask": "0x0a"
+  },
+  {
+    "EventName": "fp_sse_avx_ops_ret.mmx_shuffle",
+    "EventCode": "0x0b",
+    "BriefDescription": "Retired MMX integer shuffle uops (may include instructions not necessarily thought of as including shuffles e.g. horizontal add, dot product, and certain MOV instructions).",
+    "UMask": "0x0b"
+  },
+  {
+    "EventName": "fp_sse_avx_ops_ret.mmx_vnni",
+    "EventCode": "0x0b",
+    "BriefDescription": "Retired MMX integer VNNI uops.",
+    "UMask": "0x0c"
+  },
+  {
+    "EventName": "fp_sse_avx_ops_ret.mmx_logical",
+    "EventCode": "0x0b",
+    "BriefDescription": "Retired MMX integer logical uops.",
+    "UMask": "0x0d"
+  },
+  {
+    "EventName": "fp_sse_avx_ops_ret.mmx_other",
+    "EventCode": "0x0b",
+    "BriefDescription": "Retired MMX integer multiply uops of other types.",
+    "UMask": "0x0e"
+  },
+  {
+    "EventName": "fp_sse_avx_ops_ret.mmx_all",
+    "EventCode": "0x0b",
+    "BriefDescription": "Retired MMX integer uops of all types.",
+    "UMask": "0x0f"
+  },
+  {
+    "EventName": "fp_sse_avx_ops_ret.sse_avx_add",
+    "EventCode": "0x0b",
+    "BriefDescription": "Retired SSE and AVX integer add uops.",
+    "UMask": "0x10"
+  },
+  {
+    "EventName": "fp_sse_avx_ops_ret.sse_avx_sub",
+    "EventCode": "0x0b",
+    "BriefDescription": "Retired SSE and AVX integer subtract uops.",
+    "UMask": "0x20"
+  },
+  {
+    "EventName": "fp_sse_avx_ops_ret.sse_avx_mul",
+    "EventCode": "0x0b",
+    "BriefDescription": "Retired SSE and AVX integer multiply uops.",
+    "UMask": "0x30"
+  },
+  {
+    "EventName": "fp_sse_avx_ops_ret.sse_avx_mac",
+    "EventCode": "0x0b",
+    "BriefDescription": "Retired SSE and AVX integer multiply-accumulate uops.",
+    "UMask": "0x40"
+  },
+  {
+    "EventName": "fp_sse_avx_ops_ret.sse_avx_aes",
+    "EventCode": "0x0b",
+    "BriefDescription": "Retired SSE and AVX integer AES uops.",
+    "UMask": "0x50"
+  },
+  {
+    "EventName": "fp_sse_avx_ops_ret.sse_avx_sha",
+    "EventCode": "0x0b",
+    "BriefDescription": "Retired SSE and AVX integer SHA uops.",
+    "UMask": "0x60"
+  },
+  {
+    "EventName": "fp_sse_avx_ops_ret.sse_avx_cmp",
+    "EventCode": "0x0b",
+    "BriefDescription": "Retired SSE and AVX integer compare uops.",
+    "UMask": "0x70"
+  },
+  {
+    "EventName": "fp_sse_avx_ops_ret.sse_avx_cvt",
+    "EventCode": "0x0b",
+    "BriefDescription": "Retired SSE and AVX integer convert or pack uops.",
+    "UMask": "0x80"
+  },
+  {
+    "EventName": "fp_sse_avx_ops_ret.sse_avx_shift",
+    "EventCode": "0x0b",
+    "BriefDescription": "Retired SSE and AVX integer shift or rotate uops.",
+    "UMask": "0x90"
+  },
+  {
+    "EventName": "fp_sse_avx_ops_ret.sse_avx_mov",
+    "EventCode": "0x0b",
+    "BriefDescription": "Retired SSE and AVX integer move uops.",
+    "UMask": "0xa0"
+  },
+  {
+    "EventName": "fp_sse_avx_ops_ret.sse_avx_shuffle",
+    "EventCode": "0x0b",
+    "BriefDescription": "Retired SSE and AVX integer shuffle uops (may include instructions not necessarily thought of as including shuffles e.g. horizontal add, dot product, and certain MOV instructions).",
+    "UMask": "0xb0"
+  },
+  {
+    "EventName": "fp_sse_avx_ops_ret.sse_avx_vnni",
+    "EventCode": "0x0b",
+    "BriefDescription": "Retired SSE and AVX integer VNNI uops.",
+    "UMask": "0xc0"
+  },
+  {
+    "EventName": "fp_sse_avx_ops_ret.sse_avx_logical",
+    "EventCode": "0x0b",
+    "BriefDescription": "Retired SSE and AVX integer logical uops.",
+    "UMask": "0xd0"
+  },
+  {
+    "EventName": "fp_sse_avx_ops_ret.sse_avx_other",
+    "EventCode": "0x0b",
+    "BriefDescription": "Retired SSE and AVX integer uops of other types.",
+    "UMask": "0xe0"
+  },
+  {
+    "EventName": "fp_sse_avx_ops_ret.sse_avx_all",
+    "EventCode": "0x0b",
+    "BriefDescription": "Retired SSE and AVX integer uops of all types.",
+    "UMask": "0xf0"
+  },
+  {
+    "EventName": "fp_sse_avx_ops_ret.all",
+    "EventCode": "0x0b",
+    "BriefDescription": "Retired MMX, SSE and AVX integer uops of all types.",
+    "UMask": "0xff"
+  },
+  {
+    "EventName": "fp_pack_ops_ret.fp128_add",
+    "EventCode": "0x0c",
+    "BriefDescription": "Retired 128-bit packed floating-point add uops.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "fp_pack_ops_ret.fp128_sub",
+    "EventCode": "0x0c",
+    "BriefDescription": "Retired 128-bit packed floating-point subtract uops.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "fp_pack_ops_ret.fp128_mul",
+    "EventCode": "0x0c",
+    "BriefDescription": "Retired 128-bit packed floating-point multiply uops.",
+    "UMask": "0x03"
+  },
+  {
+    "EventName": "fp_pack_ops_ret.fp128_mac",
+    "EventCode": "0x0c",
+    "BriefDescription": "Retired 128-bit packed floating-point multiply-accumulate uops.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "fp_pack_ops_ret.fp128_div",
+    "EventCode": "0x0c",
+    "BriefDescription": "Retired 128-bit packed floating-point divide uops.",
+    "UMask": "0x05"
+  },
+  {
+    "EventName": "fp_pack_ops_ret.fp128_sqrt",
+    "EventCode": "0x0c",
+    "BriefDescription": "Retired 128-bit packed floating-point square root uops.",
+    "UMask": "0x06"
+  },
+  {
+    "EventName": "fp_pack_ops_ret.fp128_cmp",
+    "EventCode": "0x0c",
+    "BriefDescription": "Retired 128-bit packed floating-point compare uops.",
+    "UMask": "0x07"
+  },
+  {
+    "EventName": "fp_pack_ops_ret.fp128_cvt",
+    "EventCode": "0x0c",
+    "BriefDescription": "Retired 128-bit packed floating-point convert uops.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "fp_pack_ops_ret.fp128_blend",
+    "EventCode": "0x0c",
+    "BriefDescription": "Retired 128-bit packed floating-point blend uops.",
+    "UMask": "0x09"
+  },
+  {
+    "EventName": "fp_pack_ops_ret.fp128_mov",
+    "EventCode": "0x0c",
+    "BriefDescription": "Retired 128-bit packed floating-point move uops.",
+    "UMask": "0x0a"
+  },
+  {
+    "EventName": "fp_pack_ops_ret.fp128_shuffle",
+    "EventCode": "0x0c",
+    "BriefDescription": "Retired 128-bit packed floating-point shuffle uops (may include instructions not necessarily thought of as including shuffles e.g. horizontal add, dot product, and certain MOV instructions).",
+    "UMask": "0x0b"
+  },
+  {
+    "EventName": "fp_pack_ops_ret.fp128_bfloat",
+    "EventCode": "0x0c",
+    "BriefDescription": "Retired 128-bit packed floating-point bfloat uops.",
+    "UMask": "0x0c"
+  },
+  {
+    "EventName": "fp_pack_ops_ret.fp128_logical",
+    "EventCode": "0x0c",
+    "BriefDescription": "Retired 128-bit packed floating-point logical uops.",
+    "UMask": "0x0d"
+  },
+  {
+    "EventName": "fp_pack_ops_ret.fp128_other",
+    "EventCode": "0x0c",
+    "BriefDescription": "Retired 128-bit packed floating-point uops of other types.",
+    "UMask": "0x0e"
+  },
+  {
+    "EventName": "fp_pack_ops_ret.fp128_all",
+    "EventCode": "0x0c",
+    "BriefDescription": "Retired 128-bit packed floating-point uops of all types.",
+    "UMask": "0x0f"
+  },
+  {
+    "EventName": "fp_pack_ops_ret.fp256_add",
+    "EventCode": "0x0c",
+    "BriefDescription": "Retired 256-bit packed floating-point add uops.",
+    "UMask": "0x10"
+  },
+  {
+    "EventName": "fp_pack_ops_ret.fp256_sub",
+    "EventCode": "0x0c",
+    "BriefDescription": "Retired 256-bit packed floating-point subtract uops.",
+    "UMask": "0x20"
+  },
+  {
+    "EventName": "fp_pack_ops_ret.fp256_mul",
+    "EventCode": "0x0c",
+    "BriefDescription": "Retired 256-bit packed floating-point multiply uops.",
+    "UMask": "0x30"
+  },
+  {
+    "EventName": "fp_pack_ops_ret.fp256_mac",
+    "EventCode": "0x0c",
+    "BriefDescription": "Retired 256-bit packed floating-point multiply-accumulate uops.",
+    "UMask": "0x40"
+  },
+  {
+    "EventName": "fp_pack_ops_ret.fp256_div",
+    "EventCode": "0x0c",
+    "BriefDescription": "Retired 256-bit packed floating-point divide uops.",
+    "UMask": "0x50"
+  },
+  {
+    "EventName": "fp_pack_ops_ret.fp256_sqrt",
+    "EventCode": "0x0c",
+    "BriefDescription": "Retired 256-bit packed floating-point square root uops.",
+    "UMask": "0x60"
+  },
+  {
+    "EventName": "fp_pack_ops_ret.fp256_cmp",
+    "EventCode": "0x0c",
+    "BriefDescription": "Retired 256-bit packed floating-point compare uops.",
+    "UMask": "0x70"
+  },
+  {
+    "EventName": "fp_pack_ops_ret.fp256_cvt",
+    "EventCode": "0x0c",
+    "BriefDescription": "Retired 256-bit packed floating-point convert uops.",
+    "UMask": "0x80"
+  },
+  {
+    "EventName": "fp_pack_ops_ret.fp256_blend",
+    "EventCode": "0x0c",
+    "BriefDescription": "Retired 256-bit packed floating-point blend uops.",
+    "UMask": "0x90"
+  },
+  {
+    "EventName": "fp_pack_ops_ret.fp256_mov",
+    "EventCode": "0x0c",
+    "BriefDescription": "Retired 256-bit packed floating-point move uops.",
+    "UMask": "0xa0"
+  },
+  {
+    "EventName": "fp_pack_ops_ret.fp256_shuffle",
+    "EventCode": "0x0c",
+    "BriefDescription": "Retired 256-bit packed floating-point shuffle uops (may include instructions not necessarily thought of as including shuffles e.g. horizontal add, dot product, and certain MOV instructions).",
+    "UMask": "0xb0"
+  },
+  {
+    "EventName": "fp_pack_ops_ret.fp256_logical",
+    "EventCode": "0x0c",
+    "BriefDescription": "Retired 256-bit packed floating-point logical uops.",
+    "UMask": "0xd0"
+  },
+  {
+    "EventName": "fp_pack_ops_ret.fp256_other",
+    "EventCode": "0x0c",
+    "BriefDescription": "Retired 256-bit packed floating-point uops of other types.",
+    "UMask": "0xe0"
+  },
+  {
+    "EventName": "fp_pack_ops_ret.fp256_all",
+    "EventCode": "0x0c",
+    "BriefDescription": "Retired 256-bit packed floating-point uops of all types.",
+    "UMask": "0xf0"
+  },
+  {
+    "EventName": "fp_pack_ops_ret.fp_all",
+    "EventCode": "0x0c",
+    "BriefDescription": "Retired packed floating-point uops of all types.",
+    "UMask": "0xff"
+  },
+  {
+    "EventName": "fp_pack_int_ops_ret.int128_add",
+    "EventCode": "0x0d",
+    "BriefDescription": "Retired 128-bit packed integer add uops.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "fp_pack_int_ops_ret.int128_sub",
+    "EventCode": "0x0d",
+    "BriefDescription": "Retired 128-bit packed integer subtract uops.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "fp_pack_int_ops_ret.int128_mul",
+    "EventCode": "0x0d",
+    "BriefDescription": "Retired 128-bit packed integer multiply uops.",
+    "UMask": "0x03"
+  },
+  {
+    "EventName": "fp_pack_int_ops_ret.int128_mac",
+    "EventCode": "0x0d",
+    "BriefDescription": "Retired 128-bit packed integer multiply-accumulate uops.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "fp_pack_int_ops_ret.int128_aes",
+    "EventCode": "0x0d",
+    "BriefDescription": "Retired 128-bit packed integer AES uops.",
+    "UMask": "0x05"
+  },
+  {
+    "EventName": "fp_pack_int_ops_ret.int128_sha",
+    "EventCode": "0x0d",
+    "BriefDescription": "Retired 128-bit packed integer SHA uops.",
+    "UMask": "0x06"
+  },
+  {
+    "EventName": "fp_pack_int_ops_ret.int128_cmp",
+    "EventCode": "0x0d",
+    "BriefDescription": "Retired 128-bit packed integer compare uops.",
+    "UMask": "0x07"
+  },
+  {
+    "EventName": "fp_pack_int_ops_ret.int128_cvt",
+    "EventCode": "0x0d",
+    "BriefDescription": "Retired 128-bit packed integer convert or pack uops.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "fp_pack_int_ops_ret.int128_shift",
+    "EventCode": "0x0d",
+    "BriefDescription": "Retired 128-bit packed integer shift or rotate uops.",
+    "UMask": "0x09"
+  },
+  {
+    "EventName": "fp_pack_int_ops_ret.int128_mov",
+    "EventCode": "0x0d",
+    "BriefDescription": "Retired 128-bit packed integer move uops.",
+    "UMask": "0x0a"
+  },
+  {
+    "EventName": "fp_pack_int_ops_ret.int128_shuffle",
+    "EventCode": "0x0d",
+    "BriefDescription": "Retired 128-bit packed integer shuffle uops (may include instructions not necessarily thought of as including shuffles e.g. horizontal add, dot product, and certain MOV instructions).",
+    "UMask": "0x0b"
+  },
+  {
+    "EventName": "fp_pack_int_ops_ret.int128_vnni",
+    "EventCode": "0x0d",
+    "BriefDescription": "Retired 128-bit packed integer VNNI ops.",
+    "UMask": "0x0c"
+  },
+  {
+    "EventName": "fp_pack_int_ops_ret.int128_logical",
+    "EventCode": "0x0d",
+    "BriefDescription": "Retired 128-bit packed integer logical uops.",
+    "UMask": "0x0d"
+  },
+  {
+    "EventName": "fp_pack_int_ops_ret.int128_other",
+    "EventCode": "0x0d",
+    "BriefDescription": "Retired 128-bit packed integer uops of other types.",
+    "UMask": "0x0e"
+  },
+  {
+    "EventName": "fp_pack_int_ops_ret.int128_all",
+    "EventCode": "0x0d",
+    "BriefDescription": "Retired 128-bit packed integer uops of all types.",
+    "UMask": "0x0f"
+  },
+  {
+    "EventName": "fp_pack_int_ops_ret.int256_add",
+    "EventCode": "0x0d",
+    "BriefDescription": "Retired 256-bit packed integer add uops.",
+    "UMask": "0x10"
+  },
+  {
+    "EventName": "fp_pack_int_ops_ret.int256_sub",
+    "EventCode": "0x0d",
+    "BriefDescription": "Retired 256-bit packed integer subtract uops.",
+    "UMask": "0x20"
+  },
+  {
+    "EventName": "fp_pack_int_ops_ret.int256_mul",
+    "EventCode": "0x0d",
+    "BriefDescription": "Retired 256-bit packed integer multiply uops.",
+    "UMask": "0x30"
+  },
+  {
+    "EventName": "fp_pack_int_ops_ret.int256_mac",
+    "EventCode": "0x0d",
+    "BriefDescription": "Retired 256-bit packed integer multiply-accumulate uops.",
+    "UMask": "0x40"
+  },
+  {
+    "EventName": "fp_pack_int_ops_ret.int256_cmp",
+    "EventCode": "0x0d",
+    "BriefDescription": "Retired 256-bit packed integer compare uops.",
+    "UMask": "0x70"
+  },
+  {
+    "EventName": "fp_pack_int_ops_ret.int256_shift",
+    "EventCode": "0x0d",
+    "BriefDescription": "Retired 256-bit packed integer shift or rotate uops.",
+    "UMask": "0x90"
+  },
+  {
+    "EventName": "fp_pack_int_ops_ret.int256_mov",
+    "EventCode": "0x0d",
+    "BriefDescription": "Retired 256-bit packed integer move uops.",
+    "UMask": "0xa0"
+  },
+  {
+    "EventName": "fp_pack_int_ops_ret.int256_shuffle",
+    "EventCode": "0x0d",
+    "BriefDescription": "Retired 256-bit packed integer shuffle uops (may include instructions not necessarily thought of as including shuffles e.g. horizontal add, dot product, and certain MOV instructions).",
+    "UMask": "0xb0"
+  },
+  {
+    "EventName": "fp_pack_int_ops_ret.int256_vnni",
+    "EventCode": "0x0d",
+    "BriefDescription": "Retired 256-bit packed integer VNNI uops.",
+    "UMask": "0xc0"
+  },
+  {
+    "EventName": "fp_pack_int_ops_ret.int256_logical",
+    "EventCode": "0x0d",
+    "BriefDescription": "Retired 256-bit packed integer logical uops.",
+    "UMask": "0xd0"
+  },
+  {
+    "EventName": "fp_pack_int_ops_ret.int256_other",
+    "EventCode": "0x0d",
+    "BriefDescription": "Retired 256-bit packed integer uops of other types.",
+    "UMask": "0xe0"
+  },
+  {
+    "EventName": "fp_pack_int_ops_ret.int256_all",
+    "EventCode": "0x0d",
+    "BriefDescription": "Retired 256-bit packed integer uops of all types.",
+    "UMask": "0xf0"
+  },
+  {
+    "EventName": "fp_pack_int_ops_ret.int_all",
+    "EventCode": "0x0d",
+    "BriefDescription": "Retired packed integer uops of all types.",
+    "UMask": "0xff"
+  },
+  {
+    "EventName": "fp_disp_faults.x87_fill_fault",
+    "EventCode": "0x0e",
+    "BriefDescription": "Floating-point dispatch faults for x87 fills.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "fp_disp_faults.xmm_fill_fault",
+    "EventCode": "0x0e",
+    "BriefDescription": "Floating-point dispatch faults for XMM fills.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "fp_disp_faults.ymm_fill_fault",
+    "EventCode": "0x0e",
+    "BriefDescription": "Floating-point dispatch faults for YMM fills.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "fp_disp_faults.ymm_spill_fault",
+    "EventCode": "0x0e",
+    "BriefDescription": "Floating-point dispatch faults for YMM spills.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "fp_disp_faults.sse_avx_all",
+    "EventCode": "0x0e",
+    "BriefDescription": "Floating-point dispatch faults of all types for SSE and AVX ops.",
+    "UMask": "0x0e"
+  },
+  {
+    "EventName": "fp_disp_faults.all",
+    "EventCode": "0x0e",
+    "BriefDescription": "Floating-point dispatch faults of all types.",
+    "UMask": "0x0f"
+  },
+  {
+    "EventName": "fp_pack_512b_ops_ret.fp512_add",
+    "EventCode": "0x0f",
+    "BriefDescription": "Retired 512-bit packed floating-point add uops.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "fp_pack_512b_ops_ret.fp512_sub",
+    "EventCode": "0x0f",
+    "BriefDescription": "Retired 512-bit packed floating-point subtract uops.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "fp_pack_512b_ops_ret.fp512_mul",
+    "EventCode": "0x0f",
+    "BriefDescription": "Retired 512-bit packed floating-point multiply uops.",
+    "UMask": "0x03"
+  },
+  {
+    "EventName": "fp_pack_512b_ops_ret.fp512_mac",
+    "EventCode": "0x0f",
+    "BriefDescription": "Retired 512-bit packed floating-point multiply-accumulate uops.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "fp_pack_512b_ops_ret.fp512_div",
+    "EventCode": "0x0f",
+    "BriefDescription": "Retired 512-bit packed floating-point divide uops.",
+    "UMask": "0x05"
+  },
+  {
+    "EventName": "fp_pack_512b_ops_ret.fp512_sqrt",
+    "EventCode": "0x0f",
+    "BriefDescription": "Retired 512-bit packed floating-point square root uops.",
+    "UMask": "0x06"
+  },
+  {
+    "EventName": "fp_pack_512b_ops_ret.fp512_cmp",
+    "EventCode": "0x0f",
+    "BriefDescription": "Retired 512-bit packed floating-point compare uops.",
+    "UMask": "0x07"
+  },
+  {
+    "EventName": "fp_pack_512b_ops_ret.fp512_cvt",
+    "EventCode": "0x0f",
+    "BriefDescription": "Retired 512-bit packed floating-point convert uops.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "fp_pack_512b_ops_ret.fp512_blend",
+    "EventCode": "0x0f",
+    "BriefDescription": "Retired 512-bit packed floating-point blend uops.",
+    "UMask": "0x09"
+  },
+  {
+    "EventName": "fp_pack_512b_ops_ret.fp512_mov",
+    "EventCode": "0x0f",
+    "BriefDescription": "Retired 512-bit packed floating-point move uops.",
+    "UMask": "0x0a"
+  },
+  {
+    "EventName": "fp_pack_512b_ops_ret.fp512_shuffle",
+    "EventCode": "0x0f",
+    "BriefDescription": "Retired 512-bit packed floating-point shuffle uops (may include instructions not necessarily thought of as including shuffles e.g. horizontal add, dot product, and certain MOV instructions).",
+    "UMask": "0x0b"
+  },
+  {
+    "EventName": "fp_pack_512b_ops_ret.fp512_bfloat",
+    "EventCode": "0x0f",
+    "BriefDescription": "Retired 512-bit packed floating-point bfloat uops.",
+    "UMask": "0x0c"
+  },
+  {
+    "EventName": "fp_pack_512b_ops_ret.fp512_logical",
+    "EventCode": "0x0f",
+    "BriefDescription": "Retired 512-bit packed floating-point logical uops.",
+    "UMask": "0x0d"
+  },
+  {
+    "EventName": "fp_pack_512b_ops_ret.fp512_other",
+    "EventCode": "0x0f",
+    "BriefDescription": "Retired 512-bit packed floating-point uops of other types.",
+    "UMask": "0x0e"
+  },
+  {
+    "EventName": "fp_pack_512b_ops_ret.fp512_all",
+    "EventCode": "0x0f",
+    "BriefDescription": "Retired 512-bit packed floating-point uops of all types.",
+    "UMask": "0x0f"
+  },
+  {
+    "EventName": "fp_pack_512b_ops_ret.int512_add",
+    "EventCode": "0x0f",
+    "BriefDescription": "Retired 512-bit packed integer add uops.",
+    "UMask": "0x10"
+  },
+  {
+    "EventName": "fp_pack_512b_ops_ret.int512_sub",
+    "EventCode": "0x0f",
+    "BriefDescription": "Retired 512-bit packed integer subtract uops.",
+    "UMask": "0x20"
+  },
+  {
+    "EventName": "fp_pack_512b_ops_ret.int512_mul",
+    "EventCode": "0x0f",
+    "BriefDescription": "Retired 512-bit packed integer multiply uops.",
+    "UMask": "0x30"
+  },
+  {
+    "EventName": "fp_pack_512b_ops_ret.int512_mac",
+    "EventCode": "0x0f",
+    "BriefDescription": "Retired 512-bit packed integer multiply-accumulate uops.",
+    "UMask": "0x40"
+  },
+  {
+    "EventName": "fp_pack_512b_ops_ret.int512_aes",
+    "EventCode": "0x0f",
+    "BriefDescription": "Retired 512-bit packed integer AES uops.",
+    "UMask": "0x50"
+  },
+  {
+    "EventName": "fp_pack_512b_ops_ret.int512_sha",
+    "EventCode": "0x0f",
+    "BriefDescription": "Retired 512-bit packed integer SHA uops.",
+    "UMask": "0x60"
+  },
+  {
+    "EventName": "fp_pack_512b_ops_ret.int512_cmp",
+    "EventCode": "0x0f",
+    "BriefDescription": "Retired 512-bit packed integer compare uops.",
+    "UMask": "0x70"
+  },
+  {
+    "EventName": "fp_pack_512b_ops_ret.int512_cvt",
+    "EventCode": "0x0f",
+    "BriefDescription": "Retired 512-bit packed integer convert or pack uops.",
+    "UMask": "0x80"
+  },
+  {
+    "EventName": "fp_pack_512b_ops_ret.int512_shift",
+    "EventCode": "0x0f",
+    "BriefDescription": "Retired 512-bit packed integer shift or rotate uops.",
+    "UMask": "0x90"
+  },
+  {
+    "EventName": "fp_pack_512b_ops_ret.int512_mov",
+    "EventCode": "0x0f",
+    "BriefDescription": "Retired 512-bit packed integer move uops.",
+    "UMask": "0xa0"
+  },
+  {
+    "EventName": "fp_pack_512b_ops_ret.int512_shuffle",
+    "EventCode": "0x0f",
+    "BriefDescription": "Retired 512-bit packed integer shuffle uops (may include instructions not necessarily thought of as including shuffles e.g. horizontal add, dot product, and certain MOV instructions).",
+    "UMask": "0xb0"
+  },
+  {
+    "EventName": "fp_pack_512b_ops_ret.int512_vnni",
+    "EventCode": "0x0f",
+    "BriefDescription": "Retired 512-bit packed integer VNNI uops.",
+    "UMask": "0xc0"
+  },
+  {
+    "EventName": "fp_pack_512b_ops_ret.int512_logical",
+    "EventCode": "0x0f",
+    "BriefDescription": "Retired 512-bit packed integer logical uops.",
+    "UMask": "0xd0"
+  },
+  {
+    "EventName": "fp_pack_512b_ops_ret.int512_other",
+    "EventCode": "0x0f",
+    "BriefDescription": "Retired 512-bit packed integer uops of other types.",
+    "UMask": "0xe0"
+  },
+  {
+    "EventName": "fp_pack_512b_ops_ret.int512_all",
+    "EventCode": "0x0f",
+    "BriefDescription": "Retired 512-bit packed integer uops of all types.",
+    "UMask": "0xf0"
+  },
+  {
+    "EventName": "fp_pack_512b_ops_ret.512b_all",
+    "EventCode": "0x0f",
+    "BriefDescription": "Retired 512-bit packed uops of all types.",
+    "UMask": "0xff"
+  },
+  {
+    "EventName": "fp_nsq_read_stalls.fp_prf",
+    "EventCode": "0x13",
+    "BriefDescription": "Cycles when reads of the NSQ and writes to the floating-point or SIMD schedulers are stalled due to insufficient free physical register file (FP-PRF) entries.",
+    "UMask": "0x0e"
+  },
+  {
+    "EventName": "fp_nsq_read_stalls.k_prf",
+    "EventCode": "0x13",
+    "BriefDescription": "Cycles when reads of the NSQ and writes to the floating-point or SIMD schedulers are stalled due to insufficient free mask physical register file (K-PRF) entries.",
+    "UMask": "0x0e"
+  },
+  {
+    "EventName": "fp_nsq_read_stalls.fp_sq",
+    "EventCode": "0x13",
+    "BriefDescription": "Cycles when reads of the NSQ and writes to the floating-point or SIMD schedulers are stalled due to insufficient free scheduler entries.",
+    "UMask": "0x0e"
+  },
+  {
+    "EventName": "fp_nsq_read_stalls.all",
+    "EventCode": "0x13",
+    "BriefDescription": "Cycles when reads of the NSQ and writes to the floating-point or SIMD schedulers are stalled due to any reason.",
+    "UMask": "0x0e"
+  }
+]
diff --git a/tools/perf/pmu-events/arch/x86/amdzen6/inst-cache.json b/tools/perf/pmu-events/arch/x86/amdzen6/inst-cache.json
new file mode 100644
index 000000000000..5ab6766f8940
--- /dev/null
+++ b/tools/perf/pmu-events/arch/x86/amdzen6/inst-cache.json
@@ -0,0 +1,120 @@
+[
+  {
+    "EventName": "ic_cache_fill_l2",
+    "EventCode": "0x82",
+    "BriefDescription": "Instruction cache lines (64 bytes) fulfilled from the L2 cache."
+  },
+  {
+    "EventName": "ic_cache_fill_sys",
+    "EventCode": "0x83",
+    "BriefDescription": "Instruction cache lines (64 bytes) fulfilled from system memory or another cache."
+  },
+  {
+    "EventName": "ic_fetch_ibs_events.tagged",
+    "EventCode": "0x188",
+    "BriefDescription": "Fetch IBS tagged fetches. Not all tagged fetches result in a valid sample and an IBS interrupt.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "ic_fetch_ibs_events.filtered",
+    "EventCode": "0x188",
+    "BriefDescription": "Fetch IBS tagged fetches that were discarded due to IBS filtering.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "ic_fetch_ibs_events.valid",
+    "EventCode": "0x188",
+    "BriefDescription": "Fetch IBS tagged fetches that resulted in a valid sample and an IBS interrupt.",
+    "UMask": "0x10"
+  },
+  {
+    "EventName": "op_cache_hit_miss.hit",
+    "EventCode": "0x28f",
+    "BriefDescription": "Op cache fetch hits.",
+    "UMask": "0x03"
+  },
+  {
+    "EventName": "op_cache_hit_miss.miss",
+    "EventCode": "0x28f",
+    "BriefDescription": "Op cache fetch misses.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "op_cache_hit_miss.all",
+    "EventCode": "0x28f",
+    "BriefDescription": "Op cache fetches of all types.",
+    "UMask": "0x07"
+  },
+  {
+    "EventName": "ic_fills_from_sys.local_l2",
+    "EventCode": "0x29c",
+    "BriefDescription": "Instruction cache fills where data is returned from local L2 cache.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "ic_fills_from_sys.local_ccx",
+    "EventCode": "0x29c",
+    "BriefDescription": "Instruction cache fills where data is returned from L3 cache or different L2 cache in the same CCX.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "ic_fills_from_sys.local_all",
+    "EventCode": "0x29c",
+    "BriefDescription": "Instruction cache fills where data is returned from local L2 cache, L3 cache or different L2 cache in the same CCX.",
+    "UMask": "0x03"
+  },
+  {
+    "EventName": "ic_fills_from_sys.near_cache",
+    "EventCode": "0x29c",
+    "BriefDescription": "Instruction cache fills where data is returned from cache of another CCX in the same NUMA node.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "ic_fills_from_sys.dram_io_near",
+    "EventCode": "0x29c",
+    "BriefDescription": "Instruction cache fills where data is returned from either DRAM or MMIO in the same NUMA node.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "ic_fills_from_sys.far_cache",
+    "EventCode": "0x29c",
+    "BriefDescription": "Instruction cache fills where data is returned from cache of another CCX in a different NUMA node.",
+    "UMask": "0x10"
+  },
+  {
+    "EventName": "ic_fills_from_sys.remote_cache",
+    "EventCode": "0x29c",
+    "BriefDescription": "Instruction cache fills where data is returned from cache of another CCX in the same or a different NUMA node.",
+    "UMask": "0x14"
+  },
+  {
+    "EventName": "ic_fills_from_sys.dram_io_far",
+    "EventCode": "0x29c",
+    "BriefDescription": "Instruction cache fills where data is returned from either DRAM or MMIO in a different NUMA node.",
+    "UMask": "0x40"
+  },
+  {
+    "EventName": "ic_fills_from_sys.dram_io_all",
+    "EventCode": "0x29c",
+    "BriefDescription": "Instruction cache fills where data is returned from either DRAM or MMIO in the same or a different NUMA node.",
+    "UMask": "0x48"
+  },
+  {
+    "EventName": "ic_fills_from_sys.far_all",
+    "EventCode": "0x29c",
+    "BriefDescription": "Instruction cache fills where data is returned from either cache of another CCX, DRAM or MMIO in a different NUMA node.",
+    "UMask": "0x50"
+  },
+  {
+    "EventName": "ic_fills_from_sys.alt_mem",
+    "EventCode": "0x29c",
+    "BriefDescription": "Instruction cache fills where data is returned from extension memory (CXL).",
+    "UMask": "0x80"
+  },
+  {
+    "EventName": "ic_fills_from_sys.all",
+    "EventCode": "0x29c",
+    "BriefDescription": "Instruction cache fills where data is returned from all types of sources.",
+    "UMask": "0xdf"
+  }
+]
diff --git a/tools/perf/pmu-events/arch/x86/amdzen6/l2-cache.json b/tools/perf/pmu-events/arch/x86/amdzen6/l2-cache.json
new file mode 100644
index 000000000000..b0b2090fb920
--- /dev/null
+++ b/tools/perf/pmu-events/arch/x86/amdzen6/l2-cache.json
@@ -0,0 +1,326 @@
+[
+  {
+    "EventName": "l2_request_g1.group2",
+    "EventCode": "0x60",
+    "BriefDescription": "L2 cache requests of non-cacheable type (non-cached data and instructions reads, self-modifying code checks).",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "l2_request_g1.l2_hwpf",
+    "EventCode": "0x60",
+    "BriefDescription": "L2 cache requests from hardware prefetchers to prefetch directly into L2 (hit or miss).",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "l2_request_g1.prefetch_l2_cmd",
+    "EventCode": "0x60",
+    "BriefDescription": "L2 cache requests to prefetch directly into L2.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "l2_request_g1.cacheable_ic_read",
+    "EventCode": "0x60",
+    "BriefDescription": "L2 cache requests for instruction cache reads.",
+    "UMask": "0x10"
+  },
+  {
+    "EventName": "l2_request_g1.ls_rd_blk_c_s",
+    "EventCode": "0x60",
+    "BriefDescription": "L2 cache requests for data cache shared reads.",
+    "UMask": "0x20"
+  },
+  {
+    "EventName": "l2_request_g1.rd_blk_x",
+    "EventCode": "0x60",
+    "BriefDescription": "L2 cache requests for data cache stores.",
+    "UMask": "0x40"
+  },
+  {
+    "EventName": "l2_request_g1.rd_blk_l",
+    "EventCode": "0x60",
+    "BriefDescription": "L2 cache requests for data cache reads (includes hardware and software prefetches).",
+    "UMask": "0x80"
+  },
+  {
+    "EventName": "l2_request_g1.dc_all",
+    "EventCode": "0x60",
+    "BriefDescription": "L2 cache requests of common types from data cache (includes prefetches).",
+    "UMask": "0xe0"
+  },
+  {
+    "EventName": "l2_request_g1.no_pf_all",
+    "EventCode": "0x60",
+    "BriefDescription": "L2 cache requests of common types not including prefetches.",
+    "UMask": "0xf1"
+  },
+  {
+    "EventName": "l2_request_g1.all",
+    "EventCode": "0x60",
+    "BriefDescription": "L2 cache requests of all types.",
+    "UMask": "0xf7"
+  },
+  {
+    "EventName": "l2_request_g2.ls_rd_sized_nc",
+    "EventCode": "0x61",
+    "BriefDescription": "L2 cache requests for non-coherent, non-cacheable LS sized reads.",
+    "UMask": "0x20"
+  },
+  {
+    "EventName": "l2_request_g2.ls_rd_sized",
+    "EventCode": "0x61",
+    "BriefDescription": "L2 cache requests for coherent, non-cacheable LS sized reads.",
+    "UMask": "0x40"
+  },
+  {
+    "EventName": "l2_request_g2.all",
+    "EventCode": "0x61",
+    "BriefDescription": "L2 cache requests of all rare types.",
+    "UMask": "0x40"
+  },
+  {
+    "EventName": "l2_wcb_req.wcb_close",
+    "EventCode": "0x63",
+    "BriefDescription": "Write Combining Buffer (WCB) closures.",
+    "UMask": "0x20"
+  },
+  {
+    "EventName": "l2_cache_req_stat.ic_fill_miss",
+    "EventCode": "0x64",
+    "BriefDescription": "Core to L2 cache requests (not including L2 prefetch) from the instruction cache that result in L2 misses.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "l2_cache_req_stat.ic_fill_hit_s",
+    "EventCode": "0x64",
+    "BriefDescription": "Core to L2 cache requests (not including L2 prefetch) from the instruction cache that result in L2 hits on non-modifiable lines.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "l2_cache_req_stat.ic_fill_hit_x",
+    "EventCode": "0x64",
+    "BriefDescription": "Core to L2 cache requests (not including L2 prefetch) from the instruction cache that result in L2 hits on modifiable lines.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "l2_cache_req_stat.ic_hit_in_l2",
+    "EventCode": "0x64",
+    "BriefDescription": "Core to L2 cache requests (not including L2 prefetch) from the instruction cache that result in L2 hits.",
+    "UMask": "0x06"
+  },
+  {
+    "EventName": "l2_cache_req_stat.ic_access_in_l2",
+    "EventCode": "0x64",
+    "BriefDescription": "Core to L2 cache requests (not including L2 prefetch) from the instruction cache that result in L2 accesses.",
+    "UMask": "0x07"
+  },
+  {
+    "EventName": "l2_cache_req_stat.ls_rd_blk_c",
+    "EventCode": "0x64",
+    "BriefDescription": "Core to L2 cache requests (not including L2 prefetch) from the data cache that result in L2 misses.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "l2_cache_req_stat.ic_dc_miss_in_l2",
+    "EventCode": "0x64",
+    "BriefDescription": "Core to L2 cache requests (not including L2 prefetch) from the data cache and the instruction cache that result in L2 misses.",
+    "UMask": "0x09"
+  },
+  {
+    "EventName": "l2_cache_req_stat.ls_rd_blk_x",
+    "EventCode": "0x64",
+    "BriefDescription": "Core to L2 cache requests (not including L2 prefetch) that result in data cache stores or L2 state change hits.",
+    "UMask": "0x10"
+  },
+  {
+    "EventName": "l2_cache_req_stat.ls_rd_blk_l_hit_s",
+    "EventCode": "0x64",
+    "BriefDescription": "Core to L2 cache requests (not including L2 prefetch) from the data cache that result in L2 hits on non-modifiable lines.",
+    "UMask": "0x20"
+  },
+  {
+    "EventName": "l2_cache_req_stat.ls_rd_blk_l_hit_x",
+    "EventCode": "0x64",
+    "BriefDescription": "Core to L2 cache requests (not including L2 prefetch) from the data cache that result in L2 hits on modifiable lines.",
+    "UMask": "0x40"
+  },
+  {
+    "EventName": "l2_cache_req_stat.ls_rd_blk_cs",
+    "EventCode": "0x64",
+    "BriefDescription": "Core to L2 cache requests (not including L2 prefetch) from the data cache that result in L2 read hits on shared lines.",
+    "UMask": "0x80"
+  },
+  {
+    "EventName": "l2_cache_req_stat.dc_hit_in_l2",
+    "EventCode": "0x64",
+    "BriefDescription": "Core to L2 cache requests (not including L2 prefetch) from the data cache that result in L2 hits.",
+    "UMask": "0xf0"
+  },
+  {
+    "EventName": "l2_cache_req_stat.ic_dc_hit_in_l2",
+    "EventCode": "0x64",
+    "BriefDescription": "Core to L2 cache requests (not including L2 prefetch) from the data cache and the instruction cache that result in L2 hits.",
+    "UMask": "0xf6"
+  },
+  {
+    "EventName": "l2_cache_req_stat.dc_access_in_l2",
+    "EventCode": "0x64",
+    "BriefDescription": "Core to L2 cache requests (not including L2 prefetch) from the data cache that result in L2 accesses.",
+    "UMask": "0xf8"
+  },
+  {
+    "EventName": "l2_cache_req_stat.all",
+    "EventCode": "0x64",
+    "BriefDescription": "Core to L2 cache requests (not including L2 prefetch) from the data cache and the instruction cache that result in L2 accesses.",
+    "UMask": "0xff"
+  },
+  {
+    "EventName": "l2_pf_hit_l2.l2_hwpf",
+    "EventCode": "0x70",
+    "BriefDescription": "L2 prefetches accepted by the L2 pipeline which hit in the L2 cache and are generated from L2 hardware prefetchers.",
+    "UMask": "0x1f"
+  },
+  {
+    "EventName": "l2_pf_hit_l2.l1_dc_hwpf",
+    "EventCode": "0x70",
+    "BriefDescription": "L2 prefetches accepted by the L2 pipeline which hit in the L2 cache and are generated from L1 data hardware prefetchers.",
+    "UMask": "0xe0"
+  },
+  {
+    "EventName": "l2_pf_hit_l2.l1_dc_l2_hwpf",
+    "EventCode": "0x70",
+    "BriefDescription": "L2 prefetches accepted by the L2 pipeline which hit in the L2 cache and are generated from L1 data and L2 hardware prefetchers.",
+    "UMask": "0xff"
+  },
+  {
+    "EventName": "l2_pf_miss_l2_hit_l3.l2_hwpf",
+    "EventCode": "0x71",
+    "BriefDescription": "L2 prefetches accepted by the L2 pipeline which miss the L2 cache but hit in the L3 cache and are generated from L2 hardware prefetchers.",
+    "UMask": "0x1f"
+  },
+  {
+    "EventName": "l2_pf_miss_l2_hit_l3.l1_dc_hwpf",
+    "EventCode": "0x71",
+    "BriefDescription": "L2 prefetches accepted by the L2 pipeline which miss the L2 cache but hit in the L3 cache and are generated from L1 data hardware prefetchers.",
+    "UMask": "0xe0"
+  },
+  {
+    "EventName": "l2_pf_miss_l2_hit_l3.l1_dc_l2_hwpf",
+    "EventCode": "0x71",
+    "BriefDescription": "L2 prefetches accepted by the L2 pipeline which miss the L2 cache but hit in the L3 cache and are generated from L1 data and L2 hardware prefetchers.",
+    "UMask": "0xff"
+  },
+  {
+    "EventName": "l2_pf_miss_l2_l3.l2_hwpf",
+    "EventCode": "0x72",
+    "BriefDescription": "L2 prefetches accepted by the L2 pipeline which miss the L2 as well as the L3 caches and are generated from L2 hardware prefetchers.",
+    "UMask": "0x1f"
+  },
+  {
+    "EventName": "l2_pf_miss_l2_l3.l1_dc_hwpf",
+    "EventCode": "0x72",
+    "BriefDescription": "L2 prefetches accepted by the L2 pipeline which miss the L2 as well as the L3 caches and are generated from L1 data hardware prefetchers.",
+    "UMask": "0xe0"
+  },
+  {
+    "EventName": "l2_pf_miss_l2_l3.l1_dc_l2_hwpf",
+    "EventCode": "0x72",
+    "BriefDescription": "L2 prefetches accepted by the L2 pipeline which miss the L2 as well as the L3 caches and are generated from L1 data and L2 hardware prefetchers.",
+    "UMask": "0xff"
+  },
+  {
+    "EventName": "l2_fill_rsp_src.local_ccx",
+    "EventCode": "0x165",
+    "BriefDescription": "L2 cache fills where data is returned from L3 cache or different L2 cache in the same CCX.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "l2_fill_rsp_src.near_cache",
+    "EventCode": "0x165",
+    "BriefDescription": "L2 cache fills where data is returned from cache of another CCX in the same NUMA node.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "l2_fill_rsp_src.dram_io_near",
+    "EventCode": "0x165",
+    "BriefDescription": "L2 cache fills where data is returned from either DRAM or MMIO in the same NUMA node.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "l2_fill_rsp_src.far_cache",
+    "EventCode": "0x165",
+    "BriefDescription": "L2 cache fills where data is returned from cache of another CCX in a different NUMA node.",
+    "UMask": "0x10"
+  },
+  {
+    "EventName": "l2_fill_rsp_src.dram_io_far",
+    "EventCode": "0x165",
+    "BriefDescription": "L2 cache fills where data is returned from either DRAM or MMIO in a different NUMA node.",
+    "UMask": "0x40"
+  },
+  {
+    "EventName": "l2_fill_rsp_src.dram_io_all",
+    "EventCode": "0x165",
+    "BriefDescription": "L2 cache fills where data is returned from either DRAM or MMIO in the same or a different NUMA node.",
+    "UMask": "0x48"
+  },
+  {
+    "EventName": "l2_fill_rsp_src.far_all",
+    "EventCode": "0x165",
+    "BriefDescription": "L2 cache fills where data is returned from either cache of another CCX, DRAM or MMIO in a different NUMA node.",
+    "UMask": "0x50"
+  },
+  {
+    "EventName": "l2_fill_rsp_src.alt_mem",
+    "EventCode": "0x165",
+    "BriefDescription": "L2 cache fills where data is returned from extension memory (CXL).",
+    "UMask": "0x80"
+  },
+  {
+    "EventName": "l2_fill_rsp_src.all",
+    "EventCode": "0x165",
+    "BriefDescription": "L2 cache fills where data is returned from all types of sources.",
+    "UMask": "0xde"
+  },
+  {
+    "EventName": "l2_sys_bw.local_dram_fill",
+    "EventCode": "0x175",
+    "BriefDescription": "System bandwidth utilization for fill events that target the same NUMA node and return from DRAM in the same NUMA node.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "l2_sys_bw.remote_dram_fill",
+    "EventCode": "0x175",
+    "BriefDescription": "System bandwidth utilization for fill events that target a different NUMA node and return from DRAM in a different NUMA node.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "l2_sys_bw.nt_write",
+    "EventCode": "0x175",
+    "BriefDescription": "System bandwidth utilization for non-temporal write events that target all NUMA nodes.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "l2_sys_bw.local_scm_fill",
+    "EventCode": "0x175",
+    "BriefDescription": "System bandwidth utilization for fill events that target the same NUMA node and return from extension memory (CXL) in the same NUMA node.",
+    "UMask": "0x10"
+  },
+  {
+    "EventName": "l2_sys_bw.remote_scm_fill",
+    "EventCode": "0x175",
+    "BriefDescription": "System bandwidth utilization for fill events that target a different NUMA node and return from extension memory (CXL) in a different NUMA node.",
+    "UMask": "0x20"
+  },
+  {
+    "EventName": "l2_sys_bw.victim",
+    "EventCode": "0x175",
+    "BriefDescription": "System bandwidth utilization for cache victim events that target all NUMA nodes.",
+    "UMask": "0x40"
+  },
+  {
+    "EventName": "l2_sys_bw.all",
+    "EventCode": "0x175",
+    "BriefDescription": "System bandwidth utilization for all types of events (total utilization).",
+    "UMask": "0xff"
+  }
+]
diff --git a/tools/perf/pmu-events/arch/x86/amdzen6/load-store.json b/tools/perf/pmu-events/arch/x86/amdzen6/load-store.json
new file mode 100644
index 000000000000..4291eb59426f
--- /dev/null
+++ b/tools/perf/pmu-events/arch/x86/amdzen6/load-store.json
@@ -0,0 +1,523 @@
+[
+  {
+    "EventName": "ls_bad_status2.stli_other",
+    "EventCode": "0x24",
+    "BriefDescription": "Store-to-load conflicts (loads unable to complete due to a non-forwardable conflict with an older store).",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "ls_locks.bus_lock",
+    "EventCode": "0x25",
+    "BriefDescription": "Retired lock instructions which caused a bus lock (non-cacheable or cache-misaligned lock).",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "ls_locks.all",
+    "EventCode": "0x25",
+    "BriefDescription": "Retired lock instructions of all types.",
+    "UMask": "0x1f"
+  },
+  {
+    "EventName": "ls_ret_cl_flush",
+    "EventCode": "0x26",
+    "BriefDescription": "Retired CLFLUSH instructions."
+  },
+  {
+    "EventName": "ls_ret_cpuid",
+    "EventCode": "0x27",
+    "BriefDescription": "Retired CPUID instructions."
+  },
+  {
+    "EventName": "ls_dispatch.pure_ld",
+    "EventCode": "0x29",
+    "BriefDescription": "Memory load operations dispatched to the load-store unit.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "ls_dispatch.pure_st",
+    "EventCode": "0x29",
+    "BriefDescription": "Memory store operations dispatched to the load-store unit.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "ls_dispatch.ld_st",
+    "EventCode": "0x29",
+    "BriefDescription": "Memory load-store operations (load from and store to the same memory address) dispatched to the load-store unit.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "ls_dispatch.all",
+    "EventCode": "0x29",
+    "BriefDescription": "Memory operations dispatched to the load-store unit of all types.",
+    "UMask": "0x07"
+  },
+  {
+    "EventName": "ls_smi_rx",
+    "EventCode": "0x2b",
+    "BriefDescription": "System Management Interrupts (SMIs) received."
+  },
+  {
+    "EventName": "ls_int_taken",
+    "EventCode": "0x2c",
+    "BriefDescription": "Interrupts taken."
+  },
+  {
+    "EventName": "ls_stlf",
+    "EventCode": "0x35",
+    "BriefDescription": "Store-to-load-forward (STLF) hits."
+  },
+  {
+    "EventName": "ls_st_commit_cancel.older_st_vis_dep",
+    "EventCode": "0x37",
+    "BriefDescription": "Store commits cancelled due to an older store, that the thread was waiting on to become globally visible, was unable to become globally visible.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "ls_mab_alloc.ls",
+    "EventCode": "0x41",
+    "BriefDescription": "Miss Address Buffer (MAB) entries allocated by a Load-Store (LS) pipe for load-store allocations.",
+    "UMask": "0x07"
+  },
+  {
+    "EventName": "ls_mab_alloc.hwpf",
+    "EventCode": "0x41",
+    "BriefDescription": "Miss Address Buffer (MAB) entries allocated by a Load-Store (LS) pipe for hardware prefetcher allocations.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "ls_mab_alloc.all",
+    "EventCode": "0x41",
+    "BriefDescription": "Miss Address Buffer (MAB) entries allocated by a Load-Store (LS) pipe for all types of allocations.",
+    "UMask": "0x0f"
+  },
+  {
+    "EventName": "ls_dmnd_fills_from_sys.local_l2",
+    "EventCode": "0x43",
+    "BriefDescription": "Demand data cache fills where data is returned from local L2 cache.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "ls_dmnd_fills_from_sys.local_ccx",
+    "EventCode": "0x43",
+    "BriefDescription": "Demand data cache fills where data is returned from L3 cache or different L2 cache in the same CCX.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "ls_dmnd_fills_from_sys.local_all",
+    "EventCode": "0x43",
+    "BriefDescription": "Demand data cache fills where data is returned from local L2 cache, L3 cache or different L2 cache in the same CCX.",
+    "UMask": "0x03"
+  },
+  {
+    "EventName": "ls_dmnd_fills_from_sys.near_cache",
+    "EventCode": "0x43",
+    "BriefDescription": "Demand data cache fills where data is returned from cache of another CCX in the same NUMA node.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "ls_dmnd_fills_from_sys.dram_io_near",
+    "EventCode": "0x43",
+    "BriefDescription": "Demand data cache fills where data is returned from either DRAM or MMIO in the same NUMA node.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "ls_dmnd_fills_from_sys.far_cache",
+    "EventCode": "0x43",
+    "BriefDescription": "Demand data cache fills where data is returned from cache of another CCX in a different NUMA node.",
+    "UMask": "0x10"
+  },
+  {
+    "EventName": "ls_dmnd_fills_from_sys.remote_cache",
+    "EventCode": "0x43",
+    "BriefDescription": "Demand data cache fills where data is returned from cache of another CCX in the same or a different NUMA node.",
+    "UMask": "0x14"
+  },
+  {
+    "EventName": "ls_dmnd_fills_from_sys.dram_io_far",
+    "EventCode": "0x43",
+    "BriefDescription": "Demand data cache fills where data is returned from either DRAM or MMIO in a different NUMA node.",
+    "UMask": "0x40"
+  },
+  {
+    "EventName": "ls_dmnd_fills_from_sys.dram_io_all",
+    "EventCode": "0x43",
+    "BriefDescription": "Demand data cache fills where data is returned from either DRAM or MMIO in the same or a different NUMA node.",
+    "UMask": "0x48"
+  },
+  {
+    "EventName": "ls_dmnd_fills_from_sys.far_all",
+    "EventCode": "0x43",
+    "BriefDescription": "Demand data cache fills where data is returned from either cache of another CCX, DRAM or MMIO in a different NUMA node.",
+    "UMask": "0x50"
+  },
+  {
+    "EventName": "ls_dmnd_fills_from_sys.alt_mem",
+    "EventCode": "0x43",
+    "BriefDescription": "Demand data cache fills where data is returned from extension memory (CXL).",
+    "UMask": "0x80"
+  },
+  {
+    "EventName": "ls_dmnd_fills_from_sys.all",
+    "EventCode": "0x43",
+    "BriefDescription": "Demand data cache fills where data is returned from all types of sources.",
+    "UMask": "0xdf"
+  },
+  {
+    "EventName": "ls_any_fills_from_sys.local_l2",
+    "EventCode": "0x44",
+    "BriefDescription": "Any data cache fills where data is returned from local L2 cache.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "ls_any_fills_from_sys.local_ccx",
+    "EventCode": "0x44",
+    "BriefDescription": "Any data cache fills where data is returned from L3 cache or different L2 cache in the same CCX.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "ls_any_fills_from_sys.local_all",
+    "EventCode": "0x44",
+    "BriefDescription": "Any data cache fills where data is returned from local L2 cache, L3 cache or different L2 cache in the same CCX.",
+    "UMask": "0x03"
+  },
+  {
+    "EventName": "ls_any_fills_from_sys.near_cache",
+    "EventCode": "0x44",
+    "BriefDescription": "Any data cache fills where data is returned from cache of another CCX in the same NUMA node.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "ls_any_fills_from_sys.dram_io_near",
+    "EventCode": "0x44",
+    "BriefDescription": "Any data cache fills where data is returned from either DRAM or MMIO in the same NUMA node.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "ls_any_fills_from_sys.far_cache",
+    "EventCode": "0x44",
+    "BriefDescription": "Any data cache fills where data is returned from cache of another CCX in a different NUMA node.",
+    "UMask": "0x10"
+  },
+  {
+    "EventName": "ls_any_fills_from_sys.remote_cache",
+    "EventCode": "0x44",
+    "BriefDescription": "Any data cache fills where data is returned from cache of another CCX in the same or a different NUMA node.",
+    "UMask": "0x14"
+  },
+  {
+    "EventName": "ls_any_fills_from_sys.dram_io_far",
+    "EventCode": "0x44",
+    "BriefDescription": "Any data cache fills where data is returned from either DRAM or MMIO in a different NUMA node.",
+    "UMask": "0x40"
+  },
+  {
+    "EventName": "ls_any_fills_from_sys.dram_io_all",
+    "EventCode": "0x44",
+    "BriefDescription": "Any data cache fills where data is returned from either DRAM or MMIO in the same or a different NUMA node.",
+    "UMask": "0x48"
+  },
+  {
+    "EventName": "ls_any_fills_from_sys.far_all",
+    "EventCode": "0x44",
+    "BriefDescription": "Any data cache fills where data is returned from either cache of another CCX, DRAM or MMIO when the address was in a different NUMA node.",
+    "UMask": "0x50"
+  },
+  {
+    "EventName": "ls_any_fills_from_sys.alt_mem",
+    "EventCode": "0x44",
+    "BriefDescription": "Any data cache fills where data is returned from extension memory (CXL).",
+    "UMask": "0x80"
+  },
+  {
+    "EventName": "ls_any_fills_from_sys.all",
+    "EventCode": "0x44",
+    "BriefDescription": "Any data cache fills where data is returned from all types of data sources.",
+    "UMask": "0xff"
+  },
+  {
+    "EventName": "ls_l1_d_tlb_miss.tlb_reload_4k_l2_hit",
+    "EventCode": "0x45",
+    "BriefDescription": "L1 DTLB misses with L2 DTLB hits for 4k pages.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "ls_l1_d_tlb_miss.tlb_reload_coalesced_page_hit",
+    "EventCode": "0x45",
+    "BriefDescription": "L1 DTLB misses with L2 DTLB hits for coalesced pages (16k pages created from four adjacent 4k pages).",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "ls_l1_d_tlb_miss.tlb_reload_2m_l2_hit",
+    "EventCode": "0x45",
+    "BriefDescription": "L1 DTLB misses with L2 DTLB hits for 2M pages.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "ls_l1_d_tlb_miss.tlb_reload_1g_l2_hit",
+    "EventCode": "0x45",
+    "BriefDescription": "L1 DTLB misses with L2 DTLB hits for 1G pages.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "ls_l1_d_tlb_miss.tlb_reload_4k_l2_miss",
+    "EventCode": "0x45",
+    "BriefDescription": "L1 DTLB misses with L2 DTLB misses (page-table walks requested) for 4k pages.",
+    "UMask": "0x10"
+  },
+  {
+    "EventName": "ls_l1_d_tlb_miss.tlb_reload_coalesced_page_miss",
+    "EventCode": "0x45",
+    "BriefDescription": "L1 DTLB misses with L2 DTLB misses (page-table walks requested) for coalesced pages (16k pages created from four adjacent 4k pages).",
+    "UMask": "0x20"
+  },
+  {
+    "EventName": "ls_l1_d_tlb_miss.tlb_reload_2m_l2_miss",
+    "EventCode": "0x45",
+    "BriefDescription": "L1 DTLB misses with L2 DTLB misses (page-table walks requested) for 2M pages.",
+    "UMask": "0x40"
+  },
+  {
+    "EventName": "ls_l1_d_tlb_miss.tlb_reload_1g_l2_miss",
+    "EventCode": "0x45",
+    "BriefDescription": "L1 DTLB misses with L2 DTLB misses (page-table walks requested) for 1G pages.",
+    "UMask": "0x80"
+  },
+  {
+    "EventName": "ls_l1_d_tlb_miss.l2_miss_all",
+    "EventCode": "0x45",
+    "BriefDescription": "L1 DTLB misses with L2 DTLB misses (page-table walks requested) for all page sizes.",
+    "UMask": "0xf0"
+  },
+  {
+    "EventName": "ls_l1_d_tlb_miss.all",
+    "EventCode": "0x45",
+    "BriefDescription": "L1 DTLB misses for all page sizes.",
+    "UMask": "0xff"
+  },
+  {
+    "EventName": "ls_misal_loads.ma64",
+    "EventCode": "0x47",
+    "BriefDescription": "64B misaligned (cacheline crossing) loads.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "ls_misal_loads.ma4k",
+    "EventCode": "0x47",
+    "BriefDescription": "4kB misaligned (page crossing) loads.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "ls_pref_instr_disp.prefetch",
+    "EventCode": "0x4b",
+    "BriefDescription": "Software prefetch instructions dispatched (speculative) of type PrefetchT0 (move data to all cache levels), T1 (move data to all cache levels except L1) and T2 (move data to all cache levels except L1 and L2).",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "ls_pref_instr_disp.prefetch_w",
+    "EventCode": "0x4b",
+    "BriefDescription": "Software prefetch instructions dispatched (speculative) of type PrefetchW (move data to L1 cache and mark it modifiable).",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "ls_pref_instr_disp.prefetch_nta",
+    "EventCode": "0x4b",
+    "BriefDescription": "Software prefetch instructions dispatched (speculative) of type PrefetchNTA (move data with minimum cache pollution i.e. non-temporal access).",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "ls_pref_instr_disp.all",
+    "EventCode": "0x4b",
+    "BriefDescription": "Software prefetch instructions dispatched (speculative) of all types.",
+    "UMask": "0x07"
+  },
+  {
+    "EventName": "wcb_close.full_line_64b",
+    "EventCode": "0x50",
+    "BriefDescription": "Events that caused a Write Combining Buffer (WCB) entry to close because all 64 bytes of the entry have been written to.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "ls_inef_sw_pref.dc_hit",
+    "EventCode": "0x52",
+    "BriefDescription": "Software prefetches that did not fetch data outside of the processor core as the PREFETCH instruction saw a data cache hit.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "ls_inef_sw_pref.mab_hit",
+    "EventCode": "0x52",
+    "BriefDescription": "Software prefetches that did not fetch data outside of the processor core as the PREFETCH instruction saw a match on an already allocated miss request (MAB).",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "ls_inef_sw_pref.all",
+    "EventCode": "0x52",
+    "BriefDescript6ion": "Software prefetches that did not fetch data outside of the processor core for any reason.",
+    "UMask": "0x03"
+  },
+  {
+    "EventName": "ls_sw_pf_dc_fills.local_l2",
+    "EventCode": "0x59",
+    "BriefDescription": "Software prefetch data cache fills where data is returned from local L2 cache.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "ls_sw_pf_dc_fills.local_ccx",
+    "EventCode": "0x59",
+    "BriefDescription": "Software prefetch data cache fills where data is returned from L3 cache or different L2 cache in the same CCX.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "ls_sw_pf_dc_fills.local_all",
+    "EventCode": "0x59",
+    "BriefDescription": "Software prefetch data cache fills where data is returned from local L2 cache, L3 cache or different L2 cache in the same CCX.",
+    "UMask": "0x03"
+  },
+  {
+    "EventName": "ls_sw_pf_dc_fills.near_cache",
+    "EventCode": "0x59",
+    "BriefDescription": "Software prefetch data cache fills where data is returned from cache of another CCX in the same NUMA node.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "ls_sw_pf_dc_fills.dram_io_near",
+    "EventCode": "0x59",
+    "BriefDescription": "Software prefetch data cache fills where data is returned from either DRAM or MMIO in the same NUMA node.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "ls_sw_pf_dc_fills.far_cache",
+    "EventCode": "0x59",
+    "BriefDescription": "Software prefetch data cache fills where data is returned from cache of another CCX in a different NUMA node.",
+    "UMask": "0x10"
+  },
+  {
+    "EventName": "ls_sw_pf_dc_fills.remote_cache",
+    "EventCode": "0x59",
+    "BriefDescription": "Software prefetch data cache fills where data is returned from cache of another CCX in the same or a different NUMA node.",
+    "UMask": "0x14"
+  },
+  {
+    "EventName": "ls_sw_pf_dc_fills.dram_io_far",
+    "EventCode": "0x59",
+    "BriefDescription": "Software prefetch data cache fills where data is returned from either DRAM or MMIO in a different NUMA node.",
+    "UMask": "0x40"
+  },
+  {
+    "EventName": "ls_sw_pf_dc_fills.dram_io_all",
+    "EventCode": "0x59",
+    "BriefDescription": "Software prefetch data cache fills where data is returned from either DRAM or MMIO in the same or a different NUMA node.",
+    "UMask": "0x48"
+  },
+  {
+    "EventName": "ls_sw_pf_dc_fills.far_all",
+    "EventCode": "0x59",
+    "BriefDescription": "Software prefetch data cache fills where data is returned from either cache of another CCX, DRAM or MMIO in a different NUMA node.",
+    "UMask": "0x50"
+  },
+  {
+    "EventName": "ls_sw_pf_dc_fills.alt_mem",
+    "EventCode": "0x59",
+    "BriefDescription": "Software prefetch data cache fills where data is returned from extension memory (CXL).",
+    "UMask": "0x80"
+  },
+  {
+    "EventName": "ls_sw_pf_dc_fills.all",
+    "EventCode": "0x59",
+    "BriefDescription": "Software prefetch data cache fills where data is returned from all types of data sources.",
+    "UMask": "0xdf"
+  },
+  {
+    "EventName": "ls_hw_pf_dc_fills.local_l2",
+    "EventCode": "0x5a",
+    "BriefDescription": "Hardware prefetch data cache fills where data is returned from local L2 cache.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "ls_hw_pf_dc_fills.local_ccx",
+    "EventCode": "0x5a",
+    "BriefDescription": "Hardware prefetch data cache fills where data is returned from L3 cache or different L2 cache in the same CCX.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "ls_hw_pf_dc_fills.local_all",
+    "EventCode": "0x5a",
+    "BriefDescription": "Hardware prefetch data cache fills where data is returned from local L2 cache, L3 cache or different L2 cache in the same CCX.",
+    "UMask": "0x03"
+  },
+  {
+    "EventName": "ls_hw_pf_dc_fills.near_cache",
+    "EventCode": "0x5a",
+    "BriefDescription": "Hardware prefetch data cache fills where data is returned from cache of another CCX in the same NUMA node.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "ls_hw_pf_dc_fills.dram_io_near",
+    "EventCode": "0x5a",
+    "BriefDescription": "Hardware prefetch data cache fills where data is returned from either DRAM or MMIO in the same NUMA node.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "ls_hw_pf_dc_fills.far_cache",
+    "EventCode": "0x5a",
+    "BriefDescription": "Hardware prefetch data cache fills where data is returned from cache of another CCX in a different NUMA node.",
+    "UMask": "0x10"
+  },
+  {
+    "EventName": "ls_hw_pf_dc_fills.remote_cache",
+    "EventCode": "0x5a",
+    "BriefDescription": "Hardware prefetch data cache fills where data is returned from cache of another CCX in the same or a different NUMA node.",
+    "UMask": "0x14"
+  },
+  {
+    "EventName": "ls_hw_pf_dc_fills.dram_io_far",
+    "EventCode": "0x5a",
+    "BriefDescription": "Hardware prefetch data cache fills where data is returned from either DRAM or MMIO in a different NUMA node.",
+    "UMask": "0x40"
+  },
+  {
+    "EventName": "ls_hw_pf_dc_fills.dram_io_all",
+    "EventCode": "0x5a",
+    "BriefDescription": "Hardware prefetch data cache fills where data is returned from either DRAM or MMIO in the same or a different NUMA node.",
+    "UMask": "0x48"
+  },
+  {
+    "EventName": "ls_hw_pf_dc_fills.far_all",
+    "EventCode": "0x5a",
+    "BriefDescription": "Hardware prefetch data cache fills where data is returned from either cache of another CCX, DRAM or MMIO in a different NUMA node.",
+    "UMask": "0x50"
+  },
+  {
+    "EventName": "ls_hw_pf_dc_fills.alt_mem",
+    "EventCode": "0x5a",
+    "BriefDescription": "Hardware prefetch data cache fills where data is returned from extension memory (CXL).",
+    "UMask": "0x80"
+  },
+  {
+    "EventName": "ls_hw_pf_dc_fills.all",
+    "EventCode": "0x5a",
+    "BriefDescription": "Hardware prefetch data cache fills where data is returned from all types of data sources.",
+    "UMask": "0xdf"
+  },
+  {
+    "EventName": "ls_alloc_mab_count",
+    "EventCode": "0x5f",
+    "BriefDescription": "In-flight L1 data cache misses i.e. Miss Address Buffer (MAB) allocations each cycle."
+  },
+  {
+    "EventName": "ls_not_halted_cyc",
+    "EventCode": "0x76",
+    "BriefDescription": "Core cycles where the thread is not in halted state."
+  },
+  {
+    "EventName": "ls_tlb_flush.all",
+    "EventCode": "0x78",
+    "BriefDescription": "All TLB flushes.",
+    "UMask": "0xff"
+  },
+  {
+    "EventName": "ls_not_halted_p0_cyc.p0_freq_cyc",
+    "EventCode": "0x120",
+    "BriefDescription": "Reference cycles (P0 frequency) where the thread is not in halted state.",
+    "UMask": "0x1"
+  }
+]
-- 
cgit v1.2.3


From de18394f8f69e4cb86e1561f3dd86e9f724b8f25 Mon Sep 17 00:00:00 2001
From: Sandipan Das <sandipan.das@amd.com>
Date: Thu, 8 Jan 2026 13:22:15 +0530
Subject: perf vendor events amd: Add Zen 6 uncore events

Add uncore events taken from Section 1.6 "L3 Cache Performance Monitor
Counters" and Section 2.2 "UMC Performance Monitor Events" of the
Performance Monitor Counters for AMD Family 1Ah Model 50h-57h Processors
document available at the link below.

This constitutes events which capture L3 cache and UMC command activity.

Reviewed-by: Ian Rogers <irogers@google.com>
Signed-off-by: Sandipan Das <sandipan.das@amd.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Ananth Narayan <ananth.narayan@amd.com>
Cc: Caleb Biggers <caleb.biggers@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ravi Bangoria <ravi.bangoria@amd.com>
Cc: Stephane Eranian <eranian@google.com>
Link: https://bugzilla.kernel.org/attachment.cgi?id=309149
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 .../perf/pmu-events/arch/x86/amdzen6/l3-cache.json | 177 +++++++++++++++++++++
 .../arch/x86/amdzen6/memory-controller.json        | 101 ++++++++++++
 2 files changed, 278 insertions(+)
 create mode 100644 tools/perf/pmu-events/arch/x86/amdzen6/l3-cache.json
 create mode 100644 tools/perf/pmu-events/arch/x86/amdzen6/memory-controller.json

(limited to 'tools')

diff --git a/tools/perf/pmu-events/arch/x86/amdzen6/l3-cache.json b/tools/perf/pmu-events/arch/x86/amdzen6/l3-cache.json
new file mode 100644
index 000000000000..9b9804317da7
--- /dev/null
+++ b/tools/perf/pmu-events/arch/x86/amdzen6/l3-cache.json
@@ -0,0 +1,177 @@
+[
+  {
+    "EventName": "l3_lookup_state.l3_miss",
+    "EventCode": "0x04",
+    "BriefDescription": "L3 cache misses.",
+    "UMask": "0x01",
+    "Unit": "L3PMC"
+  },
+  {
+    "EventName": "l3_lookup_state.l3_hit",
+    "EventCode": "0x04",
+    "BriefDescription": "L3 cache hits.",
+    "UMask": "0xfe",
+    "Unit": "L3PMC"
+  },
+  {
+    "EventName": "l3_lookup_state.all_coherent_accesses_to_l3",
+    "EventCode": "0x04",
+    "BriefDescription": "L3 cache requests for all coherent accesses.",
+    "UMask": "0xff",
+    "Unit": "L3PMC"
+  },
+  {
+    "EventName": "l3_xi_sampled_latency.dram_near",
+    "EventCode": "0xac",
+    "BriefDescription": "Average sampled latency for L3 requests where data is returned from DRAM in the same NUMA node.",
+    "UMask": "0x01",
+    "EnAllCores": "0x1",
+    "EnAllSlices": "0x1",
+    "SliceId": "0x3",
+    "ThreadMask": "0x3",
+    "Unit": "L3PMC"
+  },
+  {
+    "EventName": "l3_xi_sampled_latency.dram_far",
+    "EventCode": "0xac",
+    "BriefDescription": "Average sampled latency for L3 requests where data is returned from DRAM in a different NUMA node.",
+    "UMask": "0x02",
+    "EnAllCores": "0x1",
+    "EnAllSlices": "0x1",
+    "SliceId": "0x3",
+    "ThreadMask": "0x3",
+    "Unit": "L3PMC"
+  },
+  {
+    "EventName": "l3_xi_sampled_latency.near_cache",
+    "EventCode": "0xac",
+    "BriefDescription": "Average sampled latency for L3 requests where data is returned from cache of another CCX in the same NUMA node.",
+    "UMask": "0x04",
+    "EnAllCores": "0x1",
+    "EnAllSlices": "0x1",
+    "SliceId": "0x3",
+    "ThreadMask": "0x3",
+    "Unit": "L3PMC"
+  },
+  {
+    "EventName": "l3_xi_sampled_latency.far_cache",
+    "EventCode": "0xac",
+    "BriefDescription": "Average sampled latency for L3 requests where data is returned from cache of another CCX in a different NUMA node.",
+    "UMask": "0x08",
+    "EnAllCores": "0x1",
+    "EnAllSlices": "0x1",
+    "SliceId": "0x3",
+    "ThreadMask": "0x3",
+    "Unit": "L3PMC"
+  },
+  {
+    "EventName": "l3_xi_sampled_latency.ext_near",
+    "EventCode": "0xac",
+    "BriefDescription": "Average sampled latency for L3 requests where data is returned from extension memory (CXL) in the same NUMA node.",
+    "UMask": "0x10",
+    "EnAllCores": "0x1",
+    "EnAllSlices": "0x1",
+    "SliceId": "0x3",
+    "ThreadMask": "0x3",
+    "Unit": "L3PMC"
+  },
+  {
+    "EventName": "l3_xi_sampled_latency.ext_far",
+    "EventCode": "0xac",
+    "BriefDescription": "Average sampled latency for L3 requests where data is returned from extension memory (CXL) in a different NUMA node.",
+    "UMask": "0x20",
+    "EnAllCores": "0x1",
+    "EnAllSlices": "0x1",
+    "SliceId": "0x3",
+    "ThreadMask": "0x3",
+    "Unit": "L3PMC"
+  },
+  {
+    "EventName": "l3_xi_sampled_latency.all",
+    "EventCode": "0xac",
+    "BriefDescription": "Average sampled latency for L3 requests where data is returned from all types of sources.",
+    "UMask": "0x3f",
+    "EnAllCores": "0x1",
+    "EnAllSlices": "0x1",
+    "SliceId": "0x3",
+    "ThreadMask": "0x3",
+    "Unit": "L3PMC"
+  },
+  {
+    "EventName": "l3_xi_sampled_latency_requests.dram_near",
+    "EventCode": "0xad",
+    "BriefDescription": "Average sampled L3 requests where data is returned from DRAM in the same NUMA node.",
+    "UMask": "0x01",
+    "EnAllCores": "0x1",
+    "EnAllSlices": "0x1",
+    "SliceId": "0x3",
+    "ThreadMask": "0x3",
+    "Unit": "L3PMC"
+  },
+  {
+    "EventName": "l3_xi_sampled_latency_requests.dram_far",
+    "EventCode": "0xad",
+    "BriefDescription": "Average sampled L3 requests where data is returned from DRAM in a different NUMA node.",
+    "UMask": "0x02",
+    "EnAllCores": "0x1",
+    "EnAllSlices": "0x1",
+    "SliceId": "0x3",
+    "ThreadMask": "0x3",
+    "Unit": "L3PMC"
+  },
+  {
+    "EventName": "l3_xi_sampled_latency_requests.near_cache",
+    "EventCode": "0xad",
+    "BriefDescription": "Average sampled L3 requests where data is returned from cache of another CCX in the same NUMA node.",
+    "UMask": "0x04",
+    "EnAllCores": "0x1",
+    "EnAllSlices": "0x1",
+    "SliceId": "0x3",
+    "ThreadMask": "0x3",
+    "Unit": "L3PMC"
+  },
+  {
+    "EventName": "l3_xi_sampled_latency_requests.far_cache",
+    "EventCode": "0xad",
+    "BriefDescription": "Average sampled L3 requests where data is returned from cache of another CCX in a different NUMA node.",
+    "UMask": "0x08",
+    "EnAllCores": "0x1",
+    "EnAllSlices": "0x1",
+    "SliceId": "0x3",
+    "ThreadMask": "0x3",
+    "Unit": "L3PMC"
+  },
+  {
+    "EventName": "l3_xi_sampled_latency_requests.ext_near",
+    "EventCode": "0xad",
+    "BriefDescription": "Average sampled L3 requests where data is returned from extension memory (CXL) in the same NUMA node.",
+    "UMask": "0x10",
+    "EnAllCores": "0x1",
+    "EnAllSlices": "0x1",
+    "SliceId": "0x3",
+    "ThreadMask": "0x3",
+    "Unit": "L3PMC"
+  },
+  {
+    "EventName": "l3_xi_sampled_latency_requests.ext_far",
+    "EventCode": "0xad",
+    "BriefDescription": "Average sampled L3 requests where data is returned from extension memory (CXL) in a different NUMA node.",
+    "UMask": "0x20",
+    "EnAllCores": "0x1",
+    "EnAllSlices": "0x1",
+    "SliceId": "0x3",
+    "ThreadMask": "0x3",
+    "Unit": "L3PMC"
+  },
+  {
+    "EventName": "l3_xi_sampled_latency_requests.all",
+    "EventCode": "0xad",
+    "BriefDescription": "Average sampled L3 requests where data is returned from all types of sources.",
+    "UMask": "0x3f",
+    "EnAllCores": "0x1",
+    "EnAllSlices": "0x1",
+    "SliceId": "0x3",
+    "ThreadMask": "0x3",
+    "Unit": "L3PMC"
+  }
+]
diff --git a/tools/perf/pmu-events/arch/x86/amdzen6/memory-controller.json b/tools/perf/pmu-events/arch/x86/amdzen6/memory-controller.json
new file mode 100644
index 000000000000..649a60b09e1b
--- /dev/null
+++ b/tools/perf/pmu-events/arch/x86/amdzen6/memory-controller.json
@@ -0,0 +1,101 @@
+[
+  {
+    "EventName": "umc_mem_clk",
+    "PublicDescription": "Memory clock (MEMCLK) cycles.",
+    "EventCode": "0x00",
+    "PerPkg": "1",
+    "Unit": "UMCPMC"
+  },
+  {
+    "EventName": "umc_act_cmd.all",
+    "PublicDescription": "ACTIVATE commands sent.",
+    "EventCode": "0x05",
+    "PerPkg": "1",
+    "Unit": "UMCPMC"
+  },
+  {
+    "EventName": "umc_act_cmd.rd",
+    "PublicDescription": "ACTIVATE commands sent for reads.",
+    "EventCode": "0x05",
+    "RdWrMask": "0x1",
+    "PerPkg": "1",
+    "Unit": "UMCPMC"
+  },
+  {
+    "EventName": "umc_act_cmd.wr",
+    "PublicDescription": "ACTIVATE commands sent for writes.",
+    "EventCode": "0x05",
+    "RdWrMask": "0x2",
+    "PerPkg": "1",
+    "Unit": "UMCPMC"
+  },
+  {
+    "EventName": "umc_pchg_cmd.all",
+    "PublicDescription": "PRECHARGE commands sent.",
+    "EventCode": "0x06",
+    "PerPkg": "1",
+    "Unit": "UMCPMC"
+  },
+  {
+    "EventName": "umc_pchg_cmd.rd",
+    "PublicDescription": "PRECHARGE commands sent for reads.",
+    "EventCode": "0x06",
+    "RdWrMask": "0x1",
+    "PerPkg": "1",
+    "Unit": "UMCPMC"
+  },
+  {
+    "EventName": "umc_pchg_cmd.wr",
+    "PublicDescription": "PRECHARGE commands sent for writes.",
+    "EventCode": "0x06",
+    "RdWrMask": "0x2",
+    "PerPkg": "1",
+    "Unit": "UMCPMC"
+  },
+  {
+    "EventName": "umc_cas_cmd.all",
+    "PublicDescription": "CAS commands sent.",
+    "EventCode": "0x0a",
+    "PerPkg": "1",
+    "Unit": "UMCPMC"
+  },
+  {
+    "EventName": "umc_cas_cmd.rd",
+    "PublicDescription": "CAS commands sent for reads.",
+    "EventCode": "0x0a",
+    "RdWrMask": "0x1",
+    "PerPkg": "1",
+    "Unit": "UMCPMC"
+  },
+  {
+    "EventName": "umc_cas_cmd.wr",
+    "PublicDescription": "CAS commands sent for writes.",
+    "EventCode": "0x0a",
+    "RdWrMask": "0x2",
+    "PerPkg": "1",
+    "Unit": "UMCPMC"
+  },
+  {
+    "EventName": "umc_data_slot_clks.all",
+    "PublicDescription": "Clock cycles where the data bus is utilized.",
+    "EventCode": "0x14",
+    "PerPkg": "1",
+    "Unit": "UMCPMC"
+  },
+  {
+    "EventName": "umc_data_slot_clks.rd",
+    "PublicDescription": "Clock cycles where the data bus is utilized for reads.",
+    "EventCode": "0x14",
+    "RdWrMask": "0x1",
+    "PerPkg": "1",
+    "Unit": "UMCPMC"
+  },
+  {
+    "EventName": "umc_data_slot_clks.wr",
+    "PublicDescription": "Clock cycles where the data bus is utilized for writes.",
+    "EventCode": "0x14",
+    "RdWrMask": "0x2",
+    "PerPkg": "1",
+    "Unit": "UMCPMC"
+  }
+]
-- 
cgit v1.2.3


From d0a3df886d777180322a254176c40fd4a4a23cbe Mon Sep 17 00:00:00 2001
From: Sandipan Das <sandipan.das@amd.com>
Date: Thu, 8 Jan 2026 13:22:16 +0530
Subject: perf vendor events amd: Add Zen 6 metrics

Add metrics taken from Section 1.2 "Performance Measurement" of the
Performance Monitor Counters for AMD Family 1Ah Model 50h-57h Processors
document available at the link below.

The recommended metrics are sourced from Table 1 "Guidance for Common
Performance Statistics with Complex Event Selects".

The pipeline utilization metrics are sourced from Table 2 "Guidance
for Pipeline Utilization Analysis Statistics". These are useful for
finding performance bottlenecks by analyzing activity at different
stages of the pipeline. There are metric groups available for Level 1
and Level 2 analysis.

Reviewed-by: Ian Rogers <irogers@google.com>
Signed-off-by: Sandipan Das <sandipan.das@amd.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Ananth Narayan <ananth.narayan@amd.com>
Cc: Caleb Biggers <caleb.biggers@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ravi Bangoria <ravi.bangoria@amd.com>
Cc: Stephane Eranian <eranian@google.com>
Link: https://bugzilla.kernel.org/attachment.cgi?id=309149
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 .../perf/pmu-events/arch/x86/amdzen6/pipeline.json |  99 ++++++
 .../pmu-events/arch/x86/amdzen6/recommended.json   | 339 +++++++++++++++++++++
 2 files changed, 438 insertions(+)
 create mode 100644 tools/perf/pmu-events/arch/x86/amdzen6/pipeline.json
 create mode 100644 tools/perf/pmu-events/arch/x86/amdzen6/recommended.json

(limited to 'tools')

diff --git a/tools/perf/pmu-events/arch/x86/amdzen6/pipeline.json b/tools/perf/pmu-events/arch/x86/amdzen6/pipeline.json
new file mode 100644
index 000000000000..48c501d8a097
--- /dev/null
+++ b/tools/perf/pmu-events/arch/x86/amdzen6/pipeline.json
@@ -0,0 +1,99 @@
+[
+  {
+    "MetricName": "total_dispatch_slots",
+    "BriefDescription": "Total dispatch slots (up to 8 instructions can be dispatched in each cycle).",
+    "MetricExpr": "8 * ls_not_halted_cyc",
+    "ScaleUnit": "1slots"
+  },
+  {
+    "MetricName": "frontend_bound",
+    "BriefDescription": "Percentage of dispatch slots that remained unused because the frontend did not supply enough instructions/ops.",
+    "MetricExpr": "d_ratio(de_no_dispatch_per_slot.no_ops_from_frontend, total_dispatch_slots)",
+    "MetricGroup": "PipelineL1",
+    "ScaleUnit": "100%slots"
+  },
+  {
+    "MetricName": "bad_speculation",
+    "BriefDescription": "Percentage of dispatched ops that did not retire.",
+    "MetricExpr": "d_ratio(de_src_op_disp.all - ex_ret_ops, total_dispatch_slots)",
+    "MetricGroup": "PipelineL1",
+    "ScaleUnit": "100%ops"
+  },
+  {
+    "MetricName": "backend_bound",
+    "BriefDescription": "Percentage of dispatch slots that remained unused because of backend stalls.",
+    "MetricExpr": "d_ratio(de_no_dispatch_per_slot.backend_stalls, total_dispatch_slots)",
+    "MetricGroup": "PipelineL1",
+    "ScaleUnit": "100%slots"
+  },
+  {
+    "MetricName": "smt_contention",
+    "BriefDescription": "Percentage of dispatch slots that remained unused because the other thread was selected.",
+    "MetricExpr": "d_ratio(de_no_dispatch_per_slot.smt_contention, total_dispatch_slots)",
+    "MetricGroup": "PipelineL1",
+    "ScaleUnit": "100%slots"
+  },
+  {
+    "MetricName": "retiring",
+    "BriefDescription": "Percentage of dispatch slots used by ops that retired.",
+    "MetricExpr": "d_ratio(ex_ret_ops, total_dispatch_slots)",
+    "MetricGroup": "PipelineL1",
+    "ScaleUnit": "100%slots"
+  },
+  {
+    "MetricName": "frontend_bound_by_latency",
+    "BriefDescription": "Percentage of dispatch slots that remained unused because of a latency bottleneck in the frontend (such as instruction cache or TLB misses).",
+    "MetricExpr": "d_ratio((8 * cpu@de_no_dispatch_per_slot.no_ops_from_frontend\\,cmask\\=0x8@), total_dispatch_slots)",
+    "MetricGroup": "PipelineL2;frontend_bound_group",
+    "ScaleUnit": "100%slots"
+  },
+  {
+    "MetricName": "frontend_bound_by_bandwidth",
+    "BriefDescription": "Percentage of dispatch slots that remained unused because of a bandwidth bottleneck in the frontend (such as decode or op cache fetch bandwidth).",
+    "MetricExpr": "d_ratio(de_no_dispatch_per_slot.no_ops_from_frontend - (8 * cpu@de_no_dispatch_per_slot.no_ops_from_frontend\\,cmask\\=0x8@), total_dispatch_slots)",
+    "MetricGroup": "PipelineL2;frontend_bound_group",
+    "ScaleUnit": "100%slots"
+  },
+  {
+    "MetricName": "bad_speculation_from_mispredicts",
+    "BriefDescription": "Percentage of dispatched ops that were flushed due to branch mispredicts.",
+    "MetricExpr": "d_ratio(bad_speculation * ex_ret_brn_misp, ex_ret_brn_misp + bp_fe_redir.resync)",
+    "MetricGroup": "PipelineL2;bad_speculation_group",
+    "ScaleUnit": "100%ops"
+  },
+  {
+    "MetricName": "bad_speculation_from_pipeline_restarts",
+    "BriefDescription": "Percentage of dispatched ops that were flushed due to pipeline restarts (resyncs).",
+    "MetricExpr": "d_ratio(bad_speculation * bp_fe_redir.resync, ex_ret_brn_misp + bp_fe_redir.resync)",
+    "MetricGroup": "PipelineL2;bad_speculation_group",
+    "ScaleUnit": "100%ops"
+  },
+  {
+    "MetricName": "backend_bound_by_memory",
+    "BriefDescription": "Percentage of dispatch slots that remained unused because of stalls due to the memory subsystem.",
+    "MetricExpr": "backend_bound * d_ratio(ex_no_retire.load_not_complete, ex_no_retire.not_complete)",
+    "MetricGroup": "PipelineL2;backend_bound_group",
+    "ScaleUnit": "100%slots"
+  },
+  {
+    "MetricName": "backend_bound_by_cpu",
+    "BriefDescription": "Percentage of dispatch slots that remained unused because of stalls not related to the memory subsystem.",
+    "MetricExpr": "backend_bound * (1 - d_ratio(ex_no_retire.load_not_complete, ex_no_retire.not_complete))",
+    "MetricGroup": "PipelineL2;backend_bound_group",
+    "ScaleUnit": "100%slots"
+  },
+  {
+    "MetricName": "retiring_from_fastpath",
+    "BriefDescription": "Percentage of dispatch slots used by fastpath ops that retired.",
+    "MetricExpr": "retiring * (1 - d_ratio(ex_ret_ucode_ops, ex_ret_ops))",
+    "MetricGroup": "PipelineL2;retiring_group",
+    "ScaleUnit": "100%slots"
+  },
+  {
+    "MetricName": "retiring_from_microcode",
+    "BriefDescription": "Percentage of dispatch slots used by microcode ops that retired.",
+    "MetricExpr": "retiring * d_ratio(ex_ret_ucode_ops, ex_ret_ops)",
+    "MetricGroup": "PipelineL2;retiring_group",
+    "ScaleUnit": "100%slots"
+  }
+]
diff --git a/tools/perf/pmu-events/arch/x86/amdzen6/recommended.json b/tools/perf/pmu-events/arch/x86/amdzen6/recommended.json
new file mode 100644
index 000000000000..2849a8c159f6
--- /dev/null
+++ b/tools/perf/pmu-events/arch/x86/amdzen6/recommended.json
@@ -0,0 +1,339 @@
+[
+  {
+    "MetricName": "branch_misprediction_rate",
+    "BriefDescription": "Execution-time branch misprediction rate (non-speculative).",
+    "MetricExpr": "d_ratio(ex_ret_brn_misp, ex_ret_brn)",
+    "MetricGroup": "branch_prediction",
+    "ScaleUnit": "1per_branch"
+  },
+  {
+    "MetricName": "all_data_cache_accesses_pti",
+    "BriefDescription": "All data cache accesses per thousand instructions.",
+    "MetricExpr": "ls_dispatch.all / instructions",
+    "MetricGroup": "l1_dcache",
+    "ScaleUnit": "1e3per_1k_instr"
+  },
+  {
+    "MetricName": "all_l2_cache_accesses_pti",
+    "BriefDescription": "All L2 cache accesses per thousand instructions.",
+    "MetricExpr": "(l2_request_g1.no_pf_all + l2_pf_hit_l2.l2_hwpf + l2_pf_miss_l2_hit_l3.l2_hwpf + l2_pf_miss_l2_l3.l2_hwpf) / instructions",
+    "MetricGroup": "l2_cache",
+    "ScaleUnit": "1e3per_1k_instr"
+  },
+  {
+    "MetricName": "l2_cache_accesses_from_l1_ic_misses_pti",
+    "BriefDescription": "L2 cache accesses from L1 instruction cache misses (including prefetch) per thousand instructions.",
+    "MetricExpr": "l2_request_g1.cacheable_ic_read / instructions",
+    "MetricGroup": "l2_cache",
+    "ScaleUnit": "1e3per_1k_instr"
+  },
+  {
+    "MetricName": "l2_cache_accesses_from_l1_dc_misses_pti",
+    "BriefDescription": "L2 cache accesses from L1 data cache misses (including prefetch) per thousand instructions.",
+    "MetricExpr": "l2_request_g1.dc_all / instructions",
+    "MetricGroup": "l2_cache",
+    "ScaleUnit": "1e3per_1k_instr"
+  },
+  {
+    "MetricName": "l2_cache_accesses_from_l2_hwpf_pti",
+    "BriefDescription": "L2 cache accesses from L2 cache hardware prefetcher per thousand instructions.",
+    "MetricExpr": "(l2_pf_hit_l2.l1_dc_l2_hwpf + l2_pf_miss_l2_hit_l3.l1_dc_l2_hwpf + l2_pf_miss_l2_l3.l1_dc_l2_hwpf) / instructions",
+    "MetricGroup": "l2_cache",
+    "ScaleUnit": "1e3per_1k_instr"
+  },
+  {
+    "MetricName": "all_l2_cache_misses_pti",
+    "BriefDescription": "All L2 cache misses per thousand instructions.",
+    "MetricExpr": "(l2_cache_req_stat.ic_dc_miss_in_l2 + l2_pf_miss_l2_hit_l3.l2_hwpf + l2_pf_miss_l2_l3.l2_hwpf) / instructions",
+    "MetricGroup": "l2_cache",
+    "ScaleUnit": "1e3per_1k_instr"
+  },
+  {
+    "MetricName": "l2_cache_misses_from_l1_ic_miss_pti",
+    "BriefDescription": "L2 cache misses from L1 instruction cache misses per thousand instructions.",
+    "MetricExpr": "l2_cache_req_stat.ic_fill_miss / instructions",
+    "MetricGroup": "l2_cache",
+    "ScaleUnit": "1e3per_1k_instr"
+  },
+  {
+    "MetricName": "l2_cache_misses_from_l1_dc_miss_pti",
+    "BriefDescription": "L2 cache misses from L1 data cache misses per thousand instructions.",
+    "MetricExpr": "l2_cache_req_stat.ls_rd_blk_c / instructions",
+    "MetricGroup": "l2_cache",
+    "ScaleUnit": "1e3per_1k_instr"
+  },
+  {
+    "MetricName": "l2_cache_misses_from_l2_hwpf_pti",
+    "BriefDescription": "L2 cache misses from L2 cache hardware prefetcher per thousand instructions.",
+    "MetricExpr": "(l2_pf_miss_l2_hit_l3.l1_dc_l2_hwpf + l2_pf_miss_l2_l3.l1_dc_l2_hwpf) / instructions",
+    "MetricGroup": "l2_cache",
+    "ScaleUnit": "1e3per_1k_instr"
+  },
+  {
+    "MetricName": "all_l2_cache_hits_pti",
+    "BriefDescription": "All L2 cache hits per thousand instructions.",
+    "MetricExpr": "(l2_cache_req_stat.ic_dc_hit_in_l2 + l2_pf_hit_l2.l2_hwpf) / instructions",
+    "MetricGroup": "l2_cache",
+    "ScaleUnit": "1e3per_1k_instr"
+  },
+  {
+    "MetricName": "l2_cache_hits_from_l1_ic_miss_pti",
+    "BriefDescription": "L2 cache hits from L1 instruction cache misses per thousand instructions.",
+    "MetricExpr": "l2_cache_req_stat.ic_hit_in_l2 / instructions",
+    "MetricGroup": "l2_cache",
+    "ScaleUnit": "1e3per_1k_instr"
+  },
+  {
+    "MetricName": "l2_cache_hits_from_l1_dc_miss_pti",
+    "BriefDescription": "L2 cache hits from L1 data cache misses per thousand instructions.",
+    "MetricExpr": "l2_cache_req_stat.dc_hit_in_l2 / instructions",
+    "MetricGroup": "l2_cache",
+    "ScaleUnit": "1e3per_1k_instr"
+  },
+  {
+    "MetricName": "l2_cache_hits_from_l2_hwpf_pti",
+    "BriefDescription": "L2 cache hits from L2 cache hardware prefetcher per thousand instructions.",
+    "MetricExpr": "l2_pf_hit_l2.l1_dc_l2_hwpf / instructions",
+    "MetricGroup": "l2_cache",
+    "ScaleUnit": "1e3per_1k_instr"
+  },
+  {
+    "MetricName": "l3_cache_accesses",
+    "BriefDescription": "L3 cache accesses.",
+    "MetricExpr": "l3_lookup_state.all_coherent_accesses_to_l3",
+    "MetricGroup": "l3_cache"
+  },
+  {
+    "MetricName": "l3_misses",
+    "BriefDescription": "L3 misses (including cacheline state change requests).",
+    "MetricExpr": "l3_lookup_state.l3_miss",
+    "MetricGroup": "l3_cache"
+  },
+  {
+    "MetricName": "l3_read_miss_latency",
+    "BriefDescription": "Average L3 read miss latency (in core clocks).",
+    "MetricExpr": "(l3_xi_sampled_latency.all * 10) / l3_xi_sampled_latency_requests.all",
+    "MetricGroup": "l3_cache",
+    "ScaleUnit": "1ns"
+  },
+  {
+    "MetricName": "l3_read_miss_latency_for_local_dram",
+    "BriefDescription": "Average L3 read miss latency (in core clocks) for local DRAM.",
+    "MetricExpr": "(l3_xi_sampled_latency.dram_near * 10) / l3_xi_sampled_latency_requests.dram_near",
+    "MetricGroup": "l3_cache",
+    "ScaleUnit": "1ns"
+  },
+  {
+    "MetricName": "l3_read_miss_latency_for_remote_dram",
+    "BriefDescription": "Average L3 read miss latency (in core clocks) for remote DRAM.",
+    "MetricExpr": "(l3_xi_sampled_latency.dram_far * 10) / l3_xi_sampled_latency_requests.dram_far",
+    "MetricGroup": "l3_cache",
+    "ScaleUnit": "1ns"
+  },
+  {
+    "MetricName": "op_cache_fetch_miss_ratio",
+    "BriefDescription": "Op cache miss ratio for all fetches.",
+    "MetricExpr": "d_ratio(op_cache_hit_miss.miss, op_cache_hit_miss.all)",
+    "ScaleUnit": "100%"
+  },
+  {
+    "MetricName": "l1_data_cache_fills_from_memory_pti",
+    "BriefDescription": "L1 data cache fills from DRAM or MMIO in any NUMA node per thousand instructions.",
+    "MetricExpr": "ls_any_fills_from_sys.dram_io_all / instructions",
+    "MetricGroup": "l1_dcache",
+    "ScaleUnit": "1e3per_1k_instr"
+  },
+  {
+    "MetricName": "l1_data_cache_fills_from_remote_node_pti",
+    "BriefDescription": "L1 data cache fills from a different NUMA node per thousand instructions.",
+    "MetricExpr": "ls_any_fills_from_sys.far_all / instructions",
+    "MetricGroup": "l1_dcache",
+    "ScaleUnit": "1e3per_1k_instr"
+  },
+  {
+    "MetricName": "l1_data_cache_fills_from_same_ccx_pti",
+    "BriefDescription": "L1 data cache fills from within the same CCX per thousand instructions.",
+    "MetricExpr": "ls_any_fills_from_sys.local_all / instructions",
+    "MetricGroup": "l1_dcache",
+    "ScaleUnit": "1e3per_1k_instr"
+  },
+  {
+    "MetricName": "l1_data_cache_fills_from_different_ccx_pti",
+    "BriefDescription": "L1 data cache fills from another CCX cache in any NUMA node per thousand instructions.",
+    "MetricExpr": "ls_any_fills_from_sys.remote_cache / instructions",
+    "MetricGroup": "l1_dcache",
+    "ScaleUnit": "1e3per_1k_instr"
+  },
+  {
+    "MetricName": "all_l1_data_cache_fills_pti",
+    "BriefDescription": "All L1 data cache fills per thousand instructions.",
+    "MetricExpr": "ls_any_fills_from_sys.all / instructions",
+    "MetricGroup": "l1_dcache",
+    "ScaleUnit": "1e3per_1k_instr"
+  },
+  {
+    "MetricName": "l1_demand_data_cache_fills_from_local_l2_pti",
+    "BriefDescription": "L1 demand data cache fills from local L2 cache per thousand instructions.",
+    "MetricExpr": "ls_dmnd_fills_from_sys.local_l2 / instructions",
+    "MetricGroup": "l1_dcache",
+    "ScaleUnit": "1e3per_1k_instr"
+  },
+  {
+    "MetricName": "l1_demand_data_cache_fills_from_same_ccx_pti",
+    "BriefDescription": "L1 demand data cache fills from within the same CCX per thousand instructions.",
+    "MetricExpr": "ls_dmnd_fills_from_sys.local_ccx / instructions",
+    "MetricGroup": "l1_dcache",
+    "ScaleUnit": "1e3per_1k_instr"
+  },
+  {
+    "MetricName": "l1_demand_data_cache_fills_from_near_cache_pti",
+    "BriefDescription": "L1 demand data cache fills from another CCX cache in the same NUMA node per thousand instructions.",
+    "MetricExpr": "ls_dmnd_fills_from_sys.near_cache / instructions",
+    "MetricGroup": "l1_dcache",
+    "ScaleUnit": "1e3per_1k_instr"
+  },
+  {
+    "MetricName": "l1_demand_data_cache_fills_from_near_memory_pti",
+    "BriefDescription": "L1 demand data cache fills from DRAM or MMIO in the same NUMA node per thousand instructions.",
+    "MetricExpr": "ls_dmnd_fills_from_sys.dram_io_near / instructions",
+    "MetricGroup": "l1_dcache",
+    "ScaleUnit": "1e3per_1k_instr"
+  },
+  {
+    "MetricName": "l1_demand_data_cache_fills_from_far_cache_pti",
+    "BriefDescription": "L1 demand data cache fills from another CCX cache in a different NUMA node per thousand instructions.",
+    "MetricExpr": "ls_dmnd_fills_from_sys.far_cache / instructions",
+    "MetricGroup": "l1_dcache",
+    "ScaleUnit": "1e3per_1k_instr"
+  },
+  {
+    "MetricName": "l1_demand_data_cache_fills_from_far_memory_pti",
+    "BriefDescription": "L1 demand data cache fills from DRAM or MMIO in a different NUMA node per thousand instructions.",
+    "MetricExpr": "ls_dmnd_fills_from_sys.dram_io_far / instructions",
+    "MetricGroup": "l1_dcache",
+    "ScaleUnit": "1e3per_1k_instr"
+  },
+  {
+    "MetricName": "l1_itlb_misses_pti",
+    "BriefDescription": "L1 instruction TLB misses per thousand instructions.",
+    "MetricExpr": "(bp_l1_tlb_miss_l2_tlb_hit + bp_l1_tlb_miss_l2_tlb_miss.all) / instructions",
+    "MetricGroup": "tlb",
+    "ScaleUnit": "1e3per_1k_instr"
+  },
+  {
+    "MetricName": "l2_itlb_misses_pti",
+    "BriefDescription": "L2 instruction TLB misses and instruction page walks per thousand instructions.",
+    "MetricExpr": "bp_l1_tlb_miss_l2_tlb_miss.all / instructions",
+    "MetricGroup": "tlb",
+    "ScaleUnit": "1e3per_1k_instr"
+  },
+  {
+    "MetricName": "l1_dtlb_misses_pti",
+    "BriefDescription": "L1 data TLB misses per thousand instructions.",
+    "MetricExpr": "ls_l1_d_tlb_miss.all / instructions",
+    "MetricGroup": "tlb",
+    "ScaleUnit": "1e3per_1k_instr"
+  },
+  {
+    "MetricName": "l2_dtlb_misses_pti",
+    "BriefDescription": "L2 data TLB misses and data page walks per thousand instructions.",
+    "MetricExpr": "ls_l1_d_tlb_miss.l2_miss_all / instructions",
+    "MetricGroup": "tlb",
+    "ScaleUnit": "1e3per_1k_instr"
+  },
+  {
+    "MetricName": "all_tlbs_flushed_pti",
+    "BriefDescription": "All TLBs flushed per thousand instructions.",
+    "MetricExpr": "ls_tlb_flush.all / instructions",
+    "MetricGroup": "tlb",
+    "ScaleUnit": "1e3per_1k_instr"
+  },
+  {
+    "MetricName": "macro_ops_dispatched",
+    "BriefDescription": "Macro-ops dispatched.",
+    "MetricExpr": "de_src_op_disp.all",
+    "MetricGroup": "decoder"
+  },
+  {
+    "MetricName": "sse_avx_stalls",
+    "BriefDescription": "Mixed SSE/AVX stalls.",
+    "MetricExpr": "fp_disp_faults.sse_avx_all"
+  },
+  {
+    "MetricName": "macro_ops_retired",
+    "BriefDescription": "Macro-ops retired.",
+    "MetricExpr": "ex_ret_ops"
+  },
+  {
+    "MetricName": "umc_data_bus_utilization",
+    "BriefDescription": "Memory controller data bus utilization.",
+    "MetricExpr": "d_ratio(umc_data_slot_clks.all / 2, umc_mem_clk)",
+    "MetricGroup": "memory_controller",
+    "PerPkg": "1",
+    "ScaleUnit": "100%"
+  },
+  {
+    "MetricName": "umc_cas_cmd_rate",
+    "BriefDescription": "Memory controller CAS command rate.",
+    "MetricExpr": "d_ratio(umc_cas_cmd.all * 1000, umc_mem_clk)",
+    "MetricGroup": "memory_controller",
+    "PerPkg": "1",
+    "ScaleUnit": "1per_memclk"
+  },
+  {
+    "MetricName": "umc_cas_cmd_read_ratio",
+    "BriefDescription": "Ratio of memory controller CAS commands for reads.",
+    "MetricExpr": "d_ratio(umc_cas_cmd.rd, umc_cas_cmd.all)",
+    "MetricGroup": "memory_controller",
+    "PerPkg": "1",
+    "ScaleUnit": "100%"
+  },
+  {
+    "MetricName": "umc_cas_cmd_write_ratio",
+    "BriefDescription": "Ratio of memory controller CAS commands for writes.",
+    "MetricExpr": "d_ratio(umc_cas_cmd.wr, umc_cas_cmd.all)",
+    "MetricGroup": "memory_controller",
+    "PerPkg": "1",
+    "ScaleUnit": "100%"
+  },
+  {
+    "MetricName": "umc_mem_read_bandwidth",
+    "BriefDescription": "Estimated memory read bandwidth.",
+    "MetricExpr": "(umc_cas_cmd.rd * 64) / 1e6 / duration_time",
+    "MetricGroup": "memory_controller",
+    "PerPkg": "1",
+    "ScaleUnit": "1MB/s"
+  },
+  {
+    "MetricName": "umc_mem_write_bandwidth",
+    "BriefDescription": "Estimated memory write bandwidth.",
+    "MetricExpr": "(umc_cas_cmd.wr * 64) / 1e6 / duration_time",
+    "MetricGroup": "memory_controller",
+    "PerPkg": "1",
+    "ScaleUnit": "1MB/s"
+  },
+  {
+    "MetricName": "umc_mem_bandwidth",
+    "BriefDescription": "Estimated combined memory bandwidth.",
+    "MetricExpr": "(umc_cas_cmd.all * 64) / 1e6 / duration_time",
+    "MetricGroup": "memory_controller",
+    "PerPkg": "1",
+    "ScaleUnit": "1MB/s"
+  },
+  {
+    "MetricName": "umc_activate_cmd_rate",
+    "BriefDescription": "Memory controller ACTIVATE command rate.",
+    "MetricExpr": "d_ratio(umc_act_cmd.all * 1000, umc_mem_clk)",
+    "MetricGroup": "memory_controller",
+    "PerPkg": "1",
+    "ScaleUnit": "1per_memclk"
+  },
+  {
+    "MetricName": "umc_precharge_cmd_rate",
+    "BriefDescription": "Memory controller PRECHARGE command rate.",
+    "MetricExpr": "d_ratio(umc_pchg_cmd.all * 1000, umc_mem_clk)",
+    "MetricGroup": "memory_controller",
+    "PerPkg": "1",
+    "ScaleUnit": "1per_memclk"
+  }
+]
-- 
cgit v1.2.3


From 47d3545faeeb6822f404ddb237985e1824a8bd70 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Thu, 4 Dec 2025 13:11:43 -0800
Subject: perf help: Move common_cmds into builtin-help

There's a lot of infrastructure for generating a relatively simple
array used by one function.

Move the array into the function and remove the supporting build logic.

At the same time opportunistically const-ify the array.

Signed-off-by: Ian Rogers <irogers@google.com>
Acked-by: Namhyung Kim <namhyung@kernel.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Charlie Jenkins <charlie@rivosinc.com>
Cc: Howard Chu <howardchu95@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/Makefile.perf            | 11 ++----
 tools/perf/builtin-help.c           | 51 ++++++++++++++++++++++++---
 tools/perf/command-list.txt         | 36 -------------------
 tools/perf/util/Build               | 14 --------
 tools/perf/util/generate-cmdlist.sh | 70 -------------------------------------
 5 files changed, 49 insertions(+), 133 deletions(-)
 delete mode 100644 tools/perf/command-list.txt
 delete mode 100755 tools/perf/util/generate-cmdlist.sh

(limited to 'tools')

diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf
index e6895626c187..45d5a59a02cb 100644
--- a/tools/perf/Makefile.perf
+++ b/tools/perf/Makefile.perf
@@ -808,11 +808,6 @@ $(GTK_IN): FORCE prepare
 $(OUTPUT)libperf-gtk.so: $(GTK_IN) $(PERFLIBS)
 	$(QUIET_LINK)$(CC) -o $@ -shared $(LDFLAGS) $(filter %.o,$^) $(GTK_LIBS)
 
-$(OUTPUT)common-cmds.h: util/generate-cmdlist.sh command-list.txt
-
-$(OUTPUT)common-cmds.h: $(wildcard Documentation/perf-*.txt)
-	$(QUIET_GEN). util/generate-cmdlist.sh > $@+ && mv $@+ $@
-
 $(SCRIPTS) : % : %.sh
 	$(QUIET_GEN)$(INSTALL) '$@.sh' '$(OUTPUT)$@'
 
@@ -850,7 +845,7 @@ endif
 __build-dir = $(subst $(OUTPUT),,$(dir $@))
 build-dir   = $(or $(__build-dir),.)
 
-prepare: $(OUTPUT)PERF-VERSION-FILE $(OUTPUT)common-cmds.h archheaders \
+prepare: $(OUTPUT)PERF-VERSION-FILE archheaders \
 	arm64-sysreg-defs \
 	$(syscall_array) \
 	$(fs_at_flags_array) \
@@ -1054,7 +1049,7 @@ cscope:
 # However, the environment gets quite big, and some programs have problems
 # with that.
 
-check: $(OUTPUT)common-cmds.h
+check: prepare
 	if sparse; \
 	then \
 		for i in *.c */*.c; \
@@ -1297,7 +1292,7 @@ clean:: $(LIBAPI)-clean $(LIBBPF)-clean $(LIBSUBCMD)-clean $(LIBSYMBOL)-clean $(
 	$(call QUIET_CLEAN, core-progs) $(RM) $(ALL_PROGRAMS) perf perf-read-vdso32 \
 		perf-read-vdsox32 $(OUTPUT)$(LIBJVMTI).so
 	$(call QUIET_CLEAN, core-gen)   $(RM)  *.spec *.pyc *.pyo */*.pyc */*.pyo \
-		$(OUTPUT)common-cmds.h TAGS tags cscope* $(OUTPUT)PERF-VERSION-FILE \
+		TAGS tags cscope* $(OUTPUT)PERF-VERSION-FILE \
 		$(OUTPUT)FEATURE-DUMP $(OUTPUT)util/*-bison* $(OUTPUT)util/*-flex* \
 		$(OUTPUT)util/intel-pt-decoder/inat-tables.c \
 		$(OUTPUT)tests/llvm-src-{base,kbuild,prologue,relocation}.c \
diff --git a/tools/perf/builtin-help.c b/tools/perf/builtin-help.c
index 7be6fb6df595..2692b2e40a23 100644
--- a/tools/perf/builtin-help.c
+++ b/tools/perf/builtin-help.c
@@ -9,7 +9,6 @@
 #include "util/strbuf.h"
 #include "builtin.h"
 #include <subcmd/exec-cmd.h>
-#include "common-cmds.h"
 #include <subcmd/parse-options.h>
 #include <subcmd/run-command.h>
 #include <subcmd/help.h>
@@ -301,16 +300,58 @@ static struct cmdnames main_cmds, other_cmds;
 
 void list_common_cmds_help(void)
 {
-	unsigned int i, longest = 0;
+	const struct cmdname_help {
+		const char *name;
+		const char *help;
+	} common_cmds[] = {
+		{"annotate", "Read perf.data (created by perf record) and display annotated code"},
+		{"archive",
+		 "Create archive with object files with build-ids found in perf.data file"},
+		{"bench", "General framework for benchmark suites"},
+		{"buildid-cache", "Manage build-id cache."},
+		{"buildid-list", "List the buildids in a perf.data file"},
+		{"c2c", "Shared Data C2C/HITM Analyzer."},
+		{"config", "Get and set variables in a configuration file."},
+		{"daemon", "Run record sessions on background"},
+		{"data", "Data file related processing"},
+		{"diff", "Read perf.data files and display the differential profile"},
+		{"evlist", "List the event names in a perf.data file"},
+		{"ftrace", "simple wrapper for kernel's ftrace functionality"},
+		{"inject", "Filter to augment the events stream with additional information"},
+		{"iostat", "Show I/O performance metrics"},
+		{"kallsyms", "Searches running kernel for symbols"},
+		{"kvm", "Tool to trace/measure kvm guest os"},
+		{"list", "List all symbolic event types"},
+		{"mem", "Profile memory accesses"},
+		{"record", "Run a command and record its profile into perf.data"},
+		{"report", "Read perf.data (created by perf record) and display the profile"},
+		{"script", "Read perf.data (created by perf record) and display trace output"},
+		{"stat", "Run a command and gather performance counter statistics"},
+		{"test", "Runs sanity tests."},
+		{"top", "System profiling tool."},
+		{"version", "display the version of perf binary"},
+	#ifdef HAVE_LIBELF_SUPPORT
+		{"probe", "Define new dynamic tracepoints"},
+	#endif /* HAVE_LIBELF_SUPPORT */
+	#ifdef HAVE_LIBTRACEEVENT
+		{"trace", "strace inspired tool"},
+		{"kmem", "Tool to trace/measure kernel memory properties"},
+		{"kwork", "Tool to trace/measure kernel work properties (latencies)"},
+		{"lock", "Analyze lock events"},
+		{"sched", "Tool to trace/measure scheduler properties (latencies)"},
+		{"timechart", "Tool to visualize total system behavior during a workload"},
+	#endif /* HAVE_LIBTRACEEVENT */
+	};
+	size_t longest = 0;
 
-	for (i = 0; i < ARRAY_SIZE(common_cmds); i++) {
+	for (size_t i = 0; i < ARRAY_SIZE(common_cmds); i++) {
 		if (longest < strlen(common_cmds[i].name))
 			longest = strlen(common_cmds[i].name);
 	}
 
 	puts(" The most commonly used perf commands are:");
-	for (i = 0; i < ARRAY_SIZE(common_cmds); i++) {
-		printf("   %-*s   ", longest, common_cmds[i].name);
+	for (size_t i = 0; i < ARRAY_SIZE(common_cmds); i++) {
+		printf("   %-*s   ", (int)longest, common_cmds[i].name);
 		puts(common_cmds[i].help);
 	}
 }
diff --git a/tools/perf/command-list.txt b/tools/perf/command-list.txt
deleted file mode 100644
index e8d2762adade..000000000000
--- a/tools/perf/command-list.txt
+++ /dev/null
@@ -1,36 +0,0 @@
-#
-# List of known perf commands.
-# command name			category [deprecated] [common]
-#
-perf-annotate			mainporcelain common
-perf-archive			mainporcelain common
-perf-bench			mainporcelain common
-perf-buildid-cache		mainporcelain common
-perf-buildid-list		mainporcelain common
-perf-data			mainporcelain common
-perf-diff			mainporcelain common
-perf-c2c			mainporcelain common
-perf-config			mainporcelain common
-perf-evlist			mainporcelain common
-perf-ftrace			mainporcelain common
-perf-inject			mainporcelain common
-perf-iostat			mainporcelain common
-perf-kallsyms			mainporcelain common
-perf-kmem			mainporcelain traceevent
-perf-kvm			mainporcelain common
-perf-kwork			mainporcelain traceevent
-perf-list			mainporcelain common
-perf-lock			mainporcelain traceevent
-perf-mem			mainporcelain common
-perf-probe			mainporcelain full
-perf-record			mainporcelain common
-perf-report			mainporcelain common
-perf-sched			mainporcelain traceevent
-perf-script			mainporcelain common
-perf-stat			mainporcelain common
-perf-test			mainporcelain common
-perf-timechart			mainporcelain traceevent
-perf-top			mainporcelain common
-perf-trace			mainporcelain audit
-perf-version			mainporcelain common
-perf-daemon			mainporcelain common
diff --git a/tools/perf/util/Build b/tools/perf/util/Build
index 248ad3ac64da..4915f237ba9e 100644
--- a/tools/perf/util/Build
+++ b/tools/perf/util/Build
@@ -419,20 +419,6 @@ $(OUTPUT)util/list_sort.o: ../lib/list_sort.c FORCE
 	$(call rule_mkdir)
 	$(call if_changed_dep,cc_o_c)
 
-ifdef SHELLCHECK
-  SHELL_TESTS := generate-cmdlist.sh
-  SHELL_TEST_LOGS := $(SHELL_TESTS:%=%.shellcheck_log)
-else
-  SHELL_TESTS :=
-  SHELL_TEST_LOGS :=
-endif
-
-$(OUTPUT)%.shellcheck_log: %
-	$(call rule_mkdir)
-	$(Q)$(call echo-cmd,test)$(SHELLCHECK) "$<" > $@ || (cat $@ && rm $@ && false)
-
-perf-util-y += $(SHELL_TEST_LOGS)
-
 PY_TESTS := setup.py
 ifdef MYPY
   MYPY_TEST_LOGS := $(PY_TESTS:%=%.mypy_log)
diff --git a/tools/perf/util/generate-cmdlist.sh b/tools/perf/util/generate-cmdlist.sh
deleted file mode 100755
index 6a73c903d690..000000000000
--- a/tools/perf/util/generate-cmdlist.sh
+++ /dev/null
@@ -1,70 +0,0 @@
-#!/bin/sh
-# SPDX-License-Identifier: GPL-2.0
-
-echo "/* Automatically generated by $0 */
-struct cmdname_help
-{
-    char name[16];
-    char help[80];
-};
-
-static struct cmdname_help common_cmds[] = {"
-
-sed -n -e 's/^perf-\([^ 	]*\)[ 	].* common.*/\1/p' command-list.txt |
-sort |
-while read cmd
-do
-     sed -n '
-     /^NAME/,/perf-'"$cmd"'/H
-     ${
-            x
-            s/.*perf-'"$cmd"' - \(.*\)/  {"'"$cmd"'", "\1"},/
-	    p
-     }' "Documentation/perf-$cmd.txt"
-done
-
-echo "#ifdef HAVE_LIBELF_SUPPORT"
-sed -n -e 's/^perf-\([^ 	]*\)[ 	].* full.*/\1/p' command-list.txt |
-sort |
-while read cmd
-do
-     sed -n '
-     /^NAME/,/perf-'"$cmd"'/H
-     ${
-            x
-            s/.*perf-'"$cmd"' - \(.*\)/  {"'"$cmd"'", "\1"},/
-	    p
-     }' "Documentation/perf-$cmd.txt"
-done
-echo "#endif /* HAVE_LIBELF_SUPPORT */"
-
-echo "#if defined(HAVE_LIBTRACEEVENT)"
-sed -n -e 's/^perf-\([^ 	]*\)[ 	].* audit*/\1/p' command-list.txt |
-sort |
-while read cmd
-do
-     sed -n '
-     /^NAME/,/perf-'"$cmd"'/H
-     ${
-            x
-            s/.*perf-'"$cmd"' - \(.*\)/  {"'"$cmd"'", "\1"},/
-	    p
-     }' "Documentation/perf-$cmd.txt"
-done
-echo "#endif /* HAVE_LIBTRACEEVENT */"
-
-echo "#ifdef HAVE_LIBTRACEEVENT"
-sed -n -e 's/^perf-\([^ 	]*\)[ 	].* traceevent.*/\1/p' command-list.txt |
-sort |
-while read cmd
-do
-     sed -n '
-     /^NAME/,/perf-'"$cmd"'/H
-     ${
-            x
-            s/.*perf-'"$cmd"' - \(.*\)/  {"'"$cmd"'", "\1"},/
-            p
-     }' "Documentation/perf-$cmd.txt"
-done
-echo "#endif /* HAVE_LIBTRACEEVENT */"
-echo "};"
-- 
cgit v1.2.3


From bac74dcbd48b5b441e47841fd0fe507c7b0bcbaf Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Wed, 19 Nov 2025 15:36:21 -0800
Subject: perf tools: Switch printf("...%s", strerror(errno)) to
 printf("...%m")

strerror() has thread safety issues, strerror_r() requires stack
allocated buffers.

Code in perf has already been using the "%m" formatting flag that is a
widely support glibc extension to print the current errno's description.

Expand the usage of this formatting flag and remove usage of
strerror()/strerror_r().

Signed-off-by: Ian Rogers <irogers@google.com>
Acked-by: Namhyung Kim <namhyung@kernel.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Alexandre Ghiti <alexghiti@rivosinc.com>
Cc: Blake Jones <blakejones@google.com>
Cc: Chun-Tse Shao <ctshao@google.com>
Cc: Dmitriy Vyukov <dvyukov@google.com>
Cc: Dr. David Alan Gilbert <linux@treblig.org>
Cc: Haibo Xu <haibo1.xu@intel.com>
Cc: Howard Chu <howardchu95@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Leo Yan <leo.yan@arm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephen Brennan <stephen.s.brennan@oracle.com>
Cc: Thomas Falcon <thomas.falcon@intel.com>
Cc: Yunseong Kim <ysk@kzalloc.com>
Cc: Zhongqiu Han <quic_zhonhan@quicinc.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/arch/x86/tests/bp-modify.c | 30 ++++++++++++------------------
 tools/perf/bench/uprobe.c             |  2 +-
 tools/perf/builtin-daemon.c           |  8 +++-----
 tools/perf/builtin-probe.c            |  3 +--
 tools/perf/builtin-record.c           | 24 ++++++++++--------------
 tools/perf/builtin-stat.c             |  9 ++++-----
 tools/perf/builtin-trace.c            | 15 ++++++---------
 tools/perf/perf.c                     | 18 +++++++-----------
 tools/perf/util/bpf-event.c           | 11 ++++-------
 tools/perf/util/bpf-utils.c           |  4 ++--
 tools/perf/util/bpf_lock_contention.c |  2 +-
 tools/perf/util/cap.c                 |  3 +--
 tools/perf/util/data.c                | 29 ++++++++++-------------------
 tools/perf/util/dso.c                 | 19 ++++++-------------
 tools/perf/util/evlist.c              | 31 +++++++++++++++++--------------
 tools/perf/util/evsel.c               | 17 +++++++++--------
 tools/perf/util/jitdump.c             |  3 ++-
 tools/perf/util/lzma.c                |  6 +++---
 tools/perf/util/session.c             |  5 +++--
 tools/perf/util/symbol-elf.c          |  4 ++--
 20 files changed, 104 insertions(+), 139 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/arch/x86/tests/bp-modify.c b/tools/perf/arch/x86/tests/bp-modify.c
index 0924ccd9e36d..589b43273948 100644
--- a/tools/perf/arch/x86/tests/bp-modify.c
+++ b/tools/perf/arch/x86/tests/bp-modify.c
@@ -80,26 +80,24 @@ static int bp_modify1(void)
 	 */
 	if (ptrace(PTRACE_POKEUSER, child,
 		   offsetof(struct user, u_debugreg[0]), bp_2)) {
-		pr_debug("failed to set breakpoint, 1st time: %s\n",
-			 strerror(errno));
+		pr_debug("failed to set breakpoint, 1st time: %m\n");
 		goto out;
 	}
 
 	if (ptrace(PTRACE_POKEUSER, child,
 		   offsetof(struct user, u_debugreg[0]), bp_1)) {
-		pr_debug("failed to set breakpoint, 2nd time: %s\n",
-			 strerror(errno));
+		pr_debug("failed to set breakpoint, 2nd time: %m\n");
 		goto out;
 	}
 
 	if (ptrace(PTRACE_POKEUSER, child,
 		   offsetof(struct user, u_debugreg[7]), dr7)) {
-		pr_debug("failed to set dr7: %s\n", strerror(errno));
+		pr_debug("failed to set dr7: %m\n");
 		goto out;
 	}
 
 	if (ptrace(PTRACE_CONT, child, NULL, NULL)) {
-		pr_debug("failed to PTRACE_CONT: %s\n", strerror(errno));
+		pr_debug("failed to PTRACE_CONT: %m\n");
 		goto out;
 	}
 
@@ -112,19 +110,17 @@ static int bp_modify1(void)
 	rip = ptrace(PTRACE_PEEKUSER, child,
 		     offsetof(struct user_regs_struct, rip), NULL);
 	if (rip == (unsigned long) -1) {
-		pr_debug("failed to PTRACE_PEEKUSER: %s\n",
-			 strerror(errno));
+		pr_debug("failed to PTRACE_PEEKUSER: %m\n");
 		goto out;
 	}
 
 	pr_debug("rip %lx, bp_1 %p\n", rip, bp_1);
-
 out:
 	if (ptrace(PTRACE_DETACH, child, NULL, NULL)) {
-		pr_debug("failed to PTRACE_DETACH: %s", strerror(errno));
+		pr_debug("failed to PTRACE_DETACH: %m\n");
 		return TEST_FAIL;
-	}
 
+	}
 	return rip == (unsigned long) bp_1 ? TEST_OK : TEST_FAIL;
 }
 
@@ -157,14 +153,13 @@ static int bp_modify2(void)
 	 */
 	if (ptrace(PTRACE_POKEUSER, child,
 		   offsetof(struct user, u_debugreg[0]), bp_1)) {
-		pr_debug("failed to set breakpoint: %s\n",
-			 strerror(errno));
+		pr_debug("failed to set breakpoint: %m\n");
 		goto out;
 	}
 
 	if (ptrace(PTRACE_POKEUSER, child,
 		   offsetof(struct user, u_debugreg[7]), dr7)) {
-		pr_debug("failed to set dr7: %s\n", strerror(errno));
+		pr_debug("failed to set dr7: %m\n");
 		goto out;
 	}
 
@@ -175,7 +170,7 @@ static int bp_modify2(void)
 	}
 
 	if (ptrace(PTRACE_CONT, child, NULL, NULL)) {
-		pr_debug("failed to PTRACE_CONT: %s\n", strerror(errno));
+		pr_debug("failed to PTRACE_CONT: %m\n");
 		goto out;
 	}
 
@@ -188,8 +183,7 @@ static int bp_modify2(void)
 	rip = ptrace(PTRACE_PEEKUSER, child,
 		     offsetof(struct user_regs_struct, rip), NULL);
 	if (rip == (unsigned long) -1) {
-		pr_debug("failed to PTRACE_PEEKUSER: %s\n",
-			 strerror(errno));
+		pr_debug("failed to PTRACE_PEEKUSER: %m\n");
 		goto out;
 	}
 
@@ -197,7 +191,7 @@ static int bp_modify2(void)
 
 out:
 	if (ptrace(PTRACE_DETACH, child, NULL, NULL)) {
-		pr_debug("failed to PTRACE_DETACH: %s", strerror(errno));
+		pr_debug("failed to PTRACE_DETACH: %m\n");
 		return TEST_FAIL;
 	}
 
diff --git a/tools/perf/bench/uprobe.c b/tools/perf/bench/uprobe.c
index 0b90275862e1..c4dac868f1ee 100644
--- a/tools/perf/bench/uprobe.c
+++ b/tools/perf/bench/uprobe.c
@@ -54,7 +54,7 @@ static const char * const bench_uprobe_usage[] = {
 							   /*opts=*/&uprobe_opts); \
 	if (!skel->links.prog) { \
 		err = -errno; \
-		fprintf(stderr, "Failed to attach bench uprobe \"%s\": %s\n", #prog, strerror(errno)); \
+		fprintf(stderr, "Failed to attach bench uprobe \"%s\": %m\n", #prog); \
 		goto cleanup; \
 	}
 
diff --git a/tools/perf/builtin-daemon.c b/tools/perf/builtin-daemon.c
index f0568431fbd5..33473e071392 100644
--- a/tools/perf/builtin-daemon.c
+++ b/tools/perf/builtin-daemon.c
@@ -265,8 +265,7 @@ static int check_base(struct daemon *daemon)
 			       daemon->base);
 			return -EACCES;
 		default:
-			pr_err("failed: can't access base '%s': %s\n",
-			       daemon->base, strerror(errno));
+			pr_err("failed: can't access base '%s': %m\n", daemon->base);
 			return -errno;
 		}
 	}
@@ -544,8 +543,7 @@ static int daemon_session__control(struct daemon_session *session,
 
 	err = writen(control, msg, len);
 	if (err != len) {
-		pr_err("failed: write to control pipe: %d (%s)\n",
-		       errno, control_path);
+		pr_err("failed: write to control pipe: %m (%s)\n", control_path);
 		goto out;
 	}
 
@@ -586,7 +584,7 @@ static int setup_server_socket(struct daemon *daemon)
 	int fd = socket(AF_UNIX, SOCK_STREAM, 0);
 
 	if (fd < 0) {
-		fprintf(stderr, "socket: %s\n", strerror(errno));
+		fprintf(stderr, "socket: %m\n");
 		return -1;
 	}
 
diff --git a/tools/perf/builtin-probe.c b/tools/perf/builtin-probe.c
index 69800e4d9530..1b4ba85ee019 100644
--- a/tools/perf/builtin-probe.c
+++ b/tools/perf/builtin-probe.c
@@ -211,8 +211,7 @@ static int opt_set_target_ns(const struct option *opt __maybe_unused,
 		ns_pid = (pid_t)strtol(str, NULL, 10);
 		if (errno != 0) {
 			ret = -errno;
-			pr_warning("Failed to parse %s as a pid: %s\n", str,
-				   strerror(errno));
+			pr_warning("Failed to parse %s as a pid: %m\n", str);
 			return ret;
 		}
 		nsip = nsinfo__new(ns_pid);
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index cbfbd9bb1063..003e47a4fc1d 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -1286,7 +1286,6 @@ static int record__mmap_evlist(struct record *rec,
 	struct record_opts *opts = &rec->opts;
 	bool auxtrace_overwrite = opts->auxtrace_snapshot_mode ||
 				  opts->auxtrace_sample_mode;
-	char msg[512];
 
 	if (opts->affinity != PERF_AFFINITY_SYS)
 		cpu__setup_cpunode_map();
@@ -1305,8 +1304,7 @@ static int record__mmap_evlist(struct record *rec,
 			       opts->mmap_pages, opts->auxtrace_mmap_pages);
 			return -errno;
 		} else {
-			pr_err("failed to mmap with %d (%s)\n", errno,
-				str_error_r(errno, msg, sizeof(msg)));
+			pr_err("failed to mmap: %m\n");
 			if (errno)
 				return -errno;
 			else
@@ -1324,7 +1322,8 @@ static int record__mmap_evlist(struct record *rec,
 	if (record__threads_enabled(rec)) {
 		ret = perf_data__create_dir(&rec->data, evlist->core.nr_mmaps);
 		if (ret) {
-			pr_err("Failed to create data directory: %s\n", strerror(-ret));
+			errno = -ret;
+			pr_err("Failed to create data directory: %m\n");
 			return ret;
 		}
 		for (i = 0; i < evlist->core.nr_mmaps; i++) {
@@ -1461,9 +1460,8 @@ try_again:
 	}
 
 	if (evlist__apply_filters(evlist, &pos, &opts->target)) {
-		pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
-			pos->filter ?: "BPF", evsel__name(pos), errno,
-			str_error_r(errno, msg, sizeof(msg)));
+		pr_err("failed to set filter \"%s\" on event %s: %m\n",
+			pos->filter ?: "BPF", evsel__name(pos));
 		rc = -1;
 		goto out;
 	}
@@ -1748,8 +1746,7 @@ static void *record__thread(void *arg)
 
 	err = write(thread->pipes.ack[1], &msg, sizeof(msg));
 	if (err == -1)
-		pr_warning("threads[%d]: failed to notify on start: %s\n",
-			   thread->tid, strerror(errno));
+		pr_warning("threads[%d]: failed to notify on start: %m\n", thread->tid);
 
 	pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu());
 
@@ -1792,8 +1789,7 @@ static void *record__thread(void *arg)
 
 	err = write(thread->pipes.ack[1], &msg, sizeof(msg));
 	if (err == -1)
-		pr_warning("threads[%d]: failed to notify on termination: %s\n",
-			   thread->tid, strerror(errno));
+		pr_warning("threads[%d]: failed to notify on termination: %m\n", thread->tid);
 
 	return NULL;
 }
@@ -2338,7 +2334,7 @@ static int record__start_threads(struct record *rec)
 
 	sigfillset(&full);
 	if (sigprocmask(SIG_SETMASK, &full, &mask)) {
-		pr_err("Failed to block signals on threads start: %s\n", strerror(errno));
+		pr_err("Failed to block signals on threads start: %m\n");
 		return -1;
 	}
 
@@ -2356,7 +2352,7 @@ static int record__start_threads(struct record *rec)
 		if (pthread_create(&handle, &attrs, record__thread, &thread_data[t])) {
 			for (tt = 1; tt < t; tt++)
 				record__terminate_thread(&thread_data[t]);
-			pr_err("Failed to start threads: %s\n", strerror(errno));
+			pr_err("Failed to start threads: %m\n");
 			ret = -1;
 			goto out_err;
 		}
@@ -2379,7 +2375,7 @@ out_err:
 	pthread_attr_destroy(&attrs);
 
 	if (sigprocmask(SIG_SETMASK, &mask, NULL)) {
-		pr_err("Failed to unblock signals on threads start: %s\n", strerror(errno));
+		pr_err("Failed to unblock signals on threads start: %m\n");
 		ret = -1;
 	}
 
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index ab40d85fb125..2895b809607f 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -937,9 +937,8 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx)
 	}
 
 	if (evlist__apply_filters(evsel_list, &counter, &target)) {
-		pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
-			counter->filter, evsel__name(counter), errno,
-			str_error_r(errno, msg, sizeof(msg)));
+		pr_err("failed to set filter \"%s\" on event %s: %m\n",
+			counter->filter, evsel__name(counter));
 		return -1;
 	}
 
@@ -1001,8 +1000,8 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx)
 		}
 
 		if (workload_exec_errno) {
-			const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
-			pr_err("Workload failed: %s\n", emsg);
+			errno = workload_exec_errno;
+			pr_err("Workload failed: %m\n");
 			err = -1;
 			goto err_out;
 		}
diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
index d49c1ae409d7..58a32adafddf 100644
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -2616,12 +2616,10 @@ static struct syscall *trace__syscall_info(struct trace *trace, struct evsel *ev
 		err = syscall__read_info(sc, trace);
 
 	if (err && verbose > 0) {
-		char sbuf[STRERR_BUFSIZE];
-
-		fprintf(trace->output, "Problems reading syscall %d: %d (%s)", id, -err,
-			str_error_r(-err, sbuf, sizeof(sbuf)));
+		errno = -err;
+		fprintf(trace->output, "Problems reading syscall %d: %m", id);
 		if (sc && sc->name)
-			fprintf(trace->output, "(%s)", sc->name);
+			fprintf(trace->output, " (%s)", sc->name);
 		fputs(" information\n", trace->output);
 	}
 	return err ? NULL : sc;
@@ -4673,9 +4671,8 @@ out_error:
 
 out_error_apply_filters:
 	fprintf(trace->output,
-		"Failed to set filter \"%s\" on event %s with %d (%s)\n",
-		evsel->filter, evsel__name(evsel), errno,
-		str_error_r(errno, errbuf, sizeof(errbuf)));
+		"Failed to set filter \"%s\" on event %s: %m\n",
+		evsel->filter, evsel__name(evsel));
 	goto out_delete_evlist;
 }
 out_error_mem:
@@ -4683,7 +4680,7 @@ out_error_mem:
 	goto out_delete_evlist;
 
 out_errno:
-	fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
+	fprintf(trace->output, "%m\n");
 	goto out_delete_evlist;
 }
 
diff --git a/tools/perf/perf.c b/tools/perf/perf.c
index 88c60ecf3395..f475a8664ffc 100644
--- a/tools/perf/perf.c
+++ b/tools/perf/perf.c
@@ -169,8 +169,8 @@ static int set_debug_file(const char *path)
 {
 	debug_fp = fopen(path, "w");
 	if (!debug_fp) {
-		fprintf(stderr, "Open debug file '%s' failed: %s\n",
-			path, strerror(errno));
+		fprintf(stderr, "Open debug file '%s' failed: %m\n",
+			path);
 		return -1;
 	}
 
@@ -335,7 +335,6 @@ static int run_builtin(struct cmd_struct *p, int argc, const char **argv)
 {
 	int status;
 	struct stat st;
-	char sbuf[STRERR_BUFSIZE];
 
 	if (use_browser == -1)
 		use_browser = check_browser_config(p->cmd);
@@ -363,17 +362,15 @@ static int run_builtin(struct cmd_struct *p, int argc, const char **argv)
 	status = 1;
 	/* Check for ENOSPC and EIO errors.. */
 	if (fflush(stdout)) {
-		fprintf(stderr, "write failure on standard output: %s",
-			str_error_r(errno, sbuf, sizeof(sbuf)));
+		fprintf(stderr, "write failure on standard output: %m\n");
 		goto out;
 	}
 	if (ferror(stdout)) {
-		fprintf(stderr, "unknown write failure on standard output");
+		fprintf(stderr, "unknown write failure on standard output\n");
 		goto out;
 	}
 	if (fclose(stdout)) {
-		fprintf(stderr, "close failed on standard output: %s",
-			str_error_r(errno, sbuf, sizeof(sbuf)));
+		fprintf(stderr, "close failed on standard output: %m\n");
 		goto out;
 	}
 	status = 0;
@@ -459,7 +456,6 @@ int main(int argc, const char **argv)
 {
 	int err, done_help = 0;
 	const char *cmd;
-	char sbuf[STRERR_BUFSIZE];
 
 	perf_debug_setup();
 
@@ -573,8 +569,8 @@ int main(int argc, const char **argv)
 	}
 
 	if (cmd) {
-		fprintf(stderr, "Failed to run command '%s': %s\n",
-			cmd, str_error_r(errno, sbuf, sizeof(sbuf)));
+		fprintf(stderr, "Failed to run command '%s': %m\n",
+			cmd);
 	}
 out:
 	if (debug_fp)
diff --git a/tools/perf/util/bpf-event.c b/tools/perf/util/bpf-event.c
index 2298cd396c42..2e6da3ad0a4f 100644
--- a/tools/perf/util/bpf-event.c
+++ b/tools/perf/util/bpf-event.c
@@ -787,11 +787,10 @@ int perf_event__synthesize_bpf_events(struct perf_session *session,
 				err = 0;
 				break;
 			}
-			pr_debug("%s: can't get next program: %s%s\n",
-				 __func__, strerror(errno),
-				 errno == EINVAL ? " -- kernel too old?" : "");
 			/* don't report error on old kernel or EPERM  */
 			err = (errno == EINVAL || errno == EPERM) ? 0 : -1;
+			pr_debug("%s: can\'t get next program: %m%s\n",
+				__func__, errno == EINVAL ? " -- kernel too old?" : "");
 			break;
 		}
 		fd = bpf_prog_get_fd_by_id(id);
@@ -824,10 +823,8 @@ int perf_event__synthesize_bpf_events(struct perf_session *session,
 		.tool    = session->tool,
 	};
 
-	if (kallsyms__parse(kallsyms_filename, &arg, kallsyms_process_symbol)) {
-		pr_err("%s: failed to synthesize bpf images: %s\n",
-		       __func__, strerror(errno));
-	}
+	if (kallsyms__parse(kallsyms_filename, &arg, kallsyms_process_symbol))
+		pr_err("%s: failed to synthesize bpf images: %m\n", __func__);
 
 	free(event);
 	return err;
diff --git a/tools/perf/util/bpf-utils.c b/tools/perf/util/bpf-utils.c
index 5a66dc8594aa..d6d2c9c190f7 100644
--- a/tools/perf/util/bpf-utils.c
+++ b/tools/perf/util/bpf-utils.c
@@ -123,7 +123,7 @@ get_bpf_prog_info_linear(int fd, __u64 arrays)
 	/* step 1: get array dimensions */
 	err = bpf_obj_get_info_by_fd(fd, &info, &info_len);
 	if (err) {
-		pr_debug("can't get prog info: %s", strerror(errno));
+		pr_debug("can't get prog info: %m\n");
 		return ERR_PTR(-EFAULT);
 	}
 	if (info.type >= __MAX_BPF_PROG_TYPE)
@@ -186,7 +186,7 @@ get_bpf_prog_info_linear(int fd, __u64 arrays)
 	/* step 5: call syscall again to get required arrays */
 	err = bpf_obj_get_info_by_fd(fd, &info_linear->info, &info_len);
 	if (err) {
-		pr_debug("can't get prog info: %s", strerror(errno));
+		pr_debug("can't get prog info: %m\n");
 		free(info_linear);
 		return ERR_PTR(-EFAULT);
 	}
diff --git a/tools/perf/util/bpf_lock_contention.c b/tools/perf/util/bpf_lock_contention.c
index 7b5671f13c53..788d30be2058 100644
--- a/tools/perf/util/bpf_lock_contention.c
+++ b/tools/perf/util/bpf_lock_contention.c
@@ -42,7 +42,7 @@ static void check_slab_cache_iter(struct lock_contention *con)
 
 	con->btf = btf__load_vmlinux_btf();
 	if (con->btf == NULL) {
-		pr_debug("BTF loading failed: %s\n", strerror(errno));
+		pr_debug("BTF loading failed: %m\n");
 		return;
 	}
 
diff --git a/tools/perf/util/cap.c b/tools/perf/util/cap.c
index 24a0ea7e6d97..ac6d1d9a523d 100644
--- a/tools/perf/util/cap.c
+++ b/tools/perf/util/cap.c
@@ -28,8 +28,7 @@ bool perf_cap__capable(int cap, bool *used_root)
 		    header.version == _LINUX_CAPABILITY_VERSION_1)
 			continue;
 
-		pr_debug2("capget syscall failed (%s - %d) fall back on root check\n",
-			  strerror(errno), errno);
+		pr_debug2("capget syscall failed (%m) fall back on root check\n");
 		*used_root = true;
 		return geteuid() == 0;
 	}
diff --git a/tools/perf/util/data.c b/tools/perf/util/data.c
index 164eb45a0b36..90df41da1a32 100644
--- a/tools/perf/util/data.c
+++ b/tools/perf/util/data.c
@@ -213,17 +213,15 @@ static int check_backup(struct perf_data *data)
 
 		ret = rm_rf_perf_data(oldname);
 		if (ret) {
-			pr_err("Can't remove old data: %s (%s)\n",
-			       ret == -2 ?
-			       "Unknown file found" : strerror(errno),
-			       oldname);
+			if (ret == -2)
+				pr_err("Can't remove old data: Unknown file found (%s)\n", oldname);
+			else
+				pr_err("Can't remove old data: %m (%s)\n", oldname);
 			return -1;
 		}
 
 		if (rename(data->path, oldname)) {
-			pr_err("Can't move data: %s (%s to %s)\n",
-			       strerror(errno),
-			       data->path, oldname);
+			pr_err("Can't move data: %m (%s to %s)\n", data->path, oldname);
 			return -1;
 		}
 	}
@@ -246,14 +244,12 @@ static int open_file_read(struct perf_data *data)
 	int flags = data->in_place_update ? O_RDWR : O_RDONLY;
 	struct stat st;
 	int fd;
-	char sbuf[STRERR_BUFSIZE];
 
 	fd = open(data->file.path, flags);
 	if (fd < 0) {
 		int err = errno;
 
-		pr_err("failed to open %s: %s", data->file.path,
-			str_error_r(err, sbuf, sizeof(sbuf)));
+		pr_err("failed to open %s: %m", data->file.path);
 		if (err == ENOENT && !strcmp(data->file.path, "perf.data"))
 			pr_err("  (try 'perf record' first)");
 		pr_err("\n");
@@ -285,15 +281,10 @@ static int open_file_read(struct perf_data *data)
 
 static int open_file_write(struct perf_data *data)
 {
-	int fd;
-	char sbuf[STRERR_BUFSIZE];
-
-	fd = open(data->file.path, O_CREAT|O_RDWR|O_TRUNC|O_CLOEXEC,
-		  S_IRUSR|S_IWUSR);
+	int fd = open(data->file.path, O_CREAT|O_RDWR|O_TRUNC|O_CLOEXEC, S_IRUSR|S_IWUSR);
 
 	if (fd < 0)
-		pr_err("failed to open %s : %s\n", data->file.path,
-			str_error_r(errno, sbuf, sizeof(sbuf)));
+		pr_err("failed to open %s : %m\n", data->file.path);
 
 	return fd;
 }
@@ -436,8 +427,8 @@ int perf_data__switch(struct perf_data *data,
 
 		if (lseek(data->file.fd, pos, SEEK_SET) == (off_t)-1) {
 			ret = -errno;
-			pr_debug("Failed to lseek to %zu: %s",
-				 pos, strerror(errno));
+			pr_debug("Failed to lseek to %zu: %m\n",
+				 pos);
 			goto out;
 		}
 	}
diff --git a/tools/perf/util/dso.c b/tools/perf/util/dso.c
index 06980844c014..18e656712f5a 100644
--- a/tools/perf/util/dso.c
+++ b/tools/perf/util/dso.c
@@ -540,16 +540,13 @@ static void close_first_dso(void);
 
 static int do_open(char *name) EXCLUSIVE_LOCKS_REQUIRED(_dso__data_open_lock)
 {
-	int fd;
-	char sbuf[STRERR_BUFSIZE];
-
 	do {
-		fd = open(name, O_RDONLY|O_CLOEXEC);
+		int fd = open(name, O_RDONLY|O_CLOEXEC);
+
 		if (fd >= 0)
 			return fd;
 
-		pr_debug("dso open failed: %s\n",
-			 str_error_r(errno, sbuf, sizeof(sbuf)));
+		pr_debug("dso open failed: %m\n");
 		if (!dso__data_open_cnt || errno != EMFILE)
 			break;
 
@@ -1098,7 +1095,6 @@ static int file_size(struct dso *dso, struct machine *machine)
 {
 	int ret = 0;
 	struct stat st;
-	char sbuf[STRERR_BUFSIZE];
 
 	mutex_lock(dso__data_open_lock());
 
@@ -1116,8 +1112,7 @@ static int file_size(struct dso *dso, struct machine *machine)
 
 	if (fstat(dso__data(dso)->fd, &st) < 0) {
 		ret = -errno;
-		pr_err("dso cache fstat failed: %s\n",
-		       str_error_r(errno, sbuf, sizeof(sbuf)));
+		pr_err("dso cache fstat failed: %m\n");
 		dso__data(dso)->status = DSO_DATA_STATUS_ERROR;
 		goto out;
 	}
@@ -1773,10 +1768,8 @@ int dso__strerror_load(struct dso *dso, char *buf, size_t buflen)
 	BUG_ON(buflen == 0);
 
 	if (errnum >= 0) {
-		const char *err = str_error_r(errnum, buf, buflen);
-
-		if (err != buf)
-			scnprintf(buf, buflen, "%s", err);
+		errno = errnum;
+		scnprintf(buf, buflen, "%m");
 
 		return 0;
 	}
diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
index 649519628541..3b0d837e3046 100644
--- a/tools/perf/util/evlist.c
+++ b/tools/perf/util/evlist.c
@@ -1614,14 +1614,14 @@ int evlist__parse_sample_timestamp(struct evlist *evlist, union perf_event *even
 int evlist__strerror_open(struct evlist *evlist, int err, char *buf, size_t size)
 {
 	int printed, value;
-	char sbuf[STRERR_BUFSIZE], *emsg = str_error_r(err, sbuf, sizeof(sbuf));
 
 	switch (err) {
 	case EACCES:
 	case EPERM:
+		errno = err;
 		printed = scnprintf(buf, size,
-				    "Error:\t%s.\n"
-				    "Hint:\tCheck /proc/sys/kernel/perf_event_paranoid setting.", emsg);
+				    "Error:\t%m.\n"
+				    "Hint:\tCheck /proc/sys/kernel/perf_event_paranoid setting.");
 
 		value = perf_event_paranoid();
 
@@ -1648,16 +1648,18 @@ int evlist__strerror_open(struct evlist *evlist, int err, char *buf, size_t size
 		if (first->core.attr.sample_freq < (u64)max_freq)
 			goto out_default;
 
+		errno = err;
 		printed = scnprintf(buf, size,
-				    "Error:\t%s.\n"
+				    "Error:\t%m.\n"
 				    "Hint:\tCheck /proc/sys/kernel/perf_event_max_sample_rate.\n"
 				    "Hint:\tThe current value is %d and %" PRIu64 " is being requested.",
-				    emsg, max_freq, first->core.attr.sample_freq);
+				    max_freq, first->core.attr.sample_freq);
 		break;
 	}
 	default:
 out_default:
-		scnprintf(buf, size, "%s", emsg);
+		errno = err;
+		scnprintf(buf, size, "%m");
 		break;
 	}
 
@@ -1666,17 +1668,17 @@ out_default:
 
 int evlist__strerror_mmap(struct evlist *evlist, int err, char *buf, size_t size)
 {
-	char sbuf[STRERR_BUFSIZE], *emsg = str_error_r(err, sbuf, sizeof(sbuf));
 	int pages_attempted = evlist->core.mmap_len / 1024, pages_max_per_user, printed = 0;
 
 	switch (err) {
 	case EPERM:
 		sysctl__read_int("kernel/perf_event_mlock_kb", &pages_max_per_user);
+		errno = err;
 		printed += scnprintf(buf + printed, size - printed,
-				     "Error:\t%s.\n"
+				     "Error:\t%m.\n"
 				     "Hint:\tCheck /proc/sys/kernel/perf_event_mlock_kb (%d kB) setting.\n"
 				     "Hint:\tTried using %zd kB.\n",
-				     emsg, pages_max_per_user, pages_attempted);
+				     pages_max_per_user, pages_attempted);
 
 		if (pages_attempted >= pages_max_per_user) {
 			printed += scnprintf(buf + printed, size - printed,
@@ -1688,7 +1690,8 @@ int evlist__strerror_mmap(struct evlist *evlist, int err, char *buf, size_t size
 				     "Hint:\tTry using a smaller -m/--mmap-pages value.");
 		break;
 	default:
-		scnprintf(buf, size, "%s", emsg);
+		errno = err;
+		scnprintf(buf, size, "%m");
 		break;
 	}
 
@@ -1920,8 +1923,8 @@ static int evlist__parse_control_fifo(const char *str, int *ctl_fd, int *ctl_fd_
 	 */
 	fd = open(s, O_RDWR | O_NONBLOCK | O_CLOEXEC);
 	if (fd < 0) {
-		pr_err("Failed to open '%s'\n", s);
 		ret = -errno;
+		pr_err("Failed to open '%s': %m\n", s);
 		goto out_free;
 	}
 	*ctl_fd = fd;
@@ -1931,7 +1934,7 @@ static int evlist__parse_control_fifo(const char *str, int *ctl_fd, int *ctl_fd_
 		/* O_RDWR | O_NONBLOCK means the other end need not be open */
 		fd = open(p, O_RDWR | O_NONBLOCK | O_CLOEXEC);
 		if (fd < 0) {
-			pr_err("Failed to open '%s'\n", p);
+			pr_err("Failed to open '%s': %m\n", p);
 			ret = -errno;
 			goto out_free;
 		}
@@ -2364,7 +2367,7 @@ int evlist__parse_event_enable_time(struct evlist *evlist, struct record_opts *o
 	eet->timerfd = timerfd_create(CLOCK_MONOTONIC, TFD_CLOEXEC);
 	if (eet->timerfd == -1) {
 		err = -errno;
-		pr_err("timerfd_create failed: %s\n", strerror(errno));
+		pr_err("timerfd_create failed: %m\n");
 		goto free_eet_times;
 	}
 
@@ -2399,7 +2402,7 @@ static int event_enable_timer__set_timer(struct event_enable_timer *eet, int ms)
 
 	if (timerfd_settime(eet->timerfd, 0, &its, NULL) < 0) {
 		err = -errno;
-		pr_err("timerfd_settime failed: %s\n", strerror(errno));
+		pr_err("timerfd_settime failed: %m\n");
 	}
 	return err;
 }
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index ec6552a6f667..e2de642fbf53 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -648,8 +648,9 @@ struct tep_event *evsel__tp_format(struct evsel *evsel)
 	if (IS_ERR(tp_format)) {
 		int err = -PTR_ERR(evsel->tp_format);
 
-		pr_err("Error getting tracepoint format '%s' '%s'(%d)\n",
-			evsel__name(evsel), strerror(err), err);
+		errno = err;
+		pr_err("Error getting tracepoint format '%s': %m\n",
+			evsel__name(evsel));
 		return NULL;
 	}
 	evsel->tp_format = tp_format;
@@ -2772,8 +2773,8 @@ retry_open:
 					    PERF_EVENT_IOC_SET_BPF,
 					    bpf_fd);
 				if (err && errno != EEXIST) {
-					pr_err("failed to attach bpf fd %d: %s\n",
-					       bpf_fd, strerror(errno));
+					pr_err("failed to attach bpf fd %d: %m\n",
+					       bpf_fd);
 					err = -EINVAL;
 					goto out_close;
 				}
@@ -3864,7 +3865,6 @@ int evsel__open_strerror(struct evsel *evsel, struct target *target,
 			 int err, char *msg, size_t size)
 {
 	struct perf_pmu *pmu;
-	char sbuf[STRERR_BUFSIZE];
 	int printed = 0, enforced = 0;
 	int ret;
 
@@ -3997,10 +3997,11 @@ int evsel__open_strerror(struct evsel *evsel, struct target *target,
 	if (ret)
 		return ret;
 
+	errno = err;
 	return scnprintf(msg, size,
-	"The sys_perf_event_open() syscall returned with %d (%s) for event (%s).\n"
-	"\"dmesg | grep -i perf\" may provide additional information.\n",
-			 err, str_error_r(err, sbuf, sizeof(sbuf)), evsel__name(evsel));
+			 "The sys_perf_event_open() syscall failed for event (%s): %m\n"
+			 "\"dmesg | grep -i perf\" may provide additional information.\n",
+			 evsel__name(evsel));
 }
 
 struct perf_session *evsel__session(struct evsel *evsel)
diff --git a/tools/perf/util/jitdump.c b/tools/perf/util/jitdump.c
index f00814e37de9..d4fe35f9d9a5 100644
--- a/tools/perf/util/jitdump.c
+++ b/tools/perf/util/jitdump.c
@@ -90,7 +90,8 @@ jit_emit_elf(struct jit_buf_desc *jd,
 	saved_errno = errno;
 	nsinfo__mountns_exit(&nsc);
 	if (fd == -1) {
-		pr_warning("cannot create jit ELF %s: %s\n", filename, strerror(saved_errno));
+		errno = saved_errno;
+		pr_warning("cannot create jit ELF %s: %m\n", filename);
 		return -1;
 	}
 
diff --git a/tools/perf/util/lzma.c b/tools/perf/util/lzma.c
index c355757ed391..91b9b5171d1f 100644
--- a/tools/perf/util/lzma.c
+++ b/tools/perf/util/lzma.c
@@ -59,7 +59,7 @@ int lzma_decompress_stream_to_file(FILE *infile, int output_fd)
 			strm.avail_in = fread(buf_in, 1, sizeof(buf_in), infile);
 
 			if (ferror(infile)) {
-				pr_debug("lzma: read error: %s\n", strerror(errno));
+				pr_debug("lzma: read error: %m\n");
 				goto err_lzma_end;
 			}
 
@@ -73,7 +73,7 @@ int lzma_decompress_stream_to_file(FILE *infile, int output_fd)
 			ssize_t write_size = sizeof(buf_out) - strm.avail_out;
 
 			if (writen(output_fd, buf_out, write_size) != write_size) {
-				pr_debug("lzma: write error: %s\n", strerror(errno));
+				pr_debug("lzma: write error: %m\n");
 				goto err_lzma_end;
 			}
 
@@ -103,7 +103,7 @@ int lzma_decompress_to_file(const char *input, int output_fd)
 
 	infile = fopen(input, "rb");
 	if (!infile) {
-		pr_debug("lzma: fopen failed on %s: '%s'\n", input, strerror(errno));
+		pr_debug("lzma: fopen failed on %s: '%m'\n", input);
 		return -1;
 	}
 
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index 65fa9bdff1b8..922ef6577bbb 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -2349,9 +2349,10 @@ reader__read_event(struct reader *rd, struct perf_session *session,
 
 	if (size < sizeof(struct perf_event_header) ||
 	    (skip = rd->process(session, event, rd->file_pos, rd->path)) < 0) {
-		pr_err("%#" PRIx64 " [%#x]: failed to process type: %d [%s]\n",
+		errno = -skip;
+		pr_err("%#" PRIx64 " [%#x]: failed to process type: %d [%m]\n",
 		       rd->file_offset + rd->head, event->header.size,
-		       event->header.type, strerror(-skip));
+		       event->header.type);
 		err = skip;
 		goto out;
 	}
diff --git a/tools/perf/util/symbol-elf.c b/tools/perf/util/symbol-elf.c
index d1dcafa4b3b8..b8fea12997a0 100644
--- a/tools/perf/util/symbol-elf.c
+++ b/tools/perf/util/symbol-elf.c
@@ -1105,14 +1105,14 @@ static Elf *read_gnu_debugdata(struct dso *dso, Elf *elf, const char *name, int
 
 	wrapped = fmemopen(scn_data->d_buf, scn_data->d_size, "r");
 	if (!wrapped) {
-		pr_debug("%s: fmemopen: %s\n", __func__, strerror(errno));
+		pr_debug("%s: fmemopen: %m\n", __func__);
 		*dso__load_errno(dso) = -errno;
 		return NULL;
 	}
 
 	temp_fd = mkstemp(temp_filename);
 	if (temp_fd < 0) {
-		pr_debug("%s: mkstemp: %s\n", __func__, strerror(errno));
+		pr_debug("%s: mkstemp: %m\n", __func__);
 		*dso__load_errno(dso) = -errno;
 		fclose(wrapped);
 		return NULL;
-- 
cgit v1.2.3


From 55058e32151f95dc5badd62381d184e89f15de99 Mon Sep 17 00:00:00 2001
From: Yosry Ahmed <yosry.ahmed@linux.dev>
Date: Sat, 10 Jan 2026 00:48:20 +0000
Subject: KVM: selftests: Add a selftests for nested VMLOAD/VMSAVE

Add a test for VMLOAD/VMSAVE in an L2 guest. The test verifies that L1
intercepts for VMSAVE/VMLOAD always work regardless of
VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK.

Then, more interestingly, it makes sure that when L1 does not intercept
VMLOAD/VMSAVE, they work as intended in L2. When
VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK is enabled by L1, VMSAVE/VMLOAD from
L2 should interpret the GPA as an L2 GPA and translate it through the
NPT. When VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK is disabled by L1,
VMSAVE/VMLOAD from L2 should interpret the GPA as an L1 GPA.

To test this, put two VMCBs (0 and 1) in L1's physical address space,
and have a single L2 GPA where:
- L2 VMCB GPA == L1 VMCB(0) GPA
- L2 VMCB GPA maps to L1 VMCB(1) via the NPT in L1.

This setup allows detecting how the GPA is interpreted based on which L1
VMCB is actually accessed.

In both cases, L2 sets KERNEL_GS_BASE (one of the fields handled by
VMSAVE/VMLOAD), and executes VMSAVE to write its value to the VMCB. The
test userspace code then checks that the write was made to the correct
VMCB (based on whether VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK is set by L1),
and writes a new value to that VMCB. L2 then executes VMLOAD to load the
new value and makes sure it's reflected correctly in KERNERL_GS_BASE.

Signed-off-by: Yosry Ahmed <yosry.ahmed@linux.dev>
Link: https://patch.msgid.link/20260110004821.3411245-4-yosry.ahmed@linux.dev
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 tools/testing/selftests/kvm/Makefile.kvm           |   1 +
 .../testing/selftests/kvm/include/x86/processor.h  |   1 +
 .../selftests/kvm/x86/nested_vmsave_vmload_test.c  | 197 +++++++++++++++++++++
 3 files changed, 199 insertions(+)
 create mode 100644 tools/testing/selftests/kvm/x86/nested_vmsave_vmload_test.c

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm
index ffbf891b31d3..fc78d4508ebd 100644
--- a/tools/testing/selftests/kvm/Makefile.kvm
+++ b/tools/testing/selftests/kvm/Makefile.kvm
@@ -95,6 +95,7 @@ TEST_GEN_PROGS_x86 += x86/nested_exceptions_test
 TEST_GEN_PROGS_x86 += x86/nested_invalid_cr3_test
 TEST_GEN_PROGS_x86 += x86/nested_tsc_adjust_test
 TEST_GEN_PROGS_x86 += x86/nested_tsc_scaling_test
+TEST_GEN_PROGS_x86 += x86/nested_vmsave_vmload_test
 TEST_GEN_PROGS_x86 += x86/platform_info_test
 TEST_GEN_PROGS_x86 += x86/pmu_counters_test
 TEST_GEN_PROGS_x86 += x86/pmu_event_filter_test
diff --git a/tools/testing/selftests/kvm/include/x86/processor.h b/tools/testing/selftests/kvm/include/x86/processor.h
index 8f130e7d7048..6bfffc3b0a33 100644
--- a/tools/testing/selftests/kvm/include/x86/processor.h
+++ b/tools/testing/selftests/kvm/include/x86/processor.h
@@ -201,6 +201,7 @@ struct kvm_x86_cpu_feature {
 #define X86_FEATURE_TSCRATEMSR          KVM_X86_CPU_FEATURE(0x8000000A, 0, EDX, 4)
 #define X86_FEATURE_PAUSEFILTER         KVM_X86_CPU_FEATURE(0x8000000A, 0, EDX, 10)
 #define X86_FEATURE_PFTHRESHOLD         KVM_X86_CPU_FEATURE(0x8000000A, 0, EDX, 12)
+#define	X86_FEATURE_V_VMSAVE_VMLOAD	KVM_X86_CPU_FEATURE(0x8000000A, 0, EDX, 15)
 #define	X86_FEATURE_VGIF		KVM_X86_CPU_FEATURE(0x8000000A, 0, EDX, 16)
 #define X86_FEATURE_IDLE_HLT		KVM_X86_CPU_FEATURE(0x8000000A, 0, EDX, 30)
 #define X86_FEATURE_SEV			KVM_X86_CPU_FEATURE(0x8000001F, 0, EAX, 1)
diff --git a/tools/testing/selftests/kvm/x86/nested_vmsave_vmload_test.c b/tools/testing/selftests/kvm/x86/nested_vmsave_vmload_test.c
new file mode 100644
index 000000000000..6764a48f9d4d
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86/nested_vmsave_vmload_test.c
@@ -0,0 +1,197 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2026, Google LLC.
+ */
+#include "kvm_util.h"
+#include "vmx.h"
+#include "svm_util.h"
+#include "kselftest.h"
+
+/*
+ * Allocate two VMCB pages for testing. Both pages have different GVAs (shared
+ * by both L1 and L2) and L1 GPAs. A single L2 GPA is used such that:
+ * - L2 GPA == L1 GPA for VMCB0.
+ * - L2 GPA is mapped to L1 GPA for VMCB1 using NPT in L1.
+ *
+ * This allows testing whether the GPA used by VMSAVE/VMLOAD in L2 is
+ * interpreted as a direct L1 GPA or translated using NPT as an L2 GPA, depends
+ * on which VMCB is accessed.
+ */
+#define TEST_MEM_SLOT_INDEX		1
+#define TEST_MEM_PAGES			2
+#define TEST_MEM_BASE			0xc0000000
+
+#define TEST_GUEST_ADDR(idx)		(TEST_MEM_BASE + (idx) * PAGE_SIZE)
+
+#define TEST_VMCB_L1_GPA(idx)		TEST_GUEST_ADDR(idx)
+#define TEST_VMCB_GVA(idx)		TEST_GUEST_ADDR(idx)
+
+#define TEST_VMCB_L2_GPA		TEST_VMCB_L1_GPA(0)
+
+#define L2_GUEST_STACK_SIZE		64
+
+static void l2_guest_code_vmsave(void)
+{
+	asm volatile("vmsave %0" : : "a"(TEST_VMCB_L2_GPA) : "memory");
+}
+
+static void l2_guest_code_vmload(void)
+{
+	asm volatile("vmload %0" : : "a"(TEST_VMCB_L2_GPA) : "memory");
+}
+
+static void l2_guest_code_vmcb(int vmcb_idx)
+{
+	wrmsr(MSR_KERNEL_GS_BASE, 0xaaaa);
+	l2_guest_code_vmsave();
+
+	/* Verify the VMCB used by VMSAVE and update KERNEL_GS_BASE to 0xbbbb */
+	GUEST_SYNC(vmcb_idx);
+
+	l2_guest_code_vmload();
+	GUEST_ASSERT_EQ(rdmsr(MSR_KERNEL_GS_BASE), 0xbbbb);
+
+	/* Reset MSR_KERNEL_GS_BASE */
+	wrmsr(MSR_KERNEL_GS_BASE, 0);
+	l2_guest_code_vmsave();
+
+	vmmcall();
+}
+
+static void l2_guest_code_vmcb0(void)
+{
+	l2_guest_code_vmcb(0);
+}
+
+static void l2_guest_code_vmcb1(void)
+{
+	l2_guest_code_vmcb(1);
+}
+
+static void l1_guest_code(struct svm_test_data *svm)
+{
+	unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
+
+	/* Each test case initializes the guest RIP below */
+	generic_svm_setup(svm, NULL, &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+
+	/* Set VMSAVE/VMLOAD intercepts and make sure they work with.. */
+	svm->vmcb->control.intercept |= (BIT_ULL(INTERCEPT_VMSAVE) |
+					 BIT_ULL(INTERCEPT_VMLOAD));
+
+	 /* ..VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK cleared.. */
+	svm->vmcb->control.virt_ext &= ~VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
+
+	svm->vmcb->save.rip = (u64)l2_guest_code_vmsave;
+	run_guest(svm->vmcb, svm->vmcb_gpa);
+	GUEST_ASSERT_EQ(svm->vmcb->control.exit_code, SVM_EXIT_VMSAVE);
+
+	svm->vmcb->save.rip = (u64)l2_guest_code_vmload;
+	run_guest(svm->vmcb, svm->vmcb_gpa);
+	GUEST_ASSERT_EQ(svm->vmcb->control.exit_code, SVM_EXIT_VMLOAD);
+
+	/* ..and VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK set */
+	svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
+
+	svm->vmcb->save.rip = (u64)l2_guest_code_vmsave;
+	run_guest(svm->vmcb, svm->vmcb_gpa);
+	GUEST_ASSERT_EQ(svm->vmcb->control.exit_code, SVM_EXIT_VMSAVE);
+
+	svm->vmcb->save.rip = (u64)l2_guest_code_vmload;
+	run_guest(svm->vmcb, svm->vmcb_gpa);
+	GUEST_ASSERT_EQ(svm->vmcb->control.exit_code, SVM_EXIT_VMLOAD);
+
+	/* Now clear the intercepts to test VMSAVE/VMLOAD behavior */
+	svm->vmcb->control.intercept &= ~(BIT_ULL(INTERCEPT_VMSAVE) |
+					  BIT_ULL(INTERCEPT_VMLOAD));
+
+	/*
+	 * Without VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK, the GPA will be
+	 * interpreted as an L1 GPA, so VMCB0 should be used.
+	 */
+	svm->vmcb->save.rip = (u64)l2_guest_code_vmcb0;
+	svm->vmcb->control.virt_ext &= ~VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
+	run_guest(svm->vmcb, svm->vmcb_gpa);
+	GUEST_ASSERT_EQ(svm->vmcb->control.exit_code, SVM_EXIT_VMMCALL);
+
+	/*
+	 * With VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK, the GPA will be interpeted as
+	 * an L2 GPA, and translated through the NPT to VMCB1.
+	 */
+	svm->vmcb->save.rip = (u64)l2_guest_code_vmcb1;
+	svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
+	run_guest(svm->vmcb, svm->vmcb_gpa);
+	GUEST_ASSERT_EQ(svm->vmcb->control.exit_code, SVM_EXIT_VMMCALL);
+
+	GUEST_DONE();
+}
+
+int main(int argc, char *argv[])
+{
+	vm_vaddr_t nested_gva = 0;
+	struct vmcb *test_vmcb[2];
+	struct kvm_vcpu *vcpu;
+	struct kvm_vm *vm;
+	int i;
+
+	TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_SVM));
+	TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_NPT));
+	TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_V_VMSAVE_VMLOAD));
+
+	vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code);
+	vm_enable_tdp(vm);
+
+	vcpu_alloc_svm(vm, &nested_gva);
+	vcpu_args_set(vcpu, 1, nested_gva);
+
+	vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS,
+				    TEST_MEM_BASE, TEST_MEM_SLOT_INDEX,
+				    TEST_MEM_PAGES, 0);
+
+	for (i = 0; i <= 1; i++) {
+		virt_map(vm, TEST_VMCB_GVA(i), TEST_VMCB_L1_GPA(i), 1);
+		test_vmcb[i] = (struct vmcb *)addr_gva2hva(vm, TEST_VMCB_GVA(i));
+	}
+
+	tdp_identity_map_default_memslots(vm);
+
+	/*
+	 * L2 GPA == L1_GPA(0), but map it to L1_GPA(1), to allow testing
+	 * whether the L2 GPA is interpreted as an L1 GPA or translated through
+	 * the NPT.
+	 */
+	TEST_ASSERT_EQ(TEST_VMCB_L2_GPA, TEST_VMCB_L1_GPA(0));
+	tdp_map(vm, TEST_VMCB_L2_GPA, TEST_VMCB_L1_GPA(1), PAGE_SIZE);
+
+	for (;;) {
+		struct ucall uc;
+
+		vcpu_run(vcpu);
+		TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
+
+		switch (get_ucall(vcpu, &uc)) {
+		case UCALL_ABORT:
+			REPORT_GUEST_ASSERT(uc);
+		case UCALL_SYNC:
+			i = uc.args[1];
+			TEST_ASSERT(i == 0 || i == 1, "Unexpected VMCB idx: %d", i);
+
+			/*
+			 * Check that only the expected VMCB has KERNEL_GS_BASE
+			 * set to 0xaaaa, and update it to 0xbbbb.
+			 */
+			TEST_ASSERT_EQ(test_vmcb[i]->save.kernel_gs_base, 0xaaaa);
+			TEST_ASSERT_EQ(test_vmcb[1-i]->save.kernel_gs_base, 0);
+			test_vmcb[i]->save.kernel_gs_base = 0xbbbb;
+			break;
+		case UCALL_DONE:
+			goto done;
+		default:
+			TEST_FAIL("Unknown ucall %lu", uc.cmd);
+		}
+	}
+
+done:
+	kvm_vm_free(vm);
+	return 0;
+}
-- 
cgit v1.2.3


From 159ca97cd97ce8cc65364fee37319823b5ffb5bd Mon Sep 17 00:00:00 2001
From: James Clark <james.clark@linaro.org>
Date: Wed, 14 Jan 2026 15:57:13 +0000
Subject: perf parse-events: Refactor get_config_terms() to remove macros

The ADD_CONFIG_TERM() macros build the __type argument out of a partial
EVSEL__CONFIG_TERM_x enum name. This means that they can't be called
from a function where __type is a variable and it's also impossible to
grep the codebase to find usages of these enums as they're never typed
in full.

Fix this by removing the macros and replacing them with an
add_config_term() function. It seems the main reason these existed in
the first place was to avoid type punning and to write to a specific
field in the union, but the same thing can be achieved with a single
write to a u64 'val' field.

Running the Perf tests with "-fsanitize=undefined -fno-sanitize-recover"
results in no new issues as a result of this change.

Reviewed-by: Ian Rogers <irogers@google.com>
Signed-off-by: James Clark <james.clark@linaro.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Leo Yan <leo.yan@linux.dev>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mike Leach <mike.leach@linaro.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Suzuki Poulouse <suzuki.poulose@arm.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/evsel_config.h |   1 +
 tools/perf/util/parse-events.c | 146 ++++++++++++++++++++++++-----------------
 2 files changed, 86 insertions(+), 61 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/evsel_config.h b/tools/perf/util/evsel_config.h
index bcd3a978f0c4..685fd8d5c4a8 100644
--- a/tools/perf/util/evsel_config.h
+++ b/tools/perf/util/evsel_config.h
@@ -50,6 +50,7 @@ struct evsel_config_term {
 		u64	      cfg_chg;
 		char	      *str;
 		int	      cpu;
+		u64	      val;
 	} val;
 	bool weak;
 };
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index 17c1c36a7bf9..46422286380f 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -1116,105 +1116,107 @@ static int config_attr(struct perf_event_attr *attr,
 	return 0;
 }
 
-static int get_config_terms(const struct parse_events_terms *head_config,
-			    struct list_head *head_terms)
+static struct evsel_config_term *add_config_term(enum evsel_term_type type,
+						 struct list_head *head_terms,
+						 bool weak)
 {
-#define ADD_CONFIG_TERM(__type, __weak)				\
-	struct evsel_config_term *__t;			\
-								\
-	__t = zalloc(sizeof(*__t));				\
-	if (!__t)						\
-		return -ENOMEM;					\
-								\
-	INIT_LIST_HEAD(&__t->list);				\
-	__t->type       = EVSEL__CONFIG_TERM_ ## __type;	\
-	__t->weak	= __weak;				\
-	list_add_tail(&__t->list, head_terms)
-
-#define ADD_CONFIG_TERM_VAL(__type, __name, __val, __weak)	\
-do {								\
-	ADD_CONFIG_TERM(__type, __weak);			\
-	__t->val.__name = __val;				\
-} while (0)
+	struct evsel_config_term *t;
 
-#define ADD_CONFIG_TERM_STR(__type, __val, __weak)		\
-do {								\
-	ADD_CONFIG_TERM(__type, __weak);			\
-	__t->val.str = strdup(__val);				\
-	if (!__t->val.str) {					\
-		zfree(&__t);					\
-		return -ENOMEM;					\
-	}							\
-	__t->free_str = true;					\
-} while (0)
+	t = zalloc(sizeof(*t));
+	if (!t)
+		return NULL;
+
+	INIT_LIST_HEAD(&t->list);
+	t->type = type;
+	t->weak	= weak;
+	list_add_tail(&t->list, head_terms);
 
+	return t;
+}
+
+static int get_config_terms(const struct parse_events_terms *head_config,
+			    struct list_head *head_terms)
+{
 	struct parse_events_term *term;
 
 	list_for_each_entry(term, &head_config->terms, list) {
+		struct evsel_config_term *new_term;
+		enum evsel_term_type new_type;
+		bool str_type = false;
+		u64 val;
+
 		switch (term->type_term) {
 		case PARSE_EVENTS__TERM_TYPE_SAMPLE_PERIOD:
-			ADD_CONFIG_TERM_VAL(PERIOD, period, term->val.num, term->weak);
+			new_type = EVSEL__CONFIG_TERM_PERIOD;
+			val = term->val.num;
 			break;
 		case PARSE_EVENTS__TERM_TYPE_SAMPLE_FREQ:
-			ADD_CONFIG_TERM_VAL(FREQ, freq, term->val.num, term->weak);
+			new_type = EVSEL__CONFIG_TERM_FREQ;
+			val = term->val.num;
 			break;
 		case PARSE_EVENTS__TERM_TYPE_TIME:
-			ADD_CONFIG_TERM_VAL(TIME, time, term->val.num, term->weak);
+			new_type = EVSEL__CONFIG_TERM_TIME;
+			val = term->val.num;
 			break;
 		case PARSE_EVENTS__TERM_TYPE_CALLGRAPH:
-			ADD_CONFIG_TERM_STR(CALLGRAPH, term->val.str, term->weak);
+			new_type = EVSEL__CONFIG_TERM_CALLGRAPH;
+			str_type = true;
 			break;
 		case PARSE_EVENTS__TERM_TYPE_BRANCH_SAMPLE_TYPE:
-			ADD_CONFIG_TERM_STR(BRANCH, term->val.str, term->weak);
+			new_type = EVSEL__CONFIG_TERM_BRANCH;
+			str_type = true;
 			break;
 		case PARSE_EVENTS__TERM_TYPE_STACKSIZE:
-			ADD_CONFIG_TERM_VAL(STACK_USER, stack_user,
-					    term->val.num, term->weak);
+			new_type = EVSEL__CONFIG_TERM_STACK_USER;
+			val = term->val.num;
 			break;
 		case PARSE_EVENTS__TERM_TYPE_INHERIT:
-			ADD_CONFIG_TERM_VAL(INHERIT, inherit,
-					    term->val.num ? 1 : 0, term->weak);
+			new_type = EVSEL__CONFIG_TERM_INHERIT;
+			val = term->val.num ? 1 : 0;
 			break;
 		case PARSE_EVENTS__TERM_TYPE_NOINHERIT:
-			ADD_CONFIG_TERM_VAL(INHERIT, inherit,
-					    term->val.num ? 0 : 1, term->weak);
+			new_type = EVSEL__CONFIG_TERM_INHERIT;
+			val = term->val.num ? 0 : 1;
 			break;
 		case PARSE_EVENTS__TERM_TYPE_MAX_STACK:
-			ADD_CONFIG_TERM_VAL(MAX_STACK, max_stack,
-					    term->val.num, term->weak);
+			new_type = EVSEL__CONFIG_TERM_MAX_STACK;
+			val = term->val.num;
 			break;
 		case PARSE_EVENTS__TERM_TYPE_MAX_EVENTS:
-			ADD_CONFIG_TERM_VAL(MAX_EVENTS, max_events,
-					    term->val.num, term->weak);
+			new_type = EVSEL__CONFIG_TERM_MAX_EVENTS;
+			val = term->val.num;
 			break;
 		case PARSE_EVENTS__TERM_TYPE_OVERWRITE:
-			ADD_CONFIG_TERM_VAL(OVERWRITE, overwrite,
-					    term->val.num ? 1 : 0, term->weak);
+			new_type = EVSEL__CONFIG_TERM_OVERWRITE;
+			val = term->val.num ? 1 : 0;
 			break;
 		case PARSE_EVENTS__TERM_TYPE_NOOVERWRITE:
-			ADD_CONFIG_TERM_VAL(OVERWRITE, overwrite,
-					    term->val.num ? 0 : 1, term->weak);
+			new_type = EVSEL__CONFIG_TERM_OVERWRITE;
+			val = term->val.num ? 0 : 1;
 			break;
 		case PARSE_EVENTS__TERM_TYPE_DRV_CFG:
-			ADD_CONFIG_TERM_STR(DRV_CFG, term->val.str, term->weak);
+			new_type = EVSEL__CONFIG_TERM_DRV_CFG;
+			str_type = true;
 			break;
 		case PARSE_EVENTS__TERM_TYPE_PERCORE:
-			ADD_CONFIG_TERM_VAL(PERCORE, percore,
-					    term->val.num ? true : false, term->weak);
+			new_type = EVSEL__CONFIG_TERM_PERCORE;
+			val = term->val.num ? true : false;
 			break;
 		case PARSE_EVENTS__TERM_TYPE_AUX_OUTPUT:
-			ADD_CONFIG_TERM_VAL(AUX_OUTPUT, aux_output,
-					    term->val.num ? 1 : 0, term->weak);
+			new_type = EVSEL__CONFIG_TERM_AUX_OUTPUT;
+			val = term->val.num ? 1 : 0;
 			break;
 		case PARSE_EVENTS__TERM_TYPE_AUX_ACTION:
-			ADD_CONFIG_TERM_STR(AUX_ACTION, term->val.str, term->weak);
+			new_type = EVSEL__CONFIG_TERM_AUX_ACTION;
+			str_type = true;
 			break;
 		case PARSE_EVENTS__TERM_TYPE_AUX_SAMPLE_SIZE:
-			ADD_CONFIG_TERM_VAL(AUX_SAMPLE_SIZE, aux_sample_size,
-					    term->val.num, term->weak);
+			new_type = EVSEL__CONFIG_TERM_AUX_SAMPLE_SIZE;
+			val = term->val.num;
 			break;
 		case PARSE_EVENTS__TERM_TYPE_RATIO_TO_PREV:
-			ADD_CONFIG_TERM_STR(RATIO_TO_PREV, term->val.str, term->weak);
+			new_type = EVSEL__CONFIG_TERM_RATIO_TO_PREV;
+			str_type = true;
 			break;
 		case PARSE_EVENTS__TERM_TYPE_USER:
 		case PARSE_EVENTS__TERM_TYPE_CONFIG:
@@ -1229,7 +1231,23 @@ do {								\
 		case PARSE_EVENTS__TERM_TYPE_RAW:
 		case PARSE_EVENTS__TERM_TYPE_CPU:
 		default:
-			break;
+			/* Don't add a new term for these ones */
+			continue;
+		}
+
+		new_term = add_config_term(new_type, head_terms, term->weak);
+		if (!new_term)
+			return -ENOMEM;
+
+		if (str_type) {
+			new_term->val.str = strdup(term->val.str);
+			if (!new_term->val.str) {
+				zfree(&new_term);
+				return -ENOMEM;
+			}
+			new_term->free_str = true;
+		} else {
+			new_term->val.val = val;
 		}
 	}
 	return 0;
@@ -1290,10 +1308,16 @@ static int get_config_chgs(struct perf_pmu *pmu, struct parse_events_terms *head
 		}
 	}
 
-	if (bits)
-		ADD_CONFIG_TERM_VAL(CFG_CHG, cfg_chg, bits, false);
+	if (bits) {
+		struct evsel_config_term *new_term;
+
+		new_term = add_config_term(EVSEL__CONFIG_TERM_CFG_CHG,
+					   head_terms, false);
+		if (!new_term)
+			return -ENOMEM;
+		new_term->val.cfg_chg = bits;
+	}
 
-#undef ADD_CONFIG_TERM
 	return 0;
 }
 
-- 
cgit v1.2.3


From 4563e23bd9e4057d6d22ae6631f9dee781fd22bd Mon Sep 17 00:00:00 2001
From: James Clark <james.clark@linaro.org>
Date: Wed, 14 Jan 2026 15:57:14 +0000
Subject: perf evsel: Refactor evsel__set_config_if_unset() arguments

Make the evsel argument first to match the other evsel__* functions
and remove the redundant pmu argument, which can be accessed via evsel.

Reviewed-by: Ian Rogers <irogers@google.com>
Signed-off-by: James Clark <james.clark@linaro.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Leo Yan <leo.yan@linux.dev>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mike Leach <mike.leach@linaro.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Suzuki Poulouse <suzuki.poulose@arm.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/arch/arm/util/cs-etm.c    | 9 +++------
 tools/perf/arch/arm64/util/arm-spe.c | 2 +-
 tools/perf/arch/x86/util/intel-pt.c  | 3 +--
 tools/perf/util/evsel.h              | 4 ++--
 tools/perf/util/pmu.c                | 6 +++---
 5 files changed, 10 insertions(+), 14 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/arch/arm/util/cs-etm.c b/tools/perf/arch/arm/util/cs-etm.c
index ea891d12f8f4..c28208361d91 100644
--- a/tools/perf/arch/arm/util/cs-etm.c
+++ b/tools/perf/arch/arm/util/cs-etm.c
@@ -441,10 +441,8 @@ static int cs_etm_recording_options(struct auxtrace_record *itr,
 	 * when a context switch happened.
 	 */
 	if (!perf_cpu_map__is_any_cpu_or_is_empty(cpus)) {
-		evsel__set_config_if_unset(cs_etm_pmu, cs_etm_evsel,
-					   "timestamp", 1);
-		evsel__set_config_if_unset(cs_etm_pmu, cs_etm_evsel,
-					   "contextid", 1);
+		evsel__set_config_if_unset(cs_etm_evsel, "timestamp", 1);
+		evsel__set_config_if_unset(cs_etm_evsel, "contextid", 1);
 	}
 
 	/*
@@ -453,8 +451,7 @@ static int cs_etm_recording_options(struct auxtrace_record *itr,
 	 * timestamp tracing.
 	 */
 	if (opts->sample_time_set)
-		evsel__set_config_if_unset(cs_etm_pmu, cs_etm_evsel,
-					   "timestamp", 1);
+		evsel__set_config_if_unset(cs_etm_evsel, "timestamp", 1);
 
 	/* Add dummy event to keep tracking */
 	err = parse_event(evlist, "dummy:u");
diff --git a/tools/perf/arch/arm64/util/arm-spe.c b/tools/perf/arch/arm64/util/arm-spe.c
index d5ec1408d0ae..51014f8bff97 100644
--- a/tools/perf/arch/arm64/util/arm-spe.c
+++ b/tools/perf/arch/arm64/util/arm-spe.c
@@ -274,7 +274,7 @@ static void arm_spe_setup_evsel(struct evsel *evsel, struct perf_cpu_map *cpus)
 	 */
 	if (!perf_cpu_map__is_any_cpu_or_is_empty(cpus)) {
 		evsel__set_sample_bit(evsel, CPU);
-		evsel__set_config_if_unset(evsel->pmu, evsel, "ts_enable", 1);
+		evsel__set_config_if_unset(evsel, "ts_enable", 1);
 	}
 
 	/*
diff --git a/tools/perf/arch/x86/util/intel-pt.c b/tools/perf/arch/x86/util/intel-pt.c
index b394ad9cc635..c131a727774f 100644
--- a/tools/perf/arch/x86/util/intel-pt.c
+++ b/tools/perf/arch/x86/util/intel-pt.c
@@ -664,8 +664,7 @@ static int intel_pt_recording_options(struct auxtrace_record *itr,
 		return 0;
 
 	if (opts->auxtrace_sample_mode)
-		evsel__set_config_if_unset(intel_pt_pmu, intel_pt_evsel,
-					   "psb_period", 0);
+		evsel__set_config_if_unset(intel_pt_evsel, "psb_period", 0);
 
 	err = intel_pt_validate_config(intel_pt_pmu, intel_pt_evsel);
 	if (err)
diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
index a08130ff2e47..2cf87bc67df7 100644
--- a/tools/perf/util/evsel.h
+++ b/tools/perf/util/evsel.h
@@ -575,8 +575,8 @@ void evsel__uniquify_counter(struct evsel *counter);
 	((((src) >> (pos)) & ((1ull << (size)) - 1)) << (63 - ((pos) + (size) - 1)))
 
 u64 evsel__bitfield_swap_branch_flags(u64 value);
-void evsel__set_config_if_unset(struct perf_pmu *pmu, struct evsel *evsel,
-				const char *config_name, u64 val);
+void evsel__set_config_if_unset(struct evsel *evsel, const char *config_name,
+				u64 val);
 
 bool evsel__is_offcpu_event(struct evsel *evsel);
 
diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c
index 956ea273c2c7..e87c12946d71 100644
--- a/tools/perf/util/pmu.c
+++ b/tools/perf/util/pmu.c
@@ -1382,8 +1382,8 @@ bool evsel__is_aux_event(const struct evsel *evsel)
  * something to true, pass 1 for val rather than a pre shifted value.
  */
 #define field_prep(_mask, _val) (((_val) << (ffsll(_mask) - 1)) & (_mask))
-void evsel__set_config_if_unset(struct perf_pmu *pmu, struct evsel *evsel,
-				const char *config_name, u64 val)
+void evsel__set_config_if_unset(struct evsel *evsel, const char *config_name,
+				u64 val)
 {
 	u64 user_bits = 0, bits;
 	struct evsel_config_term *term = evsel__get_config_term(evsel, CFG_CHG);
@@ -1391,7 +1391,7 @@ void evsel__set_config_if_unset(struct perf_pmu *pmu, struct evsel *evsel,
 	if (term)
 		user_bits = term->val.cfg_chg;
 
-	bits = perf_pmu__format_bits(pmu, config_name);
+	bits = perf_pmu__format_bits(evsel->pmu, config_name);
 
 	/* Do nothing if the user changed the value */
 	if (bits & user_bits)
-- 
cgit v1.2.3


From 11ac46060512f6ef1caedabf9a1a129157d0e8a5 Mon Sep 17 00:00:00 2001
From: James Clark <james.clark@linaro.org>
Date: Wed, 14 Jan 2026 15:57:15 +0000
Subject: perf evsel: Move evsel__* functions to evsel.c

At least one of these were put here to avoid a Python binding linking
issue which is no longer present. Put them back in their correct
location to avoid confusion about which file to add a new evsel__*
function to later.

Reviewed-by: Ian Rogers <irogers@google.com>
Signed-off-by: James Clark <james.clark@linaro.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Leo Yan <leo.yan@linux.dev>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mike Leach <mike.leach@linaro.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Suzuki Poulouse <suzuki.poulose@arm.com>
Cc: Will Deacon <will@kernel.org>
Link: https://lore.kernel.org/all/ZEbAS2yx2fguW60w@kernel.org/
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/evsel.c | 40 ++++++++++++++++++++++++++++++++++++++++
 tools/perf/util/pmu.c   | 40 ----------------------------------------
 2 files changed, 40 insertions(+), 40 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index e2de642fbf53..27bdef01beaf 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -1315,6 +1315,35 @@ struct evsel_config_term *__evsel__get_config_term(struct evsel *evsel, enum evs
 	return found_term;
 }
 
+/*
+ * Set @config_name to @val as long as the user hasn't already set or cleared it
+ * by passing a config term on the command line.
+ *
+ * @val is the value to put into the bits specified by @config_name rather than
+ * the bit pattern. It is shifted into position by this function, so to set
+ * something to true, pass 1 for val rather than a pre shifted value.
+ */
+#define field_prep(_mask, _val) (((_val) << (ffsll(_mask) - 1)) & (_mask))
+void evsel__set_config_if_unset(struct evsel *evsel, const char *config_name,
+				u64 val)
+{
+	u64 user_bits = 0, bits;
+	struct evsel_config_term *term = evsel__get_config_term(evsel, CFG_CHG);
+
+	if (term)
+		user_bits = term->val.cfg_chg;
+
+	bits = perf_pmu__format_bits(evsel->pmu, config_name);
+
+	/* Do nothing if the user changed the value */
+	if (bits & user_bits)
+		return;
+
+	/* Otherwise replace it */
+	evsel->core.attr.config &= ~bits;
+	evsel->core.attr.config |= field_prep(bits, val);
+}
+
 void __weak arch_evsel__set_sample_weight(struct evsel *evsel)
 {
 	evsel__set_sample_bit(evsel, WEIGHT);
@@ -4099,6 +4128,17 @@ void evsel__set_leader(struct evsel *evsel, struct evsel *leader)
 	evsel->core.leader = &leader->core;
 }
 
+bool evsel__is_aux_event(const struct evsel *evsel)
+{
+	struct perf_pmu *pmu;
+
+	if (evsel->needs_auxtrace_mmap)
+		return true;
+
+	pmu = evsel__find_pmu(evsel);
+	return pmu && pmu->auxtrace;
+}
+
 int evsel__source_count(const struct evsel *evsel)
 {
 	struct evsel *pos;
diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c
index e87c12946d71..e3a1f26213ec 100644
--- a/tools/perf/util/pmu.c
+++ b/tools/perf/util/pmu.c
@@ -1362,46 +1362,6 @@ void perf_pmu__warn_invalid_formats(struct perf_pmu *pmu)
 	}
 }
 
-bool evsel__is_aux_event(const struct evsel *evsel)
-{
-	struct perf_pmu *pmu;
-
-	if (evsel->needs_auxtrace_mmap)
-		return true;
-
-	pmu = evsel__find_pmu(evsel);
-	return pmu && pmu->auxtrace;
-}
-
-/*
- * Set @config_name to @val as long as the user hasn't already set or cleared it
- * by passing a config term on the command line.
- *
- * @val is the value to put into the bits specified by @config_name rather than
- * the bit pattern. It is shifted into position by this function, so to set
- * something to true, pass 1 for val rather than a pre shifted value.
- */
-#define field_prep(_mask, _val) (((_val) << (ffsll(_mask) - 1)) & (_mask))
-void evsel__set_config_if_unset(struct evsel *evsel, const char *config_name,
-				u64 val)
-{
-	u64 user_bits = 0, bits;
-	struct evsel_config_term *term = evsel__get_config_term(evsel, CFG_CHG);
-
-	if (term)
-		user_bits = term->val.cfg_chg;
-
-	bits = perf_pmu__format_bits(evsel->pmu, config_name);
-
-	/* Do nothing if the user changed the value */
-	if (bits & user_bits)
-		return;
-
-	/* Otherwise replace it */
-	evsel->core.attr.config &= ~bits;
-	evsel->core.attr.config |= field_prep(bits, val);
-}
-
 static struct perf_pmu_format *
 pmu_find_format(const struct list_head *formats, const char *name)
 {
-- 
cgit v1.2.3


From 5b5e01304f13a53daec000b28ba60e51b149cdf4 Mon Sep 17 00:00:00 2001
From: James Clark <james.clark@linaro.org>
Date: Wed, 14 Jan 2026 15:57:16 +0000
Subject: perf evsel: Support sparse fields in evsel__set_config_if_unset()

Sparse config fields are technically supported although currently
unused. field_prep() only works for contiguous bitfields so replace it
with pmu_format_value().

pmu_format_value() also takes a bitmap rather than a u64 so replace
'u64 bits' with format->bits.

Reviewed-by: Ian Rogers <irogers@google.com>
Signed-off-by: James Clark <james.clark@linaro.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Leo Yan <leo.yan@linux.dev>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mike Leach <mike.leach@linaro.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Suzuki Poulouse <suzuki.poulose@arm.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/evsel.c | 19 +++++++++++--------
 tools/perf/util/pmu.c   | 32 +++-----------------------------
 tools/perf/util/pmu.h   | 28 ++++++++++++++++++++++++++++
 3 files changed, 42 insertions(+), 37 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index 27bdef01beaf..9a9f5e5a64b9 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -1323,25 +1323,28 @@ struct evsel_config_term *__evsel__get_config_term(struct evsel *evsel, enum evs
  * the bit pattern. It is shifted into position by this function, so to set
  * something to true, pass 1 for val rather than a pre shifted value.
  */
-#define field_prep(_mask, _val) (((_val) << (ffsll(_mask) - 1)) & (_mask))
 void evsel__set_config_if_unset(struct evsel *evsel, const char *config_name,
 				u64 val)
 {
-	u64 user_bits = 0, bits;
+	u64 user_bits = 0;
 	struct evsel_config_term *term = evsel__get_config_term(evsel, CFG_CHG);
+	struct perf_pmu_format *format = pmu_find_format(&evsel->pmu->format,
+							 config_name);
+	int fbit;
+
+	if (!format)
+		return;
 
 	if (term)
 		user_bits = term->val.cfg_chg;
 
-	bits = perf_pmu__format_bits(evsel->pmu, config_name);
-
 	/* Do nothing if the user changed the value */
-	if (bits & user_bits)
-		return;
+	for_each_set_bit(fbit, format->bits, PERF_PMU_FORMAT_BITS)
+		if ((1ULL << fbit) & user_bits)
+			return;
 
 	/* Otherwise replace it */
-	evsel->core.attr.config &= ~bits;
-	evsel->core.attr.config |= field_prep(bits, val);
+	pmu_format_value(format->bits, val, &evsel->core.attr.config, /*zero=*/true);
 }
 
 void __weak arch_evsel__set_sample_weight(struct evsel *evsel)
diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c
index e3a1f26213ec..7967d9159742 100644
--- a/tools/perf/util/pmu.c
+++ b/tools/perf/util/pmu.c
@@ -118,31 +118,6 @@ struct perf_pmu_alias {
 	bool info_loaded;
 };
 
-/**
- * struct perf_pmu_format - Values from a format file read from
- * <sysfs>/devices/cpu/format/ held in struct perf_pmu.
- *
- * For example, the contents of <sysfs>/devices/cpu/format/event may be
- * "config:0-7" and will be represented here as name="event",
- * value=PERF_PMU_FORMAT_VALUE_CONFIG and bits 0 to 7 will be set.
- */
-struct perf_pmu_format {
-	/** @list: Element on list within struct perf_pmu. */
-	struct list_head list;
-	/** @bits: Which config bits are set by this format value. */
-	DECLARE_BITMAP(bits, PERF_PMU_FORMAT_BITS);
-	/** @name: The modifier/file name. */
-	char *name;
-	/**
-	 * @value : Which config value the format relates to. Supported values
-	 * are from PERF_PMU_FORMAT_VALUE_CONFIG to
-	 * PERF_PMU_FORMAT_VALUE_CONFIG_END.
-	 */
-	u16 value;
-	/** @loaded: Has the contents been loaded/parsed. */
-	bool loaded;
-};
-
 static int pmu_aliases_parse(struct perf_pmu *pmu);
 
 static struct perf_pmu_format *perf_pmu__new_format(struct list_head *list, char *name)
@@ -1362,8 +1337,8 @@ void perf_pmu__warn_invalid_formats(struct perf_pmu *pmu)
 	}
 }
 
-static struct perf_pmu_format *
-pmu_find_format(const struct list_head *formats, const char *name)
+struct perf_pmu_format *pmu_find_format(const struct list_head *formats,
+					const char *name)
 {
 	struct perf_pmu_format *format;
 
@@ -1404,8 +1379,7 @@ int perf_pmu__format_type(struct perf_pmu *pmu, const char *name)
  * Sets value based on the format definition (format parameter)
  * and unformatted value (value parameter).
  */
-static void pmu_format_value(unsigned long *format, __u64 value, __u64 *v,
-			     bool zero)
+void pmu_format_value(unsigned long *format, __u64 value, __u64 *v, bool zero)
 {
 	unsigned long fbit, vbit;
 
diff --git a/tools/perf/util/pmu.h b/tools/perf/util/pmu.h
index 8f11bfe8ed6d..3a53e1882cf1 100644
--- a/tools/perf/util/pmu.h
+++ b/tools/perf/util/pmu.h
@@ -233,6 +233,31 @@ struct pmu_event_info {
 	bool deprecated;
 };
 
+/**
+ * struct perf_pmu_format - Values from a format file read from
+ * <sysfs>/devices/cpu/format/ held in struct perf_pmu.
+ *
+ * For example, the contents of <sysfs>/devices/cpu/format/event may be
+ * "config:0-7" and will be represented here as name="event",
+ * value=PERF_PMU_FORMAT_VALUE_CONFIG and bits 0 to 7 will be set.
+ */
+struct perf_pmu_format {
+	/** @list: Element on list within struct perf_pmu. */
+	struct list_head list;
+	/** @bits: Which config bits are set by this format value. */
+	DECLARE_BITMAP(bits, PERF_PMU_FORMAT_BITS);
+	/** @name: The modifier/file name. */
+	char *name;
+	/**
+	 * @value : Which config value the format relates to. Supported values
+	 * are from PERF_PMU_FORMAT_VALUE_CONFIG to
+	 * PERF_PMU_FORMAT_VALUE_CONFIG_END.
+	 */
+	u16 value;
+	/** @loaded: Has the contents been loaded/parsed. */
+	bool loaded;
+};
+
 typedef int (*pmu_event_callback)(void *state, struct pmu_event_info *info);
 typedef int (*pmu_format_callback)(void *state, const char *name, int config,
 				   const unsigned long *bits);
@@ -254,6 +279,9 @@ int perf_pmu__check_alias(struct perf_pmu *pmu, struct parse_events_terms *head_
 			  u64 *alternate_hw_config, struct parse_events_error *err);
 int perf_pmu__find_event(struct perf_pmu *pmu, const char *event, void *state, pmu_event_callback cb);
 
+void pmu_format_value(unsigned long *format, __u64 value, __u64 *v, bool zero);
+struct perf_pmu_format *pmu_find_format(const struct list_head *formats,
+					const char *name);
 void perf_pmu_format__set_value(void *format, int config, unsigned long *bits);
 bool perf_pmu__has_format(const struct perf_pmu *pmu, const char *name);
 int perf_pmu__for_each_format(struct perf_pmu *pmu, void *state, pmu_format_callback cb);
-- 
cgit v1.2.3


From a2441cf3a5930370aa02d14f2c90fcc4c2ba26f7 Mon Sep 17 00:00:00 2001
From: James Clark <james.clark@linaro.org>
Date: Wed, 14 Jan 2026 15:57:17 +0000
Subject: perf parse-events: Track all user changed config bits

Currently we only track which bits were set by the user in attr->config.
But all configN fields should be treated equally as they can all have
default and user overridden values.

Track them all by making get_config_chgs() generic and calling it once
for each config value.

Reviewed-by: Ian Rogers <irogers@google.com>
Signed-off-by: James Clark <james.clark@linaro.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Leo Yan <leo.yan@linux.dev>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mike Leach <mike.leach@linaro.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Suzuki Poulouse <suzuki.poulose@arm.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/evsel.c        |  9 +++-
 tools/perf/util/evsel_config.h |  6 ++-
 tools/perf/util/parse-events.c | 98 +++++++++++++++++++++++-------------------
 tools/perf/util/pmu.c          |  4 +-
 tools/perf/util/pmu.h          |  4 +-
 5 files changed, 70 insertions(+), 51 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index 9a9f5e5a64b9..1a41a10d9ab2 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -1243,7 +1243,11 @@ static void evsel__apply_config_terms(struct evsel *evsel,
 		case EVSEL__CONFIG_TERM_AUX_SAMPLE_SIZE:
 			/* Already applied by auxtrace */
 			break;
-		case EVSEL__CONFIG_TERM_CFG_CHG:
+		case EVSEL__CONFIG_TERM_USR_CHG_CONFIG:
+		case EVSEL__CONFIG_TERM_USR_CHG_CONFIG1:
+		case EVSEL__CONFIG_TERM_USR_CHG_CONFIG2:
+		case EVSEL__CONFIG_TERM_USR_CHG_CONFIG3:
+		case EVSEL__CONFIG_TERM_USR_CHG_CONFIG4:
 			break;
 		case EVSEL__CONFIG_TERM_RATIO_TO_PREV:
 			rtp_buf = term->val.str;
@@ -1327,7 +1331,8 @@ void evsel__set_config_if_unset(struct evsel *evsel, const char *config_name,
 				u64 val)
 {
 	u64 user_bits = 0;
-	struct evsel_config_term *term = evsel__get_config_term(evsel, CFG_CHG);
+	struct evsel_config_term *term = evsel__get_config_term(evsel,
+								USR_CHG_CONFIG);
 	struct perf_pmu_format *format = pmu_find_format(&evsel->pmu->format,
 							 config_name);
 	int fbit;
diff --git a/tools/perf/util/evsel_config.h b/tools/perf/util/evsel_config.h
index 685fd8d5c4a8..7b565d76c0bc 100644
--- a/tools/perf/util/evsel_config.h
+++ b/tools/perf/util/evsel_config.h
@@ -27,7 +27,11 @@ enum evsel_term_type {
 	EVSEL__CONFIG_TERM_AUX_OUTPUT,
 	EVSEL__CONFIG_TERM_AUX_ACTION,
 	EVSEL__CONFIG_TERM_AUX_SAMPLE_SIZE,
-	EVSEL__CONFIG_TERM_CFG_CHG,
+	EVSEL__CONFIG_TERM_USR_CHG_CONFIG,
+	EVSEL__CONFIG_TERM_USR_CHG_CONFIG1,
+	EVSEL__CONFIG_TERM_USR_CHG_CONFIG2,
+	EVSEL__CONFIG_TERM_USR_CHG_CONFIG3,
+	EVSEL__CONFIG_TERM_USR_CHG_CONFIG4,
 	EVSEL__CONFIG_TERM_RATIO_TO_PREV,
 };
 
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index 46422286380f..1f6e2213326d 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -1253,66 +1253,32 @@ static int get_config_terms(const struct parse_events_terms *head_config,
 	return 0;
 }
 
-/*
- * Add EVSEL__CONFIG_TERM_CFG_CHG where cfg_chg will have a bit set for
- * each bit of attr->config that the user has changed.
- */
-static int get_config_chgs(struct perf_pmu *pmu, struct parse_events_terms *head_config,
-			   struct list_head *head_terms)
+static int add_cfg_chg(const struct perf_pmu *pmu,
+		       const struct parse_events_terms *head_config,
+		       struct list_head *head_terms,
+		       int format_type,
+		       enum parse_events__term_type term_type,
+		       enum evsel_term_type new_term_type)
 {
 	struct parse_events_term *term;
 	u64 bits = 0;
 	int type;
 
 	list_for_each_entry(term, &head_config->terms, list) {
-		switch (term->type_term) {
-		case PARSE_EVENTS__TERM_TYPE_USER:
+		if (term->type_term == PARSE_EVENTS__TERM_TYPE_USER) {
 			type = perf_pmu__format_type(pmu, term->config);
-			if (type != PERF_PMU_FORMAT_VALUE_CONFIG)
+			if (type != format_type)
 				continue;
 			bits |= perf_pmu__format_bits(pmu, term->config);
-			break;
-		case PARSE_EVENTS__TERM_TYPE_CONFIG:
+		} else if (term->type_term == term_type) {
 			bits = ~(u64)0;
-			break;
-		case PARSE_EVENTS__TERM_TYPE_CONFIG1:
-		case PARSE_EVENTS__TERM_TYPE_CONFIG2:
-		case PARSE_EVENTS__TERM_TYPE_CONFIG3:
-		case PARSE_EVENTS__TERM_TYPE_CONFIG4:
-		case PARSE_EVENTS__TERM_TYPE_LEGACY_HARDWARE_CONFIG:
-		case PARSE_EVENTS__TERM_TYPE_LEGACY_CACHE_CONFIG:
-		case PARSE_EVENTS__TERM_TYPE_NAME:
-		case PARSE_EVENTS__TERM_TYPE_SAMPLE_PERIOD:
-		case PARSE_EVENTS__TERM_TYPE_SAMPLE_FREQ:
-		case PARSE_EVENTS__TERM_TYPE_BRANCH_SAMPLE_TYPE:
-		case PARSE_EVENTS__TERM_TYPE_TIME:
-		case PARSE_EVENTS__TERM_TYPE_CALLGRAPH:
-		case PARSE_EVENTS__TERM_TYPE_STACKSIZE:
-		case PARSE_EVENTS__TERM_TYPE_NOINHERIT:
-		case PARSE_EVENTS__TERM_TYPE_INHERIT:
-		case PARSE_EVENTS__TERM_TYPE_MAX_STACK:
-		case PARSE_EVENTS__TERM_TYPE_MAX_EVENTS:
-		case PARSE_EVENTS__TERM_TYPE_NOOVERWRITE:
-		case PARSE_EVENTS__TERM_TYPE_OVERWRITE:
-		case PARSE_EVENTS__TERM_TYPE_DRV_CFG:
-		case PARSE_EVENTS__TERM_TYPE_PERCORE:
-		case PARSE_EVENTS__TERM_TYPE_AUX_OUTPUT:
-		case PARSE_EVENTS__TERM_TYPE_AUX_ACTION:
-		case PARSE_EVENTS__TERM_TYPE_AUX_SAMPLE_SIZE:
-		case PARSE_EVENTS__TERM_TYPE_METRIC_ID:
-		case PARSE_EVENTS__TERM_TYPE_RAW:
-		case PARSE_EVENTS__TERM_TYPE_CPU:
-		case PARSE_EVENTS__TERM_TYPE_RATIO_TO_PREV:
-		default:
-			break;
 		}
 	}
 
 	if (bits) {
 		struct evsel_config_term *new_term;
 
-		new_term = add_config_term(EVSEL__CONFIG_TERM_CFG_CHG,
-					   head_terms, false);
+		new_term = add_config_term(new_term_type, head_terms, false);
 		if (!new_term)
 			return -ENOMEM;
 		new_term->val.cfg_chg = bits;
@@ -1321,6 +1287,50 @@ static int get_config_chgs(struct perf_pmu *pmu, struct parse_events_terms *head
 	return 0;
 }
 
+/*
+ * Add EVSEL__CONFIG_TERM_USR_CFG_CONFIGn where cfg_chg will have a bit set for
+ * each bit of attr->configN that the user has changed.
+ */
+static int get_config_chgs(const struct perf_pmu *pmu,
+			   const struct parse_events_terms *head_config,
+			   struct list_head *head_terms)
+{
+	int ret;
+
+	ret = add_cfg_chg(pmu, head_config, head_terms,
+			  PERF_PMU_FORMAT_VALUE_CONFIG,
+			  PARSE_EVENTS__TERM_TYPE_CONFIG,
+			  EVSEL__CONFIG_TERM_USR_CHG_CONFIG);
+	if (ret)
+		return ret;
+
+	ret = add_cfg_chg(pmu, head_config, head_terms,
+			  PERF_PMU_FORMAT_VALUE_CONFIG1,
+			  PARSE_EVENTS__TERM_TYPE_CONFIG1,
+			  EVSEL__CONFIG_TERM_USR_CHG_CONFIG1);
+	if (ret)
+		return ret;
+
+	ret = add_cfg_chg(pmu, head_config, head_terms,
+			  PERF_PMU_FORMAT_VALUE_CONFIG2,
+			  PARSE_EVENTS__TERM_TYPE_CONFIG2,
+			  EVSEL__CONFIG_TERM_USR_CHG_CONFIG2);
+	if (ret)
+		return ret;
+
+	ret = add_cfg_chg(pmu, head_config, head_terms,
+			  PERF_PMU_FORMAT_VALUE_CONFIG3,
+			  PARSE_EVENTS__TERM_TYPE_CONFIG3,
+			  EVSEL__CONFIG_TERM_USR_CHG_CONFIG3);
+	if (ret)
+		return ret;
+
+	return add_cfg_chg(pmu, head_config, head_terms,
+			   PERF_PMU_FORMAT_VALUE_CONFIG4,
+			   PARSE_EVENTS__TERM_TYPE_CONFIG4,
+			   EVSEL__CONFIG_TERM_USR_CHG_CONFIG4);
+}
+
 int parse_events_add_tracepoint(struct parse_events_state *parse_state,
 				struct list_head *list,
 				const char *sys, const char *event,
diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c
index 7967d9159742..dc5dab69151f 100644
--- a/tools/perf/util/pmu.c
+++ b/tools/perf/util/pmu.c
@@ -1349,7 +1349,7 @@ struct perf_pmu_format *pmu_find_format(const struct list_head *formats,
 	return NULL;
 }
 
-__u64 perf_pmu__format_bits(struct perf_pmu *pmu, const char *name)
+__u64 perf_pmu__format_bits(const struct perf_pmu *pmu, const char *name)
 {
 	struct perf_pmu_format *format = pmu_find_format(&pmu->format, name);
 	__u64 bits = 0;
@@ -1364,7 +1364,7 @@ __u64 perf_pmu__format_bits(struct perf_pmu *pmu, const char *name)
 	return bits;
 }
 
-int perf_pmu__format_type(struct perf_pmu *pmu, const char *name)
+int perf_pmu__format_type(const struct perf_pmu *pmu, const char *name)
 {
 	struct perf_pmu_format *format = pmu_find_format(&pmu->format, name);
 
diff --git a/tools/perf/util/pmu.h b/tools/perf/util/pmu.h
index 3a53e1882cf1..7655d996090a 100644
--- a/tools/perf/util/pmu.h
+++ b/tools/perf/util/pmu.h
@@ -272,8 +272,8 @@ int perf_pmu__config_terms(const struct perf_pmu *pmu,
 			   struct parse_events_terms *terms,
 			   bool zero, bool apply_hardcoded,
 			   struct parse_events_error *error);
-__u64 perf_pmu__format_bits(struct perf_pmu *pmu, const char *name);
-int perf_pmu__format_type(struct perf_pmu *pmu, const char *name);
+__u64 perf_pmu__format_bits(const struct perf_pmu *pmu, const char *name);
+int perf_pmu__format_type(const struct perf_pmu *pmu, const char *name);
 int perf_pmu__check_alias(struct perf_pmu *pmu, struct parse_events_terms *head_terms,
 			  struct perf_pmu_info *info, bool *rewrote_terms,
 			  u64 *alternate_hw_config, struct parse_events_error *err);
-- 
cgit v1.2.3


From 87775abac8733f5a4856cd59122c6dc8c8032a13 Mon Sep 17 00:00:00 2001
From: James Clark <james.clark@linaro.org>
Date: Wed, 14 Jan 2026 15:57:18 +0000
Subject: perf evsel: apply evsel__set_config_if_unset() to all config fields

Misleadingly, evsel__set_config_if_unset() only works with the config
field and not config1, config2, etc. This is fine at the moment because
all users of it happen to operate on bits that are in that config field.
Fix it before there are any new users of the function which operate on
bits in different config fields.

In theory it's also possible for a driver to move an existing bit to
another config field and this fixes that scenario too, although this
hasn't happened yet either.

Reviewed-by: Ian Rogers <irogers@google.com>
Signed-off-by: James Clark <james.clark@linaro.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Leo Yan <leo.yan@linux.dev>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mike Leach <mike.leach@linaro.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Suzuki Poulouse <suzuki.poulose@arm.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/evsel.c | 32 +++++++++++++++++++++++++++++++-
 1 file changed, 31 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index 1a41a10d9ab2..32517683351f 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -1336,6 +1336,36 @@ void evsel__set_config_if_unset(struct evsel *evsel, const char *config_name,
 	struct perf_pmu_format *format = pmu_find_format(&evsel->pmu->format,
 							 config_name);
 	int fbit;
+	__u64 *vp;
+
+	if (!format)
+		return;
+
+	switch (format->value) {
+	case PERF_PMU_FORMAT_VALUE_CONFIG:
+		term = evsel__get_config_term(evsel, USR_CHG_CONFIG);
+		vp = &evsel->core.attr.config;
+		break;
+	case PERF_PMU_FORMAT_VALUE_CONFIG1:
+		term = evsel__get_config_term(evsel, USR_CHG_CONFIG1);
+		vp = &evsel->core.attr.config1;
+		break;
+	case PERF_PMU_FORMAT_VALUE_CONFIG2:
+		term = evsel__get_config_term(evsel, USR_CHG_CONFIG2);
+		vp = &evsel->core.attr.config2;
+		break;
+	case PERF_PMU_FORMAT_VALUE_CONFIG3:
+		term = evsel__get_config_term(evsel, USR_CHG_CONFIG3);
+		vp = &evsel->core.attr.config3;
+		break;
+	case PERF_PMU_FORMAT_VALUE_CONFIG4:
+		term = evsel__get_config_term(evsel, USR_CHG_CONFIG4);
+		vp = &evsel->core.attr.config4;
+		break;
+	default:
+		pr_err("Unknown format value: %d\n", format->value);
+		return;
+	}
 
 	if (!format)
 		return;
@@ -1349,7 +1379,7 @@ void evsel__set_config_if_unset(struct evsel *evsel, const char *config_name,
 			return;
 
 	/* Otherwise replace it */
-	pmu_format_value(format->bits, val, &evsel->core.attr.config, /*zero=*/true);
+	pmu_format_value(format->bits, val, vp, /*zero=*/true);
 }
 
 void __weak arch_evsel__set_sample_weight(struct evsel *evsel)
-- 
cgit v1.2.3


From 34b4cfbe5cb03328a3330ea47dd8df00715dd627 Mon Sep 17 00:00:00 2001
From: James Clark <james.clark@linaro.org>
Date: Wed, 14 Jan 2026 15:57:19 +0000
Subject: perf evsel: Add a helper to get the value of a config field

This will be used by aux PMUs to read an already written value for
configuring their events and for also testing.

Its helper perf_pmu__format_unpack() does the opposite of the existing
pmu_format_value() so rename that one to perf_pmu__format_pack() so it's
clear how they are related.

Reviewed-by: Ian Rogers <irogers@google.com>
Signed-off-by: James Clark <james.clark@linaro.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Leo Yan <leo.yan@linux.dev>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mike Leach <mike.leach@linaro.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Suzuki Poulouse <suzuki.poulose@arm.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/evsel.c | 42 +++++++++++++++++++++++++++++++++++++++++-
 tools/perf/util/evsel.h |  2 ++
 tools/perf/util/pmu.c   | 35 ++++++++++++++++++++++++++++-------
 tools/perf/util/pmu.h   |  4 +++-
 4 files changed, 74 insertions(+), 9 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index 32517683351f..6d324141588c 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -1379,7 +1379,47 @@ void evsel__set_config_if_unset(struct evsel *evsel, const char *config_name,
 			return;
 
 	/* Otherwise replace it */
-	pmu_format_value(format->bits, val, vp, /*zero=*/true);
+	perf_pmu__format_pack(format->bits, val, vp, /*zero=*/true);
+}
+
+
+int evsel__get_config_val(const struct evsel *evsel, const char *config_name,
+			  u64 *val)
+{
+	struct perf_pmu_format *format = pmu_find_format(&evsel->pmu->format, config_name);
+
+	if (!format || bitmap_empty(format->bits, PERF_PMU_FORMAT_BITS)) {
+		pr_err("Unknown/empty format name: %s\n", config_name);
+		*val = 0;
+		return -EINVAL;
+	}
+
+	switch (format->value) {
+	case PERF_PMU_FORMAT_VALUE_CONFIG:
+		*val = perf_pmu__format_unpack(format->bits,
+					       evsel->core.attr.config);
+		return 0;
+	case PERF_PMU_FORMAT_VALUE_CONFIG1:
+		*val = perf_pmu__format_unpack(format->bits,
+					       evsel->core.attr.config1);
+		return 0;
+	case PERF_PMU_FORMAT_VALUE_CONFIG2:
+		*val = perf_pmu__format_unpack(format->bits,
+					       evsel->core.attr.config2);
+		return 0;
+	case PERF_PMU_FORMAT_VALUE_CONFIG3:
+		*val = perf_pmu__format_unpack(format->bits,
+					       evsel->core.attr.config3);
+		return 0;
+	case PERF_PMU_FORMAT_VALUE_CONFIG4:
+		*val = perf_pmu__format_unpack(format->bits,
+					       evsel->core.attr.config4);
+		return 0;
+	default:
+		pr_err("Unknown format value: %d\n", format->value);
+		*val = 0;
+		return -EINVAL;
+	}
 }
 
 void __weak arch_evsel__set_sample_weight(struct evsel *evsel)
diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
index 2cf87bc67df7..95c4bd0f0f2e 100644
--- a/tools/perf/util/evsel.h
+++ b/tools/perf/util/evsel.h
@@ -575,6 +575,8 @@ void evsel__uniquify_counter(struct evsel *counter);
 	((((src) >> (pos)) & ((1ull << (size)) - 1)) << (63 - ((pos) + (size) - 1)))
 
 u64 evsel__bitfield_swap_branch_flags(u64 value);
+int evsel__get_config_val(const struct evsel *evsel, const char *config_name,
+			  u64 *val);
 void evsel__set_config_if_unset(struct evsel *evsel, const char *config_name,
 				u64 val);
 
diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c
index dc5dab69151f..bb399a47d2b4 100644
--- a/tools/perf/util/pmu.c
+++ b/tools/perf/util/pmu.c
@@ -1337,6 +1337,26 @@ void perf_pmu__warn_invalid_formats(struct perf_pmu *pmu)
 	}
 }
 
+/*
+ * Unpacks a raw config[n] value using the sparse bitfield that defines a
+ * format attr. For example "config1:1,6-7,44" defines a 4 bit value across non
+ * contiguous bits and this function returns those 4 bits as a value.
+ */
+u64 perf_pmu__format_unpack(unsigned long *format, u64 config_val)
+{
+	int val_bit = 0;
+	u64 res = 0;
+	int fmt_bit;
+
+	for_each_set_bit(fmt_bit, format, PERF_PMU_FORMAT_BITS) {
+		if (config_val & (1ULL << fmt_bit))
+			res |= BIT_ULL(val_bit);
+
+		val_bit++;
+	}
+	return res;
+}
+
 struct perf_pmu_format *pmu_find_format(const struct list_head *formats,
 					const char *name)
 {
@@ -1379,7 +1399,8 @@ int perf_pmu__format_type(const struct perf_pmu *pmu, const char *name)
  * Sets value based on the format definition (format parameter)
  * and unformatted value (value parameter).
  */
-void pmu_format_value(unsigned long *format, __u64 value, __u64 *v, bool zero)
+void perf_pmu__format_pack(unsigned long *format, __u64 value, __u64 *v,
+			   bool zero)
 {
 	unsigned long fbit, vbit;
 
@@ -1496,23 +1517,23 @@ static int pmu_config_term(const struct perf_pmu *pmu,
 		switch (term->type_term) {
 		case PARSE_EVENTS__TERM_TYPE_CONFIG:
 			assert(term->type_val == PARSE_EVENTS__TERM_TYPE_NUM);
-			pmu_format_value(bits, term->val.num, &attr->config, zero);
+			perf_pmu__format_pack(bits, term->val.num, &attr->config, zero);
 			break;
 		case PARSE_EVENTS__TERM_TYPE_CONFIG1:
 			assert(term->type_val == PARSE_EVENTS__TERM_TYPE_NUM);
-			pmu_format_value(bits, term->val.num, &attr->config1, zero);
+			perf_pmu__format_pack(bits, term->val.num, &attr->config1, zero);
 			break;
 		case PARSE_EVENTS__TERM_TYPE_CONFIG2:
 			assert(term->type_val == PARSE_EVENTS__TERM_TYPE_NUM);
-			pmu_format_value(bits, term->val.num, &attr->config2, zero);
+			perf_pmu__format_pack(bits, term->val.num, &attr->config2, zero);
 			break;
 		case PARSE_EVENTS__TERM_TYPE_CONFIG3:
 			assert(term->type_val == PARSE_EVENTS__TERM_TYPE_NUM);
-			pmu_format_value(bits, term->val.num, &attr->config3, zero);
+			perf_pmu__format_pack(bits, term->val.num, &attr->config3, zero);
 			break;
 		case PARSE_EVENTS__TERM_TYPE_CONFIG4:
 			assert(term->type_val == PARSE_EVENTS__TERM_TYPE_NUM);
-			pmu_format_value(bits, term->val.num, &attr->config4, zero);
+			perf_pmu__format_pack(bits, term->val.num, &attr->config4, zero);
 			break;
 		case PARSE_EVENTS__TERM_TYPE_LEGACY_HARDWARE_CONFIG:
 			assert(term->type_val == PARSE_EVENTS__TERM_TYPE_NUM);
@@ -1650,7 +1671,7 @@ static int pmu_config_term(const struct perf_pmu *pmu,
 		 */
 	}
 
-	pmu_format_value(format->bits, val, vp, zero);
+	perf_pmu__format_pack(format->bits, val, vp, zero);
 	return 0;
 }
 
diff --git a/tools/perf/util/pmu.h b/tools/perf/util/pmu.h
index 7655d996090a..7ef90b54a149 100644
--- a/tools/perf/util/pmu.h
+++ b/tools/perf/util/pmu.h
@@ -279,12 +279,14 @@ int perf_pmu__check_alias(struct perf_pmu *pmu, struct parse_events_terms *head_
 			  u64 *alternate_hw_config, struct parse_events_error *err);
 int perf_pmu__find_event(struct perf_pmu *pmu, const char *event, void *state, pmu_event_callback cb);
 
-void pmu_format_value(unsigned long *format, __u64 value, __u64 *v, bool zero);
+void perf_pmu__format_pack(unsigned long *format, __u64 value, __u64 *v,
+			   bool zero);
 struct perf_pmu_format *pmu_find_format(const struct list_head *formats,
 					const char *name);
 void perf_pmu_format__set_value(void *format, int config, unsigned long *bits);
 bool perf_pmu__has_format(const struct perf_pmu *pmu, const char *name);
 int perf_pmu__for_each_format(struct perf_pmu *pmu, void *state, pmu_format_callback cb);
+u64 perf_pmu__format_unpack(unsigned long *format, u64 config_val);
 
 bool is_pmu_core(const char *name);
 bool perf_pmu__supports_legacy_cache(const struct perf_pmu *pmu);
-- 
cgit v1.2.3


From 6f87719b8ae170448348e56a82228ca39a3336a6 Mon Sep 17 00:00:00 2001
From: James Clark <james.clark@linaro.org>
Date: Wed, 14 Jan 2026 15:57:20 +0000
Subject: perf parse-events: Always track user config changes

Requiring the 'pmu->perf_event_attr_init_default' callback to be set to
track user changes is a bit of a trap to fall in. It's hard to see that
this is required when depending on the user change tracking.

It's possible to want all 0 defaults so not set it, but at the same time
still do some programmatic setting of configs with
evsel__set_config_if_unset(). Also if a PMU reverts to 0 defaults and
deletes its existing callback, it will silently break existing uses of
evsel__set_config_if_unset().

One way to fix this would be to assert in evsel__set_config_if_unset()
if the changes weren't tracked, but that would be a possibly untested
runtime failure. Instead, always track it as it's harmless and
simplifies testing too.

Reviewed-by: Ian Rogers <irogers@google.com>
Signed-off-by: James Clark <james.clark@linaro.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Leo Yan <leo.yan@linux.dev>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mike Leach <mike.leach@linaro.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Suzuki Poulouse <suzuki.poulose@arm.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/parse-events.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index 1f6e2213326d..c8f2962a06c7 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -1528,12 +1528,8 @@ static int parse_events_add_pmu(struct parse_events_state *parse_state,
 		return -ENOMEM;
 	}
 
-	/*
-	 * When using default config, record which bits of attr->config were
-	 * changed by the user.
-	 */
-	if (pmu->perf_event_attr_init_default &&
-	    get_config_chgs(pmu, &parsed_terms, &config_terms)) {
+	/* Record which bits of attr->config were changed by the user. */
+	if (get_config_chgs(pmu, &parsed_terms, &config_terms)) {
 		parse_events_terms__exit(&parsed_terms);
 		return -ENOMEM;
 	}
-- 
cgit v1.2.3


From 8e2ef85c66dc65b61ca16be2650936387dc0d583 Mon Sep 17 00:00:00 2001
From: James Clark <james.clark@linaro.org>
Date: Wed, 14 Jan 2026 15:57:21 +0000
Subject: perf tests: Test evsel__set_config_if_unset() and config change
 tracking

Test that evsel__set_config_if_unset() behaves as expected. This also
tests the user config change tracking mechanism as it depends on it.

Reviewed-by: Ian Rogers <irogers@google.com>
Signed-off-by: James Clark <james.clark@linaro.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Leo Yan <leo.yan@linux.dev>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mike Leach <mike.leach@linaro.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Suzuki Poulouse <suzuki.poulose@arm.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/tests/pmu.c | 91 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 91 insertions(+)

(limited to 'tools')

diff --git a/tools/perf/tests/pmu.c b/tools/perf/tests/pmu.c
index cbded2c6faa4..0ebf2d7b2cb4 100644
--- a/tools/perf/tests/pmu.c
+++ b/tools/perf/tests/pmu.c
@@ -192,12 +192,102 @@ static int test__pmu_format(struct test_suite *test __maybe_unused, int subtest
 	}
 	if (attr.config2 != 0x0400000020041d07) {
 		pr_err("Unexpected config2 value %llx\n", attr.config2);
+	}
+
+	ret = TEST_OK;
+err_out:
+	parse_events_terms__exit(&terms);
+	test_pmu_put(dir, pmu);
+	return ret;
+}
+
+static int test__pmu_usr_chgs(struct test_suite *test __maybe_unused, int subtest __maybe_unused)
+{
+	const char *event = "perf-pmu-test/config=15,config1=4,krava02=170,"
+			    "krava03=1,krava11=27,krava12=1/";
+	struct parse_events_terms terms;
+	struct parse_events_error err;
+	LIST_HEAD(config_terms);
+	struct evlist *evlist;
+	struct perf_pmu *pmu;
+	struct evsel *evsel;
+	int ret = TEST_FAIL;
+	char dir[PATH_MAX];
+	u64 val;
+
+	pmu = test_pmu_get(dir, sizeof(dir));
+	if (!pmu)
+		return TEST_FAIL;
+
+	evlist = evlist__new();
+	if (evlist == NULL) {
+		pr_err("Failed allocation");
+		goto err_out;
+	}
+
+	parse_events_terms__init(&terms);
+	ret = parse_events(evlist, event, &err);
+	if (ret) {
+		pr_debug("failed to parse event '%s', err %d\n", event, ret);
+		parse_events_error__print(&err, event);
+		if (parse_events_error__contains(&err, "can't access trace events"))
+			ret = TEST_SKIP;
 		goto err_out;
 	}
+	evsel = evlist__first(evlist);
+
+	/*
+	 * Set via config=15, krava01 bits 0-1
+	 * Set via config1=4, krava11 bit 1
+	 * Set values: krava02=170, krava03=1, krava11=27, krava12=1
+	 *
+	 * Test that already set values aren't overwritten.
+	 */
+	evsel__set_config_if_unset(evsel, "krava01", 16);
+	evsel__get_config_val(evsel, "krava01", &val);
+	TEST_ASSERT_EQUAL("krava01 overwritten", (int) val, (15 & 0b11));
+
+	evsel__set_config_if_unset(evsel, "krava11", 45);
+	evsel__get_config_val(evsel, "krava11", &val);
+	TEST_ASSERT_EQUAL("krava11 overwritten", (int) val, (27 | (4 << 1)));
+
+	evsel__set_config_if_unset(evsel, "krava02", 32);
+	evsel__get_config_val(evsel, "krava02", &val);
+	TEST_ASSERT_EQUAL("krava02 overwritten", (int) val, 170);
+
+	evsel__set_config_if_unset(evsel, "krava03", 0);
+	evsel__get_config_val(evsel, "krava03", &val);
+	TEST_ASSERT_EQUAL("krava03 overwritten", (int) val, 1);
+
+	/*
+	 * krava13 doesn't have any bits set by either krava13= or config1=
+	 * but setting _any_ raw value for config1 implies that krava13
+	 * shouldn't be overwritten. So it's value should remain as 0.
+	 */
+	evsel__set_config_if_unset(evsel, "krava13", 5);
+	evsel__get_config_val(evsel, "krava13", &val);
+	TEST_ASSERT_EQUAL("krava13 overwritten", (int) val, 0);
+
+	/*
+	 * Unset values: krava21, krava22, krava23
+	 *
+	 * Test that unset values are overwritten.
+	 */
+	evsel__set_config_if_unset(evsel, "krava21", 13905);
+	evsel__get_config_val(evsel, "krava21", &val);
+	TEST_ASSERT_EQUAL("krava21 not overwritten", (int) val, 13905);
+
+	evsel__set_config_if_unset(evsel, "krava22", 11);
+	evsel__get_config_val(evsel, "krava22", &val);
+	TEST_ASSERT_EQUAL("krava22 not overwritten", (int) val, 11);
 
+	evsel__set_config_if_unset(evsel, "krava23", 0);
+	evsel__get_config_val(evsel, "krava23", &val);
+	TEST_ASSERT_EQUAL("krava23 not overwritten", (int) val, 0);
 	ret = TEST_OK;
 err_out:
 	parse_events_terms__exit(&terms);
+	evlist__delete(evlist);
 	test_pmu_put(dir, pmu);
 	return ret;
 }
@@ -539,6 +629,7 @@ static struct test_case tests__pmu[] = {
 	TEST_CASE("PMU name combining", name_len),
 	TEST_CASE("PMU name comparison", name_cmp),
 	TEST_CASE("PMU cmdline match", pmu_match),
+	TEST_CASE("PMU user config changes", pmu_usr_chgs),
 	{	.name = NULL, }
 };
 
-- 
cgit v1.2.3


From 4c2efb230a76d9dcdf0e4c39d1116df08312e740 Mon Sep 17 00:00:00 2001
From: James Clark <james.clark@linaro.org>
Date: Wed, 14 Jan 2026 15:57:22 +0000
Subject: perf cs-etm: Make a helper to find the Coresight evsel

This pattern occurs a few times and we'll add another one later, so add
a helper function for it.

Reviewed-by: Ian Rogers <irogers@google.com>
Reviewed-by: Leo Yan <leo.yan@arm.com>
Signed-off-by: James Clark <james.clark@linaro.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Leo Yan <leo.yan@linux.dev>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mike Leach <mike.leach@linaro.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Suzuki Poulouse <suzuki.poulose@arm.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/arch/arm/util/cs-etm.c | 50 +++++++++++++++++++++------------------
 1 file changed, 27 insertions(+), 23 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/arch/arm/util/cs-etm.c b/tools/perf/arch/arm/util/cs-etm.c
index c28208361d91..a49753f0d20f 100644
--- a/tools/perf/arch/arm/util/cs-etm.c
+++ b/tools/perf/arch/arm/util/cs-etm.c
@@ -302,6 +302,19 @@ static int cs_etm_set_sink_attr(struct perf_pmu *pmu,
 	return 0;
 }
 
+static struct evsel *cs_etm_get_evsel(struct evlist *evlist,
+				      struct perf_pmu *cs_etm_pmu)
+{
+	struct evsel *evsel;
+
+	evlist__for_each_entry(evlist, evsel) {
+		if (evsel->core.attr.type == cs_etm_pmu->type)
+			return evsel;
+	}
+
+	return NULL;
+}
+
 static int cs_etm_recording_options(struct auxtrace_record *itr,
 				    struct evlist *evlist,
 				    struct record_opts *opts)
@@ -473,29 +486,21 @@ out:
 
 static u64 cs_etm_get_config(struct auxtrace_record *itr)
 {
-	u64 config = 0;
 	struct cs_etm_recording *ptr =
 			container_of(itr, struct cs_etm_recording, itr);
 	struct perf_pmu *cs_etm_pmu = ptr->cs_etm_pmu;
 	struct evlist *evlist = ptr->evlist;
-	struct evsel *evsel;
+	struct evsel *evsel = cs_etm_get_evsel(evlist, cs_etm_pmu);
 
-	evlist__for_each_entry(evlist, evsel) {
-		if (evsel->core.attr.type == cs_etm_pmu->type) {
-			/*
-			 * Variable perf_event_attr::config is assigned to
-			 * ETMv3/PTM.  The bit fields have been made to match
-			 * the ETMv3.5 ETRMCR register specification.  See the
-			 * PMU_FORMAT_ATTR() declarations in
-			 * drivers/hwtracing/coresight/coresight-perf.c for
-			 * details.
-			 */
-			config = evsel->core.attr.config;
-			break;
-		}
-	}
-
-	return config;
+	/*
+	 * Variable perf_event_attr::config is assigned to
+	 * ETMv3/PTM.  The bit fields have been made to match
+	 * the ETMv3.5 ETRMCR register specification.  See the
+	 * PMU_FORMAT_ATTR() declarations in
+	 * drivers/hwtracing/coresight/coresight-perf.c for
+	 * details.
+	 */
+	return evsel ? evsel->core.attr.config : 0;
 }
 
 #ifndef BIT
@@ -829,12 +834,11 @@ static int cs_etm_snapshot_start(struct auxtrace_record *itr)
 {
 	struct cs_etm_recording *ptr =
 			container_of(itr, struct cs_etm_recording, itr);
-	struct evsel *evsel;
+	struct evsel *evsel = cs_etm_get_evsel(ptr->evlist, ptr->cs_etm_pmu);
+
+	if (evsel)
+		return evsel__disable(evsel);
 
-	evlist__for_each_entry(ptr->evlist, evsel) {
-		if (evsel->core.attr.type == ptr->cs_etm_pmu->type)
-			return evsel__disable(evsel);
-	}
 	return -EINVAL;
 }
 
-- 
cgit v1.2.3


From 4ffd443f5d1fc85740ac60f9ccd0200fab42f95e Mon Sep 17 00:00:00 2001
From: James Clark <james.clark@linaro.org>
Date: Wed, 14 Jan 2026 15:57:23 +0000
Subject: perf cs-etm: Don't use hard coded config bits when setting up ETMCR

Perf only looks at attr.config when determining what was programmed into
ETMCR. These bits could theoretically be in any of the config fields.
Add a generic helper to find the value of any named format field in any
config field and then use it to get the attributes relevant to ETMCR.

The kernel will also stop publishing the ETMCR register bits in a header
[1] so preempt that by defining them here.

Move field_prep() to util.h so we can define it along side field_get().
Unfortunately FIELD_PREP() and FIELD_GET() from the kernel can't be used
as they require the mask to be a compile time constant.

[1]: https://lore.kernel.org/linux-arm-kernel/20251128-james-cs-syncfreq-v8-10-4d319764cc58@linaro.org/

Reviewed-by: Ian Rogers <irogers@google.com>
Signed-off-by: James Clark <james.clark@linaro.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Leo Yan <leo.yan@linux.dev>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mike Leach <mike.leach@linaro.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Suzuki Poulouse <suzuki.poulose@arm.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/arch/arm/util/cs-etm.c | 35 ++++++++++++++++++++++++++++++++++-
 1 file changed, 34 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/perf/arch/arm/util/cs-etm.c b/tools/perf/arch/arm/util/cs-etm.c
index a49753f0d20f..f535027ce862 100644
--- a/tools/perf/arch/arm/util/cs-etm.c
+++ b/tools/perf/arch/arm/util/cs-etm.c
@@ -68,6 +68,12 @@ static const char * const metadata_ete_ro[] = {
 
 enum cs_etm_version { CS_NOT_PRESENT, CS_ETMV3, CS_ETMV4, CS_ETE };
 
+
+/* ETMv3 ETMCR register bits */
+#define ETMCR_CYC_ACC		BIT(12)
+#define ETMCR_TIMESTAMP_EN	BIT(28)
+#define ETMCR_RETURN_STACK	BIT(29)
+
 static bool cs_etm_is_ete(struct perf_pmu *cs_etm_pmu, struct perf_cpu cpu);
 static int cs_etm_get_ro(struct perf_pmu *pmu, struct perf_cpu cpu, const char *path, __u64 *val);
 static bool cs_etm_pmu_path_exists(struct perf_pmu *pmu, struct perf_cpu cpu, const char *path);
@@ -484,6 +490,33 @@ out:
 	return err;
 }
 
+static u64 cs_etm_synth_etmcr(struct auxtrace_record *itr)
+{
+	struct cs_etm_recording *ptr =
+		container_of(itr, struct cs_etm_recording, itr);
+	struct perf_pmu *cs_etm_pmu = ptr->cs_etm_pmu;
+	struct evsel *evsel = cs_etm_get_evsel(ptr->evlist, cs_etm_pmu);
+	u64 etmcr = 0;
+	u64 val;
+
+	if (!evsel)
+		return 0;
+
+	/*
+	 * Synthesize what the kernel programmed into ETMCR based on
+	 * what options the event was opened with. This doesn't have to be
+	 * complete or 100% accurate, not all bits used by OpenCSD anyway.
+	 */
+	if (!evsel__get_config_val(evsel, "cycacc", &val) && val)
+		etmcr |= ETMCR_CYC_ACC;
+	if (!evsel__get_config_val(evsel, "timestamp", &val) && val)
+		etmcr |= ETMCR_TIMESTAMP_EN;
+	if (!evsel__get_config_val(evsel, "retstack", &val) && val)
+		etmcr |= ETMCR_RETURN_STACK;
+
+	return etmcr;
+}
+
 static u64 cs_etm_get_config(struct auxtrace_record *itr)
 {
 	struct cs_etm_recording *ptr =
@@ -743,7 +776,7 @@ static void cs_etm_get_metadata(struct perf_cpu cpu, u32 *offset,
 	case CS_ETMV3:
 		magic = __perf_cs_etmv3_magic;
 		/* Get configuration register */
-		info->priv[*offset + CS_ETM_ETMCR] = cs_etm_get_config(itr);
+		info->priv[*offset + CS_ETM_ETMCR] = cs_etm_synth_etmcr(itr);
 		/* traceID set to legacy value in case new perf running on old system */
 		info->priv[*offset + CS_ETM_ETMTRACEIDR] = cs_etm_get_legacy_trace_id(cpu);
 		/* Get read-only information from sysFS */
-- 
cgit v1.2.3


From 3f620f26576526ccc9e5cb1164bd1cf33a7c70bd Mon Sep 17 00:00:00 2001
From: James Clark <james.clark@linaro.org>
Date: Wed, 14 Jan 2026 15:57:24 +0000
Subject: perf cs-etm: Don't use hard coded config bits when setting up
 TRCCONFIGR

Perf only looks at attr.config when determining what was programmed into
TRCCONFIGR. These bits could theoretically be in any of the config
fields. Use the evsel__get_config_val() helper so it's agnostic to
which config field they are in.

The kernel will also stop publishing the TRCCONFIGR register bits in a
header [1] so preempt that by defining them here.

[1]: https://lore.kernel.org/linux-arm-kernel/20251128-james-cs-syncfreq-v8-10-4d319764cc58@linaro.org/

Reviewed-by: Ian Rogers <irogers@google.com>
Reviewed-by: Leo Yan <leo.yan@arm.com>
Signed-off-by: James Clark <james.clark@linaro.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Leo Yan <leo.yan@linux.dev>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mike Leach <mike.leach@linaro.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Suzuki Poulouse <suzuki.poulose@arm.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/arch/arm/util/cs-etm.c | 79 +++++++++++++++++----------------------
 1 file changed, 34 insertions(+), 45 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/arch/arm/util/cs-etm.c b/tools/perf/arch/arm/util/cs-etm.c
index f535027ce862..12b28562c2f3 100644
--- a/tools/perf/arch/arm/util/cs-etm.c
+++ b/tools/perf/arch/arm/util/cs-etm.c
@@ -68,6 +68,14 @@ static const char * const metadata_ete_ro[] = {
 
 enum cs_etm_version { CS_NOT_PRESENT, CS_ETMV3, CS_ETMV4, CS_ETE };
 
+/* ETMv4 CONFIGR register bits */
+#define TRCCONFIGR_BB		BIT(3)
+#define TRCCONFIGR_CCI		BIT(4)
+#define TRCCONFIGR_CID		BIT(6)
+#define TRCCONFIGR_VMID		BIT(7)
+#define TRCCONFIGR_TS		BIT(11)
+#define TRCCONFIGR_RS		BIT(12)
+#define TRCCONFIGR_VMIDOPT	BIT(15)
 
 /* ETMv3 ETMCR register bits */
 #define ETMCR_CYC_ACC		BIT(12)
@@ -517,56 +525,37 @@ static u64 cs_etm_synth_etmcr(struct auxtrace_record *itr)
 	return etmcr;
 }
 
-static u64 cs_etm_get_config(struct auxtrace_record *itr)
+static u64 cs_etmv4_synth_trcconfigr(struct auxtrace_record *itr)
 {
+	u64 trcconfigr = 0;
 	struct cs_etm_recording *ptr =
-			container_of(itr, struct cs_etm_recording, itr);
+		container_of(itr, struct cs_etm_recording, itr);
 	struct perf_pmu *cs_etm_pmu = ptr->cs_etm_pmu;
-	struct evlist *evlist = ptr->evlist;
-	struct evsel *evsel = cs_etm_get_evsel(evlist, cs_etm_pmu);
-
-	/*
-	 * Variable perf_event_attr::config is assigned to
-	 * ETMv3/PTM.  The bit fields have been made to match
-	 * the ETMv3.5 ETRMCR register specification.  See the
-	 * PMU_FORMAT_ATTR() declarations in
-	 * drivers/hwtracing/coresight/coresight-perf.c for
-	 * details.
-	 */
-	return evsel ? evsel->core.attr.config : 0;
-}
-
-#ifndef BIT
-#define BIT(N) (1UL << (N))
-#endif
+	struct evsel *evsel = cs_etm_get_evsel(ptr->evlist, cs_etm_pmu);
+	u64 val;
 
-static u64 cs_etmv4_get_config(struct auxtrace_record *itr)
-{
-	u64 config = 0;
-	u64 config_opts = 0;
+	if (!evsel)
+		return 0;
 
 	/*
-	 * The perf event variable config bits represent both
-	 * the command line options and register programming
-	 * bits in ETMv3/PTM. For ETMv4 we must remap options
-	 * to real bits
+	 * Synthesize what the kernel programmed into TRCCONFIGR based on
+	 * what options the event was opened with. This doesn't have to be
+	 * complete or 100% accurate, not all bits used by OpenCSD anyway.
 	 */
-	config_opts = cs_etm_get_config(itr);
-	if (config_opts & BIT(ETM_OPT_CYCACC))
-		config |= BIT(ETM4_CFG_BIT_CYCACC);
-	if (config_opts & BIT(ETM_OPT_CTXTID))
-		config |= BIT(ETM4_CFG_BIT_CTXTID);
-	if (config_opts & BIT(ETM_OPT_TS))
-		config |= BIT(ETM4_CFG_BIT_TS);
-	if (config_opts & BIT(ETM_OPT_RETSTK))
-		config |= BIT(ETM4_CFG_BIT_RETSTK);
-	if (config_opts & BIT(ETM_OPT_CTXTID2))
-		config |= BIT(ETM4_CFG_BIT_VMID) |
-			  BIT(ETM4_CFG_BIT_VMID_OPT);
-	if (config_opts & BIT(ETM_OPT_BRANCH_BROADCAST))
-		config |= BIT(ETM4_CFG_BIT_BB);
-
-	return config;
+	if (!evsel__get_config_val(evsel, "cycacc", &val) && val)
+		trcconfigr |= TRCCONFIGR_CCI;
+	if (!evsel__get_config_val(evsel, "contextid1", &val) && val)
+		trcconfigr |= TRCCONFIGR_CID;
+	if (!evsel__get_config_val(evsel, "timestamp", &val) && val)
+		trcconfigr |= TRCCONFIGR_TS;
+	if (!evsel__get_config_val(evsel, "retstack", &val) && val)
+		trcconfigr |= TRCCONFIGR_RS;
+	if (!evsel__get_config_val(evsel, "contextid2", &val) && val)
+		trcconfigr |= TRCCONFIGR_VMID | TRCCONFIGR_VMIDOPT;
+	if (!evsel__get_config_val(evsel, "branch_broadcast", &val) && val)
+		trcconfigr |= TRCCONFIGR_BB;
+
+	return trcconfigr;
 }
 
 static size_t
@@ -688,7 +677,7 @@ static void cs_etm_save_etmv4_header(__u64 data[], struct auxtrace_record *itr,
 	struct perf_pmu *cs_etm_pmu = ptr->cs_etm_pmu;
 
 	/* Get trace configuration register */
-	data[CS_ETMV4_TRCCONFIGR] = cs_etmv4_get_config(itr);
+	data[CS_ETMV4_TRCCONFIGR] = cs_etmv4_synth_trcconfigr(itr);
 	/* traceID set to legacy version, in case new perf running on older system */
 	data[CS_ETMV4_TRCTRACEIDR] = cs_etm_get_legacy_trace_id(cpu);
 
@@ -720,7 +709,7 @@ static void cs_etm_save_ete_header(__u64 data[], struct auxtrace_record *itr, st
 	struct perf_pmu *cs_etm_pmu = ptr->cs_etm_pmu;
 
 	/* Get trace configuration register */
-	data[CS_ETE_TRCCONFIGR] = cs_etmv4_get_config(itr);
+	data[CS_ETE_TRCCONFIGR] = cs_etmv4_synth_trcconfigr(itr);
 	/* traceID set to legacy version, in case new perf running on older system */
 	data[CS_ETE_TRCTRACEIDR] = cs_etm_get_legacy_trace_id(cpu);
 
-- 
cgit v1.2.3


From 5e63706f1bc1446e40a8643d05a9842ebad4ec34 Mon Sep 17 00:00:00 2001
From: James Clark <james.clark@linaro.org>
Date: Wed, 14 Jan 2026 15:57:25 +0000
Subject: perf cs-etm: Don't hard code config attribute when configuring the
 event

These instances of hard coded config attributes are used for configuring
and validating the event options. Use the config attribute that's
published by the driver by replacing the open coded operations with
evsel__get_config_val() and evsel__set_config_if_unset().

Reviewed-by: Ian Rogers <irogers@google.com>
Signed-off-by: James Clark <james.clark@linaro.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Leo Yan <leo.yan@linux.dev>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mike Leach <mike.leach@linaro.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Suzuki Poulouse <suzuki.poulose@arm.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/arch/arm/util/cs-etm.c | 56 +++++++++++++++++++--------------------
 1 file changed, 27 insertions(+), 29 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/arch/arm/util/cs-etm.c b/tools/perf/arch/arm/util/cs-etm.c
index 12b28562c2f3..dc3f4e86b075 100644
--- a/tools/perf/arch/arm/util/cs-etm.c
+++ b/tools/perf/arch/arm/util/cs-etm.c
@@ -103,13 +103,14 @@ static int cs_etm_validate_context_id(struct perf_pmu *cs_etm_pmu, struct evsel
 				      struct perf_cpu cpu)
 {
 	int err;
-	__u64 val;
-	u64 contextid = evsel->core.attr.config &
-		(perf_pmu__format_bits(cs_etm_pmu, "contextid") |
-		 perf_pmu__format_bits(cs_etm_pmu, "contextid1") |
-		 perf_pmu__format_bits(cs_etm_pmu, "contextid2"));
+	u64 ctxt, ctxt1, ctxt2;
+	__u64 trcidr2;
 
-	if (!contextid)
+	evsel__get_config_val(evsel, "contextid", &ctxt);
+	evsel__get_config_val(evsel, "contextid1", &ctxt1);
+	evsel__get_config_val(evsel, "contextid2", &ctxt2);
+
+	if (!ctxt && !ctxt1 && !ctxt2)
 		return 0;
 
 	/* Not supported in etmv3 */
@@ -120,12 +121,11 @@ static int cs_etm_validate_context_id(struct perf_pmu *cs_etm_pmu, struct evsel
 	}
 
 	/* Get a handle on TRCIDR2 */
-	err = cs_etm_get_ro(cs_etm_pmu, cpu, metadata_etmv4_ro[CS_ETMV4_TRCIDR2], &val);
+	err = cs_etm_get_ro(cs_etm_pmu, cpu, metadata_etmv4_ro[CS_ETMV4_TRCIDR2], &trcidr2);
 	if (err)
 		return err;
 
-	if (contextid &
-	    perf_pmu__format_bits(cs_etm_pmu, "contextid1")) {
+	if (ctxt1) {
 		/*
 		 * TRCIDR2.CIDSIZE, bit [9-5], indicates whether contextID
 		 * tracing is supported:
@@ -133,15 +133,14 @@ static int cs_etm_validate_context_id(struct perf_pmu *cs_etm_pmu, struct evsel
 		 *  0b00100 Maximum of 32-bit Context ID size.
 		 *  All other values are reserved.
 		 */
-		if (BMVAL(val, 5, 9) != 0x4) {
+		if (BMVAL(trcidr2, 5, 9) != 0x4) {
 			pr_err("%s: CONTEXTIDR_EL1 isn't supported, disable with %s/contextid1=0/\n",
 			       CORESIGHT_ETM_PMU_NAME, CORESIGHT_ETM_PMU_NAME);
 			return -EINVAL;
 		}
 	}
 
-	if (contextid &
-	    perf_pmu__format_bits(cs_etm_pmu, "contextid2")) {
+	if (ctxt2) {
 		/*
 		 * TRCIDR2.VMIDOPT[30:29] != 0 and
 		 * TRCIDR2.VMIDSIZE[14:10] == 0b00100 (32bit virtual contextid)
@@ -149,7 +148,7 @@ static int cs_etm_validate_context_id(struct perf_pmu *cs_etm_pmu, struct evsel
 		 * virtual context id is < 32bit.
 		 * Any value of VMIDSIZE >= 4 (i.e, > 32bit) is fine for us.
 		 */
-		if (!BMVAL(val, 29, 30) || BMVAL(val, 10, 14) < 4) {
+		if (!BMVAL(trcidr2, 29, 30) || BMVAL(trcidr2, 10, 14) < 4) {
 			pr_err("%s: CONTEXTIDR_EL2 isn't supported, disable with %s/contextid2=0/\n",
 			       CORESIGHT_ETM_PMU_NAME, CORESIGHT_ETM_PMU_NAME);
 			return -EINVAL;
@@ -163,10 +162,11 @@ static int cs_etm_validate_timestamp(struct perf_pmu *cs_etm_pmu, struct evsel *
 				     struct perf_cpu cpu)
 {
 	int err;
-	__u64 val;
+	u64 val;
+	__u64 trcidr0;
 
-	if (!(evsel->core.attr.config &
-	      perf_pmu__format_bits(cs_etm_pmu, "timestamp")))
+	evsel__get_config_val(evsel, "timestamp", &val);
+	if (!val)
 		return 0;
 
 	if (cs_etm_get_version(cs_etm_pmu, cpu) == CS_ETMV3) {
@@ -176,7 +176,7 @@ static int cs_etm_validate_timestamp(struct perf_pmu *cs_etm_pmu, struct evsel *
 	}
 
 	/* Get a handle on TRCIRD0 */
-	err = cs_etm_get_ro(cs_etm_pmu, cpu, metadata_etmv4_ro[CS_ETMV4_TRCIDR0], &val);
+	err = cs_etm_get_ro(cs_etm_pmu, cpu, metadata_etmv4_ro[CS_ETMV4_TRCIDR0], &trcidr0);
 	if (err)
 		return err;
 
@@ -187,10 +187,9 @@ static int cs_etm_validate_timestamp(struct perf_pmu *cs_etm_pmu, struct evsel *
 	 *  0b00110 Implementation supports a maximum timestamp of 48bits.
 	 *  0b01000 Implementation supports a maximum timestamp of 64bits.
 	 */
-	val &= GENMASK(28, 24);
-	if (!val) {
+	trcidr0 &= GENMASK(28, 24);
+	if (!trcidr0)
 		return -EINVAL;
-	}
 
 	return 0;
 }
@@ -273,16 +272,19 @@ static int cs_etm_parse_snapshot_options(struct auxtrace_record *itr,
 	return 0;
 }
 
+/*
+ * If the sink name format "@sink_name" is used, lookup the sink by name to convert to
+ * "sinkid=sink_hash" format. If the user has already manually provided a hash then
+ * "sinkid" isn't overwritten. If neither are provided then the driver will pick the best
+ * sink.
+ */
 static int cs_etm_set_sink_attr(struct perf_pmu *pmu,
 				struct evsel *evsel)
 {
 	char msg[BUFSIZ], path[PATH_MAX], *sink;
 	struct evsel_config_term *term;
-	int ret = -EINVAL;
 	u32 hash;
-
-	if (evsel->core.attr.config2 & GENMASK(31, 0))
-		return 0;
+	int ret;
 
 	list_for_each_entry(term, &evsel->config_terms, list) {
 		if (term->type != EVSEL__CONFIG_TERM_DRV_CFG)
@@ -305,14 +307,10 @@ static int cs_etm_set_sink_attr(struct perf_pmu *pmu,
 			return ret;
 		}
 
-		evsel->core.attr.config2 |= hash;
+		evsel__set_config_if_unset(evsel, "sinkid", hash);
 		return 0;
 	}
 
-	/*
-	 * No sink was provided on the command line - allow the CoreSight
-	 * system to look for a default
-	 */
 	return 0;
 }
 
-- 
cgit v1.2.3


From 571d29baa07e83e637075239f379f91353c24ec9 Mon Sep 17 00:00:00 2001
From: James Clark <james.clark@linaro.org>
Date: Wed, 14 Jan 2026 15:57:26 +0000
Subject: perf arm-spe: Don't hard code config attribute

Use the config attribute that's published by the driver instead of
hard coding "attr.config".

Reviewed-by: Ian Rogers <irogers@google.com>
Signed-off-by: James Clark <james.clark@linaro.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Leo Yan <leo.yan@linux.dev>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mike Leach <mike.leach@linaro.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Suzuki Poulouse <suzuki.poulose@arm.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/arch/arm64/util/arm-spe.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/arch/arm64/util/arm-spe.c b/tools/perf/arch/arm64/util/arm-spe.c
index 51014f8bff97..17ced7bbbdda 100644
--- a/tools/perf/arch/arm64/util/arm-spe.c
+++ b/tools/perf/arch/arm64/util/arm-spe.c
@@ -256,7 +256,7 @@ static __u64 arm_spe_pmu__sample_period(const struct perf_pmu *arm_spe_pmu)
 
 static void arm_spe_setup_evsel(struct evsel *evsel, struct perf_cpu_map *cpus)
 {
-	u64 bit;
+	u64 pa_enable_bit;
 
 	evsel->core.attr.freq = 0;
 	evsel->core.attr.sample_period = arm_spe_pmu__sample_period(evsel->pmu);
@@ -288,9 +288,10 @@ static void arm_spe_setup_evsel(struct evsel *evsel, struct perf_cpu_map *cpus)
 	 * inform that the resulting output's SPE samples contain physical addresses
 	 * where applicable.
 	 */
-	bit = perf_pmu__format_bits(evsel->pmu, "pa_enable");
-	if (evsel->core.attr.config & bit)
-		evsel__set_sample_bit(evsel, PHYS_ADDR);
+
+	if (!evsel__get_config_val(evsel, "pa_enable", &pa_enable_bit))
+		if (pa_enable_bit)
+			evsel__set_sample_bit(evsel, PHYS_ADDR);
 }
 
 static int arm_spe_setup_aux_buffer(struct record_opts *opts)
@@ -397,6 +398,7 @@ static int arm_spe_recording_options(struct auxtrace_record *itr,
 	struct perf_cpu_map *cpus = evlist->core.user_requested_cpus;
 	bool discard = false;
 	int err;
+	u64 discard_bit;
 
 	sper->evlist = evlist;
 
@@ -425,9 +427,8 @@ static int arm_spe_recording_options(struct auxtrace_record *itr,
 	evlist__for_each_entry_safe(evlist, tmp, evsel) {
 		if (evsel__is_aux_event(evsel)) {
 			arm_spe_setup_evsel(evsel, cpus);
-			if (evsel->core.attr.config &
-			    perf_pmu__format_bits(evsel->pmu, "discard"))
-				discard = true;
+			if (!evsel__get_config_val(evsel, "discard", &discard_bit))
+				discard = !!discard_bit;
 		}
 	}
 
-- 
cgit v1.2.3


From 7c8e817e443c118aa303f1bbcec33df8d9e3487a Mon Sep 17 00:00:00 2001
From: Anton Protopopov <a.s.protopopov@gmail.com>
Date: Wed, 14 Jan 2026 16:25:44 +0000
Subject: selftests/bpf: Extend live regs tests with a test for gotox

Add a test which checks that the destination register of a gotox
instruction is marked as used and that the union of jump targets
is considered as live.

Signed-off-by: Anton Protopopov <a.s.protopopov@gmail.com>
Link: https://lore.kernel.org/r/20260114162544.83253-3-a.s.protopopov@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 .../selftests/bpf/progs/compute_live_registers.c   | 41 ++++++++++++++++++++++
 1 file changed, 41 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/progs/compute_live_registers.c b/tools/testing/selftests/bpf/progs/compute_live_registers.c
index 6884ab99a421..f05e120f3450 100644
--- a/tools/testing/selftests/bpf/progs/compute_live_registers.c
+++ b/tools/testing/selftests/bpf/progs/compute_live_registers.c
@@ -431,6 +431,47 @@ __naked void subprog1(void)
 		::: __clobber_all);
 }
 
+#if defined(__TARGET_ARCH_x86) || defined(__TARGET_ARCH_arm64)
+
+SEC("socket")
+__log_level(2)
+__msg("2: .1........ (07) r1 += 8")
+__msg("3: .1........ (79) r2 = *(u64 *)(r1 +0)")
+__msg("4: ..2....... (b7) r3 = 1")
+__msg("5: ..23...... (b7) r4 = 2")
+__msg("6: ..234..... (0d) gotox r2")
+__msg("7: ...3...... (bf) r0 = r3")
+__msg("8: 0......... (95) exit")
+__msg("9: ....4..... (bf) r0 = r4")
+__msg("10: 0......... (95) exit")
+__naked
+void gotox(void)
+{
+	asm volatile (
+	".pushsection .jumptables,\"\",@progbits;"
+"jt0_%=: .quad l0_%= - socket;"
+	".quad l1_%= - socket;"
+	".size jt0_%=, 16;"
+	".global jt0_%=;"
+	".popsection;"
+
+	"r1 = jt0_%= ll;"
+	"r1 += 8;"
+	"r2 = *(u64 *)(r1 + 0);"
+	"r3 = 1;"
+	"r4 = 2;"
+	".8byte %[gotox_r2];"
+"l0_%=:  r0 = r3;"
+	"exit;"
+"l1_%=:  r0 = r4;"
+	"exit;"
+	:
+	: __imm_insn(gotox_r2, BPF_RAW_INSN(BPF_JMP | BPF_JA | BPF_X, BPF_REG_2, BPF_REG_0, 0, 0))
+	: __clobber_all);
+}
+
+#endif /* __TARGET_ARCH_x86 || __TARGET_ARCH_arm64 */
+
 /* to retain debug info for BTF generation */
 void kfunc_root(void)
 {
-- 
cgit v1.2.3


From 0ace8f2db6b3b4b0677e559d1a7ab7fd625d61ec Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Date: Mon, 5 Jan 2026 20:11:48 +0000
Subject: tools/testing/selftests: add tests for !tgt, src mremap() merges

Test that mremap()'ing a VMA into a position such that the target VMA on
merge is unfaulted and the source faulted is correctly performed.

We cover 4 cases:

    1. Previous VMA unfaulted:

                  copied -----|
                              v
            |-----------|.............|
            | unfaulted |(faulted VMA)|
            |-----------|.............|
                 prev

    target = prev, expand prev to cover.

    2. Next VMA unfaulted:

                  copied -----|
                              v
                        |.............|-----------|
                        |(faulted VMA)| unfaulted |
                        |.............|-----------|
                                          next

    target = next, expand next to cover.

    3. Both adjacent VMAs unfaulted:

                  copied -----|
                              v
            |-----------|.............|-----------|
            | unfaulted |(faulted VMA)| unfaulted |
            |-----------|.............|-----------|
                 prev                      next

    target = prev, expand prev to cover.

    4. prev unfaulted, next faulted:

                  copied -----|
                              v
            |-----------|.............|-----------|
            | unfaulted |(faulted VMA)|  faulted  |
            |-----------|.............|-----------|
                 prev                      next

    target = prev, expand prev to cover. Essentially equivalent to 3, but
    with additional requirement that next's anon_vma is the same as the
    copied VMA's.

Each of these are performed with MREMAP_DONTUNMAP set, which will cause a
KASAN assert for UAF or an assert on zero refcount anon_vma if a bug
exists with correctly propagating anon_vma state in each scenario.

Link: https://lkml.kernel.org/r/f903af2930c7c2c6e0948c886b58d0f42d8e8ba3.1767638272.git.lorenzo.stoakes@oracle.com
Fixes: 879bca0a2c4f ("mm/vma: fix incorrectly disallowed anonymous VMA merges")
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: David Hildenbrand (Red Hat) <david@kernel.org>
Cc: Jann Horn <jannh@google.com>
Cc: Jeongjun Park <aha310510@gmail.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Pedro Falcato <pfalcato@suse.de>
Cc: Rik van Riel <riel@surriel.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Yeoreum Yun <yeoreum.yun@arm.com>
Cc: Harry Yoo <harry.yoo@oracle.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/testing/selftests/mm/merge.c | 232 +++++++++++++++++++++++++++++++++++++
 1 file changed, 232 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/mm/merge.c b/tools/testing/selftests/mm/merge.c
index 363c1033cc7d..22be149f7109 100644
--- a/tools/testing/selftests/mm/merge.c
+++ b/tools/testing/selftests/mm/merge.c
@@ -1171,4 +1171,236 @@ TEST_F(merge, mremap_correct_placed_faulted)
 	ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr + 15 * page_size);
 }
 
+TEST_F(merge, mremap_faulted_to_unfaulted_prev)
+{
+	struct procmap_fd *procmap = &self->procmap;
+	unsigned int page_size = self->page_size;
+	char *ptr_a, *ptr_b;
+
+	/*
+	 * mremap() such that A and B merge:
+	 *
+	 *                             |------------|
+	 *                             |    \       |
+	 *           |-----------|     |    /  |---------|
+	 *           | unfaulted |     v    \  | faulted |
+	 *           |-----------|          /  |---------|
+	 *                 B                \       A
+	 */
+
+	/* Map VMA A into place. */
+	ptr_a = mmap(&self->carveout[page_size + 3 * page_size],
+		     3 * page_size,
+		     PROT_READ | PROT_WRITE,
+		     MAP_PRIVATE | MAP_ANON | MAP_FIXED, -1, 0);
+	ASSERT_NE(ptr_a, MAP_FAILED);
+	/* Fault it in. */
+	ptr_a[0] = 'x';
+
+	/*
+	 * Now move it out of the way so we can place VMA B in position,
+	 * unfaulted.
+	 */
+	ptr_a = mremap(ptr_a, 3 * page_size, 3 * page_size,
+		       MREMAP_FIXED | MREMAP_MAYMOVE, &self->carveout[20 * page_size]);
+	ASSERT_NE(ptr_a, MAP_FAILED);
+
+	/* Map VMA B into place. */
+	ptr_b = mmap(&self->carveout[page_size], 3 * page_size,
+		     PROT_READ | PROT_WRITE,
+		     MAP_PRIVATE | MAP_ANON | MAP_FIXED, -1, 0);
+	ASSERT_NE(ptr_b, MAP_FAILED);
+
+	/*
+	 * Now move VMA A into position with MREMAP_DONTUNMAP to catch incorrect
+	 * anon_vma propagation.
+	 */
+	ptr_a = mremap(ptr_a, 3 * page_size, 3 * page_size,
+		       MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_DONTUNMAP,
+		       &self->carveout[page_size + 3 * page_size]);
+	ASSERT_NE(ptr_a, MAP_FAILED);
+
+	/* The VMAs should have merged. */
+	ASSERT_TRUE(find_vma_procmap(procmap, ptr_b));
+	ASSERT_EQ(procmap->query.vma_start, (unsigned long)ptr_b);
+	ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr_b + 6 * page_size);
+}
+
+TEST_F(merge, mremap_faulted_to_unfaulted_next)
+{
+	struct procmap_fd *procmap = &self->procmap;
+	unsigned int page_size = self->page_size;
+	char *ptr_a, *ptr_b;
+
+	/*
+	 * mremap() such that A and B merge:
+	 *
+	 *      |---------------------------|
+	 *      |                   \       |
+	 *      |    |-----------|  /  |---------|
+	 *      v    | unfaulted |  \  | faulted |
+	 *           |-----------|  /  |---------|
+	 *                 B        \       A
+	 *
+	 * Then unmap VMA A to trigger the bug.
+	 */
+
+	/* Map VMA A into place. */
+	ptr_a = mmap(&self->carveout[page_size], 3 * page_size,
+		     PROT_READ | PROT_WRITE,
+		     MAP_PRIVATE | MAP_ANON | MAP_FIXED, -1, 0);
+	ASSERT_NE(ptr_a, MAP_FAILED);
+	/* Fault it in. */
+	ptr_a[0] = 'x';
+
+	/*
+	 * Now move it out of the way so we can place VMA B in position,
+	 * unfaulted.
+	 */
+	ptr_a = mremap(ptr_a, 3 * page_size, 3 * page_size,
+		       MREMAP_FIXED | MREMAP_MAYMOVE, &self->carveout[20 * page_size]);
+	ASSERT_NE(ptr_a, MAP_FAILED);
+
+	/* Map VMA B into place. */
+	ptr_b = mmap(&self->carveout[page_size + 3 * page_size], 3 * page_size,
+		     PROT_READ | PROT_WRITE,
+		     MAP_PRIVATE | MAP_ANON | MAP_FIXED, -1, 0);
+	ASSERT_NE(ptr_b, MAP_FAILED);
+
+	/*
+	 * Now move VMA A into position with MREMAP_DONTUNMAP to catch incorrect
+	 * anon_vma propagation.
+	 */
+	ptr_a = mremap(ptr_a, 3 * page_size, 3 * page_size,
+		       MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_DONTUNMAP,
+		       &self->carveout[page_size]);
+	ASSERT_NE(ptr_a, MAP_FAILED);
+
+	/* The VMAs should have merged. */
+	ASSERT_TRUE(find_vma_procmap(procmap, ptr_a));
+	ASSERT_EQ(procmap->query.vma_start, (unsigned long)ptr_a);
+	ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr_a + 6 * page_size);
+}
+
+TEST_F(merge, mremap_faulted_to_unfaulted_prev_unfaulted_next)
+{
+	struct procmap_fd *procmap = &self->procmap;
+	unsigned int page_size = self->page_size;
+	char *ptr_a, *ptr_b, *ptr_c;
+
+	/*
+	 * mremap() with MREMAP_DONTUNMAP such that A, B and C merge:
+	 *
+	 *                  |---------------------------|
+	 *                  |                   \       |
+	 * |-----------|    |    |-----------|  /  |---------|
+	 * | unfaulted |    v    | unfaulted |  \  | faulted |
+	 * |-----------|         |-----------|  /  |---------|
+	 *       A                     C        \        B
+	 */
+
+	/* Map VMA B into place. */
+	ptr_b = mmap(&self->carveout[page_size + 3 * page_size], 3 * page_size,
+		     PROT_READ | PROT_WRITE,
+		     MAP_PRIVATE | MAP_ANON | MAP_FIXED, -1, 0);
+	ASSERT_NE(ptr_b, MAP_FAILED);
+	/* Fault it in. */
+	ptr_b[0] = 'x';
+
+	/*
+	 * Now move it out of the way so we can place VMAs A, C in position,
+	 * unfaulted.
+	 */
+	ptr_b = mremap(ptr_b, 3 * page_size, 3 * page_size,
+		       MREMAP_FIXED | MREMAP_MAYMOVE, &self->carveout[20 * page_size]);
+	ASSERT_NE(ptr_b, MAP_FAILED);
+
+	/* Map VMA A into place. */
+
+	ptr_a = mmap(&self->carveout[page_size], 3 * page_size,
+		     PROT_READ | PROT_WRITE,
+		     MAP_PRIVATE | MAP_ANON | MAP_FIXED, -1, 0);
+	ASSERT_NE(ptr_a, MAP_FAILED);
+
+	/* Map VMA C into place. */
+	ptr_c = mmap(&self->carveout[page_size + 3 * page_size + 3 * page_size],
+		     3 * page_size, PROT_READ | PROT_WRITE,
+		     MAP_PRIVATE | MAP_ANON | MAP_FIXED, -1, 0);
+	ASSERT_NE(ptr_c, MAP_FAILED);
+
+	/*
+	 * Now move VMA B into position with MREMAP_DONTUNMAP to catch incorrect
+	 * anon_vma propagation.
+	 */
+	ptr_b = mremap(ptr_b, 3 * page_size, 3 * page_size,
+		       MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_DONTUNMAP,
+		       &self->carveout[page_size + 3 * page_size]);
+	ASSERT_NE(ptr_b, MAP_FAILED);
+
+	/* The VMAs should have merged. */
+	ASSERT_TRUE(find_vma_procmap(procmap, ptr_a));
+	ASSERT_EQ(procmap->query.vma_start, (unsigned long)ptr_a);
+	ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr_a + 9 * page_size);
+}
+
+TEST_F(merge, mremap_faulted_to_unfaulted_prev_faulted_next)
+{
+	struct procmap_fd *procmap = &self->procmap;
+	unsigned int page_size = self->page_size;
+	char *ptr_a, *ptr_b, *ptr_bc;
+
+	/*
+	 * mremap() with MREMAP_DONTUNMAP such that A, B and C merge:
+	 *
+	 *                  |---------------------------|
+	 *                  |                   \       |
+	 * |-----------|    |    |-----------|  /  |---------|
+	 * | unfaulted |    v    |  faulted  |  \  | faulted |
+	 * |-----------|         |-----------|  /  |---------|
+	 *       A                     C        \       B
+	 */
+
+	/*
+	 * Map VMA B and C into place. We have to map them together so their
+	 * anon_vma is the same and the vma->vm_pgoff's are correctly aligned.
+	 */
+	ptr_bc = mmap(&self->carveout[page_size + 3 * page_size],
+		      3 * page_size + 3 * page_size,
+		     PROT_READ | PROT_WRITE,
+		     MAP_PRIVATE | MAP_ANON | MAP_FIXED, -1, 0);
+	ASSERT_NE(ptr_bc, MAP_FAILED);
+
+	/* Fault it in. */
+	ptr_bc[0] = 'x';
+
+	/*
+	 * Now move VMA B out the way (splitting VMA BC) so we can place VMA A
+	 * in position, unfaulted, and leave the remainder of the VMA we just
+	 * moved in place, faulted, as VMA C.
+	 */
+	ptr_b = mremap(ptr_bc, 3 * page_size, 3 * page_size,
+		       MREMAP_FIXED | MREMAP_MAYMOVE, &self->carveout[20 * page_size]);
+	ASSERT_NE(ptr_b, MAP_FAILED);
+
+	/* Map VMA A into place. */
+	ptr_a = mmap(&self->carveout[page_size], 3 * page_size,
+		     PROT_READ | PROT_WRITE,
+		     MAP_PRIVATE | MAP_ANON | MAP_FIXED, -1, 0);
+	ASSERT_NE(ptr_a, MAP_FAILED);
+
+	/*
+	 * Now move VMA B into position with MREMAP_DONTUNMAP to catch incorrect
+	 * anon_vma propagation.
+	 */
+	ptr_b = mremap(ptr_b, 3 * page_size, 3 * page_size,
+		       MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_DONTUNMAP,
+		       &self->carveout[page_size + 3 * page_size]);
+	ASSERT_NE(ptr_b, MAP_FAILED);
+
+	/* The VMAs should have merged. */
+	ASSERT_TRUE(find_vma_procmap(procmap, ptr_a));
+	ASSERT_EQ(procmap->query.vma_start, (unsigned long)ptr_a);
+	ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr_a + 9 * page_size);
+}
+
 TEST_HARNESS_MAIN
-- 
cgit v1.2.3


From fb39444732f02c32a8312c168d97e33d872c14d3 Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Date: Mon, 5 Jan 2026 20:11:50 +0000
Subject: tools/testing/selftests: add forked (un)/faulted VMA merge tests

Now we correctly handle forked faulted/unfaulted merge on mremap(),
exhaustively assert that we handle this correctly.

Do this in the less duplicative way by adding a new merge_with_fork
fixture and forked/unforked variants, and abstract the forking logic as
necessary to avoid code duplication with this also.

Link: https://lkml.kernel.org/r/1daf76d89fdb9d96f38a6a0152d8f3c2e9e30ac7.1767638272.git.lorenzo.stoakes@oracle.com
Fixes: 879bca0a2c4f ("mm/vma: fix incorrectly disallowed anonymous VMA merges")
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: David Hildenbrand (Red Hat) <david@kernel.org>
Cc: Jann Horn <jannh@google.com>
Cc: Jeongjun Park <aha310510@gmail.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Pedro Falcato <pfalcato@suse.de>
Cc: Rik van Riel <riel@surriel.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Yeoreum Yun <yeoreum.yun@arm.com>
Cc: Harry Yoo <harry.yoo@oracle.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/testing/selftests/mm/merge.c | 180 ++++++++++++++++++++++++++++---------
 1 file changed, 139 insertions(+), 41 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/mm/merge.c b/tools/testing/selftests/mm/merge.c
index 22be149f7109..10b686102b79 100644
--- a/tools/testing/selftests/mm/merge.c
+++ b/tools/testing/selftests/mm/merge.c
@@ -22,12 +22,37 @@ FIXTURE(merge)
 	struct procmap_fd procmap;
 };
 
+static char *map_carveout(unsigned int page_size)
+{
+	return mmap(NULL, 30 * page_size, PROT_NONE,
+		    MAP_ANON | MAP_PRIVATE, -1, 0);
+}
+
+static pid_t do_fork(struct procmap_fd *procmap)
+{
+	pid_t pid = fork();
+
+	if (pid == -1)
+		return -1;
+	if (pid != 0) {
+		wait(NULL);
+		return pid;
+	}
+
+	/* Reopen for child. */
+	if (close_procmap(procmap))
+		return -1;
+	if (open_self_procmap(procmap))
+		return -1;
+
+	return 0;
+}
+
 FIXTURE_SETUP(merge)
 {
 	self->page_size = psize();
 	/* Carve out PROT_NONE region to map over. */
-	self->carveout = mmap(NULL, 30 * self->page_size, PROT_NONE,
-			      MAP_ANON | MAP_PRIVATE, -1, 0);
+	self->carveout = map_carveout(self->page_size);
 	ASSERT_NE(self->carveout, MAP_FAILED);
 	/* Setup PROCMAP_QUERY interface. */
 	ASSERT_EQ(open_self_procmap(&self->procmap), 0);
@@ -36,7 +61,8 @@ FIXTURE_SETUP(merge)
 FIXTURE_TEARDOWN(merge)
 {
 	ASSERT_EQ(munmap(self->carveout, 30 * self->page_size), 0);
-	ASSERT_EQ(close_procmap(&self->procmap), 0);
+	/* May fail for parent of forked process. */
+	close_procmap(&self->procmap);
 	/*
 	 * Clear unconditionally, as some tests set this. It is no issue if this
 	 * fails (KSM may be disabled for instance).
@@ -44,6 +70,44 @@ FIXTURE_TEARDOWN(merge)
 	prctl(PR_SET_MEMORY_MERGE, 0, 0, 0, 0);
 }
 
+FIXTURE(merge_with_fork)
+{
+	unsigned int page_size;
+	char *carveout;
+	struct procmap_fd procmap;
+};
+
+FIXTURE_VARIANT(merge_with_fork)
+{
+	bool forked;
+};
+
+FIXTURE_VARIANT_ADD(merge_with_fork, forked)
+{
+	.forked = true,
+};
+
+FIXTURE_VARIANT_ADD(merge_with_fork, unforked)
+{
+	.forked = false,
+};
+
+FIXTURE_SETUP(merge_with_fork)
+{
+	self->page_size = psize();
+	self->carveout = map_carveout(self->page_size);
+	ASSERT_NE(self->carveout, MAP_FAILED);
+	ASSERT_EQ(open_self_procmap(&self->procmap), 0);
+}
+
+FIXTURE_TEARDOWN(merge_with_fork)
+{
+	ASSERT_EQ(munmap(self->carveout, 30 * self->page_size), 0);
+	ASSERT_EQ(close_procmap(&self->procmap), 0);
+	/* See above. */
+	prctl(PR_SET_MEMORY_MERGE, 0, 0, 0, 0);
+}
+
 TEST_F(merge, mprotect_unfaulted_left)
 {
 	unsigned int page_size = self->page_size;
@@ -322,8 +386,8 @@ TEST_F(merge, forked_target_vma)
 	unsigned int page_size = self->page_size;
 	char *carveout = self->carveout;
 	struct procmap_fd *procmap = &self->procmap;
-	pid_t pid;
 	char *ptr, *ptr2;
+	pid_t pid;
 	int i;
 
 	/*
@@ -344,19 +408,10 @@ TEST_F(merge, forked_target_vma)
 	 */
 	ptr[0] = 'x';
 
-	pid = fork();
+	pid = do_fork(&self->procmap);
 	ASSERT_NE(pid, -1);
-
-	if (pid != 0) {
-		wait(NULL);
+	if (pid != 0)
 		return;
-	}
-
-	/* Child process below: */
-
-	/* Reopen for child. */
-	ASSERT_EQ(close_procmap(&self->procmap), 0);
-	ASSERT_EQ(open_self_procmap(&self->procmap), 0);
 
 	/* unCOWing everything does not cause the AVC to go away. */
 	for (i = 0; i < 5 * page_size; i += page_size)
@@ -386,8 +441,8 @@ TEST_F(merge, forked_source_vma)
 	unsigned int page_size = self->page_size;
 	char *carveout = self->carveout;
 	struct procmap_fd *procmap = &self->procmap;
-	pid_t pid;
 	char *ptr, *ptr2;
+	pid_t pid;
 	int i;
 
 	/*
@@ -408,19 +463,10 @@ TEST_F(merge, forked_source_vma)
 	 */
 	ptr[0] = 'x';
 
-	pid = fork();
+	pid = do_fork(&self->procmap);
 	ASSERT_NE(pid, -1);
-
-	if (pid != 0) {
-		wait(NULL);
+	if (pid != 0)
 		return;
-	}
-
-	/* Child process below: */
-
-	/* Reopen for child. */
-	ASSERT_EQ(close_procmap(&self->procmap), 0);
-	ASSERT_EQ(open_self_procmap(&self->procmap), 0);
 
 	/* unCOWing everything does not cause the AVC to go away. */
 	for (i = 0; i < 5 * page_size; i += page_size)
@@ -1171,10 +1217,11 @@ TEST_F(merge, mremap_correct_placed_faulted)
 	ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr + 15 * page_size);
 }
 
-TEST_F(merge, mremap_faulted_to_unfaulted_prev)
+TEST_F(merge_with_fork, mremap_faulted_to_unfaulted_prev)
 {
 	struct procmap_fd *procmap = &self->procmap;
 	unsigned int page_size = self->page_size;
+	unsigned long offset;
 	char *ptr_a, *ptr_b;
 
 	/*
@@ -1197,6 +1244,14 @@ TEST_F(merge, mremap_faulted_to_unfaulted_prev)
 	/* Fault it in. */
 	ptr_a[0] = 'x';
 
+	if (variant->forked) {
+		pid_t pid = do_fork(&self->procmap);
+
+		ASSERT_NE(pid, -1);
+		if (pid != 0)
+			return;
+	}
+
 	/*
 	 * Now move it out of the way so we can place VMA B in position,
 	 * unfaulted.
@@ -1220,16 +1275,19 @@ TEST_F(merge, mremap_faulted_to_unfaulted_prev)
 		       &self->carveout[page_size + 3 * page_size]);
 	ASSERT_NE(ptr_a, MAP_FAILED);
 
-	/* The VMAs should have merged. */
+	/* The VMAs should have merged, if not forked. */
 	ASSERT_TRUE(find_vma_procmap(procmap, ptr_b));
 	ASSERT_EQ(procmap->query.vma_start, (unsigned long)ptr_b);
-	ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr_b + 6 * page_size);
+
+	offset = variant->forked ? 3 * page_size : 6 * page_size;
+	ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr_b + offset);
 }
 
-TEST_F(merge, mremap_faulted_to_unfaulted_next)
+TEST_F(merge_with_fork, mremap_faulted_to_unfaulted_next)
 {
 	struct procmap_fd *procmap = &self->procmap;
 	unsigned int page_size = self->page_size;
+	unsigned long offset;
 	char *ptr_a, *ptr_b;
 
 	/*
@@ -1253,6 +1311,14 @@ TEST_F(merge, mremap_faulted_to_unfaulted_next)
 	/* Fault it in. */
 	ptr_a[0] = 'x';
 
+	if (variant->forked) {
+		pid_t pid = do_fork(&self->procmap);
+
+		ASSERT_NE(pid, -1);
+		if (pid != 0)
+			return;
+	}
+
 	/*
 	 * Now move it out of the way so we can place VMA B in position,
 	 * unfaulted.
@@ -1276,16 +1342,18 @@ TEST_F(merge, mremap_faulted_to_unfaulted_next)
 		       &self->carveout[page_size]);
 	ASSERT_NE(ptr_a, MAP_FAILED);
 
-	/* The VMAs should have merged. */
+	/* The VMAs should have merged, if not forked. */
 	ASSERT_TRUE(find_vma_procmap(procmap, ptr_a));
 	ASSERT_EQ(procmap->query.vma_start, (unsigned long)ptr_a);
-	ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr_a + 6 * page_size);
+	offset = variant->forked ? 3 * page_size : 6 * page_size;
+	ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr_a + offset);
 }
 
-TEST_F(merge, mremap_faulted_to_unfaulted_prev_unfaulted_next)
+TEST_F(merge_with_fork, mremap_faulted_to_unfaulted_prev_unfaulted_next)
 {
 	struct procmap_fd *procmap = &self->procmap;
 	unsigned int page_size = self->page_size;
+	unsigned long offset;
 	char *ptr_a, *ptr_b, *ptr_c;
 
 	/*
@@ -1307,6 +1375,14 @@ TEST_F(merge, mremap_faulted_to_unfaulted_prev_unfaulted_next)
 	/* Fault it in. */
 	ptr_b[0] = 'x';
 
+	if (variant->forked) {
+		pid_t pid = do_fork(&self->procmap);
+
+		ASSERT_NE(pid, -1);
+		if (pid != 0)
+			return;
+	}
+
 	/*
 	 * Now move it out of the way so we can place VMAs A, C in position,
 	 * unfaulted.
@@ -1337,13 +1413,21 @@ TEST_F(merge, mremap_faulted_to_unfaulted_prev_unfaulted_next)
 		       &self->carveout[page_size + 3 * page_size]);
 	ASSERT_NE(ptr_b, MAP_FAILED);
 
-	/* The VMAs should have merged. */
+	/* The VMAs should have merged, if not forked. */
 	ASSERT_TRUE(find_vma_procmap(procmap, ptr_a));
 	ASSERT_EQ(procmap->query.vma_start, (unsigned long)ptr_a);
-	ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr_a + 9 * page_size);
+	offset = variant->forked ? 3 * page_size : 9 * page_size;
+	ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr_a + offset);
+
+	/* If forked, B and C should also not have merged. */
+	if (variant->forked) {
+		ASSERT_TRUE(find_vma_procmap(procmap, ptr_b));
+		ASSERT_EQ(procmap->query.vma_start, (unsigned long)ptr_b);
+		ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr_b + 3 * page_size);
+	}
 }
 
-TEST_F(merge, mremap_faulted_to_unfaulted_prev_faulted_next)
+TEST_F(merge_with_fork, mremap_faulted_to_unfaulted_prev_faulted_next)
 {
 	struct procmap_fd *procmap = &self->procmap;
 	unsigned int page_size = self->page_size;
@@ -1373,6 +1457,14 @@ TEST_F(merge, mremap_faulted_to_unfaulted_prev_faulted_next)
 	/* Fault it in. */
 	ptr_bc[0] = 'x';
 
+	if (variant->forked) {
+		pid_t pid = do_fork(&self->procmap);
+
+		ASSERT_NE(pid, -1);
+		if (pid != 0)
+			return;
+	}
+
 	/*
 	 * Now move VMA B out the way (splitting VMA BC) so we can place VMA A
 	 * in position, unfaulted, and leave the remainder of the VMA we just
@@ -1397,10 +1489,16 @@ TEST_F(merge, mremap_faulted_to_unfaulted_prev_faulted_next)
 		       &self->carveout[page_size + 3 * page_size]);
 	ASSERT_NE(ptr_b, MAP_FAILED);
 
-	/* The VMAs should have merged. */
-	ASSERT_TRUE(find_vma_procmap(procmap, ptr_a));
-	ASSERT_EQ(procmap->query.vma_start, (unsigned long)ptr_a);
-	ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr_a + 9 * page_size);
+	/* The VMAs should have merged. A,B,C if unforked, B, C if forked. */
+	if (variant->forked) {
+		ASSERT_TRUE(find_vma_procmap(procmap, ptr_b));
+		ASSERT_EQ(procmap->query.vma_start, (unsigned long)ptr_b);
+		ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr_b + 6 * page_size);
+	} else {
+		ASSERT_TRUE(find_vma_procmap(procmap, ptr_a));
+		ASSERT_EQ(procmap->query.vma_start, (unsigned long)ptr_a);
+		ASSERT_EQ(procmap->query.vma_end, (unsigned long)ptr_a + 9 * page_size);
+	}
 }
 
 TEST_HARNESS_MAIN
-- 
cgit v1.2.3


From 21c68ad1d9771d331198cc73cbf6e908d7915f35 Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Date: Tue, 6 Jan 2026 15:45:47 +0000
Subject: tools/testing/selftests: fix gup_longterm for unknown fs

Commit 66bce7afbaca ("selftests/mm: fix test result reporting in
gup_longterm") introduced a small bug causing unknown filesystems to
always result in a test failure.

This is because do_test() was updated to use a common reporting path, but
this case appears to have been missed.

This is problematic for e.g.  virtme-ng which uses an overlayfs file
system, causing gup_longterm to appear to fail each time due to a test
count mismatch:

	# Planned tests != run tests (50 != 46)
	# Totals: pass:24 fail:0 xfail:0 xpass:0 skip:22 error:0

The fix is to simply change the return into a break.

Link: https://lkml.kernel.org/r/20260106154547.214907-1-lorenzo.stoakes@oracle.com
Fixes: 66bce7afbaca ("selftests/mm: fix test result reporting in gup_longterm")
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: David Hildenbrand (Red Hat) <david@kernel.org>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Cc: Mark Brown <broonie@kernel.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Peter Xu <peterx@redhat.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/testing/selftests/mm/gup_longterm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/mm/gup_longterm.c b/tools/testing/selftests/mm/gup_longterm.c
index 6279893a0adc..f61150d28eb2 100644
--- a/tools/testing/selftests/mm/gup_longterm.c
+++ b/tools/testing/selftests/mm/gup_longterm.c
@@ -179,7 +179,7 @@ static void do_test(int fd, size_t size, enum test_type type, bool shared)
 		if (rw && shared && fs_is_unknown(fs_type)) {
 			ksft_print_msg("Unknown filesystem\n");
 			result = KSFT_SKIP;
-			return;
+			break;
 		}
 		/*
 		 * R/O pinning or pinning in a private mapping is always
-- 
cgit v1.2.3


From d2bdcde9626cbea0c44a6aaa33b440c8adf81e09 Mon Sep 17 00:00:00 2001
From: Dapeng Mi <dapeng1.mi@linux.intel.com>
Date: Wed, 14 Jan 2026 09:17:45 +0800
Subject: perf/x86/intel: Add support for PEBS memory auxiliary info field in
 DMR

With the introduction of the OMR feature, the PEBS memory auxiliary info
field for load and store latency events has been restructured for DMR.

The memory auxiliary info field's bit[8] indicates whether a L2 cache
miss occurred for a memory load or store instruction. If bit[8] is 0,
it signifies no L2 cache miss, and bits[7:0] specify the exact cache data
source (up to the L2 cache level). If bit[8] is 1, bits[7:0] represent
the OMR encoding, indicating the specific L3 cache or memory region
involved in the memory access. A significant enhancement is OMR encoding
provides up to 8 fine-grained memory regions besides the cache region.

A significant enhancement for OMR encoding is the ability to provide
up to 8 fine-grained memory regions in addition to the cache region,
offering more detailed insights into memory access regions.

For detailed information on the memory auxiliary info encoding, please
refer to section 16.2 "PEBS LOAD LATENCY AND STORE LATENCY FACILITY" in
the ISE documentation.

This patch ensures that the PEBS memory auxiliary info field is correctly
interpreted and utilized in DMR.

Signed-off-by: Dapeng Mi <dapeng1.mi@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20260114011750.350569-3-dapeng1.mi@linux.intel.com
---
 arch/x86/events/intel/ds.c            | 140 ++++++++++++++++++++++++++++++++++
 arch/x86/events/perf_event.h          |   2 +
 include/uapi/linux/perf_event.h       |  27 ++++++-
 tools/include/uapi/linux/perf_event.h |  27 ++++++-
 4 files changed, 190 insertions(+), 6 deletions(-)

(limited to 'tools')

diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index feb1c3cf63e4..272e652f25fc 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -34,6 +34,17 @@ struct pebs_record_32 {
 
  */
 
+union omr_encoding {
+	struct {
+		u8 omr_source : 4;
+		u8 omr_remote : 1;
+		u8 omr_hitm : 1;
+		u8 omr_snoop : 1;
+		u8 omr_promoted : 1;
+	};
+	u8 omr_full;
+};
+
 union intel_x86_pebs_dse {
 	u64 val;
 	struct {
@@ -73,6 +84,18 @@ union intel_x86_pebs_dse {
 		unsigned int lnc_addr_blk:1;
 		unsigned int ld_reserved6:18;
 	};
+	struct {
+		unsigned int pnc_dse: 8;
+		unsigned int pnc_l2_miss:1;
+		unsigned int pnc_stlb_clean_hit:1;
+		unsigned int pnc_stlb_any_hit:1;
+		unsigned int pnc_stlb_miss:1;
+		unsigned int pnc_locked:1;
+		unsigned int pnc_data_blk:1;
+		unsigned int pnc_addr_blk:1;
+		unsigned int pnc_fb_full:1;
+		unsigned int ld_reserved8:16;
+	};
 };
 
 
@@ -228,6 +251,85 @@ void __init intel_pmu_pebs_data_source_lnl(void)
 	__intel_pmu_pebs_data_source_cmt(data_source);
 }
 
+/* Version for Panthercove and later */
+
+/* L2 hit */
+#define PNC_PEBS_DATA_SOURCE_MAX	16
+static u64 pnc_pebs_l2_hit_data_source[PNC_PEBS_DATA_SOURCE_MAX] = {
+	P(OP, LOAD) | P(LVL, NA) | LEVEL(NA) | P(SNOOP, NA),	/* 0x00: non-cache access */
+	OP_LH               | LEVEL(L0) | P(SNOOP, NONE),	/* 0x01: L0 hit */
+	OP_LH | P(LVL, L1)  | LEVEL(L1) | P(SNOOP, NONE),	/* 0x02: L1 hit */
+	OP_LH | P(LVL, LFB) | LEVEL(LFB) | P(SNOOP, NONE),	/* 0x03: L1 Miss Handling Buffer hit */
+	OP_LH | P(LVL, L2)  | LEVEL(L2) | P(SNOOP, NONE),	/* 0x04: L2 Hit Clean */
+	0,							/* 0x05: Reserved */
+	0,							/* 0x06: Reserved */
+	OP_LH | P(LVL, L2)  | LEVEL(L2) | P(SNOOP, HIT),	/* 0x07: L2 Hit Snoop HIT */
+	OP_LH | P(LVL, L2)  | LEVEL(L2) | P(SNOOP, HITM),	/* 0x08: L2 Hit Snoop Hit Modified */
+	OP_LH | P(LVL, L2)  | LEVEL(L2) | P(SNOOP, MISS),	/* 0x09: Prefetch Promotion */
+	OP_LH | P(LVL, L2)  | LEVEL(L2) | P(SNOOP, MISS),	/* 0x0a: Cross Core Prefetch Promotion */
+	0,							/* 0x0b: Reserved */
+	0,							/* 0x0c: Reserved */
+	0,							/* 0x0d: Reserved */
+	0,							/* 0x0e: Reserved */
+	OP_LH | P(LVL, UNC) | LEVEL(NA) | P(SNOOP, NONE),	/* 0x0f: uncached */
+};
+
+/* L2 miss */
+#define OMR_DATA_SOURCE_MAX		16
+static u64 omr_data_source[OMR_DATA_SOURCE_MAX] = {
+	P(OP, LOAD) | P(LVL, NA) | LEVEL(NA) | P(SNOOP, NA),	/* 0x00: invalid */
+	0,							/* 0x01: Reserved */
+	OP_LH | P(LVL, L3) | LEVEL(L3) | P(REGION, L_SHARE),	/* 0x02: local CA shared cache */
+	OP_LH | P(LVL, L3) | LEVEL(L3) | P(REGION, L_NON_SHARE),/* 0x03: local CA non-shared cache */
+	OP_LH | P(LVL, L3) | LEVEL(L3) | P(REGION, O_IO),	/* 0x04: other CA IO agent */
+	OP_LH | P(LVL, L3) | LEVEL(L3) | P(REGION, O_SHARE),	/* 0x05: other CA shared cache */
+	OP_LH | P(LVL, L3) | LEVEL(L3) | P(REGION, O_NON_SHARE),/* 0x06: other CA non-shared cache */
+	OP_LH | LEVEL(RAM) | P(REGION, MMIO),			/* 0x07: MMIO */
+	OP_LH | LEVEL(RAM) | P(REGION, MEM0),			/* 0x08: Memory region 0 */
+	OP_LH | LEVEL(RAM) | P(REGION, MEM1),			/* 0x09: Memory region 1 */
+	OP_LH | LEVEL(RAM) | P(REGION, MEM2),			/* 0x0a: Memory region 2 */
+	OP_LH | LEVEL(RAM) | P(REGION, MEM3),			/* 0x0b: Memory region 3 */
+	OP_LH | LEVEL(RAM) | P(REGION, MEM4),			/* 0x0c: Memory region 4 */
+	OP_LH | LEVEL(RAM) | P(REGION, MEM5),			/* 0x0d: Memory region 5 */
+	OP_LH | LEVEL(RAM) | P(REGION, MEM6),			/* 0x0e: Memory region 6 */
+	OP_LH | LEVEL(RAM) | P(REGION, MEM7),			/* 0x0f: Memory region 7 */
+};
+
+static u64 parse_omr_data_source(u8 dse)
+{
+	union omr_encoding omr;
+	u64 val = 0;
+
+	omr.omr_full = dse;
+	val = omr_data_source[omr.omr_source];
+	if (omr.omr_source > 0x1 && omr.omr_source < 0x7)
+		val |= omr.omr_remote ? P(LVL, REM_CCE1) : 0;
+	else if (omr.omr_source > 0x7)
+		val |= omr.omr_remote ? P(LVL, REM_RAM1) : P(LVL, LOC_RAM);
+
+	if (omr.omr_remote)
+		val |= REM;
+
+	val |= omr.omr_hitm ? P(SNOOP, HITM) : P(SNOOP, HIT);
+
+	if (omr.omr_source == 0x2) {
+		u8 snoop = omr.omr_snoop | omr.omr_promoted;
+
+		if (snoop == 0x0)
+			val |= P(SNOOP, NA);
+		else if (snoop == 0x1)
+			val |= P(SNOOP, MISS);
+		else if (snoop == 0x2)
+			val |= P(SNOOP, HIT);
+		else if (snoop == 0x3)
+			val |= P(SNOOP, NONE);
+	} else if (omr.omr_source > 0x2 && omr.omr_source < 0x7) {
+		val |= omr.omr_snoop ? P(SNOOPX, FWD) : 0;
+	}
+
+	return val;
+}
+
 static u64 precise_store_data(u64 status)
 {
 	union intel_x86_pebs_dse dse;
@@ -411,6 +513,44 @@ u64 arl_h_latency_data(struct perf_event *event, u64 status)
 	return lnl_latency_data(event, status);
 }
 
+u64 pnc_latency_data(struct perf_event *event, u64 status)
+{
+	union intel_x86_pebs_dse dse;
+	union perf_mem_data_src src;
+	u64 val;
+
+	dse.val = status;
+
+	if (!dse.pnc_l2_miss)
+		val = pnc_pebs_l2_hit_data_source[dse.pnc_dse & 0xf];
+	else
+		val = parse_omr_data_source(dse.pnc_dse);
+
+	if (!val)
+		val = P(OP, LOAD) | LEVEL(NA) | P(SNOOP, NA);
+
+	if (dse.pnc_stlb_miss)
+		val |= P(TLB, MISS) | P(TLB, L2);
+	else
+		val |= P(TLB, HIT) | P(TLB, L1) | P(TLB, L2);
+
+	if (dse.pnc_locked)
+		val |= P(LOCK, LOCKED);
+
+	if (dse.pnc_data_blk)
+		val |= P(BLK, DATA);
+	if (dse.pnc_addr_blk)
+		val |= P(BLK, ADDR);
+	if (!dse.pnc_data_blk && !dse.pnc_addr_blk)
+		val |= P(BLK, NA);
+
+	src.val = val;
+	if (event->hw.flags & PERF_X86_EVENT_PEBS_ST_HSW)
+		src.mem_op = P(OP, STORE);
+
+	return src.val;
+}
+
 static u64 load_latency_data(struct perf_event *event, u64 status)
 {
 	union intel_x86_pebs_dse dse;
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 586e3fdfe6d8..bd501c2a0f73 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -1664,6 +1664,8 @@ u64 lnl_latency_data(struct perf_event *event, u64 status);
 
 u64 arl_h_latency_data(struct perf_event *event, u64 status);
 
+u64 pnc_latency_data(struct perf_event *event, u64 status);
+
 extern struct event_constraint intel_core2_pebs_event_constraints[];
 
 extern struct event_constraint intel_atom_pebs_event_constraints[];
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index c44a8fb3e418..533393ec94d0 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -1330,14 +1330,16 @@ union perf_mem_data_src {
 			mem_snoopx  :  2, /* Snoop mode, ext */
 			mem_blk     :  3, /* Access blocked */
 			mem_hops    :  3, /* Hop level */
-			mem_rsvd    : 18;
+			mem_region  :  5, /* cache/memory regions */
+			mem_rsvd    : 13;
 	};
 };
 #elif defined(__BIG_ENDIAN_BITFIELD)
 union perf_mem_data_src {
 	__u64 val;
 	struct {
-		__u64	mem_rsvd    : 18,
+		__u64	mem_rsvd    : 13,
+			mem_region  :  5, /* cache/memory regions */
 			mem_hops    :  3, /* Hop level */
 			mem_blk     :  3, /* Access blocked */
 			mem_snoopx  :  2, /* Snoop mode, ext */
@@ -1394,7 +1396,7 @@ union perf_mem_data_src {
 #define PERF_MEM_LVLNUM_L4			0x0004 /* L4 */
 #define PERF_MEM_LVLNUM_L2_MHB			0x0005 /* L2 Miss Handling Buffer */
 #define PERF_MEM_LVLNUM_MSC			0x0006 /* Memory-side Cache */
-/* 0x007 available */
+#define PERF_MEM_LVLNUM_L0			0x0007 /* L0 */
 #define PERF_MEM_LVLNUM_UNC			0x0008 /* Uncached */
 #define PERF_MEM_LVLNUM_CXL			0x0009 /* CXL */
 #define PERF_MEM_LVLNUM_IO			0x000a /* I/O */
@@ -1447,6 +1449,25 @@ union perf_mem_data_src {
 /* 5-7 available */
 #define PERF_MEM_HOPS_SHIFT			43
 
+/* Cache/Memory region */
+#define PERF_MEM_REGION_NA		0x0  /* Invalid */
+#define PERF_MEM_REGION_RSVD		0x01 /* Reserved */
+#define PERF_MEM_REGION_L_SHARE		0x02 /* Local CA shared cache */
+#define PERF_MEM_REGION_L_NON_SHARE	0x03 /* Local CA non-shared cache */
+#define PERF_MEM_REGION_O_IO		0x04 /* Other CA IO agent */
+#define PERF_MEM_REGION_O_SHARE		0x05 /* Other CA shared cache */
+#define PERF_MEM_REGION_O_NON_SHARE	0x06 /* Other CA non-shared cache */
+#define PERF_MEM_REGION_MMIO		0x07 /* MMIO */
+#define PERF_MEM_REGION_MEM0		0x08 /* Memory region 0 */
+#define PERF_MEM_REGION_MEM1		0x09 /* Memory region 1 */
+#define PERF_MEM_REGION_MEM2		0x0a /* Memory region 2 */
+#define PERF_MEM_REGION_MEM3		0x0b /* Memory region 3 */
+#define PERF_MEM_REGION_MEM4		0x0c /* Memory region 4 */
+#define PERF_MEM_REGION_MEM5		0x0d /* Memory region 5 */
+#define PERF_MEM_REGION_MEM6		0x0e /* Memory region 6 */
+#define PERF_MEM_REGION_MEM7		0x0f /* Memory region 7 */
+#define PERF_MEM_REGION_SHIFT		46
+
 #define PERF_MEM_S(a, s) \
 	(((__u64)PERF_MEM_##a##_##s) << PERF_MEM_##a##_SHIFT)
 
diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h
index c44a8fb3e418..d4b99610a3b0 100644
--- a/tools/include/uapi/linux/perf_event.h
+++ b/tools/include/uapi/linux/perf_event.h
@@ -1330,14 +1330,16 @@ union perf_mem_data_src {
 			mem_snoopx  :  2, /* Snoop mode, ext */
 			mem_blk     :  3, /* Access blocked */
 			mem_hops    :  3, /* Hop level */
-			mem_rsvd    : 18;
+			mem_region  :  5, /* cache/memory regions */
+			mem_rsvd    : 13;
 	};
 };
 #elif defined(__BIG_ENDIAN_BITFIELD)
 union perf_mem_data_src {
 	__u64 val;
 	struct {
-		__u64	mem_rsvd    : 18,
+		__u64	mem_rsvd    : 13,
+			mem_region  :  5, /* cache/memory regions */
 			mem_hops    :  3, /* Hop level */
 			mem_blk     :  3, /* Access blocked */
 			mem_snoopx  :  2, /* Snoop mode, ext */
@@ -1394,7 +1396,7 @@ union perf_mem_data_src {
 #define PERF_MEM_LVLNUM_L4			0x0004 /* L4 */
 #define PERF_MEM_LVLNUM_L2_MHB			0x0005 /* L2 Miss Handling Buffer */
 #define PERF_MEM_LVLNUM_MSC			0x0006 /* Memory-side Cache */
-/* 0x007 available */
+#define PERF_MEM_LVLNUM_L0			0x0007   /* L0 */
 #define PERF_MEM_LVLNUM_UNC			0x0008 /* Uncached */
 #define PERF_MEM_LVLNUM_CXL			0x0009 /* CXL */
 #define PERF_MEM_LVLNUM_IO			0x000a /* I/O */
@@ -1447,6 +1449,25 @@ union perf_mem_data_src {
 /* 5-7 available */
 #define PERF_MEM_HOPS_SHIFT			43
 
+/* Cache/Memory region */
+#define PERF_MEM_REGION_NA		0x0  /* Invalid */
+#define PERF_MEM_REGION_RSVD		0x01 /* Reserved */
+#define PERF_MEM_REGION_L_SHARE		0x02 /* Local CA shared cache */
+#define PERF_MEM_REGION_L_NON_SHARE	0x03 /* Local CA non-shared cache */
+#define PERF_MEM_REGION_O_IO		0x04 /* Other CA IO agent */
+#define PERF_MEM_REGION_O_SHARE		0x05 /* Other CA shared cache */
+#define PERF_MEM_REGION_O_NON_SHARE	0x06 /* Other CA non-shared cache */
+#define PERF_MEM_REGION_MMIO		0x07 /* MMIO */
+#define PERF_MEM_REGION_MEM0		0x08 /* Memory region 0 */
+#define PERF_MEM_REGION_MEM1		0x09 /* Memory region 1 */
+#define PERF_MEM_REGION_MEM2		0x0a /* Memory region 2 */
+#define PERF_MEM_REGION_MEM3		0x0b /* Memory region 3 */
+#define PERF_MEM_REGION_MEM4		0x0c /* Memory region 4 */
+#define PERF_MEM_REGION_MEM5		0x0d /* Memory region 5 */
+#define PERF_MEM_REGION_MEM6		0x0e /* Memory region 6 */
+#define PERF_MEM_REGION_MEM7		0x0f /* Memory region 7 */
+#define PERF_MEM_REGION_SHIFT		46
+
 #define PERF_MEM_S(a, s) \
 	(((__u64)PERF_MEM_##a##_##s) << PERF_MEM_##a##_SHIFT)
 
-- 
cgit v1.2.3


From b638a9d0f8965b98403022cb91d8f3b31170eb35 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Thu, 8 Jan 2026 17:32:33 +0000
Subject: KVM: arm64: selftests: Add a test for FEAT_IDST

Add a very basic test checking that FEAT_IDST actually works for
the {GMID,SMIDR,CSSIDR2}_EL1 registers.

Link: https://patch.msgid.link/20260108173233.2911955-10-maz@kernel.org
Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 tools/testing/selftests/kvm/Makefile.kvm       |   1 +
 tools/testing/selftests/kvm/arm64/idreg-idst.c | 117 +++++++++++++++++++++++++
 2 files changed, 118 insertions(+)
 create mode 100644 tools/testing/selftests/kvm/arm64/idreg-idst.c

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm
index ba5c2b643efa..bfa2fad0aba1 100644
--- a/tools/testing/selftests/kvm/Makefile.kvm
+++ b/tools/testing/selftests/kvm/Makefile.kvm
@@ -175,6 +175,7 @@ TEST_GEN_PROGS_arm64 += arm64/vgic_irq
 TEST_GEN_PROGS_arm64 += arm64/vgic_lpi_stress
 TEST_GEN_PROGS_arm64 += arm64/vpmu_counter_access
 TEST_GEN_PROGS_arm64 += arm64/no-vgic-v3
+TEST_GEN_PROGS_arm64 += arm64/idreg-idst
 TEST_GEN_PROGS_arm64 += arm64/kvm-uuid
 TEST_GEN_PROGS_arm64 += access_tracking_perf_test
 TEST_GEN_PROGS_arm64 += arch_timer
diff --git a/tools/testing/selftests/kvm/arm64/idreg-idst.c b/tools/testing/selftests/kvm/arm64/idreg-idst.c
new file mode 100644
index 000000000000..9ca9f125abdb
--- /dev/null
+++ b/tools/testing/selftests/kvm/arm64/idreg-idst.c
@@ -0,0 +1,117 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Access all FEAT_IDST-handled registers that depend on more than
+ * just FEAT_AA64, and fail if we don't get an a trap with an 0x18 EC.
+ */
+
+#include <test_util.h>
+#include <kvm_util.h>
+#include <processor.h>
+
+static volatile bool sys64, undef;
+
+#define __check_sr_read(r)					\
+	({							\
+		uint64_t val;					\
+								\
+		sys64 = false;					\
+		undef = false;					\
+		dsb(sy);					\
+		val = read_sysreg_s(SYS_ ## r);			\
+		val;						\
+	})
+
+/* Fatal checks */
+#define check_sr_read(r)					\
+	do {							\
+		__check_sr_read(r);				\
+		__GUEST_ASSERT(!undef, #r " unexpected UNDEF");	\
+		__GUEST_ASSERT(sys64, #r " didn't trap");	\
+	} while(0)
+
+
+static void guest_code(void)
+{
+	check_sr_read(CCSIDR2_EL1);
+	check_sr_read(SMIDR_EL1);
+	check_sr_read(GMID_EL1);
+
+	GUEST_DONE();
+}
+
+static void guest_sys64_handler(struct ex_regs *regs)
+{
+	sys64 = true;
+	undef = false;
+	regs->pc += 4;
+}
+
+static void guest_undef_handler(struct ex_regs *regs)
+{
+	sys64 = false;
+	undef = true;
+	regs->pc += 4;
+}
+
+static void test_run_vcpu(struct kvm_vcpu *vcpu)
+{
+	struct ucall uc;
+
+	do {
+		vcpu_run(vcpu);
+
+		switch (get_ucall(vcpu, &uc)) {
+		case UCALL_ABORT:
+			REPORT_GUEST_ASSERT(uc);
+			break;
+		case UCALL_PRINTF:
+			printf("%s", uc.buffer);
+			break;
+		case UCALL_DONE:
+			break;
+		default:
+			TEST_FAIL("Unknown ucall %lu", uc.cmd);
+		}
+	} while (uc.cmd != UCALL_DONE);
+}
+
+static void test_guest_feat_idst(void)
+{
+	struct kvm_vcpu *vcpu;
+	struct kvm_vm *vm;
+
+	/* This VM has no MTE, no SME, no CCIDX */
+	vm = vm_create_with_one_vcpu(&vcpu, guest_code);
+
+	vm_init_descriptor_tables(vm);
+	vcpu_init_descriptor_tables(vcpu);
+
+	vm_install_sync_handler(vm, VECTOR_SYNC_CURRENT,
+				ESR_ELx_EC_SYS64, guest_sys64_handler);
+	vm_install_sync_handler(vm, VECTOR_SYNC_CURRENT,
+				ESR_ELx_EC_UNKNOWN, guest_undef_handler);
+
+	test_run_vcpu(vcpu);
+
+	kvm_vm_free(vm);
+}
+
+int main(int argc, char *argv[])
+{
+	struct kvm_vcpu *vcpu;
+	struct kvm_vm *vm;
+	uint64_t mmfr2;
+
+	test_disable_default_vgic();
+
+	vm = vm_create_with_one_vcpu(&vcpu, NULL);
+	mmfr2 = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64MMFR2_EL1));
+	__TEST_REQUIRE(FIELD_GET(ID_AA64MMFR2_EL1_IDS, mmfr2) > 0,
+		       "FEAT_IDST not supported");
+	kvm_vm_free(vm);
+
+	test_guest_feat_idst();
+
+	return 0;
+}
-- 
cgit v1.2.3


From 7e03d07d03a486c66d5c084c7185b1bef29049e9 Mon Sep 17 00:00:00 2001
From: Fuad Tabba <tabba@google.com>
Date: Fri, 9 Jan 2026 08:22:14 +0000
Subject: KVM: arm64: selftests: Disable unused TTBR1_EL1 translations

KVM selftests map all guest code and data into the lower virtual address
range (0x0000...) managed by TTBR0_EL1. The upper range (0xFFFF...)
managed by TTBR1_EL1 is unused and uninitialized.

If a guest accesses the upper range, the MMU attempts a translation
table walk using uninitialized registers, leading to unpredictable
behavior.

Set `TCR_EL1.EPD1` to disable translation table walks for TTBR1_EL1,
ensuring that any access to the upper range generates an immediate
Translation Fault. Additionally, set `TCR_EL1.TBI1` (Top Byte Ignore) to
ensure that tagged pointers in the upper range also deterministically
trigger a Translation Fault via EPD1.

Define `TCR_EPD1_MASK`, `TCR_EPD1_SHIFT`, and `TCR_TBI1` in
`processor.h` to support this configuration. These are based on their
definitions in `arch/arm64/include/asm/pgtable-hwdef.h`.

Suggested-by: Will Deacon <will@kernel.org>
Reviewed-by: Itaru Kitayama <itaru.kitayama@fujitsu.com>
Signed-off-by: Fuad Tabba <tabba@google.com>
Link: https://patch.msgid.link/20260109082218.3236580-2-tabba@google.com
Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 tools/testing/selftests/kvm/include/arm64/processor.h | 4 ++++
 tools/testing/selftests/kvm/lib/arm64/processor.c     | 2 ++
 2 files changed, 6 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/include/arm64/processor.h b/tools/testing/selftests/kvm/include/arm64/processor.h
index ff928716574d..ac97a1c436fc 100644
--- a/tools/testing/selftests/kvm/include/arm64/processor.h
+++ b/tools/testing/selftests/kvm/include/arm64/processor.h
@@ -90,6 +90,9 @@
 #define TCR_TG0_64K		(UL(1) << TCR_TG0_SHIFT)
 #define TCR_TG0_16K		(UL(2) << TCR_TG0_SHIFT)
 
+#define TCR_EPD1_SHIFT		23
+#define TCR_EPD1_MASK		(UL(1) << TCR_EPD1_SHIFT)
+
 #define TCR_IPS_SHIFT		32
 #define TCR_IPS_MASK		(UL(7) << TCR_IPS_SHIFT)
 #define TCR_IPS_52_BITS	(UL(6) << TCR_IPS_SHIFT)
@@ -97,6 +100,7 @@
 #define TCR_IPS_40_BITS	(UL(2) << TCR_IPS_SHIFT)
 #define TCR_IPS_36_BITS	(UL(1) << TCR_IPS_SHIFT)
 
+#define TCR_TBI1		(UL(1) << 38)
 #define TCR_HA			(UL(1) << 39)
 #define TCR_DS			(UL(1) << 59)
 
diff --git a/tools/testing/selftests/kvm/lib/arm64/processor.c b/tools/testing/selftests/kvm/lib/arm64/processor.c
index d46e4b13b92c..5b379da8cb90 100644
--- a/tools/testing/selftests/kvm/lib/arm64/processor.c
+++ b/tools/testing/selftests/kvm/lib/arm64/processor.c
@@ -384,6 +384,8 @@ void aarch64_vcpu_setup(struct kvm_vcpu *vcpu, struct kvm_vcpu_init *init)
 
 	tcr_el1 |= TCR_IRGN0_WBWA | TCR_ORGN0_WBWA | TCR_SH0_INNER;
 	tcr_el1 |= TCR_T0SZ(vm->va_bits);
+	tcr_el1 |= TCR_TBI1;
+	tcr_el1 |= TCR_EPD1_MASK;
 	if (use_lpa2_pte_format(vm))
 		tcr_el1 |= TCR_DS;
 
-- 
cgit v1.2.3


From dd0c5d04d13cae8ff2694ef83d1ae5804d6d9798 Mon Sep 17 00:00:00 2001
From: Fuad Tabba <tabba@google.com>
Date: Fri, 9 Jan 2026 08:22:15 +0000
Subject: KVM: arm64: selftests: Fix incorrect rounding in page_align()

The implementation of `page_align()` in `processor.c` calculates
alignment incorrectly for values that are already aligned. Specifically,
`(v + vm->page_size) & ~(vm->page_size - 1)` aligns to the *next* page
boundary even if `v` is already page-aligned, potentially wasting a page
of memory.

Fix the calculation to use standard alignment logic: `(v + vm->page_size
- 1) & ~(vm->page_size - 1)`.

Fixes: 7a6629ef746d ("kvm: selftests: add virt mem support for aarch64")
Reviewed-by: Andrew Jones <andrew.jones@linux.dev>
Signed-off-by: Fuad Tabba <tabba@google.com>
Link: https://patch.msgid.link/20260109082218.3236580-3-tabba@google.com
Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 tools/testing/selftests/kvm/lib/arm64/processor.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/lib/arm64/processor.c b/tools/testing/selftests/kvm/lib/arm64/processor.c
index 5b379da8cb90..607a4e462984 100644
--- a/tools/testing/selftests/kvm/lib/arm64/processor.c
+++ b/tools/testing/selftests/kvm/lib/arm64/processor.c
@@ -23,7 +23,7 @@ static vm_vaddr_t exception_handlers;
 
 static uint64_t page_align(struct kvm_vm *vm, uint64_t v)
 {
-	return (v + vm->page_size) & ~(vm->page_size - 1);
+	return (v + vm->page_size - 1) & ~(vm->page_size - 1);
 }
 
 static uint64_t pgd_index(struct kvm_vm *vm, vm_vaddr_t gva)
-- 
cgit v1.2.3


From 582b39463f1c0774e0b3cb5be2118e8564b7941e Mon Sep 17 00:00:00 2001
From: Fuad Tabba <tabba@google.com>
Date: Fri, 9 Jan 2026 08:22:16 +0000
Subject: KVM: riscv: selftests: Fix incorrect rounding in page_align()

The implementation of `page_align()` in `processor.c` calculates
alignment incorrectly for values that are already aligned. Specifically,
`(v + vm->page_size) & ~(vm->page_size - 1)` aligns to the *next* page
boundary even if `v` is already page-aligned, potentially wasting a page
of memory.

Fix the calculation to use standard alignment logic: `(v + vm->page_size
- 1) & ~(vm->page_size - 1)`.

Fixes: 3e06cdf10520 ("KVM: selftests: Add initial support for RISC-V 64-bit")
Reviewed-by: Andrew Jones <andrew.jones@linux.dev>
Signed-off-by: Fuad Tabba <tabba@google.com>
Link: https://patch.msgid.link/20260109082218.3236580-4-tabba@google.com
Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 tools/testing/selftests/kvm/lib/riscv/processor.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/lib/riscv/processor.c b/tools/testing/selftests/kvm/lib/riscv/processor.c
index 2eac7d4b59e9..d5e8747b5e69 100644
--- a/tools/testing/selftests/kvm/lib/riscv/processor.c
+++ b/tools/testing/selftests/kvm/lib/riscv/processor.c
@@ -28,7 +28,7 @@ bool __vcpu_has_ext(struct kvm_vcpu *vcpu, uint64_t ext)
 
 static uint64_t page_align(struct kvm_vm *vm, uint64_t v)
 {
-	return (v + vm->page_size) & ~(vm->page_size - 1);
+	return (v + vm->page_size - 1) & ~(vm->page_size - 1);
 }
 
 static uint64_t pte_addr(struct kvm_vm *vm, uint64_t entry)
-- 
cgit v1.2.3


From de00d07321cf3f182762de2308c08062d5b824c0 Mon Sep 17 00:00:00 2001
From: Fuad Tabba <tabba@google.com>
Date: Fri, 9 Jan 2026 08:22:17 +0000
Subject: KVM: selftests: Move page_align() to shared header

To avoid code duplication, move page_align() to the shared `kvm_util.h`
header file. Rename it to vm_page_align(), to make it clear that the
alignment is done with respect to the guest's base page size.

No functional change intended.

Reviewed-by: Andrew Jones <andrew.jones@linux.dev>
Signed-off-by: Fuad Tabba <tabba@google.com>
Link: https://patch.msgid.link/20260109082218.3236580-5-tabba@google.com
Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 tools/testing/selftests/kvm/include/kvm_util.h    | 5 +++++
 tools/testing/selftests/kvm/lib/arm64/processor.c | 7 +------
 tools/testing/selftests/kvm/lib/riscv/processor.c | 7 +------
 3 files changed, 7 insertions(+), 12 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h
index 81f4355ff28a..747effa614f1 100644
--- a/tools/testing/selftests/kvm/include/kvm_util.h
+++ b/tools/testing/selftests/kvm/include/kvm_util.h
@@ -1258,6 +1258,11 @@ static inline int __vm_disable_nx_huge_pages(struct kvm_vm *vm)
 	return __vm_enable_cap(vm, KVM_CAP_VM_DISABLE_NX_HUGE_PAGES, 0);
 }
 
+static inline uint64_t vm_page_align(struct kvm_vm *vm, uint64_t v)
+{
+	return (v + vm->page_size - 1) & ~(vm->page_size - 1);
+}
+
 /*
  * Arch hook that is invoked via a constructor, i.e. before exeucting main(),
  * to allow for arch-specific setup that is common to all tests, e.g. computing
diff --git a/tools/testing/selftests/kvm/lib/arm64/processor.c b/tools/testing/selftests/kvm/lib/arm64/processor.c
index 607a4e462984..1605dc740d1e 100644
--- a/tools/testing/selftests/kvm/lib/arm64/processor.c
+++ b/tools/testing/selftests/kvm/lib/arm64/processor.c
@@ -21,11 +21,6 @@
 
 static vm_vaddr_t exception_handlers;
 
-static uint64_t page_align(struct kvm_vm *vm, uint64_t v)
-{
-	return (v + vm->page_size - 1) & ~(vm->page_size - 1);
-}
-
 static uint64_t pgd_index(struct kvm_vm *vm, vm_vaddr_t gva)
 {
 	unsigned int shift = (vm->pgtable_levels - 1) * (vm->page_shift - 3) + vm->page_shift;
@@ -115,7 +110,7 @@ static uint64_t __maybe_unused ptrs_per_pte(struct kvm_vm *vm)
 
 void virt_arch_pgd_alloc(struct kvm_vm *vm)
 {
-	size_t nr_pages = page_align(vm, ptrs_per_pgd(vm) * 8) / vm->page_size;
+	size_t nr_pages = vm_page_align(vm, ptrs_per_pgd(vm) * 8) / vm->page_size;
 
 	if (vm->pgd_created)
 		return;
diff --git a/tools/testing/selftests/kvm/lib/riscv/processor.c b/tools/testing/selftests/kvm/lib/riscv/processor.c
index d5e8747b5e69..401245fe31db 100644
--- a/tools/testing/selftests/kvm/lib/riscv/processor.c
+++ b/tools/testing/selftests/kvm/lib/riscv/processor.c
@@ -26,11 +26,6 @@ bool __vcpu_has_ext(struct kvm_vcpu *vcpu, uint64_t ext)
 	return !ret && !!value;
 }
 
-static uint64_t page_align(struct kvm_vm *vm, uint64_t v)
-{
-	return (v + vm->page_size - 1) & ~(vm->page_size - 1);
-}
-
 static uint64_t pte_addr(struct kvm_vm *vm, uint64_t entry)
 {
 	return ((entry & PGTBL_PTE_ADDR_MASK) >> PGTBL_PTE_ADDR_SHIFT) <<
@@ -68,7 +63,7 @@ static uint64_t pte_index(struct kvm_vm *vm, vm_vaddr_t gva, int level)
 
 void virt_arch_pgd_alloc(struct kvm_vm *vm)
 {
-	size_t nr_pages = page_align(vm, ptrs_per_pte(vm) * 8) / vm->page_size;
+	size_t nr_pages = vm_page_align(vm, ptrs_per_pte(vm) * 8) / vm->page_size;
 
 	if (vm->pgd_created)
 		return;
-- 
cgit v1.2.3


From e0a99a2b72f3c6365d9f4d6943ed45f7fc286b70 Mon Sep 17 00:00:00 2001
From: Fuad Tabba <tabba@google.com>
Date: Fri, 9 Jan 2026 08:22:18 +0000
Subject: KVM: selftests: Fix typos and stale comments in kvm_util

Fix minor documentation errors in `kvm_util.h` and `kvm_util.c`.

- Correct the argument description for `vcpu_args_set` in `kvm_util.h`,
  which incorrectly listed `vm` instead of `vcpu`.
- Fix a typo in the comment for `kvm_selftest_arch_init` ("exeucting" ->
  "executing").
- Correct the return value description for `vm_vaddr_unused_gap` in
  `kvm_util.c` to match the implementation, which returns an address "at
  or above" `vaddr_min`, not "at or below".

No functional change intended.

Reviewed-by: Andrew Jones <andrew.jones@linux.dev>
Signed-off-by: Fuad Tabba <tabba@google.com>
Link: https://patch.msgid.link/20260109082218.3236580-6-tabba@google.com
Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 tools/testing/selftests/kvm/include/kvm_util.h | 4 ++--
 tools/testing/selftests/kvm/lib/kvm_util.c     | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h
index 747effa614f1..97f9251eb073 100644
--- a/tools/testing/selftests/kvm/include/kvm_util.h
+++ b/tools/testing/selftests/kvm/include/kvm_util.h
@@ -939,7 +939,7 @@ void *vcpu_map_dirty_ring(struct kvm_vcpu *vcpu);
  * VM VCPU Args Set
  *
  * Input Args:
- *   vm - Virtual Machine
+ *   vcpu - vCPU
  *   num - number of arguments
  *   ... - arguments, each of type uint64_t
  *
@@ -1264,7 +1264,7 @@ static inline uint64_t vm_page_align(struct kvm_vm *vm, uint64_t v)
 }
 
 /*
- * Arch hook that is invoked via a constructor, i.e. before exeucting main(),
+ * Arch hook that is invoked via a constructor, i.e. before executing main(),
  * to allow for arch-specific setup that is common to all tests, e.g. computing
  * the default guest "mode".
  */
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index 8279b6ced8d2..fab6b62d7810 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -1351,7 +1351,7 @@ struct kvm_vcpu *__vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id)
  * Output Args: None
  *
  * Return:
- *   Lowest virtual address at or below vaddr_min, with at least
+ *   Lowest virtual address at or above vaddr_min, with at least
  *   sz unused bytes.  TEST_ASSERT failure if no area of at least
  *   size sz is available.
  *
-- 
cgit v1.2.3


From 934d9746ed0206e93506a68c838fe82ef748576a Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Mon, 12 Jan 2026 13:11:57 +0100
Subject: selftests/bpf: Add test for bpf_override_return helper

We do not actually test the bpf_override_return helper functionality
itself at the moment, only the bpf program being able to attach it.

Adding test that override prctl syscall return value on top of
kprobe and kprobe.multi.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Song Liu <song@kernel.org>
Link: https://lore.kernel.org/bpf/20260112121157.854473-2-jolsa@kernel.org
---
 .../selftests/bpf/prog_tests/kprobe_multi_test.c   | 44 ++++++++++++++++++++++
 .../selftests/bpf/progs/kprobe_multi_override.c    | 15 ++++++++
 tools/testing/selftests/bpf/trace_helpers.h        | 12 ++++++
 3 files changed, 71 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c b/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c
index 6cfaa978bc9a..9caef222e528 100644
--- a/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c
+++ b/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c
@@ -1,4 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
+#include <errno.h>
+#include <sys/prctl.h>
 #include <test_progs.h>
 #include "kprobe_multi.skel.h"
 #include "trace_helpers.h"
@@ -540,6 +542,46 @@ cleanup:
 	kprobe_multi_override__destroy(skel);
 }
 
+static void test_override(void)
+{
+	struct kprobe_multi_override *skel = NULL;
+	int err;
+
+	skel = kprobe_multi_override__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "kprobe_multi_empty__open_and_load"))
+		goto cleanup;
+
+	skel->bss->pid = getpid();
+
+	/* no override */
+	err = prctl(0xffff, 0);
+	ASSERT_EQ(err, -1, "err");
+
+	/* kprobe.multi override */
+	skel->links.test_override = bpf_program__attach_kprobe_multi_opts(skel->progs.test_override,
+						SYS_PREFIX "sys_prctl", NULL);
+	if (!ASSERT_OK_PTR(skel->links.test_override, "bpf_program__attach_kprobe_multi_opts"))
+		goto cleanup;
+
+	err = prctl(0xffff, 0);
+	ASSERT_EQ(err, 123, "err");
+
+	bpf_link__destroy(skel->links.test_override);
+	skel->links.test_override = NULL;
+
+	/* kprobe override */
+	skel->links.test_kprobe_override = bpf_program__attach_kprobe(skel->progs.test_kprobe_override,
+							false, SYS_PREFIX "sys_prctl");
+	if (!ASSERT_OK_PTR(skel->links.test_kprobe_override, "bpf_program__attach_kprobe"))
+		goto cleanup;
+
+	err = prctl(0xffff, 0);
+	ASSERT_EQ(err, 123, "err");
+
+cleanup:
+	kprobe_multi_override__destroy(skel);
+}
+
 #ifdef __x86_64__
 static void test_attach_write_ctx(void)
 {
@@ -597,6 +639,8 @@ void test_kprobe_multi_test(void)
 		test_attach_api_fails();
 	if (test__start_subtest("attach_override"))
 		test_attach_override();
+	if (test__start_subtest("override"))
+		test_override();
 	if (test__start_subtest("session"))
 		test_session_skel_api();
 	if (test__start_subtest("session_cookie"))
diff --git a/tools/testing/selftests/bpf/progs/kprobe_multi_override.c b/tools/testing/selftests/bpf/progs/kprobe_multi_override.c
index 28f8487c9059..14f39fa6d515 100644
--- a/tools/testing/selftests/bpf/progs/kprobe_multi_override.c
+++ b/tools/testing/selftests/bpf/progs/kprobe_multi_override.c
@@ -5,9 +5,24 @@
 
 char _license[] SEC("license") = "GPL";
 
+int pid = 0;
+
 SEC("kprobe.multi")
 int test_override(struct pt_regs *ctx)
 {
+	if (bpf_get_current_pid_tgid() >> 32 != pid)
+		return 0;
+
+	bpf_override_return(ctx, 123);
+	return 0;
+}
+
+SEC("kprobe")
+int test_kprobe_override(struct pt_regs *ctx)
+{
+	if (bpf_get_current_pid_tgid() >> 32 != pid)
+		return 0;
+
 	bpf_override_return(ctx, 123);
 	return 0;
 }
diff --git a/tools/testing/selftests/bpf/trace_helpers.h b/tools/testing/selftests/bpf/trace_helpers.h
index 9437bdd4afa5..a5576b2dfc26 100644
--- a/tools/testing/selftests/bpf/trace_helpers.h
+++ b/tools/testing/selftests/bpf/trace_helpers.h
@@ -4,6 +4,18 @@
 
 #include <bpf/libbpf.h>
 
+#ifdef __x86_64__
+#define SYS_PREFIX "__x64_"
+#elif defined(__s390x__)
+#define SYS_PREFIX "__s390x_"
+#elif defined(__aarch64__)
+#define SYS_PREFIX "__arm64_"
+#elif defined(__riscv)
+#define SYS_PREFIX "__riscv_"
+#else
+#define SYS_PREFIX ""
+#endif
+
 #define __ALIGN_MASK(x, mask)	(((x)+(mask))&~(mask))
 #define ALIGN(x, a)		__ALIGN_MASK(x, (typeof(x))(a)-1)
 
-- 
cgit v1.2.3


From a63e5fe0959200afcfefa7640db44c491f102c4c Mon Sep 17 00:00:00 2001
From: Michal Luczaj <mhal@rbox.co>
Date: Tue, 13 Jan 2026 16:08:19 +0100
Subject: vsock/test: Add test for a linear and non-linear skb getting
 coalesced

Loopback transport can mangle data in rx queue when a linear skb is
followed by a small MSG_ZEROCOPY packet.

To exercise the logic, send out two packets: a weirdly sized one (to ensure
some spare tail room in the skb) and a zerocopy one that's small enough to
fit in the spare room of its predecessor. Then, wait for both to land in
the rx queue, and check the data received. Faulty packets merger manifests
itself by corrupting payload of the later packet.

Signed-off-by: Michal Luczaj <mhal@rbox.co>
Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
Link: https://patch.msgid.link/20260113-vsock-recv-coalescence-v2-2-552b17837cf4@rbox.co
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/vsock/vsock_test.c          |  5 +++
 tools/testing/vsock/vsock_test_zerocopy.c | 74 +++++++++++++++++++++++++++++++
 tools/testing/vsock/vsock_test_zerocopy.h |  3 ++
 3 files changed, 82 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c
index bbe3723babdc..27e39354499a 100644
--- a/tools/testing/vsock/vsock_test.c
+++ b/tools/testing/vsock/vsock_test.c
@@ -2403,6 +2403,11 @@ static struct test_case test_cases[] = {
 		.run_client = test_stream_accepted_setsockopt_client,
 		.run_server = test_stream_accepted_setsockopt_server,
 	},
+	{
+		.name = "SOCK_STREAM virtio MSG_ZEROCOPY coalescence corruption",
+		.run_client = test_stream_msgzcopy_mangle_client,
+		.run_server = test_stream_msgzcopy_mangle_server,
+	},
 	{},
 };
 
diff --git a/tools/testing/vsock/vsock_test_zerocopy.c b/tools/testing/vsock/vsock_test_zerocopy.c
index 9d9a6cb9614a..a31ddfc1cd0c 100644
--- a/tools/testing/vsock/vsock_test_zerocopy.c
+++ b/tools/testing/vsock/vsock_test_zerocopy.c
@@ -9,14 +9,18 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <sys/ioctl.h>
 #include <sys/mman.h>
 #include <unistd.h>
 #include <poll.h>
 #include <linux/errqueue.h>
 #include <linux/kernel.h>
+#include <linux/sockios.h>
+#include <linux/time64.h>
 #include <errno.h>
 
 #include "control.h"
+#include "timeout.h"
 #include "vsock_test_zerocopy.h"
 #include "msg_zerocopy_common.h"
 
@@ -356,3 +360,73 @@ void test_stream_msgzcopy_empty_errq_server(const struct test_opts *opts)
 	control_expectln("DONE");
 	close(fd);
 }
+
+#define GOOD_COPY_LEN	128	/* net/vmw_vsock/virtio_transport_common.c */
+
+void test_stream_msgzcopy_mangle_client(const struct test_opts *opts)
+{
+	char sbuf1[PAGE_SIZE + 1], sbuf2[GOOD_COPY_LEN];
+	unsigned long hash;
+	struct pollfd fds;
+	int fd, i;
+
+	fd = vsock_stream_connect(opts->peer_cid, opts->peer_port);
+	if (fd < 0) {
+		perror("connect");
+		exit(EXIT_FAILURE);
+	}
+
+	enable_so_zerocopy_check(fd);
+
+	memset(sbuf1, 'x', sizeof(sbuf1));
+	send_buf(fd, sbuf1, sizeof(sbuf1), 0, sizeof(sbuf1));
+
+	for (i = 0; i < sizeof(sbuf2); i++)
+		sbuf2[i] = rand() & 0xff;
+
+	send_buf(fd, sbuf2, sizeof(sbuf2), MSG_ZEROCOPY, sizeof(sbuf2));
+
+	hash = hash_djb2(sbuf2, sizeof(sbuf2));
+	control_writeulong(hash);
+
+	fds.fd = fd;
+	fds.events = 0;
+
+	if (poll(&fds, 1, TIMEOUT * MSEC_PER_SEC) != 1 ||
+	    !(fds.revents & POLLERR)) {
+		perror("poll");
+		exit(EXIT_FAILURE);
+	}
+
+	close(fd);
+}
+
+void test_stream_msgzcopy_mangle_server(const struct test_opts *opts)
+{
+	unsigned long local_hash, remote_hash;
+	char rbuf[PAGE_SIZE + 1];
+	int fd;
+
+	fd = vsock_stream_accept(VMADDR_CID_ANY, opts->peer_port, NULL);
+	if (fd < 0) {
+		perror("accept");
+		exit(EXIT_FAILURE);
+	}
+
+	/* Wait, don't race the (buggy) skbs coalescence. */
+	vsock_ioctl_int(fd, SIOCINQ, PAGE_SIZE + 1 + GOOD_COPY_LEN);
+
+	/* Discard the first packet. */
+	recv_buf(fd, rbuf, PAGE_SIZE + 1, 0, PAGE_SIZE + 1);
+
+	recv_buf(fd, rbuf, GOOD_COPY_LEN, 0, GOOD_COPY_LEN);
+	remote_hash = control_readulong();
+	local_hash = hash_djb2(rbuf, GOOD_COPY_LEN);
+
+	if (local_hash != remote_hash) {
+		fprintf(stderr, "Data received corrupted\n");
+		exit(EXIT_FAILURE);
+	}
+
+	close(fd);
+}
diff --git a/tools/testing/vsock/vsock_test_zerocopy.h b/tools/testing/vsock/vsock_test_zerocopy.h
index 3ef2579e024d..d46c91a69f16 100644
--- a/tools/testing/vsock/vsock_test_zerocopy.h
+++ b/tools/testing/vsock/vsock_test_zerocopy.h
@@ -12,4 +12,7 @@ void test_seqpacket_msgzcopy_server(const struct test_opts *opts);
 void test_stream_msgzcopy_empty_errq_client(const struct test_opts *opts);
 void test_stream_msgzcopy_empty_errq_server(const struct test_opts *opts);
 
+void test_stream_msgzcopy_mangle_client(const struct test_opts *opts);
+void test_stream_msgzcopy_mangle_server(const struct test_opts *opts);
+
 #endif /* VSOCK_TEST_ZEROCOPY_H */
-- 
cgit v1.2.3


From 4f5f148dd7c0459229d2ab9a769b2e820f9ee6a2 Mon Sep 17 00:00:00 2001
From: "Ricardo B. Marlière" <rbm@suse.com>
Date: Tue, 13 Jan 2026 12:37:44 -0300
Subject: selftests: net: fib-onlink-tests: Convert to use namespaces by
 default
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently, the test breaks if the SUT already has a default route
configured for IPv6. Fix by avoiding the use of the default namespace.

Fixes: 4ed591c8ab44 ("net/ipv6: Allow onlink routes to have a device mismatch if it is the default route")
Suggested-by: Fernando Fernandez Mancera <fmancera@suse.de>
Signed-off-by: Ricardo B. Marlière <rbm@suse.com>
Reviewed-by: Ido Schimmel <idosch@nvidia.com>
Reviewed-by: Fernando Fernandez Mancera <fmancera@suse.de>
Link: https://patch.msgid.link/20260113-selftests-net-fib-onlink-v2-1-89de2b931389@suse.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/fib-onlink-tests.sh | 71 +++++++++++--------------
 1 file changed, 30 insertions(+), 41 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/fib-onlink-tests.sh b/tools/testing/selftests/net/fib-onlink-tests.sh
index ec2d6ceb1f08..c01be076b210 100755
--- a/tools/testing/selftests/net/fib-onlink-tests.sh
+++ b/tools/testing/selftests/net/fib-onlink-tests.sh
@@ -120,7 +120,7 @@ log_subsection()
 
 run_cmd()
 {
-	local cmd="$*"
+	local cmd="$1"
 	local out
 	local rc
 
@@ -145,7 +145,7 @@ get_linklocal()
 	local pfx
 	local addr
 
-	addr=$(${pfx} ip -6 -br addr show dev ${dev} | \
+	addr=$(${pfx} ${IP} -6 -br addr show dev ${dev} | \
 	awk '{
 		for (i = 3; i <= NF; ++i) {
 			if ($i ~ /^fe80/)
@@ -173,58 +173,48 @@ setup()
 
 	set -e
 
-	# create namespace
-	setup_ns PEER_NS
+	# create namespaces
+	setup_ns ns1
+	IP="ip -netns $ns1"
+	setup_ns ns2
 
 	# add vrf table
-	ip li add ${VRF} type vrf table ${VRF_TABLE}
-	ip li set ${VRF} up
-	ip ro add table ${VRF_TABLE} unreachable default metric 8192
-	ip -6 ro add table ${VRF_TABLE} unreachable default metric 8192
+	${IP} li add ${VRF} type vrf table ${VRF_TABLE}
+	${IP} li set ${VRF} up
+	${IP} ro add table ${VRF_TABLE} unreachable default metric 8192
+	${IP} -6 ro add table ${VRF_TABLE} unreachable default metric 8192
 
 	# create test interfaces
-	ip li add ${NETIFS[p1]} type veth peer name ${NETIFS[p2]}
-	ip li add ${NETIFS[p3]} type veth peer name ${NETIFS[p4]}
-	ip li add ${NETIFS[p5]} type veth peer name ${NETIFS[p6]}
-	ip li add ${NETIFS[p7]} type veth peer name ${NETIFS[p8]}
+	${IP} li add ${NETIFS[p1]} type veth peer name ${NETIFS[p2]}
+	${IP} li add ${NETIFS[p3]} type veth peer name ${NETIFS[p4]}
+	${IP} li add ${NETIFS[p5]} type veth peer name ${NETIFS[p6]}
+	${IP} li add ${NETIFS[p7]} type veth peer name ${NETIFS[p8]}
 
 	# enslave vrf interfaces
 	for n in 5 7; do
-		ip li set ${NETIFS[p${n}]} vrf ${VRF}
+		${IP} li set ${NETIFS[p${n}]} vrf ${VRF}
 	done
 
 	# add addresses
 	for n in 1 3 5 7; do
-		ip li set ${NETIFS[p${n}]} up
-		ip addr add ${V4ADDRS[p${n}]}/24 dev ${NETIFS[p${n}]}
-		ip addr add ${V6ADDRS[p${n}]}/64 dev ${NETIFS[p${n}]} nodad
+		${IP} li set ${NETIFS[p${n}]} up
+		${IP} addr add ${V4ADDRS[p${n}]}/24 dev ${NETIFS[p${n}]}
+		${IP} addr add ${V6ADDRS[p${n}]}/64 dev ${NETIFS[p${n}]} nodad
 	done
 
 	# move peer interfaces to namespace and add addresses
 	for n in 2 4 6 8; do
-		ip li set ${NETIFS[p${n}]} netns ${PEER_NS} up
-		ip -netns ${PEER_NS} addr add ${V4ADDRS[p${n}]}/24 dev ${NETIFS[p${n}]}
-		ip -netns ${PEER_NS} addr add ${V6ADDRS[p${n}]}/64 dev ${NETIFS[p${n}]} nodad
+		${IP} li set ${NETIFS[p${n}]} netns ${ns2} up
+		ip -netns $ns2 addr add ${V4ADDRS[p${n}]}/24 dev ${NETIFS[p${n}]}
+		ip -netns $ns2 addr add ${V6ADDRS[p${n}]}/64 dev ${NETIFS[p${n}]} nodad
 	done
 
-	ip -6 ro add default via ${V6ADDRS[p3]/::[0-9]/::64}
-	ip -6 ro add table ${VRF_TABLE} default via ${V6ADDRS[p7]/::[0-9]/::64}
+	${IP} -6 ro add default via ${V6ADDRS[p3]/::[0-9]/::64}
+	${IP} -6 ro add table ${VRF_TABLE} default via ${V6ADDRS[p7]/::[0-9]/::64}
 
 	set +e
 }
 
-cleanup()
-{
-	# make sure we start from a clean slate
-	cleanup_ns ${PEER_NS} 2>/dev/null
-	for n in 1 3 5 7; do
-		ip link del ${NETIFS[p${n}]} 2>/dev/null
-	done
-	ip link del ${VRF} 2>/dev/null
-	ip ro flush table ${VRF_TABLE}
-	ip -6 ro flush table ${VRF_TABLE}
-}
-
 ################################################################################
 # IPv4 tests
 #
@@ -241,7 +231,7 @@ run_ip()
 	# dev arg may be empty
 	[ -n "${dev}" ] && dev="dev ${dev}"
 
-	run_cmd ip ro add table "${table}" "${prefix}"/32 via "${gw}" "${dev}" onlink
+	run_cmd "${IP} ro add table ${table} ${prefix}/32 via ${gw} ${dev} onlink"
 	log_test $? ${exp_rc} "${desc}"
 }
 
@@ -257,8 +247,8 @@ run_ip_mpath()
 	# dev arg may be empty
 	[ -n "${dev}" ] && dev="dev ${dev}"
 
-	run_cmd ip ro add table "${table}" "${prefix}"/32 \
-		nexthop via ${nh1} nexthop via ${nh2}
+	run_cmd "${IP} ro add table ${table} ${prefix}/32 \
+		nexthop via ${nh1} nexthop via ${nh2}"
 	log_test $? ${exp_rc} "${desc}"
 }
 
@@ -339,7 +329,7 @@ run_ip6()
 	# dev arg may be empty
 	[ -n "${dev}" ] && dev="dev ${dev}"
 
-	run_cmd ip -6 ro add table "${table}" "${prefix}"/128 via "${gw}" "${dev}" onlink
+	run_cmd "${IP} -6 ro add table ${table} ${prefix}/128 via ${gw} ${dev} onlink"
 	log_test $? ${exp_rc} "${desc}"
 }
 
@@ -353,8 +343,8 @@ run_ip6_mpath()
 	local exp_rc="$6"
 	local desc="$7"
 
-	run_cmd ip -6 ro add table "${table}" "${prefix}"/128 "${opts}" \
-		nexthop via ${nh1} nexthop via ${nh2}
+	run_cmd "${IP} -6 ro add table ${table} ${prefix}/128 ${opts} \
+		nexthop via ${nh1} nexthop via ${nh2}"
 	log_test $? ${exp_rc} "${desc}"
 }
 
@@ -491,10 +481,9 @@ do
 	esac
 done
 
-cleanup
 setup
 run_onlink_tests
-cleanup
+cleanup_ns ${ns1} ${ns2}
 
 if [ "$TESTS" != "none" ]; then
 	printf "\nTests passed: %3d\n" ${nsuccess}
-- 
cgit v1.2.3


From a91cc48246605af9aeef1edd32232976d74d9502 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Thu, 15 Jan 2026 09:21:54 -0800
Subject: KVM: selftests: Test READ=>WRITE dirty logging behavior for shadow
 MMU

Update the nested dirty log test to validate KVM's handling of READ faults
when dirty logging is enabled.  Specifically, set the Dirty bit in the
guest PTEs used to map L2 GPAs, so that KVM will create writable SPTEs
when handling L2 read faults.  When handling read faults in the shadow MMU,
KVM opportunistically creates a writable SPTE if the mapping can be
writable *and* the gPTE is dirty (or doesn't support the Dirty bit), i.e.
if KVM doesn't need to intercept writes in order to emulate Dirty-bit
updates.

To actually test the L2 READ=>WRITE sequence, e.g. without masking a false
pass by other test activity, route the READ=>WRITE and WRITE=>WRITE
sequences to separate L1 pages, and differentiate between "marked dirty
due to a WRITE access/fault" and "marked dirty due to creating a writable
SPTE for a READ access/fault".  The updated sequence exposes the bug fixed
by KVM commit 1f4e5fc83a42 ("KVM: x86: fix nested guest live migration
with PML") when the guest performs a READ=>WRITE sequence with dirty guest
PTEs.

Opportunistically tweak and rename the address macros, and add comments,
to make it more obvious what the test is doing.  E.g. NESTED_TEST_MEM1
vs. GUEST_TEST_MEM doesn't make it all that obvious that the test is
creating aliases in both the L2 GPA and GVA address spaces, but only when
L1 is using TDP to run L2.

Cc: Yosry Ahmed <yosry.ahmed@linux.dev>
Reviewed-by: Yosry Ahmed <yosry.ahmed@linux.dev>
Link: https://patch.msgid.link/20260115172154.709024-1-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 .../testing/selftests/kvm/include/x86/processor.h  |   1 +
 tools/testing/selftests/kvm/lib/x86/processor.c    |   7 +
 .../selftests/kvm/x86/nested_dirty_log_test.c      | 187 +++++++++++++++------
 3 files changed, 143 insertions(+), 52 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/include/x86/processor.h b/tools/testing/selftests/kvm/include/x86/processor.h
index 6bfffc3b0a33..4ebae4269e68 100644
--- a/tools/testing/selftests/kvm/include/x86/processor.h
+++ b/tools/testing/selftests/kvm/include/x86/processor.h
@@ -1486,6 +1486,7 @@ bool kvm_cpu_has_tdp(void);
 void tdp_map(struct kvm_vm *vm, uint64_t nested_paddr, uint64_t paddr, uint64_t size);
 void tdp_identity_map_default_memslots(struct kvm_vm *vm);
 void tdp_identity_map_1g(struct kvm_vm *vm,  uint64_t addr, uint64_t size);
+uint64_t *tdp_get_pte(struct kvm_vm *vm, uint64_t l2_gpa);
 
 /*
  * Basic CPU control in CR0
diff --git a/tools/testing/selftests/kvm/lib/x86/processor.c b/tools/testing/selftests/kvm/lib/x86/processor.c
index ab869a98bbdc..fab18e9be66c 100644
--- a/tools/testing/selftests/kvm/lib/x86/processor.c
+++ b/tools/testing/selftests/kvm/lib/x86/processor.c
@@ -390,6 +390,13 @@ static uint64_t *__vm_get_page_table_entry(struct kvm_vm *vm,
 	return virt_get_pte(vm, mmu, pte, vaddr, PG_LEVEL_4K);
 }
 
+uint64_t *tdp_get_pte(struct kvm_vm *vm, uint64_t l2_gpa)
+{
+	int level = PG_LEVEL_4K;
+
+	return __vm_get_page_table_entry(vm, &vm->stage2_mmu, l2_gpa, &level);
+}
+
 uint64_t *vm_get_pte(struct kvm_vm *vm, uint64_t vaddr)
 {
 	int level = PG_LEVEL_4K;
diff --git a/tools/testing/selftests/kvm/x86/nested_dirty_log_test.c b/tools/testing/selftests/kvm/x86/nested_dirty_log_test.c
index 89d2e86a0db9..619229bbd693 100644
--- a/tools/testing/selftests/kvm/x86/nested_dirty_log_test.c
+++ b/tools/testing/selftests/kvm/x86/nested_dirty_log_test.c
@@ -17,29 +17,54 @@
 
 /* The memory slot index to track dirty pages */
 #define TEST_MEM_SLOT_INDEX		1
-#define TEST_MEM_PAGES			3
 
-/* L1 guest test virtual memory offset */
-#define GUEST_TEST_MEM			0xc0000000
+/*
+ * Allocate four pages total.  Two pages are used to verify that the KVM marks
+ * the accessed page/GFN as marked dirty, but not the "other" page.  Times two
+ * so that each "normal" page can be accessed from L2 via an aliased L2 GVA+GPA
+ * (when TDP is enabled), to verify KVM marks _L1's_ page/GFN as dirty (to
+ * detect failures, L2 => L1 GPAs can't be identity mapped in the TDP page
+ * tables, as marking L2's GPA dirty would get a false pass if L1 == L2).
+ */
+#define TEST_MEM_PAGES			4
+
+#define TEST_MEM_BASE			0xc0000000
+#define TEST_MEM_ALIAS_BASE		0xc0002000
+
+#define TEST_GUEST_ADDR(base, idx)	((base) + (idx) * PAGE_SIZE)
 
-/* L2 guest test virtual memory offset */
-#define NESTED_TEST_MEM1		0xc0001000
-#define NESTED_TEST_MEM2		0xc0002000
+#define TEST_GVA(idx)			TEST_GUEST_ADDR(TEST_MEM_BASE, idx)
+#define TEST_GPA(idx)			TEST_GUEST_ADDR(TEST_MEM_BASE, idx)
+
+#define TEST_ALIAS_GPA(idx)		TEST_GUEST_ADDR(TEST_MEM_ALIAS_BASE, idx)
+
+#define TEST_HVA(vm, idx)		addr_gpa2hva(vm, TEST_GPA(idx))
 
 #define L2_GUEST_STACK_SIZE 64
 
-static void l2_guest_code(u64 *a, u64 *b)
-{
-	READ_ONCE(*a);
-	WRITE_ONCE(*a, 1);
-	GUEST_SYNC(true);
-	GUEST_SYNC(false);
+/* Use the page offset bits to communicate the access+fault type. */
+#define TEST_SYNC_READ_FAULT		BIT(0)
+#define TEST_SYNC_WRITE_FAULT		BIT(1)
+#define TEST_SYNC_NO_FAULT		BIT(2)
 
-	WRITE_ONCE(*b, 1);
-	GUEST_SYNC(true);
-	WRITE_ONCE(*b, 1);
-	GUEST_SYNC(true);
-	GUEST_SYNC(false);
+static void l2_guest_code(vm_vaddr_t base)
+{
+	vm_vaddr_t page0 = TEST_GUEST_ADDR(base, 0);
+	vm_vaddr_t page1 = TEST_GUEST_ADDR(base, 1);
+
+	READ_ONCE(*(u64 *)page0);
+	GUEST_SYNC(page0 | TEST_SYNC_READ_FAULT);
+	WRITE_ONCE(*(u64 *)page0, 1);
+	GUEST_SYNC(page0 | TEST_SYNC_WRITE_FAULT);
+	READ_ONCE(*(u64 *)page0);
+	GUEST_SYNC(page0 | TEST_SYNC_NO_FAULT);
+
+	WRITE_ONCE(*(u64 *)page1, 1);
+	GUEST_SYNC(page1 | TEST_SYNC_WRITE_FAULT);
+	WRITE_ONCE(*(u64 *)page1, 1);
+	GUEST_SYNC(page1 | TEST_SYNC_WRITE_FAULT);
+	READ_ONCE(*(u64 *)page1);
+	GUEST_SYNC(page1 | TEST_SYNC_NO_FAULT);
 
 	/* Exit to L1 and never come back.  */
 	vmcall();
@@ -47,13 +72,22 @@ static void l2_guest_code(u64 *a, u64 *b)
 
 static void l2_guest_code_tdp_enabled(void)
 {
-	l2_guest_code((u64 *)NESTED_TEST_MEM1, (u64 *)NESTED_TEST_MEM2);
+	/*
+	 * Use the aliased virtual addresses when running with TDP to verify
+	 * that KVM correctly handles the case where a page is dirtied via a
+	 * different GPA than would be used by L1.
+	 */
+	l2_guest_code(TEST_MEM_ALIAS_BASE);
 }
 
 static void l2_guest_code_tdp_disabled(void)
 {
-	/* Access the same L1 GPAs as l2_guest_code_tdp_enabled() */
-	l2_guest_code((u64 *)GUEST_TEST_MEM, (u64 *)GUEST_TEST_MEM);
+	/*
+	 * Use the "normal" virtual addresses when running without TDP enabled,
+	 * in which case L2 will use the same page tables as L1, and thus needs
+	 * to use the same virtual addresses that are mapped into L1.
+	 */
+	l2_guest_code(TEST_MEM_BASE);
 }
 
 void l1_vmx_code(struct vmx_pages *vmx)
@@ -72,9 +106,9 @@ void l1_vmx_code(struct vmx_pages *vmx)
 
 	prepare_vmcs(vmx, l2_rip, &l2_guest_stack[L2_GUEST_STACK_SIZE]);
 
-	GUEST_SYNC(false);
+	GUEST_SYNC(TEST_SYNC_NO_FAULT);
 	GUEST_ASSERT(!vmlaunch());
-	GUEST_SYNC(false);
+	GUEST_SYNC(TEST_SYNC_NO_FAULT);
 	GUEST_ASSERT_EQ(vmreadz(VM_EXIT_REASON), EXIT_REASON_VMCALL);
 	GUEST_DONE();
 }
@@ -91,9 +125,9 @@ static void l1_svm_code(struct svm_test_data *svm)
 
 	generic_svm_setup(svm, l2_rip, &l2_guest_stack[L2_GUEST_STACK_SIZE]);
 
-	GUEST_SYNC(false);
+	GUEST_SYNC(TEST_SYNC_NO_FAULT);
 	run_guest(svm->vmcb, svm->vmcb_gpa);
-	GUEST_SYNC(false);
+	GUEST_SYNC(TEST_SYNC_NO_FAULT);
 	GUEST_ASSERT_EQ(svm->vmcb->control.exit_code, SVM_EXIT_VMMCALL);
 	GUEST_DONE();
 }
@@ -106,12 +140,66 @@ static void l1_guest_code(void *data)
 		l1_svm_code(data);
 }
 
+static void test_handle_ucall_sync(struct kvm_vm *vm, u64 arg,
+				   unsigned long *bmap)
+{
+	vm_vaddr_t gva = arg & ~(PAGE_SIZE - 1);
+	int page_nr, i;
+
+	/*
+	 * Extract the page number of underlying physical page, which is also
+	 * the _L1_ page number.  The dirty bitmap _must_ be updated based on
+	 * the L1 GPA, not L2 GPA, i.e. whether or not L2 used an aliased GPA
+	 * (i.e. if TDP enabled for L2) is irrelevant with respect to the dirty
+	 * bitmap and which underlying physical page is accessed.
+	 *
+	 * Note, gva will be '0' if there was no access, i.e. if the purpose of
+	 * the sync is to verify all pages are clean.
+	 */
+	if (!gva)
+		page_nr = 0;
+	else if (gva >= TEST_MEM_ALIAS_BASE)
+		page_nr = (gva - TEST_MEM_ALIAS_BASE) >> PAGE_SHIFT;
+	else
+		page_nr = (gva - TEST_MEM_BASE) >> PAGE_SHIFT;
+	TEST_ASSERT(page_nr == 0 || page_nr == 1,
+		    "Test bug, unexpected frame number '%u' for arg = %lx", page_nr, arg);
+	TEST_ASSERT(gva || (arg & TEST_SYNC_NO_FAULT),
+		    "Test bug, gva must be valid if a fault is expected");
+
+	kvm_vm_get_dirty_log(vm, TEST_MEM_SLOT_INDEX, bmap);
+
+	/*
+	 * Check all pages to verify the correct physical page was modified (or
+	 * not), and that all pages are clean/dirty as expected.
+	 *
+	 * If a fault of any kind is expected, the target page should be dirty
+	 * as the Dirty bit is set in the gPTE.  KVM should create a writable
+	 * SPTE even on a read fault, *and* KVM must mark the GFN as dirty
+	 * when doing so.
+	 */
+	for (i = 0; i < TEST_MEM_PAGES; i++) {
+		if (i == page_nr && (arg & TEST_SYNC_WRITE_FAULT))
+			TEST_ASSERT(*(u64 *)TEST_HVA(vm, i) == 1,
+				    "Page %u incorrectly not written by guest", i);
+		else
+			TEST_ASSERT(*(u64 *)TEST_HVA(vm, i) == 0xaaaaaaaaaaaaaaaaULL,
+				    "Page %u incorrectly written by guest", i);
+
+		if (i == page_nr && !(arg & TEST_SYNC_NO_FAULT))
+			TEST_ASSERT(test_bit(i, bmap),
+				    "Page %u incorrectly reported clean on %s fault",
+				    i, arg & TEST_SYNC_READ_FAULT ? "read" : "write");
+		else
+			TEST_ASSERT(!test_bit(i, bmap),
+				    "Page %u incorrectly reported dirty", i);
+	}
+}
+
 static void test_dirty_log(bool nested_tdp)
 {
 	vm_vaddr_t nested_gva = 0;
 	unsigned long *bmap;
-	uint64_t *host_test_mem;
-
 	struct kvm_vcpu *vcpu;
 	struct kvm_vm *vm;
 	struct ucall uc;
@@ -133,35 +221,46 @@ static void test_dirty_log(bool nested_tdp)
 
 	/* Add an extra memory slot for testing dirty logging */
 	vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS,
-				    GUEST_TEST_MEM,
+				    TEST_MEM_BASE,
 				    TEST_MEM_SLOT_INDEX,
 				    TEST_MEM_PAGES,
 				    KVM_MEM_LOG_DIRTY_PAGES);
 
 	/*
-	 * Add an identity map for GVA range [0xc0000000, 0xc0002000).  This
+	 * Add an identity map for GVA range [0xc0000000, 0xc0004000).  This
 	 * affects both L1 and L2.  However...
 	 */
-	virt_map(vm, GUEST_TEST_MEM, GUEST_TEST_MEM, TEST_MEM_PAGES);
+	virt_map(vm, TEST_MEM_BASE, TEST_MEM_BASE, TEST_MEM_PAGES);
 
 	/*
-	 * ... pages in the L2 GPA range [0xc0001000, 0xc0003000) will map to
-	 * 0xc0000000.
+	 * ... pages in the L2 GPA address range [0xc0002000, 0xc0004000) will
+	 * map to [0xc0000000, 0xc0002000) when TDP is enabled (for L2).
 	 *
 	 * When TDP is disabled, the L2 guest code will still access the same L1
 	 * GPAs as the TDP enabled case.
+	 *
+	 * Set the Dirty bit in the PTEs used by L2 so that KVM will create
+	 * writable SPTEs when handling read faults (if the Dirty bit isn't
+	 * set, KVM must intercept the next write to emulate the Dirty bit
+	 * update).
 	 */
 	if (nested_tdp) {
 		tdp_identity_map_default_memslots(vm);
-		tdp_map(vm, NESTED_TEST_MEM1, GUEST_TEST_MEM, PAGE_SIZE);
-		tdp_map(vm, NESTED_TEST_MEM2, GUEST_TEST_MEM, PAGE_SIZE);
+		tdp_map(vm, TEST_ALIAS_GPA(0), TEST_GPA(0), PAGE_SIZE);
+		tdp_map(vm, TEST_ALIAS_GPA(1), TEST_GPA(1), PAGE_SIZE);
+
+		*tdp_get_pte(vm, TEST_ALIAS_GPA(0)) |= PTE_DIRTY_MASK(&vm->stage2_mmu);
+		*tdp_get_pte(vm, TEST_ALIAS_GPA(1)) |= PTE_DIRTY_MASK(&vm->stage2_mmu);
+	} else {
+		*vm_get_pte(vm, TEST_GVA(0)) |= PTE_DIRTY_MASK(&vm->mmu);
+		*vm_get_pte(vm, TEST_GVA(1)) |= PTE_DIRTY_MASK(&vm->mmu);
 	}
 
 	bmap = bitmap_zalloc(TEST_MEM_PAGES);
-	host_test_mem = addr_gpa2hva(vm, GUEST_TEST_MEM);
 
 	while (!done) {
-		memset(host_test_mem, 0xaa, TEST_MEM_PAGES * PAGE_SIZE);
+		memset(TEST_HVA(vm, 0), 0xaa, TEST_MEM_PAGES * PAGE_SIZE);
+
 		vcpu_run(vcpu);
 		TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
 
@@ -170,23 +269,7 @@ static void test_dirty_log(bool nested_tdp)
 			REPORT_GUEST_ASSERT(uc);
 			/* NOT REACHED */
 		case UCALL_SYNC:
-			/*
-			 * The nested guest wrote at offset 0x1000 in the memslot, but the
-			 * dirty bitmap must be filled in according to L1 GPA, not L2.
-			 */
-			kvm_vm_get_dirty_log(vm, TEST_MEM_SLOT_INDEX, bmap);
-			if (uc.args[1]) {
-				TEST_ASSERT(test_bit(0, bmap), "Page 0 incorrectly reported clean");
-				TEST_ASSERT(host_test_mem[0] == 1, "Page 0 not written by guest");
-			} else {
-				TEST_ASSERT(!test_bit(0, bmap), "Page 0 incorrectly reported dirty");
-				TEST_ASSERT(host_test_mem[0] == 0xaaaaaaaaaaaaaaaaULL, "Page 0 written by guest");
-			}
-
-			TEST_ASSERT(!test_bit(1, bmap), "Page 1 incorrectly reported dirty");
-			TEST_ASSERT(host_test_mem[PAGE_SIZE / 8] == 0xaaaaaaaaaaaaaaaaULL, "Page 1 written by guest");
-			TEST_ASSERT(!test_bit(2, bmap), "Page 2 incorrectly reported dirty");
-			TEST_ASSERT(host_test_mem[PAGE_SIZE*2 / 8] == 0xaaaaaaaaaaaaaaaaULL, "Page 2 written by guest");
+			test_handle_ucall_sync(vm, uc.args[1], bmap);
 			break;
 		case UCALL_DONE:
 			done = true;
-- 
cgit v1.2.3


From 086c99fbe45070d02851427eab5ae26fe7d0f3c0 Mon Sep 17 00:00:00 2001
From: Puranjay Mohan <puranjay@kernel.org>
Date: Thu, 15 Jan 2026 07:11:41 -0800
Subject: selftests: bpf: Add test for multiple syncs from linked register

Before the last commit, sync_linked_regs() corrupted the register whose
bounds are being updated by copying known_reg's id to it. The ids are
the same in value but known_reg has the BPF_ADD_CONST flag which is
wrongly copied to reg.

This later causes issues when creating new links to this reg.
assign_scalar_id_before_mov() sees this BPF_ADD_CONST and gives a new id
to this register and breaks the old links. This is exposed by the added
selftest.

Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
Tested-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20260115151143.1344724-3-puranjay@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 .../selftests/bpf/progs/verifier_linked_scalars.c  | 33 ++++++++++++++++++++++
 1 file changed, 33 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/progs/verifier_linked_scalars.c b/tools/testing/selftests/bpf/progs/verifier_linked_scalars.c
index 8f755d2464cf..5f41bbb730a7 100644
--- a/tools/testing/selftests/bpf/progs/verifier_linked_scalars.c
+++ b/tools/testing/selftests/bpf/progs/verifier_linked_scalars.c
@@ -31,4 +31,37 @@ l1:						\
 "	::: __clobber_all);
 }
 
+/*
+ * Test that sync_linked_regs() preserves register IDs.
+ *
+ * The sync_linked_regs() function copies bounds from known_reg to linked
+ * registers. When doing so, it must preserve each register's original id
+ * to allow subsequent syncs from the same source to work correctly.
+ *
+ */
+SEC("socket")
+__success
+__naked void sync_linked_regs_preserves_id(void)
+{
+	asm volatile ("						\
+	call %[bpf_get_prandom_u32];				\
+	r0 &= 0xff;	/* r0 in [0, 255] */			\
+	r1 = r0;	/* r0, r1 linked with id 1 */		\
+	r1 += 4;	/* r1 has id=1 and off=4 in [4, 259] */ \
+	if r1 < 10 goto l0_%=;					\
+	/* r1 in [10, 259], r0 synced to [6, 255] */		\
+	r2 = r0;	/* r2 has id=1 and in [6, 255] */	\
+	if r1 < 14 goto l0_%=;					\
+	/* r1 in [14, 259], r0 synced to [10, 255] */		\
+	if r0 >= 10 goto l0_%=;					\
+	/* Never executed */					\
+	r0 /= 0;						\
+l0_%=:								\
+	r0 = 0;							\
+	exit;							\
+"	:
+	: __imm(bpf_get_prandom_u32)
+	: __clobber_all);
+}
+
 char _license[] SEC("license") = "GPL";
-- 
cgit v1.2.3


From b8f7622aa6e32d6fd750697b99d8ce19ad8e66d0 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Mon, 29 Dec 2025 14:03:25 +0100
Subject: selftests/open_tree: add OPEN_TREE_NAMESPACE tests

Add tests for OPEN_TREE_NAMESPACE.

Link: https://patch.msgid.link/20251229-work-empty-namespace-v1-2-bfb24c7b061f@kernel.org
Tested-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 .../selftests/filesystems/open_tree_ns/.gitignore  |    1 +
 .../selftests/filesystems/open_tree_ns/Makefile    |   10 +
 .../filesystems/open_tree_ns/open_tree_ns_test.c   | 1030 ++++++++++++++++++++
 tools/testing/selftests/filesystems/utils.c        |   26 +
 tools/testing/selftests/filesystems/utils.h        |    1 +
 5 files changed, 1068 insertions(+)
 create mode 100644 tools/testing/selftests/filesystems/open_tree_ns/.gitignore
 create mode 100644 tools/testing/selftests/filesystems/open_tree_ns/Makefile
 create mode 100644 tools/testing/selftests/filesystems/open_tree_ns/open_tree_ns_test.c

(limited to 'tools')

diff --git a/tools/testing/selftests/filesystems/open_tree_ns/.gitignore b/tools/testing/selftests/filesystems/open_tree_ns/.gitignore
new file mode 100644
index 000000000000..fb12b93fbcaa
--- /dev/null
+++ b/tools/testing/selftests/filesystems/open_tree_ns/.gitignore
@@ -0,0 +1 @@
+open_tree_ns_test
diff --git a/tools/testing/selftests/filesystems/open_tree_ns/Makefile b/tools/testing/selftests/filesystems/open_tree_ns/Makefile
new file mode 100644
index 000000000000..73c03c4a7ef6
--- /dev/null
+++ b/tools/testing/selftests/filesystems/open_tree_ns/Makefile
@@ -0,0 +1,10 @@
+# SPDX-License-Identifier: GPL-2.0
+TEST_GEN_PROGS := open_tree_ns_test
+
+CFLAGS := -Wall -Werror -g $(KHDR_INCLUDES)
+LDLIBS := -lcap
+
+include ../../lib.mk
+
+$(OUTPUT)/open_tree_ns_test: open_tree_ns_test.c ../utils.c
+	$(CC) $(CFLAGS) -o $@ $^ $(LDLIBS)
diff --git a/tools/testing/selftests/filesystems/open_tree_ns/open_tree_ns_test.c b/tools/testing/selftests/filesystems/open_tree_ns/open_tree_ns_test.c
new file mode 100644
index 000000000000..9711556280ae
--- /dev/null
+++ b/tools/testing/selftests/filesystems/open_tree_ns/open_tree_ns_test.c
@@ -0,0 +1,1030 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Test for OPEN_TREE_NAMESPACE flag.
+ *
+ * Test that open_tree() with OPEN_TREE_NAMESPACE creates a new mount
+ * namespace containing the specified mount tree.
+ */
+#define _GNU_SOURCE
+
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <linux/nsfs.h>
+#include <sched.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/mount.h>
+#include <sys/stat.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include "../wrappers.h"
+#include "../statmount/statmount.h"
+#include "../utils.h"
+#include "../../kselftest_harness.h"
+
+#ifndef OPEN_TREE_NAMESPACE
+#define OPEN_TREE_NAMESPACE	(1 << 1)
+#endif
+
+static int get_mnt_ns_id(int fd, uint64_t *mnt_ns_id)
+{
+	if (ioctl(fd, NS_GET_MNTNS_ID, mnt_ns_id) < 0)
+		return -errno;
+	return 0;
+}
+
+static int get_mnt_ns_id_from_path(const char *path, uint64_t *mnt_ns_id)
+{
+	int fd, ret;
+
+	fd = open(path, O_RDONLY);
+	if (fd < 0)
+		return -errno;
+
+	ret = get_mnt_ns_id(fd, mnt_ns_id);
+	close(fd);
+	return ret;
+}
+
+#define STATMOUNT_BUFSIZE (1 << 15)
+
+static struct statmount *statmount_alloc(uint64_t mnt_id, uint64_t mnt_ns_id, uint64_t mask)
+{
+	struct statmount *buf;
+	size_t bufsize = STATMOUNT_BUFSIZE;
+	int ret;
+
+	for (;;) {
+		buf = malloc(bufsize);
+		if (!buf)
+			return NULL;
+
+		ret = statmount(mnt_id, mnt_ns_id, mask, buf, bufsize, 0);
+		if (ret == 0)
+			return buf;
+
+		free(buf);
+		if (errno != EOVERFLOW)
+			return NULL;
+
+		bufsize <<= 1;
+	}
+}
+
+static void log_mount(struct __test_metadata *_metadata, struct statmount *sm)
+{
+	const char *fs_type = "";
+	const char *mnt_root = "";
+	const char *mnt_point = "";
+
+	if (sm->mask & STATMOUNT_FS_TYPE)
+		fs_type = sm->str + sm->fs_type;
+	if (sm->mask & STATMOUNT_MNT_ROOT)
+		mnt_root = sm->str + sm->mnt_root;
+	if (sm->mask & STATMOUNT_MNT_POINT)
+		mnt_point = sm->str + sm->mnt_point;
+
+	TH_LOG("  mnt_id: %llu, parent_id: %llu, fs_type: %s, root: %s, point: %s",
+	       (unsigned long long)sm->mnt_id,
+	       (unsigned long long)sm->mnt_parent_id,
+	       fs_type, mnt_root, mnt_point);
+}
+
+static void dump_mounts(struct __test_metadata *_metadata, uint64_t mnt_ns_id)
+{
+	uint64_t list[256];
+	ssize_t nr_mounts;
+
+	nr_mounts = listmount(LSMT_ROOT, mnt_ns_id, 0, list, 256, 0);
+	if (nr_mounts < 0) {
+		TH_LOG("listmount failed: %s", strerror(errno));
+		return;
+	}
+
+	TH_LOG("Mount namespace %llu contains %zd mount(s):",
+	       (unsigned long long)mnt_ns_id, nr_mounts);
+
+	for (ssize_t i = 0; i < nr_mounts; i++) {
+		struct statmount *sm;
+
+		sm = statmount_alloc(list[i], mnt_ns_id,
+				     STATMOUNT_MNT_BASIC |
+				     STATMOUNT_FS_TYPE |
+				     STATMOUNT_MNT_ROOT |
+				     STATMOUNT_MNT_POINT);
+		if (!sm) {
+			TH_LOG("  [%zd] mnt_id %llu: statmount failed: %s",
+			       i, (unsigned long long)list[i], strerror(errno));
+			continue;
+		}
+
+		log_mount(_metadata, sm);
+		free(sm);
+	}
+}
+
+FIXTURE(open_tree_ns)
+{
+	int fd;
+	uint64_t current_ns_id;
+};
+
+FIXTURE_VARIANT(open_tree_ns)
+{
+	const char *path;
+	unsigned int flags;
+	bool expect_success;
+	bool expect_different_ns;
+	int min_mounts;
+};
+
+FIXTURE_VARIANT_ADD(open_tree_ns, basic_root)
+{
+	.path = "/",
+	.flags = OPEN_TREE_NAMESPACE | OPEN_TREE_CLOEXEC,
+	.expect_success = true,
+	.expect_different_ns = true,
+	/*
+	 * The empty rootfs is hidden from listmount()/mountinfo,
+	 * so we only see the bind mount on top of it.
+	 */
+	.min_mounts = 1,
+};
+
+FIXTURE_VARIANT_ADD(open_tree_ns, recursive_root)
+{
+	.path = "/",
+	.flags = OPEN_TREE_NAMESPACE | AT_RECURSIVE | OPEN_TREE_CLOEXEC,
+	.expect_success = true,
+	.expect_different_ns = true,
+	.min_mounts = 1,
+};
+
+FIXTURE_VARIANT_ADD(open_tree_ns, subdir_tmp)
+{
+	.path = "/tmp",
+	.flags = OPEN_TREE_NAMESPACE | OPEN_TREE_CLOEXEC,
+	.expect_success = true,
+	.expect_different_ns = true,
+	.min_mounts = 1,
+};
+
+FIXTURE_VARIANT_ADD(open_tree_ns, subdir_proc)
+{
+	.path = "/proc",
+	.flags = OPEN_TREE_NAMESPACE | OPEN_TREE_CLOEXEC,
+	.expect_success = true,
+	.expect_different_ns = true,
+	.min_mounts = 1,
+};
+
+FIXTURE_VARIANT_ADD(open_tree_ns, recursive_tmp)
+{
+	.path = "/tmp",
+	.flags = OPEN_TREE_NAMESPACE | AT_RECURSIVE | OPEN_TREE_CLOEXEC,
+	.expect_success = true,
+	.expect_different_ns = true,
+	.min_mounts = 1,
+};
+
+FIXTURE_VARIANT_ADD(open_tree_ns, recursive_run)
+{
+	.path = "/run",
+	.flags = OPEN_TREE_NAMESPACE | AT_RECURSIVE | OPEN_TREE_CLOEXEC,
+	.expect_success = true,
+	.expect_different_ns = true,
+	.min_mounts = 1,
+};
+
+FIXTURE_VARIANT_ADD(open_tree_ns, invalid_recursive_alone)
+{
+	.path = "/",
+	.flags = AT_RECURSIVE | OPEN_TREE_CLOEXEC,
+	.expect_success = false,
+	.expect_different_ns = false,
+	.min_mounts = 0,
+};
+
+FIXTURE_SETUP(open_tree_ns)
+{
+	int ret;
+
+	self->fd = -1;
+
+	/* Check if open_tree syscall is supported */
+	ret = sys_open_tree(-1, NULL, 0);
+	if (ret == -1 && errno == ENOSYS)
+		SKIP(return, "open_tree() syscall not supported");
+
+	/* Check if statmount/listmount are supported */
+	ret = statmount(0, 0, 0, NULL, 0, 0);
+	if (ret == -1 && errno == ENOSYS)
+		SKIP(return, "statmount() syscall not supported");
+
+	/* Get current mount namespace ID for comparison */
+	ret = get_mnt_ns_id_from_path("/proc/self/ns/mnt", &self->current_ns_id);
+	if (ret < 0)
+		SKIP(return, "Failed to get current mount namespace ID");
+}
+
+FIXTURE_TEARDOWN(open_tree_ns)
+{
+	if (self->fd >= 0)
+		close(self->fd);
+}
+
+TEST_F(open_tree_ns, create_namespace)
+{
+	uint64_t new_ns_id;
+	uint64_t list[256];
+	ssize_t nr_mounts;
+	int ret;
+
+	self->fd = sys_open_tree(AT_FDCWD, variant->path, variant->flags);
+
+	if (!variant->expect_success) {
+		ASSERT_LT(self->fd, 0);
+		ASSERT_EQ(errno, EINVAL);
+		return;
+	}
+
+	if (self->fd < 0 && errno == EINVAL)
+		SKIP(return, "OPEN_TREE_NAMESPACE not supported");
+
+	ASSERT_GE(self->fd, 0);
+
+	/* Verify we can get the namespace ID */
+	ret = get_mnt_ns_id(self->fd, &new_ns_id);
+	ASSERT_EQ(ret, 0);
+
+	/* Verify it's a different namespace */
+	if (variant->expect_different_ns)
+		ASSERT_NE(new_ns_id, self->current_ns_id);
+
+	/* List mounts in the new namespace */
+	nr_mounts = listmount(LSMT_ROOT, new_ns_id, 0, list, 256, 0);
+	ASSERT_GE(nr_mounts, 0) {
+		TH_LOG("%m - listmount failed");
+	}
+
+	/* Verify minimum expected mounts */
+	ASSERT_GE(nr_mounts, variant->min_mounts);
+	TH_LOG("Namespace contains %zd mounts", nr_mounts);
+}
+
+TEST_F(open_tree_ns, setns_into_namespace)
+{
+	uint64_t new_ns_id;
+	pid_t pid;
+	int status;
+	int ret;
+
+	/* Only test with basic flags */
+	if (!(variant->flags & OPEN_TREE_NAMESPACE))
+		SKIP(return, "setns test only for basic / case");
+
+	self->fd = sys_open_tree(AT_FDCWD, variant->path, variant->flags);
+	if (self->fd < 0 && errno == EINVAL)
+		SKIP(return, "OPEN_TREE_NAMESPACE not supported");
+
+	ASSERT_GE(self->fd, 0);
+
+	/* Get namespace ID and dump all mounts */
+	ret = get_mnt_ns_id(self->fd, &new_ns_id);
+	ASSERT_EQ(ret, 0);
+
+	dump_mounts(_metadata, new_ns_id);
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		/* Child: try to enter the namespace */
+		if (setns(self->fd, CLONE_NEWNS) < 0)
+			_exit(1);
+		_exit(0);
+	}
+
+	ASSERT_EQ(waitpid(pid, &status, 0), pid);
+	ASSERT_TRUE(WIFEXITED(status));
+	ASSERT_EQ(WEXITSTATUS(status), 0);
+}
+
+TEST_F(open_tree_ns, verify_mount_properties)
+{
+	struct statmount sm;
+	uint64_t new_ns_id;
+	uint64_t list[256];
+	ssize_t nr_mounts;
+	int ret;
+
+	/* Only test with basic flags on root */
+	if (variant->flags != (OPEN_TREE_NAMESPACE | OPEN_TREE_CLOEXEC) ||
+	    strcmp(variant->path, "/") != 0)
+		SKIP(return, "mount properties test only for basic / case");
+
+	self->fd = sys_open_tree(AT_FDCWD, "/", OPEN_TREE_NAMESPACE | OPEN_TREE_CLOEXEC);
+	if (self->fd < 0 && errno == EINVAL)
+		SKIP(return, "OPEN_TREE_NAMESPACE not supported");
+
+	ASSERT_GE(self->fd, 0);
+
+	ret = get_mnt_ns_id(self->fd, &new_ns_id);
+	ASSERT_EQ(ret, 0);
+
+	nr_mounts = listmount(LSMT_ROOT, new_ns_id, 0, list, 256, 0);
+	ASSERT_GE(nr_mounts, 1);
+
+	/* Get info about the root mount (the bind mount, rootfs is hidden) */
+	ret = statmount(list[0], new_ns_id, STATMOUNT_MNT_BASIC, &sm, sizeof(sm), 0);
+	ASSERT_EQ(ret, 0);
+
+	ASSERT_NE(sm.mnt_id, sm.mnt_parent_id);
+
+	TH_LOG("Root mount id: %llu, parent: %llu",
+	       (unsigned long long)sm.mnt_id,
+	       (unsigned long long)sm.mnt_parent_id);
+}
+
+FIXTURE(open_tree_ns_caps)
+{
+	bool has_caps;
+};
+
+FIXTURE_SETUP(open_tree_ns_caps)
+{
+	int ret;
+
+	/* Check if open_tree syscall is supported */
+	ret = sys_open_tree(-1, NULL, 0);
+	if (ret == -1 && errno == ENOSYS)
+		SKIP(return, "open_tree() syscall not supported");
+
+	self->has_caps = (geteuid() == 0);
+}
+
+FIXTURE_TEARDOWN(open_tree_ns_caps)
+{
+}
+
+TEST_F(open_tree_ns_caps, requires_cap_sys_admin)
+{
+	pid_t pid;
+	int status;
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		int fd;
+
+		/* Child: drop privileges using utils.h helper */
+		if (enter_userns() != 0)
+			_exit(2);
+
+		/* Drop all caps using utils.h helper */
+		if (caps_down() == 0)
+			_exit(3);
+
+		fd = sys_open_tree(AT_FDCWD, "/",
+				   OPEN_TREE_NAMESPACE | OPEN_TREE_CLOEXEC);
+		if (fd >= 0) {
+			close(fd);
+			/* Should have failed without caps */
+			_exit(1);
+		}
+
+		if (errno == EPERM)
+			_exit(0);
+
+		/* EINVAL means OPEN_TREE_NAMESPACE not supported */
+		if (errno == EINVAL)
+			_exit(4);
+
+		/* Unexpected error */
+		_exit(5);
+	}
+
+	ASSERT_EQ(waitpid(pid, &status, 0), pid);
+	ASSERT_TRUE(WIFEXITED(status));
+
+	switch (WEXITSTATUS(status)) {
+	case 0:
+		/* Expected: EPERM without caps */
+		break;
+	case 1:
+		ASSERT_FALSE(true) TH_LOG("OPEN_TREE_NAMESPACE succeeded without caps");
+		break;
+	case 2:
+		SKIP(return, "setup_userns failed");
+		break;
+	case 3:
+		SKIP(return, "caps_down failed");
+		break;
+	case 4:
+		SKIP(return, "OPEN_TREE_NAMESPACE not supported");
+		break;
+	default:
+		ASSERT_FALSE(true) TH_LOG("Unexpected error in child (exit %d)",
+					  WEXITSTATUS(status));
+		break;
+	}
+}
+
+FIXTURE(open_tree_ns_userns)
+{
+	int fd;
+};
+
+FIXTURE_SETUP(open_tree_ns_userns)
+{
+	int ret;
+
+	self->fd = -1;
+
+	/* Check if open_tree syscall is supported */
+	ret = sys_open_tree(-1, NULL, 0);
+	if (ret == -1 && errno == ENOSYS)
+		SKIP(return, "open_tree() syscall not supported");
+
+	/* Check if statmount/listmount are supported */
+	ret = statmount(0, 0, 0, NULL, 0, 0);
+	if (ret == -1 && errno == ENOSYS)
+		SKIP(return, "statmount() syscall not supported");
+}
+
+FIXTURE_TEARDOWN(open_tree_ns_userns)
+{
+	if (self->fd >= 0)
+		close(self->fd);
+}
+
+TEST_F(open_tree_ns_userns, create_in_userns)
+{
+	pid_t pid;
+	int status;
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		uint64_t new_ns_id;
+		uint64_t list[256];
+		ssize_t nr_mounts;
+		int fd;
+
+		/* Create new user namespace (also creates mount namespace) */
+		if (enter_userns() != 0)
+			_exit(2);
+
+		/* Now we have CAP_SYS_ADMIN in the user namespace */
+		fd = sys_open_tree(AT_FDCWD, "/",
+				   OPEN_TREE_NAMESPACE | OPEN_TREE_CLOEXEC);
+		if (fd < 0) {
+			if (errno == EINVAL)
+				_exit(4); /* OPEN_TREE_NAMESPACE not supported */
+			_exit(1);
+		}
+
+		/* Verify we can get the namespace ID */
+		if (get_mnt_ns_id(fd, &new_ns_id) != 0)
+			_exit(5);
+
+		/* Verify we can list mounts in the new namespace */
+		nr_mounts = listmount(LSMT_ROOT, new_ns_id, 0, list, 256, 0);
+		if (nr_mounts < 0)
+			_exit(6);
+
+		/* Should have at least 1 mount */
+		if (nr_mounts < 1)
+			_exit(7);
+
+		close(fd);
+		_exit(0);
+	}
+
+	ASSERT_EQ(waitpid(pid, &status, 0), pid);
+	ASSERT_TRUE(WIFEXITED(status));
+
+	switch (WEXITSTATUS(status)) {
+	case 0:
+		/* Success */
+		break;
+	case 1:
+		ASSERT_FALSE(true) TH_LOG("open_tree(OPEN_TREE_NAMESPACE) failed in userns");
+		break;
+	case 2:
+		SKIP(return, "setup_userns failed");
+		break;
+	case 4:
+		SKIP(return, "OPEN_TREE_NAMESPACE not supported");
+		break;
+	case 5:
+		ASSERT_FALSE(true) TH_LOG("Failed to get mount namespace ID");
+		break;
+	case 6:
+		ASSERT_FALSE(true) TH_LOG("listmount failed in new namespace");
+		break;
+	case 7:
+		ASSERT_FALSE(true) TH_LOG("New namespace has no mounts");
+		break;
+	default:
+		ASSERT_FALSE(true) TH_LOG("Unexpected error in child (exit %d)",
+					  WEXITSTATUS(status));
+		break;
+	}
+}
+
+TEST_F(open_tree_ns_userns, setns_in_userns)
+{
+	pid_t pid;
+	int status;
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		uint64_t new_ns_id;
+		int fd;
+		pid_t inner_pid;
+		int inner_status;
+
+		/* Create new user namespace */
+		if (enter_userns() != 0)
+			_exit(2);
+
+		fd = sys_open_tree(AT_FDCWD, "/",
+				   OPEN_TREE_NAMESPACE | OPEN_TREE_CLOEXEC);
+		if (fd < 0) {
+			if (errno == EINVAL)
+				_exit(4);
+			_exit(1);
+		}
+
+		if (get_mnt_ns_id(fd, &new_ns_id) != 0)
+			_exit(5);
+
+		/* Fork again to test setns into the new namespace */
+		inner_pid = fork();
+		if (inner_pid < 0)
+			_exit(8);
+
+		if (inner_pid == 0) {
+			/* Inner child: enter the new namespace */
+			if (setns(fd, CLONE_NEWNS) < 0)
+				_exit(1);
+			_exit(0);
+		}
+
+		if (waitpid(inner_pid, &inner_status, 0) != inner_pid)
+			_exit(9);
+
+		if (!WIFEXITED(inner_status) || WEXITSTATUS(inner_status) != 0)
+			_exit(10);
+
+		close(fd);
+		_exit(0);
+	}
+
+	ASSERT_EQ(waitpid(pid, &status, 0), pid);
+	ASSERT_TRUE(WIFEXITED(status));
+
+	switch (WEXITSTATUS(status)) {
+	case 0:
+		/* Success */
+		break;
+	case 1:
+		ASSERT_FALSE(true) TH_LOG("open_tree or setns failed in userns");
+		break;
+	case 2:
+		SKIP(return, "setup_userns failed");
+		break;
+	case 4:
+		SKIP(return, "OPEN_TREE_NAMESPACE not supported");
+		break;
+	case 5:
+		ASSERT_FALSE(true) TH_LOG("Failed to get mount namespace ID");
+		break;
+	case 8:
+		ASSERT_FALSE(true) TH_LOG("Inner fork failed");
+		break;
+	case 9:
+		ASSERT_FALSE(true) TH_LOG("Inner waitpid failed");
+		break;
+	case 10:
+		ASSERT_FALSE(true) TH_LOG("setns into new namespace failed");
+		break;
+	default:
+		ASSERT_FALSE(true) TH_LOG("Unexpected error in child (exit %d)",
+					  WEXITSTATUS(status));
+		break;
+	}
+}
+
+TEST_F(open_tree_ns_userns, recursive_in_userns)
+{
+	pid_t pid;
+	int status;
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		uint64_t new_ns_id;
+		uint64_t list[256];
+		ssize_t nr_mounts;
+		int fd;
+
+		/* Create new user namespace */
+		if (enter_userns() != 0)
+			_exit(2);
+
+		/* Test recursive flag in userns */
+		fd = sys_open_tree(AT_FDCWD, "/",
+				   OPEN_TREE_NAMESPACE | AT_RECURSIVE | OPEN_TREE_CLOEXEC);
+		if (fd < 0) {
+			if (errno == EINVAL)
+				_exit(4);
+			_exit(1);
+		}
+
+		if (get_mnt_ns_id(fd, &new_ns_id) != 0)
+			_exit(5);
+
+		nr_mounts = listmount(LSMT_ROOT, new_ns_id, 0, list, 256, 0);
+		if (nr_mounts < 0)
+			_exit(6);
+
+		/* Recursive should copy submounts too */
+		if (nr_mounts < 1)
+			_exit(7);
+
+		close(fd);
+		_exit(0);
+	}
+
+	ASSERT_EQ(waitpid(pid, &status, 0), pid);
+	ASSERT_TRUE(WIFEXITED(status));
+
+	switch (WEXITSTATUS(status)) {
+	case 0:
+		/* Success */
+		break;
+	case 1:
+		ASSERT_FALSE(true) TH_LOG("open_tree(OPEN_TREE_NAMESPACE|AT_RECURSIVE) failed in userns");
+		break;
+	case 2:
+		SKIP(return, "setup_userns failed");
+		break;
+	case 4:
+		SKIP(return, "OPEN_TREE_NAMESPACE not supported");
+		break;
+	case 5:
+		ASSERT_FALSE(true) TH_LOG("Failed to get mount namespace ID");
+		break;
+	case 6:
+		ASSERT_FALSE(true) TH_LOG("listmount failed in new namespace");
+		break;
+	case 7:
+		ASSERT_FALSE(true) TH_LOG("New namespace has no mounts");
+		break;
+	default:
+		ASSERT_FALSE(true) TH_LOG("Unexpected error in child (exit %d)",
+					  WEXITSTATUS(status));
+		break;
+	}
+}
+
+TEST_F(open_tree_ns_userns, umount_fails_einval)
+{
+	pid_t pid;
+	int status;
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		uint64_t new_ns_id;
+		uint64_t list[256];
+		ssize_t nr_mounts;
+		int fd;
+		ssize_t i;
+
+		/* Create new user namespace */
+		if (enter_userns() != 0)
+			_exit(2);
+
+		fd = sys_open_tree(AT_FDCWD, "/",
+				   OPEN_TREE_NAMESPACE | AT_RECURSIVE | OPEN_TREE_CLOEXEC);
+		if (fd < 0) {
+			if (errno == EINVAL)
+				_exit(4);
+			_exit(1);
+		}
+
+		if (get_mnt_ns_id(fd, &new_ns_id) != 0)
+			_exit(5);
+
+		/* Get all mounts in the new namespace */
+		nr_mounts = listmount(LSMT_ROOT, new_ns_id, 0, list, 256, LISTMOUNT_REVERSE);
+		if (nr_mounts < 0)
+			_exit(9);
+
+		if (nr_mounts < 1)
+			_exit(10);
+
+		/* Enter the new namespace */
+		if (setns(fd, CLONE_NEWNS) < 0)
+			_exit(6);
+
+		for (i = 0; i < nr_mounts; i++) {
+			struct statmount *sm;
+			const char *mnt_point;
+
+			sm = statmount_alloc(list[i], new_ns_id,
+					     STATMOUNT_MNT_POINT);
+			if (!sm)
+				_exit(11);
+
+			mnt_point = sm->str + sm->mnt_point;
+
+			TH_LOG("Trying to umount %s", mnt_point);
+			if (umount2(mnt_point, MNT_DETACH) == 0) {
+				free(sm);
+				_exit(7);
+			}
+
+			if (errno != EINVAL) {
+				/* Wrong error */
+				free(sm);
+				_exit(8);
+			}
+
+			free(sm);
+		}
+
+		close(fd);
+		_exit(0);
+	}
+
+	ASSERT_EQ(waitpid(pid, &status, 0), pid);
+	ASSERT_TRUE(WIFEXITED(status));
+
+	switch (WEXITSTATUS(status)) {
+	case 0:
+		break;
+	case 1:
+		ASSERT_FALSE(true) TH_LOG("open_tree(OPEN_TREE_NAMESPACE) failed");
+		break;
+	case 2:
+		SKIP(return, "setup_userns failed");
+		break;
+	case 4:
+		SKIP(return, "OPEN_TREE_NAMESPACE not supported");
+		break;
+	case 5:
+		ASSERT_FALSE(true) TH_LOG("Failed to get mount namespace ID");
+		break;
+	case 6:
+		ASSERT_FALSE(true) TH_LOG("setns into new namespace failed");
+		break;
+	case 7:
+		ASSERT_FALSE(true) TH_LOG("umount succeeded but should have failed with EINVAL");
+		break;
+	case 8:
+		ASSERT_FALSE(true) TH_LOG("umount failed with wrong error (expected EINVAL)");
+		break;
+	case 9:
+		ASSERT_FALSE(true) TH_LOG("listmount failed");
+		break;
+	case 10:
+		ASSERT_FALSE(true) TH_LOG("No mounts in new namespace");
+		break;
+	case 11:
+		ASSERT_FALSE(true) TH_LOG("statmount_alloc failed");
+		break;
+	default:
+		ASSERT_FALSE(true) TH_LOG("Unexpected error in child (exit %d)",
+					  WEXITSTATUS(status));
+		break;
+	}
+}
+
+TEST_F(open_tree_ns_userns, umount_succeeds)
+{
+	pid_t pid;
+	int status;
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		uint64_t new_ns_id;
+		uint64_t list[256];
+		ssize_t nr_mounts;
+		int fd;
+		ssize_t i;
+
+		if (unshare(CLONE_NEWNS))
+			_exit(1);
+
+		if (sys_mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL) != 0)
+			_exit(1);
+
+		fd = sys_open_tree(AT_FDCWD, "/",
+				   OPEN_TREE_NAMESPACE | AT_RECURSIVE | OPEN_TREE_CLOEXEC);
+		if (fd < 0) {
+			if (errno == EINVAL)
+				_exit(4);
+			_exit(1);
+		}
+
+		if (get_mnt_ns_id(fd, &new_ns_id) != 0)
+			_exit(5);
+
+		/* Get all mounts in the new namespace */
+		nr_mounts = listmount(LSMT_ROOT, new_ns_id, 0, list, 256, LISTMOUNT_REVERSE);
+		if (nr_mounts < 0)
+			_exit(9);
+
+		if (nr_mounts < 1)
+			_exit(10);
+
+		/* Enter the new namespace */
+		if (setns(fd, CLONE_NEWNS) < 0)
+			_exit(6);
+
+		for (i = 0; i < nr_mounts; i++) {
+			struct statmount *sm;
+			const char *mnt_point;
+
+			sm = statmount_alloc(list[i], new_ns_id,
+					     STATMOUNT_MNT_POINT);
+			if (!sm)
+				_exit(11);
+
+			mnt_point = sm->str + sm->mnt_point;
+
+			TH_LOG("Trying to umount %s", mnt_point);
+			if (umount2(mnt_point, MNT_DETACH) != 0) {
+				free(sm);
+				_exit(7);
+			}
+
+			free(sm);
+		}
+
+		close(fd);
+		_exit(0);
+	}
+
+	ASSERT_EQ(waitpid(pid, &status, 0), pid);
+	ASSERT_TRUE(WIFEXITED(status));
+
+	switch (WEXITSTATUS(status)) {
+	case 0:
+		break;
+	case 1:
+		ASSERT_FALSE(true) TH_LOG("open_tree(OPEN_TREE_NAMESPACE) failed");
+		break;
+	case 2:
+		SKIP(return, "setup_userns failed");
+		break;
+	case 4:
+		SKIP(return, "OPEN_TREE_NAMESPACE not supported");
+		break;
+	case 5:
+		ASSERT_FALSE(true) TH_LOG("Failed to get mount namespace ID");
+		break;
+	case 6:
+		ASSERT_FALSE(true) TH_LOG("setns into new namespace failed");
+		break;
+	case 7:
+		ASSERT_FALSE(true) TH_LOG("umount succeeded but should have failed with EINVAL");
+		break;
+	case 9:
+		ASSERT_FALSE(true) TH_LOG("listmount failed");
+		break;
+	case 10:
+		ASSERT_FALSE(true) TH_LOG("No mounts in new namespace");
+		break;
+	case 11:
+		ASSERT_FALSE(true) TH_LOG("statmount_alloc failed");
+		break;
+	default:
+		ASSERT_FALSE(true) TH_LOG("Unexpected error in child (exit %d)",
+					  WEXITSTATUS(status));
+		break;
+	}
+}
+
+FIXTURE(open_tree_ns_unbindable)
+{
+	char tmpdir[PATH_MAX];
+	bool mounted;
+};
+
+FIXTURE_SETUP(open_tree_ns_unbindable)
+{
+	int ret;
+
+	self->mounted = false;
+
+	/* Check if open_tree syscall is supported */
+	ret = sys_open_tree(-1, NULL, 0);
+	if (ret == -1 && errno == ENOSYS)
+		SKIP(return, "open_tree() syscall not supported");
+
+	/* Create a temporary directory for the test mount */
+	snprintf(self->tmpdir, sizeof(self->tmpdir),
+		 "/tmp/open_tree_ns_test.XXXXXX");
+	ASSERT_NE(mkdtemp(self->tmpdir), NULL);
+
+	/* Mount tmpfs there */
+	ret = mount("tmpfs", self->tmpdir, "tmpfs", 0, NULL);
+	if (ret < 0) {
+		rmdir(self->tmpdir);
+		SKIP(return, "Failed to mount tmpfs");
+	}
+	self->mounted = true;
+
+	ret = mount(NULL, self->tmpdir, NULL, MS_UNBINDABLE, NULL);
+	if (ret < 0) {
+		rmdir(self->tmpdir);
+		SKIP(return, "Failed to make tmpfs unbindable");
+	}
+}
+
+FIXTURE_TEARDOWN(open_tree_ns_unbindable)
+{
+	if (self->mounted)
+		umount2(self->tmpdir, MNT_DETACH);
+	rmdir(self->tmpdir);
+}
+
+TEST_F(open_tree_ns_unbindable, fails_on_unbindable)
+{
+	int fd;
+
+	fd = sys_open_tree(AT_FDCWD, self->tmpdir,
+			   OPEN_TREE_NAMESPACE | OPEN_TREE_CLOEXEC);
+	ASSERT_LT(fd, 0);
+}
+
+TEST_F(open_tree_ns_unbindable, recursive_skips_on_unbindable)
+{
+	uint64_t new_ns_id;
+	uint64_t list[256];
+	ssize_t nr_mounts;
+	int fd;
+	ssize_t i;
+	bool found_unbindable = false;
+
+	fd = sys_open_tree(AT_FDCWD, "/",
+			   OPEN_TREE_NAMESPACE | AT_RECURSIVE | OPEN_TREE_CLOEXEC);
+	ASSERT_GT(fd, 0);
+
+	ASSERT_EQ(get_mnt_ns_id(fd, &new_ns_id), 0);
+
+	nr_mounts = listmount(LSMT_ROOT, new_ns_id, 0, list, 256, 0);
+	ASSERT_GE(nr_mounts, 0) {
+		TH_LOG("listmount failed: %m");
+	}
+
+	/*
+	 * Iterate through all mounts in the new namespace and verify
+	 * the unbindable tmpfs mount was silently dropped.
+	 */
+	for (i = 0; i < nr_mounts; i++) {
+		struct statmount *sm;
+		const char *mnt_point;
+
+		sm = statmount_alloc(list[i], new_ns_id, STATMOUNT_MNT_POINT);
+		ASSERT_NE(sm, NULL) {
+			TH_LOG("statmount_alloc failed for mnt_id %llu",
+			       (unsigned long long)list[i]);
+		}
+
+		mnt_point = sm->str + sm->mnt_point;
+
+		if (strcmp(mnt_point, self->tmpdir) == 0) {
+			TH_LOG("Found unbindable mount at %s (should have been dropped)",
+			       mnt_point);
+			found_unbindable = true;
+		}
+
+		free(sm);
+	}
+
+	ASSERT_FALSE(found_unbindable) {
+		TH_LOG("Unbindable mount at %s was not dropped", self->tmpdir);
+	}
+
+	close(fd);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/filesystems/utils.c b/tools/testing/selftests/filesystems/utils.c
index c9dd5412b37b..d6f26f849053 100644
--- a/tools/testing/selftests/filesystems/utils.c
+++ b/tools/testing/selftests/filesystems/utils.c
@@ -515,6 +515,32 @@ int setup_userns(void)
 	return 0;
 }
 
+int enter_userns(void)
+{
+	int ret;
+	char buf[32];
+	uid_t uid = getuid();
+	gid_t gid = getgid();
+
+	ret = unshare(CLONE_NEWUSER);
+	if (ret)
+		return ret;
+
+	sprintf(buf, "0 %d 1", uid);
+	ret = write_file("/proc/self/uid_map", buf);
+	if (ret)
+		return ret;
+	ret = write_file("/proc/self/setgroups", "deny");
+	if (ret)
+		return ret;
+	sprintf(buf, "0 %d 1", gid);
+	ret = write_file("/proc/self/gid_map", buf);
+	if (ret)
+		return ret;
+
+	return 0;
+}
+
 /* caps_down - lower all effective caps */
 int caps_down(void)
 {
diff --git a/tools/testing/selftests/filesystems/utils.h b/tools/testing/selftests/filesystems/utils.h
index 70f7ccc607f4..0bccfed666a9 100644
--- a/tools/testing/selftests/filesystems/utils.h
+++ b/tools/testing/selftests/filesystems/utils.h
@@ -28,6 +28,7 @@ extern int cap_down(cap_value_t down);
 
 extern bool switch_ids(uid_t uid, gid_t gid);
 extern int setup_userns(void);
+extern int enter_userns(void);
 
 static inline bool switch_userns(int fd, uid_t uid, gid_t gid, bool drop_caps)
 {
-- 
cgit v1.2.3


From 7799ba2160e4919913ecabca8a7fc1aa4c576fb4 Mon Sep 17 00:00:00 2001
From: João Marcos Costa <joaomarcos.costa@bootlin.com>
Date: Tue, 13 Jan 2026 14:27:53 +0100
Subject: cpupower: make systemd unit installation optional
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

cpupower currently installs a cpupower.service unit file into unitdir
unconditionally, regardless of whether systemd is used by the host.

Improve the installation procedure by making this systemd step optional:
a 'SYSTEMD' build parameter that defaults to 'true' and can be set to
'false' to disable the installation of systemd's unit file.

Since 'SYSTEMD' defaults to true, the current behavior is kept as the
default.

Link: https://lore.kernel.org/r/20260113132753.1730020-2-joaomarcos.costa@bootlin.com
Signed-off-by: João Marcos Costa <joaomarcos.costa@bootlin.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/power/cpupower/Makefile | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

(limited to 'tools')

diff --git a/tools/power/cpupower/Makefile b/tools/power/cpupower/Makefile
index a1df9196dc45..969716dfe8de 100644
--- a/tools/power/cpupower/Makefile
+++ b/tools/power/cpupower/Makefile
@@ -315,7 +315,17 @@ endif
 	$(INSTALL_DATA) lib/cpuidle.h $(DESTDIR)${includedir}/cpuidle.h
 	$(INSTALL_DATA) lib/powercap.h $(DESTDIR)${includedir}/powercap.h
 
-install-tools: $(OUTPUT)cpupower
+# SYSTEMD=false disables installation of the systemd unit file
+SYSTEMD ?=	true
+
+install-systemd:
+	$(INSTALL) -d $(DESTDIR)${unitdir}
+	sed 's|___CDIR___|${confdir}|; s|___LDIR___|${libexecdir}|' cpupower.service.in > '$(DESTDIR)${unitdir}/cpupower.service'
+	$(SETPERM_DATA) '$(DESTDIR)${unitdir}/cpupower.service'
+
+INSTALL_SYSTEMD := $(if $(filter true,$(strip $(SYSTEMD))),install-systemd)
+
+install-tools: $(OUTPUT)cpupower $(INSTALL_SYSTEMD)
 	$(INSTALL) -d $(DESTDIR)${bindir}
 	$(INSTALL_PROGRAM) $(OUTPUT)cpupower $(DESTDIR)${bindir}
 	$(INSTALL) -d $(DESTDIR)${bash_completion_dir}
@@ -324,9 +334,6 @@ install-tools: $(OUTPUT)cpupower
 	$(INSTALL_DATA) cpupower-service.conf '$(DESTDIR)${confdir}'
 	$(INSTALL) -d $(DESTDIR)${libexecdir}
 	$(INSTALL_PROGRAM) cpupower.sh '$(DESTDIR)${libexecdir}/cpupower'
-	$(INSTALL) -d $(DESTDIR)${unitdir}
-	sed 's|___CDIR___|${confdir}|; s|___LDIR___|${libexecdir}|' cpupower.service.in > '$(DESTDIR)${unitdir}/cpupower.service'
-	$(SETPERM_DATA) '$(DESTDIR)${unitdir}/cpupower.service'
 
 install-man:
 	$(INSTALL_DATA) -D man/cpupower.1 $(DESTDIR)${mandir}/man1/cpupower.1
@@ -406,4 +413,4 @@ help:
 	@echo  '  uninstall	  - Remove previously installed files from the dir defined by "DESTDIR"'
 	@echo  '                    cmdline or Makefile config block option (default: "")'
 
-.PHONY: all utils libcpupower update-po create-gmo install-lib install-tools install-man install-gmo install uninstall clean help
+.PHONY: all utils libcpupower update-po create-gmo install-lib install-systemd install-tools install-man install-gmo install uninstall clean help
-- 
cgit v1.2.3


From 3ec6cefc398b93e5f28500f80e7321a80fffee8a Mon Sep 17 00:00:00 2001
From: "Ricardo B. Marlière" <rbm@suse.com>
Date: Fri, 16 Jan 2026 11:20:54 -0300
Subject: selftests/run_kselftest.sh: Add `--skip` argument option
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently the only way of excluding certain tests from a collection is by
passing all the other tests explicitly via `--test`. Therefore, if the user
wants to skip a single test the resulting command line might be too big,
depending on the collection. Add an option `--skip` that takes care of
that.

Link: https://lore.kernel.org/r/20260116-selftests-add_skip_opt-v1-1-ab54afaae81b@suse.com
Signed-off-by: Ricardo B. Marlière <rbm@suse.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/run_kselftest.sh | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/run_kselftest.sh b/tools/testing/selftests/run_kselftest.sh
index d4be97498b32..84d45254675c 100755
--- a/tools/testing/selftests/run_kselftest.sh
+++ b/tools/testing/selftests/run_kselftest.sh
@@ -30,6 +30,7 @@ Usage: $0 [OPTIONS]
   -s | --summary		Print summary with detailed log in output.log (conflict with -p)
   -p | --per-test-log		Print test log in /tmp with each test name (conflict with -s)
   -t | --test COLLECTION:TEST	Run TEST from COLLECTION
+  -S | --skip COLLECTION:TEST	Skip TEST from COLLECTION
   -c | --collection COLLECTION	Run all tests from COLLECTION
   -l | --list			List the available collection:test entries
   -d | --dry-run		Don't actually run any tests
@@ -43,6 +44,7 @@ EOF
 
 COLLECTIONS=""
 TESTS=""
+SKIP=""
 dryrun=""
 kselftest_override_timeout=""
 ERROR_ON_FAIL=true
@@ -58,6 +60,9 @@ while true; do
 		-t | --test)
 			TESTS="$TESTS $2"
 			shift 2 ;;
+		-S | --skip)
+			SKIP="$SKIP $2"
+			shift 2 ;;
 		-c | --collection)
 			COLLECTIONS="$COLLECTIONS $2"
 			shift 2 ;;
@@ -109,6 +114,12 @@ if [ -n "$TESTS" ]; then
 	done
 	available="$(echo "$valid" | sed -e 's/ /\n/g')"
 fi
+# Remove tests to be skipped from available list
+if [ -n "$SKIP" ]; then
+	for skipped in $SKIP ; do
+		available="$(echo "$available" | grep -v "^${skipped}$")"
+	done
+fi
 
 kselftest_failures_file="$(mktemp --tmpdir kselftest-failures-XXXXXX)"
 export kselftest_failures_file
-- 
cgit v1.2.3


From db7855c96d4216b2ed45e2781fae9293b323c7ef Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Fri, 16 Jan 2026 12:40:56 -0800
Subject: x86/entry/vdso/selftest: Update location of vgetrandom-chacha.S

As part of the vdso build restructuring, vgetrandom-chacha.S moved
into the vdso/vdso64 subdirectory. Update the selftest #include to
match.

Closes: https://lore.kernel.org/oe-lkp/202601161608.5cd5af9a-lkp@intel.com
Fixes: 693c819fedcd ("x86/entry/vdso: Refactor the vdso build")
Reported-by: kernel test robot <oliver.sang@intel.com>
Signed-off-by: H. Peter Anvin (Intel) <hpa@zytor.com>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Link: https://patch.msgid.link/20260116204057.386268-4-hpa@zytor.com
---
 tools/testing/selftests/vDSO/vgetrandom-chacha.S | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/vDSO/vgetrandom-chacha.S b/tools/testing/selftests/vDSO/vgetrandom-chacha.S
index a4a82e1c28a9..10f982157a1f 100644
--- a/tools/testing/selftests/vDSO/vgetrandom-chacha.S
+++ b/tools/testing/selftests/vDSO/vgetrandom-chacha.S
@@ -16,5 +16,5 @@
 #elif defined(__s390x__)
 #include "../../../../arch/s390/kernel/vdso64/vgetrandom-chacha.S"
 #elif defined(__x86_64__)
-#include "../../../../arch/x86/entry/vdso/vgetrandom-chacha.S"
+#include "../../../../arch/x86/entry/vdso/vdso64/vgetrandom-chacha.S"
 #endif
-- 
cgit v1.2.3


From 999b2395e3c32273dec98f811f0ab5c8a7441850 Mon Sep 17 00:00:00 2001
From: Gyutae Bae <gyutae.bae@navercorp.com>
Date: Mon, 12 Jan 2026 12:45:16 +0900
Subject: bpftool: Add 'prepend' option for tcx attach to insert at chain start

Add support for the 'prepend' option when attaching tcx_ingress and
tcx_egress programs. This option allows inserting a BPF program at
the beginning of the TCX chain instead of appending it at the end.

The implementation uses BPF_F_BEFORE flag which automatically inserts
the program at the beginning of the chain when no relative reference
is specified.

This change includes:
- Modify do_attach_tcx() to support prepend insertion using BPF_F_BEFORE
- Update documentation to describe the new 'prepend' option
- Add bash completion support for the 'prepend' option on tcx attach types
- Add example usage in the documentation
- Add validation to reject 'overwrite' for non-XDP attach types

The 'prepend' option is only valid for tcx_ingress and tcx_egress attach
types. For XDP attach types, the existing 'overwrite' option remains
available.

Example usage:
  # bpftool net attach tcx_ingress name tc_prog dev lo prepend

This feature is useful when the order of program execution in the TCX
chain matters and users need to ensure certain programs run first.

Co-developed-by: Siwan Kim <siwan.kim@navercorp.com>
Signed-off-by: Siwan Kim <siwan.kim@navercorp.com>
Signed-off-by: Gyutae Bae <gyutae.bae@navercorp.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Reviewed-by: Quentin Monnet <qmo@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20260112034516.22723-1-gyutae.opensource@navercorp.com
---
 tools/bpf/bpftool/Documentation/bpftool-net.rst | 30 ++++++++++++++++++------
 tools/bpf/bpftool/bash-completion/bpftool       |  9 ++++++-
 tools/bpf/bpftool/net.c                         | 31 +++++++++++++++++++++----
 3 files changed, 58 insertions(+), 12 deletions(-)

(limited to 'tools')

diff --git a/tools/bpf/bpftool/Documentation/bpftool-net.rst b/tools/bpf/bpftool/Documentation/bpftool-net.rst
index a9ed8992800f..22da07087e42 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-net.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-net.rst
@@ -24,7 +24,7 @@ NET COMMANDS
 ============
 
 | **bpftool** **net** { **show** | **list** } [ **dev** *NAME* ]
-| **bpftool** **net attach** *ATTACH_TYPE* *PROG* **dev** *NAME* [ **overwrite** ]
+| **bpftool** **net attach** *ATTACH_TYPE* *PROG* **dev** *NAME* [ **overwrite** | **prepend** ]
 | **bpftool** **net detach** *ATTACH_TYPE* **dev** *NAME*
 | **bpftool** **net help**
 |
@@ -58,11 +58,9 @@ bpftool net { show | list } [ dev *NAME* ]
     then all bpf programs attached to non clsact qdiscs, and finally all bpf
     programs attached to root and clsact qdisc.
 
-bpftool net attach *ATTACH_TYPE* *PROG* dev *NAME* [ overwrite ]
+bpftool net attach *ATTACH_TYPE* *PROG* dev *NAME* [ overwrite | prepend ]
     Attach bpf program *PROG* to network interface *NAME* with type specified
-    by *ATTACH_TYPE*. Previously attached bpf program can be replaced by the
-    command used with **overwrite** option. Currently, only XDP-related modes
-    are supported for *ATTACH_TYPE*.
+    by *ATTACH_TYPE*.
 
     *ATTACH_TYPE* can be of:
     **xdp** - try native XDP and fallback to generic XDP if NIC driver does not support it;
@@ -72,11 +70,18 @@ bpftool net attach *ATTACH_TYPE* *PROG* dev *NAME* [ overwrite ]
     **tcx_ingress** - Ingress TCX. runs on ingress net traffic;
     **tcx_egress** - Egress TCX. runs on egress net traffic;
 
+    For XDP-related attach types (**xdp**, **xdpgeneric**, **xdpdrv**,
+    **xdpoffload**), the **overwrite** option can be used to replace a
+    previously attached bpf program.
+
+    For **tcx_ingress** and **tcx_egress** attach types, the **prepend** option
+    can be used to attach the program at the beginning of the chain instead of
+    at the end.
+
 bpftool net detach *ATTACH_TYPE* dev *NAME*
     Detach bpf program attached to network interface *NAME* with type specified
     by *ATTACH_TYPE*. To detach bpf program, same *ATTACH_TYPE* previously used
-    for attach must be specified. Currently, only XDP-related modes are
-    supported for *ATTACH_TYPE*.
+    for attach must be specified.
 
 bpftool net help
     Print short help message.
@@ -191,6 +196,17 @@ EXAMPLES
       tc:
       lo(1) tcx/ingress tc_prog prog_id 29
 
+|
+| **# bpftool net attach tcx_ingress name tc_prog2 dev lo prepend**
+| **# bpftool net**
+|
+
+::
+
+      tc:
+      lo(1) tcx/ingress tc_prog2 prog_id 30
+      lo(1) tcx/ingress tc_prog prog_id 29
+
 |
 | **# bpftool net attach tcx_ingress name tc_prog dev lo**
 | **# bpftool net detach tcx_ingress dev lo**
diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool
index 53bcfeb1a76e..a28f0cc522e4 100644
--- a/tools/bpf/bpftool/bash-completion/bpftool
+++ b/tools/bpf/bpftool/bash-completion/bpftool
@@ -1142,7 +1142,14 @@ _bpftool()
                             return 0
                             ;;
                         8)
-                            _bpftool_once_attr 'overwrite'
+                            case ${words[3]} in
+                                tcx_ingress|tcx_egress)
+                                    _bpftool_once_attr 'prepend'
+                                    ;;
+                                *)
+                                    _bpftool_once_attr 'overwrite'
+                                    ;;
+                            esac
                             return 0
                             ;;
                     esac
diff --git a/tools/bpf/bpftool/net.c b/tools/bpf/bpftool/net.c
index cfc6f944f7c3..f25d66c8395e 100644
--- a/tools/bpf/bpftool/net.c
+++ b/tools/bpf/bpftool/net.c
@@ -666,10 +666,16 @@ static int get_tcx_type(enum net_attach_type attach_type)
 	}
 }
 
-static int do_attach_tcx(int progfd, enum net_attach_type attach_type, int ifindex)
+static int do_attach_tcx(int progfd, enum net_attach_type attach_type, int ifindex, bool prepend)
 {
 	int type = get_tcx_type(attach_type);
 
+	if (prepend) {
+		LIBBPF_OPTS(bpf_prog_attach_opts, opts,
+			.flags = BPF_F_BEFORE
+		);
+		return bpf_prog_attach_opts(progfd, ifindex, type, &opts);
+	}
 	return bpf_prog_attach(progfd, ifindex, type, 0);
 }
 
@@ -685,6 +691,7 @@ static int do_attach(int argc, char **argv)
 	enum net_attach_type attach_type;
 	int progfd, ifindex, err = 0;
 	bool overwrite = false;
+	bool prepend = false;
 
 	/* parse attach args */
 	if (!REQ_ARGS(5))
@@ -709,9 +716,25 @@ static int do_attach(int argc, char **argv)
 
 	if (argc) {
 		if (is_prefix(*argv, "overwrite")) {
+			if (attach_type != NET_ATTACH_TYPE_XDP &&
+			    attach_type != NET_ATTACH_TYPE_XDP_GENERIC &&
+			    attach_type != NET_ATTACH_TYPE_XDP_DRIVER &&
+			    attach_type != NET_ATTACH_TYPE_XDP_OFFLOAD) {
+				p_err("'overwrite' is only supported for xdp types");
+				err = -EINVAL;
+				goto cleanup;
+			}
 			overwrite = true;
+		} else if (is_prefix(*argv, "prepend")) {
+			if (attach_type != NET_ATTACH_TYPE_TCX_INGRESS &&
+			    attach_type != NET_ATTACH_TYPE_TCX_EGRESS) {
+				p_err("'prepend' is only supported for tcx_ingress/tcx_egress");
+				err = -EINVAL;
+				goto cleanup;
+			}
+			prepend = true;
 		} else {
-			p_err("expected 'overwrite', got: '%s'?", *argv);
+			p_err("expected 'overwrite' or 'prepend', got: '%s'?", *argv);
 			err = -EINVAL;
 			goto cleanup;
 		}
@@ -728,7 +751,7 @@ static int do_attach(int argc, char **argv)
 	/* attach tcx prog */
 	case NET_ATTACH_TYPE_TCX_INGRESS:
 	case NET_ATTACH_TYPE_TCX_EGRESS:
-		err = do_attach_tcx(progfd, attach_type, ifindex);
+		err = do_attach_tcx(progfd, attach_type, ifindex, prepend);
 		break;
 	default:
 		break;
@@ -985,7 +1008,7 @@ static int do_help(int argc, char **argv)
 
 	fprintf(stderr,
 		"Usage: %1$s %2$s { show | list } [dev <devname>]\n"
-		"       %1$s %2$s attach ATTACH_TYPE PROG dev <devname> [ overwrite ]\n"
+		"       %1$s %2$s attach ATTACH_TYPE PROG dev <devname> [ overwrite | prepend ]\n"
 		"       %1$s %2$s detach ATTACH_TYPE dev <devname>\n"
 		"       %1$s %2$s help\n"
 		"\n"
-- 
cgit v1.2.3


From 47d440d0a5bb822f3f4e4b2479246da5efb765e6 Mon Sep 17 00:00:00 2001
From: Alan Maguire <alan.maguire@oracle.com>
Date: Thu, 15 Jan 2026 16:34:57 +0000
Subject: selftests/bpf: Support when CONFIG_VXLAN=m

If CONFIG_VXLAN is 'm', struct vxlanhdr will not be in vmlinux.h.
Add a ___local variant to support cases where vxlan is a module.

Fixes: 8517b1abe5ea ("selftests/bpf: Integrate test_tc_tunnel.sh tests into test_progs")
Signed-off-by: Alan Maguire <alan.maguire@oracle.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20260115163457.146267-1-alan.maguire@oracle.com
---
 tools/testing/selftests/bpf/progs/test_tc_tunnel.c | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c
index 7330c61b5730..7376df405a6b 100644
--- a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c
+++ b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c
@@ -23,7 +23,12 @@ static const int cfg_udp_src = 20000;
 	(((__u64)len & BPF_ADJ_ROOM_ENCAP_L2_MASK)	\
 	 << BPF_ADJ_ROOM_ENCAP_L2_SHIFT)
 
-#define	L2_PAD_SZ	(sizeof(struct vxlanhdr) + ETH_HLEN)
+struct vxlanhdr___local {
+	__be32 vx_flags;
+	__be32 vx_vni;
+};
+
+#define	L2_PAD_SZ	(sizeof(struct vxlanhdr___local) + ETH_HLEN)
 
 #define	UDP_PORT		5555
 #define	MPLS_OVER_UDP_PORT	6635
@@ -154,7 +159,7 @@ static __always_inline int __encap_ipv4(struct __sk_buff *skb, __u8 encap_proto,
 		l2_len = ETH_HLEN;
 		if (ext_proto & EXTPROTO_VXLAN) {
 			udp_dst = VXLAN_UDP_PORT;
-			l2_len += sizeof(struct vxlanhdr);
+			l2_len += sizeof(struct vxlanhdr___local);
 		} else
 			udp_dst = ETH_OVER_UDP_PORT;
 		break;
@@ -195,12 +200,12 @@ static __always_inline int __encap_ipv4(struct __sk_buff *skb, __u8 encap_proto,
 		flags |= BPF_F_ADJ_ROOM_ENCAP_L2_ETH;
 
 		if (ext_proto & EXTPROTO_VXLAN) {
-			struct vxlanhdr *vxlan_hdr = (struct vxlanhdr *)l2_hdr;
+			struct vxlanhdr___local *vxlan_hdr = (struct vxlanhdr___local *)l2_hdr;
 
 			vxlan_hdr->vx_flags = VXLAN_FLAGS;
 			vxlan_hdr->vx_vni = VXLAN_VNI;
 
-			l2_hdr += sizeof(struct vxlanhdr);
+			l2_hdr += sizeof(struct vxlanhdr___local);
 		}
 
 		if (bpf_skb_load_bytes(skb, 0, l2_hdr, ETH_HLEN))
@@ -285,7 +290,7 @@ static __always_inline int __encap_ipv6(struct __sk_buff *skb, __u8 encap_proto,
 		l2_len = ETH_HLEN;
 		if (ext_proto & EXTPROTO_VXLAN) {
 			udp_dst = VXLAN_UDP_PORT;
-			l2_len += sizeof(struct vxlanhdr);
+			l2_len += sizeof(struct vxlanhdr___local);
 		} else
 			udp_dst = ETH_OVER_UDP_PORT;
 		break;
@@ -325,12 +330,12 @@ static __always_inline int __encap_ipv6(struct __sk_buff *skb, __u8 encap_proto,
 		flags |= BPF_F_ADJ_ROOM_ENCAP_L2_ETH;
 
 		if (ext_proto & EXTPROTO_VXLAN) {
-			struct vxlanhdr *vxlan_hdr = (struct vxlanhdr *)l2_hdr;
+			struct vxlanhdr___local *vxlan_hdr = (struct vxlanhdr___local *)l2_hdr;
 
 			vxlan_hdr->vx_flags = VXLAN_FLAGS;
 			vxlan_hdr->vx_vni = VXLAN_VNI;
 
-			l2_hdr += sizeof(struct vxlanhdr);
+			l2_hdr += sizeof(struct vxlanhdr___local);
 		}
 
 		if (bpf_skb_load_bytes(skb, 0, l2_hdr, ETH_HLEN))
@@ -639,7 +644,7 @@ static int decap_internal(struct __sk_buff *skb, int off, int len, char proto)
 			olen += ETH_HLEN;
 			break;
 		case VXLAN_UDP_PORT:
-			olen += ETH_HLEN + sizeof(struct vxlanhdr);
+			olen += ETH_HLEN + sizeof(struct vxlanhdr___local);
 			break;
 		}
 		break;
-- 
cgit v1.2.3


From efad162f5a840ae178e7761c176c49f433c7bb68 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yonghong.song@linux.dev>
Date: Thu, 15 Jan 2026 21:22:45 -0800
Subject: selftests/bpf: Fix map_kptr test failure

On my arm64 machine, I get the following failure:
  ...
  tester_init:PASS:tester_log_buf 0 nsec
  process_subtest:PASS:obj_open_mem 0 nsec
  process_subtest:PASS:specs_alloc 0 nsec
  serial_test_map_kptr:PASS:rcu_tasks_trace_gp__open_and_load 0 nsec
  ...
  test_map_kptr_success:PASS:map_kptr__open_and_load 0 nsec
  test_map_kptr_success:PASS:test_map_kptr_ref1 refcount 0 nsec
  test_map_kptr_success:FAIL:test_map_kptr_ref1 retval unexpected error: 2 (errno 2)
  test_map_kptr_success:PASS:test_map_kptr_ref2 refcount 0 nsec
  test_map_kptr_success:FAIL:test_map_kptr_ref2 retval unexpected error: 1 (errno 2)
  ...
  #201/21  map_kptr/success-map:FAIL

In serial_test_map_kptr(), before test_map_kptr_success(), one
kern_sync_rcu() is used to have some delay for freeing the map.
But in my environment, one kern_sync_rcu() seems not enough and
caused the test failure.

In bpf_map_free_in_work() in syscall.c, the queue time for
  queue_work(system_dfl_wq, &map->work)
may be longer than expected. This may cause the test failure
since test_map_kptr_success() expects all previous maps having been freed.

Since it is not clear how long queue_work() time takes, a bpf prog
is added to count the reference after bpf_kfunc_call_test_acquire().
If the number of references is 2 (for initial ref and the one just
acquired), all previous maps should have been released. This will
resolve the above 'retval unexpected error' issue.

Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/bpf/20260116052245.3692405-1-yonghong.song@linux.dev
---
 tools/testing/selftests/bpf/prog_tests/map_kptr.c | 23 +++++++++++++++++++++++
 tools/testing/selftests/bpf/progs/map_kptr.c      | 18 ++++++++++++++++++
 2 files changed, 41 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/map_kptr.c b/tools/testing/selftests/bpf/prog_tests/map_kptr.c
index 8743df599567..f372162c0280 100644
--- a/tools/testing/selftests/bpf/prog_tests/map_kptr.c
+++ b/tools/testing/selftests/bpf/prog_tests/map_kptr.c
@@ -131,6 +131,25 @@ static int kern_sync_rcu_tasks_trace(struct rcu_tasks_trace_gp *rcu)
 	return 0;
 }
 
+static void wait_for_map_release(void)
+{
+	LIBBPF_OPTS(bpf_test_run_opts, lopts);
+	struct map_kptr *skel;
+	int ret;
+
+	skel = map_kptr__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "map_kptr__open_and_load"))
+		return;
+
+	do {
+		ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.count_ref), &lopts);
+		ASSERT_OK(ret, "count_ref ret");
+		ASSERT_OK(lopts.retval, "count_ref retval");
+	} while (skel->bss->num_of_refs != 2);
+
+	map_kptr__destroy(skel);
+}
+
 void serial_test_map_kptr(void)
 {
 	struct rcu_tasks_trace_gp *skel;
@@ -148,11 +167,15 @@ void serial_test_map_kptr(void)
 
 		ASSERT_OK(kern_sync_rcu_tasks_trace(skel), "sync rcu_tasks_trace");
 		ASSERT_OK(kern_sync_rcu(), "sync rcu");
+		wait_for_map_release();
+
 		/* Observe refcount dropping to 1 on bpf_map_free_deferred */
 		test_map_kptr_success(false);
 
 		ASSERT_OK(kern_sync_rcu_tasks_trace(skel), "sync rcu_tasks_trace");
 		ASSERT_OK(kern_sync_rcu(), "sync rcu");
+		wait_for_map_release();
+
 		/* Observe refcount dropping to 1 on synchronous delete elem */
 		test_map_kptr_success(true);
 	}
diff --git a/tools/testing/selftests/bpf/progs/map_kptr.c b/tools/testing/selftests/bpf/progs/map_kptr.c
index edaba481db9d..e708ffbe1f61 100644
--- a/tools/testing/selftests/bpf/progs/map_kptr.c
+++ b/tools/testing/selftests/bpf/progs/map_kptr.c
@@ -487,6 +487,24 @@ int test_map_kptr_ref3(struct __sk_buff *ctx)
 	return 0;
 }
 
+int num_of_refs;
+
+SEC("syscall")
+int count_ref(void *ctx)
+{
+	struct prog_test_ref_kfunc *p;
+	unsigned long arg = 0;
+
+	p = bpf_kfunc_call_test_acquire(&arg);
+	if (!p)
+		return 1;
+
+	num_of_refs = p->cnt.refs.counter;
+
+	bpf_kfunc_call_test_release(p);
+	return 0;
+}
+
 SEC("syscall")
 int test_ls_map_kptr_ref1(void *ctx)
 {
-- 
cgit v1.2.3


From 6588b8845e7387438d4b91ea86e7cb6d838b3108 Mon Sep 17 00:00:00 2001
From: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Date: Fri, 16 Jan 2026 15:30:23 -0800
Subject: tools/power/x86/intel-speed-select: Allow non root users

When permitted by the file /dev/isst_interface, allow to issue commands
for non root users.

When user id is non root, check if  "/dev/isst_interface" can still be
opened. If this file can be opened, allow all read only commands.

Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
---
 tools/power/x86/intel-speed-select/isst-config.c | 39 ++++++++++++++++++++++--
 1 file changed, 37 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/power/x86/intel-speed-select/isst-config.c b/tools/power/x86/intel-speed-select/isst-config.c
index 558138eea75e..807feb17bb81 100644
--- a/tools/power/x86/intel-speed-select/isst-config.c
+++ b/tools/power/x86/intel-speed-select/isst-config.c
@@ -80,6 +80,18 @@ struct cpu_topology {
 	short die_id;
 };
 
+static int read_only;
+
+static void check_privilege(void)
+{
+	if (!read_only)
+		return;
+
+	isst_display_error_info_message(1, "Insufficient privileges", 0, 0);
+	isst_ctdp_display_information_end(outf);
+	exit(1);
+}
+
 FILE *get_output_file(void)
 {
 	return outf;
@@ -1578,6 +1590,8 @@ free_mask:
 
 static void set_tdp_level(int arg)
 {
+	check_privilege();
+
 	if (cmd_help) {
 		fprintf(stderr, "Set Config TDP level\n");
 		fprintf(stderr,
@@ -2046,6 +2060,8 @@ static void set_pbf_enable(int arg)
 {
 	int enable = arg;
 
+	check_privilege();
+
 	if (cmd_help) {
 		if (enable) {
 			fprintf(stderr,
@@ -2212,6 +2228,8 @@ static void set_fact_enable(int arg)
 	int i, ret, enable = arg;
 	struct isst_id id;
 
+	check_privilege();
+
 	if (cmd_help) {
 		if (enable) {
 			fprintf(stderr,
@@ -2361,6 +2379,8 @@ static void set_clos_enable(int arg)
 {
 	int enable = arg;
 
+	check_privilege();
+
 	if (cmd_help) {
 		if (enable) {
 			fprintf(stderr,
@@ -2491,6 +2511,8 @@ static void set_clos_config_for_cpu(struct isst_id *id, void *arg1, void *arg2,
 
 static void set_clos_config(int arg)
 {
+	check_privilege();
+
 	if (cmd_help) {
 		fprintf(stderr,
 			"Set core-power configuration for one of the four clos ids\n");
@@ -2556,6 +2578,8 @@ static void set_clos_assoc_for_cpu(struct isst_id *id, void *arg1, void *arg2, v
 
 static void set_clos_assoc(int arg)
 {
+	check_privilege();
+
 	if (cmd_help) {
 		fprintf(stderr, "Associate a clos id to a CPU\n");
 		fprintf(stderr,
@@ -2637,6 +2661,8 @@ static void set_turbo_mode(int arg)
 	int i, disable = arg;
 	struct isst_id id;
 
+	check_privilege();
+
 	if (cmd_help) {
 		if (disable)
 			fprintf(stderr, "Set turbo mode disable\n");
@@ -2682,6 +2708,7 @@ static void get_set_trl(struct isst_id *id, void *arg1, void *arg2, void *arg3,
 	}
 
 	if (set) {
+		check_privilege();
 		ret = isst_set_trl(id, fact_trl);
 		isst_display_result(id, outf, "turbo-mode", "set-trl", ret);
 		return;
@@ -3204,8 +3231,16 @@ static void cmdline(int argc, char **argv)
 	};
 
 	if (geteuid() != 0) {
-		fprintf(stderr, "Must run as root\n");
-		exit(0);
+		int fd;
+
+		fd = open(pathname, O_RDWR);
+		if (fd < 0) {
+			fprintf(stderr, "Must run as root\n");
+			exit(0);
+		}
+		fprintf(stderr, "\nNot running as root, Only read only operations are supported\n");
+		close(fd);
+		read_only = 1;
 	}
 
 	ret = update_cpu_model();
-- 
cgit v1.2.3


From 21adcd5ec99f342489a49e9d237a987b1bd9fab5 Mon Sep 17 00:00:00 2001
From: Khem Raj <raj.khem@gmail.com>
Date: Mon, 29 Dec 2025 12:45:06 -0800
Subject: tools/power/x86/intel-speed-select: Use pkg-config for libnl-3.0
 detection

Replace hardcoded libnl3 include path with pkg-config detection to
improve portability across different distributions and build environments.

The previous implementation used a fixed path constructed from the
compiler's sysroot, which could fail on systems with non-standard
library installations. Now the build system:
- Attempts to detect libnl-3.0 include paths using pkg-config
- Falls back to /usr/include/libnl3 if pkg-config is unavailable
- Maintains backward compatibility with existing build configurations

This ensures the tool builds correctly on a wider range of systems
while preserving existing behavior when pkg-config is not present.

Closes:https://bugzilla.kernel.org/show_bug.cgi?id=220819
Signed-off-by: Khem Raj <raj.khem@gmail.com>
Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
---
 tools/power/x86/intel-speed-select/Makefile | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/power/x86/intel-speed-select/Makefile b/tools/power/x86/intel-speed-select/Makefile
index 8d3a02a20f3d..6b299aae2ded 100644
--- a/tools/power/x86/intel-speed-select/Makefile
+++ b/tools/power/x86/intel-speed-select/Makefile
@@ -13,7 +13,13 @@ endif
 # Do not use make's built-in rules
 # (this improves performance and avoids hard-to-debug behaviour);
 MAKEFLAGS += -r
-override CFLAGS += -O2 -Wall -g -D_GNU_SOURCE -I$(OUTPUT)include -I$(shell $(CC) -print-sysroot)/usr/include/libnl3
+
+NL3_CFLAGS = $(shell pkg-config --cflags libnl-3.0 2>/dev/null)
+ifeq ($(NL3_CFLAGS),)
+NL3_CFLAGS = -I/usr/include/libnl3
+endif
+
+override CFLAGS += -O2 -Wall -g -D_GNU_SOURCE -I$(OUTPUT)include $(NL3_CFLAGS)
 override LDFLAGS += -lnl-genl-3 -lnl-3
 
 ALL_TARGETS := intel-speed-select
-- 
cgit v1.2.3


From 56c17ee151c6e1a73d77e15b82a8e2130cd8dd16 Mon Sep 17 00:00:00 2001
From: Malaya Kumar Rout <mrout@redhat.com>
Date: Thu, 15 Jan 2026 15:33:33 +0530
Subject: tools/power/x86/intel-speed-select: Fix file descriptor leak in
 isolate_cpus()

The file descriptor opened in isolate_cpus() when (!level) is true was
not being closed before returning, causing a file descriptor leak in
both the error path and the success path.

When write() fails at line 950, the function returns at line 953 without
closing the file descriptor. Similarly, on success, the function returns
at line 956 without closing the file descriptor.

Add close(fd) calls before both return statements to fix the resource
leak. This follows the same pattern used elsewhere in the same function
where file descriptors are properly closed before returning (see lines
1005 and 1027).

Fixes: 997074df658e ("tools/power/x86/intel-speed-select: Use cgroup v2 isolation")
Signed-off-by: Malaya Kumar Rout <mrout@redhat.com>
Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
---
 tools/power/x86/intel-speed-select/isst-config.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'tools')

diff --git a/tools/power/x86/intel-speed-select/isst-config.c b/tools/power/x86/intel-speed-select/isst-config.c
index 807feb17bb81..68c9955dd7f4 100644
--- a/tools/power/x86/intel-speed-select/isst-config.c
+++ b/tools/power/x86/intel-speed-select/isst-config.c
@@ -962,9 +962,11 @@ int isolate_cpus(struct isst_id *id, int mask_size, cpu_set_t *cpu_mask, int lev
 		ret = write(fd, "member", strlen("member"));
 		if (ret == -1) {
 			printf("Can't update to member\n");
+			close(fd);
 			return ret;
 		}
 
+		close(fd);
 		return 0;
 	}
 
-- 
cgit v1.2.3


From 6142b726e6e64870ab0c7ffb158bffa141f83bb6 Mon Sep 17 00:00:00 2001
From: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Date: Fri, 16 Jan 2026 15:48:50 -0800
Subject: tools/power/x86/intel-speed-select: v1.25 release

This version includes the following changes:
- Allow read only commands for non root users when permitted
- Fix file descriptor leak in isolate_cpus()
- Replace hardcoded libnl3 include path

Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
---
 tools/power/x86/intel-speed-select/isst-config.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/power/x86/intel-speed-select/isst-config.c b/tools/power/x86/intel-speed-select/isst-config.c
index 68c9955dd7f4..dd9056ddb016 100644
--- a/tools/power/x86/intel-speed-select/isst-config.c
+++ b/tools/power/x86/intel-speed-select/isst-config.c
@@ -16,7 +16,7 @@ struct process_cmd_struct {
 	int arg;
 };
 
-static const char *version_str = "v1.24";
+static const char *version_str = "v1.25";
 
 static const int supported_api_ver = 3;
 static struct isst_if_platform_info isst_platform_info;
-- 
cgit v1.2.3


From d9b40d7262a227442bf402ea0708dc94f438bb52 Mon Sep 17 00:00:00 2001
From: Bala-Vignesh-Reddy <reddybalavignesh9979@gmail.com>
Date: Wed, 22 Oct 2025 11:59:48 +0530
Subject: selftests/x86: Add selftests include path for kselftest.h after
 centralization

The previous change centralizing kselftest.h include path in lib.mk caused x86
selftests to fail, as x86 Makefile overwrites CFLAGS using ":=", dropping the
include path added in lib.mk. Therefore, helpers.h could not find kselftest.h
during compilation.

Fix this by adding the tools/testing/sefltest to CFLAGS in x86 Makefile.

  [ bp: Correct commit ID in Fixes: ]

Fixes: e6fbd1759c9e ("selftests: complete kselftest include centralization")
Closes: https://lore.kernel.org/lkml/CA+G9fYvKjQcCBMfXA-z2YuL2L+3Qd-pJjEUDX8PDdz2-EEQd=Q@mail.gmail.com/T/#m83fd330231287fc9d6c921155bee16c591db7360
Reported-by: Linux Kernel Functional Testing <lkft@linaro.org>
Signed-off-by: Bala-Vignesh-Reddy <reddybalavignesh9979@gmail.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Tested-by: Anders Roxell <anders.roxell@linaro.org>
Tested-by: Brendan Jackman <jackmanb@google.com>
Link: https://patch.msgid.link/20251022062948.162852-1-reddybalavignesh9979@gmail.com
---
 tools/testing/selftests/x86/Makefile | 1 +
 1 file changed, 1 insertion(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile
index 83148875a12c..434065215d12 100644
--- a/tools/testing/selftests/x86/Makefile
+++ b/tools/testing/selftests/x86/Makefile
@@ -36,6 +36,7 @@ BINARIES_32 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_32))
 BINARIES_64 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_64))
 
 CFLAGS := -O2 -g -std=gnu99 -pthread -Wall $(KHDR_INCLUDES)
+CFLAGS += -I $(top_srcdir)/tools/testing/selftests/
 
 # call32_from_64 in thunks.S uses absolute addresses.
 ifeq ($(CAN_BUILD_WITH_NOPIE),1)
-- 
cgit v1.2.3


From d045e166d3c51b7aec069669bb243e057d80d04f Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
Date: Thu, 15 Jan 2026 14:56:52 +0100
Subject: selftests: vDSO: getrandom: Fix path to s390 chacha implementation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The s390 vDSO source directory was recently moved,
but this reference was not updated.

Fixes: c0087d807ae8 ("s390/vdso: Rename vdso64 to vdso")
Signed-off-by: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
Acked-by: Heiko Carstens <hca@linux.ibm.com>
Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
---
 tools/testing/selftests/vDSO/vgetrandom-chacha.S | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/vDSO/vgetrandom-chacha.S b/tools/testing/selftests/vDSO/vgetrandom-chacha.S
index a4a82e1c28a9..8c3cbf4dfd6a 100644
--- a/tools/testing/selftests/vDSO/vgetrandom-chacha.S
+++ b/tools/testing/selftests/vDSO/vgetrandom-chacha.S
@@ -14,7 +14,7 @@
 #elif defined(__riscv) && __riscv_xlen == 64
 #include "../../../../arch/riscv/kernel/vdso/vgetrandom-chacha.S"
 #elif defined(__s390x__)
-#include "../../../../arch/s390/kernel/vdso64/vgetrandom-chacha.S"
+#include "../../../../arch/s390/kernel/vdso/vgetrandom-chacha.S"
 #elif defined(__x86_64__)
 #include "../../../../arch/x86/entry/vdso/vgetrandom-chacha.S"
 #endif
-- 
cgit v1.2.3


From a120a832e3ebca48474d7183ddeadf4138472535 Mon Sep 17 00:00:00 2001
From: Kees Cook <kees@kernel.org>
Date: Mon, 20 Oct 2025 15:01:16 -0700
Subject: lkdtm/bugs: Add __counted_by_ptr() test PTR_BOUNDS

Provide run-time validation of the __counted_by_ptr() annotation via
newly added PTR_BOUNDS LKDTM test.

Link: https://patch.msgid.link/20251020220118.1226740-2-kees@kernel.org
Signed-off-by: Kees Cook <kees@kernel.org>
---
 drivers/misc/lkdtm/bugs.c               | 90 ++++++++++++++++++++++++++++++---
 tools/testing/selftests/lkdtm/tests.txt |  2 +
 2 files changed, 84 insertions(+), 8 deletions(-)

(limited to 'tools')

diff --git a/drivers/misc/lkdtm/bugs.c b/drivers/misc/lkdtm/bugs.c
index 502059078b45..b2aee36b956d 100644
--- a/drivers/misc/lkdtm/bugs.c
+++ b/drivers/misc/lkdtm/bugs.c
@@ -465,32 +465,32 @@ static void lkdtm_ARRAY_BOUNDS(void)
 		pr_expected_config(CONFIG_UBSAN_BOUNDS);
 }
 
-struct lkdtm_annotated {
+struct lkdtm_cb_fam {
 	unsigned long flags;
 	int count;
 	int array[] __counted_by(count);
 };
 
-static volatile int fam_count = 4;
+static volatile int element_count = 4;
 
 static void lkdtm_FAM_BOUNDS(void)
 {
-	struct lkdtm_annotated *inst;
+	struct lkdtm_cb_fam *inst;
 
-	inst = kzalloc(struct_size(inst, array, fam_count + 1), GFP_KERNEL);
+	inst = kzalloc(struct_size(inst, array, element_count + 1), GFP_KERNEL);
 	if (!inst) {
 		pr_err("FAIL: could not allocate test struct!\n");
 		return;
 	}
 
-	inst->count = fam_count;
+	inst->count = element_count;
 	pr_info("Array access within bounds ...\n");
-	inst->array[1] = fam_count;
+	inst->array[1] = element_count;
 	ignored = inst->array[1];
 
 	pr_info("Array access beyond bounds ...\n");
-	inst->array[fam_count] = fam_count;
-	ignored = inst->array[fam_count];
+	inst->array[element_count] = element_count;
+	ignored = inst->array[element_count];
 
 	kfree(inst);
 
@@ -505,6 +505,79 @@ static void lkdtm_FAM_BOUNDS(void)
 		pr_expected_config(CONFIG_UBSAN_BOUNDS);
 }
 
+struct lkdtm_extra {
+	short a, b;
+	u16 sixteen;
+	u32 bigger;
+	u64 biggest;
+};
+
+struct lkdtm_cb_ptr {
+	int a, b, c;
+	int nr_extra;
+	char *buf __counted_by_ptr(len);
+	size_t len;
+	struct lkdtm_extra *extra __counted_by_ptr(nr_extra);
+};
+
+static noinline void check_ptr_len(struct lkdtm_cb_ptr *p, size_t len)
+{
+	if (__member_size(p->buf) != len)
+		pr_err("FAIL: could not determine size of inst->buf: %zu\n",
+			__member_size(p->buf));
+	else
+		pr_info("good: inst->buf length is %zu\n", len);
+}
+
+static void lkdtm_PTR_BOUNDS(void)
+{
+	struct lkdtm_cb_ptr *inst;
+
+	inst = kzalloc(sizeof(*inst), GFP_KERNEL);
+	if (!inst) {
+		pr_err("FAIL: could not allocate struct lkdtm_cb_ptr!\n");
+		return;
+	}
+
+	inst->buf = kzalloc(element_count, GFP_KERNEL);
+	if (!inst->buf) {
+		pr_err("FAIL: could not allocate inst->buf!\n");
+		return;
+	}
+	inst->len = element_count;
+
+	/* Double element_count */
+	inst->extra = kcalloc(element_count * 2, sizeof(*inst->extra), GFP_KERNEL);
+	inst->nr_extra = element_count * 2;
+
+	pr_info("Pointer access within bounds ...\n");
+	check_ptr_len(inst, 4);
+	/* All 4 bytes */
+	inst->buf[0] = 'A';
+	inst->buf[1] = 'B';
+	inst->buf[2] = 'C';
+	inst->buf[3] = 'D';
+	/* Halfway into the array */
+	inst->extra[element_count].biggest = 0x1000;
+
+	pr_info("Pointer access beyond bounds ...\n");
+	ignored = inst->extra[inst->nr_extra].b;
+
+	kfree(inst->extra);
+	kfree(inst->buf);
+	kfree(inst);
+
+	pr_err("FAIL: survived access of invalid pointer member offset!\n");
+
+	if (!IS_ENABLED(CONFIG_CC_HAS_COUNTED_BY_PTR))
+		pr_warn("This is expected since this %s was built with a compiler that does not support __counted_by_ptr\n",
+			lkdtm_kernel_info);
+	else if (IS_ENABLED(CONFIG_UBSAN_BOUNDS))
+		pr_expected_config(CONFIG_UBSAN_TRAP);
+	else
+		pr_expected_config(CONFIG_UBSAN_BOUNDS);
+}
+
 static void lkdtm_CORRUPT_LIST_ADD(void)
 {
 	/*
@@ -769,6 +842,7 @@ static struct crashtype crashtypes[] = {
 	CRASHTYPE(OVERFLOW_UNSIGNED),
 	CRASHTYPE(ARRAY_BOUNDS),
 	CRASHTYPE(FAM_BOUNDS),
+	CRASHTYPE(PTR_BOUNDS),
 	CRASHTYPE(CORRUPT_LIST_ADD),
 	CRASHTYPE(CORRUPT_LIST_DEL),
 	CRASHTYPE(STACK_GUARD_PAGE_LEADING),
diff --git a/tools/testing/selftests/lkdtm/tests.txt b/tools/testing/selftests/lkdtm/tests.txt
index 67cd53715d93..e62b85b591be 100644
--- a/tools/testing/selftests/lkdtm/tests.txt
+++ b/tools/testing/selftests/lkdtm/tests.txt
@@ -11,6 +11,8 @@ EXCEPTION
 #CORRUPT_STACK Crashes entire system on success
 #CORRUPT_STACK_STRONG Crashes entire system on success
 ARRAY_BOUNDS call trace:|UBSAN: array-index-out-of-bounds
+FAM_BOUNDS call trace:|UBSAN: array-index-out-of-bounds
+PTR_BOUNDS call trace:|UBSAN: array-index-out-of-bounds
 CORRUPT_LIST_ADD list_add corruption
 CORRUPT_LIST_DEL list_del corruption
 STACK_GUARD_PAGE_LEADING
-- 
cgit v1.2.3


From 68578370f9b3a2aba5964b273312d51c581b6aad Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@google.com>
Date: Thu, 15 Jan 2026 17:24:47 +0000
Subject: tools: ynl: Specify --no-line-number in ynl-regen.sh.

If grep.lineNumber is enabled in .gitconfig,

  [grep]
  lineNumber = true

ynl-regen.sh fails with the following error:

  $ ./tools/net/ynl/ynl-regen.sh -f
  ...
  ynl_gen_c.py: error: argument --mode: invalid choice: '4:' (choose from user, kernel, uapi)
  	GEN 4:	net/ipv4/fou_nl.c

Let's specify --no-line-number explicitly.

Fixes: be5bea1cc0bf ("net: add basic C code generators for Netlink")
Suggested-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20260115172533.693652-3-kuniyu@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/net/ynl/ynl-regen.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/net/ynl/ynl-regen.sh b/tools/net/ynl/ynl-regen.sh
index 81b4ecd89100..d9809276db98 100755
--- a/tools/net/ynl/ynl-regen.sh
+++ b/tools/net/ynl/ynl-regen.sh
@@ -21,7 +21,7 @@ files=$(git grep --files-with-matches '^/\* YNL-GEN \(kernel\|uapi\|user\)')
 for f in $files; do
     # params:     0       1      2     3
     #         $YAML YNL-GEN kernel $mode
-    params=( $(git grep -B1 -h '/\* YNL-GEN' $f | sed 's@/\*\(.*\)\*/@\1@') )
+    params=( $(git grep --no-line-number -B1 -h '/\* YNL-GEN' $f | sed 's@/\*\(.*\)\*/@\1@') )
     args=$(sed -n 's@/\* YNL-ARG \(.*\) \*/@\1@p' $f)
 
     if [ $f -nt ${params[0]} -a -z "$force" ]; then
-- 
cgit v1.2.3


From 52b4859730434902731e1cd4ea061d9611398008 Mon Sep 17 00:00:00 2001
From: Yohei Kojima <yk@y-koj.net>
Date: Tue, 13 Jan 2026 23:11:54 +0900
Subject: selftests: net: fix passive TFO test to fail if child processes
 failed

Improve the passive TFO test to report failure if the server or the
client timed out or exited with non-zero status.

Before this commit, TFO test didn't fail even if exit(EXIT_FAILURE) is
added to the first line of the run_server() and run_client() functions.

Signed-off-by: Yohei Kojima <yk@y-koj.net>
Link: https://patch.msgid.link/214d399caec2e5de7738ced5736829915d507e4e.1768312014.git.yk@y-koj.net
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/tfo_passive.sh | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/tfo_passive.sh b/tools/testing/selftests/net/tfo_passive.sh
index a4550511830a..f116f888b794 100755
--- a/tools/testing/selftests/net/tfo_passive.sh
+++ b/tools/testing/selftests/net/tfo_passive.sh
@@ -85,12 +85,15 @@ timeout -k 1s 30s ip netns exec nssv ./tfo        \
 				-s                \
 				-p ${SERVER_PORT} \
 				-o ${out_file}&
+server_pid="$!"
 
 wait_local_port_listen nssv ${SERVER_PORT} tcp
 
 ip netns exec nscl ./tfo -c -h ${SERVER_IP} -p ${SERVER_PORT}
+client_exit_status="$?"
 
-wait
+wait "$server_pid"
+server_exit_status="$?"
 
 res=$(cat $out_file)
 rm $out_file
@@ -101,6 +104,14 @@ if [ "$res" = "0" ]; then
 	exit 1
 fi
 
+if [ "$client_exit_status" -ne 0 ] || [ "$server_exit_status" -ne 0 ]; then
+	# Note: timeout(1) exits with 124 if it timed out
+	echo "client exited with ${client_exit_status}"
+	echo "server exited with ${server_exit_status}"
+	cleanup_ns
+	exit 1
+fi
+
 echo "$NSIM_SV_FD:$NSIM_SV_IFIDX" > $NSIM_DEV_SYS_UNLINK
 
 echo $NSIM_CL_ID > $NSIM_DEV_SYS_DEL
-- 
cgit v1.2.3


From 342e31254f02041e3b9d4ad573204d53b2f832c9 Mon Sep 17 00:00:00 2001
From: Yohei Kojima <yk@y-koj.net>
Date: Tue, 13 Jan 2026 23:11:55 +0900
Subject: selftests: net: improve error handling in passive TFO test

Improve the error handling in passive TFO test to check the return value
from sendto(), and to fail if read() or fprintf() failed.

Signed-off-by: Yohei Kojima <yk@y-koj.net>
Link: https://patch.msgid.link/24707c8133f7095c0e5a94afa69e75c3a80bf6e7.1768312014.git.yk@y-koj.net
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/tfo.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/tfo.c b/tools/testing/selftests/net/tfo.c
index 8d82140f0f76..3b1ee2d3d417 100644
--- a/tools/testing/selftests/net/tfo.c
+++ b/tools/testing/selftests/net/tfo.c
@@ -82,8 +82,10 @@ static void run_server(void)
 		error(1, errno, "getsockopt(SO_INCOMING_NAPI_ID)");
 
 	if (read(connfd, buf, 64) < 0)
-		perror("read()");
-	fprintf(outfile, "%d\n", opt);
+		error(1, errno, "read()");
+
+	if (fprintf(outfile, "%d\n", opt) < 0)
+		error(1, errno, "fprintf()");
 
 	fclose(outfile);
 	close(connfd);
@@ -92,14 +94,17 @@ static void run_server(void)
 
 static void run_client(void)
 {
-	int fd;
+	int fd, ret;
 	char *msg = "Hello, world!";
 
 	fd = socket(AF_INET6, SOCK_STREAM, 0);
 	if (fd == -1)
 		error(1, errno, "socket()");
 
-	sendto(fd, msg, strlen(msg), MSG_FASTOPEN, (struct sockaddr *)&cfg_addr, sizeof(cfg_addr));
+	ret = sendto(fd, msg, strlen(msg), MSG_FASTOPEN,
+		     (struct sockaddr *)&cfg_addr, sizeof(cfg_addr));
+	if (ret < 0)
+		error(1, errno, "sendto()");
 
 	close(fd);
 }
-- 
cgit v1.2.3


From 1deecf7805f16cbcb3541cc57d8478b8b992a2ab Mon Sep 17 00:00:00 2001
From: LeeYongjun <jun85566@gmail.com>
Date: Sun, 18 Jan 2026 15:55:10 +0900
Subject: selftests: ALSA: Remove unused variable in utimer-test

The variable 'i' in wrong_timers_test() is declared but never used.
This was detected by Cppcheck static analysis.

tools/testing/selftests/alsa/utimer-test.c:144:9: style: Unused variable: i [unusedVariable]

Remove it to clean up the code and silence the warning.

Signed-off-by: LeeYongjun <jun85566@gmail.com>
Link: https://patch.msgid.link/20260118065510.29644-1-jun85566@gmail.com
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 tools/testing/selftests/alsa/utimer-test.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/alsa/utimer-test.c b/tools/testing/selftests/alsa/utimer-test.c
index c45cb226bd8f..d221972cd8fb 100644
--- a/tools/testing/selftests/alsa/utimer-test.c
+++ b/tools/testing/selftests/alsa/utimer-test.c
@@ -141,7 +141,6 @@ TEST_F(timer_f, utimer) {
 TEST(wrong_timers_test) {
 	int timer_dev_fd;
 	int utimer_fd;
-	size_t i;
 	struct snd_timer_uinfo wrong_timer = {
 		.resolution = 0,
 		.id = UTIMER_DEFAULT_ID,
-- 
cgit v1.2.3


From 59cac9d52b885cbeba45fa455417b03dfb03eaa7 Mon Sep 17 00:00:00 2001
From: UYeol Jo <jouyeol8739@gmail.com>
Date: Mon, 12 Jan 2026 06:01:26 +0900
Subject: selftests/x86: Clean up sysret_rip coding style

Tidy up sysret_rip style (cast spacing, main(void), const placement).
No functional change intended.

Signed-off-by: UYeol Jo <jouyeol8739@gmail.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Link: https://patch.msgid.link/20260111210126.74752-1-jouyeol8739@gmail.com
---
 tools/testing/selftests/x86/sysret_rip.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/x86/sysret_rip.c b/tools/testing/selftests/x86/sysret_rip.c
index 5fb531e3ad7c..2e423a335e1c 100644
--- a/tools/testing/selftests/x86/sysret_rip.c
+++ b/tools/testing/selftests/x86/sysret_rip.c
@@ -31,7 +31,7 @@
 void test_syscall_ins(void);
 extern const char test_page[];
 
-static void const *current_test_page_addr = test_page;
+static const void *current_test_page_addr = test_page;
 
 /* State used by our signal handlers. */
 static gregset_t initial_regs;
@@ -40,7 +40,7 @@ static volatile unsigned long rip;
 
 static void sigsegv_for_sigreturn_test(int sig, siginfo_t *info, void *ctx_void)
 {
-	ucontext_t *ctx = (ucontext_t*)ctx_void;
+	ucontext_t *ctx = (ucontext_t *)ctx_void;
 
 	if (rip != ctx->uc_mcontext.gregs[REG_RIP]) {
 		printf("[FAIL]\tRequested RIP=0x%lx but got RIP=0x%lx\n",
@@ -56,7 +56,7 @@ static void sigsegv_for_sigreturn_test(int sig, siginfo_t *info, void *ctx_void)
 
 static void sigusr1(int sig, siginfo_t *info, void *ctx_void)
 {
-	ucontext_t *ctx = (ucontext_t*)ctx_void;
+	ucontext_t *ctx = (ucontext_t *)ctx_void;
 
 	memcpy(&initial_regs, &ctx->uc_mcontext.gregs, sizeof(gregset_t));
 
@@ -69,8 +69,6 @@ static void sigusr1(int sig, siginfo_t *info, void *ctx_void)
 	       ctx->uc_mcontext.gregs[REG_R11]);
 
 	sethandler(SIGSEGV, sigsegv_for_sigreturn_test, SA_RESETHAND);
-
-	return;
 }
 
 static void test_sigreturn_to(unsigned long ip)
@@ -84,7 +82,7 @@ static jmp_buf jmpbuf;
 
 static void sigsegv_for_fallthrough(int sig, siginfo_t *info, void *ctx_void)
 {
-	ucontext_t *ctx = (ucontext_t*)ctx_void;
+	ucontext_t *ctx = (ucontext_t *)ctx_void;
 
 	if (rip != ctx->uc_mcontext.gregs[REG_RIP]) {
 		printf("[FAIL]\tExpected SIGSEGV at 0x%lx but got RIP=0x%lx\n",
@@ -130,7 +128,7 @@ static void test_syscall_fallthrough_to(unsigned long ip)
 	printf("[OK]\tWe survived\n");
 }
 
-int main()
+int main(void)
 {
 	/*
 	 * When the kernel returns from a slow-path syscall, it will
-- 
cgit v1.2.3


From f93bc869825fdba3632ff6ddece4906a6673e679 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Fri, 16 Jan 2026 15:30:35 +0100
Subject: selftests: add dm-verity keyring selftests

Add selftests that verify the keyring behaves correctly.
For simplicity this works with dm-verity as a module.

Signed-off-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
---
 tools/testing/selftests/dm-verity/Makefile         |   5 +
 tools/testing/selftests/dm-verity/config           |  10 +
 .../selftests/dm-verity/test-dm-verity-keyring.sh  | 873 +++++++++++++++++++++
 3 files changed, 888 insertions(+)
 create mode 100644 tools/testing/selftests/dm-verity/Makefile
 create mode 100644 tools/testing/selftests/dm-verity/config
 create mode 100755 tools/testing/selftests/dm-verity/test-dm-verity-keyring.sh

(limited to 'tools')

diff --git a/tools/testing/selftests/dm-verity/Makefile b/tools/testing/selftests/dm-verity/Makefile
new file mode 100644
index 000000000000..b75ee08a54af
--- /dev/null
+++ b/tools/testing/selftests/dm-verity/Makefile
@@ -0,0 +1,5 @@
+# SPDX-License-Identifier: GPL-2.0
+
+TEST_PROGS := test-dm-verity-keyring.sh
+
+include ../lib.mk
diff --git a/tools/testing/selftests/dm-verity/config b/tools/testing/selftests/dm-verity/config
new file mode 100644
index 000000000000..1cd3712fa0a4
--- /dev/null
+++ b/tools/testing/selftests/dm-verity/config
@@ -0,0 +1,10 @@
+CONFIG_BLK_DEV_DM=y
+CONFIG_DM_VERITY=m
+CONFIG_DM_VERITY_VERIFY_ROOTHASH_SIG=y
+CONFIG_MODULE_UNLOAD=y
+CONFIG_KEYS=y
+CONFIG_ASYMMETRIC_KEY_TYPE=y
+CONFIG_ASYMMETRIC_PUBLIC_KEY_SUBTYPE=y
+CONFIG_X509_CERTIFICATE_PARSER=y
+CONFIG_PKCS7_MESSAGE_PARSER=y
+CONFIG_SYSTEM_DATA_VERIFICATION=y
diff --git a/tools/testing/selftests/dm-verity/test-dm-verity-keyring.sh b/tools/testing/selftests/dm-verity/test-dm-verity-keyring.sh
new file mode 100755
index 000000000000..1f9601ef22f8
--- /dev/null
+++ b/tools/testing/selftests/dm-verity/test-dm-verity-keyring.sh
@@ -0,0 +1,873 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Test script for dm-verity keyring functionality
+#
+# This script has two modes depending on kernel configuration:
+#
+# 1. keyring_unsealed=1 AND require_signatures=1:
+#    - Upload a test key to the .dm-verity keyring
+#    - Seal the keyring
+#    - Create a dm-verity device with a signed root hash
+#    - Verify signature verification works
+#
+# 2. keyring_unsealed=0 (default) OR require_signatures=0:
+#    - Verify the keyring is already sealed (if unsealed=0)
+#    - Verify keys cannot be added to a sealed keyring
+#    - Verify the keyring is inactive (not used for verification)
+#
+# Requirements:
+# - Root privileges
+# - openssl
+# - veritysetup (cryptsetup)
+# - keyctl (keyutils)
+
+set -e
+
+WORK_DIR=""
+DATA_DEV=""
+HASH_DEV=""
+DM_NAME="verity-test-$$"
+CLEANUP_DONE=0
+
+# Module parameters (detected at runtime)
+KEYRING_UNSEALED=""
+REQUIRE_SIGNATURES=""
+
+# Colors for output
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+NC='\033[0m' # No Color
+
+log_info() {
+    echo -e "${GREEN}[INFO]${NC} $*"
+}
+
+log_warn() {
+    echo -e "${YELLOW}[WARN]${NC} $*"
+}
+
+log_error() {
+    echo -e "${RED}[ERROR]${NC} $*" >&2
+}
+
+log_pass() {
+    echo -e "${GREEN}[PASS]${NC} $*"
+}
+
+log_fail() {
+    echo -e "${RED}[FAIL]${NC} $*" >&2
+}
+
+log_skip() {
+    echo -e "${YELLOW}[SKIP]${NC} $*"
+}
+
+cleanup() {
+    if [ "$CLEANUP_DONE" -eq 1 ]; then
+        return
+    fi
+    CLEANUP_DONE=1
+
+    log_info "Cleaning up..."
+
+    # Remove dm-verity device if it exists
+    if dmsetup info "$DM_NAME" &>/dev/null; then
+        dmsetup remove "$DM_NAME" 2>/dev/null || true
+    fi
+
+    # Detach loop devices
+    if [ -n "$DATA_DEV" ] && [[ "$DATA_DEV" == /dev/loop* ]]; then
+        losetup -d "$DATA_DEV" 2>/dev/null || true
+    fi
+    if [ -n "$HASH_DEV" ] && [[ "$HASH_DEV" == /dev/loop* ]]; then
+        losetup -d "$HASH_DEV" 2>/dev/null || true
+    fi
+
+    # Remove work directory
+    if [ -n "$WORK_DIR" ] && [ -d "$WORK_DIR" ]; then
+        rm -rf "$WORK_DIR"
+    fi
+}
+
+trap cleanup EXIT
+
+die() {
+    log_error "$*"
+    exit 1
+}
+
+find_dm_verity_keyring() {
+    # The .dm-verity keyring is not linked to user-accessible keyrings,
+    # so we need to find it via /proc/keys
+    local serial_hex
+    serial_hex=$(awk '/\.dm-verity/ {print $1}' /proc/keys 2>/dev/null)
+
+    if [ -z "$serial_hex" ]; then
+        return 1
+    fi
+
+    # Convert hex to decimal for keyctl
+    echo $((16#$serial_hex))
+}
+
+get_module_param() {
+    local param="$1"
+    local path="/sys/module/dm_verity/parameters/$param"
+
+    if [ -f "$path" ]; then
+        cat "$path"
+    else
+        echo ""
+    fi
+}
+
+check_requirements() {
+    log_info "Checking requirements..."
+
+    # Check for root
+    if [ "$(id -u)" -ne 0 ]; then
+        die "This script must be run as root"
+    fi
+
+    # Check for required tools
+    for cmd in openssl veritysetup keyctl losetup dmsetup dd awk; do
+        if ! command -v "$cmd" &>/dev/null; then
+            die "Required command not found: $cmd"
+        fi
+    done
+
+    # Check for dm-verity module
+    if ! modprobe -n dm-verity &>/dev/null; then
+        die "dm-verity module not available"
+    fi
+
+    # Verify OpenSSL can create signatures
+    # OpenSSL cms -sign with -binary -outform DER creates detached signatures by default
+    log_info "Using OpenSSL for PKCS#7 signatures"
+}
+
+load_dm_verity_module() {
+    local keyring_unsealed="${1:-0}"
+    local require_signatures="${2:-0}"
+
+    log_info "Loading dm-verity module with keyring_unsealed=$keyring_unsealed require_signatures=$require_signatures"
+
+    # Unload if already loaded
+    if lsmod | grep -q '^dm_verity'; then
+        log_info "Unloading existing dm-verity module..."
+        modprobe -r dm-verity 2>/dev/null || \
+            die "Failed to unload dm-verity module (may be in use)"
+        sleep 1
+    fi
+
+    # Load with specified parameters
+    modprobe dm-verity keyring_unsealed="$keyring_unsealed" require_signatures="$require_signatures" || \
+        die "Failed to load dm-verity module"
+
+    # Wait for keyring to be created (poll with timeout)
+    local keyring_id=""
+    local timeout=50  # 5 seconds (50 * 0.1s)
+    while [ $timeout -gt 0 ]; do
+        keyring_id=$(find_dm_verity_keyring) && break
+        sleep 0.1
+        timeout=$((timeout - 1))
+    done
+
+    if [ -z "$keyring_id" ]; then
+        die "dm-verity keyring not found after module load (timeout)"
+    fi
+
+    log_info "Found .dm-verity keyring: $keyring_id"
+    echo "$keyring_id" > "$WORK_DIR/keyring_id"
+
+    # Read and display module parameters
+    KEYRING_UNSEALED=$(get_module_param "keyring_unsealed")
+    REQUIRE_SIGNATURES=$(get_module_param "require_signatures")
+
+    log_info "Module parameters:"
+    log_info "  keyring_unsealed=$KEYRING_UNSEALED"
+    log_info "  require_signatures=$REQUIRE_SIGNATURES"
+}
+
+unload_dm_verity_module() {
+    log_info "Unloading dm-verity module..."
+
+    # Clean up any dm-verity devices first
+    local dm_dev
+    while read -r dm_dev _; do
+        [ -n "$dm_dev" ] || continue
+        log_info "Removing dm-verity device: $dm_dev"
+        dmsetup remove "$dm_dev" 2>/dev/null || true
+    done < <(dmsetup ls --target verity 2>/dev/null)
+
+    if lsmod | grep -q '^dm_verity'; then
+        modprobe -r dm-verity 2>/dev/null || \
+            log_warn "Failed to unload dm-verity module"
+        sleep 1
+    fi
+}
+
+generate_keys() {
+    log_info "Generating signing key pair..."
+
+    # Generate private key (2048-bit for faster test execution)
+    openssl genrsa -out "$WORK_DIR/private.pem" 2048 2>/dev/null
+
+    # Create OpenSSL config for certificate extensions
+    # The kernel requires digitalSignature key usage for signature verification
+    # Both subjectKeyIdentifier and authorityKeyIdentifier are needed for
+    # the kernel to match keys in the keyring (especially for self-signed certs)
+    cat > "$WORK_DIR/openssl.cnf" << 'EOF'
+[req]
+distinguished_name = req_distinguished_name
+x509_extensions = v3_ca
+prompt = no
+
+[req_distinguished_name]
+CN = dm-verity-test-key
+
+[v3_ca]
+basicConstraints = critical,CA:FALSE
+keyUsage = digitalSignature
+subjectKeyIdentifier = hash
+authorityKeyIdentifier = keyid
+EOF
+
+    # Generate self-signed certificate with proper extensions
+    openssl req -new -x509 -key "$WORK_DIR/private.pem" \
+        -out "$WORK_DIR/cert.pem" -days 365 \
+        -config "$WORK_DIR/openssl.cnf" 2>/dev/null
+
+    # Convert certificate to DER format for kernel
+    openssl x509 -in "$WORK_DIR/cert.pem" -outform DER \
+        -out "$WORK_DIR/cert.der"
+
+    # Show certificate info for debugging
+    log_info "Certificate details:"
+    openssl x509 -in "$WORK_DIR/cert.pem" -noout -text 2>/dev/null | \
+        grep -E "Subject:|Issuer:|Key Usage|Extended" | head -10
+
+    log_info "Keys generated successfully"
+}
+
+seal_keyring() {
+    log_info "Sealing the .dm-verity keyring..."
+
+    local keyring_id
+    keyring_id=$(cat "$WORK_DIR/keyring_id")
+
+    keyctl restrict_keyring "$keyring_id" || \
+        die "Failed to seal keyring"
+
+    log_info "Keyring sealed successfully"
+}
+
+create_test_device() {
+    log_info "Creating test device images..."
+
+    # Create data image with random content (8MB is sufficient for testing)
+    dd if=/dev/urandom of="$WORK_DIR/data.img" bs=1M count=8 status=none
+
+    # Create hash image (will be populated by veritysetup)
+    dd if=/dev/zero of="$WORK_DIR/hash.img" bs=1M count=1 status=none
+
+    # Setup loop devices
+    DATA_DEV=$(losetup --find --show "$WORK_DIR/data.img")
+    HASH_DEV=$(losetup --find --show "$WORK_DIR/hash.img")
+
+    log_info "Data device: $DATA_DEV"
+    log_info "Hash device: $HASH_DEV"
+}
+
+create_verity_hash() {
+    log_info "Creating dm-verity hash tree..."
+
+    local root_hash output
+    output=$(veritysetup format "$DATA_DEV" "$HASH_DEV" 2>&1)
+    root_hash=$(echo "$output" | grep "Root hash:" | awk '{print $3}')
+
+    if [ -z "$root_hash" ]; then
+        log_error "veritysetup format output:"
+        echo "$output" | sed 's/^/  /'
+        die "Failed to get root hash from veritysetup format"
+    fi
+
+    echo "$root_hash" > "$WORK_DIR/root_hash"
+    log_info "Root hash: $root_hash"
+}
+
+create_detached_signature() {
+    local infile="$1"
+    local outfile="$2"
+    local cert="$3"
+    local key="$4"
+
+    # Use openssl smime (not cms) for PKCS#7 signatures compatible with kernel
+    # Flags from working veritysetup example:
+    #   -nocerts: don't include certificate in signature
+    #   -noattr: no signed attributes
+    #   -binary: binary input mode
+    if openssl smime -sign -nocerts -noattr -binary \
+        -in "$infile" \
+        -inkey "$key" \
+        -signer "$cert" \
+        -outform der \
+        -out "$outfile" 2>/dev/null; then
+        return 0
+    fi
+
+    log_error "Failed to create signature"
+    return 1
+}
+
+activate_verity_device() {
+    local with_sig="$1"
+    local root_hash
+    root_hash=$(cat "$WORK_DIR/root_hash")
+
+    # Clear dmesg and capture any kernel messages during activation
+    dmesg -C 2>/dev/null || true
+
+    if [ "$with_sig" = "yes" ]; then
+        log_info "Activating dm-verity device with signature..."
+        veritysetup open "$DATA_DEV" "$DM_NAME" "$HASH_DEV" "$root_hash" \
+            --root-hash-signature="$WORK_DIR/root_hash.p7s" 2>&1
+        local ret=$?
+    else
+        log_info "Activating dm-verity device without signature..."
+        veritysetup open "$DATA_DEV" "$DM_NAME" "$HASH_DEV" "$root_hash" 2>&1
+        local ret=$?
+    fi
+
+    # Show relevant kernel messages
+    local kmsg
+    kmsg=$(dmesg 2>/dev/null | grep -i -E 'verity|pkcs|signature|asymmetric|key' | tail -10)
+    if [ -n "$kmsg" ]; then
+        log_info "Kernel messages:"
+        echo "$kmsg" | while read -r line; do echo "  $line"; done
+    fi
+
+    return $ret
+}
+
+deactivate_verity_device() {
+    if dmsetup info "$DM_NAME" &>/dev/null; then
+        dmsetup remove "$DM_NAME" 2>/dev/null || true
+    fi
+}
+
+show_keyring_status() {
+    log_info "Keyring status:"
+
+    local keyring_id
+    keyring_id=$(find_dm_verity_keyring) || true
+
+    if [ -n "$keyring_id" ]; then
+        echo "  Keyring ID: $keyring_id"
+        keyctl show "$keyring_id" 2>/dev/null || true
+        grep '\.dm-verity' /proc/keys 2>/dev/null || true
+    fi
+}
+
+list_keyring_keys() {
+    log_info "Keys in .dm-verity keyring:"
+
+    local keyring_id
+    keyring_id=$(cat "$WORK_DIR/keyring_id" 2>/dev/null) || \
+        keyring_id=$(find_dm_verity_keyring) || true
+
+    if [ -z "$keyring_id" ]; then
+        log_warn "Could not find keyring"
+        return
+    fi
+
+    # List all keys in the keyring
+    local keys
+    keys=$(keyctl list "$keyring_id" 2>/dev/null)
+    if [ -z "$keys" ] || [ "$keys" = "keyring is empty" ]; then
+        echo "  (empty)"
+    else
+        echo "$keys" | while read -r line; do
+            echo "  $line"
+        done
+
+        # Show detailed info for each key
+        log_info "Key details:"
+        keyctl list "$keyring_id" 2>/dev/null | awk '{print $1}' | grep -E '^[0-9]+$' | while read -r key_id; do
+            echo "  Key $key_id:"
+            keyctl describe "$key_id" 2>/dev/null | sed 's/^/    /'
+        done
+    fi
+}
+
+generate_named_key() {
+    local name="$1"
+    local key_dir="$WORK_DIR/keys/$name"
+
+    mkdir -p "$key_dir"
+
+    # Log to stderr so it doesn't interfere with return value
+    echo "[INFO] Generating key pair: $name" >&2
+
+    # Generate private key
+    openssl genrsa -out "$key_dir/private.pem" 2048 2>/dev/null
+
+    # Create OpenSSL config for certificate extensions
+    # Both subjectKeyIdentifier and authorityKeyIdentifier are needed for
+    # the kernel to match keys in the keyring (especially for self-signed certs)
+    cat > "$key_dir/openssl.cnf" << EOF
+[req]
+distinguished_name = req_distinguished_name
+x509_extensions = v3_ca
+prompt = no
+
+[req_distinguished_name]
+CN = dm-verity-test-$name
+
+[v3_ca]
+basicConstraints = critical,CA:FALSE
+keyUsage = digitalSignature
+subjectKeyIdentifier = hash
+authorityKeyIdentifier = keyid
+EOF
+
+    # Generate self-signed certificate with proper extensions
+    openssl req -new -x509 -key "$key_dir/private.pem" \
+        -out "$key_dir/cert.pem" -days 365 \
+        -config "$key_dir/openssl.cnf" 2>/dev/null
+
+    # Convert certificate to DER format for kernel
+    openssl x509 -in "$key_dir/cert.pem" -outform DER \
+        -out "$key_dir/cert.der"
+
+    # Return the key directory path (only this goes to stdout)
+    echo "$key_dir"
+}
+
+upload_named_key() {
+    local name="$1"
+    local key_dir="$2"
+
+    local keyring_id
+    keyring_id=$(cat "$WORK_DIR/keyring_id")
+
+    log_info "Uploading key '$name' to keyring..."
+
+    local key_id
+    if key_id=$(keyctl padd asymmetric "$name" "$keyring_id" \
+        < "$key_dir/cert.der" 2>&1); then
+        log_info "Key '$name' uploaded with ID: $key_id"
+        echo "$key_id" > "$key_dir/key_id"
+        return 0
+    else
+        log_error "Failed to upload key '$name': $key_id"
+        return 1
+    fi
+}
+
+#
+# Test: Verify sealed keyring rejects key additions
+#
+test_sealed_keyring_rejects_keys() {
+    log_info "TEST: Verify sealed keyring rejects key additions"
+
+    local keyring_id
+    keyring_id=$(cat "$WORK_DIR/keyring_id")
+
+    generate_keys
+
+    # Try to add a key - should fail
+    if keyctl padd asymmetric "dm-verity-test" "$keyring_id" \
+        < "$WORK_DIR/cert.der" 2>/dev/null; then
+        log_fail "Key addition should have been rejected on sealed keyring"
+        return 1
+    else
+        log_pass "Sealed keyring correctly rejected key addition"
+        return 0
+    fi
+}
+
+#
+# Test: Multiple keys in keyring
+#
+test_multiple_keys() {
+    log_info "TEST: Multiple keys in keyring"
+
+    local key1_dir key2_dir key3_dir
+
+    # Generate three different keys
+    key1_dir=$(generate_named_key "vendor-a")
+    key2_dir=$(generate_named_key "vendor-b")
+    key3_dir=$(generate_named_key "vendor-c")
+
+    # Upload all three keys
+    upload_named_key "vendor-a" "$key1_dir" || return 1
+    upload_named_key "vendor-b" "$key2_dir" || return 1
+    upload_named_key "vendor-c" "$key3_dir" || return 1
+
+    log_info ""
+    log_info "Keys in keyring before sealing:"
+    list_keyring_keys
+    show_keyring_status
+
+    # Seal the keyring
+    log_info ""
+    seal_keyring
+
+    # List keys after sealing
+    log_info ""
+    log_info "Keys in keyring after sealing:"
+    list_keyring_keys
+    show_keyring_status
+
+    log_pass "Key upload and keyring sealing succeeded"
+
+    # Create test device
+    log_info ""
+    create_test_device
+    create_verity_hash
+
+    # Test 1: Sign with key1, should verify successfully
+    log_info ""
+    log_info "Sub-test: Verify with vendor-a key"
+    if ! sign_root_hash_with_key "$key1_dir"; then
+        log_fail "Failed to sign with vendor-a key"
+        return 1
+    fi
+    if activate_verity_device "yes"; then
+        log_pass "Verification with vendor-a key succeeded"
+        deactivate_verity_device
+    else
+        log_fail "Verification with vendor-a key should succeed"
+        return 1
+    fi
+
+    # Test 2: Sign with key2, should also verify successfully
+    log_info ""
+    log_info "Sub-test: Verify with vendor-b key"
+    if ! sign_root_hash_with_key "$key2_dir"; then
+        log_fail "Failed to sign with vendor-b key"
+        return 1
+    fi
+    if activate_verity_device "yes"; then
+        log_pass "Verification with vendor-b key succeeded"
+        deactivate_verity_device
+    else
+        log_fail "Verification with vendor-b key should succeed"
+        return 1
+    fi
+
+    # Test 3: Sign with key3, should also verify successfully
+    log_info ""
+    log_info "Sub-test: Verify with vendor-c key"
+    if ! sign_root_hash_with_key "$key3_dir"; then
+        log_fail "Failed to sign with vendor-c key"
+        return 1
+    fi
+    if activate_verity_device "yes"; then
+        log_pass "Verification with vendor-c key succeeded"
+        deactivate_verity_device
+    else
+        log_fail "Verification with vendor-c key should succeed"
+        return 1
+    fi
+
+    # Test 4: Generate a key NOT in the keyring, should fail
+    log_info ""
+    log_info "Sub-test: Verify with unknown key (should fail)"
+    local unknown_key_dir
+    unknown_key_dir=$(generate_named_key "unknown-vendor")
+    if ! sign_root_hash_with_key "$unknown_key_dir"; then
+        log_fail "Failed to sign with unknown-vendor key"
+        return 1
+    fi
+    if activate_verity_device "yes"; then
+        log_fail "Verification with unknown key should fail"
+        deactivate_verity_device
+        return 1
+    else
+        log_pass "Verification with unknown key correctly rejected"
+    fi
+
+    log_info ""
+    log_pass "Multiple keys test completed successfully"
+    return 0
+}
+
+sign_root_hash_with_key() {
+    local key_dir="$1"
+
+    local root_hash
+    root_hash=$(cat "$WORK_DIR/root_hash")
+
+    # Create the data to sign (hex string, not binary)
+    echo -n "$root_hash" > "$WORK_DIR/root_hash.txt"
+
+    # Debug: show exactly what we're signing
+    log_info "Root hash (hex): $root_hash"
+    log_info "Root hash hex string size: $(wc -c < "$WORK_DIR/root_hash.txt") bytes"
+
+    # Create detached PKCS#7 signature
+    if ! create_detached_signature "$WORK_DIR/root_hash.txt" "$WORK_DIR/root_hash.p7s" \
+            "$key_dir/cert.pem" "$key_dir/private.pem"; then
+        log_error "Failed to sign root hash with key from $key_dir"
+        return 1
+    fi
+
+    # Debug: show signing certificate info
+    log_info "Signed with certificate:"
+    openssl x509 -in "$key_dir/cert.pem" -noout -subject 2>/dev/null | sed 's/^/  /'
+
+    # Debug: verify signature locally
+    # -nointern: cert not in signature, use -certfile
+    # -noverify: skip certificate chain validation (self-signed)
+    if openssl smime -verify -binary -inform der -nointern -noverify \
+        -in "$WORK_DIR/root_hash.p7s" \
+        -content "$WORK_DIR/root_hash.txt" \
+        -certfile "$key_dir/cert.pem" \
+        -out /dev/null 2>/dev/null; then
+        log_info "Local signature verification: PASSED"
+    else
+        log_warn "Local signature verification: FAILED"
+    fi
+    return 0
+}
+
+#
+# Test: Verify corrupted signatures are rejected
+#
+test_corrupted_signature() {
+    log_info "TEST: Verify corrupted signatures are rejected"
+
+    # This test requires a valid setup from test_multiple_keys or similar
+    # It modifies the signature file and verifies rejection
+
+    if [ ! -f "$WORK_DIR/root_hash.p7s" ]; then
+        log_warn "No signature file found, skipping corrupted signature test"
+        return 0
+    fi
+
+    # Save original signature
+    cp "$WORK_DIR/root_hash.p7s" "$WORK_DIR/root_hash.p7s.orig"
+
+    # Test 1: Truncated signature
+    log_info "Sub-test: Truncated signature (should fail)"
+    head -c 100 "$WORK_DIR/root_hash.p7s.orig" > "$WORK_DIR/root_hash.p7s"
+    if activate_verity_device "yes"; then
+        log_fail "Truncated signature should be rejected"
+        deactivate_verity_device
+        cp "$WORK_DIR/root_hash.p7s.orig" "$WORK_DIR/root_hash.p7s"
+        return 1
+    else
+        log_pass "Truncated signature correctly rejected"
+    fi
+
+    # Test 2: Corrupted signature (flip some bytes)
+    log_info "Sub-test: Corrupted signature bytes (should fail)"
+    cp "$WORK_DIR/root_hash.p7s.orig" "$WORK_DIR/root_hash.p7s"
+    # Corrupt bytes in the middle of the signature
+    local sig_size
+    sig_size=$(wc -c < "$WORK_DIR/root_hash.p7s")
+    local corrupt_offset=$((sig_size / 2))
+    printf '\xff\xff\xff\xff' | dd of="$WORK_DIR/root_hash.p7s" bs=1 seek=$corrupt_offset conv=notrunc 2>/dev/null
+    if activate_verity_device "yes"; then
+        log_fail "Corrupted signature should be rejected"
+        deactivate_verity_device
+        cp "$WORK_DIR/root_hash.p7s.orig" "$WORK_DIR/root_hash.p7s"
+        return 1
+    else
+        log_pass "Corrupted signature correctly rejected"
+    fi
+
+    # Test 3: Signature over wrong data (sign different content)
+    log_info "Sub-test: Signature over wrong data (should fail)"
+    # Create a different root hash (all zeros as hex string)
+    printf '%064d' 0 > "$WORK_DIR/wrong_hash.txt"
+    # Get the first key directory that was used
+    local key_dir="$WORK_DIR/keys/vendor-a"
+    if [ -d "$key_dir" ]; then
+        create_detached_signature "$WORK_DIR/wrong_hash.txt" "$WORK_DIR/root_hash.p7s" \
+            "$key_dir/cert.pem" "$key_dir/private.pem"
+        if activate_verity_device "yes"; then
+            log_fail "Signature over wrong data should be rejected"
+            deactivate_verity_device
+            cp "$WORK_DIR/root_hash.p7s.orig" "$WORK_DIR/root_hash.p7s"
+            return 1
+        else
+            log_pass "Signature over wrong data correctly rejected"
+        fi
+    else
+        log_warn "Key directory not found, skipping wrong data test"
+    fi
+
+    # Restore original signature
+    cp "$WORK_DIR/root_hash.p7s.orig" "$WORK_DIR/root_hash.p7s"
+
+    log_pass "Corrupted signature test completed successfully"
+    return 0
+}
+
+#
+# Test: Verify keyring is sealed when keyring_unsealed=0
+#
+test_keyring_sealed_by_default() {
+    log_info "TEST: Verify keyring is sealed by default (keyring_unsealed=0)"
+
+    local keyring_id
+    keyring_id=$(cat "$WORK_DIR/keyring_id")
+
+    log_info "Current keyring state (should be empty and sealed):"
+    list_keyring_keys
+    show_keyring_status
+
+    generate_keys
+
+    # Try to add a key - should fail if keyring is sealed
+    log_info "Attempting to add key to sealed keyring..."
+    if keyctl padd asymmetric "dm-verity-test" "$keyring_id" \
+        < "$WORK_DIR/cert.der" 2>/dev/null; then
+        log_fail "Keyring should be sealed when keyring_unsealed=0"
+        list_keyring_keys
+        return 1
+    else
+        log_pass "Keyring is correctly sealed when keyring_unsealed=0"
+        log_info "Keyring state after failed add attempt:"
+        list_keyring_keys
+        return 0
+    fi
+}
+
+#
+# Test: Verify dm-verity keyring is inactive when sealed empty
+#
+test_keyring_inactive_when_empty() {
+    log_info "TEST: Verify dm-verity keyring is inactive when sealed empty"
+
+    # When keyring_unsealed=0, the keyring is sealed immediately while empty
+    # This means it should NOT be used for verification (nr_leaves_on_tree=0)
+
+    log_info "Keyring state (should be empty and sealed):"
+    list_keyring_keys
+    show_keyring_status
+
+    create_test_device
+    create_verity_hash
+
+    # Without any keys in the dm-verity keyring, and with it sealed,
+    # verification should fall through to the secondary/platform keyrings
+    # and likely succeed (if require_signatures=0) or fail (if =1)
+
+    log_info "Sub-test: Device activation with sealed empty keyring"
+    if [ "$REQUIRE_SIGNATURES" = "Y" ] || [ "$REQUIRE_SIGNATURES" = "1" ]; then
+        if activate_verity_device "no"; then
+            log_fail "Device should NOT activate without signature when require_signatures=1"
+            deactivate_verity_device
+            return 1
+        else
+            log_pass "Device correctly rejected (require_signatures=1, no valid signature)"
+        fi
+    else
+        if activate_verity_device "no"; then
+            log_pass "Device activated (require_signatures=0, empty dm-verity keyring is inactive)"
+            deactivate_verity_device
+        else
+            log_fail "Device should activate when require_signatures=0"
+            return 1
+        fi
+    fi
+
+    return 0
+}
+
+main() {
+    local rc=0
+
+    log_info "=== dm-verity keyring test ==="
+    log_info ""
+
+    # Create work directory
+    WORK_DIR=$(mktemp -d -t dm-verity-test.XXXXXX)
+    log_info "Work directory: $WORK_DIR"
+
+    check_requirements
+
+    #
+    # Test 1: UNSEALED keyring mode (keyring_unsealed=1)
+    #
+    log_info ""
+    log_info "========================================"
+    log_info "=== TEST MODE: UNSEALED KEYRING ==="
+    log_info "========================================"
+    log_info ""
+
+    load_dm_verity_module 1 1  # keyring_unsealed=1, require_signatures=1
+    show_keyring_status
+
+    log_info ""
+    if ! test_multiple_keys; then
+        rc=1
+    fi
+
+    # After sealing, verify it rejects new keys
+    log_info ""
+    if ! test_sealed_keyring_rejects_keys; then
+        rc=1
+    fi
+
+    # Test corrupted signatures are rejected
+    log_info ""
+    if ! test_corrupted_signature; then
+        rc=1
+    fi
+
+    # Clean up devices before reloading module
+    deactivate_verity_device
+    if [ -n "$DATA_DEV" ] && [[ "$DATA_DEV" == /dev/loop* ]]; then
+        losetup -d "$DATA_DEV" 2>/dev/null || true
+        DATA_DEV=""
+    fi
+    if [ -n "$HASH_DEV" ] && [[ "$HASH_DEV" == /dev/loop* ]]; then
+        losetup -d "$HASH_DEV" 2>/dev/null || true
+        HASH_DEV=""
+    fi
+
+    #
+    # Test 2: SEALED keyring mode (keyring_unsealed=0, default)
+    #
+    log_info ""
+    log_info "========================================"
+    log_info "=== TEST MODE: SEALED KEYRING (default) ==="
+    log_info "========================================"
+    log_info ""
+
+    load_dm_verity_module 0 0  # keyring_unsealed=0, require_signatures=0
+    show_keyring_status
+
+    log_info ""
+    if ! test_keyring_sealed_by_default; then
+        rc=1
+    fi
+
+    log_info ""
+    if ! test_keyring_inactive_when_empty; then
+        rc=1
+    fi
+
+    #
+    # Summary
+    #
+    log_info ""
+    log_info "========================================"
+    if [ $rc -eq 0 ]; then
+        log_info "=== All tests PASSED ==="
+    else
+        log_error "=== Some tests FAILED ==="
+    fi
+    log_info "========================================"
+
+    return $rc
+}
+
+main "$@"
-- 
cgit v1.2.3


From 03b7c2d763c907f508edf8c317c0e920ce072a33 Mon Sep 17 00:00:00 2001
From: Alex Mastro <amastro@fb.com>
Date: Wed, 14 Jan 2026 10:57:16 -0800
Subject: vfio: selftests: Centralize IOMMU mode name definitions

Replace scattered string literals with MODE_* macros in iommu.h. This
provides a single source of truth for IOMMU mode name strings.

Signed-off-by: Alex Mastro <amastro@fb.com>
Reviewed-by: David Matlack <dmatlack@google.com>
Tested-by: David Matlack <dmatlack@google.com>
Link: https://lore.kernel.org/r/20260114-map-mmio-test-v3-1-44e036d95e64@fb.com
Signed-off-by: Alex Williamson <alex@shazbot.org>
---
 tools/testing/selftests/vfio/lib/include/libvfio/iommu.h |  6 ++++++
 tools/testing/selftests/vfio/lib/iommu.c                 | 12 ++++++------
 tools/testing/selftests/vfio/vfio_dma_mapping_test.c     |  2 +-
 3 files changed, 13 insertions(+), 7 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/vfio/lib/include/libvfio/iommu.h b/tools/testing/selftests/vfio/lib/include/libvfio/iommu.h
index 5c9b9dc6d993..e9a3386a4719 100644
--- a/tools/testing/selftests/vfio/lib/include/libvfio/iommu.h
+++ b/tools/testing/selftests/vfio/lib/include/libvfio/iommu.h
@@ -61,6 +61,12 @@ iova_t iommu_hva2iova(struct iommu *iommu, void *vaddr);
 
 struct iommu_iova_range *iommu_iova_ranges(struct iommu *iommu, u32 *nranges);
 
+#define MODE_VFIO_TYPE1_IOMMU "vfio_type1_iommu"
+#define MODE_VFIO_TYPE1V2_IOMMU "vfio_type1v2_iommu"
+#define MODE_IOMMUFD_COMPAT_TYPE1 "iommufd_compat_type1"
+#define MODE_IOMMUFD_COMPAT_TYPE1V2 "iommufd_compat_type1v2"
+#define MODE_IOMMUFD "iommufd"
+
 /*
  * Generator for VFIO selftests fixture variants that replicate across all
  * possible IOMMU modes. Tests must define FIXTURE_VARIANT_ADD_IOMMU_MODE()
diff --git a/tools/testing/selftests/vfio/lib/iommu.c b/tools/testing/selftests/vfio/lib/iommu.c
index 58b7fb7430d4..035dac069d60 100644
--- a/tools/testing/selftests/vfio/lib/iommu.c
+++ b/tools/testing/selftests/vfio/lib/iommu.c
@@ -20,32 +20,32 @@
 #include "../../../kselftest.h"
 #include <libvfio.h>
 
-const char *default_iommu_mode = "iommufd";
+const char *default_iommu_mode = MODE_IOMMUFD;
 
 /* Reminder: Keep in sync with FIXTURE_VARIANT_ADD_ALL_IOMMU_MODES(). */
 static const struct iommu_mode iommu_modes[] = {
 	{
-		.name = "vfio_type1_iommu",
+		.name = MODE_VFIO_TYPE1_IOMMU,
 		.container_path = "/dev/vfio/vfio",
 		.iommu_type = VFIO_TYPE1_IOMMU,
 	},
 	{
-		.name = "vfio_type1v2_iommu",
+		.name = MODE_VFIO_TYPE1V2_IOMMU,
 		.container_path = "/dev/vfio/vfio",
 		.iommu_type = VFIO_TYPE1v2_IOMMU,
 	},
 	{
-		.name = "iommufd_compat_type1",
+		.name = MODE_IOMMUFD_COMPAT_TYPE1,
 		.container_path = "/dev/iommu",
 		.iommu_type = VFIO_TYPE1_IOMMU,
 	},
 	{
-		.name = "iommufd_compat_type1v2",
+		.name = MODE_IOMMUFD_COMPAT_TYPE1V2,
 		.container_path = "/dev/iommu",
 		.iommu_type = VFIO_TYPE1v2_IOMMU,
 	},
 	{
-		.name = "iommufd",
+		.name = MODE_IOMMUFD,
 	},
 };
 
diff --git a/tools/testing/selftests/vfio/vfio_dma_mapping_test.c b/tools/testing/selftests/vfio/vfio_dma_mapping_test.c
index 3bf984b337ac..3d2f44f9c62f 100644
--- a/tools/testing/selftests/vfio/vfio_dma_mapping_test.c
+++ b/tools/testing/selftests/vfio/vfio_dma_mapping_test.c
@@ -165,7 +165,7 @@ TEST_F(vfio_dma_mapping_test, dma_map_unmap)
 	 * IOMMUFD compatibility-mode does not support huge mappings when
 	 * using VFIO_TYPE1_IOMMU.
 	 */
-	if (!strcmp(variant->iommu_mode, "iommufd_compat_type1"))
+	if (!strcmp(variant->iommu_mode, MODE_IOMMUFD_COMPAT_TYPE1))
 		mapping_size = SZ_4K;
 
 	ASSERT_EQ(0, rc);
-- 
cgit v1.2.3


From 557dbdf6c4e9c2dc3d4a4476c67ef14dca32378d Mon Sep 17 00:00:00 2001
From: Alex Mastro <amastro@fb.com>
Date: Wed, 14 Jan 2026 10:57:17 -0800
Subject: vfio: selftests: Align BAR mmaps for efficient IOMMU mapping

Update vfio_pci_bar_map() to align BAR mmaps for efficient huge page
mappings. The manual mmap alignment can be removed once mmap(!MAP_FIXED)
on vfio device fds improves to automatically return well-aligned
addresses.

Also add MADV_HUGEPAGE, which encourages the kernel to use huge pages
(e.g. when /sys/kernel/mm/transparent_hugepage/enabled is set to "madvise").

Drop MAP_FILE from mmap(). It is an ignored compatibility flag.

Signed-off-by: Alex Mastro <amastro@fb.com>
Reviewed-by: David Matlack <dmatlack@google.com>
Tested-by: David Matlack <dmatlack@google.com>
Link: https://lore.kernel.org/r/20260114-map-mmio-test-v3-2-44e036d95e64@fb.com
Signed-off-by: Alex Williamson <alex@shazbot.org>
---
 tools/testing/selftests/vfio/lib/include/libvfio.h |  9 ++++++++
 tools/testing/selftests/vfio/lib/libvfio.c         | 25 ++++++++++++++++++++++
 tools/testing/selftests/vfio/lib/vfio_pci_device.c | 24 ++++++++++++++++++++-
 3 files changed, 57 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/vfio/lib/include/libvfio.h b/tools/testing/selftests/vfio/lib/include/libvfio.h
index 279ddcd70194..1b6da54cc2cb 100644
--- a/tools/testing/selftests/vfio/lib/include/libvfio.h
+++ b/tools/testing/selftests/vfio/lib/include/libvfio.h
@@ -23,4 +23,13 @@
 const char *vfio_selftests_get_bdf(int *argc, char *argv[]);
 char **vfio_selftests_get_bdfs(int *argc, char *argv[], int *nr_bdfs);
 
+/*
+ * Reserve virtual address space of size at an address satisfying
+ * (vaddr % align) == offset.
+ *
+ * Returns the reserved vaddr. The caller is responsible for unmapping
+ * the returned region.
+ */
+void *mmap_reserve(size_t size, size_t align, size_t offset);
+
 #endif /* SELFTESTS_VFIO_LIB_INCLUDE_LIBVFIO_H */
diff --git a/tools/testing/selftests/vfio/lib/libvfio.c b/tools/testing/selftests/vfio/lib/libvfio.c
index a23a3cc5be69..3a3d1ed635c1 100644
--- a/tools/testing/selftests/vfio/lib/libvfio.c
+++ b/tools/testing/selftests/vfio/lib/libvfio.c
@@ -2,6 +2,9 @@
 
 #include <stdio.h>
 #include <stdlib.h>
+#include <sys/mman.h>
+
+#include <linux/align.h>
 
 #include "../../../kselftest.h"
 #include <libvfio.h>
@@ -76,3 +79,25 @@ const char *vfio_selftests_get_bdf(int *argc, char *argv[])
 
 	return vfio_selftests_get_bdfs(argc, argv, &nr_bdfs)[0];
 }
+
+void *mmap_reserve(size_t size, size_t align, size_t offset)
+{
+	void *map_base, *map_align;
+	size_t delta;
+
+	VFIO_ASSERT_GT(align, offset);
+	delta = align - offset;
+
+	map_base = mmap(NULL, size + align, PROT_NONE,
+			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	VFIO_ASSERT_NE(map_base, MAP_FAILED);
+
+	map_align = (void *)(ALIGN((uintptr_t)map_base + delta, align) - delta);
+
+	if (map_align > map_base)
+		VFIO_ASSERT_EQ(munmap(map_base, map_align - map_base), 0);
+
+	VFIO_ASSERT_EQ(munmap(map_align + size, map_base + align - map_align), 0);
+
+	return map_align;
+}
diff --git a/tools/testing/selftests/vfio/lib/vfio_pci_device.c b/tools/testing/selftests/vfio/lib/vfio_pci_device.c
index fac4c0ecadef..4e5871f1ebc3 100644
--- a/tools/testing/selftests/vfio/lib/vfio_pci_device.c
+++ b/tools/testing/selftests/vfio/lib/vfio_pci_device.c
@@ -11,10 +11,14 @@
 #include <sys/ioctl.h>
 #include <sys/mman.h>
 
+#include <linux/align.h>
 #include <linux/iommufd.h>
+#include <linux/kernel.h>
 #include <linux/limits.h>
+#include <linux/log2.h>
 #include <linux/mman.h>
 #include <linux/overflow.h>
+#include <linux/sizes.h>
 #include <linux/types.h>
 #include <linux/vfio.h>
 
@@ -123,20 +127,38 @@ static void vfio_pci_region_get(struct vfio_pci_device *device, int index,
 static void vfio_pci_bar_map(struct vfio_pci_device *device, int index)
 {
 	struct vfio_pci_bar *bar = &device->bars[index];
+	size_t align, size;
 	int prot = 0;
+	void *vaddr;
 
 	VFIO_ASSERT_LT(index, PCI_STD_NUM_BARS);
 	VFIO_ASSERT_NULL(bar->vaddr);
 	VFIO_ASSERT_TRUE(bar->info.flags & VFIO_REGION_INFO_FLAG_MMAP);
+	VFIO_ASSERT_TRUE(is_power_of_2(bar->info.size));
 
 	if (bar->info.flags & VFIO_REGION_INFO_FLAG_READ)
 		prot |= PROT_READ;
 	if (bar->info.flags & VFIO_REGION_INFO_FLAG_WRITE)
 		prot |= PROT_WRITE;
 
-	bar->vaddr = mmap(NULL, bar->info.size, prot, MAP_FILE | MAP_SHARED,
+	size = bar->info.size;
+
+	/*
+	 * Align BAR mmaps to improve page fault granularity during potential
+	 * subsequent IOMMU mapping of these BAR vaddr. 1G for x86 is the
+	 * largest hugepage size across any architecture, so no benefit from
+	 * larger alignment. BARs smaller than 1G will be aligned by their
+	 * power-of-two size, guaranteeing sufficient alignment for smaller
+	 * hugepages, if present.
+	 */
+	align = min_t(size_t, size, SZ_1G);
+
+	vaddr = mmap_reserve(size, align, 0);
+	bar->vaddr = mmap(vaddr, size, prot, MAP_SHARED | MAP_FIXED,
 			  device->fd, bar->info.offset);
 	VFIO_ASSERT_NE(bar->vaddr, MAP_FAILED);
+
+	madvise(bar->vaddr, size, MADV_HUGEPAGE);
 }
 
 static void vfio_pci_bar_unmap(struct vfio_pci_device *device, int index)
-- 
cgit v1.2.3


From 080723f4d4c3c6fb0720aae614deb1f30ee9ef2e Mon Sep 17 00:00:00 2001
From: Alex Mastro <amastro@fb.com>
Date: Wed, 14 Jan 2026 10:57:18 -0800
Subject: vfio: selftests: Add vfio_dma_mapping_mmio_test

Test IOMMU mapping the BAR mmaps created during vfio_pci_device_setup().

All IOMMU modes are tested: vfio_type1 variants are expected to succeed,
while non-type1 modes are expected to fail. iommufd compat mode can be
updated to expect success once kernel support lands. Native iommufd will
not support mapping vaddrs backed by MMIO (it will support dma-buf based
MMIO mapping instead).

Signed-off-by: Alex Mastro <amastro@fb.com>
Reviewed-by: David Matlack <dmatlack@google.com>
Tested-by: David Matlack <dmatlack@google.com>
Link: https://lore.kernel.org/r/20260114-map-mmio-test-v3-3-44e036d95e64@fb.com
Signed-off-by: Alex Williamson <alex@shazbot.org>
---
 tools/testing/selftests/vfio/Makefile              |   1 +
 .../selftests/vfio/vfio_dma_mapping_mmio_test.c    | 143 +++++++++++++++++++++
 2 files changed, 144 insertions(+)
 create mode 100644 tools/testing/selftests/vfio/vfio_dma_mapping_mmio_test.c

(limited to 'tools')

diff --git a/tools/testing/selftests/vfio/Makefile b/tools/testing/selftests/vfio/Makefile
index 3c796ca99a50..ead27892ab65 100644
--- a/tools/testing/selftests/vfio/Makefile
+++ b/tools/testing/selftests/vfio/Makefile
@@ -1,5 +1,6 @@
 CFLAGS = $(KHDR_INCLUDES)
 TEST_GEN_PROGS += vfio_dma_mapping_test
+TEST_GEN_PROGS += vfio_dma_mapping_mmio_test
 TEST_GEN_PROGS += vfio_iommufd_setup_test
 TEST_GEN_PROGS += vfio_pci_device_test
 TEST_GEN_PROGS += vfio_pci_device_init_perf_test
diff --git a/tools/testing/selftests/vfio/vfio_dma_mapping_mmio_test.c b/tools/testing/selftests/vfio/vfio_dma_mapping_mmio_test.c
new file mode 100644
index 000000000000..957a89ce7b3a
--- /dev/null
+++ b/tools/testing/selftests/vfio/vfio_dma_mapping_mmio_test.c
@@ -0,0 +1,143 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <stdio.h>
+#include <sys/mman.h>
+#include <unistd.h>
+
+#include <uapi/linux/types.h>
+#include <linux/pci_regs.h>
+#include <linux/sizes.h>
+#include <linux/vfio.h>
+
+#include <libvfio.h>
+
+#include "../kselftest_harness.h"
+
+static const char *device_bdf;
+
+static struct vfio_pci_bar *largest_mapped_bar(struct vfio_pci_device *device)
+{
+	u32 flags = VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE;
+	struct vfio_pci_bar *largest = NULL;
+	u64 bar_size = 0;
+
+	for (int i = 0; i < PCI_STD_NUM_BARS; i++) {
+		struct vfio_pci_bar *bar = &device->bars[i];
+
+		if (!bar->vaddr)
+			continue;
+
+		/*
+		 * iommu_map() maps with READ|WRITE, so require the same
+		 * abilities for the underlying VFIO region.
+		 */
+		if ((bar->info.flags & flags) != flags)
+			continue;
+
+		if (bar->info.size > bar_size) {
+			bar_size = bar->info.size;
+			largest = bar;
+		}
+	}
+
+	return largest;
+}
+
+FIXTURE(vfio_dma_mapping_mmio_test) {
+	struct iommu *iommu;
+	struct vfio_pci_device *device;
+	struct iova_allocator *iova_allocator;
+	struct vfio_pci_bar *bar;
+};
+
+FIXTURE_VARIANT(vfio_dma_mapping_mmio_test) {
+	const char *iommu_mode;
+};
+
+#define FIXTURE_VARIANT_ADD_IOMMU_MODE(_iommu_mode)			       \
+FIXTURE_VARIANT_ADD(vfio_dma_mapping_mmio_test, _iommu_mode) {		       \
+	.iommu_mode = #_iommu_mode,					       \
+}
+
+FIXTURE_VARIANT_ADD_ALL_IOMMU_MODES();
+
+#undef FIXTURE_VARIANT_ADD_IOMMU_MODE
+
+FIXTURE_SETUP(vfio_dma_mapping_mmio_test)
+{
+	self->iommu = iommu_init(variant->iommu_mode);
+	self->device = vfio_pci_device_init(device_bdf, self->iommu);
+	self->iova_allocator = iova_allocator_init(self->iommu);
+	self->bar = largest_mapped_bar(self->device);
+
+	if (!self->bar)
+		SKIP(return, "No mappable BAR found on device %s", device_bdf);
+}
+
+FIXTURE_TEARDOWN(vfio_dma_mapping_mmio_test)
+{
+	iova_allocator_cleanup(self->iova_allocator);
+	vfio_pci_device_cleanup(self->device);
+	iommu_cleanup(self->iommu);
+}
+
+static void do_mmio_map_test(struct iommu *iommu,
+			     struct iova_allocator *iova_allocator,
+			     void *vaddr, size_t size)
+{
+	struct dma_region region = {
+		.vaddr = vaddr,
+		.size = size,
+		.iova = iova_allocator_alloc(iova_allocator, size),
+	};
+
+	/*
+	 * NOTE: Check for iommufd compat success once it lands. Native iommufd
+	 * will never support this.
+	 */
+	if (!strcmp(iommu->mode->name, MODE_VFIO_TYPE1V2_IOMMU) ||
+	    !strcmp(iommu->mode->name, MODE_VFIO_TYPE1_IOMMU)) {
+		iommu_map(iommu, &region);
+		iommu_unmap(iommu, &region);
+	} else {
+		VFIO_ASSERT_NE(__iommu_map(iommu, &region), 0);
+		VFIO_ASSERT_NE(__iommu_unmap(iommu, &region, NULL), 0);
+	}
+}
+
+TEST_F(vfio_dma_mapping_mmio_test, map_full_bar)
+{
+	do_mmio_map_test(self->iommu, self->iova_allocator,
+			 self->bar->vaddr, self->bar->info.size);
+}
+
+TEST_F(vfio_dma_mapping_mmio_test, map_partial_bar)
+{
+	if (self->bar->info.size < 2 * getpagesize())
+		SKIP(return, "BAR too small (size=0x%llx)", self->bar->info.size);
+
+	do_mmio_map_test(self->iommu, self->iova_allocator,
+			 self->bar->vaddr, getpagesize());
+}
+
+/* Test IOMMU mapping of BAR mmap with intentionally poor vaddr alignment. */
+TEST_F(vfio_dma_mapping_mmio_test, map_bar_misaligned)
+{
+	/* Limit size to bound test time for large BARs */
+	size_t size = min_t(size_t, self->bar->info.size, SZ_1G);
+	void *vaddr;
+
+	vaddr = mmap_reserve(size, SZ_1G, getpagesize());
+	vaddr = mmap(vaddr, size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FIXED,
+		     self->device->fd, self->bar->info.offset);
+	VFIO_ASSERT_NE(vaddr, MAP_FAILED);
+
+	do_mmio_map_test(self->iommu, self->iova_allocator, vaddr, size);
+
+	VFIO_ASSERT_EQ(munmap(vaddr, size), 0);
+}
+
+int main(int argc, char *argv[])
+{
+	device_bdf = vfio_selftests_get_bdf(&argc, argv);
+	return test_harness_run(argc, argv);
+}
-- 
cgit v1.2.3


From 1c588bca3bd5b39c93a28a5986bf82ebfb05eec2 Mon Sep 17 00:00:00 2001
From: David Matlack <dmatlack@google.com>
Date: Wed, 14 Jan 2026 21:12:52 +0000
Subject: vfio: selftests: Drop IOMMU mapping size assertions for
 VFIO_TYPE1_IOMMU

Drop the assertions about IOMMU mappings sizes for VFIO_TYPE1_IOMMU
modes (both the VFIO mode and the iommufd compatibility mode). These
assertions fail when CONFIG_IOMMUFD_VFIO_CONTAINER is enabled, since
iommufd compatibility mode provides different huge page behavior than
VFIO for VFIO_TYPE1_IOMMU. VFIO_TYPE1_IOMMU is an old enough interface
that it's not worth changing the behavior of VFIO and iommufd to match
nor care about the IOMMU mapping sizes.

Cc: Jason Gunthorpe <jgg@ziepe.ca>
Link: https://lore.kernel.org/kvm/20260109143830.176dc279@shazbot.org/
Signed-off-by: David Matlack <dmatlack@google.com>
Link: https://lore.kernel.org/r/20260114211252.2581145-1-dmatlack@google.com
Signed-off-by: Alex Williamson <alex@shazbot.org>
---
 tools/testing/selftests/vfio/vfio_dma_mapping_test.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/vfio/vfio_dma_mapping_test.c b/tools/testing/selftests/vfio/vfio_dma_mapping_test.c
index 3d2f44f9c62f..abb170bdcef7 100644
--- a/tools/testing/selftests/vfio/vfio_dma_mapping_test.c
+++ b/tools/testing/selftests/vfio/vfio_dma_mapping_test.c
@@ -161,12 +161,8 @@ TEST_F(vfio_dma_mapping_test, dma_map_unmap)
 	if (rc == -EOPNOTSUPP)
 		goto unmap;
 
-	/*
-	 * IOMMUFD compatibility-mode does not support huge mappings when
-	 * using VFIO_TYPE1_IOMMU.
-	 */
-	if (!strcmp(variant->iommu_mode, MODE_IOMMUFD_COMPAT_TYPE1))
-		mapping_size = SZ_4K;
+	if (self->iommu->mode->iommu_type == VFIO_TYPE1_IOMMU)
+		goto unmap;
 
 	ASSERT_EQ(0, rc);
 	printf("Found IOMMU mappings for IOVA 0x%lx:\n", region.iova);
-- 
cgit v1.2.3


From 8becfe16e4a12218c703a98f5bfc15b6f0fbd99c Mon Sep 17 00:00:00 2001
From: Dmitry Skorodumov <dskr99@gmail.com>
Date: Mon, 12 Jan 2026 17:24:07 +0300
Subject: selftests: net: simple selftest for ipvtap

This is a simple ipvtap test to test handling
IP-address add/remove on ipvlan interface.

It creates a veth-interface and then creates several
network-namespace with ipvlan0 interface in it linked to veth.

Then it starts to add/remove addresses on ipvlan0 interfaces
in several threads.

At finish, it checks that there is no duplicated addresses.

Signed-off-by: Dmitry Skorodumov <skorodumov.dmitry@huawei.com>
Link: https://patch.msgid.link/20260112142417.4039566-3-skorodumov.dmitry@huawei.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/Makefile       |   1 +
 tools/testing/selftests/net/config         |   2 +
 tools/testing/selftests/net/ipvtap_test.sh | 168 +++++++++++++++++++++++++++++
 3 files changed, 171 insertions(+)
 create mode 100755 tools/testing/selftests/net/ipvtap_test.sh

(limited to 'tools')

diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile
index b66ba04f19d9..45c4ea381bc3 100644
--- a/tools/testing/selftests/net/Makefile
+++ b/tools/testing/selftests/net/Makefile
@@ -48,6 +48,7 @@ TEST_PROGS := \
 	ipv6_flowlabel.sh \
 	ipv6_force_forwarding.sh \
 	ipv6_route_update_soft_lockup.sh \
+	ipvtap_test.sh \
 	l2_tos_ttl_inherit.sh \
 	l2tp.sh \
 	link_netns.py \
diff --git a/tools/testing/selftests/net/config b/tools/testing/selftests/net/config
index 1e1f253118f5..b84362b9b508 100644
--- a/tools/testing/selftests/net/config
+++ b/tools/testing/selftests/net/config
@@ -48,6 +48,7 @@ CONFIG_IPV6_SEG6_LWTUNNEL=y
 CONFIG_IPV6_SIT=y
 CONFIG_IPV6_VTI=y
 CONFIG_IPVLAN=m
+CONFIG_IPVTAP=m
 CONFIG_KALLSYMS=y
 CONFIG_L2TP=m
 CONFIG_L2TP_ETH=m
@@ -116,6 +117,7 @@ CONFIG_PROC_SYSCTL=y
 CONFIG_PSAMPLE=m
 CONFIG_RPS=y
 CONFIG_SYSFS=y
+CONFIG_TAP=m
 CONFIG_TCP_MD5SIG=y
 CONFIG_TEST_BLACKHOLE_DEV=m
 CONFIG_TEST_BPF=m
diff --git a/tools/testing/selftests/net/ipvtap_test.sh b/tools/testing/selftests/net/ipvtap_test.sh
new file mode 100755
index 000000000000..354ca7ce8584
--- /dev/null
+++ b/tools/testing/selftests/net/ipvtap_test.sh
@@ -0,0 +1,168 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Simple tests for ipvtap
+
+
+#
+# The testing environment looks this way:
+#
+# |------HNS-------|     |------PHY-------|
+# |      veth<----------------->veth      |
+# |------|--|------|     |----------------|
+#        |  |
+#        |  |            |-----TST0-------|
+#        |  |------------|----ipvlan      |
+#        |               |----------------|
+#        |
+#        |               |-----TST1-------|
+#        |---------------|----ipvlan      |
+#                        |----------------|
+#
+
+ALL_TESTS="
+	test_ip_set
+"
+
+source lib.sh
+
+DEBUG=0
+
+VETH_HOST=vethtst.h
+VETH_PHY=vethtst.p
+
+NS_COUNT=32
+IP_ITERATIONS=1024
+IPSET_TIMEOUT="60s"
+
+ns_run() {
+	ns=$1
+	shift
+	if [[ "$ns" == "global" ]]; then
+		"$@" >/dev/null
+	else
+		ip netns exec "$ns" "$@" >/dev/null
+	fi
+}
+
+test_ip_setup_env() {
+	setup_ns NS_PHY
+	setup_ns HST_NS
+
+	# setup simulated other-host (phy) and host itself
+	ns_run "$HST_NS" ip link add $VETH_HOST type veth peer name $VETH_PHY \
+		netns "$NS_PHY" >/dev/null
+	ns_run "$HST_NS" ip link set $VETH_HOST up
+	ns_run "$NS_PHY" ip link set $VETH_PHY up
+
+	for ((i=0; i<NS_COUNT; i++)); do
+		setup_ns ipvlan_ns_$i
+		ns="ipvlan_ns_$i"
+		if [ "$DEBUG" = "1" ]; then
+			echo "created NS ${!ns}"
+		fi
+		if ! ns_run "$HST_NS" ip link add netns ${!ns} ipvlan0 \
+		    link $VETH_HOST \
+		    type ipvtap mode l2 bridge; then
+			exit_error "FAIL: Failed to configure ipvlan link."
+		fi
+	done
+}
+
+test_ip_cleanup_env() {
+	ns_run "$HST_NS" ip link del $VETH_HOST
+	cleanup_all_ns
+}
+
+exit_error() {
+	echo "$1"
+	exit $ksft_fail
+}
+
+rnd() {
+	echo $(( RANDOM % 32 + 16 ))
+}
+
+test_ip_set_thread() {
+	# Here we are trying to create some IP conflicts between namespaces.
+	# If just add/remove IP, nothing interesting will happen.
+	# But if add random IP and then remove random IP,
+	# eventually conflicts start to apear.
+	ip link set ipvlan0 up
+	for ((i=0; i<IP_ITERATIONS; i++)); do
+		v=$(rnd)
+		ip a a "172.25.0.$v/24" dev ipvlan0 2>/dev/null
+		ip a a "fc00::$v/64" dev ipvlan0 2>/dev/null
+		v=$(rnd)
+		ip a d "172.25.0.$v/24" dev ipvlan0 2>/dev/null
+		ip a d "fc00::$v/64" dev ipvlan0 2>/dev/null
+	done
+}
+
+test_ip_set() {
+	RET=0
+
+	trap test_ip_cleanup_env EXIT
+
+	test_ip_setup_env
+
+	declare -A ns_pids
+	for ((i=0; i<NS_COUNT; i++)); do
+		ns="ipvlan_ns_$i"
+		ns_run ${!ns} timeout "$IPSET_TIMEOUT" \
+			bash -c "$0 test_ip_set_thread"&
+		ns_pids[$i]=$!
+	done
+
+	for ((i=0; i<NS_COUNT; i++)); do
+		wait "${ns_pids[$i]}"
+	done
+
+	declare -A all_ips
+	for ((i=0; i<NS_COUNT; i++)); do
+		ns="ipvlan_ns_$i"
+		ip_output=$(ip netns exec ${!ns} ip a l dev ipvlan0 | grep inet)
+		while IFS= read -r nsip_out; do
+			if [[ -z $nsip_out ]]; then
+				continue;
+			fi
+			nsip=$(awk '{print $2}' <<< "$nsip_out")
+			if [[ -v all_ips[$nsip] ]]; then
+				RET=$ksft_fail
+				log_test "conflict for $nsip"
+				return "$RET"
+			else
+				all_ips[$nsip]=$i
+			fi
+		done <<< "$ip_output"
+	done
+
+	if [ "$DEBUG" = "1" ]; then
+		for key in "${!all_ips[@]}"; do
+			echo "$key: ${all_ips[$key]}"
+		done
+	fi
+
+	trap - EXIT
+	test_ip_cleanup_env
+
+	log_test "test multithreaded ip set"
+}
+
+if [[ "$1" == "-d" ]]; then
+	DEBUG=1
+	shift
+fi
+
+if [[ "$1" == "-t" ]]; then
+	shift
+	TESTS="$*"
+fi
+
+if [[ "$1" == "test_ip_set_thread" ]]; then
+	test_ip_set_thread
+else
+	require_command ip
+
+	tests_run
+fi
-- 
cgit v1.2.3


From d321d505edb64286bae0e464574d0fd553e31adc Mon Sep 17 00:00:00 2001
From: "Christophe Leroy (CS GROUP)" <chleroy@kernel.org>
Date: Fri, 16 Jan 2026 10:48:55 +0100
Subject: selftests: net: csum: Fix printk format in
 recv_get_packet_csum_status()

Following warning is encountered when building selftests on powerpc/32.

  CC       csum
csum.c: In function 'recv_get_packet_csum_status':
csum.c:710:50: warning: format '%lu' expects argument of type 'long unsigned int', but argument 4 has type 'size_t' {aka 'unsigned int'} [-Wformat=]
  710 |                         error(1, 0, "cmsg: len=%lu expected=%lu",
      |                                                ~~^
      |                                                  |
      |                                                  long unsigned int
      |                                                %u
  711 |                               cm->cmsg_len, CMSG_LEN(sizeof(struct tpacket_auxdata)));
      |                               ~~~~~~~~~~~~
      |                                 |
      |                                 size_t {aka unsigned int}
csum.c:710:63: warning: format '%lu' expects argument of type 'long unsigned int', but argument 5 has type 'unsigned int' [-Wformat=]
  710 |                         error(1, 0, "cmsg: len=%lu expected=%lu",
      |                                                             ~~^
      |                                                               |
      |                                                               long unsigned int
      |                                                             %u

cm->cmsg_len has type __kernel_size_t and CMSG() macro has the type
returned by sizeof() which is size_t.

size_t is 'unsigned int' on some platforms and 'unsigned long' on
other ones so use %zu instead of %lu.

The code in question was introduced by
commit 91a7de85600d ("selftests/net: add csum offload test").

Signed-off-by: Christophe Leroy (CS GROUP) <chleroy@kernel.org>
Reviewed-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Link: https://patch.msgid.link/8b69b40826553c1dd500d9d25e45883744f3f348.1768556791.git.chleroy@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/lib/csum.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/lib/csum.c b/tools/testing/selftests/net/lib/csum.c
index 27437590eeb5..e28884ce3ab3 100644
--- a/tools/testing/selftests/net/lib/csum.c
+++ b/tools/testing/selftests/net/lib/csum.c
@@ -707,7 +707,7 @@ static uint32_t recv_get_packet_csum_status(struct msghdr *msg)
 			      cm->cmsg_level, cm->cmsg_type);
 
 		if (cm->cmsg_len != CMSG_LEN(sizeof(struct tpacket_auxdata)))
-			error(1, 0, "cmsg: len=%lu expected=%lu",
+			error(1, 0, "cmsg: len=%zu expected=%zu",
 			      cm->cmsg_len, CMSG_LEN(sizeof(struct tpacket_auxdata)));
 
 		aux = (void *)CMSG_DATA(cm);
-- 
cgit v1.2.3


From 2460f31e6e444a52a4e718e4fe64cff29ffaab05 Mon Sep 17 00:00:00 2001
From: Victor Nogueira <victor@mojatatu.com>
Date: Wed, 14 Jan 2026 11:02:43 -0500
Subject: selftests/tc-testing: Try to add teql as a child qdisc

Add a selftest that attempts to add a teql qdisc as a qfq child.
Since teql _must_ be added as a root qdisc, the kernel should reject
this.

Signed-off-by: Victor Nogueira <victor@mojatatu.com>
Link: https://patch.msgid.link/20260114160243.913069-4-jhs@mojatatu.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../selftests/tc-testing/tc-tests/qdiscs/teql.json | 25 ++++++++++++++++++++++
 1 file changed, 25 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/teql.json b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/teql.json
index e5cc31f265f8..0179c57104ad 100644
--- a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/teql.json
+++ b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/teql.json
@@ -81,5 +81,30 @@
             "$TC qdisc del dev $DUMMY handle 1: root",
             "$IP link del dev $DUMMY"
         ]
+    },
+    {
+        "id": "124e",
+        "name": "Try to add teql as a child qdisc",
+        "category": [
+            "qdisc",
+            "ets",
+            "tbf"
+        ],
+        "plugins": {
+            "requires": [
+                "nsPlugin"
+            ]
+        },
+        "setup": [
+            "$TC qdisc add dev $DUMMY root handle 1: qfq",
+            "$TC class add dev $DUMMY parent 1: classid 1:1 qfq weight 15 maxpkt 16384"
+        ],
+        "cmdUnderTest": "$TC qdisc add dev $DUMMY parent 1:1 handle 2:1 teql0",
+        "expExitCode": "2",
+        "verifyCmd": "$TC -s -j qdisc ls dev $DUMMY parent 1:1",
+        "matchJSON": [],
+        "teardown": [
+            "$TC qdisc del dev $DUMMY root handle 1:"
+        ]
     }
 ]
-- 
cgit v1.2.3


From a5546e18f77c0cb15d434bf5b92647687fe483e3 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Thu, 15 Jan 2026 09:25:48 +0100
Subject: net: Add queue-create operation

Add a ynl netdev family operation called queue-create that creates a
new queue on a netdevice:

      name: queue-create
      attribute-set: queue
      flags: [admin-perm]
      do:
        request:
          attributes:
            - ifindex
            - type
            - lease
        reply: &queue-create-op
          attributes:
            - id

This is a generic operation such that it can be extended for various
use cases in future. Right now it is mandatory to specify ifindex,
the queue type which is enforced to rx and a lease. The newly created
queue id is returned to the caller.

A queue from a virtual device can have a lease which refers to another
queue from a physical device. This is useful for memory providers
and AF_XDP operations which take an ifindex and queue id to allow
applications to bind against virtual devices in containers. The lease
couples both queues together and allows to proxy the operations from
a virtual device in a container to the physical device.

In future, the nested lease attribute can be lifted and made optional
for other use-cases such as dynamic queue creation for physical
netdevs. The lack of lease and the specification of the physical
device as an ifindex will imply that we need a real queue to be
allocated. Similarly, the queue type enforcement to rx can then be
lifted as well to support tx.

An early implementation had only driver-specific integration [0], but
in order for other virtual devices to reuse, it makes sense to have
this as a generic API in core net.

For leasing queues, the virtual netdev must have real_num_rx_queue
less than num_rx_queues at the time of calling queue-create. The
queue-type must be rx as only rx queues are supported for leasing
for now. We also enforce that the queue-create ifindex must point
to a virtual device, and that the nested lease attribute's ifindex
must point to a physical device. The nested lease attribute set
contains a netns-id attribute which is currently only intended for
dumping as part of the queue-get operation. Also, it is modeled as
an s32 type similarly as done elsewhere in the stack.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Co-developed-by: David Wei <dw@davidwei.uk>
Signed-off-by: David Wei <dw@davidwei.uk>
Link: https://bpfconf.ebpf.io/bpfconf2025/bpfconf2025_material/lsfmmbpf_2025_netkit_borkmann.pdf [0]
Acked-by: Stanislav Fomichev <sdf@fomichev.me>
Reviewed-by: Nikolay Aleksandrov <razor@blackwall.org>
Link: https://patch.msgid.link/20260115082603.219152-2-daniel@iogearbox.net
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 Documentation/netlink/specs/netdev.yaml | 44 +++++++++++++++++++++++++++++++++
 include/uapi/linux/netdev.h             | 11 +++++++++
 net/core/netdev-genl-gen.c              | 20 +++++++++++++++
 net/core/netdev-genl-gen.h              |  2 ++
 net/core/netdev-genl.c                  |  5 ++++
 tools/include/uapi/linux/netdev.h       | 11 +++++++++
 6 files changed, 93 insertions(+)

(limited to 'tools')

diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml
index 596c306ce52b..b86db8656eac 100644
--- a/Documentation/netlink/specs/netdev.yaml
+++ b/Documentation/netlink/specs/netdev.yaml
@@ -339,6 +339,15 @@ attribute-sets:
         doc: XSK information for this queue, if any.
         type: nest
         nested-attributes: xsk-info
+      -
+        name: lease
+        doc: |
+          A queue from a virtual device can have a lease which refers to
+          another queue from a physical device. This is useful for memory
+          providers and AF_XDP operations which take an ifindex and queue id
+          to allow applications to bind against virtual devices in containers.
+        type: nest
+        nested-attributes: lease
   -
     name: qstats
     doc: |
@@ -537,6 +546,24 @@ attribute-sets:
         name: id
       -
         name: type
+  -
+    name: lease
+    attributes:
+      -
+        name: ifindex
+        doc: The netdev ifindex to lease the queue from.
+        type: u32
+        checks:
+          min: 1
+      -
+        name: queue
+        doc: The netdev queue to lease from.
+        type: nest
+        nested-attributes: queue-id
+      -
+        name: netns-id
+        doc: The network namespace id of the netdev.
+        type: s32
   -
     name: dmabuf
     attributes:
@@ -686,6 +713,7 @@ operations:
             - dmabuf
             - io-uring
             - xsk
+            - lease
       dump:
         request:
           attributes:
@@ -797,6 +825,22 @@ operations:
         reply:
           attributes:
             - id
+    -
+      name: queue-create
+      doc: |
+        Create a new queue for the given netdevice. Whether this operation
+        is supported depends on the device and the driver.
+      attribute-set: queue
+      flags: [admin-perm]
+      do:
+        request:
+          attributes:
+            - ifindex
+            - type
+            - lease
+        reply: &queue-create-op
+          attributes:
+            - id
 
 kernel-family:
   headers: ["net/netdev_netlink.h"]
diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h
index e0b579a1df4f..7df1056a35fd 100644
--- a/include/uapi/linux/netdev.h
+++ b/include/uapi/linux/netdev.h
@@ -160,6 +160,7 @@ enum {
 	NETDEV_A_QUEUE_DMABUF,
 	NETDEV_A_QUEUE_IO_URING,
 	NETDEV_A_QUEUE_XSK,
+	NETDEV_A_QUEUE_LEASE,
 
 	__NETDEV_A_QUEUE_MAX,
 	NETDEV_A_QUEUE_MAX = (__NETDEV_A_QUEUE_MAX - 1)
@@ -202,6 +203,15 @@ enum {
 	NETDEV_A_QSTATS_MAX = (__NETDEV_A_QSTATS_MAX - 1)
 };
 
+enum {
+	NETDEV_A_LEASE_IFINDEX = 1,
+	NETDEV_A_LEASE_QUEUE,
+	NETDEV_A_LEASE_NETNS_ID,
+
+	__NETDEV_A_LEASE_MAX,
+	NETDEV_A_LEASE_MAX = (__NETDEV_A_LEASE_MAX - 1)
+};
+
 enum {
 	NETDEV_A_DMABUF_IFINDEX = 1,
 	NETDEV_A_DMABUF_QUEUES,
@@ -228,6 +238,7 @@ enum {
 	NETDEV_CMD_BIND_RX,
 	NETDEV_CMD_NAPI_SET,
 	NETDEV_CMD_BIND_TX,
+	NETDEV_CMD_QUEUE_CREATE,
 
 	__NETDEV_CMD_MAX,
 	NETDEV_CMD_MAX = (__NETDEV_CMD_MAX - 1)
diff --git a/net/core/netdev-genl-gen.c b/net/core/netdev-genl-gen.c
index ba673e81716f..52ba99c019e7 100644
--- a/net/core/netdev-genl-gen.c
+++ b/net/core/netdev-genl-gen.c
@@ -28,6 +28,12 @@ static const struct netlink_range_validation netdev_a_napi_defer_hard_irqs_range
 };
 
 /* Common nested types */
+const struct nla_policy netdev_lease_nl_policy[NETDEV_A_LEASE_NETNS_ID + 1] = {
+	[NETDEV_A_LEASE_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1),
+	[NETDEV_A_LEASE_QUEUE] = NLA_POLICY_NESTED(netdev_queue_id_nl_policy),
+	[NETDEV_A_LEASE_NETNS_ID] = { .type = NLA_S32, },
+};
+
 const struct nla_policy netdev_page_pool_info_nl_policy[NETDEV_A_PAGE_POOL_IFINDEX + 1] = {
 	[NETDEV_A_PAGE_POOL_ID] = NLA_POLICY_FULL_RANGE(NLA_UINT, &netdev_a_page_pool_id_range),
 	[NETDEV_A_PAGE_POOL_IFINDEX] = NLA_POLICY_FULL_RANGE(NLA_U32, &netdev_a_page_pool_ifindex_range),
@@ -107,6 +113,13 @@ static const struct nla_policy netdev_bind_tx_nl_policy[NETDEV_A_DMABUF_FD + 1]
 	[NETDEV_A_DMABUF_FD] = { .type = NLA_U32, },
 };
 
+/* NETDEV_CMD_QUEUE_CREATE - do */
+static const struct nla_policy netdev_queue_create_nl_policy[NETDEV_A_QUEUE_LEASE + 1] = {
+	[NETDEV_A_QUEUE_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1),
+	[NETDEV_A_QUEUE_TYPE] = NLA_POLICY_MAX(NLA_U32, 1),
+	[NETDEV_A_QUEUE_LEASE] = NLA_POLICY_NESTED(netdev_lease_nl_policy),
+};
+
 /* Ops table for netdev */
 static const struct genl_split_ops netdev_nl_ops[] = {
 	{
@@ -205,6 +218,13 @@ static const struct genl_split_ops netdev_nl_ops[] = {
 		.maxattr	= NETDEV_A_DMABUF_FD,
 		.flags		= GENL_CMD_CAP_DO,
 	},
+	{
+		.cmd		= NETDEV_CMD_QUEUE_CREATE,
+		.doit		= netdev_nl_queue_create_doit,
+		.policy		= netdev_queue_create_nl_policy,
+		.maxattr	= NETDEV_A_QUEUE_LEASE,
+		.flags		= GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+	},
 };
 
 static const struct genl_multicast_group netdev_nl_mcgrps[] = {
diff --git a/net/core/netdev-genl-gen.h b/net/core/netdev-genl-gen.h
index cffc08517a41..d71b435d72c1 100644
--- a/net/core/netdev-genl-gen.h
+++ b/net/core/netdev-genl-gen.h
@@ -14,6 +14,7 @@
 #include <net/netdev_netlink.h>
 
 /* Common nested types */
+extern const struct nla_policy netdev_lease_nl_policy[NETDEV_A_LEASE_NETNS_ID + 1];
 extern const struct nla_policy netdev_page_pool_info_nl_policy[NETDEV_A_PAGE_POOL_IFINDEX + 1];
 extern const struct nla_policy netdev_queue_id_nl_policy[NETDEV_A_QUEUE_TYPE + 1];
 
@@ -36,6 +37,7 @@ int netdev_nl_qstats_get_dumpit(struct sk_buff *skb,
 int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info);
 int netdev_nl_napi_set_doit(struct sk_buff *skb, struct genl_info *info);
 int netdev_nl_bind_tx_doit(struct sk_buff *skb, struct genl_info *info);
+int netdev_nl_queue_create_doit(struct sk_buff *skb, struct genl_info *info);
 
 enum {
 	NETDEV_NLGRP_MGMT,
diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c
index 470fabbeacd9..aae75431858d 100644
--- a/net/core/netdev-genl.c
+++ b/net/core/netdev-genl.c
@@ -1120,6 +1120,11 @@ err_genlmsg_free:
 	return err;
 }
 
+int netdev_nl_queue_create_doit(struct sk_buff *skb, struct genl_info *info)
+{
+	return -EOPNOTSUPP;
+}
+
 void netdev_nl_sock_priv_init(struct netdev_nl_sock *priv)
 {
 	INIT_LIST_HEAD(&priv->bindings);
diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h
index e0b579a1df4f..7df1056a35fd 100644
--- a/tools/include/uapi/linux/netdev.h
+++ b/tools/include/uapi/linux/netdev.h
@@ -160,6 +160,7 @@ enum {
 	NETDEV_A_QUEUE_DMABUF,
 	NETDEV_A_QUEUE_IO_URING,
 	NETDEV_A_QUEUE_XSK,
+	NETDEV_A_QUEUE_LEASE,
 
 	__NETDEV_A_QUEUE_MAX,
 	NETDEV_A_QUEUE_MAX = (__NETDEV_A_QUEUE_MAX - 1)
@@ -202,6 +203,15 @@ enum {
 	NETDEV_A_QSTATS_MAX = (__NETDEV_A_QSTATS_MAX - 1)
 };
 
+enum {
+	NETDEV_A_LEASE_IFINDEX = 1,
+	NETDEV_A_LEASE_QUEUE,
+	NETDEV_A_LEASE_NETNS_ID,
+
+	__NETDEV_A_LEASE_MAX,
+	NETDEV_A_LEASE_MAX = (__NETDEV_A_LEASE_MAX - 1)
+};
+
 enum {
 	NETDEV_A_DMABUF_IFINDEX = 1,
 	NETDEV_A_DMABUF_QUEUES,
@@ -228,6 +238,7 @@ enum {
 	NETDEV_CMD_BIND_RX,
 	NETDEV_CMD_NAPI_SET,
 	NETDEV_CMD_BIND_TX,
+	NETDEV_CMD_QUEUE_CREATE,
 
 	__NETDEV_CMD_MAX,
 	NETDEV_CMD_MAX = (__NETDEV_CMD_MAX - 1)
-- 
cgit v1.2.3


From 61d99ce3dfc2f1ba5bf713093bb3a5faf5ebc2dc Mon Sep 17 00:00:00 2001
From: David Wei <dw@davidwei.uk>
Date: Thu, 15 Jan 2026 09:26:00 +0100
Subject: selftests/net: Add bpf skb forwarding program

Add nk_forward.bpf.c, a BPF program that forwards skbs matching some IPv6
prefix received on eth0 ifindex to a specified netkit ifindex. This will
be needed by netkit container tests.

Signed-off-by: David Wei <dw@davidwei.uk>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Stanislav Fomichev <sdf@fomichev.me>
Reviewed-by: Nikolay Aleksandrov <razor@blackwall.org>
Link: https://patch.msgid.link/20260115082603.219152-14-daniel@iogearbox.net
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 .../selftests/drivers/net/hw/nk_forward.bpf.c      | 49 ++++++++++++++++++++++
 1 file changed, 49 insertions(+)
 create mode 100644 tools/testing/selftests/drivers/net/hw/nk_forward.bpf.c

(limited to 'tools')

diff --git a/tools/testing/selftests/drivers/net/hw/nk_forward.bpf.c b/tools/testing/selftests/drivers/net/hw/nk_forward.bpf.c
new file mode 100644
index 000000000000..86ebfc1445b6
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/hw/nk_forward.bpf.c
@@ -0,0 +1,49 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/bpf.h>
+#include <linux/pkt_cls.h>
+#include <linux/if_ether.h>
+#include <linux/ipv6.h>
+#include <linux/in6.h>
+#include <bpf/bpf_endian.h>
+#include <bpf/bpf_helpers.h>
+
+#define TC_ACT_OK 0
+#define ETH_P_IPV6 0x86DD
+
+#define ctx_ptr(field)		((void *)(long)(field))
+
+#define v6_p64_equal(a, b)	(a.s6_addr32[0] == b.s6_addr32[0] && \
+				 a.s6_addr32[1] == b.s6_addr32[1])
+
+volatile __u32 netkit_ifindex;
+volatile __u8 ipv6_prefix[16];
+
+SEC("tc/ingress")
+int tc_redirect_peer(struct __sk_buff *skb)
+{
+	void *data_end = ctx_ptr(skb->data_end);
+	void *data = ctx_ptr(skb->data);
+	struct in6_addr *peer_addr;
+	struct ipv6hdr *ip6h;
+	struct ethhdr *eth;
+
+	peer_addr = (struct in6_addr *)ipv6_prefix;
+
+	if (skb->protocol != bpf_htons(ETH_P_IPV6))
+		return TC_ACT_OK;
+
+	eth = data;
+	if ((void *)(eth + 1) > data_end)
+		return TC_ACT_OK;
+
+	ip6h = data + sizeof(struct ethhdr);
+	if ((void *)(ip6h + 1) > data_end)
+		return TC_ACT_OK;
+
+	if (!v6_p64_equal(ip6h->daddr, (*peer_addr)))
+		return TC_ACT_OK;
+
+	return bpf_redirect_peer(netkit_ifindex, 0);
+}
+
+char __license[] SEC("license") = "GPL";
-- 
cgit v1.2.3


From 6be87fbb27763c2999e1c69bbec1f3a63cf05422 Mon Sep 17 00:00:00 2001
From: David Wei <dw@davidwei.uk>
Date: Thu, 15 Jan 2026 09:26:01 +0100
Subject: selftests/net: Add env for container based tests

Add an env NetDrvContEnv for container based selftests. This automates
the setup of a netns, netkit pair with one inside the netns, and a BPF
program that forwards skbs from the NETIF host inside the container.

Currently only netkit is used, but other virtual netdevs e.g. veth can
be used too.

Expect netkit container datapath selftests to have a publicly routable
IP prefix to assign to netkit in a container, such that packets will
land on eth0. The BPF skb forward program will then forward such packets
from the host netns to the container netns.

Signed-off-by: David Wei <dw@davidwei.uk>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Stanislav Fomichev <sdf@fomichev.me>
Reviewed-by: Nikolay Aleksandrov <razor@blackwall.org>
Link: https://patch.msgid.link/20260115082603.219152-15-daniel@iogearbox.net
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 tools/testing/selftests/drivers/net/README.rst     |   7 ++
 .../selftests/drivers/net/hw/lib/py/__init__.py    |   7 +-
 .../selftests/drivers/net/lib/py/__init__.py       |   7 +-
 tools/testing/selftests/drivers/net/lib/py/env.py  | 112 +++++++++++++++++++++
 4 files changed, 127 insertions(+), 6 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/drivers/net/README.rst b/tools/testing/selftests/drivers/net/README.rst
index eb838ae94844..b94e81c2e030 100644
--- a/tools/testing/selftests/drivers/net/README.rst
+++ b/tools/testing/selftests/drivers/net/README.rst
@@ -62,6 +62,13 @@ LOCAL_V4, LOCAL_V6, REMOTE_V4, REMOTE_V6
 
 Local and remote endpoint IP addresses.
 
+LOCAL_PREFIX_V4, LOCAL_PREFIX_V6
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Local IP prefix/subnet which can be used to allocate extra IP addresses (for
+network name spaces behind macvlan, veth, netkit devices). DUT must be
+reachable using these addresses from the endpoint.
+
 REMOTE_TYPE
 ~~~~~~~~~~~
 
diff --git a/tools/testing/selftests/drivers/net/hw/lib/py/__init__.py b/tools/testing/selftests/drivers/net/hw/lib/py/__init__.py
index d5d247eca6b7..022008249313 100644
--- a/tools/testing/selftests/drivers/net/hw/lib/py/__init__.py
+++ b/tools/testing/selftests/drivers/net/hw/lib/py/__init__.py
@@ -3,6 +3,7 @@
 """
 Driver test environment (hardware-only tests).
 NetDrvEnv and NetDrvEpEnv are the main environment classes.
+NetDrvContEnv extends NetDrvEpEnv with netkit container support.
 Former is for local host only tests, latter creates / connects
 to a remote endpoint. See NIPA wiki for more information about
 running and writing driver tests.
@@ -29,7 +30,7 @@ try:
     from net.lib.py import ksft_eq, ksft_ge, ksft_in, ksft_is, ksft_lt, \
         ksft_ne, ksft_not_in, ksft_raises, ksft_true, ksft_gt, ksft_not_none
     from drivers.net.lib.py import GenerateTraffic, Remote, Iperf3Runner
-    from drivers.net.lib.py import NetDrvEnv, NetDrvEpEnv
+    from drivers.net.lib.py import NetDrvEnv, NetDrvEpEnv, NetDrvContEnv
 
     __all__ = ["NetNS", "NetNSEnter", "NetdevSimDev",
                "EthtoolFamily", "NetdevFamily", "NetshaperFamily",
@@ -44,8 +45,8 @@ try:
                "ksft_eq", "ksft_ge", "ksft_in", "ksft_is", "ksft_lt",
                "ksft_ne", "ksft_not_in", "ksft_raises", "ksft_true", "ksft_gt",
                "ksft_not_none", "ksft_not_none",
-               "NetDrvEnv", "NetDrvEpEnv", "GenerateTraffic", "Remote",
-               "Iperf3Runner"]
+               "NetDrvEnv", "NetDrvEpEnv", "NetDrvContEnv", "GenerateTraffic",
+               "Remote", "Iperf3Runner"]
 except ModuleNotFoundError as e:
     print("Failed importing `net` library from kernel sources")
     print(str(e))
diff --git a/tools/testing/selftests/drivers/net/lib/py/__init__.py b/tools/testing/selftests/drivers/net/lib/py/__init__.py
index 8b75faa9af6d..be3a8a936882 100644
--- a/tools/testing/selftests/drivers/net/lib/py/__init__.py
+++ b/tools/testing/selftests/drivers/net/lib/py/__init__.py
@@ -3,6 +3,7 @@
 """
 Driver test environment.
 NetDrvEnv and NetDrvEpEnv are the main environment classes.
+NetDrvContEnv extends NetDrvEpEnv with netkit container support.
 Former is for local host only tests, latter creates / connects
 to a remote endpoint. See NIPA wiki for more information about
 running and writing driver tests.
@@ -43,12 +44,12 @@ try:
                "ksft_ne", "ksft_not_in", "ksft_raises", "ksft_true", "ksft_gt",
                "ksft_not_none", "ksft_not_none"]
 
-    from .env import NetDrvEnv, NetDrvEpEnv
+    from .env import NetDrvEnv, NetDrvEpEnv, NetDrvContEnv
     from .load import GenerateTraffic, Iperf3Runner
     from .remote import Remote
 
-    __all__ += ["NetDrvEnv", "NetDrvEpEnv", "GenerateTraffic", "Remote",
-                "Iperf3Runner"]
+    __all__ += ["NetDrvEnv", "NetDrvEpEnv", "NetDrvContEnv", "GenerateTraffic",
+                "Remote", "Iperf3Runner"]
 except ModuleNotFoundError as e:
     print("Failed importing `net` library from kernel sources")
     print(str(e))
diff --git a/tools/testing/selftests/drivers/net/lib/py/env.py b/tools/testing/selftests/drivers/net/lib/py/env.py
index 41cc248ac848..5b12c4c59e09 100644
--- a/tools/testing/selftests/drivers/net/lib/py/env.py
+++ b/tools/testing/selftests/drivers/net/lib/py/env.py
@@ -1,6 +1,8 @@
 # SPDX-License-Identifier: GPL-2.0
 
+import ipaddress
 import os
+import re
 import time
 from pathlib import Path
 from lib.py import KsftSkipEx, KsftXfailEx
@@ -8,6 +10,7 @@ from lib.py import ksft_setup, wait_file
 from lib.py import cmd, ethtool, ip, CmdExitFailure
 from lib.py import NetNS, NetdevSimDev
 from .remote import Remote
+from . import bpftool
 
 
 class NetDrvEnvBase:
@@ -289,3 +292,112 @@ class NetDrvEpEnv(NetDrvEnvBase):
                 data.get('stats-block-usecs', 0) / 1000 / 1000
 
         time.sleep(self._stats_settle_time)
+
+
+class NetDrvContEnv(NetDrvEpEnv):
+    """
+    Class for an environment with a netkit pair setup for forwarding traffic
+    between the physical interface and a network namespace.
+    """
+
+    def __init__(self, src_path, nk_rxqueues=1, **kwargs):
+        super().__init__(src_path, **kwargs)
+
+        self.require_ipver("6")
+        local_prefix = self.env.get("LOCAL_PREFIX_V6")
+        if not local_prefix:
+            raise KsftSkipEx("LOCAL_PREFIX_V6 required")
+
+        local_prefix = local_prefix.rstrip("/64").rstrip("::").rstrip(":")
+        self.ipv6_prefix = f"{local_prefix}::"
+        self.nk_host_ipv6 = f"{local_prefix}::2:1"
+        self.nk_guest_ipv6 = f"{local_prefix}::2:2"
+
+        self.netns = None
+        self._nk_host_ifname = None
+        self._nk_guest_ifname = None
+        self._tc_attached = False
+        self._bpf_prog_pref = None
+        self._bpf_prog_id = None
+
+        ip(f"link add type netkit mode l2 forward peer forward numrxqueues {nk_rxqueues}")
+
+        all_links = ip("-d link show", json=True)
+        netkit_links = [link for link in all_links
+                        if link.get('linkinfo', {}).get('info_kind') == 'netkit'
+                        and 'UP' not in link.get('flags', [])]
+
+        if len(netkit_links) != 2:
+            raise KsftSkipEx("Failed to create netkit pair")
+
+        netkit_links.sort(key=lambda x: x['ifindex'])
+        self._nk_host_ifname = netkit_links[1]['ifname']
+        self._nk_guest_ifname = netkit_links[0]['ifname']
+        self.nk_host_ifindex = netkit_links[1]['ifindex']
+        self.nk_guest_ifindex = netkit_links[0]['ifindex']
+
+        self._setup_ns()
+        self._attach_bpf()
+
+    def __del__(self):
+        if self._tc_attached:
+            cmd(f"tc filter del dev {self.ifname} ingress pref {self._bpf_prog_pref}")
+            self._tc_attached = False
+
+        if self._nk_host_ifname:
+            cmd(f"ip link del dev {self._nk_host_ifname}")
+            self._nk_host_ifname = None
+            self._nk_guest_ifname = None
+
+        if self.netns:
+            del self.netns
+            self.netns = None
+
+        super().__del__()
+
+    def _setup_ns(self):
+        self.netns = NetNS()
+        ip(f"link set dev {self._nk_guest_ifname} netns {self.netns.name}")
+        ip(f"link set dev {self._nk_host_ifname} up")
+        ip(f"-6 addr add fe80::1/64 dev {self._nk_host_ifname} nodad")
+        ip(f"-6 route add {self.nk_guest_ipv6}/128 via fe80::2 dev {self._nk_host_ifname}")
+
+        ip("link set lo up", ns=self.netns)
+        ip(f"link set dev {self._nk_guest_ifname} up", ns=self.netns)
+        ip(f"-6 addr add fe80::2/64 dev {self._nk_guest_ifname}", ns=self.netns)
+        ip(f"-6 addr add {self.nk_guest_ipv6}/64 dev {self._nk_guest_ifname} nodad", ns=self.netns)
+        ip(f"-6 route add default via fe80::1 dev {self._nk_guest_ifname}", ns=self.netns)
+
+    def _attach_bpf(self):
+        bpf_obj = self.test_dir / "nk_forward.bpf.o"
+        if not bpf_obj.exists():
+            raise KsftSkipEx("BPF prog not found")
+
+        cmd(f"tc filter add dev {self.ifname} ingress bpf obj {bpf_obj} sec tc/ingress direct-action")
+        self._tc_attached = True
+
+        tc_info = cmd(f"tc filter show dev {self.ifname} ingress").stdout
+        match = re.search(r'pref (\d+).*nk_forward\.bpf.*id (\d+)', tc_info)
+        if not match:
+            raise Exception("Failed to get BPF prog ID")
+        self._bpf_prog_pref = int(match.group(1))
+        self._bpf_prog_id = int(match.group(2))
+
+        prog_info = bpftool(f"prog show id {self._bpf_prog_id}", json=True)
+        map_ids = prog_info.get("map_ids", [])
+
+        bss_map_id = None
+        for map_id in map_ids:
+            map_info = bpftool(f"map show id {map_id}", json=True)
+            if map_info.get("name").endswith("bss"):
+                bss_map_id = map_id
+
+        if bss_map_id is None:
+            raise Exception("Failed to find .bss map")
+
+        ipv6_addr = ipaddress.IPv6Address(self.ipv6_prefix)
+        ipv6_bytes = ipv6_addr.packed
+        ifindex_bytes = self.nk_host_ifindex.to_bytes(4, byteorder='little')
+        value = ipv6_bytes + ifindex_bytes
+        value_hex = ' '.join(f'{b:02x}' for b in value)
+        bpftool(f"map update id {bss_map_id} key hex 00 00 00 00 value hex {value_hex}")
-- 
cgit v1.2.3


From ab771c938d9a57d510bb70c565c9388b10494090 Mon Sep 17 00:00:00 2001
From: David Wei <dw@davidwei.uk>
Date: Thu, 15 Jan 2026 09:26:02 +0100
Subject: selftests/net: Make NetDrvContEnv support queue leasing

Add a new parameter `lease` to NetDrvContEnv that sets up queue leasing
in the env.

The NETIF also has some ethtool parameters changed to support memory
provider tests. This is needed in NetDrvContEnv rather than individual
test cases since the cleanup to restore NETIF can't be done, until the
netns in the env is gone.

Signed-off-by: David Wei <dw@davidwei.uk>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Stanislav Fomichev <sdf@fomichev.me>
Reviewed-by: Nikolay Aleksandrov <razor@blackwall.org>
Link: https://patch.msgid.link/20260115082603.219152-16-daniel@iogearbox.net
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 tools/testing/selftests/drivers/net/lib/py/env.py | 47 ++++++++++++++++++++++-
 1 file changed, 46 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/drivers/net/lib/py/env.py b/tools/testing/selftests/drivers/net/lib/py/env.py
index 5b12c4c59e09..7066d78395c6 100644
--- a/tools/testing/selftests/drivers/net/lib/py/env.py
+++ b/tools/testing/selftests/drivers/net/lib/py/env.py
@@ -9,6 +9,7 @@ from lib.py import KsftSkipEx, KsftXfailEx
 from lib.py import ksft_setup, wait_file
 from lib.py import cmd, ethtool, ip, CmdExitFailure
 from lib.py import NetNS, NetdevSimDev
+from lib.py import NetdevFamily, EthtoolFamily
 from .remote import Remote
 from . import bpftool
 
@@ -300,7 +301,7 @@ class NetDrvContEnv(NetDrvEpEnv):
     between the physical interface and a network namespace.
     """
 
-    def __init__(self, src_path, nk_rxqueues=1, **kwargs):
+    def __init__(self, src_path, lease=False, **kwargs):
         super().__init__(src_path, **kwargs)
 
         self.require_ipver("6")
@@ -308,6 +309,9 @@ class NetDrvContEnv(NetDrvEpEnv):
         if not local_prefix:
             raise KsftSkipEx("LOCAL_PREFIX_V6 required")
 
+        self.netdevnl = NetdevFamily()
+        self.ethnl = EthtoolFamily()
+
         local_prefix = local_prefix.rstrip("/64").rstrip("::").rstrip(":")
         self.ipv6_prefix = f"{local_prefix}::"
         self.nk_host_ipv6 = f"{local_prefix}::2:1"
@@ -319,7 +323,11 @@ class NetDrvContEnv(NetDrvEpEnv):
         self._tc_attached = False
         self._bpf_prog_pref = None
         self._bpf_prog_id = None
+        self._leased = False
 
+        nk_rxqueues = 1
+        if lease:
+            nk_rxqueues = 2
         ip(f"link add type netkit mode l2 forward peer forward numrxqueues {nk_rxqueues}")
 
         all_links = ip("-d link show", json=True)
@@ -336,6 +344,9 @@ class NetDrvContEnv(NetDrvEpEnv):
         self.nk_host_ifindex = netkit_links[1]['ifindex']
         self.nk_guest_ifindex = netkit_links[0]['ifindex']
 
+        if lease:
+            self._lease_queues()
+
         self._setup_ns()
         self._attach_bpf()
 
@@ -353,8 +364,42 @@ class NetDrvContEnv(NetDrvEpEnv):
             del self.netns
             self.netns = None
 
+        if self._leased:
+            self.ethnl.rings_set({'header': {'dev-index': self.ifindex},
+                                  'tcp-data-split': 'unknown',
+                                  'hds-thresh': self._hds_thresh,
+                                  'rx': self._rx_rings})
+            self._leased = False
+
         super().__del__()
 
+    def _lease_queues(self):
+        channels = self.ethnl.channels_get({'header': {'dev-index': self.ifindex}})
+        channels = channels['combined-count']
+        if channels < 2:
+            raise KsftSkipEx('Test requires NETIF with at least 2 combined channels')
+
+        rings = self.ethnl.rings_get({'header': {'dev-index': self.ifindex}})
+        self._rx_rings = rings['rx']
+        self._hds_thresh = rings.get('hds-thresh', 0)
+        self.ethnl.rings_set({'header': {'dev-index': self.ifindex},
+                            'tcp-data-split': 'enabled',
+                            'hds-thresh': 0,
+                            'rx': 64})
+        self.src_queue = channels - 1
+        bind_result = self.netdevnl.queue_create(
+            {
+                "ifindex": self.nk_guest_ifindex,
+                "type": "rx",
+                "lease": {
+                    "ifindex": self.ifindex,
+                    "queue": {"id": self.src_queue, "type": "rx"},
+                },
+            }
+        )
+        self.nk_queue = bind_result['id']
+        self._leased = True
+
     def _setup_ns(self):
         self.netns = NetNS()
         ip(f"link set dev {self._nk_guest_ifname} netns {self.netns.name}")
-- 
cgit v1.2.3


From 931420a2fc363817c92990fa14eb1bdec024ce04 Mon Sep 17 00:00:00 2001
From: David Wei <dw@davidwei.uk>
Date: Thu, 15 Jan 2026 09:26:03 +0100
Subject: selftests/net: Add netkit container tests

Add two tests using NetDrvContEnv. One basic test that sets up a netkit
pair, with one end in a netns. Use LOCAL_PREFIX_V6 and nk_forward BPF
program to ping from a remote host to the netkit in netns.

Second is a selftest for netkit queue leasing, using io_uring zero copy
test binary inside of a netns with netkit. This checks that memory
providers can be bound against virtual queues in a netkit within a
netns that are leasing from a physical netdev in the default netns.

Signed-off-by: David Wei <dw@davidwei.uk>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Stanislav Fomichev <sdf@fomichev.me>
Reviewed-by: Nikolay Aleksandrov <razor@blackwall.org>
Link: https://patch.msgid.link/20260115082603.219152-17-daniel@iogearbox.net
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 tools/testing/selftests/drivers/net/hw/Makefile    |  2 +
 tools/testing/selftests/drivers/net/hw/nk_netns.py | 23 +++++++++
 .../testing/selftests/drivers/net/hw/nk_qlease.py  | 55 ++++++++++++++++++++++
 3 files changed, 80 insertions(+)
 create mode 100755 tools/testing/selftests/drivers/net/hw/nk_netns.py
 create mode 100755 tools/testing/selftests/drivers/net/hw/nk_qlease.py

(limited to 'tools')

diff --git a/tools/testing/selftests/drivers/net/hw/Makefile b/tools/testing/selftests/drivers/net/hw/Makefile
index 9c163ba6feee..39ad86d693b3 100644
--- a/tools/testing/selftests/drivers/net/hw/Makefile
+++ b/tools/testing/selftests/drivers/net/hw/Makefile
@@ -32,6 +32,8 @@ TEST_PROGS = \
 	irq.py \
 	loopback.sh \
 	nic_timestamp.py \
+	nk_netns.py \
+	nk_qlease.py \
 	pp_alloc_fail.py \
 	rss_api.py \
 	rss_ctx.py \
diff --git a/tools/testing/selftests/drivers/net/hw/nk_netns.py b/tools/testing/selftests/drivers/net/hw/nk_netns.py
new file mode 100755
index 000000000000..afa8638195d8
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/hw/nk_netns.py
@@ -0,0 +1,23 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: GPL-2.0
+
+from lib.py import ksft_run, ksft_exit
+from lib.py import NetDrvContEnv
+from lib.py import cmd
+
+
+def test_ping(cfg) -> None:
+    cfg.require_ipver("6")
+
+    cmd(f"ping -c 1 -W5 {cfg.nk_guest_ipv6}", host=cfg.remote)
+    cmd(f"ping -c 1 -W5 {cfg.remote_addr_v['6']}", ns=cfg.netns)
+
+
+def main() -> None:
+    with NetDrvContEnv(__file__) as cfg:
+        ksft_run([test_ping], args=(cfg,))
+    ksft_exit()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tools/testing/selftests/drivers/net/hw/nk_qlease.py b/tools/testing/selftests/drivers/net/hw/nk_qlease.py
new file mode 100755
index 000000000000..738a46d2d20c
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/hw/nk_qlease.py
@@ -0,0 +1,55 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: GPL-2.0
+
+import re
+from os import path
+from lib.py import ksft_run, ksft_exit
+from lib.py import NetDrvContEnv
+from lib.py import bkg, cmd, defer, ethtool, rand_port, wait_port_listen
+
+
+def create_rss_ctx(cfg):
+    output = ethtool(f"-X {cfg.ifname} context new start {cfg.src_queue} equal 1").stdout
+    values = re.search(r'New RSS context is (\d+)', output).group(1)
+    return int(values)
+
+
+def set_flow_rule(cfg):
+    output = ethtool(f"-N {cfg.ifname} flow-type tcp6 dst-port {cfg.port} action {cfg.src_queue}").stdout
+    values = re.search(r'ID (\d+)', output).group(1)
+    return int(values)
+
+
+def set_flow_rule_rss(cfg, rss_ctx_id):
+    output = ethtool(f"-N {cfg.ifname} flow-type tcp6 dst-port {cfg.port} context {rss_ctx_id}").stdout
+    values = re.search(r'ID (\d+)', output).group(1)
+    return int(values)
+
+
+def test_iou_zcrx(cfg) -> None:
+    cfg.require_ipver('6')
+
+    ethtool(f"-X {cfg.ifname} equal {cfg.src_queue}")
+    defer(ethtool, f"-X {cfg.ifname} default")
+
+    flow_rule_id = set_flow_rule(cfg)
+    defer(ethtool, f"-N {cfg.ifname} delete {flow_rule_id}")
+
+    rx_cmd = f"ip netns exec {cfg.netns.name} {cfg.bin_local} -s -p {cfg.port} -i {cfg._nk_guest_ifname} -q {cfg.nk_queue}"
+    tx_cmd = f"{cfg.bin_remote} -c -h {cfg.nk_guest_ipv6} -p {cfg.port} -l 12840"
+    with bkg(rx_cmd, exit_wait=True):
+        wait_port_listen(cfg.port, proto="tcp", ns=cfg.netns)
+        cmd(tx_cmd, host=cfg.remote)
+
+
+def main() -> None:
+    with NetDrvContEnv(__file__, lease=True) as cfg:
+        cfg.bin_local = path.abspath(path.dirname(__file__) + "/../../../drivers/net/hw/iou-zcrx")
+        cfg.bin_remote = cfg.remote.deploy(cfg.bin_local)
+        cfg.port = rand_port()
+        ksft_run([test_iou_zcrx], args=(cfg,))
+    ksft_exit()
+
+
+if __name__ == "__main__":
+    main()
-- 
cgit v1.2.3


From 5d54aa40c7b7e9dee5746cca99e9ddbcca13e895 Mon Sep 17 00:00:00 2001
From: Michal Luczaj <mhal@rbox.co>
Date: Fri, 16 Jan 2026 09:52:36 +0100
Subject: vsock/test: Do not filter kallsyms by symbol type

Blamed commit implemented logic to discover available vsock transports by
grepping /proc/kallsyms for known symbols. It incorrectly filtered entries
by type 'd'.

For some kernel configs having

    CONFIG_VIRTIO_VSOCKETS=m
    CONFIG_VSOCKETS_LOOPBACK=y

kallsyms reports

    0000000000000000 d virtio_transport	[vmw_vsock_virtio_transport]
    0000000000000000 t loopback_transport

Overzealous filtering might have affected vsock test suit, resulting in
insufficient/misleading testing.

Do not filter symbols by type. It never helped much.

Fixes: 3070c05b7afd ("vsock/test: Introduce get_transports()")
Signed-off-by: Michal Luczaj <mhal@rbox.co>
Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
Link: https://patch.msgid.link/20260116-vsock_test-kallsyms-grep-v1-1-3320bc3346f2@rbox.co
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 tools/testing/vsock/util.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/vsock/util.h b/tools/testing/vsock/util.h
index 142c02a6834a..bf633cde82b0 100644
--- a/tools/testing/vsock/util.h
+++ b/tools/testing/vsock/util.h
@@ -25,7 +25,7 @@ enum transport {
 };
 
 static const char * const transport_ksyms[] = {
-	#define x(name, symbol) "d " symbol "_transport",
+	#define x(name, symbol) " " symbol "_transport",
 	KNOWN_TRANSPORTS(x)
 	#undef x
 };
-- 
cgit v1.2.3


From 92d65d9c31621befe0a5f7c0bd43bd217613c6b6 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Fri, 16 Jan 2026 21:28:27 -0800
Subject: perf symbol-elf: Fix leak of ELF files with GNU debugdata
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The processing of DSO_BINARY_TYPE__GNU_DEBUGDATA in symsrc__init happens
with an open ELF file but the error path only closes the associate fd.

Fix the goto so that the ELF file is also ended and memory released.

Fixes: b10f74308e130527 ("perf symbol: Support .gnu_debugdata for symbols")
Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Aditya Bodkhe <aditya.b1@linux.ibm.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Alexandre Ghiti <alex@ghiti.fr>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Athira Rajeev <atrajeev@linux.ibm.com>
Cc: Chun-Tse Shao <ctshao@google.com>
Cc: Dmitriy Vyukov <dvyukov@google.com>
Cc: Dr. David Alan Gilbert <linux@treblig.org>
Cc: Guo Ren <guoren@kernel.org>
Cc: Haibo Xu <haibo1.xu@intel.com>
Cc: Howard Chu <howardchu95@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Krzysztof Łopatowski <krzysztof.m.lopatowski@gmail.com>
Cc: Leo Yan <leo.yan@linux.dev>
Cc: Mark Wielaard <mark@klomp.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <pjw@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sergei Trofimovich <slyich@gmail.com>
Cc: Shimin Guo <shimin.guo@skydio.com>
Cc: Stephen Brennan <stephen.s.brennan@oracle.com>
Cc: Thomas Falcon <thomas.falcon@intel.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/symbol-elf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/perf/util/symbol-elf.c b/tools/perf/util/symbol-elf.c
index b8fea12997a0..76912c62b6a0 100644
--- a/tools/perf/util/symbol-elf.c
+++ b/tools/perf/util/symbol-elf.c
@@ -1173,7 +1173,7 @@ int symsrc__init(struct symsrc *ss, struct dso *dso, const char *name,
 		Elf *embedded = read_gnu_debugdata(dso, elf, name, &new_fd);
 
 		if (!embedded)
-			goto out_close;
+			goto out_elf_end;
 
 		elf_end(elf);
 		close(fd);
-- 
cgit v1.2.3


From e99d544c7f3691eb321c88fdbadf04b777c114c4 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Fri, 16 Jan 2026 21:28:28 -0800
Subject: perf dso: Extra validity checks that e_machine is valid
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Better ensure a read e_machine is valid by checking the file appears
like an ELF file and the read e_machine value is less than EM_NUM.

This better avoids spurious e_machine values when looking for an
e_machine in say a thread.

Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Aditya Bodkhe <aditya.b1@linux.ibm.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Alexandre Ghiti <alex@ghiti.fr>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Athira Rajeev <atrajeev@linux.ibm.com>
Cc: Chun-Tse Shao <ctshao@google.com>
Cc: Dmitriy Vyukov <dvyukov@google.com>
Cc: Dr. David Alan Gilbert <linux@treblig.org>
Cc: Guo Ren <guoren@kernel.org>
Cc: Haibo Xu <haibo1.xu@intel.com>
Cc: Howard Chu <howardchu95@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Krzysztof Łopatowski <krzysztof.m.lopatowski@gmail.com>
Cc: Leo Yan <leo.yan@linux.dev>
Cc: Mark Wielaard <mark@klomp.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <pjw@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sergei Trofimovich <slyich@gmail.com>
Cc: Shimin Guo <shimin.guo@skydio.com>
Cc: Stephen Brennan <stephen.s.brennan@oracle.com>
Cc: Thomas Falcon <thomas.falcon@intel.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/dso.c | 31 +++++++++++++++++++++----------
 1 file changed, 21 insertions(+), 10 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/dso.c b/tools/perf/util/dso.c
index 18e656712f5a..143720d1ecb1 100644
--- a/tools/perf/util/dso.c
+++ b/tools/perf/util/dso.c
@@ -1236,17 +1236,28 @@ uint16_t dso__e_machine(struct dso *dso, struct machine *machine)
 	try_to_open_dso(dso, machine);
 	fd = dso__data(dso)->fd;
 	if (fd >= 0) {
-		_Static_assert(offsetof(Elf32_Ehdr, e_machine) == 18, "Unexpected offset");
-		_Static_assert(offsetof(Elf64_Ehdr, e_machine) == 18, "Unexpected offset");
-		if (dso__needs_swap(dso) == DSO_SWAP__UNSET) {
-			unsigned char eidata;
-
-			if (pread(fd, &eidata, sizeof(eidata), EI_DATA) == sizeof(eidata))
-				dso__swap_init(dso, eidata);
+		unsigned char e_ident[EI_NIDENT];
+
+		_Static_assert(offsetof(Elf32_Ehdr, e_ident) == 0, "Unexpected offset");
+		_Static_assert(offsetof(Elf64_Ehdr, e_ident) == 0, "Unexpected offset");
+		if (pread(fd, &e_ident, sizeof(e_ident), 0) == sizeof(e_ident) &&
+		    memcmp(e_ident, ELFMAG, SELFMAG) == 0 &&
+		    e_ident[EI_CLASS] > ELFCLASSNONE && e_ident[EI_CLASS] < ELFCLASSNUM &&
+		    e_ident[EI_DATA] > ELFDATANONE && e_ident[EI_DATA] < ELFDATANUM &&
+		    e_ident[EI_VERSION] == EV_CURRENT) {
+			_Static_assert(offsetof(Elf32_Ehdr, e_machine) == 18, "Unexpected offset");
+			_Static_assert(offsetof(Elf64_Ehdr, e_machine) == 18, "Unexpected offset");
+
+			if (dso__needs_swap(dso) == DSO_SWAP__UNSET)
+				dso__swap_init(dso, e_ident[EI_DATA]);
+
+			if (dso__needs_swap(dso) != DSO_SWAP__UNSET &&
+			    pread(fd, &e_machine, sizeof(e_machine), 18) == sizeof(e_machine) &&
+			    e_machine < EM_NUM)
+				e_machine = DSO__SWAP(dso, uint16_t, e_machine);
+			else
+				e_machine = EM_NONE;
 		}
-		if (dso__needs_swap(dso) != DSO_SWAP__UNSET &&
-		    pread(fd, &e_machine, sizeof(e_machine), 18) == sizeof(e_machine))
-			e_machine = DSO__SWAP(dso, uint16_t, e_machine);
 	}
 	mutex_unlock(dso__data_open_lock());
 	return e_machine;
-- 
cgit v1.2.3


From 86f3801208ed1632ddd75a8e95ade5e433567be1 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Fri, 16 Jan 2026 21:28:29 -0800
Subject: perf record: Disable inline frames when marking build IDs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Marking DSOs doesn't need inline frames traversing as the inline
frames are all part of the same DSO. Disable to improve performance
and also to avoid potential issues with dwarf information.

Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Aditya Bodkhe <aditya.b1@linux.ibm.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Alexandre Ghiti <alex@ghiti.fr>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Athira Rajeev <atrajeev@linux.ibm.com>
Cc: Chun-Tse Shao <ctshao@google.com>
Cc: Dmitriy Vyukov <dvyukov@google.com>
Cc: Dr. David Alan Gilbert <linux@treblig.org>
Cc: Guo Ren <guoren@kernel.org>
Cc: Haibo Xu <haibo1.xu@intel.com>
Cc: Howard Chu <howardchu95@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Krzysztof Łopatowski <krzysztof.m.lopatowski@gmail.com>
Cc: Leo Yan <leo.yan@linux.dev>
Cc: Mark Wielaard <mark@klomp.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <pjw@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sergei Trofimovich <slyich@gmail.com>
Cc: Shimin Guo <shimin.guo@skydio.com>
Cc: Stephen Brennan <stephen.s.brennan@oracle.com>
Cc: Thomas Falcon <thomas.falcon@intel.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-record.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 003e47a4fc1d..663ca3a03396 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -1509,6 +1509,8 @@ static int process_buildids(struct record *rec)
 	if (perf_data__size(&rec->data) == 0)
 		return 0;
 
+	/* A single DSO is needed and not all inline frames. */
+	symbol_conf.inline_name = false;
 	/*
 	 * During this process, it'll load kernel map and replace the
 	 * dso->long_name to a real pathname it found.  In this case
@@ -1519,7 +1521,6 @@ static int process_buildids(struct record *rec)
 	 *   $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
 	 */
 	symbol_conf.ignore_vmlinux_buildid = true;
-
 	/*
 	 * If --buildid-all is given, it marks all DSO regardless of hits,
 	 * so no need to process samples. But if timestamp_boundary is enabled,
-- 
cgit v1.2.3


From e62fae9d9e85d38cdda1ee08a424e1b5b8246620 Mon Sep 17 00:00:00 2001
From: Shimin Guo <shimin.guo@skydio.com>
Date: Fri, 16 Jan 2026 21:28:30 -0800
Subject: perf unwind-libdw: Fix a cross-arch unwinding bug
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The set_initial_registers field of Dwfl_Thread_Callbacks needs to be set
according to the arch of the stack samples being analyzed, not the arch
that perf itself is built for.

Currently perf fails to unwind stack samples collected from archs
different from that of the host perf is running on.

This patch moves the arch-specific implementations of set_initial_registers
from tools/perf/arch to tools/perf/utli/unwind-libdw-arch, similar to the
way the perf-regs-arch folder contains arch-specific functions related to
registers, and chooses the implementation based on the arch of the data
being processed.

Reviewed-by: Ian Rogers <irogers@google.com>
Signed-off-by: Shimin Guo <shimin.guo@skydio.com>
Acked-by: Namhyung Kim <namhyung@kernel.org>
Cc: Aditya Bodkhe <aditya.b1@linux.ibm.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Alexandre Ghiti <alex@ghiti.fr>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Athira Rajeev <atrajeev@linux.ibm.com>
Cc: Chun-Tse Shao <ctshao@google.com>
Cc: Dmitriy Vyukov <dvyukov@google.com>
Cc: Dr. David Alan Gilbert <linux@treblig.org>
Cc: Guo Ren <guoren@kernel.org>
Cc: Haibo Xu <haibo1.xu@intel.com>
Cc: Howard Chu <howardchu95@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Krzysztof Łopatowski <krzysztof.m.lopatowski@gmail.com>
Cc: Leo Yan <leo.yan@linux.dev>
Cc: Mark Wielaard <mark@klomp.org>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <pjw@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sergei Trofimovich <slyich@gmail.com>
Cc: Stephen Brennan <stephen.s.brennan@oracle.com>
Cc: Thomas Falcon <thomas.falcon@intel.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Ian Rogers <irogers@google.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/arch/arm/util/Build                     |  1 -
 tools/perf/arch/arm/util/unwind-libdw.c            | 39 -----------
 tools/perf/arch/arm64/util/Build                   |  1 -
 tools/perf/arch/arm64/util/unwind-libdw.c          | 61 -----------------
 tools/perf/arch/csky/util/Build                    |  2 -
 tools/perf/arch/csky/util/unwind-libdw.c           | 78 ----------------------
 tools/perf/arch/loongarch/util/unwind-libdw.c      | 57 ----------------
 tools/perf/arch/powerpc/util/Build                 |  1 -
 tools/perf/arch/powerpc/util/unwind-libdw.c        | 76 ---------------------
 tools/perf/arch/riscv/util/Build                   |  1 -
 tools/perf/arch/riscv/util/unwind-libdw.c          | 58 ----------------
 tools/perf/arch/s390/util/Build                    |  2 -
 tools/perf/arch/s390/util/unwind-libdw.c           | 65 ------------------
 tools/perf/arch/x86/util/Build                     |  1 -
 tools/perf/arch/x86/util/unwind-libdw.c            | 54 ---------------
 tools/perf/util/Build                              |  1 +
 tools/perf/util/unwind-libdw-arch/Build            |  8 +++
 .../perf/util/unwind-libdw-arch/unwind-libdw-arm.c | 39 +++++++++++
 .../util/unwind-libdw-arch/unwind-libdw-arm64.c    | 61 +++++++++++++++++
 .../util/unwind-libdw-arch/unwind-libdw-csky.c     | 78 ++++++++++++++++++++++
 .../unwind-libdw-arch/unwind-libdw-loongarch.c     | 57 ++++++++++++++++
 .../util/unwind-libdw-arch/unwind-libdw-powerpc.c  | 76 +++++++++++++++++++++
 .../util/unwind-libdw-arch/unwind-libdw-riscv.c    | 58 ++++++++++++++++
 .../util/unwind-libdw-arch/unwind-libdw-s390.c     | 65 ++++++++++++++++++
 .../perf/util/unwind-libdw-arch/unwind-libdw-x86.c | 54 +++++++++++++++
 tools/perf/util/unwind-libdw.c                     | 51 ++++++++++++--
 tools/perf/util/unwind-libdw.h                     | 10 ++-
 27 files changed, 551 insertions(+), 504 deletions(-)
 delete mode 100644 tools/perf/arch/arm/util/unwind-libdw.c
 delete mode 100644 tools/perf/arch/arm64/util/unwind-libdw.c
 delete mode 100644 tools/perf/arch/csky/util/unwind-libdw.c
 delete mode 100644 tools/perf/arch/loongarch/util/unwind-libdw.c
 delete mode 100644 tools/perf/arch/powerpc/util/unwind-libdw.c
 delete mode 100644 tools/perf/arch/riscv/util/unwind-libdw.c
 delete mode 100644 tools/perf/arch/s390/util/unwind-libdw.c
 delete mode 100644 tools/perf/arch/x86/util/unwind-libdw.c
 create mode 100644 tools/perf/util/unwind-libdw-arch/Build
 create mode 100644 tools/perf/util/unwind-libdw-arch/unwind-libdw-arm.c
 create mode 100644 tools/perf/util/unwind-libdw-arch/unwind-libdw-arm64.c
 create mode 100644 tools/perf/util/unwind-libdw-arch/unwind-libdw-csky.c
 create mode 100644 tools/perf/util/unwind-libdw-arch/unwind-libdw-loongarch.c
 create mode 100644 tools/perf/util/unwind-libdw-arch/unwind-libdw-powerpc.c
 create mode 100644 tools/perf/util/unwind-libdw-arch/unwind-libdw-riscv.c
 create mode 100644 tools/perf/util/unwind-libdw-arch/unwind-libdw-s390.c
 create mode 100644 tools/perf/util/unwind-libdw-arch/unwind-libdw-x86.c

(limited to 'tools')

diff --git a/tools/perf/arch/arm/util/Build b/tools/perf/arch/arm/util/Build
index fd695e1fdaee..3291f893b943 100644
--- a/tools/perf/arch/arm/util/Build
+++ b/tools/perf/arch/arm/util/Build
@@ -1,6 +1,5 @@
 perf-util-y += perf_regs.o
 
 perf-util-$(CONFIG_LOCAL_LIBUNWIND)    += unwind-libunwind.o
-perf-util-$(CONFIG_LIBDW_DWARF_UNWIND) += unwind-libdw.o
 
 perf-util-y += pmu.o auxtrace.o cs-etm.o
diff --git a/tools/perf/arch/arm/util/unwind-libdw.c b/tools/perf/arch/arm/util/unwind-libdw.c
deleted file mode 100644
index fbb643f224ec..000000000000
--- a/tools/perf/arch/arm/util/unwind-libdw.c
+++ /dev/null
@@ -1,39 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <elfutils/libdwfl.h>
-#include "perf_regs.h"
-#include "../../../util/unwind-libdw.h"
-#include "../../../util/perf_regs.h"
-#include "../../../util/sample.h"
-
-bool libdw__arch_set_initial_registers(Dwfl_Thread *thread, void *arg)
-{
-	struct unwind_info *ui = arg;
-	struct regs_dump *user_regs = perf_sample__user_regs(ui->sample);
-	Dwarf_Word dwarf_regs[PERF_REG_ARM_MAX];
-
-#define REG(r) ({						\
-	Dwarf_Word val = 0;					\
-	perf_reg_value(&val, user_regs, PERF_REG_ARM_##r);	\
-	val;							\
-})
-
-	dwarf_regs[0]  = REG(R0);
-	dwarf_regs[1]  = REG(R1);
-	dwarf_regs[2]  = REG(R2);
-	dwarf_regs[3]  = REG(R3);
-	dwarf_regs[4]  = REG(R4);
-	dwarf_regs[5]  = REG(R5);
-	dwarf_regs[6]  = REG(R6);
-	dwarf_regs[7]  = REG(R7);
-	dwarf_regs[8]  = REG(R8);
-	dwarf_regs[9]  = REG(R9);
-	dwarf_regs[10] = REG(R10);
-	dwarf_regs[11] = REG(FP);
-	dwarf_regs[12] = REG(IP);
-	dwarf_regs[13] = REG(SP);
-	dwarf_regs[14] = REG(LR);
-	dwarf_regs[15] = REG(PC);
-
-	return dwfl_thread_state_registers(thread, 0, PERF_REG_ARM_MAX,
-					   dwarf_regs);
-}
diff --git a/tools/perf/arch/arm64/util/Build b/tools/perf/arch/arm64/util/Build
index d63881081d2e..0177af19cc00 100644
--- a/tools/perf/arch/arm64/util/Build
+++ b/tools/perf/arch/arm64/util/Build
@@ -1,4 +1,3 @@
-perf-util-$(CONFIG_LIBDW_DWARF_UNWIND) += unwind-libdw.o
 perf-util-$(CONFIG_LIBTRACEEVENT) += kvm-stat.o
 perf-util-$(CONFIG_LOCAL_LIBUNWIND) += unwind-libunwind.o
 perf-util-y += ../../arm/util/auxtrace.o
diff --git a/tools/perf/arch/arm64/util/unwind-libdw.c b/tools/perf/arch/arm64/util/unwind-libdw.c
deleted file mode 100644
index b89b0a7e5ad9..000000000000
--- a/tools/perf/arch/arm64/util/unwind-libdw.c
+++ /dev/null
@@ -1,61 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <elfutils/libdwfl.h>
-#include "perf_regs.h"
-#include "../../../util/unwind-libdw.h"
-#include "../../../util/perf_regs.h"
-#include "../../../util/sample.h"
-
-bool libdw__arch_set_initial_registers(Dwfl_Thread *thread, void *arg)
-{
-	struct unwind_info *ui = arg;
-	struct regs_dump *user_regs = perf_sample__user_regs(ui->sample);
-	Dwarf_Word dwarf_regs[PERF_REG_ARM64_MAX], dwarf_pc;
-
-#define REG(r) ({						\
-	Dwarf_Word val = 0;					\
-	perf_reg_value(&val, user_regs, PERF_REG_ARM64_##r);	\
-	val;							\
-})
-
-	dwarf_regs[0]  = REG(X0);
-	dwarf_regs[1]  = REG(X1);
-	dwarf_regs[2]  = REG(X2);
-	dwarf_regs[3]  = REG(X3);
-	dwarf_regs[4]  = REG(X4);
-	dwarf_regs[5]  = REG(X5);
-	dwarf_regs[6]  = REG(X6);
-	dwarf_regs[7]  = REG(X7);
-	dwarf_regs[8]  = REG(X8);
-	dwarf_regs[9]  = REG(X9);
-	dwarf_regs[10] = REG(X10);
-	dwarf_regs[11] = REG(X11);
-	dwarf_regs[12] = REG(X12);
-	dwarf_regs[13] = REG(X13);
-	dwarf_regs[14] = REG(X14);
-	dwarf_regs[15] = REG(X15);
-	dwarf_regs[16] = REG(X16);
-	dwarf_regs[17] = REG(X17);
-	dwarf_regs[18] = REG(X18);
-	dwarf_regs[19] = REG(X19);
-	dwarf_regs[20] = REG(X20);
-	dwarf_regs[21] = REG(X21);
-	dwarf_regs[22] = REG(X22);
-	dwarf_regs[23] = REG(X23);
-	dwarf_regs[24] = REG(X24);
-	dwarf_regs[25] = REG(X25);
-	dwarf_regs[26] = REG(X26);
-	dwarf_regs[27] = REG(X27);
-	dwarf_regs[28] = REG(X28);
-	dwarf_regs[29] = REG(X29);
-	dwarf_regs[30] = REG(LR);
-	dwarf_regs[31] = REG(SP);
-
-	if (!dwfl_thread_state_registers(thread, 0, PERF_REG_ARM64_MAX,
-					 dwarf_regs))
-		return false;
-
-	dwarf_pc = REG(PC);
-	dwfl_thread_state_register_pc(thread, dwarf_pc);
-
-	return true;
-}
diff --git a/tools/perf/arch/csky/util/Build b/tools/perf/arch/csky/util/Build
index 5e6ea82c4202..6b2d0e021b11 100644
--- a/tools/perf/arch/csky/util/Build
+++ b/tools/perf/arch/csky/util/Build
@@ -1,3 +1 @@
 perf-util-y += perf_regs.o
-
-perf-util-$(CONFIG_LIBDW_DWARF_UNWIND) += unwind-libdw.o
diff --git a/tools/perf/arch/csky/util/unwind-libdw.c b/tools/perf/arch/csky/util/unwind-libdw.c
deleted file mode 100644
index b20b1569783d..000000000000
--- a/tools/perf/arch/csky/util/unwind-libdw.c
+++ /dev/null
@@ -1,78 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-// Copyright (C) 2019 Hangzhou C-SKY Microsystems co.,ltd.
-
-#include <elfutils/libdwfl.h>
-#include "perf_regs.h"
-#include "../../util/unwind-libdw.h"
-#include "../../util/perf_regs.h"
-#include "../../util/event.h"
-
-bool libdw__arch_set_initial_registers(Dwfl_Thread *thread, void *arg)
-{
-	struct unwind_info *ui = arg;
-	struct regs_dump *user_regs = perf_sample__user_regs(ui->sample);
-	Dwarf_Word dwarf_regs[PERF_REG_CSKY_MAX];
-
-#define REG(r) ({						\
-	Dwarf_Word val = 0;					\
-	perf_reg_value(&val, user_regs, PERF_REG_CSKY_##r);	\
-	val;							\
-})
-
-#if defined(__CSKYABIV2__)
-	dwarf_regs[0]  = REG(A0);
-	dwarf_regs[1]  = REG(A1);
-	dwarf_regs[2]  = REG(A2);
-	dwarf_regs[3]  = REG(A3);
-	dwarf_regs[4]  = REG(REGS0);
-	dwarf_regs[5]  = REG(REGS1);
-	dwarf_regs[6]  = REG(REGS2);
-	dwarf_regs[7]  = REG(REGS3);
-	dwarf_regs[8]  = REG(REGS4);
-	dwarf_regs[9]  = REG(REGS5);
-	dwarf_regs[10] = REG(REGS6);
-	dwarf_regs[11] = REG(REGS7);
-	dwarf_regs[12] = REG(REGS8);
-	dwarf_regs[13] = REG(REGS9);
-	dwarf_regs[14] = REG(SP);
-	dwarf_regs[15] = REG(LR);
-	dwarf_regs[16] = REG(EXREGS0);
-	dwarf_regs[17] = REG(EXREGS1);
-	dwarf_regs[18] = REG(EXREGS2);
-	dwarf_regs[19] = REG(EXREGS3);
-	dwarf_regs[20] = REG(EXREGS4);
-	dwarf_regs[21] = REG(EXREGS5);
-	dwarf_regs[22] = REG(EXREGS6);
-	dwarf_regs[23] = REG(EXREGS7);
-	dwarf_regs[24] = REG(EXREGS8);
-	dwarf_regs[25] = REG(EXREGS9);
-	dwarf_regs[26] = REG(EXREGS10);
-	dwarf_regs[27] = REG(EXREGS11);
-	dwarf_regs[28] = REG(EXREGS12);
-	dwarf_regs[29] = REG(EXREGS13);
-	dwarf_regs[30] = REG(EXREGS14);
-	dwarf_regs[31] = REG(TLS);
-	dwarf_regs[32] = REG(PC);
-#else
-	dwarf_regs[0]  = REG(SP);
-	dwarf_regs[1]  = REG(REGS9);
-	dwarf_regs[2]  = REG(A0);
-	dwarf_regs[3]  = REG(A1);
-	dwarf_regs[4]  = REG(A2);
-	dwarf_regs[5]  = REG(A3);
-	dwarf_regs[6]  = REG(REGS0);
-	dwarf_regs[7]  = REG(REGS1);
-	dwarf_regs[8]  = REG(REGS2);
-	dwarf_regs[9]  = REG(REGS3);
-	dwarf_regs[10] = REG(REGS4);
-	dwarf_regs[11] = REG(REGS5);
-	dwarf_regs[12] = REG(REGS6);
-	dwarf_regs[13] = REG(REGS7);
-	dwarf_regs[14] = REG(REGS8);
-	dwarf_regs[15] = REG(LR);
-#endif
-	dwfl_thread_state_register_pc(thread, REG(PC));
-
-	return dwfl_thread_state_registers(thread, 0, PERF_REG_CSKY_MAX,
-					   dwarf_regs);
-}
diff --git a/tools/perf/arch/loongarch/util/unwind-libdw.c b/tools/perf/arch/loongarch/util/unwind-libdw.c
deleted file mode 100644
index 60b1144bedd5..000000000000
--- a/tools/perf/arch/loongarch/util/unwind-libdw.c
+++ /dev/null
@@ -1,57 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2020-2023 Loongson Technology Corporation Limited */
-
-#include <elfutils/libdwfl.h>
-#include "perf_regs.h"
-#include "../../util/unwind-libdw.h"
-#include "../../util/perf_regs.h"
-#include "../../util/sample.h"
-
-bool libdw__arch_set_initial_registers(Dwfl_Thread *thread, void *arg)
-{
-	struct unwind_info *ui = arg;
-	struct regs_dump *user_regs = perf_sample__user_regs(ui->sample);
-	Dwarf_Word dwarf_regs[PERF_REG_LOONGARCH_MAX];
-
-#define REG(r) ({							\
-	Dwarf_Word val = 0;						\
-	perf_reg_value(&val, user_regs, PERF_REG_LOONGARCH_##r);	\
-	val;								\
-})
-
-	dwarf_regs[0]  = 0;
-	dwarf_regs[1]  = REG(R1);
-	dwarf_regs[2]  = REG(R2);
-	dwarf_regs[3]  = REG(R3);
-	dwarf_regs[4]  = REG(R4);
-	dwarf_regs[5]  = REG(R5);
-	dwarf_regs[6]  = REG(R6);
-	dwarf_regs[7]  = REG(R7);
-	dwarf_regs[8]  = REG(R8);
-	dwarf_regs[9]  = REG(R9);
-	dwarf_regs[10] = REG(R10);
-	dwarf_regs[11] = REG(R11);
-	dwarf_regs[12] = REG(R12);
-	dwarf_regs[13] = REG(R13);
-	dwarf_regs[14] = REG(R14);
-	dwarf_regs[15] = REG(R15);
-	dwarf_regs[16] = REG(R16);
-	dwarf_regs[17] = REG(R17);
-	dwarf_regs[18] = REG(R18);
-	dwarf_regs[19] = REG(R19);
-	dwarf_regs[20] = REG(R20);
-	dwarf_regs[21] = REG(R21);
-	dwarf_regs[22] = REG(R22);
-	dwarf_regs[23] = REG(R23);
-	dwarf_regs[24] = REG(R24);
-	dwarf_regs[25] = REG(R25);
-	dwarf_regs[26] = REG(R26);
-	dwarf_regs[27] = REG(R27);
-	dwarf_regs[28] = REG(R28);
-	dwarf_regs[29] = REG(R29);
-	dwarf_regs[30] = REG(R30);
-	dwarf_regs[31] = REG(R31);
-	dwfl_thread_state_register_pc(thread, REG(PC));
-
-	return dwfl_thread_state_registers(thread, 0, PERF_REG_LOONGARCH_MAX, dwarf_regs);
-}
diff --git a/tools/perf/arch/powerpc/util/Build b/tools/perf/arch/powerpc/util/Build
index 3d0d5427aef7..5fd28ec713a4 100644
--- a/tools/perf/arch/powerpc/util/Build
+++ b/tools/perf/arch/powerpc/util/Build
@@ -9,5 +9,4 @@ perf-util-y += evsel.o
 perf-util-$(CONFIG_LIBDW) += skip-callchain-idx.o
 
 perf-util-$(CONFIG_LIBUNWIND) += unwind-libunwind.o
-perf-util-$(CONFIG_LIBDW_DWARF_UNWIND) += unwind-libdw.o
 perf-util-y += auxtrace.o
diff --git a/tools/perf/arch/powerpc/util/unwind-libdw.c b/tools/perf/arch/powerpc/util/unwind-libdw.c
deleted file mode 100644
index 82d0c28ae345..000000000000
--- a/tools/perf/arch/powerpc/util/unwind-libdw.c
+++ /dev/null
@@ -1,76 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <elfutils/libdwfl.h>
-#include <linux/kernel.h>
-#include "perf_regs.h"
-#include "../../../util/unwind-libdw.h"
-#include "../../../util/perf_regs.h"
-#include "../../../util/sample.h"
-
-/* See backends/ppc_initreg.c and backends/ppc_regs.c in elfutils.  */
-static const int special_regs[3][2] = {
-	{ 65, PERF_REG_POWERPC_LINK },
-	{ 101, PERF_REG_POWERPC_XER },
-	{ 109, PERF_REG_POWERPC_CTR },
-};
-
-bool libdw__arch_set_initial_registers(Dwfl_Thread *thread, void *arg)
-{
-	struct unwind_info *ui = arg;
-	struct regs_dump *user_regs = perf_sample__user_regs(ui->sample);
-	Dwarf_Word dwarf_regs[32], dwarf_nip;
-	size_t i;
-
-#define REG(r) ({						\
-	Dwarf_Word val = 0;					\
-	perf_reg_value(&val, user_regs, PERF_REG_POWERPC_##r);	\
-	val;							\
-})
-
-	dwarf_regs[0]  = REG(R0);
-	dwarf_regs[1]  = REG(R1);
-	dwarf_regs[2]  = REG(R2);
-	dwarf_regs[3]  = REG(R3);
-	dwarf_regs[4]  = REG(R4);
-	dwarf_regs[5]  = REG(R5);
-	dwarf_regs[6]  = REG(R6);
-	dwarf_regs[7]  = REG(R7);
-	dwarf_regs[8]  = REG(R8);
-	dwarf_regs[9]  = REG(R9);
-	dwarf_regs[10] = REG(R10);
-	dwarf_regs[11] = REG(R11);
-	dwarf_regs[12] = REG(R12);
-	dwarf_regs[13] = REG(R13);
-	dwarf_regs[14] = REG(R14);
-	dwarf_regs[15] = REG(R15);
-	dwarf_regs[16] = REG(R16);
-	dwarf_regs[17] = REG(R17);
-	dwarf_regs[18] = REG(R18);
-	dwarf_regs[19] = REG(R19);
-	dwarf_regs[20] = REG(R20);
-	dwarf_regs[21] = REG(R21);
-	dwarf_regs[22] = REG(R22);
-	dwarf_regs[23] = REG(R23);
-	dwarf_regs[24] = REG(R24);
-	dwarf_regs[25] = REG(R25);
-	dwarf_regs[26] = REG(R26);
-	dwarf_regs[27] = REG(R27);
-	dwarf_regs[28] = REG(R28);
-	dwarf_regs[29] = REG(R29);
-	dwarf_regs[30] = REG(R30);
-	dwarf_regs[31] = REG(R31);
-	if (!dwfl_thread_state_registers(thread, 0, 32, dwarf_regs))
-		return false;
-
-	dwarf_nip = REG(NIP);
-	dwfl_thread_state_register_pc(thread, dwarf_nip);
-	for (i = 0; i < ARRAY_SIZE(special_regs); i++) {
-		Dwarf_Word val = 0;
-		perf_reg_value(&val, user_regs, special_regs[i][1]);
-		if (!dwfl_thread_state_registers(thread,
-						 special_regs[i][0], 1,
-						 &val))
-			return false;
-	}
-
-	return true;
-}
diff --git a/tools/perf/arch/riscv/util/Build b/tools/perf/arch/riscv/util/Build
index 58a672246024..628b9ebd418b 100644
--- a/tools/perf/arch/riscv/util/Build
+++ b/tools/perf/arch/riscv/util/Build
@@ -2,4 +2,3 @@ perf-util-y += perf_regs.o
 perf-util-y += header.o
 
 perf-util-$(CONFIG_LIBTRACEEVENT) += kvm-stat.o
-perf-util-$(CONFIG_LIBDW_DWARF_UNWIND) += unwind-libdw.o
diff --git a/tools/perf/arch/riscv/util/unwind-libdw.c b/tools/perf/arch/riscv/util/unwind-libdw.c
deleted file mode 100644
index dc1476e16321..000000000000
--- a/tools/perf/arch/riscv/util/unwind-libdw.c
+++ /dev/null
@@ -1,58 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2019 Hangzhou C-SKY Microsystems co.,ltd. */
-
-#include <elfutils/libdwfl.h>
-#include "perf_regs.h"
-#include "../../util/unwind-libdw.h"
-#include "../../util/perf_regs.h"
-#include "../../util/sample.h"
-
-bool libdw__arch_set_initial_registers(Dwfl_Thread *thread, void *arg)
-{
-	struct unwind_info *ui = arg;
-	struct regs_dump *user_regs = perf_sample__user_regs(ui->sample);
-	Dwarf_Word dwarf_regs[32];
-
-#define REG(r) ({						\
-	Dwarf_Word val = 0;					\
-	perf_reg_value(&val, user_regs, PERF_REG_RISCV_##r);	\
-	val;							\
-})
-
-	dwarf_regs[0]  = 0;
-	dwarf_regs[1]  = REG(RA);
-	dwarf_regs[2]  = REG(SP);
-	dwarf_regs[3]  = REG(GP);
-	dwarf_regs[4]  = REG(TP);
-	dwarf_regs[5]  = REG(T0);
-	dwarf_regs[6]  = REG(T1);
-	dwarf_regs[7]  = REG(T2);
-	dwarf_regs[8]  = REG(S0);
-	dwarf_regs[9]  = REG(S1);
-	dwarf_regs[10] = REG(A0);
-	dwarf_regs[11] = REG(A1);
-	dwarf_regs[12] = REG(A2);
-	dwarf_regs[13] = REG(A3);
-	dwarf_regs[14] = REG(A4);
-	dwarf_regs[15] = REG(A5);
-	dwarf_regs[16] = REG(A6);
-	dwarf_regs[17] = REG(A7);
-	dwarf_regs[18] = REG(S2);
-	dwarf_regs[19] = REG(S3);
-	dwarf_regs[20] = REG(S4);
-	dwarf_regs[21] = REG(S5);
-	dwarf_regs[22] = REG(S6);
-	dwarf_regs[23] = REG(S7);
-	dwarf_regs[24] = REG(S8);
-	dwarf_regs[25] = REG(S9);
-	dwarf_regs[26] = REG(S10);
-	dwarf_regs[27] = REG(S11);
-	dwarf_regs[28] = REG(T3);
-	dwarf_regs[29] = REG(T4);
-	dwarf_regs[30] = REG(T5);
-	dwarf_regs[31] = REG(T6);
-	dwfl_thread_state_register_pc(thread, REG(PC));
-
-	return dwfl_thread_state_registers(thread, 0, PERF_REG_RISCV_MAX,
-					   dwarf_regs);
-}
diff --git a/tools/perf/arch/s390/util/Build b/tools/perf/arch/s390/util/Build
index c64eb18dbdae..5391d26fedd4 100644
--- a/tools/perf/arch/s390/util/Build
+++ b/tools/perf/arch/s390/util/Build
@@ -2,8 +2,6 @@ perf-util-y += header.o
 perf-util-$(CONFIG_LIBTRACEEVENT) += kvm-stat.o
 perf-util-y += perf_regs.o
 
-perf-util-$(CONFIG_LIBDW_DWARF_UNWIND) += unwind-libdw.o
-
 perf-util-y += machine.o
 perf-util-y += pmu.o
 
diff --git a/tools/perf/arch/s390/util/unwind-libdw.c b/tools/perf/arch/s390/util/unwind-libdw.c
deleted file mode 100644
index c27c7a0d1076..000000000000
--- a/tools/perf/arch/s390/util/unwind-libdw.c
+++ /dev/null
@@ -1,65 +0,0 @@
-#include <linux/kernel.h>
-#include <elfutils/libdwfl.h>
-#include "../../util/unwind-libdw.h"
-#include "../../util/perf_regs.h"
-#include "../../util/event.h"
-#include "../../util/sample.h"
-#include "dwarf-regs-table.h"
-#include "perf_regs.h"
-
-
-bool libdw__arch_set_initial_registers(Dwfl_Thread *thread, void *arg)
-{
-	struct unwind_info *ui = arg;
-	struct regs_dump *user_regs = perf_sample__user_regs(ui->sample);
-	Dwarf_Word dwarf_regs[ARRAY_SIZE(s390_dwarf_regs)];
-
-#define REG(r) ({						\
-	Dwarf_Word val = 0;					\
-	perf_reg_value(&val, user_regs, PERF_REG_S390_##r);	\
-	val;							\
-})
-	/*
-	 * For DWARF register mapping details,
-	 * see also perf/arch/s390/include/dwarf-regs-table.h
-	 */
-	dwarf_regs[0]  = REG(R0);
-	dwarf_regs[1]  = REG(R1);
-	dwarf_regs[2]  = REG(R2);
-	dwarf_regs[3]  = REG(R3);
-	dwarf_regs[4]  = REG(R4);
-	dwarf_regs[5]  = REG(R5);
-	dwarf_regs[6]  = REG(R6);
-	dwarf_regs[7]  = REG(R7);
-	dwarf_regs[8]  = REG(R8);
-	dwarf_regs[9]  = REG(R9);
-	dwarf_regs[10] = REG(R10);
-	dwarf_regs[11] = REG(R11);
-	dwarf_regs[12] = REG(R12);
-	dwarf_regs[13] = REG(R13);
-	dwarf_regs[14] = REG(R14);
-	dwarf_regs[15] = REG(R15);
-
-	dwarf_regs[16] = REG(FP0);
-	dwarf_regs[17] = REG(FP2);
-	dwarf_regs[18] = REG(FP4);
-	dwarf_regs[19] = REG(FP6);
-	dwarf_regs[20] = REG(FP1);
-	dwarf_regs[21] = REG(FP3);
-	dwarf_regs[22] = REG(FP5);
-	dwarf_regs[23] = REG(FP7);
-	dwarf_regs[24] = REG(FP8);
-	dwarf_regs[25] = REG(FP10);
-	dwarf_regs[26] = REG(FP12);
-	dwarf_regs[27] = REG(FP14);
-	dwarf_regs[28] = REG(FP9);
-	dwarf_regs[29] = REG(FP11);
-	dwarf_regs[30] = REG(FP13);
-	dwarf_regs[31] = REG(FP15);
-
-	dwarf_regs[64] = REG(MASK);
-	dwarf_regs[65] = REG(PC);
-
-	dwfl_thread_state_register_pc(thread, dwarf_regs[65]);
-	return dwfl_thread_state_registers(thread, 0, 32, dwarf_regs);
-}
diff --git a/tools/perf/arch/x86/util/Build b/tools/perf/arch/x86/util/Build
index c0dc5965f362..fad256252bb9 100644
--- a/tools/perf/arch/x86/util/Build
+++ b/tools/perf/arch/x86/util/Build
@@ -12,7 +12,6 @@ perf-util-y += evsel.o
 perf-util-y += iostat.o
 
 perf-util-$(CONFIG_LOCAL_LIBUNWIND)    += unwind-libunwind.o
-perf-util-$(CONFIG_LIBDW_DWARF_UNWIND) += unwind-libdw.o
 
 perf-util-y += auxtrace.o
 perf-util-y += archinsn.o
diff --git a/tools/perf/arch/x86/util/unwind-libdw.c b/tools/perf/arch/x86/util/unwind-libdw.c
deleted file mode 100644
index 798493e887d7..000000000000
--- a/tools/perf/arch/x86/util/unwind-libdw.c
+++ /dev/null
@@ -1,54 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <elfutils/libdwfl.h>
-#include "perf_regs.h"
-#include "../../../util/unwind-libdw.h"
-#include "../../../util/perf_regs.h"
-#include "util/sample.h"
-
-bool libdw__arch_set_initial_registers(Dwfl_Thread *thread, void *arg)
-{
-	struct unwind_info *ui = arg;
-	struct regs_dump *user_regs = perf_sample__user_regs(ui->sample);
-	Dwarf_Word dwarf_regs[17];
-	unsigned nregs;
-
-#define REG(r) ({						\
-	Dwarf_Word val = 0;					\
-	perf_reg_value(&val, user_regs, PERF_REG_X86_##r);	\
-	val;							\
-})
-
-	if (user_regs->abi == PERF_SAMPLE_REGS_ABI_32) {
-		dwarf_regs[0] = REG(AX);
-		dwarf_regs[1] = REG(CX);
-		dwarf_regs[2] = REG(DX);
-		dwarf_regs[3] = REG(BX);
-		dwarf_regs[4] = REG(SP);
-		dwarf_regs[5] = REG(BP);
-		dwarf_regs[6] = REG(SI);
-		dwarf_regs[7] = REG(DI);
-		dwarf_regs[8] = REG(IP);
-		nregs = 9;
-	} else {
-		dwarf_regs[0]  = REG(AX);
-		dwarf_regs[1]  = REG(DX);
-		dwarf_regs[2]  = REG(CX);
-		dwarf_regs[3]  = REG(BX);
-		dwarf_regs[4]  = REG(SI);
-		dwarf_regs[5]  = REG(DI);
-		dwarf_regs[6]  = REG(BP);
-		dwarf_regs[7]  = REG(SP);
-		dwarf_regs[8]  = REG(R8);
-		dwarf_regs[9]  = REG(R9);
-		dwarf_regs[10] = REG(R10);
-		dwarf_regs[11] = REG(R11);
-		dwarf_regs[12] = REG(R12);
-		dwarf_regs[13] = REG(R13);
-		dwarf_regs[14] = REG(R14);
-		dwarf_regs[15] = REG(R15);
-		dwarf_regs[16] = REG(IP);
-		nregs = 17;
-	}
-
-	return dwfl_thread_state_registers(thread, 0, nregs, dwarf_regs);
-}
diff --git a/tools/perf/util/Build b/tools/perf/util/Build
index 4915f237ba9e..5efec73be474 100644
--- a/tools/perf/util/Build
+++ b/tools/perf/util/Build
@@ -227,6 +227,7 @@ perf-util-$(CONFIG_LIBDW) += annotate-data.o
 perf-util-$(CONFIG_LIBDW) += libdw.o
 
 perf-util-$(CONFIG_LIBDW_DWARF_UNWIND) += unwind-libdw.o
+perf-util-$(CONFIG_LIBDW_DWARF_UNWIND) += unwind-libdw-arch/
 perf-util-$(CONFIG_LOCAL_LIBUNWIND)    += unwind-libunwind-local.o
 perf-util-$(CONFIG_LIBUNWIND)          += unwind-libunwind.o
 perf-util-$(CONFIG_LIBUNWIND_X86)      += libunwind/x86_32.o
diff --git a/tools/perf/util/unwind-libdw-arch/Build b/tools/perf/util/unwind-libdw-arch/Build
new file mode 100644
index 000000000000..ef17a83a7813
--- /dev/null
+++ b/tools/perf/util/unwind-libdw-arch/Build
@@ -0,0 +1,8 @@
+perf-util-y += unwind-libdw-x86.o
+perf-util-y += unwind-libdw-arm.o
+perf-util-y += unwind-libdw-arm64.o
+perf-util-y += unwind-libdw-csky.o
+perf-util-y += unwind-libdw-loongarch.o
+perf-util-y += unwind-libdw-powerpc.o
+perf-util-y += unwind-libdw-riscv.o
+perf-util-y += unwind-libdw-s390.o
diff --git a/tools/perf/util/unwind-libdw-arch/unwind-libdw-arm.c b/tools/perf/util/unwind-libdw-arch/unwind-libdw-arm.c
new file mode 100644
index 000000000000..56e9b5975bcc
--- /dev/null
+++ b/tools/perf/util/unwind-libdw-arch/unwind-libdw-arm.c
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <elfutils/libdwfl.h>
+#include "../arch/arm/include/uapi/asm/perf_regs.h"
+#include "util/unwind-libdw.h"
+#include "util/perf_regs.h"
+#include "util/sample.h"
+
+bool libdw_set_initial_registers_arm(Dwfl_Thread *thread, void *arg)
+{
+	struct unwind_info *ui = arg;
+	struct regs_dump *user_regs = perf_sample__user_regs(ui->sample);
+	Dwarf_Word dwarf_regs[PERF_REG_ARM_MAX];
+
+#define REG(r) ({						\
+	Dwarf_Word val = 0;					\
+	perf_reg_value(&val, user_regs, PERF_REG_ARM_##r);	\
+	val;							\
+})
+
+	dwarf_regs[0]  = REG(R0);
+	dwarf_regs[1]  = REG(R1);
+	dwarf_regs[2]  = REG(R2);
+	dwarf_regs[3]  = REG(R3);
+	dwarf_regs[4]  = REG(R4);
+	dwarf_regs[5]  = REG(R5);
+	dwarf_regs[6]  = REG(R6);
+	dwarf_regs[7]  = REG(R7);
+	dwarf_regs[8]  = REG(R8);
+	dwarf_regs[9]  = REG(R9);
+	dwarf_regs[10] = REG(R10);
+	dwarf_regs[11] = REG(FP);
+	dwarf_regs[12] = REG(IP);
+	dwarf_regs[13] = REG(SP);
+	dwarf_regs[14] = REG(LR);
+	dwarf_regs[15] = REG(PC);
+
+	return dwfl_thread_state_registers(thread, 0, PERF_REG_ARM_MAX,
+					   dwarf_regs);
+}
diff --git a/tools/perf/util/unwind-libdw-arch/unwind-libdw-arm64.c b/tools/perf/util/unwind-libdw-arch/unwind-libdw-arm64.c
new file mode 100644
index 000000000000..29b6833e036c
--- /dev/null
+++ b/tools/perf/util/unwind-libdw-arch/unwind-libdw-arm64.c
@@ -0,0 +1,61 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <elfutils/libdwfl.h>
+#include "../arch/arm64/include/uapi/asm/perf_regs.h"
+#include "util/unwind-libdw.h"
+#include "util/perf_regs.h"
+#include "util/sample.h"
+
+bool libdw_set_initial_registers_arm64(Dwfl_Thread *thread, void *arg)
+{
+	struct unwind_info *ui = arg;
+	struct regs_dump *user_regs = perf_sample__user_regs(ui->sample);
+	Dwarf_Word dwarf_regs[PERF_REG_ARM64_MAX], dwarf_pc;
+
+#define REG(r) ({						\
+	Dwarf_Word val = 0;					\
+	perf_reg_value(&val, user_regs, PERF_REG_ARM64_##r);	\
+	val;							\
+})
+
+	dwarf_regs[0]  = REG(X0);
+	dwarf_regs[1]  = REG(X1);
+	dwarf_regs[2]  = REG(X2);
+	dwarf_regs[3]  = REG(X3);
+	dwarf_regs[4]  = REG(X4);
+	dwarf_regs[5]  = REG(X5);
+	dwarf_regs[6]  = REG(X6);
+	dwarf_regs[7]  = REG(X7);
+	dwarf_regs[8]  = REG(X8);
+	dwarf_regs[9]  = REG(X9);
+	dwarf_regs[10] = REG(X10);
+	dwarf_regs[11] = REG(X11);
+	dwarf_regs[12] = REG(X12);
+	dwarf_regs[13] = REG(X13);
+	dwarf_regs[14] = REG(X14);
+	dwarf_regs[15] = REG(X15);
+	dwarf_regs[16] = REG(X16);
+	dwarf_regs[17] = REG(X17);
+	dwarf_regs[18] = REG(X18);
+	dwarf_regs[19] = REG(X19);
+	dwarf_regs[20] = REG(X20);
+	dwarf_regs[21] = REG(X21);
+	dwarf_regs[22] = REG(X22);
+	dwarf_regs[23] = REG(X23);
+	dwarf_regs[24] = REG(X24);
+	dwarf_regs[25] = REG(X25);
+	dwarf_regs[26] = REG(X26);
+	dwarf_regs[27] = REG(X27);
+	dwarf_regs[28] = REG(X28);
+	dwarf_regs[29] = REG(X29);
+	dwarf_regs[30] = REG(LR);
+	dwarf_regs[31] = REG(SP);
+
+	if (!dwfl_thread_state_registers(thread, 0, PERF_REG_ARM64_MAX,
+					 dwarf_regs))
+		return false;
+
+	dwarf_pc = REG(PC);
+	dwfl_thread_state_register_pc(thread, dwarf_pc);
+
+	return true;
+}
diff --git a/tools/perf/util/unwind-libdw-arch/unwind-libdw-csky.c b/tools/perf/util/unwind-libdw-arch/unwind-libdw-csky.c
new file mode 100644
index 000000000000..2556d034c32a
--- /dev/null
+++ b/tools/perf/util/unwind-libdw-arch/unwind-libdw-csky.c
@@ -0,0 +1,78 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (C) 2019 Hangzhou C-SKY Microsystems co.,ltd.
+
+#include <elfutils/libdwfl.h>
+#include "../arch/csky/include/uapi/asm/perf_regs.h"
+#include "util/unwind-libdw.h"
+#include "util/perf_regs.h"
+#include "util/sample.h"
+
+bool libdw_set_initial_registers_csky(Dwfl_Thread *thread, void *arg)
+{
+	struct unwind_info *ui = arg;
+	struct regs_dump *user_regs = perf_sample__user_regs(ui->sample);
+	Dwarf_Word dwarf_regs[PERF_REG_CSKY_MAX];
+
+#define REG(r) ({						\
+	Dwarf_Word val = 0;					\
+	perf_reg_value(&val, user_regs, PERF_REG_CSKY_##r);	\
+	val;							\
+})
+
+#if defined(__CSKYABIV2__)
+	dwarf_regs[0]  = REG(A0);
+	dwarf_regs[1]  = REG(A1);
+	dwarf_regs[2]  = REG(A2);
+	dwarf_regs[3]  = REG(A3);
+	dwarf_regs[4]  = REG(REGS0);
+	dwarf_regs[5]  = REG(REGS1);
+	dwarf_regs[6]  = REG(REGS2);
+	dwarf_regs[7]  = REG(REGS3);
+	dwarf_regs[8]  = REG(REGS4);
+	dwarf_regs[9]  = REG(REGS5);
+	dwarf_regs[10] = REG(REGS6);
+	dwarf_regs[11] = REG(REGS7);
+	dwarf_regs[12] = REG(REGS8);
+	dwarf_regs[13] = REG(REGS9);
+	dwarf_regs[14] = REG(SP);
+	dwarf_regs[15] = REG(LR);
+	dwarf_regs[16] = REG(EXREGS0);
+	dwarf_regs[17] = REG(EXREGS1);
+	dwarf_regs[18] = REG(EXREGS2);
+	dwarf_regs[19] = REG(EXREGS3);
+	dwarf_regs[20] = REG(EXREGS4);
+	dwarf_regs[21] = REG(EXREGS5);
+	dwarf_regs[22] = REG(EXREGS6);
+	dwarf_regs[23] = REG(EXREGS7);
+	dwarf_regs[24] = REG(EXREGS8);
+	dwarf_regs[25] = REG(EXREGS9);
+	dwarf_regs[26] = REG(EXREGS10);
+	dwarf_regs[27] = REG(EXREGS11);
+	dwarf_regs[28] = REG(EXREGS12);
+	dwarf_regs[29] = REG(EXREGS13);
+	dwarf_regs[30] = REG(EXREGS14);
+	dwarf_regs[31] = REG(TLS);
+	dwarf_regs[32] = REG(PC);
+#else
+	dwarf_regs[0]  = REG(SP);
+	dwarf_regs[1]  = REG(REGS9);
+	dwarf_regs[2]  = REG(A0);
+	dwarf_regs[3]  = REG(A1);
+	dwarf_regs[4]  = REG(A2);
+	dwarf_regs[5]  = REG(A3);
+	dwarf_regs[6]  = REG(REGS0);
+	dwarf_regs[7]  = REG(REGS1);
+	dwarf_regs[8]  = REG(REGS2);
+	dwarf_regs[9]  = REG(REGS3);
+	dwarf_regs[10] = REG(REGS4);
+	dwarf_regs[11] = REG(REGS5);
+	dwarf_regs[12] = REG(REGS6);
+	dwarf_regs[13] = REG(REGS7);
+	dwarf_regs[14] = REG(REGS8);
+	dwarf_regs[15] = REG(LR);
+#endif
+	dwfl_thread_state_register_pc(thread, REG(PC));
+
+	return dwfl_thread_state_registers(thread, 0, PERF_REG_CSKY_MAX,
+					   dwarf_regs);
+}
diff --git a/tools/perf/util/unwind-libdw-arch/unwind-libdw-loongarch.c b/tools/perf/util/unwind-libdw-arch/unwind-libdw-loongarch.c
new file mode 100644
index 000000000000..5fca673508be
--- /dev/null
+++ b/tools/perf/util/unwind-libdw-arch/unwind-libdw-loongarch.c
@@ -0,0 +1,57 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (C) 2020-2023 Loongson Technology Corporation Limited */
+
+#include <elfutils/libdwfl.h>
+#include "../arch/loongarch/include/uapi/asm/perf_regs.h"
+#include "util/unwind-libdw.h"
+#include "util/perf_regs.h"
+#include "util/sample.h"
+
+bool libdw_set_initial_registers_loongarch(Dwfl_Thread *thread, void *arg)
+{
+	struct unwind_info *ui = arg;
+	struct regs_dump *user_regs = perf_sample__user_regs(ui->sample);
+	Dwarf_Word dwarf_regs[PERF_REG_LOONGARCH_MAX];
+
+#define REG(r) ({							\
+	Dwarf_Word val = 0;						\
+	perf_reg_value(&val, user_regs, PERF_REG_LOONGARCH_##r);	\
+	val;								\
+})
+
+	dwarf_regs[0]  = 0;
+	dwarf_regs[1]  = REG(R1);
+	dwarf_regs[2]  = REG(R2);
+	dwarf_regs[3]  = REG(R3);
+	dwarf_regs[4]  = REG(R4);
+	dwarf_regs[5]  = REG(R5);
+	dwarf_regs[6]  = REG(R6);
+	dwarf_regs[7]  = REG(R7);
+	dwarf_regs[8]  = REG(R8);
+	dwarf_regs[9]  = REG(R9);
+	dwarf_regs[10] = REG(R10);
+	dwarf_regs[11] = REG(R11);
+	dwarf_regs[12] = REG(R12);
+	dwarf_regs[13] = REG(R13);
+	dwarf_regs[14] = REG(R14);
+	dwarf_regs[15] = REG(R15);
+	dwarf_regs[16] = REG(R16);
+	dwarf_regs[17] = REG(R17);
+	dwarf_regs[18] = REG(R18);
+	dwarf_regs[19] = REG(R19);
+	dwarf_regs[20] = REG(R20);
+	dwarf_regs[21] = REG(R21);
+	dwarf_regs[22] = REG(R22);
+	dwarf_regs[23] = REG(R23);
+	dwarf_regs[24] = REG(R24);
+	dwarf_regs[25] = REG(R25);
+	dwarf_regs[26] = REG(R26);
+	dwarf_regs[27] = REG(R27);
+	dwarf_regs[28] = REG(R28);
+	dwarf_regs[29] = REG(R29);
+	dwarf_regs[30] = REG(R30);
+	dwarf_regs[31] = REG(R31);
+	dwfl_thread_state_register_pc(thread, REG(PC));
+
+	return dwfl_thread_state_registers(thread, 0, PERF_REG_LOONGARCH_MAX, dwarf_regs);
+}
diff --git a/tools/perf/util/unwind-libdw-arch/unwind-libdw-powerpc.c b/tools/perf/util/unwind-libdw-arch/unwind-libdw-powerpc.c
new file mode 100644
index 000000000000..1560db45e7b4
--- /dev/null
+++ b/tools/perf/util/unwind-libdw-arch/unwind-libdw-powerpc.c
@@ -0,0 +1,76 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <elfutils/libdwfl.h>
+#include <linux/kernel.h>
+#include "../arch/powerpc/include/uapi/asm/perf_regs.h"
+#include "util/unwind-libdw.h"
+#include "util/perf_regs.h"
+#include "util/sample.h"
+
+/* See backends/ppc_initreg.c and backends/ppc_regs.c in elfutils.  */
+static const int special_regs[3][2] = {
+	{ 65, PERF_REG_POWERPC_LINK },
+	{ 101, PERF_REG_POWERPC_XER },
+	{ 109, PERF_REG_POWERPC_CTR },
+};
+
+bool libdw_set_initial_registers_powerpc(Dwfl_Thread *thread, void *arg)
+{
+	struct unwind_info *ui = arg;
+	struct regs_dump *user_regs = perf_sample__user_regs(ui->sample);
+	Dwarf_Word dwarf_regs[32], dwarf_nip;
+	size_t i;
+
+#define REG(r) ({						\
+	Dwarf_Word val = 0;					\
+	perf_reg_value(&val, user_regs, PERF_REG_POWERPC_##r);	\
+	val;							\
+})
+
+	dwarf_regs[0]  = REG(R0);
+	dwarf_regs[1]  = REG(R1);
+	dwarf_regs[2]  = REG(R2);
+	dwarf_regs[3]  = REG(R3);
+	dwarf_regs[4]  = REG(R4);
+	dwarf_regs[5]  = REG(R5);
+	dwarf_regs[6]  = REG(R6);
+	dwarf_regs[7]  = REG(R7);
+	dwarf_regs[8]  = REG(R8);
+	dwarf_regs[9]  = REG(R9);
+	dwarf_regs[10] = REG(R10);
+	dwarf_regs[11] = REG(R11);
+	dwarf_regs[12] = REG(R12);
+	dwarf_regs[13] = REG(R13);
+	dwarf_regs[14] = REG(R14);
+	dwarf_regs[15] = REG(R15);
+	dwarf_regs[16] = REG(R16);
+	dwarf_regs[17] = REG(R17);
+	dwarf_regs[18] = REG(R18);
+	dwarf_regs[19] = REG(R19);
+	dwarf_regs[20] = REG(R20);
+	dwarf_regs[21] = REG(R21);
+	dwarf_regs[22] = REG(R22);
+	dwarf_regs[23] = REG(R23);
+	dwarf_regs[24] = REG(R24);
+	dwarf_regs[25] = REG(R25);
+	dwarf_regs[26] = REG(R26);
+	dwarf_regs[27] = REG(R27);
+	dwarf_regs[28] = REG(R28);
+	dwarf_regs[29] = REG(R29);
+	dwarf_regs[30] = REG(R30);
+	dwarf_regs[31] = REG(R31);
+	if (!dwfl_thread_state_registers(thread, 0, 32, dwarf_regs))
+		return false;
+
+	dwarf_nip = REG(NIP);
+	dwfl_thread_state_register_pc(thread, dwarf_nip);
+	for (i = 0; i < ARRAY_SIZE(special_regs); i++) {
+		Dwarf_Word val = 0;
+		perf_reg_value(&val, user_regs, special_regs[i][1]);
+		if (!dwfl_thread_state_registers(thread,
+						 special_regs[i][0], 1,
+						 &val))
+			return false;
+	}
+
+	return true;
+}
diff --git a/tools/perf/util/unwind-libdw-arch/unwind-libdw-riscv.c b/tools/perf/util/unwind-libdw-arch/unwind-libdw-riscv.c
new file mode 100644
index 000000000000..c2e2c4b6b2e0
--- /dev/null
+++ b/tools/perf/util/unwind-libdw-arch/unwind-libdw-riscv.c
@@ -0,0 +1,58 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (C) 2019 Hangzhou C-SKY Microsystems co.,ltd. */
+
+#include <elfutils/libdwfl.h>
+#include "../arch/riscv/include/uapi/asm/perf_regs.h"
+#include "util/unwind-libdw.h"
+#include "util/perf_regs.h"
+#include "util/sample.h"
+
+bool libdw_set_initial_registers_riscv(Dwfl_Thread *thread, void *arg)
+{
+	struct unwind_info *ui = arg;
+	struct regs_dump *user_regs = perf_sample__user_regs(ui->sample);
+	Dwarf_Word dwarf_regs[32];
+
+#define REG(r) ({						\
+	Dwarf_Word val = 0;					\
+	perf_reg_value(&val, user_regs, PERF_REG_RISCV_##r);	\
+	val;							\
+})
+
+	dwarf_regs[0]  = 0;
+	dwarf_regs[1]  = REG(RA);
+	dwarf_regs[2]  = REG(SP);
+	dwarf_regs[3]  = REG(GP);
+	dwarf_regs[4]  = REG(TP);
+	dwarf_regs[5]  = REG(T0);
+	dwarf_regs[6]  = REG(T1);
+	dwarf_regs[7]  = REG(T2);
+	dwarf_regs[8]  = REG(S0);
+	dwarf_regs[9]  = REG(S1);
+	dwarf_regs[10] = REG(A0);
+	dwarf_regs[11] = REG(A1);
+	dwarf_regs[12] = REG(A2);
+	dwarf_regs[13] = REG(A3);
+	dwarf_regs[14] = REG(A4);
+	dwarf_regs[15] = REG(A5);
+	dwarf_regs[16] = REG(A6);
+	dwarf_regs[17] = REG(A7);
+	dwarf_regs[18] = REG(S2);
+	dwarf_regs[19] = REG(S3);
+	dwarf_regs[20] = REG(S4);
+	dwarf_regs[21] = REG(S5);
+	dwarf_regs[22] = REG(S6);
+	dwarf_regs[23] = REG(S7);
+	dwarf_regs[24] = REG(S8);
+	dwarf_regs[25] = REG(S9);
+	dwarf_regs[26] = REG(S10);
+	dwarf_regs[27] = REG(S11);
+	dwarf_regs[28] = REG(T3);
+	dwarf_regs[29] = REG(T4);
+	dwarf_regs[30] = REG(T5);
+	dwarf_regs[31] = REG(T6);
+	dwfl_thread_state_register_pc(thread, REG(PC));
+
+	return dwfl_thread_state_registers(thread, 0, PERF_REG_RISCV_MAX,
+					   dwarf_regs);
+}
diff --git a/tools/perf/util/unwind-libdw-arch/unwind-libdw-s390.c b/tools/perf/util/unwind-libdw-arch/unwind-libdw-s390.c
new file mode 100644
index 000000000000..1e05e9d9d95f
--- /dev/null
+++ b/tools/perf/util/unwind-libdw-arch/unwind-libdw-s390.c
@@ -0,0 +1,65 @@
+#include <linux/kernel.h>
+#include <elfutils/libdwfl.h>
+#include "util/unwind-libdw.h"
+#include "util/perf_regs.h"
+#include "util/event.h"
+#include "util/sample.h"
+#include "../arch/s390/include/dwarf-regs-table.h"
+#include "../arch/s390/include/uapi/asm/perf_regs.h"
+
+
+bool libdw_set_initial_registers_s390(Dwfl_Thread *thread, void *arg)
+{
+	struct unwind_info *ui = arg;
+	struct regs_dump *user_regs = perf_sample__user_regs(ui->sample);
+	Dwarf_Word dwarf_regs[ARRAY_SIZE(s390_dwarf_regs)];
+
+#define REG(r) ({						\
+	Dwarf_Word val = 0;					\
+	perf_reg_value(&val, user_regs, PERF_REG_S390_##r);	\
+	val;							\
+})
+	/*
+	 * For DWARF register mapping details,
+	 * see also perf/arch/s390/include/dwarf-regs-table.h
+	 */
+	dwarf_regs[0]  = REG(R0);
+	dwarf_regs[1]  = REG(R1);
+	dwarf_regs[2]  = REG(R2);
+	dwarf_regs[3]  = REG(R3);
+	dwarf_regs[4]  = REG(R4);
+	dwarf_regs[5]  = REG(R5);
+	dwarf_regs[6]  = REG(R6);
+	dwarf_regs[7]  = REG(R7);
+	dwarf_regs[8]  = REG(R8);
+	dwarf_regs[9]  = REG(R9);
+	dwarf_regs[10] = REG(R10);
+	dwarf_regs[11] = REG(R11);
+	dwarf_regs[12] = REG(R12);
+	dwarf_regs[13] = REG(R13);
+	dwarf_regs[14] = REG(R14);
+	dwarf_regs[15] = REG(R15);
+
+	dwarf_regs[16] = REG(FP0);
+	dwarf_regs[17] = REG(FP2);
+	dwarf_regs[18] = REG(FP4);
+	dwarf_regs[19] = REG(FP6);
+	dwarf_regs[20] = REG(FP1);
+	dwarf_regs[21] = REG(FP3);
+	dwarf_regs[22] = REG(FP5);
+	dwarf_regs[23] = REG(FP7);
+	dwarf_regs[24] = REG(FP8);
+	dwarf_regs[25] = REG(FP10);
+	dwarf_regs[26] = REG(FP12);
+	dwarf_regs[27] = REG(FP14);
+	dwarf_regs[28] = REG(FP9);
+	dwarf_regs[29] = REG(FP11);
+	dwarf_regs[30] = REG(FP13);
+	dwarf_regs[31] = REG(FP15);
+
+	dwarf_regs[64] = REG(MASK);
+	dwarf_regs[65] = REG(PC);
+
+	dwfl_thread_state_register_pc(thread, dwarf_regs[65]);
+	return dwfl_thread_state_registers(thread, 0, 32, dwarf_regs);
+}
diff --git a/tools/perf/util/unwind-libdw-arch/unwind-libdw-x86.c b/tools/perf/util/unwind-libdw-arch/unwind-libdw-x86.c
new file mode 100644
index 000000000000..dd27545a4a68
--- /dev/null
+++ b/tools/perf/util/unwind-libdw-arch/unwind-libdw-x86.c
@@ -0,0 +1,54 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <elfutils/libdwfl.h>
+#include "../arch/x86/include/uapi/asm/perf_regs.h"
+#include "util/unwind-libdw.h"
+#include "util/perf_regs.h"
+#include "util/sample.h"
+
+bool libdw_set_initial_registers_x86(Dwfl_Thread *thread, void *arg)
+{
+	struct unwind_info *ui = arg;
+	struct regs_dump *user_regs = perf_sample__user_regs(ui->sample);
+	Dwarf_Word dwarf_regs[17];
+	unsigned nregs;
+
+#define REG(r) ({						\
+	Dwarf_Word val = 0;					\
+	perf_reg_value(&val, user_regs, PERF_REG_X86_##r);	\
+	val;							\
+})
+
+	if (user_regs->abi == PERF_SAMPLE_REGS_ABI_32) {
+		dwarf_regs[0] = REG(AX);
+		dwarf_regs[1] = REG(CX);
+		dwarf_regs[2] = REG(DX);
+		dwarf_regs[3] = REG(BX);
+		dwarf_regs[4] = REG(SP);
+		dwarf_regs[5] = REG(BP);
+		dwarf_regs[6] = REG(SI);
+		dwarf_regs[7] = REG(DI);
+		dwarf_regs[8] = REG(IP);
+		nregs = 9;
+	} else {
+		dwarf_regs[0]  = REG(AX);
+		dwarf_regs[1]  = REG(DX);
+		dwarf_regs[2]  = REG(CX);
+		dwarf_regs[3]  = REG(BX);
+		dwarf_regs[4]  = REG(SI);
+		dwarf_regs[5]  = REG(DI);
+		dwarf_regs[6]  = REG(BP);
+		dwarf_regs[7]  = REG(SP);
+		dwarf_regs[8]  = REG(R8);
+		dwarf_regs[9]  = REG(R9);
+		dwarf_regs[10] = REG(R10);
+		dwarf_regs[11] = REG(R11);
+		dwarf_regs[12] = REG(R12);
+		dwarf_regs[13] = REG(R13);
+		dwarf_regs[14] = REG(R14);
+		dwarf_regs[15] = REG(R15);
+		dwarf_regs[16] = REG(IP);
+		nregs = 17;
+	}
+
+	return dwfl_thread_state_registers(thread, 0, nregs, dwarf_regs);
+}
diff --git a/tools/perf/util/unwind-libdw.c b/tools/perf/util/unwind-libdw.c
index 3ff427a49e4c..b2e194a8be39 100644
--- a/tools/perf/util/unwind-libdw.c
+++ b/tools/perf/util/unwind-libdw.c
@@ -225,11 +225,45 @@ static bool memory_read(Dwfl *dwfl __maybe_unused, Dwarf_Addr addr, Dwarf_Word *
 	return true;
 }
 
-static const Dwfl_Thread_Callbacks callbacks = {
-	.next_thread		= next_thread,
-	.memory_read		= memory_read,
-	.set_initial_registers	= libdw__arch_set_initial_registers,
-};
+#define DEFINE_DWFL_THREAD_CALLBACKS(arch)                           \
+static const Dwfl_Thread_Callbacks callbacks_##arch = {              \
+	.next_thread           = next_thread,                        \
+	.memory_read           = memory_read,                        \
+	.set_initial_registers = libdw_set_initial_registers_##arch, \
+}
+
+DEFINE_DWFL_THREAD_CALLBACKS(x86);
+DEFINE_DWFL_THREAD_CALLBACKS(arm);
+DEFINE_DWFL_THREAD_CALLBACKS(arm64);
+DEFINE_DWFL_THREAD_CALLBACKS(csky);
+DEFINE_DWFL_THREAD_CALLBACKS(loongarch);
+DEFINE_DWFL_THREAD_CALLBACKS(powerpc);
+DEFINE_DWFL_THREAD_CALLBACKS(riscv);
+DEFINE_DWFL_THREAD_CALLBACKS(s390);
+
+static const Dwfl_Thread_Callbacks *get_thread_callbacks(const char *arch)
+{
+	if (!strcmp(arch, "arm"))
+		return &callbacks_arm;
+	else if (!strcmp(arch, "arm64"))
+		return &callbacks_arm64;
+	else if (!strcmp(arch, "csky"))
+		return &callbacks_csky;
+	else if (!strcmp(arch, "loongarch"))
+		return &callbacks_loongarch;
+	else if (!strcmp(arch, "powerpc"))
+		return &callbacks_powerpc;
+	else if (!strcmp(arch, "riscv"))
+		return &callbacks_riscv;
+	else if (!strcmp(arch, "s390"))
+		return &callbacks_s390;
+	else if (!strcmp(arch, "x86"))
+		return &callbacks_x86;
+
+	pr_err("Fail to get thread callbacks for arch %s, returns NULL\n",
+	       arch);
+	return NULL;
+}
 
 static int
 frame_callback(Dwfl_Frame *state, void *arg)
@@ -278,6 +312,7 @@ int unwind__get_entries(unwind_entry_cb_t cb, void *arg,
 	const char *arch = perf_env__arch(ui_buf.machine->env);
 	Dwarf_Word ip;
 	int err = -EINVAL, i;
+	const Dwfl_Thread_Callbacks *callbacks;
 
 	if (!data->user_regs || !data->user_regs->regs)
 		return -EINVAL;
@@ -300,7 +335,11 @@ int unwind__get_entries(unwind_entry_cb_t cb, void *arg,
 	if (err)
 		goto out;
 
-	err = !dwfl_attach_state(ui->dwfl, EM_NONE, thread__tid(thread), &callbacks, ui);
+	callbacks = get_thread_callbacks(arch);
+	if (!callbacks)
+		goto out;
+
+	err = !dwfl_attach_state(ui->dwfl, EM_NONE, thread__tid(thread), callbacks, ui);
 	if (err)
 		goto out;
 
diff --git a/tools/perf/util/unwind-libdw.h b/tools/perf/util/unwind-libdw.h
index 8c88bc4f2304..574b29848cce 100644
--- a/tools/perf/util/unwind-libdw.h
+++ b/tools/perf/util/unwind-libdw.h
@@ -9,7 +9,15 @@ struct machine;
 struct perf_sample;
 struct thread;
 
-bool libdw__arch_set_initial_registers(Dwfl_Thread *thread, void *arg);
+bool libdw_set_initial_registers_x86(Dwfl_Thread *thread, void *arg);
+bool libdw_set_initial_registers_arm(Dwfl_Thread *thread, void *arg);
+bool libdw_set_initial_registers_arm64(Dwfl_Thread *thread, void *arg);
+bool libdw_set_initial_registers_csky(Dwfl_Thread *thread, void *arg);
+bool libdw_set_initial_registers_loongarch(Dwfl_Thread *thread, void *arg);
+bool libdw_set_initial_registers_mips(Dwfl_Thread *thread, void *arg);
+bool libdw_set_initial_registers_powerpc(Dwfl_Thread *thread, void *arg);
+bool libdw_set_initial_registers_riscv(Dwfl_Thread *thread, void *arg);
+bool libdw_set_initial_registers_s390(Dwfl_Thread *thread, void *arg);
 
 struct unwind_info {
 	Dwfl			*dwfl;
-- 
cgit v1.2.3


From 6cc3e0f659b890cfb4a8753eb0e31c871cc7555b Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Fri, 16 Jan 2026 21:28:31 -0800
Subject: perf libdw_addr2line: Fixes to srcline memory allocation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Some irregular stack traces are causing double frees and memory
leaks. Make the code robust by proactively freeing and being more
careful with the memory management of the leaf_srcline.

Fixes: 88c51002d06f9a68 ("perf addr2line: Add a libdw implementation")
Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Aditya Bodkhe <aditya.b1@linux.ibm.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Alexandre Ghiti <alex@ghiti.fr>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Athira Rajeev <atrajeev@linux.ibm.com>
Cc: Chun-Tse Shao <ctshao@google.com>
Cc: Dmitriy Vyukov <dvyukov@google.com>
Cc: Dr. David Alan Gilbert <linux@treblig.org>
Cc: Guo Ren <guoren@kernel.org>
Cc: Haibo Xu <haibo1.xu@intel.com>
Cc: Howard Chu <howardchu95@gmail.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Krzysztof Łopatowski <krzysztof.m.lopatowski@gmail.com>
Cc: Leo Yan <leo.yan@linux.dev>
Cc: Mark Wielaard <mark@klomp.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <pjw@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sergei Trofimovich <slyich@gmail.com>
Cc: Shimin Guo <shimin.guo@skydio.com>
Cc: Stephen Brennan <stephen.s.brennan@oracle.com>
Cc: Thomas Falcon <thomas.falcon@intel.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/libdw.c | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/libdw.c b/tools/perf/util/libdw.c
index e4bfd52bd172..b96c4e0d728f 100644
--- a/tools/perf/util/libdw.c
+++ b/tools/perf/util/libdw.c
@@ -42,16 +42,24 @@ static int libdw_a2l_cb(Dwarf_Die *die, void *_args)
 		call_srcline = srcline_from_fileline(call_fname, die_get_call_lineno(die));
 
 	list_for_each_entry(ilist, &args->node->val, list) {
+		if (args->leaf_srcline == ilist->srcline)
+			args->leaf_srcline_used = false;
+		else if (ilist->srcline != srcline__unknown)
+			free(ilist->srcline);
 		ilist->srcline =  call_srcline;
 		call_srcline = NULL;
 		break;
 	}
-	if (call_srcline && call_fname)
+	if (call_srcline && call_srcline != srcline__unknown)
 		free(call_srcline);
 
 	/* Add this symbol to the chain as the leaf. */
-	inline_list__append_tail(inline_sym, args->leaf_srcline, args->node);
-	args->leaf_srcline_used = true;
+	if (!args->leaf_srcline_used) {
+		inline_list__append_tail(inline_sym, args->leaf_srcline, args->node);
+		args->leaf_srcline_used = true;
+	} else {
+		inline_list__append_tail(inline_sym, strdup(args->leaf_srcline), args->node);
+	}
 	return 0;
 }
 
-- 
cgit v1.2.3


From 8c59835851de28595e5899290f4b7aec656c7f24 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Fri, 16 Jan 2026 21:28:32 -0800
Subject: perf unwind-libdw: Correct argument to dwfl_attach_state()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Argument is a pointer but EM_NONE (0) was being passed. Correct by
passing NULL.

Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Aditya Bodkhe <aditya.b1@linux.ibm.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Alexandre Ghiti <alex@ghiti.fr>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Athira Rajeev <atrajeev@linux.ibm.com>
Cc: Chun-Tse Shao <ctshao@google.com>
Cc: Dmitriy Vyukov <dvyukov@google.com>
Cc: Dr. David Alan Gilbert <linux@treblig.org>
Cc: Guo Ren <guoren@kernel.org>
Cc: Haibo Xu <haibo1.xu@intel.com>
Cc: Howard Chu <howardchu95@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Krzysztof Łopatowski <krzysztof.m.lopatowski@gmail.com>
Cc: Leo Yan <leo.yan@linux.dev>
Cc: Mark Wielaard <mark@klomp.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <pjw@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sergei Trofimovich <slyich@gmail.com>
Cc: Shimin Guo <shimin.guo@skydio.com>
Cc: Stephen Brennan <stephen.s.brennan@oracle.com>
Cc: Thomas Falcon <thomas.falcon@intel.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/unwind-libdw.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/perf/util/unwind-libdw.c b/tools/perf/util/unwind-libdw.c
index b2e194a8be39..dc882f17f52d 100644
--- a/tools/perf/util/unwind-libdw.c
+++ b/tools/perf/util/unwind-libdw.c
@@ -339,7 +339,7 @@ int unwind__get_entries(unwind_entry_cb_t cb, void *arg,
 	if (!callbacks)
 		goto out;
 
-	err = !dwfl_attach_state(ui->dwfl, EM_NONE, thread__tid(thread), callbacks, ui);
+	err = !dwfl_attach_state(ui->dwfl, /*elf=*/NULL, thread__tid(thread), callbacks, ui);
 	if (err)
 		goto out;
 
-- 
cgit v1.2.3


From b7a2b011e9627ff3359306f1eaac718baeadbd83 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Fri, 16 Jan 2026 21:28:33 -0800
Subject: perf powerpc: Unify the skip-callchain-idx libdw with that for
 addr2line
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Rather than have 2 Dwfl unify the Dwfl in skip-callchain-idx with that
is used by libdw__addr2line().

Rename that variable in 'struct dso' from 'a2l_libdw' to just 'libdw' as
it is now used in more than addr2line.

The Dwfl in skip-callchain-idx uses a map address when being read with
dwfl_report_elf (rather than dwfl_report_offline that addr2line
uses).

skip-callchain-idx is wrong as the map address can vary between
processes because of ASLR, ie it should need a different Dwfl per
process.

In the code after this patch the base address becomes 0 and the mapped
PC is used with the dwfl functions.

This should increase the accuracy of skip-callchain-idx, but the impact
has only been build tested.

Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Aditya Bodkhe <aditya.b1@linux.ibm.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Alexandre Ghiti <alex@ghiti.fr>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Athira Rajeev <atrajeev@linux.ibm.com>
Cc: Chun-Tse Shao <ctshao@google.com>
Cc: Dmitriy Vyukov <dvyukov@google.com>
Cc: Dr. David Alan Gilbert <linux@treblig.org>
Cc: Guo Ren <guoren@kernel.org>
Cc: Haibo Xu <haibo1.xu@intel.com>
Cc: Howard Chu <howardchu95@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Krzysztof Łopatowski <krzysztof.m.lopatowski@gmail.com>
Cc: Leo Yan <leo.yan@linux.dev>
Cc: Mark Wielaard <mark@klomp.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <pjw@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sergei Trofimovich <slyich@gmail.com>
Cc: Shimin Guo <shimin.guo@skydio.com>
Cc: Stephen Brennan <stephen.s.brennan@oracle.com>
Cc: Thomas Falcon <thomas.falcon@intel.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/arch/powerpc/util/skip-callchain-idx.c |  52 +++--------
 tools/perf/util/dso.c                             |   2 +-
 tools/perf/util/dso.h                             |  23 +++--
 tools/perf/util/libdw.c                           | 101 ++++++++++++----------
 tools/perf/util/libdw.h                           |  12 ++-
 tools/perf/util/srcline.c                         |   2 +-
 6 files changed, 89 insertions(+), 103 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/arch/powerpc/util/skip-callchain-idx.c b/tools/perf/arch/powerpc/util/skip-callchain-idx.c
index 356786432fd3..e57f10798fa6 100644
--- a/tools/perf/arch/powerpc/util/skip-callchain-idx.c
+++ b/tools/perf/arch/powerpc/util/skip-callchain-idx.c
@@ -30,14 +30,6 @@
  * The libdwfl code in this file is based on code from elfutils
  * (libdwfl/argp-std.c, libdwfl/tests/addrcfi.c, etc).
  */
-static char *debuginfo_path;
-
-static const Dwfl_Callbacks offline_callbacks = {
-	.debuginfo_path = &debuginfo_path,
-	.find_debuginfo = dwfl_standard_find_debuginfo,
-	.section_address = dwfl_offline_section_address,
-};
-
 
 /*
  * Use the DWARF expression for the Call-frame-address and determine
@@ -149,44 +141,22 @@ static Dwarf_Frame *get_dwarf_frame(Dwfl_Module *mod, Dwarf_Addr pc)
  *		yet used)
  *	-1 in case of errors
  */
-static int check_return_addr(struct dso *dso, u64 map_start, Dwarf_Addr pc)
+static int check_return_addr(struct dso *dso, Dwarf_Addr mapped_pc)
 {
 	int		rc = -1;
 	Dwfl		*dwfl;
 	Dwfl_Module	*mod;
 	Dwarf_Frame	*frame;
 	int		ra_regno;
-	Dwarf_Addr	start = pc;
-	Dwarf_Addr	end = pc;
+	Dwarf_Addr	start = mapped_pc;
+	Dwarf_Addr	end = mapped_pc;
 	bool		signalp;
-	const char	*exec_file = dso__long_name(dso);
-
-	dwfl = RC_CHK_ACCESS(dso)->dwfl;
-
-	if (!dwfl) {
-		dwfl = dwfl_begin(&offline_callbacks);
-		if (!dwfl) {
-			pr_debug("dwfl_begin() failed: %s\n", dwarf_errmsg(-1));
-			return -1;
-		}
-
-		mod = dwfl_report_elf(dwfl, exec_file, exec_file, -1,
-						map_start, false);
-		if (!mod) {
-			pr_debug("dwfl_report_elf() failed %s\n",
-						dwarf_errmsg(-1));
-			/*
-			 * We normally cache the DWARF debug info and never
-			 * call dwfl_end(). But to prevent fd leak, free in
-			 * case of error.
-			 */
-			dwfl_end(dwfl);
-			goto out;
-		}
-		RC_CHK_ACCESS(dso)->dwfl = dwfl;
-	}
 
-	mod = dwfl_addrmodule(dwfl, pc);
+	dwfl = dso__libdw_dwfl(dso);
+	if (!dwfl)
+		return -1;
+
+	mod = dwfl_addrmodule(dwfl, mapped_pc);
 	if (!mod) {
 		pr_debug("dwfl_addrmodule() failed, %s\n", dwarf_errmsg(-1));
 		goto out;
@@ -196,9 +166,9 @@ static int check_return_addr(struct dso *dso, u64 map_start, Dwarf_Addr pc)
 	 * To work with split debug info files (eg: glibc), check both
 	 * .eh_frame and .debug_frame sections of the ELF header.
 	 */
-	frame = get_eh_frame(mod, pc);
+	frame = get_eh_frame(mod, mapped_pc);
 	if (!frame) {
-		frame = get_dwarf_frame(mod, pc);
+		frame = get_dwarf_frame(mod, mapped_pc);
 		if (!frame)
 			goto out;
 	}
@@ -264,7 +234,7 @@ int arch_skip_callchain_idx(struct thread *thread, struct ip_callchain *chain)
 		return skip_slot;
 	}
 
-	rc = check_return_addr(dso, map__start(al.map), ip);
+	rc = check_return_addr(dso, map__map_ip(al.map, ip));
 
 	pr_debug("[DSO %s, sym %s, ip 0x%" PRIx64 "] rc %d\n",
 		dso__long_name(dso), al.sym->name, ip, rc);
diff --git a/tools/perf/util/dso.c b/tools/perf/util/dso.c
index 143720d1ecb1..dce207c7f862 100644
--- a/tools/perf/util/dso.c
+++ b/tools/perf/util/dso.c
@@ -1612,7 +1612,7 @@ void dso__delete(struct dso *dso)
 	auxtrace_cache__free(RC_CHK_ACCESS(dso)->auxtrace_cache);
 	dso_cache__free(dso);
 	dso__free_a2l(dso);
-	dso__free_a2l_libdw(dso);
+	dso__free_libdw(dso);
 	dso__free_symsrc_filename(dso);
 	nsinfo__zput(RC_CHK_ACCESS(dso)->nsinfo);
 	mutex_destroy(dso__lock(dso));
diff --git a/tools/perf/util/dso.h b/tools/perf/util/dso.h
index 4aee23775054..295388085031 100644
--- a/tools/perf/util/dso.h
+++ b/tools/perf/util/dso.h
@@ -268,11 +268,8 @@ DECLARE_RC_STRUCT(dso) {
 	const char	 *short_name;
 	const char	 *long_name;
 	void		 *a2l;
-	void		 *a2l_libdw;
+	void		 *libdw;
 	char		 *symsrc_filename;
-#if defined(__powerpc__)
-	void		*dwfl;			/* DWARF debug info */
-#endif
 	struct nsinfo	*nsinfo;
 	struct auxtrace_cache *auxtrace_cache;
 	union { /* Tool specific area */
@@ -335,16 +332,26 @@ static inline void dso__set_a2l(struct dso *dso, void *val)
 	RC_CHK_ACCESS(dso)->a2l = val;
 }
 
-static inline void *dso__a2l_libdw(const struct dso *dso)
+static inline void *dso__libdw(const struct dso *dso)
 {
-	return RC_CHK_ACCESS(dso)->a2l_libdw;
+	return RC_CHK_ACCESS(dso)->libdw;
 }
 
-static inline void dso__set_a2l_libdw(struct dso *dso, void *val)
+static inline void dso__set_libdw(struct dso *dso, void *val)
 {
-	RC_CHK_ACCESS(dso)->a2l_libdw = val;
+	RC_CHK_ACCESS(dso)->libdw = val;
 }
 
+struct Dwfl;
+#ifdef HAVE_LIBDW_SUPPORT
+struct Dwfl *dso__libdw_dwfl(struct dso *dso);
+#else
+static inline struct Dwfl *dso__libdw_dwfl(struct dso *dso __maybe_unused)
+{
+	return NULL;
+}
+#endif
+
 static inline unsigned int dso__a2l_fails(const struct dso *dso)
 {
 	return RC_CHK_ACCESS(dso)->a2l_fails;
diff --git a/tools/perf/util/libdw.c b/tools/perf/util/libdw.c
index b96c4e0d728f..216977884103 100644
--- a/tools/perf/util/libdw.c
+++ b/tools/perf/util/libdw.c
@@ -8,14 +8,62 @@
 #include <unistd.h>
 #include <elfutils/libdwfl.h>
 
-void dso__free_a2l_libdw(struct dso *dso)
+static const Dwfl_Callbacks offline_callbacks = {
+	.find_debuginfo = dwfl_standard_find_debuginfo,
+	.section_address = dwfl_offline_section_address,
+	.find_elf = dwfl_build_id_find_elf,
+};
+
+void dso__free_libdw(struct dso *dso)
 {
-	Dwfl *dwfl = dso__a2l_libdw(dso);
+	Dwfl *dwfl = dso__libdw(dso);
 
 	if (dwfl) {
 		dwfl_end(dwfl);
-		dso__set_a2l_libdw(dso, NULL);
+		dso__set_libdw(dso, NULL);
+	}
+}
+
+struct Dwfl *dso__libdw_dwfl(struct dso *dso)
+{
+	Dwfl *dwfl = dso__libdw(dso);
+	const char *dso_name;
+	Dwfl_Module *mod;
+	int fd;
+
+	if (dwfl)
+		return dwfl;
+
+	dso_name = dso__long_name(dso);
+	/*
+	 * Initialize Dwfl session.
+	 * We need to open the DSO file to report it to libdw.
+	 */
+	fd = open(dso_name, O_RDONLY);
+	if (fd < 0)
+		return NULL;
+
+	dwfl = dwfl_begin(&offline_callbacks);
+	if (!dwfl) {
+		close(fd);
+		return NULL;
+	}
+
+	/*
+	 * If the report is successful, the file descriptor fd is consumed
+	 * and closed by the Dwfl. If not, it is not closed.
+	 */
+	mod = dwfl_report_offline(dwfl, dso_name, dso_name, fd);
+	if (!mod) {
+		dwfl_end(dwfl);
+		close(fd);
+		return NULL;
 	}
+
+	dwfl_report_end(dwfl, /*removed=*/NULL, /*arg=*/NULL);
+	dso__set_libdw(dso, dwfl);
+
+	return dwfl;
 }
 
 struct libdw_a2l_cb_args {
@@ -63,58 +111,21 @@ static int libdw_a2l_cb(Dwarf_Die *die, void *_args)
 	return 0;
 }
 
-int libdw__addr2line(const char *dso_name, u64 addr,
-		     char **file, unsigned int *line_nr,
+int libdw__addr2line(u64 addr, char **file, unsigned int *line_nr,
 		     struct dso *dso, bool unwind_inlines,
 		     struct inline_node *node, struct symbol *sym)
 {
-	static const Dwfl_Callbacks offline_callbacks = {
-		.find_debuginfo = dwfl_standard_find_debuginfo,
-		.section_address = dwfl_offline_section_address,
-		.find_elf = dwfl_build_id_find_elf,
-	};
-	Dwfl *dwfl = dso__a2l_libdw(dso);
+	Dwfl *dwfl = dso__libdw_dwfl(dso);
 	Dwfl_Module *mod;
 	Dwfl_Line *dwline;
 	Dwarf_Addr bias;
 	const char *src;
 	int lineno = 0;
 
-	if (!dwfl) {
-		/*
-		 * Initialize Dwfl session.
-		 * We need to open the DSO file to report it to libdw.
-		 */
-		int fd;
-
-		fd = open(dso_name, O_RDONLY);
-		if (fd < 0)
-			return 0;
-
-		dwfl = dwfl_begin(&offline_callbacks);
-		if (!dwfl) {
-			close(fd);
-			return 0;
-		}
-
-		/*
-		 * If the report is successful, the file descriptor fd is consumed
-		 * and closed by the Dwfl. If not, it is not closed.
-		 */
-		mod = dwfl_report_offline(dwfl, dso_name, dso_name, fd);
-		if (!mod) {
-			dwfl_end(dwfl);
-			close(fd);
-			return 0;
-		}
-
-		dwfl_report_end(dwfl, /*removed=*/NULL, /*arg=*/NULL);
-		dso__set_a2l_libdw(dso, dwfl);
-	} else {
-		/* Dwfl session already initialized, get module for address. */
-		mod = dwfl_addrmodule(dwfl, addr);
-	}
+	if (!dwfl)
+		return 0;
 
+	mod = dwfl_addrmodule(dwfl, addr);
 	if (!mod)
 		return 0;
 
diff --git a/tools/perf/util/libdw.h b/tools/perf/util/libdw.h
index 0f8d7b4a11a5..b12094737415 100644
--- a/tools/perf/util/libdw.h
+++ b/tools/perf/util/libdw.h
@@ -11,7 +11,6 @@ struct symbol;
 #ifdef HAVE_LIBDW_SUPPORT
 /*
  * libdw__addr2line - Convert address to source location using libdw
- * @dso_name: Name of the DSO
  * @addr: Address to resolve
  * @file: Pointer to return filename (caller must free)
  * @line_nr: Pointer to return line number
@@ -26,23 +25,22 @@ struct symbol;
  *
  * Returns 1 on success (found), 0 on failure (not found).
  */
-int libdw__addr2line(const char *dso_name, u64 addr, char **file,
+int libdw__addr2line(u64 addr, char **file,
 		     unsigned int *line_nr, struct dso *dso,
 		     bool unwind_inlines, struct inline_node *node,
 		     struct symbol *sym);
 
 /*
- * dso__free_a2l_libdw - Free libdw resources associated with the DSO
+ * dso__free_libdw - Free libdw resources associated with the DSO
  * @dso: The dso to free resources for
  *
  * This function cleans up the Dwfl context used for addr2line lookups.
  */
-void dso__free_a2l_libdw(struct dso *dso);
+void dso__free_libdw(struct dso *dso);
 
 #else /* HAVE_LIBDW_SUPPORT */
 
-static inline int libdw__addr2line(const char *dso_name __maybe_unused,
-				   u64 addr __maybe_unused, char **file __maybe_unused,
+static inline int libdw__addr2line(u64 addr __maybe_unused, char **file __maybe_unused,
 				   unsigned int *line_nr __maybe_unused,
 				   struct dso *dso __maybe_unused,
 				   bool unwind_inlines __maybe_unused,
@@ -52,7 +50,7 @@ static inline int libdw__addr2line(const char *dso_name __maybe_unused,
 	return 0;
 }
 
-static inline void dso__free_a2l_libdw(struct dso *dso __maybe_unused)
+static inline void dso__free_libdw(struct dso *dso __maybe_unused)
 {
 }
 #endif /* HAVE_LIBDW_SUPPORT */
diff --git a/tools/perf/util/srcline.c b/tools/perf/util/srcline.c
index 28fa1abd1fd3..9be42f398440 100644
--- a/tools/perf/util/srcline.c
+++ b/tools/perf/util/srcline.c
@@ -161,7 +161,7 @@ static int addr2line(const char *dso_name, u64 addr, char **file, unsigned int *
 	for (size_t i = 0; i < ARRAY_SIZE(symbol_conf.addr2line_style); i++) {
 		switch (symbol_conf.addr2line_style[i]) {
 		case A2L_STYLE_LIBDW:
-			ret = libdw__addr2line(dso_name, addr, file, line_nr, dso, unwind_inlines,
+			ret = libdw__addr2line(addr, file, line_nr, dso, unwind_inlines,
 					       node, sym);
 			break;
 		case A2L_STYLE_LLVM:
-- 
cgit v1.2.3


From db0c35ca36526f3072affcb573631ccf8c85f827 Mon Sep 17 00:00:00 2001
From: Ryota Sakamoto <sakamo.ryota@gmail.com>
Date: Sat, 17 Jan 2026 02:46:34 +0900
Subject: kunit: add bash completion

Currently, kunit.py has many subcommands and options, making it difficult
to remember them without checking the help message.

Add --list-cmds and --list-opts to kunit.py to get available commands and
options, use those outputs in kunit-completion.sh to show completion.

This implementation is similar to perf and tools/perf/perf-completion.sh.

Example output:
  $ source tools/testing/kunit/kunit-completion.sh
  $ ./tools/testing/kunit/kunit.py [TAB][TAB]
  build   config  exec    parse   run
  $ ./tools/testing/kunit/kunit.py run --k[TAB][TAB]
  --kconfig_add  --kernel_args  --kunitconfig

Link: https://lore.kernel.org/r/20260117-kunit-completion-v2-1-cabd127d0801@gmail.com
Reviewed-by: David Gow <davidgow@google.com>
Signed-off-by: Ryota Sakamoto <sakamo.ryota@gmail.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 Documentation/dev-tools/kunit/run_wrapper.rst |  9 +++++++
 tools/testing/kunit/kunit-completion.sh       | 34 +++++++++++++++++++++++++++
 tools/testing/kunit/kunit.py                  | 30 +++++++++++++++++++++++
 tools/testing/kunit/kunit_tool_test.py        | 21 +++++++++++++++++
 4 files changed, 94 insertions(+)
 create mode 100644 tools/testing/kunit/kunit-completion.sh

(limited to 'tools')

diff --git a/Documentation/dev-tools/kunit/run_wrapper.rst b/Documentation/dev-tools/kunit/run_wrapper.rst
index 6697c71ee8ca..3c0b585dcfff 100644
--- a/Documentation/dev-tools/kunit/run_wrapper.rst
+++ b/Documentation/dev-tools/kunit/run_wrapper.rst
@@ -335,3 +335,12 @@ command line arguments:
 
 - ``--list_tests_attr``: If set, lists all tests that will be run and all of their
   attributes.
+
+Command-line completion
+==============================
+
+The kunit_tool comes with a bash completion script:
+
+.. code-block:: bash
+
+	source tools/testing/kunit/kunit-completion.sh
diff --git a/tools/testing/kunit/kunit-completion.sh b/tools/testing/kunit/kunit-completion.sh
new file mode 100644
index 000000000000..f053e7b5d265
--- /dev/null
+++ b/tools/testing/kunit/kunit-completion.sh
@@ -0,0 +1,34 @@
+# SPDX-License-Identifier: GPL-2.0
+# bash completion support for KUnit
+
+_kunit_dir=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+
+_kunit()
+{
+	local cur prev words cword
+	_init_completion || return
+
+	local script="${_kunit_dir}/kunit.py"
+
+	if [[ $cword -eq 1 && "$cur" != -* ]]; then
+		local cmds=$(${script} --list-cmds 2>/dev/null)
+		COMPREPLY=($(compgen -W "${cmds}" -- "$cur"))
+		return 0
+	fi
+
+	if [[ "$cur" == -* ]]; then
+		if [[ -n "${words[1]}" && "${words[1]}" != -* ]]; then
+			local opts=$(${script} ${words[1]} --list-opts 2>/dev/null)
+			COMPREPLY=($(compgen -W "${opts}" -- "$cur"))
+			return 0
+		else
+			local opts=$(${script} --list-opts 2>/dev/null)
+			COMPREPLY=($(compgen -W "${opts}" -- "$cur"))
+			return 0
+		fi
+	fi
+}
+
+complete -o default -F _kunit kunit.py
+complete -o default -F _kunit kunit
+complete -o default -F _kunit ./tools/testing/kunit/kunit.py
diff --git a/tools/testing/kunit/kunit.py b/tools/testing/kunit/kunit.py
index e3d82a038f93..4ec5ecba6d49 100755
--- a/tools/testing/kunit/kunit.py
+++ b/tools/testing/kunit/kunit.py
@@ -328,6 +328,17 @@ def get_default_build_dir() -> str:
 		return os.path.join(os.environ['KBUILD_OUTPUT'], '.kunit')
 	return '.kunit'
 
+def add_completion_opts(parser: argparse.ArgumentParser) -> None:
+	parser.add_argument('--list-opts',
+			    help=argparse.SUPPRESS,
+			    action='store_true')
+
+def add_root_opts(parser: argparse.ArgumentParser) -> None:
+	parser.add_argument('--list-cmds',
+			    help=argparse.SUPPRESS,
+			    action='store_true')
+	add_completion_opts(parser)
+
 def add_common_opts(parser: argparse.ArgumentParser) -> None:
 	parser.add_argument('--build_dir',
 			    help='As in the make command, it specifies the build '
@@ -379,6 +390,8 @@ def add_common_opts(parser: argparse.ArgumentParser) -> None:
 			    help='Additional QEMU arguments, e.g. "-smp 8"',
 			    action='append', metavar='')
 
+	add_completion_opts(parser)
+
 def add_build_opts(parser: argparse.ArgumentParser) -> None:
 	parser.add_argument('--jobs',
 			    help='As in the make command, "Specifies  the number of '
@@ -574,6 +587,7 @@ subcommand_handlers_map = {
 def main(argv: Sequence[str]) -> None:
 	parser = argparse.ArgumentParser(
 			description='Helps writing and running KUnit tests.')
+	add_root_opts(parser)
 	subparser = parser.add_subparsers(dest='subcommand')
 
 	# The 'run' command will config, build, exec, and parse in one go.
@@ -608,12 +622,28 @@ def main(argv: Sequence[str]) -> None:
 	parse_parser.add_argument('file',
 				  help='Specifies the file to read results from.',
 				  type=str, nargs='?', metavar='input_file')
+	add_completion_opts(parse_parser)
 
 	cli_args = parser.parse_args(massage_argv(argv))
 
 	if get_kernel_root_path():
 		os.chdir(get_kernel_root_path())
 
+	if cli_args.list_cmds:
+		print(" ".join(subparser.choices.keys()))
+		return
+
+	if cli_args.list_opts:
+		target_parser = subparser.choices.get(cli_args.subcommand)
+		if not target_parser:
+			target_parser = parser
+
+		# Accessing private attribute _option_string_actions to get
+		# the list of options. This is not a public API, but argparse
+		# does not provide a way to inspect options programmatically.
+		print(' '.join(target_parser._option_string_actions.keys()))
+		return
+
 	subcomand_handler = subcommand_handlers_map.get(cli_args.subcommand, None)
 
 	if subcomand_handler is None:
diff --git a/tools/testing/kunit/kunit_tool_test.py b/tools/testing/kunit/kunit_tool_test.py
index 238a31a5cc29..b67408147c1f 100755
--- a/tools/testing/kunit/kunit_tool_test.py
+++ b/tools/testing/kunit/kunit_tool_test.py
@@ -11,11 +11,13 @@ from unittest import mock
 
 import tempfile, shutil # Handling test_tmpdir
 
+import io
 import itertools
 import json
 import os
 import signal
 import subprocess
+import sys
 from typing import Iterable
 
 import kunit_config
@@ -886,5 +888,24 @@ class KUnitMainTest(unittest.TestCase):
 			mock.call(args=None, build_dir='.kunit', filter_glob='suite2.test1', filter='', filter_action=None, timeout=300),
 		])
 
+	@mock.patch.object(sys, 'stdout', new_callable=io.StringIO)
+	def test_list_cmds(self, mock_stdout):
+		kunit.main(['--list-cmds'])
+		output = mock_stdout.getvalue()
+		output_cmds = sorted(output.split())
+		expected_cmds = sorted(['build', 'config', 'exec', 'parse', 'run'])
+		self.assertEqual(output_cmds, expected_cmds)
+
+	@mock.patch.object(sys, 'stdout', new_callable=io.StringIO)
+	def test_run_list_opts(self, mock_stdout):
+		kunit.main(['run', '--list-opts'])
+		output = mock_stdout.getvalue()
+		output_cmds = set(output.split())
+		self.assertIn('--help', output_cmds)
+		self.assertIn('--kunitconfig', output_cmds)
+		self.assertIn('--jobs', output_cmds)
+		self.assertIn('--kernel_args', output_cmds)
+		self.assertIn('--raw_output', output_cmds)
+
 if __name__ == '__main__':
 	unittest.main()
-- 
cgit v1.2.3


From a457ef08a72cb408318ddb851865c5981b842c63 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Fri, 16 Jan 2026 21:28:34 -0800
Subject: perf perf_regs: Switch from arch string to int e_machine
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The arch string requires multiple strcmp to identify things like the
IP and SP.

Switch to passing in an e_machine that in the bulk of cases is computed
using a current thread load.

The e_machine also allows identification of 32-bit vs 64-bit processes.

Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Aditya Bodkhe <aditya.b1@linux.ibm.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Alexandre Ghiti <alex@ghiti.fr>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Athira Rajeev <atrajeev@linux.ibm.com>
Cc: Chun-Tse Shao <ctshao@google.com>
Cc: Dmitriy Vyukov <dvyukov@google.com>
Cc: Dr. David Alan Gilbert <linux@treblig.org>
Cc: Guo Ren <guoren@kernel.org>
Cc: Haibo Xu <haibo1.xu@intel.com>
Cc: Howard Chu <howardchu95@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Krzysztof Łopatowski <krzysztof.m.lopatowski@gmail.com>
Cc: Leo Yan <leo.yan@linux.dev>
Cc: Mark Wielaard <mark@klomp.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <pjw@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sergei Trofimovich <slyich@gmail.com>
Cc: Shimin Guo <shimin.guo@skydio.com>
Cc: Stephen Brennan <stephen.s.brennan@oracle.com>
Cc: Thomas Falcon <thomas.falcon@intel.com>
Cc: Will Deacon <will@kernel.org>
[ Include dwarf-regs.h to get conditional defines for EM_CSKY and EM_LOONGARCH, not available in old distros ]
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-script.c                        |  17 ++--
 tools/perf/util/evsel.c                            |  14 ++-
 tools/perf/util/perf_regs.c                        | 106 +++++++++++++--------
 tools/perf/util/perf_regs.h                        |  10 +-
 .../util/scripting-engines/trace-event-python.c    |  21 ++--
 tools/perf/util/session.c                          |  65 +++++++++----
 tools/perf/util/session.h                          |   1 +
 tools/perf/util/unwind-libdw.c                     |  12 ++-
 tools/perf/util/unwind-libunwind-local.c           |   7 +-
 9 files changed, 165 insertions(+), 88 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index 62e43d3c5ad7..372bede30230 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -717,7 +717,7 @@ out:
 	return 0;
 }
 
-static int perf_sample__fprintf_regs(struct regs_dump *regs, uint64_t mask, const char *arch,
+static int perf_sample__fprintf_regs(struct regs_dump *regs, uint64_t mask, uint16_t e_machine,
 				     FILE *fp)
 {
 	unsigned i = 0, r;
@@ -730,7 +730,7 @@ static int perf_sample__fprintf_regs(struct regs_dump *regs, uint64_t mask, cons
 
 	for_each_set_bit(r, (unsigned long *) &mask, sizeof(mask) * 8) {
 		u64 val = regs->regs[i++];
-		printed += fprintf(fp, "%5s:0x%"PRIx64" ", perf_reg_name(r, arch), val);
+		printed += fprintf(fp, "%5s:0x%"PRIx64" ", perf_reg_name(r, e_machine), val);
 	}
 
 	return printed;
@@ -787,23 +787,23 @@ tod_scnprintf(struct perf_script *script, char *buf, int buflen,
 }
 
 static int perf_sample__fprintf_iregs(struct perf_sample *sample,
-				      struct perf_event_attr *attr, const char *arch, FILE *fp)
+				      struct perf_event_attr *attr, uint16_t e_machine, FILE *fp)
 {
 	if (!sample->intr_regs)
 		return 0;
 
 	return perf_sample__fprintf_regs(perf_sample__intr_regs(sample),
-					 attr->sample_regs_intr, arch, fp);
+					 attr->sample_regs_intr, e_machine, fp);
 }
 
 static int perf_sample__fprintf_uregs(struct perf_sample *sample,
-				      struct perf_event_attr *attr, const char *arch, FILE *fp)
+				      struct perf_event_attr *attr, uint16_t e_machine, FILE *fp)
 {
 	if (!sample->user_regs)
 		return 0;
 
 	return perf_sample__fprintf_regs(perf_sample__user_regs(sample),
-					 attr->sample_regs_user, arch, fp);
+					 attr->sample_regs_user, e_machine, fp);
 }
 
 static int perf_sample__fprintf_start(struct perf_script *script,
@@ -2418,7 +2418,6 @@ static void process_event(struct perf_script *script,
 	struct evsel_script *es = evsel->priv;
 	FILE *fp = es->fp;
 	char str[PAGE_SIZE_NAME_LEN];
-	const char *arch = perf_env__arch(machine->env);
 
 	if (output[type].fields == 0)
 		return;
@@ -2506,10 +2505,10 @@ static void process_event(struct perf_script *script,
 	}
 
 	if (PRINT_FIELD(IREGS))
-		perf_sample__fprintf_iregs(sample, attr, arch, fp);
+		perf_sample__fprintf_iregs(sample, attr, thread__e_machine(thread, machine), fp);
 
 	if (PRINT_FIELD(UREGS))
-		perf_sample__fprintf_uregs(sample, attr, arch, fp);
+		perf_sample__fprintf_uregs(sample, attr, thread__e_machine(thread, machine), fp);
 
 	if (PRINT_FIELD(BRSTACK))
 		perf_sample__fprintf_brstack(sample, thread, evsel, fp);
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index 6d324141588c..5ac1a05601b1 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -34,6 +34,7 @@
 #include "callchain.h"
 #include "cgroup.h"
 #include "counts.h"
+#include "dwarf-regs.h"
 #include "event.h"
 #include "evsel.h"
 #include "time-utils.h"
@@ -1007,6 +1008,13 @@ int evsel__group_desc(struct evsel *evsel, char *buf, size_t size)
 	return ret;
 }
 
+static uint16_t evsel__e_machine(struct evsel *evsel)
+{
+	struct perf_session *session = evsel__session(evsel);
+
+	return session ? perf_session__e_machine(session) : EM_HOST;
+}
+
 static void __evsel__config_callchain(struct evsel *evsel, struct record_opts *opts,
 				      struct callchain_param *param)
 {
@@ -1042,13 +1050,13 @@ static void __evsel__config_callchain(struct evsel *evsel, struct record_opts *o
 
 	if (param->record_mode == CALLCHAIN_DWARF) {
 		if (!function) {
-			const char *arch = perf_env__arch(evsel__env(evsel));
+			uint16_t e_machine = evsel__e_machine(evsel);
 
 			evsel__set_sample_bit(evsel, REGS_USER);
 			evsel__set_sample_bit(evsel, STACK_USER);
 			if (opts->sample_user_regs &&
-			    DWARF_MINIMAL_REGS(arch) != arch__user_reg_mask()) {
-				attr->sample_regs_user |= DWARF_MINIMAL_REGS(arch);
+			    DWARF_MINIMAL_REGS(e_machine) != arch__user_reg_mask()) {
+				attr->sample_regs_user |= DWARF_MINIMAL_REGS(e_machine);
 				pr_warning("WARNING: The use of --call-graph=dwarf may require all the user registers, "
 					   "specifying a subset with --user-regs may render DWARF unwinding unreliable, "
 					   "so the minimal registers set (IP, SP) is explicitly forced.\n");
diff --git a/tools/perf/util/perf_regs.c b/tools/perf/util/perf_regs.c
index 44b90bbf2d07..f9723091e673 100644
--- a/tools/perf/util/perf_regs.c
+++ b/tools/perf/util/perf_regs.c
@@ -1,6 +1,8 @@
 // SPDX-License-Identifier: GPL-2.0
+#include <elf.h>
 #include <errno.h>
 #include <string.h>
+#include "dwarf-regs.h"
 #include "perf_regs.h"
 #include "util/sample.h"
 #include "debug.h"
@@ -30,30 +32,48 @@ const struct sample_reg * __weak arch__sample_reg_masks(void)
 	return sample_reg_masks;
 }
 
-const char *perf_reg_name(int id, const char *arch)
+const char *perf_reg_name(int id, uint16_t e_machine)
 {
 	const char *reg_name = NULL;
 
-	if (!strcmp(arch, "csky"))
+	switch (e_machine) {
+	case EM_ARM:
+		reg_name = __perf_reg_name_arm(id);
+		break;
+	case EM_AARCH64:
+		reg_name = __perf_reg_name_arm64(id);
+		break;
+	case EM_CSKY:
 		reg_name = __perf_reg_name_csky(id);
-	else if (!strcmp(arch, "loongarch"))
+		break;
+	case EM_LOONGARCH:
 		reg_name = __perf_reg_name_loongarch(id);
-	else if (!strcmp(arch, "mips"))
+		break;
+	case EM_MIPS:
 		reg_name = __perf_reg_name_mips(id);
-	else if (!strcmp(arch, "powerpc"))
+		break;
+	case EM_PPC:
+	case EM_PPC64:
 		reg_name = __perf_reg_name_powerpc(id);
-	else if (!strcmp(arch, "riscv"))
+		break;
+	case EM_RISCV:
 		reg_name = __perf_reg_name_riscv(id);
-	else if (!strcmp(arch, "s390"))
+		break;
+	case EM_S390:
 		reg_name = __perf_reg_name_s390(id);
-	else if (!strcmp(arch, "x86"))
+		break;
+	case EM_386:
+	case EM_X86_64:
 		reg_name = __perf_reg_name_x86(id);
-	else if (!strcmp(arch, "arm"))
-		reg_name = __perf_reg_name_arm(id);
-	else if (!strcmp(arch, "arm64"))
-		reg_name = __perf_reg_name_arm64(id);
+		break;
+	default:
+		break;
+	}
+	if (reg_name)
+		return reg_name;
 
-	return reg_name ?: "unknown";
+	pr_debug("Failed to find register %d for ELF machine type %u\n", id, e_machine);
+	return "unknown";
 }
 
 int perf_reg_value(u64 *valp, struct regs_dump *regs, int id)
@@ -83,52 +103,60 @@ out:
 	return 0;
 }
 
-uint64_t perf_arch_reg_ip(const char *arch)
+uint64_t perf_arch_reg_ip(uint16_t e_machine)
 {
-	if (!strcmp(arch, "arm"))
+	switch (e_machine) {
+	case EM_ARM:
 		return __perf_reg_ip_arm();
-	else if (!strcmp(arch, "arm64"))
+	case EM_AARCH64:
 		return __perf_reg_ip_arm64();
-	else if (!strcmp(arch, "csky"))
+	case EM_CSKY:
 		return __perf_reg_ip_csky();
-	else if (!strcmp(arch, "loongarch"))
+	case EM_LOONGARCH:
 		return __perf_reg_ip_loongarch();
-	else if (!strcmp(arch, "mips"))
+	case EM_MIPS:
 		return __perf_reg_ip_mips();
-	else if (!strcmp(arch, "powerpc"))
+	case EM_PPC:
+	case EM_PPC64:
 		return __perf_reg_ip_powerpc();
-	else if (!strcmp(arch, "riscv"))
+	case EM_RISCV:
 		return __perf_reg_ip_riscv();
-	else if (!strcmp(arch, "s390"))
+	case EM_S390:
 		return __perf_reg_ip_s390();
-	else if (!strcmp(arch, "x86"))
+	case EM_386:
+	case EM_X86_64:
 		return __perf_reg_ip_x86();
-
-	pr_err("Fail to find IP register for arch %s, returns 0\n", arch);
-	return 0;
+	default:
+		pr_err("Failed to find IP register for ELF machine type %u\n", e_machine);
+		return 0;
+	}
 }
 
-uint64_t perf_arch_reg_sp(const char *arch)
+uint64_t perf_arch_reg_sp(uint16_t e_machine)
 {
-	if (!strcmp(arch, "arm"))
+	switch (e_machine) {
+	case EM_ARM:
 		return __perf_reg_sp_arm();
-	else if (!strcmp(arch, "arm64"))
+	case EM_AARCH64:
 		return __perf_reg_sp_arm64();
-	else if (!strcmp(arch, "csky"))
+	case EM_CSKY:
 		return __perf_reg_sp_csky();
-	else if (!strcmp(arch, "loongarch"))
+	case EM_LOONGARCH:
 		return __perf_reg_sp_loongarch();
-	else if (!strcmp(arch, "mips"))
+	case EM_MIPS:
 		return __perf_reg_sp_mips();
-	else if (!strcmp(arch, "powerpc"))
+	case EM_PPC:
+	case EM_PPC64:
 		return __perf_reg_sp_powerpc();
-	else if (!strcmp(arch, "riscv"))
+	case EM_RISCV:
 		return __perf_reg_sp_riscv();
-	else if (!strcmp(arch, "s390"))
+	case EM_S390:
 		return __perf_reg_sp_s390();
-	else if (!strcmp(arch, "x86"))
+	case EM_386:
+	case EM_X86_64:
 		return __perf_reg_sp_x86();
-
-	pr_err("Fail to find SP register for arch %s, returns 0\n", arch);
-	return 0;
+	default:
+		pr_err("Failed to find SP register for ELF machine type %u\n", e_machine);
+		return 0;
+	}
 }
diff --git a/tools/perf/util/perf_regs.h b/tools/perf/util/perf_regs.h
index f2d0736d65cc..7bfc6a34c02b 100644
--- a/tools/perf/util/perf_regs.h
+++ b/tools/perf/util/perf_regs.h
@@ -28,10 +28,10 @@ uint64_t arch__intr_reg_mask(void);
 uint64_t arch__user_reg_mask(void);
 const struct sample_reg *arch__sample_reg_masks(void);
 
-const char *perf_reg_name(int id, const char *arch);
+const char *perf_reg_name(int id, uint16_t e_machine);
 int perf_reg_value(u64 *valp, struct regs_dump *regs, int id);
-uint64_t perf_arch_reg_ip(const char *arch);
-uint64_t perf_arch_reg_sp(const char *arch);
+uint64_t perf_arch_reg_ip(uint16_t e_machine);
+uint64_t perf_arch_reg_sp(uint16_t e_machine);
 const char *__perf_reg_name_arm64(int id);
 uint64_t __perf_reg_ip_arm64(void);
 uint64_t __perf_reg_sp_arm64(void);
@@ -60,9 +60,9 @@ const char *__perf_reg_name_x86(int id);
 uint64_t __perf_reg_ip_x86(void);
 uint64_t __perf_reg_sp_x86(void);
 
-static inline uint64_t DWARF_MINIMAL_REGS(const char *arch)
+static inline uint64_t DWARF_MINIMAL_REGS(uint16_t e_machine)
 {
-	return (1ULL << perf_arch_reg_ip(arch)) | (1ULL << perf_arch_reg_sp(arch));
+	return (1ULL << perf_arch_reg_ip(e_machine)) | (1ULL << perf_arch_reg_sp(e_machine));
 }
 
 #endif /* __PERF_REGS_H */
diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c
index 6655c0bbe0d8..b90edc147796 100644
--- a/tools/perf/util/scripting-engines/trace-event-python.c
+++ b/tools/perf/util/scripting-engines/trace-event-python.c
@@ -50,6 +50,7 @@
 #include "../thread-stack.h"
 #include "../trace-event.h"
 #include "../call-path.h"
+#include "dwarf-regs.h"
 #include "map.h"
 #include "symbol.h"
 #include "thread_map.h"
@@ -713,7 +714,7 @@ static void set_sample_datasrc_in_dict(PyObject *dict,
 			_PyUnicode_FromString(decode));
 }
 
-static void regs_map(struct regs_dump *regs, uint64_t mask, const char *arch, char *bf, int size)
+static void regs_map(struct regs_dump *regs, uint64_t mask, uint16_t e_machine, char *bf, int size)
 {
 	unsigned int i = 0, r;
 	int printed = 0;
@@ -731,7 +732,7 @@ static void regs_map(struct regs_dump *regs, uint64_t mask, const char *arch, ch
 
 		printed += scnprintf(bf + printed, size - printed,
 				     "%5s:0x%" PRIx64 " ",
-				     perf_reg_name(r, arch), val);
+				     perf_reg_name(r, e_machine), val);
 	}
 }
 
@@ -739,10 +740,10 @@ static void regs_map(struct regs_dump *regs, uint64_t mask, const char *arch, ch
 
 static int set_regs_in_dict(PyObject *dict,
 			     struct perf_sample *sample,
-			     struct evsel *evsel)
+			     struct evsel *evsel,
+			     uint16_t e_machine)
 {
 	struct perf_event_attr *attr = &evsel->core.attr;
-	const char *arch = perf_env__arch(evsel__env(evsel));
 
 	int size = (__sw_hweight64(attr->sample_regs_intr) * MAX_REG_SIZE) + 1;
 	char *bf = NULL;
@@ -752,7 +753,7 @@ static int set_regs_in_dict(PyObject *dict,
 		if (!bf)
 			return -1;
 
-		regs_map(sample->intr_regs, attr->sample_regs_intr, arch, bf, size);
+		regs_map(sample->intr_regs, attr->sample_regs_intr, e_machine, bf, size);
 
 		pydict_set_item_string_decref(dict, "iregs",
 					_PyUnicode_FromString(bf));
@@ -764,7 +765,7 @@ static int set_regs_in_dict(PyObject *dict,
 			if (!bf)
 				return -1;
 		}
-		regs_map(sample->user_regs, attr->sample_regs_user, arch, bf, size);
+		regs_map(sample->user_regs, attr->sample_regs_user, e_machine, bf, size);
 
 		pydict_set_item_string_decref(dict, "uregs",
 					_PyUnicode_FromString(bf));
@@ -834,6 +835,8 @@ static PyObject *get_perf_sample_dict(struct perf_sample *sample,
 					 PyObject *callchain)
 {
 	PyObject *dict, *dict_sample, *brstack, *brstacksym;
+	struct machine *machine;
+	uint16_t e_machine = EM_HOST;
 
 	dict = PyDict_New();
 	if (!dict)
@@ -920,7 +923,11 @@ static PyObject *get_perf_sample_dict(struct perf_sample *sample,
 			PyLong_FromUnsignedLongLong(sample->cyc_cnt));
 	}
 
-	if (set_regs_in_dict(dict, sample, evsel))
+	if (al->thread) {
+		machine = maps__machine(thread__maps(al->thread));
+		e_machine = thread__e_machine(al->thread, machine);
+	}
+	if (set_regs_in_dict(dict, sample, evsel, e_machine))
 		Py_FatalError("Failed to setting regs in dict");
 
 	return dict;
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index 922ef6577bbb..d7b28cb4e672 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -17,6 +17,7 @@
 #include "map_symbol.h"
 #include "branch.h"
 #include "debug.h"
+#include "dwarf-regs.h"
 #include "env.h"
 #include "evlist.h"
 #include "evsel.h"
@@ -942,7 +943,7 @@ static void branch_stack__printf(struct perf_sample *sample,
 	}
 }
 
-static void regs_dump__printf(u64 mask, u64 *regs, const char *arch)
+static void regs_dump__printf(u64 mask, u64 *regs, uint16_t e_machine)
 {
 	unsigned rid, i = 0;
 
@@ -950,7 +951,7 @@ static void regs_dump__printf(u64 mask, u64 *regs, const char *arch)
 		u64 val = regs[i++];
 
 		printf(".... %-5s 0x%016" PRIx64 "\n",
-		       perf_reg_name(rid, arch), val);
+		       perf_reg_name(rid, e_machine), val);
 	}
 }
 
@@ -968,7 +969,7 @@ static inline const char *regs_dump_abi(struct regs_dump *d)
 	return regs_abi[d->abi];
 }
 
-static void regs__printf(const char *type, struct regs_dump *regs, const char *arch)
+static void regs__printf(const char *type, struct regs_dump *regs, uint16_t e_machine)
 {
 	u64 mask = regs->mask;
 
@@ -977,10 +978,10 @@ static void regs__printf(const char *type, struct regs_dump *regs, const char *a
 	       mask,
 	       regs_dump_abi(regs));
 
-	regs_dump__printf(mask, regs->regs, arch);
+	regs_dump__printf(mask, regs->regs, e_machine);
 }
 
-static void regs_user__printf(struct perf_sample *sample, const char *arch)
+static void regs_user__printf(struct perf_sample *sample, uint16_t e_machine)
 {
 	struct regs_dump *user_regs;
 
@@ -990,10 +991,10 @@ static void regs_user__printf(struct perf_sample *sample, const char *arch)
 	user_regs = perf_sample__user_regs(sample);
 
 	if (user_regs->regs)
-		regs__printf("user", user_regs, arch);
+		regs__printf("user", user_regs, e_machine);
 }
 
-static void regs_intr__printf(struct perf_sample *sample, const char *arch)
+static void regs_intr__printf(struct perf_sample *sample, uint16_t e_machine)
 {
 	struct regs_dump *intr_regs;
 
@@ -1003,7 +1004,7 @@ static void regs_intr__printf(struct perf_sample *sample, const char *arch)
 	intr_regs = perf_sample__intr_regs(sample);
 
 	if (intr_regs->regs)
-		regs__printf("intr", intr_regs, arch);
+		regs__printf("intr", intr_regs, e_machine);
 }
 
 static void stack_user__printf(struct stack_dump *dump)
@@ -1092,21 +1093,28 @@ char *get_page_size_name(u64 size, char *str)
 	return str;
 }
 
-static void dump_sample(struct evsel *evsel, union perf_event *event,
-			struct perf_sample *sample, const char *arch)
+static void dump_sample(struct machine *machine, struct evsel *evsel, union perf_event *event,
+			struct perf_sample *sample)
 {
 	u64 sample_type;
 	char str[PAGE_SIZE_NAME_LEN];
+	uint16_t e_machine = EM_NONE;
 
 	if (!dump_trace)
 		return;
 
+	sample_type = evsel->core.attr.sample_type;
+
+	if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_REGS_INTR)) {
+		struct thread *thread = machine__find_thread(machine, sample->pid, sample->pid);
+
+		e_machine = thread__e_machine(thread, machine);
+	}
+
 	printf("(IP, 0x%x): %d/%d: %#" PRIx64 " period: %" PRIu64 " addr: %#" PRIx64 "\n",
 	       event->header.misc, sample->pid, sample->tid, sample->ip,
 	       sample->period, sample->addr);
 
-	sample_type = evsel->core.attr.sample_type;
-
 	if (evsel__has_callchain(evsel))
 		callchain__printf(evsel, sample);
 
@@ -1114,10 +1122,10 @@ static void dump_sample(struct evsel *evsel, union perf_event *event,
 		branch_stack__printf(sample, evsel);
 
 	if (sample_type & PERF_SAMPLE_REGS_USER)
-		regs_user__printf(sample, arch);
+		regs_user__printf(sample, e_machine);
 
 	if (sample_type & PERF_SAMPLE_REGS_INTR)
-		regs_intr__printf(sample, arch);
+		regs_intr__printf(sample, e_machine);
 
 	if (sample_type & PERF_SAMPLE_STACK_USER)
 		stack_user__printf(&sample->user_stack);
@@ -1432,10 +1440,10 @@ static int machines__deliver_event(struct machines *machines,
 		}
 		if (machine == NULL) {
 			++evlist->stats.nr_unprocessable_samples;
-			dump_sample(evsel, event, sample, perf_env__arch(NULL));
+			dump_sample(machine, evsel, event, sample);
 			return 0;
 		}
-		dump_sample(evsel, event, sample, perf_env__arch(machine->env));
+		dump_sample(machine, evsel, event, sample);
 		if (sample->deferred_callchain && tool->merge_deferred_callchains) {
 			struct deferred_event *de = malloc(sizeof(*de));
 			size_t sz = event->header.size;
@@ -2928,3 +2936,28 @@ struct perf_env *perf_session__env(struct perf_session *session)
 {
 	return &session->header.env;
 }
+
+static int perf_session__e_machine_cb(struct thread *thread,
+				      void *arg __maybe_unused)
+{
+	uint16_t *result = arg;
+	struct machine *machine = maps__machine(thread__maps(thread));
+
+	*result = thread__e_machine(thread, machine);
+	return *result != EM_NONE ? 1 : 0;
+}
+
+/*
+ * Note, a machine may have mixed 32-bit and 64-bit processes and so mixed
+ * e_machines. Use thread__e_machine when this matters.
+ */
+uint16_t perf_session__e_machine(struct perf_session *session)
+{
+	uint16_t e_machine = EM_NONE;
+
+	machines__for_each_thread(&session->machines,
+					 perf_session__e_machine_cb,
+					 &e_machine);
+
+	return e_machine == EM_NONE ? EM_HOST : e_machine;
+}
diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h
index 22d3ff877e83..eddc4c630b33 100644
--- a/tools/perf/util/session.h
+++ b/tools/perf/util/session.h
@@ -211,5 +211,6 @@ int perf_event__process_finished_round(const struct perf_tool *tool,
 				       struct ordered_events *oe);
 
 struct perf_env *perf_session__env(struct perf_session *session);
+uint16_t perf_session__e_machine(struct perf_session *session);
 
 #endif /* __PERF_SESSION_H */
diff --git a/tools/perf/util/unwind-libdw.c b/tools/perf/util/unwind-libdw.c
index dc882f17f52d..c25190cdceb4 100644
--- a/tools/perf/util/unwind-libdw.c
+++ b/tools/perf/util/unwind-libdw.c
@@ -187,7 +187,7 @@ static bool memory_read(Dwfl *dwfl __maybe_unused, Dwarf_Addr addr, Dwarf_Word *
 			void *arg)
 {
 	struct unwind_info *ui = arg;
-	const char *arch = perf_env__arch(ui->machine->env);
+	uint16_t e_machine = thread__e_machine(ui->thread, ui->machine);
 	struct stack_dump *stack = &ui->sample->user_stack;
 	u64 start, end;
 	int offset;
@@ -197,7 +197,7 @@ static bool memory_read(Dwfl *dwfl __maybe_unused, Dwarf_Addr addr, Dwarf_Word *
 		return false;
 
 	ret = perf_reg_value(&start, ui->sample->user_regs,
-			     perf_arch_reg_sp(arch));
+			     perf_arch_reg_sp(e_machine));
 	if (ret)
 		return false;
 
@@ -300,16 +300,18 @@ int unwind__get_entries(unwind_entry_cb_t cb, void *arg,
 			int max_stack,
 			bool best_effort)
 {
+	struct machine *machine = maps__machine(thread__maps(thread));
 	struct unwind_info *ui, ui_buf = {
 		.sample		= data,
 		.thread		= thread,
-		.machine	= maps__machine((thread__maps(thread))),
+		.machine	= machine,
 		.cb		= cb,
 		.arg		= arg,
 		.max_stack	= max_stack,
 		.best_effort    = best_effort
 	};
-	const char *arch = perf_env__arch(ui_buf.machine->env);
+	uint16_t e_machine = thread__e_machine(thread, machine);
+	const char *arch = perf_env__arch(machine->env);
 	Dwarf_Word ip;
 	int err = -EINVAL, i;
 	const Dwfl_Thread_Callbacks *callbacks;
@@ -327,7 +329,7 @@ int unwind__get_entries(unwind_entry_cb_t cb, void *arg,
 	if (!ui->dwfl)
 		goto out;
 
-	err = perf_reg_value(&ip, data->user_regs, perf_arch_reg_ip(arch));
+	err = perf_reg_value(&ip, data->user_regs, perf_arch_reg_ip(e_machine));
 	if (err)
 		goto out;
 
diff --git a/tools/perf/util/unwind-libunwind-local.c b/tools/perf/util/unwind-libunwind-local.c
index 0b037e7389a0..a24b45106acd 100644
--- a/tools/perf/util/unwind-libunwind-local.c
+++ b/tools/perf/util/unwind-libunwind-local.c
@@ -572,7 +572,6 @@ static int access_mem(unw_addr_space_t __maybe_unused as,
 		      int __write, void *arg)
 {
 	struct unwind_info *ui = arg;
-	const char *arch = perf_env__arch(ui->machine->env);
 	struct stack_dump *stack = &ui->sample->user_stack;
 	u64 start, end;
 	int offset;
@@ -585,7 +584,7 @@ static int access_mem(unw_addr_space_t __maybe_unused as,
 	}
 
 	ret = perf_reg_value(&start, perf_sample__user_regs(ui->sample),
-			     perf_arch_reg_sp(arch));
+			     perf_arch_reg_sp(thread__e_machine(ui->thread, ui->machine)));
 	if (ret)
 		return ret;
 
@@ -734,7 +733,7 @@ static void _unwind__finish_access(struct maps *maps)
 static int get_entries(struct unwind_info *ui, unwind_entry_cb_t cb,
 		       void *arg, int max_stack)
 {
-	const char *arch = perf_env__arch(ui->machine->env);
+	uint16_t e_machine = thread__e_machine(ui->thread, ui->machine);
 	u64 val;
 	unw_word_t ips[max_stack];
 	unw_addr_space_t addr_space;
@@ -742,7 +741,7 @@ static int get_entries(struct unwind_info *ui, unwind_entry_cb_t cb,
 	int ret, i = 0;
 
 	ret = perf_reg_value(&val, perf_sample__user_regs(ui->sample),
-			     perf_arch_reg_ip(arch));
+			     perf_arch_reg_ip(e_machine));
 	if (ret)
 		return ret;
 
-- 
cgit v1.2.3


From 1672f3707a6ef4b386c30bb76df2f62e58a39430 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Fri, 16 Jan 2026 21:28:35 -0800
Subject: perf dwarf-regs: Add util/dwarf-regs-arch for consistency with
 perf-regs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

perf_regs.h has cross architecture functions for operating with the
differing perf register constants. dwarf-regs.h is similar but for
cross architecture dwarf notions of registers.

For consistency move the arch parts of dwarf-regs out of util and into
its own directory.

Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Aditya Bodkhe <aditya.b1@linux.ibm.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Alexandre Ghiti <alex@ghiti.fr>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Athira Rajeev <atrajeev@linux.ibm.com>
Cc: Chun-Tse Shao <ctshao@google.com>
Cc: Dmitriy Vyukov <dvyukov@google.com>
Cc: Dr. David Alan Gilbert <linux@treblig.org>
Cc: Guo Ren <guoren@kernel.org>
Cc: Haibo Xu <haibo1.xu@intel.com>
Cc: Howard Chu <howardchu95@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Krzysztof Łopatowski <krzysztof.m.lopatowski@gmail.com>
Cc: Leo Yan <leo.yan@linux.dev>
Cc: Mark Wielaard <mark@klomp.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <pjw@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sergei Trofimovich <slyich@gmail.com>
Cc: Shimin Guo <shimin.guo@skydio.com>
Cc: Stephen Brennan <stephen.s.brennan@oracle.com>
Cc: Thomas Falcon <thomas.falcon@intel.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/Build                              |  4 +-
 tools/perf/util/dwarf-regs-arch/Build              |  3 ++
 tools/perf/util/dwarf-regs-arch/dwarf-regs-csky.c  | 50 ++++++++++++++++++
 .../perf/util/dwarf-regs-arch/dwarf-regs-powerpc.c | 61 ++++++++++++++++++++++
 tools/perf/util/dwarf-regs-arch/dwarf-regs-x86.c   | 50 ++++++++++++++++++
 tools/perf/util/dwarf-regs-csky.c                  | 50 ------------------
 tools/perf/util/dwarf-regs-powerpc.c               | 61 ----------------------
 tools/perf/util/dwarf-regs-x86.c                   | 50 ------------------
 8 files changed, 165 insertions(+), 164 deletions(-)
 create mode 100644 tools/perf/util/dwarf-regs-arch/Build
 create mode 100644 tools/perf/util/dwarf-regs-arch/dwarf-regs-csky.c
 create mode 100644 tools/perf/util/dwarf-regs-arch/dwarf-regs-powerpc.c
 create mode 100644 tools/perf/util/dwarf-regs-arch/dwarf-regs-x86.c
 delete mode 100644 tools/perf/util/dwarf-regs-csky.c
 delete mode 100644 tools/perf/util/dwarf-regs-powerpc.c
 delete mode 100644 tools/perf/util/dwarf-regs-x86.c

(limited to 'tools')

diff --git a/tools/perf/util/Build b/tools/perf/util/Build
index 5efec73be474..3cb1edd263cf 100644
--- a/tools/perf/util/Build
+++ b/tools/perf/util/Build
@@ -219,9 +219,7 @@ endif
 perf-util-$(CONFIG_LIBDW) += probe-finder.o
 perf-util-$(CONFIG_LIBDW) += dwarf-aux.o
 perf-util-$(CONFIG_LIBDW) += dwarf-regs.o
-perf-util-$(CONFIG_LIBDW) += dwarf-regs-csky.o
-perf-util-$(CONFIG_LIBDW) += dwarf-regs-powerpc.o
-perf-util-$(CONFIG_LIBDW) += dwarf-regs-x86.o
+perf-util-$(CONFIG_LIBDW) += dwarf-regs-arch/
 perf-util-$(CONFIG_LIBDW) += debuginfo.o
 perf-util-$(CONFIG_LIBDW) += annotate-data.o
 perf-util-$(CONFIG_LIBDW) += libdw.o
diff --git a/tools/perf/util/dwarf-regs-arch/Build b/tools/perf/util/dwarf-regs-arch/Build
new file mode 100644
index 000000000000..98bec0032606
--- /dev/null
+++ b/tools/perf/util/dwarf-regs-arch/Build
@@ -0,0 +1,3 @@
+perf-util-$(CONFIG_LIBDW) += dwarf-regs-csky.o
+perf-util-$(CONFIG_LIBDW) += dwarf-regs-powerpc.o
+perf-util-$(CONFIG_LIBDW) += dwarf-regs-x86.o
diff --git a/tools/perf/util/dwarf-regs-arch/dwarf-regs-csky.c b/tools/perf/util/dwarf-regs-arch/dwarf-regs-csky.c
new file mode 100644
index 000000000000..d38ef1f07f3e
--- /dev/null
+++ b/tools/perf/util/dwarf-regs-arch/dwarf-regs-csky.c
@@ -0,0 +1,50 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (C) 2019 Hangzhou C-SKY Microsystems co.,ltd.
+// Mapping of DWARF debug register numbers into register names.
+
+#include <stddef.h>
+#include <dwarf-regs.h>
+
+#define CSKY_ABIV2_MAX_REGS 73
+const char *csky_dwarf_regs_table_abiv2[CSKY_ABIV2_MAX_REGS] = {
+	/* r0 ~ r8 */
+	"%a0", "%a1", "%a2", "%a3", "%regs0", "%regs1", "%regs2", "%regs3",
+	/* r9 ~ r15 */
+	"%regs4", "%regs5", "%regs6", "%regs7", "%regs8", "%regs9", "%sp",
+	"%lr",
+	/* r16 ~ r23 */
+	"%exregs0", "%exregs1", "%exregs2", "%exregs3", "%exregs4",
+	"%exregs5", "%exregs6", "%exregs7",
+	/* r24 ~ r31 */
+	"%exregs8", "%exregs9", "%exregs10", "%exregs11", "%exregs12",
+	"%exregs13", "%exregs14", "%tls",
+	"%pc", NULL, NULL, NULL, "%hi", "%lo", NULL, NULL,
+	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+	"%epc",
+};
+
+#define CSKY_ABIV1_MAX_REGS 57
+const char *csky_dwarf_regs_table_abiv1[CSKY_ABIV1_MAX_REGS] = {
+	/* r0 ~ r8 */
+	"%sp", "%regs9", "%a0", "%a1", "%a2", "%a3", "%regs0", "%regs1",
+	/* r9 ~ r15 */
+	"%regs2", "%regs3", "%regs4", "%regs5", "%regs6", "%regs7", "%regs8",
+	"%lr",
+	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+	"%epc",
+};
+
+const char *get_csky_regstr(unsigned int n, unsigned int flags)
+{
+	if (flags & EF_CSKY_ABIV2)
+		return (n < CSKY_ABIV2_MAX_REGS) ? csky_dwarf_regs_table_abiv2[n] : NULL;
+
+	return (n < CSKY_ABIV1_MAX_REGS) ? csky_dwarf_regs_table_abiv1[n] : NULL;
+}
diff --git a/tools/perf/util/dwarf-regs-arch/dwarf-regs-powerpc.c b/tools/perf/util/dwarf-regs-arch/dwarf-regs-powerpc.c
new file mode 100644
index 000000000000..caf77a234c78
--- /dev/null
+++ b/tools/perf/util/dwarf-regs-arch/dwarf-regs-powerpc.c
@@ -0,0 +1,61 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Mapping of DWARF debug register numbers into register names.
+ *
+ * Copyright (C) 2010 Ian Munsie, IBM Corporation.
+ */
+
+#include <dwarf-regs.h>
+
+#define PPC_OP(op)	(((op) >> 26) & 0x3F)
+#define PPC_RA(a)	(((a) >> 16) & 0x1f)
+#define PPC_RT(t)	(((t) >> 21) & 0x1f)
+#define PPC_RB(b)	(((b) >> 11) & 0x1f)
+#define PPC_D(D)	((D) & 0xfffe)
+#define PPC_DS(DS)	((DS) & 0xfffc)
+#define OP_LD	58
+#define OP_STD	62
+
+static int get_source_reg(u32 raw_insn)
+{
+	return PPC_RA(raw_insn);
+}
+
+static int get_target_reg(u32 raw_insn)
+{
+	return PPC_RT(raw_insn);
+}
+
+static int get_offset_opcode(u32 raw_insn)
+{
+	int opcode = PPC_OP(raw_insn);
+
+	/* DS- form */
+	if ((opcode == OP_LD) || (opcode == OP_STD))
+		return PPC_DS(raw_insn);
+	else
+		return PPC_D(raw_insn);
+}
+
+/*
+ * Fills the required fields for op_loc depending on if it
+ * is a source or target.
+ * D form: ins RT,D(RA) -> src_reg1 = RA, offset = D, dst_reg1 = RT
+ * DS form: ins RT,DS(RA) -> src_reg1 = RA, offset = DS, dst_reg1 = RT
+ * X form: ins RT,RA,RB -> src_reg1 = RA, src_reg2 = RB, dst_reg1 = RT
+ */
+void get_powerpc_regs(u32 raw_insn, int is_source,
+		struct annotated_op_loc *op_loc)
+{
+	if (is_source)
+		op_loc->reg1 = get_source_reg(raw_insn);
+	else
+		op_loc->reg1 = get_target_reg(raw_insn);
+
+	if (op_loc->multi_regs)
+		op_loc->reg2 = PPC_RB(raw_insn);
+
+	/* TODO: Implement offset handling for X Form */
+	if ((op_loc->mem_ref) && (PPC_OP(raw_insn) != 31))
+		op_loc->offset = get_offset_opcode(raw_insn);
+}
diff --git a/tools/perf/util/dwarf-regs-arch/dwarf-regs-x86.c b/tools/perf/util/dwarf-regs-arch/dwarf-regs-x86.c
new file mode 100644
index 000000000000..7a55c65e8da6
--- /dev/null
+++ b/tools/perf/util/dwarf-regs-arch/dwarf-regs-x86.c
@@ -0,0 +1,50 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * dwarf-regs.c : Mapping of DWARF debug register numbers into register names.
+ * Extracted from probe-finder.c
+ *
+ * Written by Masami Hiramatsu <mhiramat@redhat.com>
+ */
+
+#include <errno.h> /* for EINVAL */
+#include <string.h> /* for strcmp */
+#include <linux/kernel.h> /* for ARRAY_SIZE */
+#include <dwarf-regs.h>
+
+struct dwarf_regs_idx {
+	const char *name;
+	int idx;
+};
+
+static const struct dwarf_regs_idx x86_regidx_table[] = {
+	{ "rax", 0 }, { "eax", 0 }, { "ax", 0 }, { "al", 0 },
+	{ "rdx", 1 }, { "edx", 1 }, { "dx", 1 }, { "dl", 1 },
+	{ "rcx", 2 }, { "ecx", 2 }, { "cx", 2 }, { "cl", 2 },
+	{ "rbx", 3 }, { "edx", 3 }, { "bx", 3 }, { "bl", 3 },
+	{ "rsi", 4 }, { "esi", 4 }, { "si", 4 }, { "sil", 4 },
+	{ "rdi", 5 }, { "edi", 5 }, { "di", 5 }, { "dil", 5 },
+	{ "rbp", 6 }, { "ebp", 6 }, { "bp", 6 }, { "bpl", 6 },
+	{ "rsp", 7 }, { "esp", 7 }, { "sp", 7 }, { "spl", 7 },
+	{ "r8", 8 }, { "r8d", 8 }, { "r8w", 8 }, { "r8b", 8 },
+	{ "r9", 9 }, { "r9d", 9 }, { "r9w", 9 }, { "r9b", 9 },
+	{ "r10", 10 }, { "r10d", 10 }, { "r10w", 10 }, { "r10b", 10 },
+	{ "r11", 11 }, { "r11d", 11 }, { "r11w", 11 }, { "r11b", 11 },
+	{ "r12", 12 }, { "r12d", 12 }, { "r12w", 12 }, { "r12b", 12 },
+	{ "r13", 13 }, { "r13d", 13 }, { "r13w", 13 }, { "r13b", 13 },
+	{ "r14", 14 }, { "r14d", 14 }, { "r14w", 14 }, { "r14b", 14 },
+	{ "r15", 15 }, { "r15d", 15 }, { "r15w", 15 }, { "r15b", 15 },
+	{ "rip", DWARF_REG_PC },
+};
+
+int get_x86_regnum(const char *name)
+{
+	unsigned int i;
+
+	if (*name != '%')
+		return -EINVAL;
+
+	for (i = 0; i < ARRAY_SIZE(x86_regidx_table); i++)
+		if (!strcmp(x86_regidx_table[i].name, name + 1))
+			return x86_regidx_table[i].idx;
+	return -ENOENT;
+}
diff --git a/tools/perf/util/dwarf-regs-csky.c b/tools/perf/util/dwarf-regs-csky.c
deleted file mode 100644
index d38ef1f07f3e..000000000000
--- a/tools/perf/util/dwarf-regs-csky.c
+++ /dev/null
@@ -1,50 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-// Copyright (C) 2019 Hangzhou C-SKY Microsystems co.,ltd.
-// Mapping of DWARF debug register numbers into register names.
-
-#include <stddef.h>
-#include <dwarf-regs.h>
-
-#define CSKY_ABIV2_MAX_REGS 73
-const char *csky_dwarf_regs_table_abiv2[CSKY_ABIV2_MAX_REGS] = {
-	/* r0 ~ r8 */
-	"%a0", "%a1", "%a2", "%a3", "%regs0", "%regs1", "%regs2", "%regs3",
-	/* r9 ~ r15 */
-	"%regs4", "%regs5", "%regs6", "%regs7", "%regs8", "%regs9", "%sp",
-	"%lr",
-	/* r16 ~ r23 */
-	"%exregs0", "%exregs1", "%exregs2", "%exregs3", "%exregs4",
-	"%exregs5", "%exregs6", "%exregs7",
-	/* r24 ~ r31 */
-	"%exregs8", "%exregs9", "%exregs10", "%exregs11", "%exregs12",
-	"%exregs13", "%exregs14", "%tls",
-	"%pc", NULL, NULL, NULL, "%hi", "%lo", NULL, NULL,
-	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-	"%epc",
-};
-
-#define CSKY_ABIV1_MAX_REGS 57
-const char *csky_dwarf_regs_table_abiv1[CSKY_ABIV1_MAX_REGS] = {
-	/* r0 ~ r8 */
-	"%sp", "%regs9", "%a0", "%a1", "%a2", "%a3", "%regs0", "%regs1",
-	/* r9 ~ r15 */
-	"%regs2", "%regs3", "%regs4", "%regs5", "%regs6", "%regs7", "%regs8",
-	"%lr",
-	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-	"%epc",
-};
-
-const char *get_csky_regstr(unsigned int n, unsigned int flags)
-{
-	if (flags & EF_CSKY_ABIV2)
-		return (n < CSKY_ABIV2_MAX_REGS) ? csky_dwarf_regs_table_abiv2[n] : NULL;
-
-	return (n < CSKY_ABIV1_MAX_REGS) ? csky_dwarf_regs_table_abiv1[n] : NULL;
-}
diff --git a/tools/perf/util/dwarf-regs-powerpc.c b/tools/perf/util/dwarf-regs-powerpc.c
deleted file mode 100644
index caf77a234c78..000000000000
--- a/tools/perf/util/dwarf-regs-powerpc.c
+++ /dev/null
@@ -1,61 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Mapping of DWARF debug register numbers into register names.
- *
- * Copyright (C) 2010 Ian Munsie, IBM Corporation.
- */
-
-#include <dwarf-regs.h>
-
-#define PPC_OP(op)	(((op) >> 26) & 0x3F)
-#define PPC_RA(a)	(((a) >> 16) & 0x1f)
-#define PPC_RT(t)	(((t) >> 21) & 0x1f)
-#define PPC_RB(b)	(((b) >> 11) & 0x1f)
-#define PPC_D(D)	((D) & 0xfffe)
-#define PPC_DS(DS)	((DS) & 0xfffc)
-#define OP_LD	58
-#define OP_STD	62
-
-static int get_source_reg(u32 raw_insn)
-{
-	return PPC_RA(raw_insn);
-}
-
-static int get_target_reg(u32 raw_insn)
-{
-	return PPC_RT(raw_insn);
-}
-
-static int get_offset_opcode(u32 raw_insn)
-{
-	int opcode = PPC_OP(raw_insn);
-
-	/* DS- form */
-	if ((opcode == OP_LD) || (opcode == OP_STD))
-		return PPC_DS(raw_insn);
-	else
-		return PPC_D(raw_insn);
-}
-
-/*
- * Fills the required fields for op_loc depending on if it
- * is a source or target.
- * D form: ins RT,D(RA) -> src_reg1 = RA, offset = D, dst_reg1 = RT
- * DS form: ins RT,DS(RA) -> src_reg1 = RA, offset = DS, dst_reg1 = RT
- * X form: ins RT,RA,RB -> src_reg1 = RA, src_reg2 = RB, dst_reg1 = RT
- */
-void get_powerpc_regs(u32 raw_insn, int is_source,
-		struct annotated_op_loc *op_loc)
-{
-	if (is_source)
-		op_loc->reg1 = get_source_reg(raw_insn);
-	else
-		op_loc->reg1 = get_target_reg(raw_insn);
-
-	if (op_loc->multi_regs)
-		op_loc->reg2 = PPC_RB(raw_insn);
-
-	/* TODO: Implement offset handling for X Form */
-	if ((op_loc->mem_ref) && (PPC_OP(raw_insn) != 31))
-		op_loc->offset = get_offset_opcode(raw_insn);
-}
diff --git a/tools/perf/util/dwarf-regs-x86.c b/tools/perf/util/dwarf-regs-x86.c
deleted file mode 100644
index 7a55c65e8da6..000000000000
--- a/tools/perf/util/dwarf-regs-x86.c
+++ /dev/null
@@ -1,50 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * dwarf-regs.c : Mapping of DWARF debug register numbers into register names.
- * Extracted from probe-finder.c
- *
- * Written by Masami Hiramatsu <mhiramat@redhat.com>
- */
-
-#include <errno.h> /* for EINVAL */
-#include <string.h> /* for strcmp */
-#include <linux/kernel.h> /* for ARRAY_SIZE */
-#include <dwarf-regs.h>
-
-struct dwarf_regs_idx {
-	const char *name;
-	int idx;
-};
-
-static const struct dwarf_regs_idx x86_regidx_table[] = {
-	{ "rax", 0 }, { "eax", 0 }, { "ax", 0 }, { "al", 0 },
-	{ "rdx", 1 }, { "edx", 1 }, { "dx", 1 }, { "dl", 1 },
-	{ "rcx", 2 }, { "ecx", 2 }, { "cx", 2 }, { "cl", 2 },
-	{ "rbx", 3 }, { "edx", 3 }, { "bx", 3 }, { "bl", 3 },
-	{ "rsi", 4 }, { "esi", 4 }, { "si", 4 }, { "sil", 4 },
-	{ "rdi", 5 }, { "edi", 5 }, { "di", 5 }, { "dil", 5 },
-	{ "rbp", 6 }, { "ebp", 6 }, { "bp", 6 }, { "bpl", 6 },
-	{ "rsp", 7 }, { "esp", 7 }, { "sp", 7 }, { "spl", 7 },
-	{ "r8", 8 }, { "r8d", 8 }, { "r8w", 8 }, { "r8b", 8 },
-	{ "r9", 9 }, { "r9d", 9 }, { "r9w", 9 }, { "r9b", 9 },
-	{ "r10", 10 }, { "r10d", 10 }, { "r10w", 10 }, { "r10b", 10 },
-	{ "r11", 11 }, { "r11d", 11 }, { "r11w", 11 }, { "r11b", 11 },
-	{ "r12", 12 }, { "r12d", 12 }, { "r12w", 12 }, { "r12b", 12 },
-	{ "r13", 13 }, { "r13d", 13 }, { "r13w", 13 }, { "r13b", 13 },
-	{ "r14", 14 }, { "r14d", 14 }, { "r14w", 14 }, { "r14b", 14 },
-	{ "r15", 15 }, { "r15d", 15 }, { "r15w", 15 }, { "r15b", 15 },
-	{ "rip", DWARF_REG_PC },
-};
-
-int get_x86_regnum(const char *name)
-{
-	unsigned int i;
-
-	if (*name != '%')
-		return -EINVAL;
-
-	for (i = 0; i < ARRAY_SIZE(x86_regidx_table); i++)
-		if (!strcmp(x86_regidx_table[i].name, name + 1))
-			return x86_regidx_table[i].idx;
-	return -ENOENT;
-}
-- 
cgit v1.2.3


From 3a00f41646bbcb45aff17bb4ba27c52c6bab4f68 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Fri, 16 Jan 2026 21:28:36 -0800
Subject: perf dwarf-regs: Remove get_arch_regnum()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Except in dwarf-regs the function is never called. The weak function
has no strong arch implementations.

Remove so that the fall-through case applies.

Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Aditya Bodkhe <aditya.b1@linux.ibm.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Alexandre Ghiti <alex@ghiti.fr>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Athira Rajeev <atrajeev@linux.ibm.com>
Cc: Chun-Tse Shao <ctshao@google.com>
Cc: Dmitriy Vyukov <dvyukov@google.com>
Cc: Dr. David Alan Gilbert <linux@treblig.org>
Cc: Guo Ren <guoren@kernel.org>
Cc: Haibo Xu <haibo1.xu@intel.com>
Cc: Howard Chu <howardchu95@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Krzysztof Łopatowski <krzysztof.m.lopatowski@gmail.com>
Cc: Leo Yan <leo.yan@linux.dev>
Cc: Mark Wielaard <mark@klomp.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <pjw@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sergei Trofimovich <slyich@gmail.com>
Cc: Shimin Guo <shimin.guo@skydio.com>
Cc: Stephen Brennan <stephen.s.brennan@oracle.com>
Cc: Thomas Falcon <thomas.falcon@intel.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/dwarf-regs.c         | 12 ------------
 tools/perf/util/include/dwarf-regs.h |  4 ----
 2 files changed, 16 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/dwarf-regs.c b/tools/perf/util/dwarf-regs.c
index 28a1cfdf26d4..b2f37299147e 100644
--- a/tools/perf/util/dwarf-regs.c
+++ b/tools/perf/util/dwarf-regs.c
@@ -71,13 +71,6 @@ const char *get_dwarf_regstr(unsigned int n, unsigned int machine, unsigned int
 	return NULL;
 }
 
-#if EM_HOST != EM_X86_64 && EM_HOST != EM_386
-__weak int get_arch_regnum(const char *name __maybe_unused)
-{
-	return -ENOTSUP;
-}
-#endif
-
 /* Return DWARF register number from architecture register name */
 int get_dwarf_regnum(const char *name, unsigned int machine, unsigned int flags __maybe_unused)
 {
@@ -98,11 +91,6 @@ int get_dwarf_regnum(const char *name, unsigned int machine, unsigned int flags
 		machine = EM_HOST;
 	}
 	switch (machine) {
-#if EM_HOST != EM_X86_64 && EM_HOST != EM_386
-	case EM_HOST:
-		reg = get_arch_regnum(regname);
-		break;
-#endif
 	case EM_X86_64:
 		fallthrough;
 	case EM_386:
diff --git a/tools/perf/util/include/dwarf-regs.h b/tools/perf/util/include/dwarf-regs.h
index 6f1b9f6b2466..015d1ade645f 100644
--- a/tools/perf/util/include/dwarf-regs.h
+++ b/tools/perf/util/include/dwarf-regs.h
@@ -101,10 +101,6 @@ const char *get_dwarf_regstr(unsigned int n, unsigned int machine, unsigned int
 
 int get_x86_regnum(const char *name);
 
-#if !defined(__x86_64__) && !defined(__i386__)
-int get_arch_regnum(const char *name);
-#endif
-
 /*
  * get_dwarf_regnum - Returns DWARF regnum from register name
  * name: architecture register name
-- 
cgit v1.2.3


From c31040085914f1188720073baa43d1483693c0a3 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Fri, 16 Jan 2026 21:28:37 -0800
Subject: perf dwarf-regs: Clean up x86 dwarf_regnum code
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The i386 and x86-64 register numbers differ on x86, but previously x86
was a single arch string and so this couldn't be handled. The
transition to using ELF EM_* values means we can translate x86
registers correctly for either the x86-64 dwarf register mappings
(from the System V ABI) or i386 register mappings. Correct the mappings.

Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Aditya Bodkhe <aditya.b1@linux.ibm.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Alexandre Ghiti <alex@ghiti.fr>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Athira Rajeev <atrajeev@linux.ibm.com>
Cc: Chun-Tse Shao <ctshao@google.com>
Cc: Dmitriy Vyukov <dvyukov@google.com>
Cc: Dr. David Alan Gilbert <linux@treblig.org>
Cc: Guo Ren <guoren@kernel.org>
Cc: Haibo Xu <haibo1.xu@intel.com>
Cc: Howard Chu <howardchu95@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Krzysztof Łopatowski <krzysztof.m.lopatowski@gmail.com>
Cc: Leo Yan <leo.yan@linux.dev>
Cc: Mark Wielaard <mark@klomp.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <pjw@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sergei Trofimovich <slyich@gmail.com>
Cc: Shimin Guo <shimin.guo@skydio.com>
Cc: Stephen Brennan <stephen.s.brennan@oracle.com>
Cc: Thomas Falcon <thomas.falcon@intel.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/dwarf-regs-arch/dwarf-regs-x86.c | 133 +++++++++++++++++++++--
 tools/perf/util/dwarf-regs.c                     |   5 +-
 tools/perf/util/include/dwarf-regs.h             |   3 +-
 3 files changed, 129 insertions(+), 12 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/dwarf-regs-arch/dwarf-regs-x86.c b/tools/perf/util/dwarf-regs-arch/dwarf-regs-x86.c
index 7a55c65e8da6..f0c42e4d7423 100644
--- a/tools/perf/util/dwarf-regs-arch/dwarf-regs-x86.c
+++ b/tools/perf/util/dwarf-regs-arch/dwarf-regs-x86.c
@@ -13,10 +13,65 @@
 
 struct dwarf_regs_idx {
 	const char *name;
-	int idx;
+	int dwarf_regnum;
 };
 
-static const struct dwarf_regs_idx x86_regidx_table[] = {
+static const struct dwarf_regs_idx i386_regidx_table[] = {
+	{ "eax", 0 }, { "ax", 0 }, { "al", 0 },
+	{ "ecx", 1 }, { "cx", 1 }, { "cl", 1 },
+	{ "edx", 2 }, { "dx", 2 }, { "dl", 2 },
+	{ "ebx", 3 }, { "bx", 3 }, { "bl", 3 },
+	{ "esp", 4 }, { "sp", 4 }, { "$stack", 4},
+	{ "ebp", 5 }, { "bp", 5 },
+	{ "esi", 6 }, { "si", 6 },
+	{ "edi", 7 }, { "di", 7 },
+	// 8 - Return Address RA
+	{ "eflags", 9}, { "flags", 9},
+	// 10 - reserved
+	{ "st0", 11},
+	{ "st1", 12},
+	{ "st2", 13},
+	{ "st3", 14},
+	{ "st4", 15},
+	{ "st5", 16},
+	{ "st6", 17},
+	{ "st7", 18},
+	// 19-20 - reserved
+	{ "xmm0", 21},
+	{ "xmm1", 22},
+	{ "xmm2", 23},
+	{ "xmm3", 24},
+	{ "xmm4", 25},
+	{ "xmm5", 26},
+	{ "xmm6", 27},
+	{ "xmm7", 28},
+	{ "mm0", 29},
+	{ "mm1", 30},
+	{ "mm2", 31},
+	{ "mm3", 32},
+	{ "mm4", 33},
+	{ "mm5", 34},
+	{ "mm6", 35},
+	{ "mm7", 36},
+	// 37-38 - unknown
+	{ "mxcsr", 39}, // 128-bit Media Control and Status
+	{ "es", 40},
+	{ "cs", 41},
+	{ "ss", 42},
+	{ "ds", 43},
+	{ "fs", 44},
+	{ "gs", 45},
+	// 46-47 - reserved
+	{ "tr", 48}, // Task Register
+	{ "ldtr", 49}, // LDT Register
+	// 50-92 - reserved
+	{ "fs.base", 92},
+	{ "gs.base", 93},
+	// End of regular dwarf registers.
+	{ "eip", DWARF_REG_PC }, { "ip", DWARF_REG_PC },
+};
+
+static const struct dwarf_regs_idx x86_64_regidx_table[] = {
 	{ "rax", 0 }, { "eax", 0 }, { "ax", 0 }, { "al", 0 },
 	{ "rdx", 1 }, { "edx", 1 }, { "dx", 1 }, { "dl", 1 },
 	{ "rcx", 2 }, { "ecx", 2 }, { "cx", 2 }, { "cl", 2 },
@@ -33,18 +88,78 @@ static const struct dwarf_regs_idx x86_regidx_table[] = {
 	{ "r13", 13 }, { "r13d", 13 }, { "r13w", 13 }, { "r13b", 13 },
 	{ "r14", 14 }, { "r14d", 14 }, { "r14w", 14 }, { "r14b", 14 },
 	{ "r15", 15 }, { "r15d", 15 }, { "r15w", 15 }, { "r15b", 15 },
-	{ "rip", DWARF_REG_PC },
+	// 16 - Return Address RA
+	{ "xmm0", 17},
+	{ "xmm1", 18},
+	{ "xmm2", 19},
+	{ "xmm3", 20},
+	{ "xmm4", 21},
+	{ "xmm5", 22},
+	{ "xmm6", 23},
+	{ "xmm7", 24},
+	{ "xmm8", 25},
+	{ "xmm9", 26},
+	{ "xmm10", 27},
+	{ "xmm11", 28},
+	{ "xmm12", 29},
+	{ "xmm13", 30},
+	{ "xmm14", 31},
+	{ "xmm15", 32},
+	{ "st0", 33},
+	{ "st1", 34},
+	{ "st2", 35},
+	{ "st3", 36},
+	{ "st4", 37},
+	{ "st5", 38},
+	{ "st6", 39},
+	{ "st7", 40},
+	{ "mm0", 41},
+	{ "mm1", 42},
+	{ "mm2", 43},
+	{ "mm3", 44},
+	{ "mm4", 45},
+	{ "mm5", 46},
+	{ "mm6", 47},
+	{ "mm7", 48},
+	{ "rflags", 49}, { "eflags", 49}, { "flags", 49},
+	{ "es", 50},
+	{ "cs", 51},
+	{ "ss", 52},
+	{ "ds", 53},
+	{ "fs", 54},
+	{ "gs", 55},
+	// 56-47 - reserved
+	{ "fs.base", 58},
+	{ "gs.base", 59},
+	// 60-61 - reserved
+	{ "tr", 62}, // Task Register
+	{ "ldtr", 63}, // LDT Register
+	{ "mxcsr", 64}, // 128-bit Media Control and Status
+	{ "fcw", 65}, // x87 Control Word
+	{ "fsw", 66}, // x87 Status Word
+	// End of regular dwarf registers.
+	{ "rip", DWARF_REG_PC }, { "eip", DWARF_REG_PC }, { "ip", DWARF_REG_PC },
 };
 
-int get_x86_regnum(const char *name)
+static int get_regnum(const struct dwarf_regs_idx *entries, size_t num_entries, const char *name)
 {
-	unsigned int i;
-
 	if (*name != '%')
 		return -EINVAL;
 
-	for (i = 0; i < ARRAY_SIZE(x86_regidx_table); i++)
-		if (!strcmp(x86_regidx_table[i].name, name + 1))
-			return x86_regidx_table[i].idx;
+	name++;
+	for (size_t i = 0; i < num_entries; i++) {
+		if (!strcmp(entries[i].name, name))
+			return entries[i].dwarf_regnum;
+	}
 	return -ENOENT;
 }
+
+int __get_dwarf_regnum_i386(const char *name)
+{
+	return get_regnum(i386_regidx_table, ARRAY_SIZE(i386_regidx_table), name);
+}
+
+int __get_dwarf_regnum_x86_64(const char *name)
+{
+	return get_regnum(x86_64_regidx_table, ARRAY_SIZE(x86_64_regidx_table), name);
+}
diff --git a/tools/perf/util/dwarf-regs.c b/tools/perf/util/dwarf-regs.c
index b2f37299147e..ef249dd589e3 100644
--- a/tools/perf/util/dwarf-regs.c
+++ b/tools/perf/util/dwarf-regs.c
@@ -92,9 +92,10 @@ int get_dwarf_regnum(const char *name, unsigned int machine, unsigned int flags
 	}
 	switch (machine) {
 	case EM_X86_64:
-		fallthrough;
+		reg = __get_dwarf_regnum_x86_64(name);
+		break;
 	case EM_386:
-		reg = get_x86_regnum(regname);
+		reg = __get_dwarf_regnum_i386(name);
 		break;
 	default:
 		pr_err("ELF MACHINE %x is not supported.\n", machine);
diff --git a/tools/perf/util/include/dwarf-regs.h b/tools/perf/util/include/dwarf-regs.h
index 015d1ade645f..bb5413b0fee4 100644
--- a/tools/perf/util/include/dwarf-regs.h
+++ b/tools/perf/util/include/dwarf-regs.h
@@ -99,7 +99,8 @@ const char *get_csky_regstr(unsigned int n, unsigned int flags);
  */
 const char *get_dwarf_regstr(unsigned int n, unsigned int machine, unsigned int flags);
 
-int get_x86_regnum(const char *name);
+int __get_dwarf_regnum_i386(const char *name);
+int __get_dwarf_regnum_x86_64(const char *name);
 
 /*
  * get_dwarf_regnum - Returns DWARF regnum from register name
-- 
cgit v1.2.3


From d3ab52c31efab9e8a29b8fc1ae4c09ab41e0cf84 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Fri, 16 Jan 2026 21:28:38 -0800
Subject: perf dwarf-regs: Add get_dwarf_regnum_for_perf_regnum() and use for
 x86 unwinding
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a utility to map a perf register number to a DWARF register number
for a particular ELF machine type.

Create a generic unwind-libdw initial register initialization routine
that uses this function and thereby avoids arch specific
initialization. The unwind-libdw code does:

1) compute the maximum DWARF register from the set of sampled user registers,
2) allocates a set of DWARF registers,
3) copies the sample registers into the appropriate DWARF registers.

This generic solution is initially implemented for use with x86 as
only get_dwarf_regnum_for_perf_regnum() support for x86 is currently present.

Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Aditya Bodkhe <aditya.b1@linux.ibm.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Alexandre Ghiti <alex@ghiti.fr>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Athira Rajeev <atrajeev@linux.ibm.com>
Cc: Chun-Tse Shao <ctshao@google.com>
Cc: Dmitriy Vyukov <dvyukov@google.com>
Cc: Dr. David Alan Gilbert <linux@treblig.org>
Cc: Guo Ren <guoren@kernel.org>
Cc: Haibo Xu <haibo1.xu@intel.com>
Cc: Howard Chu <howardchu95@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Krzysztof Łopatowski <krzysztof.m.lopatowski@gmail.com>
Cc: Leo Yan <leo.yan@linux.dev>
Cc: Mark Wielaard <mark@klomp.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <pjw@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sergei Trofimovich <slyich@gmail.com>
Cc: Shimin Guo <shimin.guo@skydio.com>
Cc: Stephen Brennan <stephen.s.brennan@oracle.com>
Cc: Thomas Falcon <thomas.falcon@intel.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/dwarf-regs-arch/dwarf-regs-x86.c   | 95 ++++++++++++++++++++++
 tools/perf/util/dwarf-regs.c                       | 55 +++++++++++++
 tools/perf/util/include/dwarf-regs.h               |  8 ++
 tools/perf/util/unwind-libdw-arch/Build            |  1 -
 .../perf/util/unwind-libdw-arch/unwind-libdw-x86.c | 54 ------------
 tools/perf/util/unwind-libdw.c                     | 70 ++++++++++++++--
 tools/perf/util/unwind-libdw.h                     |  2 +-
 7 files changed, 222 insertions(+), 63 deletions(-)
 delete mode 100644 tools/perf/util/unwind-libdw-arch/unwind-libdw-x86.c

(limited to 'tools')

diff --git a/tools/perf/util/dwarf-regs-arch/dwarf-regs-x86.c b/tools/perf/util/dwarf-regs-arch/dwarf-regs-x86.c
index f0c42e4d7423..cadef120aeb4 100644
--- a/tools/perf/util/dwarf-regs-arch/dwarf-regs-x86.c
+++ b/tools/perf/util/dwarf-regs-arch/dwarf-regs-x86.c
@@ -10,6 +10,7 @@
 #include <string.h> /* for strcmp */
 #include <linux/kernel.h> /* for ARRAY_SIZE */
 #include <dwarf-regs.h>
+#include "../../../arch/x86/include/uapi/asm/perf_regs.h"
 
 struct dwarf_regs_idx {
 	const char *name;
@@ -163,3 +164,97 @@ int __get_dwarf_regnum_x86_64(const char *name)
 {
 	return get_regnum(x86_64_regidx_table, ARRAY_SIZE(x86_64_regidx_table), name);
 }
+
+int __get_dwarf_regnum_for_perf_regnum_i386(int perf_regnum)
+{
+	static const int dwarf_i386_regnums[] = {
+		[PERF_REG_X86_AX] = 0,
+		[PERF_REG_X86_BX] = 3,
+		[PERF_REG_X86_CX] = 1,
+		[PERF_REG_X86_DX] = 2,
+		[PERF_REG_X86_SI] = 6,
+		[PERF_REG_X86_DI] = 7,
+		[PERF_REG_X86_BP] = 5,
+		[PERF_REG_X86_SP] = 4,
+		[PERF_REG_X86_IP] = 8,
+		[PERF_REG_X86_FLAGS] = 9,
+		[PERF_REG_X86_CS] = 41,
+		[PERF_REG_X86_SS] = 42,
+		[PERF_REG_X86_DS] = 43,
+		[PERF_REG_X86_ES] = 40,
+		[PERF_REG_X86_FS] = 44,
+		[PERF_REG_X86_GS] = 45,
+		[PERF_REG_X86_XMM0] = 21,
+		[PERF_REG_X86_XMM1] = 22,
+		[PERF_REG_X86_XMM2] = 23,
+		[PERF_REG_X86_XMM3] = 24,
+		[PERF_REG_X86_XMM4] = 25,
+		[PERF_REG_X86_XMM5] = 26,
+		[PERF_REG_X86_XMM6] = 27,
+		[PERF_REG_X86_XMM7] = 28,
+	};
+
+	if (perf_regnum == 0)
+		return 0;
+
+	if (perf_regnum <  0 || perf_regnum > (int)ARRAY_SIZE(dwarf_i386_regnums) ||
+	    dwarf_i386_regnums[perf_regnum] == 0)
+		return -ENOENT;
+
+	return dwarf_i386_regnums[perf_regnum];
+}
+
+int __get_dwarf_regnum_for_perf_regnum_x86_64(int perf_regnum)
+{
+	static const int dwarf_x86_64_regnums[] = {
+		[PERF_REG_X86_AX] = 0,
+		[PERF_REG_X86_BX] = 3,
+		[PERF_REG_X86_CX] = 2,
+		[PERF_REG_X86_DX] = 1,
+		[PERF_REG_X86_SI] = 4,
+		[PERF_REG_X86_DI] = 5,
+		[PERF_REG_X86_BP] = 6,
+		[PERF_REG_X86_SP] = 7,
+		[PERF_REG_X86_IP] = 16,
+		[PERF_REG_X86_FLAGS] = 49,
+		[PERF_REG_X86_CS] = 51,
+		[PERF_REG_X86_SS] = 52,
+		[PERF_REG_X86_DS] = 53,
+		[PERF_REG_X86_ES] = 50,
+		[PERF_REG_X86_FS] = 54,
+		[PERF_REG_X86_GS] = 55,
+		[PERF_REG_X86_R8] = 8,
+		[PERF_REG_X86_R9] = 9,
+		[PERF_REG_X86_R10] = 10,
+		[PERF_REG_X86_R11] = 11,
+		[PERF_REG_X86_R12] = 12,
+		[PERF_REG_X86_R13] = 13,
+		[PERF_REG_X86_R14] = 14,
+		[PERF_REG_X86_R15] = 15,
+		[PERF_REG_X86_XMM0] = 17,
+		[PERF_REG_X86_XMM1] = 18,
+		[PERF_REG_X86_XMM2] = 19,
+		[PERF_REG_X86_XMM3] = 20,
+		[PERF_REG_X86_XMM4] = 21,
+		[PERF_REG_X86_XMM5] = 22,
+		[PERF_REG_X86_XMM6] = 23,
+		[PERF_REG_X86_XMM7] = 24,
+		[PERF_REG_X86_XMM8] = 25,
+		[PERF_REG_X86_XMM9] = 26,
+		[PERF_REG_X86_XMM10] = 27,
+		[PERF_REG_X86_XMM11] = 28,
+		[PERF_REG_X86_XMM12] = 29,
+		[PERF_REG_X86_XMM13] = 30,
+		[PERF_REG_X86_XMM14] = 31,
+		[PERF_REG_X86_XMM15] = 32,
+	};
+
+	if (perf_regnum == 0)
+		return 0;
+
+	if (perf_regnum <  0 || perf_regnum > (int)ARRAY_SIZE(dwarf_x86_64_regnums) ||
+	    dwarf_x86_64_regnums[perf_regnum] == 0)
+		return -ENOENT;
+
+	return dwarf_x86_64_regnums[perf_regnum];
+}
diff --git a/tools/perf/util/dwarf-regs.c b/tools/perf/util/dwarf-regs.c
index ef249dd589e3..1f7d892612df 100644
--- a/tools/perf/util/dwarf-regs.c
+++ b/tools/perf/util/dwarf-regs.c
@@ -103,3 +103,58 @@ int get_dwarf_regnum(const char *name, unsigned int machine, unsigned int flags
 	free(regname);
 	return reg;
 }
+
+static int get_libdw_frame_nregs(unsigned int machine, unsigned int flags __maybe_unused)
+{
+	switch (machine) {
+	case EM_X86_64:
+		return 17;
+	case EM_386:
+		return 9;
+	case EM_ARM:
+		return 16;
+	case EM_AARCH64:
+		return 97;
+	case EM_CSKY:
+		return 38;
+	case EM_S390:
+		return 32;
+	case EM_PPC:
+	case EM_PPC64:
+		return 145;
+	case EM_RISCV:
+		return 66;
+	case EM_SPARC:
+	case EM_SPARCV9:
+		return 103;
+	case EM_LOONGARCH:
+		return 74;
+	default:
+		return 0;
+	}
+}
+
+int get_dwarf_regnum_for_perf_regnum(int perf_regnum, unsigned int machine,
+				     unsigned int flags, bool only_libdw_supported)
+{
+	int reg;
+
+	switch (machine) {
+	case EM_X86_64:
+		reg = __get_dwarf_regnum_for_perf_regnum_x86_64(perf_regnum);
+		break;
+	case EM_386:
+		reg = __get_dwarf_regnum_for_perf_regnum_i386(perf_regnum);
+		break;
+	default:
+		pr_err("ELF MACHINE %x is not supported.\n", machine);
+		return -ENOENT;
+	}
+	if (reg >= 0 && only_libdw_supported) {
+		int nregs = get_libdw_frame_nregs(machine, flags);
+
+		if (reg >= nregs)
+			reg = -ENOENT;
+	}
+	return reg;
+}
diff --git a/tools/perf/util/include/dwarf-regs.h b/tools/perf/util/include/dwarf-regs.h
index bb5413b0fee4..00881f1d45d6 100644
--- a/tools/perf/util/include/dwarf-regs.h
+++ b/tools/perf/util/include/dwarf-regs.h
@@ -101,6 +101,8 @@ const char *get_dwarf_regstr(unsigned int n, unsigned int machine, unsigned int
 
 int __get_dwarf_regnum_i386(const char *name);
 int __get_dwarf_regnum_x86_64(const char *name);
+int __get_dwarf_regnum_for_perf_regnum_i386(int perf_regnum);
+int __get_dwarf_regnum_for_perf_regnum_x86_64(int perf_regnum);
 
 /*
  * get_dwarf_regnum - Returns DWARF regnum from register name
@@ -109,6 +111,12 @@ int __get_dwarf_regnum_x86_64(const char *name);
  */
 int get_dwarf_regnum(const char *name, unsigned int machine, unsigned int flags);
 
+/*
+ * get_dwarf_regnum - Returns DWARF regnum from perf register number.
+ */
+int get_dwarf_regnum_for_perf_regnum(int perf_regnum, unsigned int machine, unsigned int flags,
+				     bool only_libdw_supported);
+
 void get_powerpc_regs(u32 raw_insn, int is_source, struct annotated_op_loc *op_loc);
 
 #else /* HAVE_LIBDW_SUPPORT */
diff --git a/tools/perf/util/unwind-libdw-arch/Build b/tools/perf/util/unwind-libdw-arch/Build
index ef17a83a7813..5b5682029953 100644
--- a/tools/perf/util/unwind-libdw-arch/Build
+++ b/tools/perf/util/unwind-libdw-arch/Build
@@ -1,4 +1,3 @@
-perf-util-y += unwind-libdw-x86.o
 perf-util-y += unwind-libdw-arm.o
 perf-util-y += unwind-libdw-arm64.o
 perf-util-y += unwind-libdw-csky.o
diff --git a/tools/perf/util/unwind-libdw-arch/unwind-libdw-x86.c b/tools/perf/util/unwind-libdw-arch/unwind-libdw-x86.c
deleted file mode 100644
index dd27545a4a68..000000000000
--- a/tools/perf/util/unwind-libdw-arch/unwind-libdw-x86.c
+++ /dev/null
@@ -1,54 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <elfutils/libdwfl.h>
-#include "../arch/x86/include/uapi/asm/perf_regs.h"
-#include "util/unwind-libdw.h"
-#include "util/perf_regs.h"
-#include "util/sample.h"
-
-bool libdw_set_initial_registers_x86(Dwfl_Thread *thread, void *arg)
-{
-	struct unwind_info *ui = arg;
-	struct regs_dump *user_regs = perf_sample__user_regs(ui->sample);
-	Dwarf_Word dwarf_regs[17];
-	unsigned nregs;
-
-#define REG(r) ({						\
-	Dwarf_Word val = 0;					\
-	perf_reg_value(&val, user_regs, PERF_REG_X86_##r);	\
-	val;							\
-})
-
-	if (user_regs->abi == PERF_SAMPLE_REGS_ABI_32) {
-		dwarf_regs[0] = REG(AX);
-		dwarf_regs[1] = REG(CX);
-		dwarf_regs[2] = REG(DX);
-		dwarf_regs[3] = REG(BX);
-		dwarf_regs[4] = REG(SP);
-		dwarf_regs[5] = REG(BP);
-		dwarf_regs[6] = REG(SI);
-		dwarf_regs[7] = REG(DI);
-		dwarf_regs[8] = REG(IP);
-		nregs = 9;
-	} else {
-		dwarf_regs[0]  = REG(AX);
-		dwarf_regs[1]  = REG(DX);
-		dwarf_regs[2]  = REG(CX);
-		dwarf_regs[3]  = REG(BX);
-		dwarf_regs[4]  = REG(SI);
-		dwarf_regs[5]  = REG(DI);
-		dwarf_regs[6]  = REG(BP);
-		dwarf_regs[7]  = REG(SP);
-		dwarf_regs[8]  = REG(R8);
-		dwarf_regs[9]  = REG(R9);
-		dwarf_regs[10] = REG(R10);
-		dwarf_regs[11] = REG(R11);
-		dwarf_regs[12] = REG(R12);
-		dwarf_regs[13] = REG(R13);
-		dwarf_regs[14] = REG(R14);
-		dwarf_regs[15] = REG(R15);
-		dwarf_regs[16] = REG(IP);
-		nregs = 17;
-	}
-
-	return dwfl_thread_state_registers(thread, 0, nregs, dwarf_regs);
-}
diff --git a/tools/perf/util/unwind-libdw.c b/tools/perf/util/unwind-libdw.c
index c25190cdceb4..055dab921442 100644
--- a/tools/perf/util/unwind-libdw.c
+++ b/tools/perf/util/unwind-libdw.c
@@ -6,6 +6,7 @@
 #include <errno.h>
 #include "debug.h"
 #include "dso.h"
+#include <dwarf-regs.h>
 #include "unwind.h"
 #include "unwind-libdw.h"
 #include "machine.h"
@@ -225,6 +226,59 @@ static bool memory_read(Dwfl *dwfl __maybe_unused, Dwarf_Addr addr, Dwarf_Word *
 	return true;
 }
 
+static bool libdw_set_initial_registers_generic(Dwfl_Thread *thread, void *arg)
+{
+	struct unwind_info *ui = arg;
+	struct regs_dump *user_regs = perf_sample__user_regs(ui->sample);
+	Dwarf_Word *dwarf_regs;
+	int max_dwarf_reg = 0;
+	bool ret;
+	uint16_t e_machine = ui->e_machine;
+	int e_flags = 0;
+	uint64_t ip_perf_reg = perf_arch_reg_ip(e_machine);
+	Dwarf_Word val = 0;
+
+
+	/*
+	 * For every possible perf register in the bitmap determine the dwarf
+	 * register and use to compute the max.
+	 */
+	for (int perf_reg = 0; perf_reg < 64; perf_reg++) {
+		if (user_regs->mask & (1ULL << perf_reg)) {
+			int dwarf_reg =
+				get_dwarf_regnum_for_perf_regnum(perf_reg, e_machine,
+								 e_flags,
+								 /*only_libdw_supported=*/true);
+			if (dwarf_reg > max_dwarf_reg)
+				max_dwarf_reg = dwarf_reg;
+		}
+	}
+
+	dwarf_regs = calloc(max_dwarf_reg + 1, sizeof(*dwarf_regs));
+	if (!dwarf_regs)
+		return false;
+
+	for (int perf_reg = 0; perf_reg < 64; perf_reg++) {
+		if (user_regs->mask & (1ULL << perf_reg)) {
+			int dwarf_reg =
+				get_dwarf_regnum_for_perf_regnum(perf_reg, e_machine,
+								 e_flags,
+								 /*only_libdw_supported=*/true);
+			if (dwarf_reg >= 0) {
+				val = 0;
+				if (perf_reg_value(&val, user_regs, perf_reg) == 0)
+					dwarf_regs[dwarf_reg] = val;
+			}
+		}
+	}
+	if (perf_reg_value(&val, user_regs, ip_perf_reg) == 0)
+		dwfl_thread_state_register_pc(thread, val);
+
+	ret = dwfl_thread_state_registers(thread, 0, max_dwarf_reg + 1, dwarf_regs);
+	free(dwarf_regs);
+	return ret;
+}
+
 #define DEFINE_DWFL_THREAD_CALLBACKS(arch)                           \
 static const Dwfl_Thread_Callbacks callbacks_##arch = {              \
 	.next_thread           = next_thread,                        \
@@ -232,7 +286,12 @@ static const Dwfl_Thread_Callbacks callbacks_##arch = {              \
 	.set_initial_registers = libdw_set_initial_registers_##arch, \
 }
 
-DEFINE_DWFL_THREAD_CALLBACKS(x86);
+static const Dwfl_Thread_Callbacks callbacks_generic = {
+	.next_thread           = next_thread,
+	.memory_read           = memory_read,
+	.set_initial_registers = libdw_set_initial_registers_generic,
+};
+
 DEFINE_DWFL_THREAD_CALLBACKS(arm);
 DEFINE_DWFL_THREAD_CALLBACKS(arm64);
 DEFINE_DWFL_THREAD_CALLBACKS(csky);
@@ -257,12 +316,8 @@ static const Dwfl_Thread_Callbacks *get_thread_callbacks(const char *arch)
 		return &callbacks_riscv;
 	else if (!strcmp(arch, "s390"))
 		return &callbacks_s390;
-	else if (!strcmp(arch, "x86"))
-		return &callbacks_x86;
 
-	pr_err("Fail to get thread callbacks for arch %s, returns NULL\n",
-	       arch);
-	return NULL;
+	return &callbacks_generic;
 }
 
 static int
@@ -301,6 +356,7 @@ int unwind__get_entries(unwind_entry_cb_t cb, void *arg,
 			bool best_effort)
 {
 	struct machine *machine = maps__machine(thread__maps(thread));
+	uint16_t e_machine = thread__e_machine(thread, machine);
 	struct unwind_info *ui, ui_buf = {
 		.sample		= data,
 		.thread		= thread,
@@ -308,9 +364,9 @@ int unwind__get_entries(unwind_entry_cb_t cb, void *arg,
 		.cb		= cb,
 		.arg		= arg,
 		.max_stack	= max_stack,
+		.e_machine	= e_machine,
 		.best_effort    = best_effort
 	};
-	uint16_t e_machine = thread__e_machine(thread, machine);
 	const char *arch = perf_env__arch(machine->env);
 	Dwarf_Word ip;
 	int err = -EINVAL, i;
diff --git a/tools/perf/util/unwind-libdw.h b/tools/perf/util/unwind-libdw.h
index 574b29848cce..496e5898e7ef 100644
--- a/tools/perf/util/unwind-libdw.h
+++ b/tools/perf/util/unwind-libdw.h
@@ -9,7 +9,6 @@ struct machine;
 struct perf_sample;
 struct thread;
 
-bool libdw_set_initial_registers_x86(Dwfl_Thread *thread, void *arg);
 bool libdw_set_initial_registers_arm(Dwfl_Thread *thread, void *arg);
 bool libdw_set_initial_registers_arm64(Dwfl_Thread *thread, void *arg);
 bool libdw_set_initial_registers_csky(Dwfl_Thread *thread, void *arg);
@@ -28,6 +27,7 @@ struct unwind_info {
 	void			*arg;
 	int			max_stack;
 	int			idx;
+	uint16_t		e_machine;
 	bool			best_effort;
 	struct unwind_entry	entries[];
 };
-- 
cgit v1.2.3


From cf7c7f12042b9e9dfb7e63c9c3180b6af1860b2b Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Fri, 16 Jan 2026 21:28:39 -0800
Subject: perf dwarf-regs: Add basic get_dwarf_regnum() for most architectures
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a basic get_dwarf_regnum() implementation for most architectures by
using the get_dwarf_regstr() tables and returning the index of the name
within the table.

Some minor name and constification clean up for csky.

Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Aditya Bodkhe <aditya.b1@linux.ibm.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Alexandre Ghiti <alex@ghiti.fr>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Athira Rajeev <atrajeev@linux.ibm.com>
Cc: Chun-Tse Shao <ctshao@google.com>
Cc: Dmitriy Vyukov <dvyukov@google.com>
Cc: Dr. David Alan Gilbert <linux@treblig.org>
Cc: Guo Ren <guoren@kernel.org>
Cc: Haibo Xu <haibo1.xu@intel.com>
Cc: Howard Chu <howardchu95@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Krzysztof Łopatowski <krzysztof.m.lopatowski@gmail.com>
Cc: Leo Yan <leo.yan@linux.dev>
Cc: Mark Wielaard <mark@klomp.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <pjw@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sergei Trofimovich <slyich@gmail.com>
Cc: Shimin Guo <shimin.guo@skydio.com>
Cc: Stephen Brennan <stephen.s.brennan@oracle.com>
Cc: Thomas Falcon <thomas.falcon@intel.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/dwarf-regs-arch/dwarf-regs-csky.c | 24 ++++++++--
 tools/perf/util/dwarf-regs.c                      | 58 +++++++++++++++++++++--
 tools/perf/util/include/dwarf-regs.h              |  5 +-
 3 files changed, 78 insertions(+), 9 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/dwarf-regs-arch/dwarf-regs-csky.c b/tools/perf/util/dwarf-regs-arch/dwarf-regs-csky.c
index d38ef1f07f3e..86394ed46397 100644
--- a/tools/perf/util/dwarf-regs-arch/dwarf-regs-csky.c
+++ b/tools/perf/util/dwarf-regs-arch/dwarf-regs-csky.c
@@ -2,11 +2,12 @@
 // Copyright (C) 2019 Hangzhou C-SKY Microsystems co.,ltd.
 // Mapping of DWARF debug register numbers into register names.
 
+#include <errno.h>
 #include <stddef.h>
 #include <dwarf-regs.h>
 
 #define CSKY_ABIV2_MAX_REGS 73
-const char *csky_dwarf_regs_table_abiv2[CSKY_ABIV2_MAX_REGS] = {
+static const char * const csky_dwarf_regs_table_abiv2[CSKY_ABIV2_MAX_REGS] = {
 	/* r0 ~ r8 */
 	"%a0", "%a1", "%a2", "%a3", "%regs0", "%regs1", "%regs2", "%regs3",
 	/* r9 ~ r15 */
@@ -27,7 +28,7 @@ const char *csky_dwarf_regs_table_abiv2[CSKY_ABIV2_MAX_REGS] = {
 };
 
 #define CSKY_ABIV1_MAX_REGS 57
-const char *csky_dwarf_regs_table_abiv1[CSKY_ABIV1_MAX_REGS] = {
+static const char * const csky_dwarf_regs_table_abiv1[CSKY_ABIV1_MAX_REGS] = {
 	/* r0 ~ r8 */
 	"%sp", "%regs9", "%a0", "%a1", "%a2", "%a3", "%regs0", "%regs1",
 	/* r9 ~ r15 */
@@ -41,10 +42,27 @@ const char *csky_dwarf_regs_table_abiv1[CSKY_ABIV1_MAX_REGS] = {
 	"%epc",
 };
 
-const char *get_csky_regstr(unsigned int n, unsigned int flags)
+const char *__get_csky_regstr(unsigned int n, unsigned int flags)
 {
 	if (flags & EF_CSKY_ABIV2)
 		return (n < CSKY_ABIV2_MAX_REGS) ? csky_dwarf_regs_table_abiv2[n] : NULL;
 
 	return (n < CSKY_ABIV1_MAX_REGS) ? csky_dwarf_regs_table_abiv1[n] : NULL;
 }
+
+static int __get_dwarf_regnum(const char *const *regstr, size_t num_regstr, const char *name)
+{
+	for (size_t i = 0; i < num_regstr; i++) {
+		if (regstr[i] && !strcmp(regstr[i], name))
+			return i;
+	}
+	return -ENOENT;
+}
+
+int __get_csky_regnum(const char *name, unsigned int flags)
+{
+	if (flags & EF_CSKY_ABIV2)
+		return __get_dwarf_regnum(csky_dwarf_regs_table_abiv2, CSKY_ABIV2_MAX_REGS, name);
+
+	return __get_dwarf_regnum(csky_dwarf_regs_table_abiv1, CSKY_ABIV1_MAX_REGS, name);
+}
diff --git a/tools/perf/util/dwarf-regs.c b/tools/perf/util/dwarf-regs.c
index 1f7d892612df..dffa0c8bdd14 100644
--- a/tools/perf/util/dwarf-regs.c
+++ b/tools/perf/util/dwarf-regs.c
@@ -27,11 +27,11 @@
 #include "../arch/mips/include/dwarf-regs-table.h"
 #include "../arch/loongarch/include/dwarf-regs-table.h"
 
-#define __get_dwarf_regstr(tbl, n) (((n) < ARRAY_SIZE(tbl)) ? (tbl)[(n)] : NULL)
-
 /* Return architecture dependent register string (for kprobe-tracer) */
 const char *get_dwarf_regstr(unsigned int n, unsigned int machine, unsigned int flags)
 {
+	#define __get_dwarf_regstr(tbl, n) (((n) < ARRAY_SIZE(tbl)) ? (tbl)[(n)] : NULL)
+
 	if (machine == EM_NONE) {
 		/* Generic arch - use host arch */
 		machine = EM_HOST;
@@ -46,7 +46,7 @@ const char *get_dwarf_regstr(unsigned int n, unsigned int machine, unsigned int
 	case EM_AARCH64:
 		return __get_dwarf_regstr(aarch64_regstr_tbl, n);
 	case EM_CSKY:
-		return get_csky_regstr(n, flags);
+		return __get_csky_regstr(n, flags);
 	case EM_SH:
 		return __get_dwarf_regstr(sh_regstr_tbl, n);
 	case EM_S390:
@@ -69,15 +69,28 @@ const char *get_dwarf_regstr(unsigned int n, unsigned int machine, unsigned int
 		pr_err("ELF MACHINE %x is not supported.\n", machine);
 	}
 	return NULL;
+
+	#undef __get_dwarf_regstr
+}
+
+static int __get_dwarf_regnum(const char *const *regstr, size_t num_regstr, const char *name)
+{
+	for (size_t i = 0; i < num_regstr; i++) {
+		if (regstr[i] && !strcmp(regstr[i], name))
+			return i;
+	}
+	return -ENOENT;
 }
 
 /* Return DWARF register number from architecture register name */
-int get_dwarf_regnum(const char *name, unsigned int machine, unsigned int flags __maybe_unused)
+int get_dwarf_regnum(const char *name, unsigned int machine, unsigned int flags)
 {
 	char *regname = strdup(name);
 	int reg = -1;
 	char *p;
 
+	#define _get_dwarf_regnum(tbl, name) __get_dwarf_regnum(tbl, ARRAY_SIZE(tbl), name)
+
 	if (regname == NULL)
 		return -EINVAL;
 
@@ -97,11 +110,48 @@ int get_dwarf_regnum(const char *name, unsigned int machine, unsigned int flags
 	case EM_386:
 		reg = __get_dwarf_regnum_i386(name);
 		break;
+	case EM_ARM:
+		reg = _get_dwarf_regnum(arm_regstr_tbl, name);
+		break;
+	case EM_AARCH64:
+		reg = _get_dwarf_regnum(aarch64_regstr_tbl, name);
+		break;
+	case EM_CSKY:
+		reg = __get_csky_regnum(name, flags);
+		break;
+	case EM_SH:
+		reg = _get_dwarf_regnum(sh_regstr_tbl, name);
+		break;
+	case EM_S390:
+		reg = _get_dwarf_regnum(s390_regstr_tbl, name);
+		break;
+	case EM_PPC:
+	case EM_PPC64:
+		reg = _get_dwarf_regnum(powerpc_regstr_tbl, name);
+		break;
+	case EM_RISCV:
+		reg = _get_dwarf_regnum(riscv_regstr_tbl, name);
+		break;
+	case EM_SPARC:
+	case EM_SPARCV9:
+		reg = _get_dwarf_regnum(sparc_regstr_tbl, name);
+		break;
+	case EM_XTENSA:
+		reg = _get_dwarf_regnum(xtensa_regstr_tbl, name);
+		break;
+	case EM_MIPS:
+		reg = _get_dwarf_regnum(mips_regstr_tbl, name);
+		break;
+	case EM_LOONGARCH:
+		reg = _get_dwarf_regnum(loongarch_regstr_tbl, name);
+		break;
 	default:
 		pr_err("ELF MACHINE %x is not supported.\n", machine);
 	}
 	free(regname);
 	return reg;
+
+	#undef _get_dwarf_regnum
 }
 
 static int get_libdw_frame_nregs(unsigned int machine, unsigned int flags __maybe_unused)
diff --git a/tools/perf/util/include/dwarf-regs.h b/tools/perf/util/include/dwarf-regs.h
index 00881f1d45d6..a120c97a5fac 100644
--- a/tools/perf/util/include/dwarf-regs.h
+++ b/tools/perf/util/include/dwarf-regs.h
@@ -89,8 +89,6 @@
 #define DWARF_REG_FB  0xd3affb /* random number */
 
 #ifdef HAVE_LIBDW_SUPPORT
-const char *get_csky_regstr(unsigned int n, unsigned int flags);
-
 /**
  * get_dwarf_regstr() - Returns ftrace register string from DWARF regnum.
  * @n: DWARF register number.
@@ -99,6 +97,9 @@ const char *get_csky_regstr(unsigned int n, unsigned int flags);
  */
 const char *get_dwarf_regstr(unsigned int n, unsigned int machine, unsigned int flags);
 
+const char *__get_csky_regstr(unsigned int n, unsigned int flags);
+int __get_csky_regnum(const char *name, unsigned int flags);
+
 int __get_dwarf_regnum_i386(const char *name);
 int __get_dwarf_regnum_x86_64(const char *name);
 int __get_dwarf_regnum_for_perf_regnum_i386(int perf_regnum);
-- 
cgit v1.2.3


From 8b863e70e2be6c256201d2297735a2a4bf1acf75 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Fri, 16 Jan 2026 21:28:40 -0800
Subject: perf dwarf-regs: Add ARM perf to dwarf register number mapping
 functions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

These functions allow the generic initial register state code in
unwind-libdw to be used.

Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Aditya Bodkhe <aditya.b1@linux.ibm.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Alexandre Ghiti <alex@ghiti.fr>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Athira Rajeev <atrajeev@linux.ibm.com>
Cc: Chun-Tse Shao <ctshao@google.com>
Cc: Dmitriy Vyukov <dvyukov@google.com>
Cc: Dr. David Alan Gilbert <linux@treblig.org>
Cc: Guo Ren <guoren@kernel.org>
Cc: Haibo Xu <haibo1.xu@intel.com>
Cc: Howard Chu <howardchu95@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Krzysztof Łopatowski <krzysztof.m.lopatowski@gmail.com>
Cc: Leo Yan <leo.yan@linux.dev>
Cc: Mark Wielaard <mark@klomp.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <pjw@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sergei Trofimovich <slyich@gmail.com>
Cc: Shimin Guo <shimin.guo@skydio.com>
Cc: Stephen Brennan <stephen.s.brennan@oracle.com>
Cc: Thomas Falcon <thomas.falcon@intel.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/dwarf-regs-arch/Build              |  2 +
 tools/perf/util/dwarf-regs-arch/dwarf-regs-arm.c   | 12 +++++
 tools/perf/util/dwarf-regs-arch/dwarf-regs-arm64.c | 12 +++++
 tools/perf/util/dwarf-regs.c                       |  6 +++
 tools/perf/util/include/dwarf-regs.h               |  3 ++
 tools/perf/util/unwind-libdw-arch/Build            |  2 -
 .../perf/util/unwind-libdw-arch/unwind-libdw-arm.c | 39 --------------
 .../util/unwind-libdw-arch/unwind-libdw-arm64.c    | 61 ----------------------
 tools/perf/util/unwind-libdw.c                     |  8 +--
 tools/perf/util/unwind-libdw.h                     |  2 -
 10 files changed, 36 insertions(+), 111 deletions(-)
 create mode 100644 tools/perf/util/dwarf-regs-arch/dwarf-regs-arm.c
 create mode 100644 tools/perf/util/dwarf-regs-arch/dwarf-regs-arm64.c
 delete mode 100644 tools/perf/util/unwind-libdw-arch/unwind-libdw-arm.c
 delete mode 100644 tools/perf/util/unwind-libdw-arch/unwind-libdw-arm64.c

(limited to 'tools')

diff --git a/tools/perf/util/dwarf-regs-arch/Build b/tools/perf/util/dwarf-regs-arch/Build
index 98bec0032606..3f19a9ec47c7 100644
--- a/tools/perf/util/dwarf-regs-arch/Build
+++ b/tools/perf/util/dwarf-regs-arch/Build
@@ -1,3 +1,5 @@
+perf-util-$(CONFIG_LIBDW) += dwarf-regs-arm64.o
+perf-util-$(CONFIG_LIBDW) += dwarf-regs-arm.o
 perf-util-$(CONFIG_LIBDW) += dwarf-regs-csky.o
 perf-util-$(CONFIG_LIBDW) += dwarf-regs-powerpc.o
 perf-util-$(CONFIG_LIBDW) += dwarf-regs-x86.o
diff --git a/tools/perf/util/dwarf-regs-arch/dwarf-regs-arm.c b/tools/perf/util/dwarf-regs-arch/dwarf-regs-arm.c
new file mode 100644
index 000000000000..42c6c0635612
--- /dev/null
+++ b/tools/perf/util/dwarf-regs-arch/dwarf-regs-arm.c
@@ -0,0 +1,12 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <errno.h>
+#include <dwarf-regs.h>
+#include "../../../arch/arm/include/uapi/asm/perf_regs.h"
+
+int __get_dwarf_regnum_for_perf_regnum_arm(int perf_regnum)
+{
+	if (perf_regnum < 0 || perf_regnum >= PERF_REG_ARM_MAX)
+		return -ENOENT;
+
+	return perf_regnum;
+}
diff --git a/tools/perf/util/dwarf-regs-arch/dwarf-regs-arm64.c b/tools/perf/util/dwarf-regs-arch/dwarf-regs-arm64.c
new file mode 100644
index 000000000000..593ca7d4fccc
--- /dev/null
+++ b/tools/perf/util/dwarf-regs-arch/dwarf-regs-arm64.c
@@ -0,0 +1,12 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <errno.h>
+#include <dwarf-regs.h>
+#include "../../../arch/arm64/include/uapi/asm/perf_regs.h"
+
+int __get_dwarf_regnum_for_perf_regnum_arm64(int perf_regnum)
+{
+	if (perf_regnum < 0 || perf_regnum >= PERF_REG_ARM64_MAX)
+		return -ENOENT;
+
+	return perf_regnum;
+}
diff --git a/tools/perf/util/dwarf-regs.c b/tools/perf/util/dwarf-regs.c
index dffa0c8bdd14..c472ec5e4d1a 100644
--- a/tools/perf/util/dwarf-regs.c
+++ b/tools/perf/util/dwarf-regs.c
@@ -196,6 +196,12 @@ int get_dwarf_regnum_for_perf_regnum(int perf_regnum, unsigned int machine,
 	case EM_386:
 		reg = __get_dwarf_regnum_for_perf_regnum_i386(perf_regnum);
 		break;
+	case EM_ARM:
+		reg = __get_dwarf_regnum_for_perf_regnum_arm(perf_regnum);
+		break;
+	case EM_AARCH64:
+		reg = __get_dwarf_regnum_for_perf_regnum_arm64(perf_regnum);
+		break;
 	default:
 		pr_err("ELF MACHINE %x is not supported.\n", machine);
 		return -ENOENT;
diff --git a/tools/perf/util/include/dwarf-regs.h b/tools/perf/util/include/dwarf-regs.h
index a120c97a5fac..a52df8d1b138 100644
--- a/tools/perf/util/include/dwarf-regs.h
+++ b/tools/perf/util/include/dwarf-regs.h
@@ -105,6 +105,9 @@ int __get_dwarf_regnum_x86_64(const char *name);
 int __get_dwarf_regnum_for_perf_regnum_i386(int perf_regnum);
 int __get_dwarf_regnum_for_perf_regnum_x86_64(int perf_regnum);
 
+int __get_dwarf_regnum_for_perf_regnum_arm(int perf_regnum);
+int __get_dwarf_regnum_for_perf_regnum_arm64(int perf_regnum);
+
 /*
  * get_dwarf_regnum - Returns DWARF regnum from register name
  * name: architecture register name
diff --git a/tools/perf/util/unwind-libdw-arch/Build b/tools/perf/util/unwind-libdw-arch/Build
index 5b5682029953..79c3bbdc2dee 100644
--- a/tools/perf/util/unwind-libdw-arch/Build
+++ b/tools/perf/util/unwind-libdw-arch/Build
@@ -1,5 +1,3 @@
-perf-util-y += unwind-libdw-arm.o
-perf-util-y += unwind-libdw-arm64.o
 perf-util-y += unwind-libdw-csky.o
 perf-util-y += unwind-libdw-loongarch.o
 perf-util-y += unwind-libdw-powerpc.o
diff --git a/tools/perf/util/unwind-libdw-arch/unwind-libdw-arm.c b/tools/perf/util/unwind-libdw-arch/unwind-libdw-arm.c
deleted file mode 100644
index 56e9b5975bcc..000000000000
--- a/tools/perf/util/unwind-libdw-arch/unwind-libdw-arm.c
+++ /dev/null
@@ -1,39 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <elfutils/libdwfl.h>
-#include "../arch/arm/include/uapi/asm/perf_regs.h"
-#include "util/unwind-libdw.h"
-#include "util/perf_regs.h"
-#include "util/sample.h"
-
-bool libdw_set_initial_registers_arm(Dwfl_Thread *thread, void *arg)
-{
-	struct unwind_info *ui = arg;
-	struct regs_dump *user_regs = perf_sample__user_regs(ui->sample);
-	Dwarf_Word dwarf_regs[PERF_REG_ARM_MAX];
-
-#define REG(r) ({						\
-	Dwarf_Word val = 0;					\
-	perf_reg_value(&val, user_regs, PERF_REG_ARM_##r);	\
-	val;							\
-})
-
-	dwarf_regs[0]  = REG(R0);
-	dwarf_regs[1]  = REG(R1);
-	dwarf_regs[2]  = REG(R2);
-	dwarf_regs[3]  = REG(R3);
-	dwarf_regs[4]  = REG(R4);
-	dwarf_regs[5]  = REG(R5);
-	dwarf_regs[6]  = REG(R6);
-	dwarf_regs[7]  = REG(R7);
-	dwarf_regs[8]  = REG(R8);
-	dwarf_regs[9]  = REG(R9);
-	dwarf_regs[10] = REG(R10);
-	dwarf_regs[11] = REG(FP);
-	dwarf_regs[12] = REG(IP);
-	dwarf_regs[13] = REG(SP);
-	dwarf_regs[14] = REG(LR);
-	dwarf_regs[15] = REG(PC);
-
-	return dwfl_thread_state_registers(thread, 0, PERF_REG_ARM_MAX,
-					   dwarf_regs);
-}
diff --git a/tools/perf/util/unwind-libdw-arch/unwind-libdw-arm64.c b/tools/perf/util/unwind-libdw-arch/unwind-libdw-arm64.c
deleted file mode 100644
index 29b6833e036c..000000000000
--- a/tools/perf/util/unwind-libdw-arch/unwind-libdw-arm64.c
+++ /dev/null
@@ -1,61 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <elfutils/libdwfl.h>
-#include "../arch/arm64/include/uapi/asm/perf_regs.h"
-#include "util/unwind-libdw.h"
-#include "util/perf_regs.h"
-#include "util/sample.h"
-
-bool libdw_set_initial_registers_arm64(Dwfl_Thread *thread, void *arg)
-{
-	struct unwind_info *ui = arg;
-	struct regs_dump *user_regs = perf_sample__user_regs(ui->sample);
-	Dwarf_Word dwarf_regs[PERF_REG_ARM64_MAX], dwarf_pc;
-
-#define REG(r) ({						\
-	Dwarf_Word val = 0;					\
-	perf_reg_value(&val, user_regs, PERF_REG_ARM64_##r);	\
-	val;							\
-})
-
-	dwarf_regs[0]  = REG(X0);
-	dwarf_regs[1]  = REG(X1);
-	dwarf_regs[2]  = REG(X2);
-	dwarf_regs[3]  = REG(X3);
-	dwarf_regs[4]  = REG(X4);
-	dwarf_regs[5]  = REG(X5);
-	dwarf_regs[6]  = REG(X6);
-	dwarf_regs[7]  = REG(X7);
-	dwarf_regs[8]  = REG(X8);
-	dwarf_regs[9]  = REG(X9);
-	dwarf_regs[10] = REG(X10);
-	dwarf_regs[11] = REG(X11);
-	dwarf_regs[12] = REG(X12);
-	dwarf_regs[13] = REG(X13);
-	dwarf_regs[14] = REG(X14);
-	dwarf_regs[15] = REG(X15);
-	dwarf_regs[16] = REG(X16);
-	dwarf_regs[17] = REG(X17);
-	dwarf_regs[18] = REG(X18);
-	dwarf_regs[19] = REG(X19);
-	dwarf_regs[20] = REG(X20);
-	dwarf_regs[21] = REG(X21);
-	dwarf_regs[22] = REG(X22);
-	dwarf_regs[23] = REG(X23);
-	dwarf_regs[24] = REG(X24);
-	dwarf_regs[25] = REG(X25);
-	dwarf_regs[26] = REG(X26);
-	dwarf_regs[27] = REG(X27);
-	dwarf_regs[28] = REG(X28);
-	dwarf_regs[29] = REG(X29);
-	dwarf_regs[30] = REG(LR);
-	dwarf_regs[31] = REG(SP);
-
-	if (!dwfl_thread_state_registers(thread, 0, PERF_REG_ARM64_MAX,
-					 dwarf_regs))
-		return false;
-
-	dwarf_pc = REG(PC);
-	dwfl_thread_state_register_pc(thread, dwarf_pc);
-
-	return true;
-}
diff --git a/tools/perf/util/unwind-libdw.c b/tools/perf/util/unwind-libdw.c
index 055dab921442..8f291f9f9469 100644
--- a/tools/perf/util/unwind-libdw.c
+++ b/tools/perf/util/unwind-libdw.c
@@ -292,8 +292,6 @@ static const Dwfl_Thread_Callbacks callbacks_generic = {
 	.set_initial_registers = libdw_set_initial_registers_generic,
 };
 
-DEFINE_DWFL_THREAD_CALLBACKS(arm);
-DEFINE_DWFL_THREAD_CALLBACKS(arm64);
 DEFINE_DWFL_THREAD_CALLBACKS(csky);
 DEFINE_DWFL_THREAD_CALLBACKS(loongarch);
 DEFINE_DWFL_THREAD_CALLBACKS(powerpc);
@@ -302,11 +300,7 @@ DEFINE_DWFL_THREAD_CALLBACKS(s390);
 
 static const Dwfl_Thread_Callbacks *get_thread_callbacks(const char *arch)
 {
-	if (!strcmp(arch, "arm"))
-		return &callbacks_arm;
-	else if (!strcmp(arch, "arm64"))
-		return &callbacks_arm64;
-	else if (!strcmp(arch, "csky"))
+	if (!strcmp(arch, "csky"))
 		return &callbacks_csky;
 	else if (!strcmp(arch, "loongarch"))
 		return &callbacks_loongarch;
diff --git a/tools/perf/util/unwind-libdw.h b/tools/perf/util/unwind-libdw.h
index 496e5898e7ef..fe3ae2a768ad 100644
--- a/tools/perf/util/unwind-libdw.h
+++ b/tools/perf/util/unwind-libdw.h
@@ -9,8 +9,6 @@ struct machine;
 struct perf_sample;
 struct thread;
 
-bool libdw_set_initial_registers_arm(Dwfl_Thread *thread, void *arg);
-bool libdw_set_initial_registers_arm64(Dwfl_Thread *thread, void *arg);
 bool libdw_set_initial_registers_csky(Dwfl_Thread *thread, void *arg);
 bool libdw_set_initial_registers_loongarch(Dwfl_Thread *thread, void *arg);
 bool libdw_set_initial_registers_mips(Dwfl_Thread *thread, void *arg);
-- 
cgit v1.2.3


From 8cac4013b0c23739ccbce19f74c1b572eba050d2 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Fri, 16 Jan 2026 21:28:41 -0800
Subject: perf dwarf-regs: Add csky perf to dwarf register number mapping
 functions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

These functions allow the generic initial register state code in
unwind-libdw to be used.

Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Aditya Bodkhe <aditya.b1@linux.ibm.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Alexandre Ghiti <alex@ghiti.fr>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Athira Rajeev <atrajeev@linux.ibm.com>
Cc: Chun-Tse Shao <ctshao@google.com>
Cc: Dmitriy Vyukov <dvyukov@google.com>
Cc: Dr. David Alan Gilbert <linux@treblig.org>
Cc: Guo Ren <guoren@kernel.org>
Cc: Haibo Xu <haibo1.xu@intel.com>
Cc: Howard Chu <howardchu95@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Krzysztof Łopatowski <krzysztof.m.lopatowski@gmail.com>
Cc: Leo Yan <leo.yan@linux.dev>
Cc: Mark Wielaard <mark@klomp.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <pjw@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sergei Trofimovich <slyich@gmail.com>
Cc: Shimin Guo <shimin.guo@skydio.com>
Cc: Stephen Brennan <stephen.s.brennan@oracle.com>
Cc: Thomas Falcon <thomas.falcon@intel.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/dwarf-regs-arch/dwarf-regs-csky.c  | 58 ++++++++++++++++
 tools/perf/util/dwarf-regs.c                       |  3 +
 tools/perf/util/include/dwarf-regs.h               |  2 +
 tools/perf/util/unwind-libdw-arch/Build            |  1 -
 .../util/unwind-libdw-arch/unwind-libdw-csky.c     | 78 ----------------------
 tools/perf/util/unwind-libdw.c                     |  5 +-
 tools/perf/util/unwind-libdw.h                     |  1 -
 7 files changed, 64 insertions(+), 84 deletions(-)
 delete mode 100644 tools/perf/util/unwind-libdw-arch/unwind-libdw-csky.c

(limited to 'tools')

diff --git a/tools/perf/util/dwarf-regs-arch/dwarf-regs-csky.c b/tools/perf/util/dwarf-regs-arch/dwarf-regs-csky.c
index 86394ed46397..cb44b774f8d9 100644
--- a/tools/perf/util/dwarf-regs-arch/dwarf-regs-csky.c
+++ b/tools/perf/util/dwarf-regs-arch/dwarf-regs-csky.c
@@ -5,6 +5,10 @@
 #include <errno.h>
 #include <stddef.h>
 #include <dwarf-regs.h>
+// Ensure the V2 perf reg definitions are included.
+#undef __CSKYABIV2__
+#define __CSKYABIV2__ 1
+#include "../../../arch/csky/include/uapi/asm/perf_regs.h"
 
 #define CSKY_ABIV2_MAX_REGS 73
 static const char * const csky_dwarf_regs_table_abiv2[CSKY_ABIV2_MAX_REGS] = {
@@ -66,3 +70,57 @@ int __get_csky_regnum(const char *name, unsigned int flags)
 
 	return __get_dwarf_regnum(csky_dwarf_regs_table_abiv1, CSKY_ABIV1_MAX_REGS, name);
 }
+
+int __get_dwarf_regnum_for_perf_regnum_csky(int perf_regnum, unsigned int flags)
+{
+	static const int dwarf_csky_regnums[][2] = {
+		[PERF_REG_CSKY_TLS] = {-ENOENT, 31},
+		[PERF_REG_CSKY_LR] = {15, 15},
+		[PERF_REG_CSKY_PC] = {-ENOENT, 32},
+		/* TODO: PERF_REG_CSKY_SR */
+		[PERF_REG_CSKY_SP] = {0, 14},
+		/* TODO: PERF_REG_CSKY_ORIG_A0 */
+		[PERF_REG_CSKY_A0] = {2, 0},
+		[PERF_REG_CSKY_A1] = {3, 1},
+		[PERF_REG_CSKY_A2] = {4, 2},
+		[PERF_REG_CSKY_A3] = {5, 3},
+		[PERF_REG_CSKY_REGS0] = {6, 4},
+		[PERF_REG_CSKY_REGS1] = {7, 5},
+		[PERF_REG_CSKY_REGS2] = {8, 6},
+		[PERF_REG_CSKY_REGS3] = {9, 7},
+		[PERF_REG_CSKY_REGS4] = {10, 8},
+		[PERF_REG_CSKY_REGS5] = {11, 9},
+		[PERF_REG_CSKY_REGS6] = {12, 10},
+		[PERF_REG_CSKY_REGS7] = {13, 11},
+		[PERF_REG_CSKY_REGS8] = {14, 12},
+		[PERF_REG_CSKY_REGS9] = {1, 13},
+		[PERF_REG_CSKY_EXREGS0] = {-ENOENT, 16},
+		[PERF_REG_CSKY_EXREGS1] = {-ENOENT, 17},
+		[PERF_REG_CSKY_EXREGS2] = {-ENOENT, 18},
+		[PERF_REG_CSKY_EXREGS3] = {-ENOENT, 19},
+		[PERF_REG_CSKY_EXREGS4] = {-ENOENT, 20},
+		[PERF_REG_CSKY_EXREGS5] = {-ENOENT, 21},
+		[PERF_REG_CSKY_EXREGS6] = {-ENOENT, 22},
+		[PERF_REG_CSKY_EXREGS7] = {-ENOENT, 23},
+		[PERF_REG_CSKY_EXREGS8] = {-ENOENT, 24},
+		[PERF_REG_CSKY_EXREGS9] = {-ENOENT, 25},
+		[PERF_REG_CSKY_EXREGS10] = {-ENOENT, 26},
+		[PERF_REG_CSKY_EXREGS11] = {-ENOENT, 27},
+		[PERF_REG_CSKY_EXREGS12] = {-ENOENT, 28},
+		[PERF_REG_CSKY_EXREGS13] = {-ENOENT, 29},
+		[PERF_REG_CSKY_EXREGS14] = {-ENOENT, 30},
+		/* TODO: PERF_REG_CSKY_HI */
+		/* TODO: PERF_REG_CSKY_LO */
+		/* TODO: PERF_REG_CSKY_DCSR */
+	};
+	int idx = 0;
+
+	if (flags & EF_CSKY_ABIV2)
+		idx++;
+
+	if (perf_regnum <  0 || perf_regnum > (int)ARRAY_SIZE(dwarf_csky_regnums) ||
+	    dwarf_csky_regnums[perf_regnum][idx] == 0)
+		return -ENOENT;
+
+	return dwarf_csky_regnums[perf_regnum][idx];
+}
diff --git a/tools/perf/util/dwarf-regs.c b/tools/perf/util/dwarf-regs.c
index c472ec5e4d1a..7fa0930fd298 100644
--- a/tools/perf/util/dwarf-regs.c
+++ b/tools/perf/util/dwarf-regs.c
@@ -202,6 +202,9 @@ int get_dwarf_regnum_for_perf_regnum(int perf_regnum, unsigned int machine,
 	case EM_AARCH64:
 		reg = __get_dwarf_regnum_for_perf_regnum_arm64(perf_regnum);
 		break;
+	case EM_CSKY:
+		reg = __get_dwarf_regnum_for_perf_regnum_csky(perf_regnum, flags);
+		break;
 	default:
 		pr_err("ELF MACHINE %x is not supported.\n", machine);
 		return -ENOENT;
diff --git a/tools/perf/util/include/dwarf-regs.h b/tools/perf/util/include/dwarf-regs.h
index a52df8d1b138..7780bc07e70e 100644
--- a/tools/perf/util/include/dwarf-regs.h
+++ b/tools/perf/util/include/dwarf-regs.h
@@ -108,6 +108,8 @@ int __get_dwarf_regnum_for_perf_regnum_x86_64(int perf_regnum);
 int __get_dwarf_regnum_for_perf_regnum_arm(int perf_regnum);
 int __get_dwarf_regnum_for_perf_regnum_arm64(int perf_regnum);
 
+int __get_dwarf_regnum_for_perf_regnum_csky(int perf_regnum, unsigned int flags);
+
 /*
  * get_dwarf_regnum - Returns DWARF regnum from register name
  * name: architecture register name
diff --git a/tools/perf/util/unwind-libdw-arch/Build b/tools/perf/util/unwind-libdw-arch/Build
index 79c3bbdc2dee..5fa1754fca8d 100644
--- a/tools/perf/util/unwind-libdw-arch/Build
+++ b/tools/perf/util/unwind-libdw-arch/Build
@@ -1,4 +1,3 @@
-perf-util-y += unwind-libdw-csky.o
 perf-util-y += unwind-libdw-loongarch.o
 perf-util-y += unwind-libdw-powerpc.o
 perf-util-y += unwind-libdw-riscv.o
diff --git a/tools/perf/util/unwind-libdw-arch/unwind-libdw-csky.c b/tools/perf/util/unwind-libdw-arch/unwind-libdw-csky.c
deleted file mode 100644
index 2556d034c32a..000000000000
--- a/tools/perf/util/unwind-libdw-arch/unwind-libdw-csky.c
+++ /dev/null
@@ -1,78 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-// Copyright (C) 2019 Hangzhou C-SKY Microsystems co.,ltd.
-
-#include <elfutils/libdwfl.h>
-#include "../arch/csky/include/uapi/asm/perf_regs.h"
-#include "util/unwind-libdw.h"
-#include "util/perf_regs.h"
-#include "util/sample.h"
-
-bool libdw_set_initial_registers_csky(Dwfl_Thread *thread, void *arg)
-{
-	struct unwind_info *ui = arg;
-	struct regs_dump *user_regs = perf_sample__user_regs(ui->sample);
-	Dwarf_Word dwarf_regs[PERF_REG_CSKY_MAX];
-
-#define REG(r) ({						\
-	Dwarf_Word val = 0;					\
-	perf_reg_value(&val, user_regs, PERF_REG_CSKY_##r);	\
-	val;							\
-})
-
-#if defined(__CSKYABIV2__)
-	dwarf_regs[0]  = REG(A0);
-	dwarf_regs[1]  = REG(A1);
-	dwarf_regs[2]  = REG(A2);
-	dwarf_regs[3]  = REG(A3);
-	dwarf_regs[4]  = REG(REGS0);
-	dwarf_regs[5]  = REG(REGS1);
-	dwarf_regs[6]  = REG(REGS2);
-	dwarf_regs[7]  = REG(REGS3);
-	dwarf_regs[8]  = REG(REGS4);
-	dwarf_regs[9]  = REG(REGS5);
-	dwarf_regs[10] = REG(REGS6);
-	dwarf_regs[11] = REG(REGS7);
-	dwarf_regs[12] = REG(REGS8);
-	dwarf_regs[13] = REG(REGS9);
-	dwarf_regs[14] = REG(SP);
-	dwarf_regs[15] = REG(LR);
-	dwarf_regs[16] = REG(EXREGS0);
-	dwarf_regs[17] = REG(EXREGS1);
-	dwarf_regs[18] = REG(EXREGS2);
-	dwarf_regs[19] = REG(EXREGS3);
-	dwarf_regs[20] = REG(EXREGS4);
-	dwarf_regs[21] = REG(EXREGS5);
-	dwarf_regs[22] = REG(EXREGS6);
-	dwarf_regs[23] = REG(EXREGS7);
-	dwarf_regs[24] = REG(EXREGS8);
-	dwarf_regs[25] = REG(EXREGS9);
-	dwarf_regs[26] = REG(EXREGS10);
-	dwarf_regs[27] = REG(EXREGS11);
-	dwarf_regs[28] = REG(EXREGS12);
-	dwarf_regs[29] = REG(EXREGS13);
-	dwarf_regs[30] = REG(EXREGS14);
-	dwarf_regs[31] = REG(TLS);
-	dwarf_regs[32] = REG(PC);
-#else
-	dwarf_regs[0]  = REG(SP);
-	dwarf_regs[1]  = REG(REGS9);
-	dwarf_regs[2]  = REG(A0);
-	dwarf_regs[3]  = REG(A1);
-	dwarf_regs[4]  = REG(A2);
-	dwarf_regs[5]  = REG(A3);
-	dwarf_regs[6]  = REG(REGS0);
-	dwarf_regs[7]  = REG(REGS1);
-	dwarf_regs[8]  = REG(REGS2);
-	dwarf_regs[9]  = REG(REGS3);
-	dwarf_regs[10] = REG(REGS4);
-	dwarf_regs[11] = REG(REGS5);
-	dwarf_regs[12] = REG(REGS6);
-	dwarf_regs[13] = REG(REGS7);
-	dwarf_regs[14] = REG(REGS8);
-	dwarf_regs[15] = REG(LR);
-#endif
-	dwfl_thread_state_register_pc(thread, REG(PC));
-
-	return dwfl_thread_state_registers(thread, 0, PERF_REG_CSKY_MAX,
-					   dwarf_regs);
-}
diff --git a/tools/perf/util/unwind-libdw.c b/tools/perf/util/unwind-libdw.c
index 8f291f9f9469..a193163da707 100644
--- a/tools/perf/util/unwind-libdw.c
+++ b/tools/perf/util/unwind-libdw.c
@@ -292,7 +292,6 @@ static const Dwfl_Thread_Callbacks callbacks_generic = {
 	.set_initial_registers = libdw_set_initial_registers_generic,
 };
 
-DEFINE_DWFL_THREAD_CALLBACKS(csky);
 DEFINE_DWFL_THREAD_CALLBACKS(loongarch);
 DEFINE_DWFL_THREAD_CALLBACKS(powerpc);
 DEFINE_DWFL_THREAD_CALLBACKS(riscv);
@@ -300,9 +299,7 @@ DEFINE_DWFL_THREAD_CALLBACKS(s390);
 
 static const Dwfl_Thread_Callbacks *get_thread_callbacks(const char *arch)
 {
-	if (!strcmp(arch, "csky"))
-		return &callbacks_csky;
-	else if (!strcmp(arch, "loongarch"))
+	if (!strcmp(arch, "loongarch"))
 		return &callbacks_loongarch;
 	else if (!strcmp(arch, "powerpc"))
 		return &callbacks_powerpc;
diff --git a/tools/perf/util/unwind-libdw.h b/tools/perf/util/unwind-libdw.h
index fe3ae2a768ad..ee56f1e827e5 100644
--- a/tools/perf/util/unwind-libdw.h
+++ b/tools/perf/util/unwind-libdw.h
@@ -9,7 +9,6 @@ struct machine;
 struct perf_sample;
 struct thread;
 
-bool libdw_set_initial_registers_csky(Dwfl_Thread *thread, void *arg);
 bool libdw_set_initial_registers_loongarch(Dwfl_Thread *thread, void *arg);
 bool libdw_set_initial_registers_mips(Dwfl_Thread *thread, void *arg);
 bool libdw_set_initial_registers_powerpc(Dwfl_Thread *thread, void *arg);
-- 
cgit v1.2.3


From 1f10d82e6adffd45cf1a59618d1ecc33625a8a37 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Fri, 16 Jan 2026 21:28:42 -0800
Subject: perf dwarf-regs: Add loongarch perf to DWARF register number mapping
 functions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

These functions allow the generic initial register state code in
unwind-libdw to be used.

Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Aditya Bodkhe <aditya.b1@linux.ibm.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Alexandre Ghiti <alex@ghiti.fr>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Athira Rajeev <atrajeev@linux.ibm.com>
Cc: Chun-Tse Shao <ctshao@google.com>
Cc: Dmitriy Vyukov <dvyukov@google.com>
Cc: Dr. David Alan Gilbert <linux@treblig.org>
Cc: Guo Ren <guoren@kernel.org>
Cc: Haibo Xu <haibo1.xu@intel.com>
Cc: Howard Chu <howardchu95@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Krzysztof Łopatowski <krzysztof.m.lopatowski@gmail.com>
Cc: Leo Yan <leo.yan@linux.dev>
Cc: Mark Wielaard <mark@klomp.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <pjw@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sergei Trofimovich <slyich@gmail.com>
Cc: Shimin Guo <shimin.guo@skydio.com>
Cc: Stephen Brennan <stephen.s.brennan@oracle.com>
Cc: Thomas Falcon <thomas.falcon@intel.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/dwarf-regs-arch/Build              |  1 +
 .../util/dwarf-regs-arch/dwarf-regs-loongarch.c    | 12 +++++
 tools/perf/util/dwarf-regs.c                       |  3 ++
 tools/perf/util/include/dwarf-regs.h               |  1 +
 tools/perf/util/unwind-libdw-arch/Build            |  1 -
 .../unwind-libdw-arch/unwind-libdw-loongarch.c     | 57 ----------------------
 tools/perf/util/unwind-libdw.c                     |  5 +-
 tools/perf/util/unwind-libdw.h                     |  1 -
 8 files changed, 18 insertions(+), 63 deletions(-)
 create mode 100644 tools/perf/util/dwarf-regs-arch/dwarf-regs-loongarch.c
 delete mode 100644 tools/perf/util/unwind-libdw-arch/unwind-libdw-loongarch.c

(limited to 'tools')

diff --git a/tools/perf/util/dwarf-regs-arch/Build b/tools/perf/util/dwarf-regs-arch/Build
index 3f19a9ec47c7..188359376ea5 100644
--- a/tools/perf/util/dwarf-regs-arch/Build
+++ b/tools/perf/util/dwarf-regs-arch/Build
@@ -1,5 +1,6 @@
 perf-util-$(CONFIG_LIBDW) += dwarf-regs-arm64.o
 perf-util-$(CONFIG_LIBDW) += dwarf-regs-arm.o
 perf-util-$(CONFIG_LIBDW) += dwarf-regs-csky.o
+perf-util-$(CONFIG_LIBDW) += dwarf-regs-loongarch.o
 perf-util-$(CONFIG_LIBDW) += dwarf-regs-powerpc.o
 perf-util-$(CONFIG_LIBDW) += dwarf-regs-x86.o
diff --git a/tools/perf/util/dwarf-regs-arch/dwarf-regs-loongarch.c b/tools/perf/util/dwarf-regs-arch/dwarf-regs-loongarch.c
new file mode 100644
index 000000000000..203077b740a0
--- /dev/null
+++ b/tools/perf/util/dwarf-regs-arch/dwarf-regs-loongarch.c
@@ -0,0 +1,12 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <errno.h>
+#include <dwarf-regs.h>
+#include "../../../arch/loongarch/include/uapi/asm/perf_regs.h"
+
+int __get_dwarf_regnum_for_perf_regnum_loongarch(int perf_regnum)
+{
+	if (perf_regnum < 0 || perf_regnum >= PERF_REG_LOONGARCH_MAX)
+		return -ENOENT;
+
+	return perf_regnum;
+}
diff --git a/tools/perf/util/dwarf-regs.c b/tools/perf/util/dwarf-regs.c
index 7fa0930fd298..033218f14b36 100644
--- a/tools/perf/util/dwarf-regs.c
+++ b/tools/perf/util/dwarf-regs.c
@@ -205,6 +205,9 @@ int get_dwarf_regnum_for_perf_regnum(int perf_regnum, unsigned int machine,
 	case EM_CSKY:
 		reg = __get_dwarf_regnum_for_perf_regnum_csky(perf_regnum, flags);
 		break;
+	case EM_LOONGARCH:
+		reg = __get_dwarf_regnum_for_perf_regnum_loongarch(perf_regnum);
+		break;
 	default:
 		pr_err("ELF MACHINE %x is not supported.\n", machine);
 		return -ENOENT;
diff --git a/tools/perf/util/include/dwarf-regs.h b/tools/perf/util/include/dwarf-regs.h
index 7780bc07e70e..bec15fb53e73 100644
--- a/tools/perf/util/include/dwarf-regs.h
+++ b/tools/perf/util/include/dwarf-regs.h
@@ -109,6 +109,7 @@ int __get_dwarf_regnum_for_perf_regnum_arm(int perf_regnum);
 int __get_dwarf_regnum_for_perf_regnum_arm64(int perf_regnum);
 
 int __get_dwarf_regnum_for_perf_regnum_csky(int perf_regnum, unsigned int flags);
+int __get_dwarf_regnum_for_perf_regnum_loongarch(int perf_regnum);
 
 /*
  * get_dwarf_regnum - Returns DWARF regnum from register name
diff --git a/tools/perf/util/unwind-libdw-arch/Build b/tools/perf/util/unwind-libdw-arch/Build
index 5fa1754fca8d..62a4cbf2dca8 100644
--- a/tools/perf/util/unwind-libdw-arch/Build
+++ b/tools/perf/util/unwind-libdw-arch/Build
@@ -1,4 +1,3 @@
-perf-util-y += unwind-libdw-loongarch.o
 perf-util-y += unwind-libdw-powerpc.o
 perf-util-y += unwind-libdw-riscv.o
 perf-util-y += unwind-libdw-s390.o
diff --git a/tools/perf/util/unwind-libdw-arch/unwind-libdw-loongarch.c b/tools/perf/util/unwind-libdw-arch/unwind-libdw-loongarch.c
deleted file mode 100644
index 5fca673508be..000000000000
--- a/tools/perf/util/unwind-libdw-arch/unwind-libdw-loongarch.c
+++ /dev/null
@@ -1,57 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2020-2023 Loongson Technology Corporation Limited */
-
-#include <elfutils/libdwfl.h>
-#include "../arch/loongarch/include/uapi/asm/perf_regs.h"
-#include "util/unwind-libdw.h"
-#include "util/perf_regs.h"
-#include "util/sample.h"
-
-bool libdw_set_initial_registers_loongarch(Dwfl_Thread *thread, void *arg)
-{
-	struct unwind_info *ui = arg;
-	struct regs_dump *user_regs = perf_sample__user_regs(ui->sample);
-	Dwarf_Word dwarf_regs[PERF_REG_LOONGARCH_MAX];
-
-#define REG(r) ({							\
-	Dwarf_Word val = 0;						\
-	perf_reg_value(&val, user_regs, PERF_REG_LOONGARCH_##r);	\
-	val;								\
-})
-
-	dwarf_regs[0]  = 0;
-	dwarf_regs[1]  = REG(R1);
-	dwarf_regs[2]  = REG(R2);
-	dwarf_regs[3]  = REG(R3);
-	dwarf_regs[4]  = REG(R4);
-	dwarf_regs[5]  = REG(R5);
-	dwarf_regs[6]  = REG(R6);
-	dwarf_regs[7]  = REG(R7);
-	dwarf_regs[8]  = REG(R8);
-	dwarf_regs[9]  = REG(R9);
-	dwarf_regs[10] = REG(R10);
-	dwarf_regs[11] = REG(R11);
-	dwarf_regs[12] = REG(R12);
-	dwarf_regs[13] = REG(R13);
-	dwarf_regs[14] = REG(R14);
-	dwarf_regs[15] = REG(R15);
-	dwarf_regs[16] = REG(R16);
-	dwarf_regs[17] = REG(R17);
-	dwarf_regs[18] = REG(R18);
-	dwarf_regs[19] = REG(R19);
-	dwarf_regs[20] = REG(R20);
-	dwarf_regs[21] = REG(R21);
-	dwarf_regs[22] = REG(R22);
-	dwarf_regs[23] = REG(R23);
-	dwarf_regs[24] = REG(R24);
-	dwarf_regs[25] = REG(R25);
-	dwarf_regs[26] = REG(R26);
-	dwarf_regs[27] = REG(R27);
-	dwarf_regs[28] = REG(R28);
-	dwarf_regs[29] = REG(R29);
-	dwarf_regs[30] = REG(R30);
-	dwarf_regs[31] = REG(R31);
-	dwfl_thread_state_register_pc(thread, REG(PC));
-
-	return dwfl_thread_state_registers(thread, 0, PERF_REG_LOONGARCH_MAX, dwarf_regs);
-}
diff --git a/tools/perf/util/unwind-libdw.c b/tools/perf/util/unwind-libdw.c
index a193163da707..9c8dad643cd0 100644
--- a/tools/perf/util/unwind-libdw.c
+++ b/tools/perf/util/unwind-libdw.c
@@ -292,16 +292,13 @@ static const Dwfl_Thread_Callbacks callbacks_generic = {
 	.set_initial_registers = libdw_set_initial_registers_generic,
 };
 
-DEFINE_DWFL_THREAD_CALLBACKS(loongarch);
 DEFINE_DWFL_THREAD_CALLBACKS(powerpc);
 DEFINE_DWFL_THREAD_CALLBACKS(riscv);
 DEFINE_DWFL_THREAD_CALLBACKS(s390);
 
 static const Dwfl_Thread_Callbacks *get_thread_callbacks(const char *arch)
 {
-	if (!strcmp(arch, "loongarch"))
-		return &callbacks_loongarch;
-	else if (!strcmp(arch, "powerpc"))
+	if (!strcmp(arch, "powerpc"))
 		return &callbacks_powerpc;
 	else if (!strcmp(arch, "riscv"))
 		return &callbacks_riscv;
diff --git a/tools/perf/util/unwind-libdw.h b/tools/perf/util/unwind-libdw.h
index ee56f1e827e5..9d177d70f15c 100644
--- a/tools/perf/util/unwind-libdw.h
+++ b/tools/perf/util/unwind-libdw.h
@@ -9,7 +9,6 @@ struct machine;
 struct perf_sample;
 struct thread;
 
-bool libdw_set_initial_registers_loongarch(Dwfl_Thread *thread, void *arg);
 bool libdw_set_initial_registers_mips(Dwfl_Thread *thread, void *arg);
 bool libdw_set_initial_registers_powerpc(Dwfl_Thread *thread, void *arg);
 bool libdw_set_initial_registers_riscv(Dwfl_Thread *thread, void *arg);
-- 
cgit v1.2.3


From f005302294601a8fb770c71179a3a13951d125ad Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Fri, 16 Jan 2026 21:28:43 -0800
Subject: perf dwarf-regs: Add powerpc perf to DWARF register number mapping
 functions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

These functions allow the generic initial register state code in
unwind-libdw to be used.

Note, the link register was being coped to DWARF register 65 that the
SysV ABI spec claims is FPSCR. It is corrected here to 108, but this is
unlikely to matter as FPSCR has little to no impact on unwinding.

Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Aditya Bodkhe <aditya.b1@linux.ibm.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Alexandre Ghiti <alex@ghiti.fr>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Athira Rajeev <atrajeev@linux.ibm.com>
Cc: Chun-Tse Shao <ctshao@google.com>
Cc: Dmitriy Vyukov <dvyukov@google.com>
Cc: Dr. David Alan Gilbert <linux@treblig.org>
Cc: Guo Ren <guoren@kernel.org>
Cc: Haibo Xu <haibo1.xu@intel.com>
Cc: Howard Chu <howardchu95@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Krzysztof Łopatowski <krzysztof.m.lopatowski@gmail.com>
Cc: Leo Yan <leo.yan@linux.dev>
Cc: Mark Wielaard <mark@klomp.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <pjw@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sergei Trofimovich <slyich@gmail.com>
Cc: Shimin Guo <shimin.guo@skydio.com>
Cc: Stephen Brennan <stephen.s.brennan@oracle.com>
Cc: Thomas Falcon <thomas.falcon@intel.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 .../perf/util/dwarf-regs-arch/dwarf-regs-powerpc.c | 77 +++++++++++++++++++++-
 tools/perf/util/dwarf-regs.c                       |  4 ++
 tools/perf/util/include/dwarf-regs.h               |  1 +
 tools/perf/util/unwind-libdw-arch/Build            |  1 -
 .../util/unwind-libdw-arch/unwind-libdw-powerpc.c  | 76 ---------------------
 tools/perf/util/unwind-libdw.c                     |  5 +-
 tools/perf/util/unwind-libdw.h                     |  1 -
 7 files changed, 82 insertions(+), 83 deletions(-)
 delete mode 100644 tools/perf/util/unwind-libdw-arch/unwind-libdw-powerpc.c

(limited to 'tools')

diff --git a/tools/perf/util/dwarf-regs-arch/dwarf-regs-powerpc.c b/tools/perf/util/dwarf-regs-arch/dwarf-regs-powerpc.c
index caf77a234c78..51892a09725b 100644
--- a/tools/perf/util/dwarf-regs-arch/dwarf-regs-powerpc.c
+++ b/tools/perf/util/dwarf-regs-arch/dwarf-regs-powerpc.c
@@ -4,8 +4,9 @@
  *
  * Copyright (C) 2010 Ian Munsie, IBM Corporation.
  */
-
+#include <errno.h>
 #include <dwarf-regs.h>
+#include "../../../arch/powerpc/include/uapi/asm/perf_regs.h"
 
 #define PPC_OP(op)	(((op) >> 26) & 0x3F)
 #define PPC_RA(a)	(((a) >> 16) & 0x1f)
@@ -59,3 +60,77 @@ void get_powerpc_regs(u32 raw_insn, int is_source,
 	if ((op_loc->mem_ref) && (PPC_OP(raw_insn) != 31))
 		op_loc->offset = get_offset_opcode(raw_insn);
 }
+
+int __get_dwarf_regnum_for_perf_regnum_powerpc(int perf_regnum)
+{
+	static const int dwarf_powerpc_regnums[] = {
+		[PERF_REG_POWERPC_R0] = 0,
+		[PERF_REG_POWERPC_R1] = 1,
+		[PERF_REG_POWERPC_R2] = 2,
+		[PERF_REG_POWERPC_R3] = 3,
+		[PERF_REG_POWERPC_R4] = 4,
+		[PERF_REG_POWERPC_R5] = 5,
+		[PERF_REG_POWERPC_R6] = 6,
+		[PERF_REG_POWERPC_R7] = 7,
+		[PERF_REG_POWERPC_R8] = 8,
+		[PERF_REG_POWERPC_R9] = 9,
+		[PERF_REG_POWERPC_R10] = 10,
+		[PERF_REG_POWERPC_R11] = 11,
+		[PERF_REG_POWERPC_R12] = 12,
+		[PERF_REG_POWERPC_R13] = 13,
+		[PERF_REG_POWERPC_R14] = 14,
+		[PERF_REG_POWERPC_R15] = 15,
+		[PERF_REG_POWERPC_R16] = 16,
+		[PERF_REG_POWERPC_R17] = 17,
+		[PERF_REG_POWERPC_R18] = 18,
+		[PERF_REG_POWERPC_R19] = 19,
+		[PERF_REG_POWERPC_R20] = 20,
+		[PERF_REG_POWERPC_R21] = 21,
+		[PERF_REG_POWERPC_R22] = 22,
+		[PERF_REG_POWERPC_R23] = 23,
+		[PERF_REG_POWERPC_R24] = 24,
+		[PERF_REG_POWERPC_R25] = 25,
+		[PERF_REG_POWERPC_R26] = 26,
+		[PERF_REG_POWERPC_R27] = 27,
+		[PERF_REG_POWERPC_R28] = 28,
+		[PERF_REG_POWERPC_R29] = 29,
+		[PERF_REG_POWERPC_R30] = 30,
+		[PERF_REG_POWERPC_R31] = 31,
+		/* TODO: PERF_REG_POWERPC_NIP */
+		[PERF_REG_POWERPC_MSR] = 66,
+		/* TODO: PERF_REG_POWERPC_ORIG_R3 */
+		[PERF_REG_POWERPC_CTR] = 109,
+		[PERF_REG_POWERPC_LINK] = 108, /* Note, previously in perf encoded as 65? */
+		[PERF_REG_POWERPC_XER] = 101,
+		/* TODO: PERF_REG_POWERPC_CCR */
+		/* TODO: PERF_REG_POWERPC_SOFTE */
+		/* TODO: PERF_REG_POWERPC_TRAP */
+		/* TODO: PERF_REG_POWERPC_DAR */
+		/* TODO: PERF_REG_POWERPC_DSISR */
+		/* TODO: PERF_REG_POWERPC_SIER */
+		/* TODO: PERF_REG_POWERPC_MMCRA */
+		/* TODO: PERF_REG_POWERPC_MMCR0 */
+		/* TODO: PERF_REG_POWERPC_MMCR1 */
+		/* TODO: PERF_REG_POWERPC_MMCR2 */
+		/* TODO: PERF_REG_POWERPC_MMCR3 */
+		/* TODO: PERF_REG_POWERPC_SIER2 */
+		/* TODO: PERF_REG_POWERPC_SIER3 */
+		/* TODO: PERF_REG_POWERPC_PMC1 */
+		/* TODO: PERF_REG_POWERPC_PMC2 */
+		/* TODO: PERF_REG_POWERPC_PMC3 */
+		/* TODO: PERF_REG_POWERPC_PMC4 */
+		/* TODO: PERF_REG_POWERPC_PMC5 */
+		/* TODO: PERF_REG_POWERPC_PMC6 */
+		/* TODO: PERF_REG_POWERPC_SDAR */
+		/* TODO: PERF_REG_POWERPC_SIAR */
+	};
+
+	if (perf_regnum == 0)
+		return 0;
+
+	if (perf_regnum <  0 || perf_regnum > (int)ARRAY_SIZE(dwarf_powerpc_regnums) ||
+	    dwarf_powerpc_regnums[perf_regnum] == 0)
+		return -ENOENT;
+
+	return dwarf_powerpc_regnums[perf_regnum];
+}
diff --git a/tools/perf/util/dwarf-regs.c b/tools/perf/util/dwarf-regs.c
index 033218f14b36..3b1c2a436806 100644
--- a/tools/perf/util/dwarf-regs.c
+++ b/tools/perf/util/dwarf-regs.c
@@ -205,6 +205,10 @@ int get_dwarf_regnum_for_perf_regnum(int perf_regnum, unsigned int machine,
 	case EM_CSKY:
 		reg = __get_dwarf_regnum_for_perf_regnum_csky(perf_regnum, flags);
 		break;
+	case EM_PPC:
+	case EM_PPC64:
+		reg = __get_dwarf_regnum_for_perf_regnum_powerpc(perf_regnum);
+		break;
 	case EM_LOONGARCH:
 		reg = __get_dwarf_regnum_for_perf_regnum_loongarch(perf_regnum);
 		break;
diff --git a/tools/perf/util/include/dwarf-regs.h b/tools/perf/util/include/dwarf-regs.h
index bec15fb53e73..9ebb3ba33fba 100644
--- a/tools/perf/util/include/dwarf-regs.h
+++ b/tools/perf/util/include/dwarf-regs.h
@@ -110,6 +110,7 @@ int __get_dwarf_regnum_for_perf_regnum_arm64(int perf_regnum);
 
 int __get_dwarf_regnum_for_perf_regnum_csky(int perf_regnum, unsigned int flags);
 int __get_dwarf_regnum_for_perf_regnum_loongarch(int perf_regnum);
+int __get_dwarf_regnum_for_perf_regnum_powerpc(int perf_regnum);
 
 /*
  * get_dwarf_regnum - Returns DWARF regnum from register name
diff --git a/tools/perf/util/unwind-libdw-arch/Build b/tools/perf/util/unwind-libdw-arch/Build
index 62a4cbf2dca8..e6c97e842cd6 100644
--- a/tools/perf/util/unwind-libdw-arch/Build
+++ b/tools/perf/util/unwind-libdw-arch/Build
@@ -1,3 +1,2 @@
-perf-util-y += unwind-libdw-powerpc.o
 perf-util-y += unwind-libdw-riscv.o
 perf-util-y += unwind-libdw-s390.o
diff --git a/tools/perf/util/unwind-libdw-arch/unwind-libdw-powerpc.c b/tools/perf/util/unwind-libdw-arch/unwind-libdw-powerpc.c
deleted file mode 100644
index 1560db45e7b4..000000000000
--- a/tools/perf/util/unwind-libdw-arch/unwind-libdw-powerpc.c
+++ /dev/null
@@ -1,76 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <elfutils/libdwfl.h>
-#include <linux/kernel.h>
-#include "../arch/powerpc/include/uapi/asm/perf_regs.h"
-#include "util/unwind-libdw.h"
-#include "util/perf_regs.h"
-#include "util/sample.h"
-
-/* See backends/ppc_initreg.c and backends/ppc_regs.c in elfutils.  */
-static const int special_regs[3][2] = {
-	{ 65, PERF_REG_POWERPC_LINK },
-	{ 101, PERF_REG_POWERPC_XER },
-	{ 109, PERF_REG_POWERPC_CTR },
-};
-
-bool libdw_set_initial_registers_powerpc(Dwfl_Thread *thread, void *arg)
-{
-	struct unwind_info *ui = arg;
-	struct regs_dump *user_regs = perf_sample__user_regs(ui->sample);
-	Dwarf_Word dwarf_regs[32], dwarf_nip;
-	size_t i;
-
-#define REG(r) ({						\
-	Dwarf_Word val = 0;					\
-	perf_reg_value(&val, user_regs, PERF_REG_POWERPC_##r);	\
-	val;							\
-})
-
-	dwarf_regs[0]  = REG(R0);
-	dwarf_regs[1]  = REG(R1);
-	dwarf_regs[2]  = REG(R2);
-	dwarf_regs[3]  = REG(R3);
-	dwarf_regs[4]  = REG(R4);
-	dwarf_regs[5]  = REG(R5);
-	dwarf_regs[6]  = REG(R6);
-	dwarf_regs[7]  = REG(R7);
-	dwarf_regs[8]  = REG(R8);
-	dwarf_regs[9]  = REG(R9);
-	dwarf_regs[10] = REG(R10);
-	dwarf_regs[11] = REG(R11);
-	dwarf_regs[12] = REG(R12);
-	dwarf_regs[13] = REG(R13);
-	dwarf_regs[14] = REG(R14);
-	dwarf_regs[15] = REG(R15);
-	dwarf_regs[16] = REG(R16);
-	dwarf_regs[17] = REG(R17);
-	dwarf_regs[18] = REG(R18);
-	dwarf_regs[19] = REG(R19);
-	dwarf_regs[20] = REG(R20);
-	dwarf_regs[21] = REG(R21);
-	dwarf_regs[22] = REG(R22);
-	dwarf_regs[23] = REG(R23);
-	dwarf_regs[24] = REG(R24);
-	dwarf_regs[25] = REG(R25);
-	dwarf_regs[26] = REG(R26);
-	dwarf_regs[27] = REG(R27);
-	dwarf_regs[28] = REG(R28);
-	dwarf_regs[29] = REG(R29);
-	dwarf_regs[30] = REG(R30);
-	dwarf_regs[31] = REG(R31);
-	if (!dwfl_thread_state_registers(thread, 0, 32, dwarf_regs))
-		return false;
-
-	dwarf_nip = REG(NIP);
-	dwfl_thread_state_register_pc(thread, dwarf_nip);
-	for (i = 0; i < ARRAY_SIZE(special_regs); i++) {
-		Dwarf_Word val = 0;
-		perf_reg_value(&val, user_regs, special_regs[i][1]);
-		if (!dwfl_thread_state_registers(thread,
-						 special_regs[i][0], 1,
-						 &val))
-			return false;
-	}
-
-	return true;
-}
diff --git a/tools/perf/util/unwind-libdw.c b/tools/perf/util/unwind-libdw.c
index 9c8dad643cd0..e9ba050e7ab1 100644
--- a/tools/perf/util/unwind-libdw.c
+++ b/tools/perf/util/unwind-libdw.c
@@ -292,15 +292,12 @@ static const Dwfl_Thread_Callbacks callbacks_generic = {
 	.set_initial_registers = libdw_set_initial_registers_generic,
 };
 
-DEFINE_DWFL_THREAD_CALLBACKS(powerpc);
 DEFINE_DWFL_THREAD_CALLBACKS(riscv);
 DEFINE_DWFL_THREAD_CALLBACKS(s390);
 
 static const Dwfl_Thread_Callbacks *get_thread_callbacks(const char *arch)
 {
-	if (!strcmp(arch, "powerpc"))
-		return &callbacks_powerpc;
-	else if (!strcmp(arch, "riscv"))
+	if (!strcmp(arch, "riscv"))
 		return &callbacks_riscv;
 	else if (!strcmp(arch, "s390"))
 		return &callbacks_s390;
diff --git a/tools/perf/util/unwind-libdw.h b/tools/perf/util/unwind-libdw.h
index 9d177d70f15c..0ec1abdabbe7 100644
--- a/tools/perf/util/unwind-libdw.h
+++ b/tools/perf/util/unwind-libdw.h
@@ -10,7 +10,6 @@ struct perf_sample;
 struct thread;
 
 bool libdw_set_initial_registers_mips(Dwfl_Thread *thread, void *arg);
-bool libdw_set_initial_registers_powerpc(Dwfl_Thread *thread, void *arg);
 bool libdw_set_initial_registers_riscv(Dwfl_Thread *thread, void *arg);
 bool libdw_set_initial_registers_s390(Dwfl_Thread *thread, void *arg);
 
-- 
cgit v1.2.3


From 36b372dfff51a0f069d4f8f11991b7241743fd52 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Fri, 16 Jan 2026 21:28:44 -0800
Subject: perf dwarf-regs: Add RISC-V perf to DWARF register number mapping
 functions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

These functions allow the generic initial register state code in
unwind-libdw to be used.

Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Aditya Bodkhe <aditya.b1@linux.ibm.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Alexandre Ghiti <alex@ghiti.fr>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Athira Rajeev <atrajeev@linux.ibm.com>
Cc: Chun-Tse Shao <ctshao@google.com>
Cc: Dmitriy Vyukov <dvyukov@google.com>
Cc: Dr. David Alan Gilbert <linux@treblig.org>
Cc: Guo Ren <guoren@kernel.org>
Cc: Haibo Xu <haibo1.xu@intel.com>
Cc: Howard Chu <howardchu95@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Krzysztof Łopatowski <krzysztof.m.lopatowski@gmail.com>
Cc: Leo Yan <leo.yan@linux.dev>
Cc: Mark Wielaard <mark@klomp.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <pjw@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sergei Trofimovich <slyich@gmail.com>
Cc: Shimin Guo <shimin.guo@skydio.com>
Cc: Stephen Brennan <stephen.s.brennan@oracle.com>
Cc: Thomas Falcon <thomas.falcon@intel.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/dwarf-regs-arch/Build              |  1 +
 tools/perf/util/dwarf-regs-arch/dwarf-regs-riscv.c | 12 +++++
 tools/perf/util/dwarf-regs.c                       |  3 ++
 tools/perf/util/include/dwarf-regs.h               |  1 +
 tools/perf/util/unwind-libdw-arch/Build            |  1 -
 .../util/unwind-libdw-arch/unwind-libdw-riscv.c    | 58 ----------------------
 tools/perf/util/unwind-libdw.c                     |  5 +-
 tools/perf/util/unwind-libdw.h                     |  1 -
 8 files changed, 18 insertions(+), 64 deletions(-)
 create mode 100644 tools/perf/util/dwarf-regs-arch/dwarf-regs-riscv.c
 delete mode 100644 tools/perf/util/unwind-libdw-arch/unwind-libdw-riscv.c

(limited to 'tools')

diff --git a/tools/perf/util/dwarf-regs-arch/Build b/tools/perf/util/dwarf-regs-arch/Build
index 188359376ea5..94e4dfceb4d1 100644
--- a/tools/perf/util/dwarf-regs-arch/Build
+++ b/tools/perf/util/dwarf-regs-arch/Build
@@ -3,4 +3,5 @@ perf-util-$(CONFIG_LIBDW) += dwarf-regs-arm.o
 perf-util-$(CONFIG_LIBDW) += dwarf-regs-csky.o
 perf-util-$(CONFIG_LIBDW) += dwarf-regs-loongarch.o
 perf-util-$(CONFIG_LIBDW) += dwarf-regs-powerpc.o
+perf-util-$(CONFIG_LIBDW) += dwarf-regs-riscv.o
 perf-util-$(CONFIG_LIBDW) += dwarf-regs-x86.o
diff --git a/tools/perf/util/dwarf-regs-arch/dwarf-regs-riscv.c b/tools/perf/util/dwarf-regs-arch/dwarf-regs-riscv.c
new file mode 100644
index 000000000000..090db51aba41
--- /dev/null
+++ b/tools/perf/util/dwarf-regs-arch/dwarf-regs-riscv.c
@@ -0,0 +1,12 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <errno.h>
+#include <dwarf-regs.h>
+#include "../../../arch/riscv/include/uapi/asm/perf_regs.h"
+
+int __get_dwarf_regnum_for_perf_regnum_riscv(int perf_regnum)
+{
+	if (perf_regnum < 0 || perf_regnum >= PERF_REG_RISCV_MAX)
+		return -ENOENT;
+
+	return perf_regnum;
+}
diff --git a/tools/perf/util/dwarf-regs.c b/tools/perf/util/dwarf-regs.c
index 3b1c2a436806..137568e15018 100644
--- a/tools/perf/util/dwarf-regs.c
+++ b/tools/perf/util/dwarf-regs.c
@@ -209,6 +209,9 @@ int get_dwarf_regnum_for_perf_regnum(int perf_regnum, unsigned int machine,
 	case EM_PPC64:
 		reg = __get_dwarf_regnum_for_perf_regnum_powerpc(perf_regnum);
 		break;
+	case EM_RISCV:
+		reg = __get_dwarf_regnum_for_perf_regnum_riscv(perf_regnum);
+		break;
 	case EM_LOONGARCH:
 		reg = __get_dwarf_regnum_for_perf_regnum_loongarch(perf_regnum);
 		break;
diff --git a/tools/perf/util/include/dwarf-regs.h b/tools/perf/util/include/dwarf-regs.h
index 9ebb3ba33fba..ae76608da110 100644
--- a/tools/perf/util/include/dwarf-regs.h
+++ b/tools/perf/util/include/dwarf-regs.h
@@ -111,6 +111,7 @@ int __get_dwarf_regnum_for_perf_regnum_arm64(int perf_regnum);
 int __get_dwarf_regnum_for_perf_regnum_csky(int perf_regnum, unsigned int flags);
 int __get_dwarf_regnum_for_perf_regnum_loongarch(int perf_regnum);
 int __get_dwarf_regnum_for_perf_regnum_powerpc(int perf_regnum);
+int __get_dwarf_regnum_for_perf_regnum_riscv(int perf_regnum);
 
 /*
  * get_dwarf_regnum - Returns DWARF regnum from register name
diff --git a/tools/perf/util/unwind-libdw-arch/Build b/tools/perf/util/unwind-libdw-arch/Build
index e6c97e842cd6..6d6e319e1201 100644
--- a/tools/perf/util/unwind-libdw-arch/Build
+++ b/tools/perf/util/unwind-libdw-arch/Build
@@ -1,2 +1 @@
-perf-util-y += unwind-libdw-riscv.o
 perf-util-y += unwind-libdw-s390.o
diff --git a/tools/perf/util/unwind-libdw-arch/unwind-libdw-riscv.c b/tools/perf/util/unwind-libdw-arch/unwind-libdw-riscv.c
deleted file mode 100644
index c2e2c4b6b2e0..000000000000
--- a/tools/perf/util/unwind-libdw-arch/unwind-libdw-riscv.c
+++ /dev/null
@@ -1,58 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2019 Hangzhou C-SKY Microsystems co.,ltd. */
-
-#include <elfutils/libdwfl.h>
-#include "../arch/riscv/include/uapi/asm/perf_regs.h"
-#include "util/unwind-libdw.h"
-#include "util/perf_regs.h"
-#include "util/sample.h"
-
-bool libdw_set_initial_registers_riscv(Dwfl_Thread *thread, void *arg)
-{
-	struct unwind_info *ui = arg;
-	struct regs_dump *user_regs = perf_sample__user_regs(ui->sample);
-	Dwarf_Word dwarf_regs[32];
-
-#define REG(r) ({						\
-	Dwarf_Word val = 0;					\
-	perf_reg_value(&val, user_regs, PERF_REG_RISCV_##r);	\
-	val;							\
-})
-
-	dwarf_regs[0]  = 0;
-	dwarf_regs[1]  = REG(RA);
-	dwarf_regs[2]  = REG(SP);
-	dwarf_regs[3]  = REG(GP);
-	dwarf_regs[4]  = REG(TP);
-	dwarf_regs[5]  = REG(T0);
-	dwarf_regs[6]  = REG(T1);
-	dwarf_regs[7]  = REG(T2);
-	dwarf_regs[8]  = REG(S0);
-	dwarf_regs[9]  = REG(S1);
-	dwarf_regs[10] = REG(A0);
-	dwarf_regs[11] = REG(A1);
-	dwarf_regs[12] = REG(A2);
-	dwarf_regs[13] = REG(A3);
-	dwarf_regs[14] = REG(A4);
-	dwarf_regs[15] = REG(A5);
-	dwarf_regs[16] = REG(A6);
-	dwarf_regs[17] = REG(A7);
-	dwarf_regs[18] = REG(S2);
-	dwarf_regs[19] = REG(S3);
-	dwarf_regs[20] = REG(S4);
-	dwarf_regs[21] = REG(S5);
-	dwarf_regs[22] = REG(S6);
-	dwarf_regs[23] = REG(S7);
-	dwarf_regs[24] = REG(S8);
-	dwarf_regs[25] = REG(S9);
-	dwarf_regs[26] = REG(S10);
-	dwarf_regs[27] = REG(S11);
-	dwarf_regs[28] = REG(T3);
-	dwarf_regs[29] = REG(T4);
-	dwarf_regs[30] = REG(T5);
-	dwarf_regs[31] = REG(T6);
-	dwfl_thread_state_register_pc(thread, REG(PC));
-
-	return dwfl_thread_state_registers(thread, 0, PERF_REG_RISCV_MAX,
-					   dwarf_regs);
-}
diff --git a/tools/perf/util/unwind-libdw.c b/tools/perf/util/unwind-libdw.c
index e9ba050e7ab1..b3c4380d40b6 100644
--- a/tools/perf/util/unwind-libdw.c
+++ b/tools/perf/util/unwind-libdw.c
@@ -292,14 +292,11 @@ static const Dwfl_Thread_Callbacks callbacks_generic = {
 	.set_initial_registers = libdw_set_initial_registers_generic,
 };
 
-DEFINE_DWFL_THREAD_CALLBACKS(riscv);
 DEFINE_DWFL_THREAD_CALLBACKS(s390);
 
 static const Dwfl_Thread_Callbacks *get_thread_callbacks(const char *arch)
 {
-	if (!strcmp(arch, "riscv"))
-		return &callbacks_riscv;
-	else if (!strcmp(arch, "s390"))
+	if (!strcmp(arch, "s390"))
 		return &callbacks_s390;
 
 	return &callbacks_generic;
diff --git a/tools/perf/util/unwind-libdw.h b/tools/perf/util/unwind-libdw.h
index 0ec1abdabbe7..5c23080cb6c1 100644
--- a/tools/perf/util/unwind-libdw.h
+++ b/tools/perf/util/unwind-libdw.h
@@ -10,7 +10,6 @@ struct perf_sample;
 struct thread;
 
 bool libdw_set_initial_registers_mips(Dwfl_Thread *thread, void *arg);
-bool libdw_set_initial_registers_riscv(Dwfl_Thread *thread, void *arg);
 bool libdw_set_initial_registers_s390(Dwfl_Thread *thread, void *arg);
 
 struct unwind_info {
-- 
cgit v1.2.3


From 1e452dd850f3d509cdc9da05f2c70161f7f73d37 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Fri, 16 Jan 2026 21:28:45 -0800
Subject: perf dwarf-regs: Add S390 perf to DWARF register number mapping
 functions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

These functions allow the generic initial register state code in
unwind-libdw to be used.

Now the non-generic code in unwind-libdw has no uses remove it.

Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Aditya Bodkhe <aditya.b1@linux.ibm.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Alexandre Ghiti <alex@ghiti.fr>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Athira Rajeev <atrajeev@linux.ibm.com>
Cc: Chun-Tse Shao <ctshao@google.com>
Cc: Dmitriy Vyukov <dvyukov@google.com>
Cc: Dr. David Alan Gilbert <linux@treblig.org>
Cc: Guo Ren <guoren@kernel.org>
Cc: Haibo Xu <haibo1.xu@intel.com>
Cc: Howard Chu <howardchu95@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Krzysztof Łopatowski <krzysztof.m.lopatowski@gmail.com>
Cc: Leo Yan <leo.yan@linux.dev>
Cc: Mark Wielaard <mark@klomp.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <pjw@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sergei Trofimovich <slyich@gmail.com>
Cc: Shimin Guo <shimin.guo@skydio.com>
Cc: Stephen Brennan <stephen.s.brennan@oracle.com>
Cc: Thomas Falcon <thomas.falcon@intel.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/dwarf-regs-arch/Build              |  1 +
 tools/perf/util/dwarf-regs-arch/dwarf-regs-s390.c  | 53 ++++++++++++++++++
 tools/perf/util/dwarf-regs.c                       |  3 +
 tools/perf/util/include/dwarf-regs.h               |  1 +
 tools/perf/util/unwind-libdw-arch/Build            |  1 -
 .../util/unwind-libdw-arch/unwind-libdw-s390.c     | 65 ----------------------
 tools/perf/util/unwind-libdw.c                     | 31 ++---------
 tools/perf/util/unwind-libdw.h                     |  1 -
 8 files changed, 62 insertions(+), 94 deletions(-)
 create mode 100644 tools/perf/util/dwarf-regs-arch/dwarf-regs-s390.c
 delete mode 100644 tools/perf/util/unwind-libdw-arch/Build
 delete mode 100644 tools/perf/util/unwind-libdw-arch/unwind-libdw-s390.c

(limited to 'tools')

diff --git a/tools/perf/util/dwarf-regs-arch/Build b/tools/perf/util/dwarf-regs-arch/Build
index 94e4dfceb4d1..10c2af3d933a 100644
--- a/tools/perf/util/dwarf-regs-arch/Build
+++ b/tools/perf/util/dwarf-regs-arch/Build
@@ -4,4 +4,5 @@ perf-util-$(CONFIG_LIBDW) += dwarf-regs-csky.o
 perf-util-$(CONFIG_LIBDW) += dwarf-regs-loongarch.o
 perf-util-$(CONFIG_LIBDW) += dwarf-regs-powerpc.o
 perf-util-$(CONFIG_LIBDW) += dwarf-regs-riscv.o
+perf-util-$(CONFIG_LIBDW) += dwarf-regs-s390.o
 perf-util-$(CONFIG_LIBDW) += dwarf-regs-x86.o
diff --git a/tools/perf/util/dwarf-regs-arch/dwarf-regs-s390.c b/tools/perf/util/dwarf-regs-arch/dwarf-regs-s390.c
new file mode 100644
index 000000000000..310a37451bdc
--- /dev/null
+++ b/tools/perf/util/dwarf-regs-arch/dwarf-regs-s390.c
@@ -0,0 +1,53 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <errno.h>
+#include <dwarf-regs.h>
+#include "../../../arch/s390/include/uapi/asm/perf_regs.h"
+
+int __get_dwarf_regnum_for_perf_regnum_s390(int perf_regnum)
+{
+	static const int dwarf_s390_regnums[] = {
+		[PERF_REG_S390_R0] = 0,
+		[PERF_REG_S390_R1] = 1,
+		[PERF_REG_S390_R2] = 2,
+		[PERF_REG_S390_R3] = 3,
+		[PERF_REG_S390_R4] = 4,
+		[PERF_REG_S390_R5] = 5,
+		[PERF_REG_S390_R6] = 6,
+		[PERF_REG_S390_R7] = 7,
+		[PERF_REG_S390_R8] = 8,
+		[PERF_REG_S390_R9] = 9,
+		[PERF_REG_S390_R10] = 10,
+		[PERF_REG_S390_R11] = 11,
+		[PERF_REG_S390_R12] = 12,
+		[PERF_REG_S390_R13] = 13,
+		[PERF_REG_S390_R14] = 14,
+		[PERF_REG_S390_R15] = 15,
+		[PERF_REG_S390_FP0] = 16,
+		[PERF_REG_S390_FP1] = 20,
+		[PERF_REG_S390_FP2] = 17,
+		[PERF_REG_S390_FP3] = 21,
+		[PERF_REG_S390_FP4] = 18,
+		[PERF_REG_S390_FP5] = 22,
+		[PERF_REG_S390_FP6] = 19,
+		[PERF_REG_S390_FP7] = 23,
+		[PERF_REG_S390_FP8] = 24,
+		[PERF_REG_S390_FP9] = 28,
+		[PERF_REG_S390_FP10] = 25,
+		[PERF_REG_S390_FP11] = 29,
+		[PERF_REG_S390_FP12] = 26,
+		[PERF_REG_S390_FP13] = 30,
+		[PERF_REG_S390_FP14] = 27,
+		[PERF_REG_S390_FP15] = 31,
+		[PERF_REG_S390_MASK] = 64,
+		[PERF_REG_S390_PC] = 65,
+	};
+
+	if (perf_regnum == 0)
+		return 0;
+
+	if (perf_regnum <  0 || perf_regnum > (int)ARRAY_SIZE(dwarf_s390_regnums) ||
+	    dwarf_s390_regnums[perf_regnum] == 0)
+		return -ENOENT;
+
+	return dwarf_s390_regnums[perf_regnum];
+}
diff --git a/tools/perf/util/dwarf-regs.c b/tools/perf/util/dwarf-regs.c
index 137568e15018..f86f76547592 100644
--- a/tools/perf/util/dwarf-regs.c
+++ b/tools/perf/util/dwarf-regs.c
@@ -212,6 +212,9 @@ int get_dwarf_regnum_for_perf_regnum(int perf_regnum, unsigned int machine,
 	case EM_RISCV:
 		reg = __get_dwarf_regnum_for_perf_regnum_riscv(perf_regnum);
 		break;
+	case EM_S390:
+		reg = __get_dwarf_regnum_for_perf_regnum_s390(perf_regnum);
+		break;
 	case EM_LOONGARCH:
 		reg = __get_dwarf_regnum_for_perf_regnum_loongarch(perf_regnum);
 		break;
diff --git a/tools/perf/util/include/dwarf-regs.h b/tools/perf/util/include/dwarf-regs.h
index ae76608da110..b95cf2d7b5b3 100644
--- a/tools/perf/util/include/dwarf-regs.h
+++ b/tools/perf/util/include/dwarf-regs.h
@@ -112,6 +112,7 @@ int __get_dwarf_regnum_for_perf_regnum_csky(int perf_regnum, unsigned int flags)
 int __get_dwarf_regnum_for_perf_regnum_loongarch(int perf_regnum);
 int __get_dwarf_regnum_for_perf_regnum_powerpc(int perf_regnum);
 int __get_dwarf_regnum_for_perf_regnum_riscv(int perf_regnum);
+int __get_dwarf_regnum_for_perf_regnum_s390(int perf_regnum);
 
 /*
  * get_dwarf_regnum - Returns DWARF regnum from register name
diff --git a/tools/perf/util/unwind-libdw-arch/Build b/tools/perf/util/unwind-libdw-arch/Build
deleted file mode 100644
index 6d6e319e1201..000000000000
--- a/tools/perf/util/unwind-libdw-arch/Build
+++ /dev/null
@@ -1 +0,0 @@
-perf-util-y += unwind-libdw-s390.o
diff --git a/tools/perf/util/unwind-libdw-arch/unwind-libdw-s390.c b/tools/perf/util/unwind-libdw-arch/unwind-libdw-s390.c
deleted file mode 100644
index 1e05e9d9d95f..000000000000
--- a/tools/perf/util/unwind-libdw-arch/unwind-libdw-s390.c
+++ /dev/null
@@ -1,65 +0,0 @@
-#include <linux/kernel.h>
-#include <elfutils/libdwfl.h>
-#include "util/unwind-libdw.h"
-#include "util/perf_regs.h"
-#include "util/event.h"
-#include "util/sample.h"
-#include "../arch/s390/include/dwarf-regs-table.h"
-#include "../arch/s390/include/uapi/asm/perf_regs.h"
-
-
-bool libdw_set_initial_registers_s390(Dwfl_Thread *thread, void *arg)
-{
-	struct unwind_info *ui = arg;
-	struct regs_dump *user_regs = perf_sample__user_regs(ui->sample);
-	Dwarf_Word dwarf_regs[ARRAY_SIZE(s390_dwarf_regs)];
-
-#define REG(r) ({						\
-	Dwarf_Word val = 0;					\
-	perf_reg_value(&val, user_regs, PERF_REG_S390_##r);	\
-	val;							\
-})
-	/*
-	 * For DWARF register mapping details,
-	 * see also perf/arch/s390/include/dwarf-regs-table.h
-	 */
-	dwarf_regs[0]  = REG(R0);
-	dwarf_regs[1]  = REG(R1);
-	dwarf_regs[2]  = REG(R2);
-	dwarf_regs[3]  = REG(R3);
-	dwarf_regs[4]  = REG(R4);
-	dwarf_regs[5]  = REG(R5);
-	dwarf_regs[6]  = REG(R6);
-	dwarf_regs[7]  = REG(R7);
-	dwarf_regs[8]  = REG(R8);
-	dwarf_regs[9]  = REG(R9);
-	dwarf_regs[10] = REG(R10);
-	dwarf_regs[11] = REG(R11);
-	dwarf_regs[12] = REG(R12);
-	dwarf_regs[13] = REG(R13);
-	dwarf_regs[14] = REG(R14);
-	dwarf_regs[15] = REG(R15);
-
-	dwarf_regs[16] = REG(FP0);
-	dwarf_regs[17] = REG(FP2);
-	dwarf_regs[18] = REG(FP4);
-	dwarf_regs[19] = REG(FP6);
-	dwarf_regs[20] = REG(FP1);
-	dwarf_regs[21] = REG(FP3);
-	dwarf_regs[22] = REG(FP5);
-	dwarf_regs[23] = REG(FP7);
-	dwarf_regs[24] = REG(FP8);
-	dwarf_regs[25] = REG(FP10);
-	dwarf_regs[26] = REG(FP12);
-	dwarf_regs[27] = REG(FP14);
-	dwarf_regs[28] = REG(FP9);
-	dwarf_regs[29] = REG(FP11);
-	dwarf_regs[30] = REG(FP13);
-	dwarf_regs[31] = REG(FP15);
-
-	dwarf_regs[64] = REG(MASK);
-	dwarf_regs[65] = REG(PC);
-
-	dwfl_thread_state_register_pc(thread, dwarf_regs[65]);
-	return dwfl_thread_state_registers(thread, 0, 32, dwarf_regs);
-}
diff --git a/tools/perf/util/unwind-libdw.c b/tools/perf/util/unwind-libdw.c
index b3c4380d40b6..e0321043af88 100644
--- a/tools/perf/util/unwind-libdw.c
+++ b/tools/perf/util/unwind-libdw.c
@@ -226,7 +226,7 @@ static bool memory_read(Dwfl *dwfl __maybe_unused, Dwarf_Addr addr, Dwarf_Word *
 	return true;
 }
 
-static bool libdw_set_initial_registers_generic(Dwfl_Thread *thread, void *arg)
+static bool libdw_set_initial_registers(Dwfl_Thread *thread, void *arg)
 {
 	struct unwind_info *ui = arg;
 	struct regs_dump *user_regs = perf_sample__user_regs(ui->sample);
@@ -279,29 +279,12 @@ static bool libdw_set_initial_registers_generic(Dwfl_Thread *thread, void *arg)
 	return ret;
 }
 
-#define DEFINE_DWFL_THREAD_CALLBACKS(arch)                           \
-static const Dwfl_Thread_Callbacks callbacks_##arch = {              \
-	.next_thread           = next_thread,                        \
-	.memory_read           = memory_read,                        \
-	.set_initial_registers = libdw_set_initial_registers_##arch, \
-}
-
-static const Dwfl_Thread_Callbacks callbacks_generic = {
+static const Dwfl_Thread_Callbacks callbacks = {
 	.next_thread           = next_thread,
 	.memory_read           = memory_read,
-	.set_initial_registers = libdw_set_initial_registers_generic,
+	.set_initial_registers = libdw_set_initial_registers,
 };
 
-DEFINE_DWFL_THREAD_CALLBACKS(s390);
-
-static const Dwfl_Thread_Callbacks *get_thread_callbacks(const char *arch)
-{
-	if (!strcmp(arch, "s390"))
-		return &callbacks_s390;
-
-	return &callbacks_generic;
-}
-
 static int
 frame_callback(Dwfl_Frame *state, void *arg)
 {
@@ -349,10 +332,8 @@ int unwind__get_entries(unwind_entry_cb_t cb, void *arg,
 		.e_machine	= e_machine,
 		.best_effort    = best_effort
 	};
-	const char *arch = perf_env__arch(machine->env);
 	Dwarf_Word ip;
 	int err = -EINVAL, i;
-	const Dwfl_Thread_Callbacks *callbacks;
 
 	if (!data->user_regs || !data->user_regs->regs)
 		return -EINVAL;
@@ -375,11 +356,7 @@ int unwind__get_entries(unwind_entry_cb_t cb, void *arg,
 	if (err)
 		goto out;
 
-	callbacks = get_thread_callbacks(arch);
-	if (!callbacks)
-		goto out;
-
-	err = !dwfl_attach_state(ui->dwfl, /*elf=*/NULL, thread__tid(thread), callbacks, ui);
+	err = !dwfl_attach_state(ui->dwfl, /*elf=*/NULL, thread__tid(thread), &callbacks, ui);
 	if (err)
 		goto out;
 
diff --git a/tools/perf/util/unwind-libdw.h b/tools/perf/util/unwind-libdw.h
index 5c23080cb6c1..20d63d881dff 100644
--- a/tools/perf/util/unwind-libdw.h
+++ b/tools/perf/util/unwind-libdw.h
@@ -10,7 +10,6 @@ struct perf_sample;
 struct thread;
 
 bool libdw_set_initial_registers_mips(Dwfl_Thread *thread, void *arg);
-bool libdw_set_initial_registers_s390(Dwfl_Thread *thread, void *arg);
 
 struct unwind_info {
 	Dwfl			*dwfl;
-- 
cgit v1.2.3


From 406b51a9a5e8d3c3d862a6eebe3def7e11229693 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Fri, 16 Jan 2026 21:28:46 -0800
Subject: perf dwarf-regs: Add MIPS perf to DWARF register number mapping
 functions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Despite an unused function declaration, there was no unwind-libdw for
MIPS but there is a perf_regs.h and a libdw implementation.

Fill in the pieces so hopefully MIPS unwinding with libdw works.

Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Aditya Bodkhe <aditya.b1@linux.ibm.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Alexandre Ghiti <alex@ghiti.fr>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Athira Rajeev <atrajeev@linux.ibm.com>
Cc: Chun-Tse Shao <ctshao@google.com>
Cc: Dmitriy Vyukov <dvyukov@google.com>
Cc: Dr. David Alan Gilbert <linux@treblig.org>
Cc: Guo Ren <guoren@kernel.org>
Cc: Haibo Xu <haibo1.xu@intel.com>
Cc: Howard Chu <howardchu95@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Krzysztof Łopatowski <krzysztof.m.lopatowski@gmail.com>
Cc: Leo Yan <leo.yan@linux.dev>
Cc: Mark Wielaard <mark@klomp.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <pjw@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sergei Trofimovich <slyich@gmail.com>
Cc: Shimin Guo <shimin.guo@skydio.com>
Cc: Stephen Brennan <stephen.s.brennan@oracle.com>
Cc: Thomas Falcon <thomas.falcon@intel.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/dwarf-regs-arch/Build             |  1 +
 tools/perf/util/dwarf-regs-arch/dwarf-regs-mips.c | 14 ++++++++++++++
 tools/perf/util/dwarf-regs.c                      |  5 +++++
 tools/perf/util/include/dwarf-regs.h              |  1 +
 tools/perf/util/unwind-libdw.h                    |  2 --
 5 files changed, 21 insertions(+), 2 deletions(-)
 create mode 100644 tools/perf/util/dwarf-regs-arch/dwarf-regs-mips.c

(limited to 'tools')

diff --git a/tools/perf/util/dwarf-regs-arch/Build b/tools/perf/util/dwarf-regs-arch/Build
index 10c2af3d933a..ceb68ae86fd8 100644
--- a/tools/perf/util/dwarf-regs-arch/Build
+++ b/tools/perf/util/dwarf-regs-arch/Build
@@ -2,6 +2,7 @@ perf-util-$(CONFIG_LIBDW) += dwarf-regs-arm64.o
 perf-util-$(CONFIG_LIBDW) += dwarf-regs-arm.o
 perf-util-$(CONFIG_LIBDW) += dwarf-regs-csky.o
 perf-util-$(CONFIG_LIBDW) += dwarf-regs-loongarch.o
+perf-util-$(CONFIG_LIBDW) += dwarf-regs-mips.o
 perf-util-$(CONFIG_LIBDW) += dwarf-regs-powerpc.o
 perf-util-$(CONFIG_LIBDW) += dwarf-regs-riscv.o
 perf-util-$(CONFIG_LIBDW) += dwarf-regs-s390.o
diff --git a/tools/perf/util/dwarf-regs-arch/dwarf-regs-mips.c b/tools/perf/util/dwarf-regs-arch/dwarf-regs-mips.c
new file mode 100644
index 000000000000..3bb916b45c66
--- /dev/null
+++ b/tools/perf/util/dwarf-regs-arch/dwarf-regs-mips.c
@@ -0,0 +1,14 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <errno.h>
+#include <dwarf-regs.h>
+#include "../../../arch/mips/include/uapi/asm/perf_regs.h"
+
+int __get_dwarf_regnum_for_perf_regnum_mips(int perf_regnum)
+{
+	if (perf_regnum == PERF_REG_MIPS_PC)
+		return 37;
+	if (perf_regnum < 0 || perf_regnum >= PERF_REG_MIPS_MAX)
+		return -ENOENT;
+
+	return perf_regnum;
+}
diff --git a/tools/perf/util/dwarf-regs.c b/tools/perf/util/dwarf-regs.c
index f86f76547592..797f455eba0d 100644
--- a/tools/perf/util/dwarf-regs.c
+++ b/tools/perf/util/dwarf-regs.c
@@ -179,6 +179,8 @@ static int get_libdw_frame_nregs(unsigned int machine, unsigned int flags __mayb
 		return 103;
 	case EM_LOONGARCH:
 		return 74;
+	case EM_MIPS:
+		return 71;
 	default:
 		return 0;
 	}
@@ -218,6 +220,9 @@ int get_dwarf_regnum_for_perf_regnum(int perf_regnum, unsigned int machine,
 	case EM_LOONGARCH:
 		reg = __get_dwarf_regnum_for_perf_regnum_loongarch(perf_regnum);
 		break;
+	case EM_MIPS:
+		reg = __get_dwarf_regnum_for_perf_regnum_mips(perf_regnum);
+		break;
 	default:
 		pr_err("ELF MACHINE %x is not supported.\n", machine);
 		return -ENOENT;
diff --git a/tools/perf/util/include/dwarf-regs.h b/tools/perf/util/include/dwarf-regs.h
index b95cf2d7b5b3..46a764cf322f 100644
--- a/tools/perf/util/include/dwarf-regs.h
+++ b/tools/perf/util/include/dwarf-regs.h
@@ -113,6 +113,7 @@ int __get_dwarf_regnum_for_perf_regnum_loongarch(int perf_regnum);
 int __get_dwarf_regnum_for_perf_regnum_powerpc(int perf_regnum);
 int __get_dwarf_regnum_for_perf_regnum_riscv(int perf_regnum);
 int __get_dwarf_regnum_for_perf_regnum_s390(int perf_regnum);
+int __get_dwarf_regnum_for_perf_regnum_mips(int perf_regnum);
 
 /*
  * get_dwarf_regnum - Returns DWARF regnum from register name
diff --git a/tools/perf/util/unwind-libdw.h b/tools/perf/util/unwind-libdw.h
index 20d63d881dff..9c5b5fcaaae8 100644
--- a/tools/perf/util/unwind-libdw.h
+++ b/tools/perf/util/unwind-libdw.h
@@ -9,8 +9,6 @@ struct machine;
 struct perf_sample;
 struct thread;
 
-bool libdw_set_initial_registers_mips(Dwfl_Thread *thread, void *arg);
-
 struct unwind_info {
 	Dwfl			*dwfl;
 	struct perf_sample      *sample;
-- 
cgit v1.2.3


From 2e9191573a69ff962b018d85a2c58269a1637b27 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Fri, 16 Jan 2026 21:28:47 -0800
Subject: perf build: Remove NO_LIBDW_DWARF_UNWIND option
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Libdw unwinding support is present for every architecture that has a
perf_regs.h - perf registers are needed for the initial frame to
unwind.

Elfutils also supports SPARC, ARC and m68k but there is no support in
the Linux kernel for perf registers on these architectures.

As the perf supported DWARF unwinding architectures are a subset of the
elfutils ones, remove NO_LIBDW_DWARF_UNWIND as there isn't a case of
elfutils lacking the support need for perf.

Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Aditya Bodkhe <aditya.b1@linux.ibm.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Alexandre Ghiti <alex@ghiti.fr>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Athira Rajeev <atrajeev@linux.ibm.com>
Cc: Chun-Tse Shao <ctshao@google.com>
Cc: Dmitriy Vyukov <dvyukov@google.com>
Cc: Dr. David Alan Gilbert <linux@treblig.org>
Cc: Guo Ren <guoren@kernel.org>
Cc: Haibo Xu <haibo1.xu@intel.com>
Cc: Howard Chu <howardchu95@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Krzysztof Łopatowski <krzysztof.m.lopatowski@gmail.com>
Cc: Leo Yan <leo.yan@linux.dev>
Cc: Mark Wielaard <mark@klomp.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <pjw@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sergei Trofimovich <slyich@gmail.com>
Cc: Shimin Guo <shimin.guo@skydio.com>
Cc: Stephen Brennan <stephen.s.brennan@oracle.com>
Cc: Thomas Falcon <thomas.falcon@intel.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/Makefile.config | 19 +------------------
 tools/perf/tests/make      |  3 +--
 tools/perf/util/Build      |  3 +--
 3 files changed, 3 insertions(+), 22 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config
index 6f2c7bd36e74..5e4ae775987f 100644
--- a/tools/perf/Makefile.config
+++ b/tools/perf/Makefile.config
@@ -118,14 +118,6 @@ ifeq ($(ARCH),mips)
   endif
 endif
 
-# So far there's only x86 and arm libdw unwind support merged in perf.
-# Disable it on all other architectures in case libdw unwind
-# support is detected in system. Add supported architectures
-# to the check.
-ifneq ($(SRCARCH),$(filter $(SRCARCH),x86 arm arm64 powerpc s390 csky riscv loongarch))
-  NO_LIBDW_DWARF_UNWIND := 1
-endif
-
 ifneq ($(LIBUNWIND),1)
   NO_LIBUNWIND := 1
 endif
@@ -456,7 +448,6 @@ endif
 ifdef NO_LIBELF
   NO_LIBDW := 1
   NO_LIBUNWIND := 1
-  NO_LIBDW_DWARF_UNWIND := 1
   NO_LIBBPF := 1
   NO_JVMTI := 1
 else
@@ -504,10 +495,6 @@ ifeq ($(feature-libaio), 1)
   endif
 endif
 
-ifdef NO_LIBDW
-  NO_LIBDW_DWARF_UNWIND := 1
-endif
-
 ifeq ($(feature-scandirat), 1)
   # Ignore having scandirat with memory sanitizer that lacks an interceptor.
   ifeq ($(filter s% -fsanitize=memory%,$(EXTRA_CFLAGS),),)
@@ -757,7 +744,7 @@ dwarf-post-unwind-text := BUG
 
 # setup DWARF post unwinder
 ifdef NO_LIBUNWIND
-  ifdef NO_LIBDW_DWARF_UNWIND
+  ifdef NO_LIBDW
     $(warning Disabling post unwind, no support found.)
     dwarf-post-unwind := 0
   else
@@ -767,10 +754,6 @@ ifdef NO_LIBUNWIND
 else
   dwarf-post-unwind-text := libunwind
   $(call detected,CONFIG_LIBUNWIND)
-  # Enable libunwind support by default.
-  ifndef NO_LIBDW_DWARF_UNWIND
-    NO_LIBDW_DWARF_UNWIND := 1
-  endif
 endif
 
 ifeq ($(dwarf-post-unwind),1)
diff --git a/tools/perf/tests/make b/tools/perf/tests/make
index 36411b4b6d2b..767ad9e147a8 100644
--- a/tools/perf/tests/make
+++ b/tools/perf/tests/make
@@ -83,7 +83,6 @@ make_no_demangle    := NO_DEMANGLE=1
 make_no_libelf      := NO_LIBELF=1
 make_no_libdw       := NO_LIBDW=1
 make_libunwind      := LIBUNWIND=1
-make_no_libdw_dwarf_unwind := NO_LIBDW_DWARF_UNWIND=1
 make_no_backtrace   := NO_BACKTRACE=1
 make_no_libcapstone := NO_CAPSTONE=1
 make_no_libnuma     := NO_LIBNUMA=1
@@ -120,7 +119,7 @@ make_static         := LDFLAGS=-static NO_PERF_READ_VDSO32=1 NO_PERF_READ_VDSOX3
 make_minimal        := NO_LIBPYTHON=1 NO_GTK2=1
 make_minimal        += NO_DEMANGLE=1 NO_LIBELF=1 NO_BACKTRACE=1
 make_minimal        += NO_LIBNUMA=1 NO_LIBBIONIC=1 NO_LIBDW=1
-make_minimal        += NO_LIBDW_DWARF_UNWIND=1 NO_LIBBPF=1
+make_minimal        += NO_LIBBPF=1
 make_minimal        += NO_SDT=1 NO_JVMTI=1 NO_LIBZSTD=1
 make_minimal        += NO_LIBCAP=1 NO_CAPSTONE=1
 
diff --git a/tools/perf/util/Build b/tools/perf/util/Build
index 3cb1edd263cf..c30ff257f8b4 100644
--- a/tools/perf/util/Build
+++ b/tools/perf/util/Build
@@ -223,9 +223,8 @@ perf-util-$(CONFIG_LIBDW) += dwarf-regs-arch/
 perf-util-$(CONFIG_LIBDW) += debuginfo.o
 perf-util-$(CONFIG_LIBDW) += annotate-data.o
 perf-util-$(CONFIG_LIBDW) += libdw.o
+perf-util-$(CONFIG_LIBDW) += unwind-libdw.o
 
-perf-util-$(CONFIG_LIBDW_DWARF_UNWIND) += unwind-libdw.o
-perf-util-$(CONFIG_LIBDW_DWARF_UNWIND) += unwind-libdw-arch/
 perf-util-$(CONFIG_LOCAL_LIBUNWIND)    += unwind-libunwind-local.o
 perf-util-$(CONFIG_LIBUNWIND)          += unwind-libunwind.o
 perf-util-$(CONFIG_LIBUNWIND_X86)      += libunwind/x86_32.o
-- 
cgit v1.2.3


From 6b2658b3f36a7e524d7a8957e729e307484f2a8c Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Fri, 16 Jan 2026 21:28:48 -0800
Subject: perf unwind-libdw: Don't discard loaded ELF/DWARF after every unwind
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The unwind-libdw dwfl has ELF binaries associated with mmap
addresses. Experimenting with using the per dso dwfl it is required to
alter the address to be 0 based variant. Unfortunately libdwfl doesn't
allow a single unwind and then an update to the return address to be 0
based as there are assertions that registers aren't updated once an
unwind has started, etc.

As removing the dwfl didn't prove possible, an alternative is to just
not discard the dwfl when the unwind ends. The dwfl is valid for a
process unless a dso is loaded at the same address as a previous
one. So keep the dwfl with the maps, invalidate it if a map is removed
(in case a new map replaces it) and recycle the dwfl in the unwinding
code. A wrinkly in the implementation of this is that the attached
thread argument is remembered by the dwfl and so it needs to be a
pointer to memory that also persists with the dwfl (struct
dwfl_ui_thread_info in the code).

Recording 10 seconds of system wide data with --call-graph=dwarf and
then processing with perf report shows a total runtime improvement
from 41.583s to 2.279s (an 18x speedup).

Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Aditya Bodkhe <aditya.b1@linux.ibm.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Alexandre Ghiti <alex@ghiti.fr>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Athira Rajeev <atrajeev@linux.ibm.com>
Cc: Chun-Tse Shao <ctshao@google.com>
Cc: Dmitriy Vyukov <dvyukov@google.com>
Cc: Dr. David Alan Gilbert <linux@treblig.org>
Cc: Guo Ren <guoren@kernel.org>
Cc: Haibo Xu <haibo1.xu@intel.com>
Cc: Howard Chu <howardchu95@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Krzysztof Łopatowski <krzysztof.m.lopatowski@gmail.com>
Cc: Leo Yan <leo.yan@linux.dev>
Cc: Mark Wielaard <mark@klomp.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <pjw@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sergei Trofimovich <slyich@gmail.com>
Cc: Shimin Guo <shimin.guo@skydio.com>
Cc: Stephen Brennan <stephen.s.brennan@oracle.com>
Cc: Thomas Falcon <thomas.falcon@intel.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/maps.c         | 36 ++++++++++++++++-
 tools/perf/util/maps.h         |  4 ++
 tools/perf/util/unwind-libdw.c | 90 +++++++++++++++++++++++++++++++-----------
 tools/perf/util/unwind-libdw.h |  9 ++++-
 4 files changed, 112 insertions(+), 27 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/maps.c b/tools/perf/util/maps.c
index c321d4f4d846..8ccc46d515b6 100644
--- a/tools/perf/util/maps.c
+++ b/tools/perf/util/maps.c
@@ -10,6 +10,7 @@
 #include "thread.h"
 #include "ui/ui.h"
 #include "unwind.h"
+#include "unwind-libdw.h"
 #include <internal/rc_check.h>
 
 /*
@@ -39,6 +40,9 @@ DECLARE_RC_STRUCT(maps) {
 #ifdef HAVE_LIBUNWIND_SUPPORT
 	void		*addr_space;
 	const struct unwind_libunwind_ops *unwind_libunwind_ops;
+#endif
+#ifdef HAVE_LIBDW_SUPPORT
+	void		*libdw_addr_space_dwfl;
 #endif
 	refcount_t	 refcnt;
 	/**
@@ -203,6 +207,17 @@ void maps__set_unwind_libunwind_ops(struct maps *maps, const struct unwind_libun
 	RC_CHK_ACCESS(maps)->unwind_libunwind_ops = ops;
 }
 #endif
+#ifdef HAVE_LIBDW_SUPPORT
+void *maps__libdw_addr_space_dwfl(const struct maps *maps)
+{
+	return RC_CHK_ACCESS(maps)->libdw_addr_space_dwfl;
+}
+
+void maps__set_libdw_addr_space_dwfl(struct maps *maps, void *dwfl)
+{
+	RC_CHK_ACCESS(maps)->libdw_addr_space_dwfl = dwfl;
+}
+#endif
 
 static struct rw_semaphore *maps__lock(struct maps *maps)
 {
@@ -218,6 +233,9 @@ static void maps__init(struct maps *maps, struct machine *machine)
 #ifdef HAVE_LIBUNWIND_SUPPORT
 	RC_CHK_ACCESS(maps)->addr_space = NULL;
 	RC_CHK_ACCESS(maps)->unwind_libunwind_ops = NULL;
+#endif
+#ifdef HAVE_LIBDW_SUPPORT
+	RC_CHK_ACCESS(maps)->libdw_addr_space_dwfl = NULL;
 #endif
 	refcount_set(maps__refcnt(maps), 1);
 	RC_CHK_ACCESS(maps)->nr_maps = 0;
@@ -240,6 +258,9 @@ static void maps__exit(struct maps *maps)
 	zfree(&maps_by_address);
 	zfree(&maps_by_name);
 	unwind__finish_access(maps);
+#ifdef HAVE_LIBDW_SUPPORT
+	libdw__invalidate_dwfl(maps, maps__libdw_addr_space_dwfl(maps));
+#endif
 }
 
 struct maps *maps__new(struct machine *machine)
@@ -549,6 +570,9 @@ void maps__remove(struct maps *maps, struct map *map)
 	__maps__remove(maps, map);
 	check_invariants(maps);
 	up_write(maps__lock(maps));
+#ifdef HAVE_LIBDW_SUPPORT
+	libdw__invalidate_dwfl(maps, maps__libdw_addr_space_dwfl(maps));
+#endif
 }
 
 bool maps__empty(struct maps *maps)
@@ -604,18 +628,26 @@ int maps__for_each_map(struct maps *maps, int (*cb)(struct map *map, void *data)
 void maps__remove_maps(struct maps *maps, bool (*cb)(struct map *map, void *data), void *data)
 {
 	struct map **maps_by_address;
+	bool removed = false;
 
 	down_write(maps__lock(maps));
 
 	maps_by_address = maps__maps_by_address(maps);
 	for (unsigned int i = 0; i < maps__nr_maps(maps);) {
-		if (cb(maps_by_address[i], data))
+		if (cb(maps_by_address[i], data)) {
 			__maps__remove(maps, maps_by_address[i]);
-		else
+			removed = true;
+		} else {
 			i++;
+		}
 	}
 	check_invariants(maps);
 	up_write(maps__lock(maps));
+	if (removed) {
+#ifdef HAVE_LIBDW_SUPPORT
+		libdw__invalidate_dwfl(maps, maps__libdw_addr_space_dwfl(maps));
+#endif
+	}
 }
 
 struct symbol *maps__find_symbol(struct maps *maps, u64 addr, struct map **mapp)
diff --git a/tools/perf/util/maps.h b/tools/perf/util/maps.h
index d9aa62ed968a..20c52084ba9e 100644
--- a/tools/perf/util/maps.h
+++ b/tools/perf/util/maps.h
@@ -52,6 +52,10 @@ void maps__set_addr_space(struct maps *maps, void *addr_space);
 const struct unwind_libunwind_ops *maps__unwind_libunwind_ops(const struct maps *maps);
 void maps__set_unwind_libunwind_ops(struct maps *maps, const struct unwind_libunwind_ops *ops);
 #endif
+#ifdef HAVE_LIBDW_SUPPORT
+void *maps__libdw_addr_space_dwfl(const struct maps *maps);
+void maps__set_libdw_addr_space_dwfl(struct maps *maps, void *dwfl);
+#endif
 
 size_t maps__fprintf(struct maps *maps, FILE *fp);
 
diff --git a/tools/perf/util/unwind-libdw.c b/tools/perf/util/unwind-libdw.c
index e0321043af88..c1646ef5f971 100644
--- a/tools/perf/util/unwind-libdw.c
+++ b/tools/perf/util/unwind-libdw.c
@@ -20,6 +20,17 @@
 #include "callchain.h"
 #include "util/env.h"
 
+/*
+ * The dwfl thread argument passed to functions like memory_read. Memory has to
+ * be allocated to persist of multiple uses of the dwfl.
+ */
+struct dwfl_ui_thread_info {
+	/* Back link to the dwfl. */
+	Dwfl *dwfl;
+	/* The current unwind info, only 1 is supported. */
+	struct unwind_info *ui;
+};
+
 static char *debuginfo_path;
 
 static int __find_debuginfo(Dwfl_Module *mod __maybe_unused, void **userdata,
@@ -35,6 +46,19 @@ static int __find_debuginfo(Dwfl_Module *mod __maybe_unused, void **userdata,
 	return -1;
 }
 
+void libdw__invalidate_dwfl(struct maps *maps, void *arg)
+{
+	struct dwfl_ui_thread_info *dwfl_ui_ti = arg;
+
+	if (!dwfl_ui_ti)
+		return;
+
+	assert(dwfl_ui_ti->ui == NULL);
+	maps__set_libdw_addr_space_dwfl(maps, NULL);
+	dwfl_end(dwfl_ui_ti->dwfl);
+	free(dwfl_ui_ti);
+}
+
 static const Dwfl_Callbacks offline_callbacks = {
 	.find_debuginfo		= __find_debuginfo,
 	.debuginfo_path		= &debuginfo_path,
@@ -187,7 +211,8 @@ out_fail:
 static bool memory_read(Dwfl *dwfl __maybe_unused, Dwarf_Addr addr, Dwarf_Word *result,
 			void *arg)
 {
-	struct unwind_info *ui = arg;
+	struct dwfl_ui_thread_info *dwfl_ui_ti = arg;
+	struct unwind_info *ui = dwfl_ui_ti->ui;
 	uint16_t e_machine = thread__e_machine(ui->thread, ui->machine);
 	struct stack_dump *stack = &ui->sample->user_stack;
 	u64 start, end;
@@ -228,7 +253,8 @@ static bool memory_read(Dwfl *dwfl __maybe_unused, Dwarf_Addr addr, Dwarf_Word *
 
 static bool libdw_set_initial_registers(Dwfl_Thread *thread, void *arg)
 {
-	struct unwind_info *ui = arg;
+	struct dwfl_ui_thread_info *dwfl_ui_ti = arg;
+	struct unwind_info *ui = dwfl_ui_ti->ui;
 	struct regs_dump *user_regs = perf_sample__user_regs(ui->sample);
 	Dwarf_Word *dwarf_regs;
 	int max_dwarf_reg = 0;
@@ -320,33 +346,50 @@ int unwind__get_entries(unwind_entry_cb_t cb, void *arg,
 			int max_stack,
 			bool best_effort)
 {
-	struct machine *machine = maps__machine(thread__maps(thread));
+	struct maps *maps = thread__maps(thread);
+	struct machine *machine = maps__machine(maps);
 	uint16_t e_machine = thread__e_machine(thread, machine);
-	struct unwind_info *ui, ui_buf = {
-		.sample		= data,
-		.thread		= thread,
-		.machine	= machine,
-		.cb		= cb,
-		.arg		= arg,
-		.max_stack	= max_stack,
-		.e_machine	= e_machine,
-		.best_effort    = best_effort
-	};
+	struct dwfl_ui_thread_info *dwfl_ui_ti;
+	static struct unwind_info *ui;
+	Dwfl *dwfl;
 	Dwarf_Word ip;
 	int err = -EINVAL, i;
 
 	if (!data->user_regs || !data->user_regs->regs)
 		return -EINVAL;
 
-	ui = zalloc(sizeof(ui_buf) + sizeof(ui_buf.entries[0]) * max_stack);
+	ui = zalloc(sizeof(*ui) + sizeof(ui->entries[0]) * max_stack);
 	if (!ui)
 		return -ENOMEM;
 
-	*ui = ui_buf;
+	*ui = (struct unwind_info){
+		.sample		= data,
+		.thread		= thread,
+		.machine	= machine,
+		.cb		= cb,
+		.arg		= arg,
+		.max_stack	= max_stack,
+		.e_machine	= e_machine,
+		.best_effort    = best_effort
+	};
 
-	ui->dwfl = dwfl_begin(&offline_callbacks);
-	if (!ui->dwfl)
-		goto out;
+	dwfl_ui_ti = maps__libdw_addr_space_dwfl(maps);
+	if (dwfl_ui_ti) {
+		dwfl = dwfl_ui_ti->dwfl;
+	} else {
+		dwfl_ui_ti = zalloc(sizeof(*dwfl_ui_ti));
+		dwfl = dwfl_begin(&offline_callbacks);
+		if (!dwfl)
+			goto out;
+
+		dwfl_ui_ti->dwfl = dwfl;
+		maps__set_libdw_addr_space_dwfl(maps, dwfl_ui_ti);
+	}
+	assert(dwfl_ui_ti->ui == NULL);
+	assert(dwfl_ui_ti->dwfl == dwfl);
+	assert(dwfl_ui_ti == maps__libdw_addr_space_dwfl(maps));
+	dwfl_ui_ti->ui = ui;
+	ui->dwfl = dwfl;
 
 	err = perf_reg_value(&ip, data->user_regs, perf_arch_reg_ip(e_machine));
 	if (err)
@@ -356,11 +399,12 @@ int unwind__get_entries(unwind_entry_cb_t cb, void *arg,
 	if (err)
 		goto out;
 
-	err = !dwfl_attach_state(ui->dwfl, /*elf=*/NULL, thread__tid(thread), &callbacks, ui);
-	if (err)
-		goto out;
+	dwfl_attach_state(dwfl, /*elf=*/NULL, thread__tid(thread), &callbacks,
+			  /* Dwfl thread function argument*/dwfl_ui_ti);
+	// Ignore thread already attached error.
 
-	err = dwfl_getthread_frames(ui->dwfl, thread__tid(thread), frame_callback, ui);
+	err = dwfl_getthread_frames(dwfl, thread__tid(thread), frame_callback,
+				    /* Dwfl frame function argument*/ui);
 
 	if (err && ui->max_stack != max_stack)
 		err = 0;
@@ -384,7 +428,7 @@ int unwind__get_entries(unwind_entry_cb_t cb, void *arg,
 	for (i = 0; i < ui->idx; i++)
 		map_symbol__exit(&ui->entries[i].ms);
 
-	dwfl_end(ui->dwfl);
+	dwfl_ui_ti->ui = NULL;
 	free(ui);
 	return 0;
 }
diff --git a/tools/perf/util/unwind-libdw.h b/tools/perf/util/unwind-libdw.h
index 9c5b5fcaaae8..3dec0ab8bd50 100644
--- a/tools/perf/util/unwind-libdw.h
+++ b/tools/perf/util/unwind-libdw.h
@@ -2,15 +2,17 @@
 #ifndef __PERF_UNWIND_LIBDW_H
 #define __PERF_UNWIND_LIBDW_H
 
-#include <elfutils/libdwfl.h>
+#include <stdint.h>
 #include "unwind.h"
 
 struct machine;
 struct perf_sample;
 struct thread;
 
+#ifdef HAVE_LIBDW_SUPPORT
+
 struct unwind_info {
-	Dwfl			*dwfl;
+	void			*dwfl;
 	struct perf_sample      *sample;
 	struct machine          *machine;
 	struct thread           *thread;
@@ -23,4 +25,7 @@ struct unwind_info {
 	struct unwind_entry	entries[];
 };
 
+void libdw__invalidate_dwfl(struct maps *maps, void *dwfl);
+#endif
+
 #endif /* __PERF_UNWIND_LIBDW_H */
-- 
cgit v1.2.3


From 28cb835f7645892f4559b92fcfeb25a81646f4cf Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Fri, 16 Jan 2026 21:28:49 -0800
Subject: perf machine: Add inline information to frame pointer and LBR
 callchains
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Use append_inlines() in frame pointer and LBR cases.

Update the addr2line test to also test frame pointers.

LBR is also updated but inaccuracy in the branched to IP means the
inline information is missing in the leaf.

Leave LBR callchains untested for now.

Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Aditya Bodkhe <aditya.b1@linux.ibm.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Alexandre Ghiti <alex@ghiti.fr>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Athira Rajeev <atrajeev@linux.ibm.com>
Cc: Chun-Tse Shao <ctshao@google.com>
Cc: Dmitriy Vyukov <dvyukov@google.com>
Cc: Dr. David Alan Gilbert <linux@treblig.org>
Cc: Guo Ren <guoren@kernel.org>
Cc: Haibo Xu <haibo1.xu@intel.com>
Cc: Howard Chu <howardchu95@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Krzysztof Łopatowski <krzysztof.m.lopatowski@gmail.com>
Cc: Leo Yan <leo.yan@linux.dev>
Cc: Mark Wielaard <mark@klomp.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <pjw@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sergei Trofimovich <slyich@gmail.com>
Cc: Shimin Guo <shimin.guo@skydio.com>
Cc: Stephen Brennan <stephen.s.brennan@oracle.com>
Cc: Thomas Falcon <thomas.falcon@intel.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/tests/shell/addr2line_inlines.sh |  31 +++++++--
 tools/perf/util/machine.c                   | 104 ++++++++++++++++------------
 2 files changed, 86 insertions(+), 49 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/tests/shell/addr2line_inlines.sh b/tools/perf/tests/shell/addr2line_inlines.sh
index 4a5b6f5be23d..ce30d9c7e0bf 100755
--- a/tools/perf/tests/shell/addr2line_inlines.sh
+++ b/tools/perf/tests/shell/addr2line_inlines.sh
@@ -21,8 +21,28 @@ trap_cleanup() {
 }
 trap trap_cleanup EXIT TERM INT
 
-test_inlinedloop() {
-    echo "Inline unwinding verification test"
+test_fp() {
+    echo "Inline unwinding fp verification test"
+    # Record data. Currently only dwarf callchains support inlined functions.
+    perf record --call-graph fp -e task-clock:u -o "${perf_data}" -- perf test -w inlineloop 1
+
+    # Check output with inline (default) and srcline
+    perf script -i "${perf_data}" --fields +srcline > "${perf_script_txt}"
+
+    # Expect the leaf and middle functions to occur on lines in the 20s, with
+    # the non-inlined parent function on a line in the 30s.
+    if grep -q "inlineloop.c:2. (inlined)" "${perf_script_txt}" &&
+       grep -q "inlineloop.c:3.$" "${perf_script_txt}"
+    then
+        echo "Inline unwinding fp verification test [Success]"
+    else
+        echo "Inline unwinding fp verification test [Failed missing inlined functions]"
+        err=1
+    fi
+}
+
+test_dwarf() {
+    echo "Inline unwinding dwarf verification test"
     # Record data. Currently only dwarf callchains support inlined functions.
     perf record --call-graph dwarf -e task-clock:u -o "${perf_data}" -- perf test -w inlineloop 1
 
@@ -34,14 +54,15 @@ test_inlinedloop() {
     if grep -q "inlineloop.c:2. (inlined)" "${perf_script_txt}" &&
        grep -q "inlineloop.c:3.$" "${perf_script_txt}"
     then
-        echo "Inline unwinding verification test [Success]"
+        echo "Inline unwinding dwarf verification test [Success]"
     else
-        echo "Inline unwinding verification test [Failed missing inlined functions]"
+        echo "Inline unwinding dwarf verification test [Failed missing inlined functions]"
         err=1
     fi
 }
 
-test_inlinedloop
+test_fp
+test_dwarf
 
 cleanup
 exit $err
diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
index 841b711d970e..30d606fbf040 100644
--- a/tools/perf/util/machine.c
+++ b/tools/perf/util/machine.c
@@ -2090,6 +2090,59 @@ struct iterations {
 	u64 cycles;
 };
 
+static int append_inlines(struct callchain_cursor *cursor, struct map_symbol *ms, u64 ip,
+			bool branch, struct branch_flags *flags, int nr_loop_iter,
+			u64 iter_cycles, u64 branch_from)
+{
+	struct symbol *sym = ms->sym;
+	struct map *map = ms->map;
+	struct inline_node *inline_node;
+	struct inline_list *ilist;
+	struct dso *dso;
+	u64 addr;
+	int ret = 1;
+	struct map_symbol ilist_ms;
+	bool first = true;
+
+	if (!symbol_conf.inline_name || !map || !sym)
+		return ret;
+
+	addr = map__dso_map_ip(map, ip);
+	addr = map__rip_2objdump(map, addr);
+	dso = map__dso(map);
+
+	inline_node = inlines__tree_find(dso__inlined_nodes(dso), addr);
+	if (!inline_node) {
+		inline_node = dso__parse_addr_inlines(dso, addr, sym);
+		if (!inline_node)
+			return ret;
+		inlines__tree_insert(dso__inlined_nodes(dso), inline_node);
+	}
+
+	ilist_ms = (struct map_symbol) {
+		.maps = maps__get(ms->maps),
+		.map = map__get(map),
+	};
+	list_for_each_entry(ilist, &inline_node->val, list) {
+		ilist_ms.sym = ilist->symbol;
+		if (first) {
+			ret = callchain_cursor_append(cursor, ip, &ilist_ms,
+						      branch, flags, nr_loop_iter,
+						      iter_cycles, branch_from, ilist->srcline);
+		} else {
+			ret = callchain_cursor_append(cursor, ip, &ilist_ms, false,
+						      NULL, 0, 0, 0, ilist->srcline);
+		}
+		first = false;
+
+		if (ret != 0)
+			return ret;
+	}
+	map_symbol__exit(&ilist_ms);
+
+	return ret;
+}
+
 static int add_callchain_ip(struct thread *thread,
 			    struct callchain_cursor *cursor,
 			    struct symbol **parent,
@@ -2170,6 +2223,11 @@ static int add_callchain_ip(struct thread *thread,
 	ms.maps = maps__get(al.maps);
 	ms.map = map__get(al.map);
 	ms.sym = al.sym;
+
+	if (append_inlines(cursor, &ms, ip, branch, flags, nr_loop_iter,
+			   iter_cycles, branch_from) == 0)
+		goto out;
+
 	srcline = callchain_srcline(&ms, al.addr);
 	err = callchain_cursor_append(cursor, ip, &ms,
 				      branch, flags, nr_loop_iter,
@@ -2888,49 +2946,6 @@ check_calls:
 	return 0;
 }
 
-static int append_inlines(struct callchain_cursor *cursor, struct map_symbol *ms, u64 ip)
-{
-	struct symbol *sym = ms->sym;
-	struct map *map = ms->map;
-	struct inline_node *inline_node;
-	struct inline_list *ilist;
-	struct dso *dso;
-	u64 addr;
-	int ret = 1;
-	struct map_symbol ilist_ms;
-
-	if (!symbol_conf.inline_name || !map || !sym)
-		return ret;
-
-	addr = map__dso_map_ip(map, ip);
-	addr = map__rip_2objdump(map, addr);
-	dso = map__dso(map);
-
-	inline_node = inlines__tree_find(dso__inlined_nodes(dso), addr);
-	if (!inline_node) {
-		inline_node = dso__parse_addr_inlines(dso, addr, sym);
-		if (!inline_node)
-			return ret;
-		inlines__tree_insert(dso__inlined_nodes(dso), inline_node);
-	}
-
-	ilist_ms = (struct map_symbol) {
-		.maps = maps__get(ms->maps),
-		.map = map__get(map),
-	};
-	list_for_each_entry(ilist, &inline_node->val, list) {
-		ilist_ms.sym = ilist->symbol;
-		ret = callchain_cursor_append(cursor, ip, &ilist_ms, false,
-					      NULL, 0, 0, 0, ilist->srcline);
-
-		if (ret != 0)
-			return ret;
-	}
-	map_symbol__exit(&ilist_ms);
-
-	return ret;
-}
-
 static int unwind_entry(struct unwind_entry *entry, void *arg)
 {
 	struct callchain_cursor *cursor = arg;
@@ -2940,7 +2955,8 @@ static int unwind_entry(struct unwind_entry *entry, void *arg)
 	if (symbol_conf.hide_unresolved && entry->ms.sym == NULL)
 		return 0;
 
-	if (append_inlines(cursor, &entry->ms, entry->ip) == 0)
+	if (append_inlines(cursor, &entry->ms, entry->ip, /*branch=*/false, /*branch_flags=*/NULL,
+			   /*nr_loop_iter=*/0, /*iter_cycles=*/0, /*branch_from=*/0) == 0)
 		return 0;
 
 	/*
-- 
cgit v1.2.3


From 2e6690d4f7fc41c4fae7d0a4c0bf11f1973e5650 Mon Sep 17 00:00:00 2001
From: Gyutae Bae <gyutae.bae@navercorp.com>
Date: Tue, 20 Jan 2026 18:07:16 +0900
Subject: selftests/bpf: Add perfbuf multi-producer benchmark

Add a multi-producer benchmark for perfbuf to complement the existing
ringbuf multi-producer test. Unlike ringbuf which uses a shared buffer
and experiences contention, perfbuf uses per-CPU buffers so the test
measures scaling behavior rather than contention.

This allows developers to compare perfbuf vs ringbuf performance under
multi-producer workloads when choosing between the two for their systems.

Signed-off-by: Gyutae Bae <gyutae.bae@navercorp.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20260120090716.82927-1-gyutae.opensource@navercorp.com
---
 tools/testing/selftests/bpf/benchs/run_bench_ringbufs.sh | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/benchs/run_bench_ringbufs.sh b/tools/testing/selftests/bpf/benchs/run_bench_ringbufs.sh
index 83e05e837871..123b7feb6935 100755
--- a/tools/testing/selftests/bpf/benchs/run_bench_ringbufs.sh
+++ b/tools/testing/selftests/bpf/benchs/run_bench_ringbufs.sh
@@ -49,6 +49,11 @@ for b in 1 2 3 4 8 12 16 20 24 28 32 36 40 44 48 52; do
 	summarize "rb-libbpf nr_prod $b" "$($RUN_RB_BENCH -p$b --rb-batch-cnt 50 rb-libbpf)"
 done
 
+header "Perfbuf, multi-producer"
+for b in 1 2 3 4 8 12 16 20 24 28 32 36 40 44 48 52; do
+	summarize "pb-libbpf nr_prod $b" "$($RUN_RB_BENCH -p$b --rb-batch-cnt 50 --rb-sample-rate 50 pb-libbpf)"
+done
+
 header "Ringbuf, multi-producer contention in overwrite mode, no consumer"
 for b in 1 2 3 4 8 12 16 20 24 28 32 36 40 44 48 52; do
 	summarize "rb-prod nr_prod $b" "$($RUN_BENCH -p$b --rb-batch-cnt 50 --rb-overwrite --rb-bench-producer rb-libbpf)"
-- 
cgit v1.2.3


From 965ef09a26f3efe7dcdc4691860c1e85da353b77 Mon Sep 17 00:00:00 2001
From: Romain Gantois <romain.gantois@bootlin.com>
Date: Fri, 26 Dec 2025 08:54:32 +0100
Subject: tools: lib: thermal: Correct CFLAGS and LDFLAGS in pkg-config
 template

There are two issues with the current pkg-config template. Firstly, the
-lthermal linker flag is missing. Secondly, the libnl3 include directory
compiler flag references "include" instead of "includedir", which leads to
an unexpanded variable when pkg-config is called.

Add the missing -lthermal flag and correct the libnl3 include directory.

Signed-off-by: Romain Gantois <romain.gantois@bootlin.com>
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Link: https://patch.msgid.link/20251226-libthermal-pkgconfig-v1-1-3406de5ca8ea@bootlin.com
---
 tools/lib/thermal/libthermal.pc.template | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/thermal/libthermal.pc.template b/tools/lib/thermal/libthermal.pc.template
index ac24d0ab17f5..3b8a24d0a8b8 100644
--- a/tools/lib/thermal/libthermal.pc.template
+++ b/tools/lib/thermal/libthermal.pc.template
@@ -8,5 +8,5 @@ Name: libthermal
 Description: thermal library
 Requires: libnl-3.0 libnl-genl-3.0
 Version: @VERSION@
-Libs: -L${libdir} -lnl-genl-3 -lnl-3
-Cflags: -I${includedir} -I${include}/libnl3
+Libs: -L${libdir} -lnl-genl-3 -lnl-3 -lthermal
+Cflags: -I${includedir} -I${includedir}/libnl3
-- 
cgit v1.2.3


From 92ea788d2af4e65ad7a144ccfff50667e9a0d227 Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@kernel.org>
Date: Tue, 13 Jan 2026 15:29:02 -0800
Subject: perf inject: Add --convert-callchain option

There are applications not built with frame pointers, so DWARF is needed
to get the stack traces.

`perf record --call-graph dwarf` saves the stack and register data for
each sample to get the stacktrace offline.  But sometimes this data may
have sensitive information and we don't want to keep them in the file.

This new 'perf inject --convert-callchain' option creates the callchains
and discards the stack and register after that.

This saves storage space and processing time for the new data file.

Of course, users should remove the original data file to not keep
sensitive data around.  :)

The down side is that it cannot handle inlined callchain entries as they
all have the same IPs.

Maybe we can add an option to 'perf report' to look up inlined functions
using DWARF - IIUC it doesn't require stack and register data.

This is an example.

  $ perf record --call-graph dwarf -- perf test -w noploop

  $ perf report --stdio --no-children --percent-limit=0 > output-prev

  $ perf inject -i perf.data --convert-callchain -o perf.data.out

  $ perf report --stdio --no-children --percent-limit=0 -i perf.data.out > output-next

  $ diff -u output-prev output-next
  ...
        0.23%  perf          ld-linux-x86-64.so.2  [.] _dl_relocate_object_no_relro
               |
  -            ---elf_dynamic_do_Rela (inlined)
  -               _dl_relocate_object_no_relro
  +            ---_dl_relocate_object_no_relro
                  _dl_relocate_object
                  dl_main
                  _dl_sysdep_start
  -               _dl_start_final (inlined)
                  _dl_start
                  _start

Reviewed-by: Ian Rogers <irogers@google.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/Documentation/perf-inject.txt |   5 +
 tools/perf/builtin-inject.c              | 152 +++++++++++++++++++++++++++++++
 2 files changed, 157 insertions(+)

(limited to 'tools')

diff --git a/tools/perf/Documentation/perf-inject.txt b/tools/perf/Documentation/perf-inject.txt
index c972032f4ca0..95dfdf39666e 100644
--- a/tools/perf/Documentation/perf-inject.txt
+++ b/tools/perf/Documentation/perf-inject.txt
@@ -109,6 +109,11 @@ include::itrace.txt[]
 	should be used, and also --buildid-all and --switch-events may be
 	useful.
 
+--convert-callchain::
+	Parse DWARF callchains and convert them to usual callchains.  This also
+	discards stack and register data from the samples.  This will lose
+	inlined callchain entries.
+
 :GMEXAMPLECMD: inject
 :GMEXAMPLESUBCMD:
 include::guestmount.txt[]
diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c
index 6080afec537d..e2a653280e1b 100644
--- a/tools/perf/builtin-inject.c
+++ b/tools/perf/builtin-inject.c
@@ -122,6 +122,7 @@ struct perf_inject {
 	bool			in_place_update;
 	bool			in_place_update_dry_run;
 	bool			copy_kcore_dir;
+	bool			convert_callchain;
 	const char		*input_name;
 	struct perf_data	output;
 	u64			bytes_written;
@@ -133,6 +134,7 @@ struct perf_inject {
 	struct guest_session	guest_session;
 	struct strlist		*known_build_ids;
 	const struct evsel	*mmap_evsel;
+	struct ip_callchain	*raw_callchain;
 };
 
 struct event_entry {
@@ -383,6 +385,90 @@ static int perf_event__repipe_sample(const struct perf_tool *tool,
 	return perf_event__repipe_synth(tool, event);
 }
 
+static int perf_event__convert_sample_callchain(const struct perf_tool *tool,
+						union perf_event *event,
+						struct perf_sample *sample,
+						struct evsel *evsel,
+						struct machine *machine)
+{
+	struct perf_inject *inject = container_of(tool, struct perf_inject, tool);
+	struct callchain_cursor *cursor = get_tls_callchain_cursor();
+	union perf_event *event_copy = (void *)inject->event_copy;
+	struct callchain_cursor_node *node;
+	struct thread *thread;
+	u64 sample_type = evsel->core.attr.sample_type;
+	u32 sample_size = event->header.size;
+	u64 i, k;
+	int ret;
+
+	if (event_copy == NULL) {
+		inject->event_copy = malloc(PERF_SAMPLE_MAX_SIZE);
+		if (!inject->event_copy)
+			return -ENOMEM;
+
+		event_copy = (void *)inject->event_copy;
+	}
+
+	if (cursor == NULL)
+		return -ENOMEM;
+
+	callchain_cursor_reset(cursor);
+
+	thread = machine__find_thread(machine, sample->tid, sample->pid);
+	if (thread == NULL)
+		goto out;
+
+	/* this will parse DWARF using stack and register data */
+	ret = thread__resolve_callchain(thread, cursor, evsel, sample,
+					/*parent=*/NULL, /*root_al=*/NULL,
+					PERF_MAX_STACK_DEPTH);
+	thread__put(thread);
+	if (ret != 0)
+		goto out;
+
+	/* copy kernel callchain and context entries */
+	for (i = 0; i < sample->callchain->nr; i++) {
+		inject->raw_callchain->ips[i] = sample->callchain->ips[i];
+		if (sample->callchain->ips[i] == PERF_CONTEXT_USER) {
+			i++;
+			break;
+		}
+	}
+	if (i == 0 || inject->raw_callchain->ips[i - 1] != PERF_CONTEXT_USER)
+		inject->raw_callchain->ips[i++] = PERF_CONTEXT_USER;
+
+	node = cursor->first;
+	for (k = 0; k < cursor->nr && i < PERF_MAX_STACK_DEPTH; k++) {
+		if (machine__kernel_ip(machine, node->ip))
+			/* kernel IPs were added already */;
+		else if (node->ms.sym && node->ms.sym->inlined)
+			/* we can't handle inlined callchains */;
+		else
+			inject->raw_callchain->ips[i++] = node->ip;
+
+		node = node->next;
+	}
+
+	inject->raw_callchain->nr = i;
+	sample->callchain = inject->raw_callchain;
+
+out:
+	memcpy(event_copy, event, sizeof(event->header));
+
+	/* adjust sample size for stack and regs */
+	sample_size -= sample->user_stack.size;
+	sample_size -= (hweight64(evsel->core.attr.sample_regs_user) + 1) * sizeof(u64);
+	sample_size += (sample->callchain->nr + 1) * sizeof(u64);
+	event_copy->header.size = sample_size;
+
+	/* remove sample_type {STACK,REGS}_USER for synthesize */
+	sample_type &= ~(PERF_SAMPLE_STACK_USER | PERF_SAMPLE_REGS_USER);
+
+	perf_event__synthesize_sample(event_copy, sample_type,
+				      evsel->core.attr.read_format, sample);
+	return perf_event__repipe_synth(tool, event_copy);
+}
+
 static struct dso *findnew_dso(int pid, int tid, const char *filename,
 			       const struct dso_id *id, struct machine *machine)
 {
@@ -2270,6 +2356,15 @@ static int __cmd_inject(struct perf_inject *inject)
 		/* Allow space in the header for guest attributes */
 		output_data_offset += gs->session->header.data_offset;
 		output_data_offset = roundup(output_data_offset, 4096);
+	} else if (inject->convert_callchain) {
+		inject->tool.sample	= perf_event__convert_sample_callchain;
+		inject->tool.fork	= perf_event__repipe_fork;
+		inject->tool.comm	= perf_event__repipe_comm;
+		inject->tool.exit	= perf_event__repipe_exit;
+		inject->tool.mmap	= perf_event__repipe_mmap;
+		inject->tool.mmap2	= perf_event__repipe_mmap2;
+		inject->tool.ordered_events = true;
+		inject->tool.ordering_requires_timestamps = true;
 	}
 
 	if (!inject->itrace_synth_opts.set)
@@ -2322,6 +2417,23 @@ static int __cmd_inject(struct perf_inject *inject)
 				perf_header__set_feat(&session->header,
 						      HEADER_BRANCH_STACK);
 		}
+
+		/*
+		 * The converted data file won't have stack and registers.
+		 * Update the perf_event_attr to remove them before writing.
+		 */
+		if (inject->convert_callchain) {
+			struct evsel *evsel;
+
+			evlist__for_each_entry(session->evlist, evsel) {
+				evsel__reset_sample_bit(evsel, REGS_USER);
+				evsel__reset_sample_bit(evsel, STACK_USER);
+				evsel->core.attr.sample_regs_user = 0;
+				evsel->core.attr.sample_stack_user = 0;
+				evsel->core.attr.exclude_callchain_user = 0;
+			}
+		}
+
 		session->header.data_offset = output_data_offset;
 		session->header.data_size = inject->bytes_written;
 		perf_session__inject_header(session, session->evlist, fd, &inj_fc.fc,
@@ -2346,6 +2458,18 @@ static int __cmd_inject(struct perf_inject *inject)
 	return ret;
 }
 
+static bool evsel__has_dwarf_callchain(struct evsel *evsel)
+{
+	struct perf_event_attr *attr = &evsel->core.attr;
+	const u64 dwarf_callchain_flags =
+		PERF_SAMPLE_STACK_USER | PERF_SAMPLE_REGS_USER | PERF_SAMPLE_CALLCHAIN;
+
+	if (!attr->exclude_callchain_user)
+		return false;
+
+	return (attr->sample_type & dwarf_callchain_flags) == dwarf_callchain_flags;
+}
+
 int cmd_inject(int argc, const char **argv)
 {
 	struct perf_inject inject = {
@@ -2414,6 +2538,8 @@ int cmd_inject(int argc, const char **argv)
 		OPT_STRING(0, "guestmount", &symbol_conf.guestmount, "directory",
 			   "guest mount directory under which every guest os"
 			   " instance has a subdir"),
+		OPT_BOOLEAN(0, "convert-callchain", &inject.convert_callchain,
+			    "Generate callchains using DWARF and drop register/stack data"),
 		OPT_END()
 	};
 	const char * const inject_usage[] = {
@@ -2429,6 +2555,9 @@ int cmd_inject(int argc, const char **argv)
 
 #ifndef HAVE_JITDUMP
 	set_option_nobuild(options, 'j', "jit", "NO_LIBELF=1", true);
+#endif
+#ifndef HAVE_LIBDW_SUPPORT
+	set_option_nobuild(options, 0, "convert-callchain", "NO_LIBDW=1", true);
 #endif
 	argc = parse_options(argc, argv, options, inject_usage, 0);
 
@@ -2588,6 +2717,28 @@ int cmd_inject(int argc, const char **argv)
 		}
 	}
 
+	if (inject.convert_callchain) {
+		struct evsel *evsel;
+
+		if (inject.output.is_pipe || inject.session->data->is_pipe) {
+			pr_err("--convert-callchain cannot work with pipe\n");
+			goto out_delete;
+		}
+
+		evlist__for_each_entry(inject.session->evlist, evsel) {
+			if (!evsel__has_dwarf_callchain(evsel)) {
+				pr_err("--convert-callchain requires DWARF call graph.\n");
+				goto out_delete;
+			}
+		}
+
+		inject.raw_callchain = calloc(PERF_MAX_STACK_DEPTH, sizeof(u64));
+		if (inject.raw_callchain == NULL) {
+			pr_err("callchain allocation failed\n");
+			goto out_delete;
+		}
+	}
+
 #ifdef HAVE_JITDUMP
 	if (inject.jit_mode) {
 		inject.tool.mmap2	   = perf_event__repipe_mmap2;
@@ -2618,5 +2769,6 @@ out_close_output:
 	free(inject.itrace_synth_opts.vm_tm_corr_args);
 	free(inject.event_copy);
 	free(inject.guest_session.ev.event_buf);
+	free(inject.raw_callchain);
 	return ret;
 }
-- 
cgit v1.2.3


From b42c4dfe02af407d04375f070f56a818ed4c92ce Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@kernel.org>
Date: Tue, 13 Jan 2026 15:29:03 -0800
Subject: perf test: Add DWARF callchain conversion test

  $ perf test -vv "DWARF callchain"
   87: perf inject to convert DWARF callchains to regular ones:
  --- start ---
  test child forked, pid 1560328
  recording data with DWARF callchain
  [ perf record: Woken up 4 times to write data ]
  [ perf record: Captured and wrote 0.908 MB /tmp/perf-test.nM3WoW (105 samples) ]
  convert DWARF callchain using perf inject
  compare the both result excluding inlined functions
  ---- end(0) ----
   87: perf inject to convert DWARF callchains to regular ones         : Ok
  $

Reviewed-by: Ian Rogers <irogers@google.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/tests/shell/inject-callchain.sh | 45 ++++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)
 create mode 100755 tools/perf/tests/shell/inject-callchain.sh

(limited to 'tools')

diff --git a/tools/perf/tests/shell/inject-callchain.sh b/tools/perf/tests/shell/inject-callchain.sh
new file mode 100755
index 000000000000..a1cba8010f95
--- /dev/null
+++ b/tools/perf/tests/shell/inject-callchain.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+# perf inject to convert DWARF callchains to regular ones
+# SPDX-License-Identifier: GPL-2.0
+
+if ! perf check feature -q dwarf; then
+    echo "SKIP: DWARF support is not available"
+    exit 2
+fi
+
+TESTDATA=$(mktemp /tmp/perf-test.XXXXXX)
+
+err=0
+
+cleanup()
+{
+    trap - EXIT TERM INT
+    rm -f ${TESTDATA}*
+}
+
+trap_cleanup()
+{
+	cleanup
+	exit 1
+}
+
+trap trap_cleanup EXIT TERM INT
+
+echo "recording data with DWARF callchain"
+perf record -F 999 --call-graph dwarf -o "${TESTDATA}" -- perf test -w noploop
+
+echo "convert DWARF callchain using perf inject"
+perf inject -i "${TESTDATA}" --convert-callchain -o "${TESTDATA}.new"
+
+perf report -i "${TESTDATA}" --no-children -q --percent-limit=1 > ${TESTDATA}.out
+perf report -i "${TESTDATA}.new" --no-children -q --percent-limit=1 > ${TESTDATA}.new.out
+
+echo "compare the both result excluding inlined functions"
+if diff -u "${TESTDATA}.out" "${TESTDATA}.new.out" | grep "^- " | grep -qv "(inlined)"; then
+    echo "Found some differences"
+    diff -u "${TESTDATA}.out" "${TESTDATA}.new.out"
+    err=1
+fi
+
+cleanup
+exit $err
-- 
cgit v1.2.3


From 069e603d8248dac98b1ef2909e2f1c4169b9da11 Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@kernel.org>
Date: Tue, 13 Jan 2026 15:37:57 -0800
Subject: perf tools: Get debug info of DSO properly

The dso__debuginfo() just used the path name to open the file but it may
be outdated.  It should check build-ID and use the file in the build-ID
cache if available rather than just using the path name.

Let's factor out dso__get_filename() to avoid code duplicate.

Fixes: 53a61a6ca279165d ("perf annotate: Add dso__debuginfo() helper")
Reviewed-by: Ian Rogers <irogers@google.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/dso.c | 63 +++++++++++++++++++++++++++++++++++++++------------
 tools/perf/util/dso.h | 11 ++-------
 2 files changed, 50 insertions(+), 24 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/dso.c b/tools/perf/util/dso.c
index dce207c7f862..3b272a6fae24 100644
--- a/tools/perf/util/dso.c
+++ b/tools/perf/util/dso.c
@@ -112,7 +112,7 @@ bool dso__is_object_file(const struct dso *dso)
 
 int dso__read_binary_type_filename(const struct dso *dso,
 				   enum dso_binary_type type,
-				   char *root_dir, char *filename, size_t size)
+				   const char *root_dir, char *filename, size_t size)
 {
 	char build_id_hex[SBUILD_ID_SIZE];
 	int ret = 0;
@@ -561,20 +561,15 @@ char *dso__filename_with_chroot(const struct dso *dso, const char *filename)
 	return filename_with_chroot(nsinfo__pid(dso__nsinfo_const(dso)), filename);
 }
 
-static int __open_dso(struct dso *dso, struct machine *machine)
-	EXCLUSIVE_LOCKS_REQUIRED(_dso__data_open_lock)
+static char *dso__get_filename(struct dso *dso, const char *root_dir,
+			       bool *decomp)
 {
-	int fd = -EINVAL;
-	char *root_dir = (char *)"";
 	char *name = malloc(PATH_MAX);
-	bool decomp = false;
 
-	if (!name)
-		return -ENOMEM;
+	*decomp = false;
 
-	mutex_lock(dso__lock(dso));
-	if (machine)
-		root_dir = machine->root_dir;
+	if (name == NULL)
+		return NULL;
 
 	if (dso__read_binary_type_filename(dso, dso__binary_type(dso),
 					    root_dir, name, PATH_MAX))
@@ -599,20 +594,38 @@ static int __open_dso(struct dso *dso, struct machine *machine)
 		size_t len = sizeof(newpath);
 
 		if (dso__decompress_kmodule_path(dso, name, newpath, len) < 0) {
-			fd = -(*dso__load_errno(dso));
+			errno = *dso__load_errno(dso);
 			goto out;
 		}
 
-		decomp = true;
+		*decomp = true;
 		strcpy(name, newpath);
 	}
+	return name;
+
+out:
+	free(name);
+	return NULL;
+}
 
-	fd = do_open(name);
+static int __open_dso(struct dso *dso, struct machine *machine)
+	EXCLUSIVE_LOCKS_REQUIRED(_dso__data_open_lock)
+{
+	int fd = -EINVAL;
+	char *name;
+	bool decomp = false;
+
+	mutex_lock(dso__lock(dso));
+
+	name = dso__get_filename(dso, machine ? machine->root_dir : "", &decomp);
+	if (name)
+		fd = do_open(name);
+	else
+		fd = -errno;
 
 	if (decomp)
 		unlink(name);
 
-out:
 	mutex_unlock(dso__lock(dso));
 	free(name);
 	return fd;
@@ -1916,3 +1929,23 @@ const u8 *dso__read_symbol(struct dso *dso, const char *symfs_filename,
 	return __dso__read_symbol(dso, symfs_filename, start, len,
 				  out_buf, out_buf_len, is_64bit);
 }
+
+struct debuginfo *dso__debuginfo(struct dso *dso)
+{
+	char *name;
+	bool decomp = false;
+	struct debuginfo *dinfo = NULL;
+
+	mutex_lock(dso__lock(dso));
+
+	name = dso__get_filename(dso, "", &decomp);
+	if (name)
+		dinfo = debuginfo__new(name);
+
+	if (decomp)
+		unlink(name);
+
+	mutex_unlock(dso__lock(dso));
+	free(name);
+	return dinfo;
+}
diff --git a/tools/perf/util/dso.h b/tools/perf/util/dso.h
index 295388085031..ac725bc8ea74 100644
--- a/tools/perf/util/dso.h
+++ b/tools/perf/util/dso.h
@@ -784,7 +784,7 @@ int dso__kernel_module_get_build_id(struct dso *dso, const char *root_dir);
 
 char dso__symtab_origin(const struct dso *dso);
 int dso__read_binary_type_filename(const struct dso *dso, enum dso_binary_type type,
-				   char *root_dir, char *filename, size_t size);
+				   const char *root_dir, char *filename, size_t size);
 bool is_kernel_module(const char *pathname, int cpumode);
 bool dso__needs_decompress(struct dso *dso);
 int dso__decompress_kmodule_fd(struct dso *dso, const char *name);
@@ -933,14 +933,7 @@ u64 dso__findnew_global_type(struct dso *dso, u64 addr, u64 offset);
 bool perf_pid_map_tid(const char *dso_name, int *tid);
 bool is_perf_pid_map_name(const char *dso_name);
 
-/*
- * In the future, we may get debuginfo using build-ID (w/o path).
- * Add this helper is for the smooth conversion.
- */
-static inline struct debuginfo *dso__debuginfo(struct dso *dso)
-{
-	return debuginfo__new(dso__long_name(dso));
-}
+struct debuginfo *dso__debuginfo(struct dso *dso);
 
 const u8 *dso__read_symbol(struct dso *dso, const char *symfs_filename,
 			   const struct map *map, const struct symbol *sym,
-- 
cgit v1.2.3


From 4906eccbfae3eec58f7bf27f2921dc365bedafec Mon Sep 17 00:00:00 2001
From: Haiyue Wang <haiyuewa@163.com>
Date: Mon, 8 Dec 2025 10:15:14 +0800
Subject: perf tools: Add the legacy-cache.json to .gitignore

The commit 0012e0fa221bf9cc ("perf jevents: Add legacy-hardware and
legacy-cache json") will auto-generate: "pmu-events/arch/common/common/legacy-cache.json".

Reviewed-by: James Clark <james.clark@linaro.org>
Signed-off-by: Haiyue Wang <haiyuewa@163.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/.gitignore | 1 +
 1 file changed, 1 insertion(+)

(limited to 'tools')

diff --git a/tools/perf/.gitignore b/tools/perf/.gitignore
index b64302a76144..5c59f954f52a 100644
--- a/tools/perf/.gitignore
+++ b/tools/perf/.gitignore
@@ -36,6 +36,7 @@ config.mak.autogen
 util/intel-pt-decoder/inat-tables.c
 arch/*/include/generated/
 trace/beauty/generated/
+pmu-events/arch/common/common/legacy-cache.json
 pmu-events/pmu-events.c
 pmu-events/jevents
 pmu-events/metric_test.log
-- 
cgit v1.2.3


From a58807adbed5f532efb231e5490767f284f237c0 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Wed, 10 Dec 2025 11:01:41 -0800
Subject: perf tests kallsyms: Fix missed map__put()

Issue was caught by leak sanitizer and the test robot.

Fixes: 34e271ae55382fbd ("perf test: Add kallsyms split test")
Reported-by: kernel test robot <oliver.sang@intel.com>
Reviewed-by: James Clark <james.clark@linaro.org>
Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Closes: https://lore.kernel.org/oe-lkp/202512101502.f3819cd3-lkp@intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/tests/kallsyms-split.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'tools')

diff --git a/tools/perf/tests/kallsyms-split.c b/tools/perf/tests/kallsyms-split.c
index bbbc66957e5d..117ed3b70f63 100644
--- a/tools/perf/tests/kallsyms-split.c
+++ b/tools/perf/tests/kallsyms-split.c
@@ -148,6 +148,7 @@ static int test__kallsyms_split(struct test_suite *test __maybe_unused,
 	ret = TEST_OK;
 
 out:
+	map__put(map);
 	remove_proc_dir(0);
 	machine__exit(&m);
 	return ret;
-- 
cgit v1.2.3


From e524dda49340cb973c95fdfd6aa700eeb67aa128 Mon Sep 17 00:00:00 2001
From: Leo Yan <leo.yan@arm.com>
Date: Fri, 12 Dec 2025 17:24:59 +0000
Subject: perf mem: Simplify Arm SPE event config

Since configuration fields default to zero, the zero assignments are
redundant, remove them.

Reviewed-by: James Clark <james.clark@linaro.org>
Signed-off-by: Leo Yan <leo.yan@arm.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Al Grant <al.grant@arm.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mike Leach <mike.leach@linaro.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/arch/arm64/util/mem-events.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/arch/arm64/util/mem-events.c b/tools/perf/arch/arm64/util/mem-events.c
index 9f8da7937255..eaf00e0609c6 100644
--- a/tools/perf/arch/arm64/util/mem-events.c
+++ b/tools/perf/arch/arm64/util/mem-events.c
@@ -6,7 +6,7 @@
 #define E(t, n, s, l, a) { .tag = t, .name = n, .event_name = s, .ldlat = l, .aux_event = a }
 
 struct perf_mem_event perf_mem_events_arm[PERF_MEM_EVENTS__MAX] = {
-	E("spe-load",	"%s/ts_enable=1,pa_enable=1,load_filter=1,store_filter=0,min_latency=%u/",	NULL,	true,	0),
-	E("spe-store",	"%s/ts_enable=1,pa_enable=1,load_filter=0,store_filter=1/",			NULL,	false,	0),
+	E("spe-load",	"%s/ts_enable=1,pa_enable=1,load_filter=1,min_latency=%u/",	NULL,	true,	0),
+	E("spe-store",	"%s/ts_enable=1,pa_enable=1,store_filter=1/",			NULL,	false,	0),
 	E("spe-ldst",	"%s/ts_enable=1,pa_enable=1,load_filter=1,store_filter=1,min_latency=%u/",	NULL,	true,	0),
 };
-- 
cgit v1.2.3


From dc7fb075f7de33ee78a2133598215af6a87d7ab3 Mon Sep 17 00:00:00 2001
From: Leo Yan <leo.yan@arm.com>
Date: Fri, 12 Dec 2025 17:25:00 +0000
Subject: perf c2c: Update documentation for adding memory event table

Users may occasionally need to see which options are applied to memory
events.

This helps to understand the behavior of "perf c2c" and "perf mem", and
provides guidance for configuring memory event options directly.

Add a table to track memory events and their corresponding options, and
include the Arm SPE events in it.

Suggested-by: Al Grant <al.grant@arm.com>
Reviewed-by: James Clark <james.clark@linaro.org>
Signed-off-by: Leo Yan <leo.yan@arm.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mike Leach <mike.leach@linaro.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/Documentation/perf-c2c.txt | 51 +++++++++++++++++++++++++----------
 1 file changed, 37 insertions(+), 14 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/Documentation/perf-c2c.txt b/tools/perf/Documentation/perf-c2c.txt
index 40b0f71a2c44..e57a122b8719 100644
--- a/tools/perf/Documentation/perf-c2c.txt
+++ b/tools/perf/Documentation/perf-c2c.txt
@@ -160,20 +160,43 @@ Following perf record options are configured by default:
 
   -W,-d,--phys-data,--sample-cpu
 
-Unless specified otherwise with '-e' option, following events are monitored by
-default on Intel:
-
-  cpu/mem-loads,ldlat=30/P
-  cpu/mem-stores/P
-
-following on AMD:
-
-  ibs_op//
-
-and following on PowerPC:
-
-  cpu/mem-loads/
-  cpu/mem-stores/
+The following table lists the events monitored on different architectures.
+Unless specified otherwise with the -e option, the tool will select the
+default events.
+
+  +--------+---------------+-----------------+--------------------------------------------------------------------------------+
+  | Arch   | Configuration | Options         | Events                                                                         |
+  +--------+---------------+-----------------+--------------------------------------------------------------------------------+
+  | Intel  | Default       | -e ldlat-loads  | cpu/mem-loads,ldlat=30/P                                                       |
+  |        |               | -e ldlat-stores | cpu/mem-stores/P                                                               |
+  |        |---------------+-----------------+--------------------------------------------------------------------------------+
+  |        | Load only     | -e ldlat-loads  | cpu/mem-loads,ldlat=30/P                                                       |
+  |        |---------------+-----------------+--------------------------------------------------------------------------------+
+  |        | Store only    | -e ldlat-stores | cpu/mem-stores/P                                                               |
+  +--------+---------------+-----------------+--------------------------------------------------------------------------------+
+  | Intel  | Default       | -e ldlat-loads  | {cpu/mem-loads-aux/,cpu/mem-loads,ldlat=30/}:P                                 |
+  | with   |               | -e ldlat-stores | cpu/mem-stores/P                                                               |
+  | AUX    |--------------+------------------+--------------------------------------------------------------------------------+
+  |        | Load only     | -e ldlat-loads  | {cpu/mem-loads-aux/,cpu/mem-loads,ldlat=30/}:P                                 |
+  |        |---------------+-----------------+--------------------------------------------------------------------------------+
+  |        | Store only    | -e ldlat-stores | cpu/mem-stores/P                                                               |
+  +--------+---------------+-----------------+--------------------------------------------------------------------------------+
+  | AMD    | Default       | -e mem-ldst     | ibs_op// (without latency support)                                             |
+  |        |               |                 | ibs_op/ldlat=30/ (with latency support)                                        |
+  +--------+---------------+-----------------+--------------------------------------------------------------------------------+
+  | PowerPC| Default       | -e ldlat-loads  | cpu/mem-loads/                                                                 |
+  |        |               | -e ldlat-stores | cpu/mem-stores/                                                                |
+  |        |---------------+-----------------+--------------------------------------------------------------------------------+
+  |        | Load only     | -e ldlat-loads  | cpu/mem-loads/                                                                 |
+  |        |---------------+-----------------+--------------------------------------------------------------------------------+
+  |        | Store only    | -e ldlat-stores | cpu/mem-stores/                                                                |
+  +--------+---------------+-----------------+--------------------------------------------------------------------------------+
+  | Arm    | Default       | -e spe-ldst     | arm_spe_0/ts_enable=1,pa_enable=1,load_filter=1,store_filter=1,min_latency=30/ |
+  | SPE    |---------------+-----------------+--------------------------------------------------------------------------------+
+  |        | Load only     | -e spe-load     | arm_spe_0/ts_enable=1,pa_enable=1,load_filter=1,min_latency=30/                |
+  |        |---------------+-----------------+--------------------------------------------------------------------------------+
+  |        | Store only    | -e spe-store    | arm_spe_0/ts_enable=1,pa_enable=1,store_filter=1/                              |
+  +--------+---------------+-----------------+--------------------------------------------------------------------------------+
 
 User can pass any 'perf record' option behind '--' mark, like (to enable
 callchains and system wide monitoring):
-- 
cgit v1.2.3


From 9bb93278c35d658057f513ba25ffb2b5204c2b9e Mon Sep 17 00:00:00 2001
From: James Clark <james.clark@linaro.org>
Date: Thu, 4 Dec 2025 09:10:53 +0000
Subject: perf tools: Always uniquify event names

evlist__uniquify_evsel_names() only gets called in __parse_events() if
verbose is > 0. This means that the auto added "slots" events stay as
"slots" rather than being expanded to "cpu_core/slots/" unless Perf is
run in verbose mode. This is invisible to users when running Perf stat
because evlist__print_counters() always calls it regardless of verbose
mode before displaying.

The only thing this seems to affect is the test "Parsing of all PMU
events from sysfs" which fails when not run in verbose mode.
test__checkevent_pmu_events() always expects event names to be prefixed
with the pmu name, but this only happens for "slots" events after
evlist__uniquify_evsel_names() is called.

One fix could be to relax the test to accept the non prefixed name in
normal mode. But seeing as Perf stat uniquifies unconditionally, make
parse_events() do the same.

This fixes the following test failure:

  $ perf test "Parsing of all PMU events from sysfs"
  5.2: Parsing of all PMU events from sysfs                    : FAILED!
  $

Reviewed-by: Ian Rogers <irogers@google.com>
Signed-off-by: James Clark <james.clark@linaro.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Leo Yan <leo.yan@arm.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/parse-events.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index c8f2962a06c7..6e1185d7be1b 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -2247,12 +2247,12 @@ int __parse_events(struct evlist *evlist, const char *str, const char *pmu_filte
 	evlist__splice_list_tail(evlist, &parse_state.list);
 
 	if (ret2 && warn_if_reordered && !parse_state.wild_card_pmus) {
+		evlist__uniquify_evsel_names(evlist, &stat_config);
 		pr_warning("WARNING: events were regrouped to match PMUs\n");
 
 		if (verbose > 0) {
 			struct strbuf sb = STRBUF_INIT;
 
-			evlist__uniquify_evsel_names(evlist, &stat_config);
 			evlist__format_evsels(evlist, &sb, 2048);
 			pr_debug("evlist after sorting/fixing: '%s'\n", sb.buf);
 			strbuf_release(&sb);
-- 
cgit v1.2.3


From 838def24130540dcb7b846bdb2bad63ea4c3dd55 Mon Sep 17 00:00:00 2001
From: James Clark <james.clark@linaro.org>
Date: Thu, 4 Dec 2025 09:10:54 +0000
Subject: perf test: Add missing newlines in debug messages

These debug messages bleed into the next log line. Fix it by adding the
missing newlines.

Reviewed-by: Ian Rogers <irogers@google.com>
Signed-off-by: James Clark <james.clark@linaro.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Leo Yan <leo.yan@arm.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/tests/parse-events.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/tests/parse-events.c b/tools/perf/tests/parse-events.c
index 2bd622972114..1d3cc224fbc2 100644
--- a/tools/perf/tests/parse-events.c
+++ b/tools/perf/tests/parse-events.c
@@ -2627,7 +2627,7 @@ static int test_events(const struct evlist_test *events, int cnt)
 		pr_debug("running test %d '%s'\n", i, e.name);
 		test_ret = test_event(&e);
 		if (test_ret != TEST_OK) {
-			pr_debug("Event test failure: test %d '%s'", i, e.name);
+			pr_debug("Event test failure: test %d '%s'\n", i, e.name);
 			ret = combine_test_results(ret, test_ret);
 		}
 	}
@@ -2764,7 +2764,7 @@ static int test__pmu_events(struct test_suite *test __maybe_unused, int subtest
 
 			test_ret = test_event(&e);
 			if (test_ret != TEST_OK) {
-				pr_debug("Test PMU event failed for '%s'", name);
+				pr_debug("Test PMU event failed for '%s'\n", name);
 				ret = combine_test_results(ret, test_ret);
 			}
 
@@ -2790,7 +2790,7 @@ static int test__pmu_events(struct test_suite *test __maybe_unused, int subtest
 			e.check = test__checkevent_pmu_events_mix;
 			test_ret = test_event(&e);
 			if (test_ret != TEST_OK) {
-				pr_debug("Test PMU event failed for '%s'", name);
+				pr_debug("Test PMU event failed for '%s'\n", name);
 				ret = combine_test_results(ret, test_ret);
 			}
 		}
-- 
cgit v1.2.3


From a70493e2bb0878885aa7a8178162550270693eb1 Mon Sep 17 00:00:00 2001
From: James Clark <james.clark@linaro.org>
Date: Mon, 19 Jan 2026 10:18:35 +0000
Subject: perf cs-etm: Fix decoding for sparse CPU maps

The ETM decoder incorrectly assumed that auxtrace queue indices were
equivalent to CPU number. This assumption is used for inserting records
into the queue, and for fetching queues when given a CPU number. This
assumption held when Perf always opened a dummy event on every CPU, even
if the user provided a subset of CPUs on the commandline, resulting in
the indices aligning.

For example:

  # event : name = cs_etm//u, , id = { 2451, 2452 }, type = 11 (cs_etm), size = 136, config = 0x4010, { sample_period, samp>
  # event : name = dummy:u, , id = { 2453, 2454, 2455, 2456 }, type = 1 (PERF_TYPE_SOFTWARE), size = 136, config = 0x9 (PER>

  0 0 0x200 [0xd0]: PERF_RECORD_ID_INDEX nr: 6
  ... id: 2451  idx: 2  cpu: 2  tid: -1
  ... id: 2452  idx: 3  cpu: 3  tid: -1
  ... id: 2453  idx: 0  cpu: 0  tid: -1
  ... id: 2454  idx: 1  cpu: 1  tid: -1
  ... id: 2455  idx: 2  cpu: 2  tid: -1
  ... id: 2456  idx: 3  cpu: 3  tid: -1

Since commit 811082e4b668 ("perf parse-events: Support user CPUs mixed
with threads/processes") the dummy event no longer behaves in this way,
making the ETM event indices start from 0 on the first CPU recorded
regardless of its ID:

  # event : name = cs_etm//u, , id = { 771, 772 }, type = 11 (cs_etm), size = 144, config = 0x4010, { sample_period, sample>
  # event : name = dummy:u, , id = { 773, 774 }, type = 1 (PERF_TYPE_SOFTWARE), size = 144, config = 0x9 (PERF_COUNT_SW_DUM>

  0 0 0x200 [0x90]: PERF_RECORD_ID_INDEX nr: 4
  ... id: 771  idx: 0  cpu: 2  tid: -1
  ... id: 772  idx: 1  cpu: 3  tid: -1
  ... id: 773  idx: 0  cpu: 2  tid: -1
  ... id: 774  idx: 1  cpu: 3  tid: -1

This causes the following segfault when decoding:

  $ perf record -e cs_etm//u -C 2,3 -- true
  $ perf report

  perf: Segmentation fault
  -------- backtrace --------
  #0 0xaaaabf9fd020 in ui__signal_backtrace setup.c:110
  #1 0xffffab5c7930 in __kernel_rt_sigreturn [vdso][930]
  #2 0xaaaabfb68d30 in cs_etm_decoder__reset cs-etm-decoder.c:85
  #3 0xaaaabfb65930 in cs_etm__get_data_block cs-etm.c:2032
  #4 0xaaaabfb666fc in cs_etm__run_per_cpu_timeless_decoder cs-etm.c:2551
  #5 0xaaaabfb6692c in (cs_etm__process_timeless_queues cs-etm.c:2612
  #6 0xaaaabfb63390 in cs_etm__flush_events cs-etm.c:921
  #7 0xaaaabfb324c0 in auxtrace__flush_events auxtrace.c:2915
  #8 0xaaaabfaac378 in __perf_session__process_events session.c:2285
  #9 0xaaaabfaacc9c in perf_session__process_events session.c:2442
  #10 0xaaaabf8d3d90 in __cmd_report builtin-report.c:1085
  #11 0xaaaabf8d6944 in cmd_report builtin-report.c:1866
  #12 0xaaaabf95ebfc in run_builtin perf.c:351
  #13 0xaaaabf95eeb0 in handle_internal_command perf.c:404
  #14 0xaaaabf95f068 in run_argv perf.c:451
  #15 0xaaaabf95f390 in main perf.c:558
  #16 0xffffaab97400 in __libc_start_call_main libc_start_call_main.h:74
  #17 0xffffaab974d8 in __libc_start_main@@GLIBC_2.34 libc-start.c:128
  #18 0xaaaabf8aa8f0 in _start perf[7a8f0]

Fix it by inserting into the queues based on CPU number, rather than
using the index.

Fixes: 811082e4b668db96 ("perf parse-events: Support user CPUs mixed with threads/processes")
Signed-off-by: James Clark <james.clark@linaro.org>
Tested-by: Leo Yan <leo.yan@arm.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: coresight@lists.linaro.org
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mike Leach <mike.leach@linaro.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Suzuki Poulouse <suzuki.poulose@arm.com>
Cc: Thomas Falcon <thomas.falcon@intel.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/cs-etm.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c
index 25d56e0f1c07..12b55c2bc2ca 100644
--- a/tools/perf/util/cs-etm.c
+++ b/tools/perf/util/cs-etm.c
@@ -3086,7 +3086,7 @@ static int cs_etm__queue_aux_fragment(struct perf_session *session, off_t file_o
 
 	if (aux_offset >= auxtrace_event->offset &&
 	    aux_offset + aux_size <= auxtrace_event->offset + auxtrace_event->size) {
-		struct cs_etm_queue *etmq = etm->queues.queue_array[auxtrace_event->idx].priv;
+		struct cs_etm_queue *etmq = cs_etm__get_queue(etm, auxtrace_event->cpu);
 
 		/*
 		 * If this AUX event was inside this buffer somewhere, create a new auxtrace event
@@ -3095,6 +3095,7 @@ static int cs_etm__queue_aux_fragment(struct perf_session *session, off_t file_o
 		auxtrace_fragment.auxtrace = *auxtrace_event;
 		auxtrace_fragment.auxtrace.size = aux_size;
 		auxtrace_fragment.auxtrace.offset = aux_offset;
+		auxtrace_fragment.auxtrace.idx = etmq->queue_nr;
 		file_offset += aux_offset - auxtrace_event->offset + auxtrace_event->header.size;
 
 		pr_debug3("CS ETM: Queue buffer size: %#"PRI_lx64" offset: %#"PRI_lx64
-- 
cgit v1.2.3


From 3d020f2e3baea49f68c71f73ebb947da8e6fedc5 Mon Sep 17 00:00:00 2001
From: James Clark <james.clark@linaro.org>
Date: Mon, 19 Jan 2026 10:18:36 +0000
Subject: perf cs-etm: Test sparse CPU maps

We only currently test with default (all CPUs) or --per-thread mode.

Different permutations of the "-C" option can affect decoding so add
tests for some of them.

Signed-off-by: James Clark <james.clark@linaro.org>
Tested-by: Leo Yan <leo.yan@arm.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Leo Yan <leo.yan@linux.dev>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mike Leach <mike.leach@linaro.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Suzuki Poulouse <suzuki.poulose@arm.com>
Cc: Thomas Falcon <thomas.falcon@intel.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/tests/shell/test_arm_coresight.sh | 54 ++++++++++++++++++++++++++++
 1 file changed, 54 insertions(+)

(limited to 'tools')

diff --git a/tools/perf/tests/shell/test_arm_coresight.sh b/tools/perf/tests/shell/test_arm_coresight.sh
index 1c750b67d141..bbf89e944e7b 100755
--- a/tools/perf/tests/shell/test_arm_coresight.sh
+++ b/tools/perf/tests/shell/test_arm_coresight.sh
@@ -198,6 +198,58 @@ arm_cs_etm_basic_test() {
 	arm_cs_report "CoreSight basic testing with '$*'" $err
 }
 
+arm_cs_etm_test_cpu_list() {
+	echo "Testing sparse CPU list: $1"
+	perf record -o ${perfdata} -e cs_etm//u -C $1 \
+		-- taskset --cpu-list $1 true > /dev/null 2>&1
+	perf_script_branch_samples true
+	err=$?
+	arm_cs_report "CoreSight sparse CPUs with '$*'" $err
+}
+
+arm_cs_etm_sparse_cpus_test() {
+	# Iterate for every ETM device
+	cpus=()
+	for dev in /sys/bus/event_source/devices/cs_etm/cpu*; do
+		# Canonicalize the path
+		dev=`readlink -f $dev`
+
+		# Find the ETM device belonging to which CPU
+		cpus+=("$(cat $dev/cpu)")
+	done
+
+	mapfile -t cpus < <(printf '%s\n' "${cpus[@]}" | sort -n)
+	total=${#cpus[@]}
+
+	# Need more than 1 to test
+	if [ $total -le 1 ]; then
+		return 0
+	fi
+
+	half=$((total / 2))
+
+	# First half
+	first_half=$(IFS=,; echo "${cpus[*]:0:$half}")
+	arm_cs_etm_test_cpu_list $first_half
+
+	# Second half
+	second_half=$(IFS=,; echo "${cpus[*]:$half}")
+	arm_cs_etm_test_cpu_list $second_half
+
+	# Odd list is the same as halves unless >= 4 CPUs
+	if [ $total -lt 4 ]; then
+		return 0
+	fi
+
+	# Odd indices
+	odd_cpus=()
+	for ((i=1; i<total; i+=2)); do
+		odd_cpus+=("${cpus[$i]}")
+	done
+	odd_list=$(IFS=,; echo "${odd_cpus[*]}")
+	arm_cs_etm_test_cpu_list $odd_list
+}
+
 arm_cs_etm_traverse_path_test
 arm_cs_etm_system_wide_test
 arm_cs_etm_snapshot_test
@@ -211,4 +263,6 @@ arm_cs_etm_basic_test -e cs_etm/timestamp=1/ -a
 arm_cs_etm_basic_test -e cs_etm/timestamp=0/
 arm_cs_etm_basic_test -e cs_etm/timestamp=1/
 
+arm_cs_etm_sparse_cpus_test
+
 exit $glb_err
-- 
cgit v1.2.3


From 292eca3163218f2185a8eabe59f4a576bb9e05f8 Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Date: Mon, 19 Jan 2026 13:04:56 +0100
Subject: docs: kdoc: fix logic to handle unissued warnings

Changeset 469c1c9eb6c9 ("kernel-doc: Issue warnings that were silently discarded")
didn't properly addressed the missing messages behavior, as
it was calling directly python logger low-level function,
instead of using the expected method to emit warnings.

Basically, there are two methods to log messages:

- self.config.log.warning() - This is the raw level to emit a
  warning. It just writes the a message at stderr, via python
  logging, as it is initialized as:

    self.config.log = logging.getLogger("kernel-doc")

- self.config.warning() - This is where we actually consider a
  message as a warning, properly incrementing error count.

Due to that, several parsing error messages are internally considered
as success, causing -Werror to not work on such messages.

While here, ensure that the last ignored entry will also be handled
by adding an extra check at the end of the parse handler.

Fixes: 469c1c9eb6c9 ("kernel-doc: Issue warnings that were silently discarded")
Closes: https://lore.kernel.org/linux-doc/20260112091053.00cee29a@foz.lan/
Cc: stable@vger.kernel.org
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Acked-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
Message-ID: <95109a6585171da4d6900049deaa2634b41ee743.1768823489.git.mchehab+huawei@kernel.org>
---
 tools/lib/python/kdoc/kdoc_parser.py | 35 ++++++++++++++++++++++++++++-------
 1 file changed, 28 insertions(+), 7 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/python/kdoc/kdoc_parser.py b/tools/lib/python/kdoc/kdoc_parser.py
index a9a37519145d..c03505889dc2 100644
--- a/tools/lib/python/kdoc/kdoc_parser.py
+++ b/tools/lib/python/kdoc/kdoc_parser.py
@@ -295,7 +295,7 @@ class KernelEntry:
 
     # TODO: rename to emit_message after removal of kernel-doc.pl
     def emit_msg(self, ln, msg, *, warning=True):
-        """Emit a message"""
+        """Emit a message."""
 
         log_msg = f"{self.fname}:{ln} {msg}"
 
@@ -448,18 +448,37 @@ class KernelDoc:
 
         self.config.log.debug("Output: %s:%s = %s", dtype, name, pformat(args))
 
+    def emit_unused_warnings(self):
+        """
+        When the parser fails to produce a valid entry, it places some
+        warnings under `entry.warnings` that will be discarded when resetting
+        the state.
+
+        Ensure that those warnings are not lost.
+
+        .. note::
+
+              Because we are calling `config.warning()` here, those
+              warnings are not filtered by the `-W` parameters: they will all
+              be produced even when `-Wreturn`, `-Wshort-desc`, and/or
+              `-Wcontents-before-sections` are used.
+
+              Allowing those warnings to be filtered is complex, because it
+              would require storing them in a buffer and then filtering them
+              during the output step of the code, depending on the
+              selected symbols.
+        """
+        if self.entry and self.entry not in self.entries:
+            for log_msg in self.entry.warnings:
+                self.config.warning(log_msg)
+
     def reset_state(self, ln):
         """
         Ancillary routine to create a new entry. It initializes all
         variables used by the state machine.
         """
 
-        #
-        # Flush the warnings out before we proceed further
-        #
-        if self.entry and self.entry not in self.entries:
-            for log_msg in self.entry.warnings:
-                self.config.log.warning(log_msg)
+        self.emit_unused_warnings()
 
         self.entry = KernelEntry(self.config, self.fname, ln)
 
@@ -1741,6 +1760,8 @@ class KernelDoc:
                         # Hand this line to the appropriate state handler
                         self.state_actions[self.state](self, ln, line)
 
+            self.emit_unused_warnings()
+
         except OSError:
             self.config.log.error(f"Error: Cannot open file {self.fname}")
 
-- 
cgit v1.2.3


From eba6ffd126cd52358181ed5a179644a161f9c65f Mon Sep 17 00:00:00 2001
From: Jonathan Corbet <corbet@lwn.net>
Date: Mon, 19 Jan 2026 13:05:01 +0100
Subject: docs: kdoc: move kernel-doc to tools/docs

kernel-doc is the last documentation-related tool still living outside of
the tools/docs directory; the time has come to move it over.

[mchehab: fixed kdoc lib location]

Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
Message-ID: <311d17e403524349940a8b12de6b5e91e554b1f4.1768823489.git.mchehab+huawei@kernel.org>
---
 Documentation/conf.py                              |   2 +-
 Documentation/doc-guide/kernel-doc.rst             |   8 +-
 Documentation/kbuild/kbuild.rst                    |   2 +-
 Documentation/process/coding-style.rst             |   2 +-
 .../translations/it_IT/doc-guide/kernel-doc.rst    |   8 +-
 .../translations/sp_SP/process/coding-style.rst    |   2 +-
 .../translations/zh_CN/doc-guide/kernel-doc.rst    |  10 +-
 Documentation/translations/zh_CN/kbuild/kbuild.rst |   2 +-
 .../translations/zh_CN/process/coding-style.rst    |   2 +-
 .../translations/zh_TW/process/coding-style.rst    |   2 +-
 MAINTAINERS                                        |   2 -
 Makefile                                           |   2 +-
 drivers/gpu/drm/i915/Makefile                      |   2 +-
 scripts/kernel-doc                                 |   1 -
 scripts/kernel-doc.py                              | 360 ---------------------
 tools/docs/find-unused-docs.sh                     |   2 +-
 tools/docs/kernel-doc                              | 360 +++++++++++++++++++++
 tools/docs/sphinx-build-wrapper                    |   2 +-
 18 files changed, 384 insertions(+), 387 deletions(-)
 delete mode 120000 scripts/kernel-doc
 delete mode 100755 scripts/kernel-doc.py
 create mode 100755 tools/docs/kernel-doc

(limited to 'tools')

diff --git a/Documentation/conf.py b/Documentation/conf.py
index 16d025af1f30..652be9221246 100644
--- a/Documentation/conf.py
+++ b/Documentation/conf.py
@@ -585,7 +585,7 @@ pdf_documents = [
 # kernel-doc extension configuration for running Sphinx directly (e.g. by Read
 # the Docs). In a normal build, these are supplied from the Makefile via command
 # line arguments.
-kerneldoc_bin = "../scripts/kernel-doc.py"
+kerneldoc_bin = "../tools/docs/kernel-doc"  # Not used now
 kerneldoc_srctree = ".."
 
 def setup(app):
diff --git a/Documentation/doc-guide/kernel-doc.rst b/Documentation/doc-guide/kernel-doc.rst
index b56128d7f5c3..8d2c09fb36e4 100644
--- a/Documentation/doc-guide/kernel-doc.rst
+++ b/Documentation/doc-guide/kernel-doc.rst
@@ -54,7 +54,7 @@ Running the ``kernel-doc`` tool with increased verbosity and without actual
 output generation may be used to verify proper formatting of the
 documentation comments. For example::
 
-	scripts/kernel-doc -v -none drivers/foo/bar.c
+	tools/docs/kernel-doc -v -none drivers/foo/bar.c
 
 The documentation format of ``.c`` files is also verified by the kernel build
 when it is requested to perform extra gcc checks::
@@ -365,7 +365,7 @@ differentiated by whether the macro name is immediately followed by a
 left parenthesis ('(') for function-like macros or not followed by one
 for object-like macros.
 
-Function-like macros are handled like functions by ``scripts/kernel-doc``.
+Function-like macros are handled like functions by ``tools/docs/kernel-doc``.
 They may have a parameter list. Object-like macros have do not have a
 parameter list.
 
@@ -596,8 +596,8 @@ from the source file.
 
 The kernel-doc extension is included in the kernel source tree, at
 ``Documentation/sphinx/kerneldoc.py``. Internally, it uses the
-``scripts/kernel-doc`` script to extract the documentation comments from the
-source.
+``tools/docs/kernel-doc`` script to extract the documentation comments from
+the source.
 
 .. _kernel_doc:
 
diff --git a/Documentation/kbuild/kbuild.rst b/Documentation/kbuild/kbuild.rst
index 82826b0332df..5a9013bacfb7 100644
--- a/Documentation/kbuild/kbuild.rst
+++ b/Documentation/kbuild/kbuild.rst
@@ -180,7 +180,7 @@ architecture.
 KDOCFLAGS
 ---------
 Specify extra (warning/error) flags for kernel-doc checks during the build,
-see scripts/kernel-doc for which flags are supported. Note that this doesn't
+see tools/docs/kernel-doc for which flags are supported. Note that this doesn't
 (currently) apply to documentation builds.
 
 ARCH
diff --git a/Documentation/process/coding-style.rst b/Documentation/process/coding-style.rst
index 258158637f65..35b381230f6e 100644
--- a/Documentation/process/coding-style.rst
+++ b/Documentation/process/coding-style.rst
@@ -614,7 +614,7 @@ it.
 
 When commenting the kernel API functions, please use the kernel-doc format.
 See the files at :ref:`Documentation/doc-guide/ <doc_guide>` and
-``scripts/kernel-doc`` for details. Note that the danger of over-commenting
+``tools/docs/kernel-doc`` for details. Note that the danger of over-commenting
 applies to kernel-doc comments all the same. Do not add boilerplate
 kernel-doc which simply reiterates what's obvious from the signature
 of the function.
diff --git a/Documentation/translations/it_IT/doc-guide/kernel-doc.rst b/Documentation/translations/it_IT/doc-guide/kernel-doc.rst
index aa0e31d353d6..bac959b8b7b9 100644
--- a/Documentation/translations/it_IT/doc-guide/kernel-doc.rst
+++ b/Documentation/translations/it_IT/doc-guide/kernel-doc.rst
@@ -80,7 +80,7 @@ Al fine di verificare che i commenti siano formattati correttamente, potete
 eseguire il programma ``kernel-doc`` con un livello di verbosità alto e senza
 che questo produca alcuna documentazione. Per esempio::
 
-	scripts/kernel-doc -v -none drivers/foo/bar.c
+	tools/docs/kernel-doc -v -none drivers/foo/bar.c
 
 Il formato della documentazione è verificato della procedura di generazione
 del kernel quando viene richiesto di effettuare dei controlli extra con GCC::
@@ -378,7 +378,7 @@ distinguono in base al fatto che il nome della macro simile a funzione sia
 immediatamente seguito da una parentesi sinistra ('(') mentre in quelle simili a
 oggetti no.
 
-Le macro simili a funzioni sono gestite come funzioni da ``scripts/kernel-doc``.
+Le macro simili a funzioni sono gestite come funzioni da ``tools/docs/kernel-doc``.
 Possono avere un elenco di parametri. Le macro simili a oggetti non hanno un
 elenco di parametri.
 
@@ -595,7 +595,7 @@ documentazione presenti nel file sorgente (*source*).
 
 L'estensione kernel-doc fa parte dei sorgenti del kernel, la si può trovare
 in ``Documentation/sphinx/kerneldoc.py``. Internamente, viene utilizzato
-lo script ``scripts/kernel-doc`` per estrarre i commenti di documentazione
+lo script ``tools/docs/kernel-doc`` per estrarre i commenti di documentazione
 dai file sorgenti.
 
 Come utilizzare kernel-doc per generare pagine man
@@ -604,4 +604,4 @@ Come utilizzare kernel-doc per generare pagine man
 Se volete utilizzare kernel-doc solo per generare delle pagine man, potete
 farlo direttamente dai sorgenti del kernel::
 
-  $ scripts/kernel-doc -man $(git grep -l '/\*\*' -- :^Documentation :^tools) | scripts/split-man.pl /tmp/man
+  $ tools/docs/kernel-doc -man $(git grep -l '/\*\*' -- :^Documentation :^tools) | scripts/split-man.pl /tmp/man
diff --git a/Documentation/translations/sp_SP/process/coding-style.rst b/Documentation/translations/sp_SP/process/coding-style.rst
index 025223be9706..7d63aa8426e6 100644
--- a/Documentation/translations/sp_SP/process/coding-style.rst
+++ b/Documentation/translations/sp_SP/process/coding-style.rst
@@ -633,7 +633,7 @@ posiblemente POR QUÉ hace esto.
 
 Al comentar las funciones de la API del kernel, utilice el formato
 kernel-doc. Consulte los archivos en :ref:`Documentation/doc-guide/ <doc_guide>`
-y ``scripts/kernel-doc`` para más detalles.
+y ``tools/docs/kernel-doc`` para más detalles.
 
 El estilo preferido para comentarios largos (de varias líneas) es:
 
diff --git a/Documentation/translations/zh_CN/doc-guide/kernel-doc.rst b/Documentation/translations/zh_CN/doc-guide/kernel-doc.rst
index ccfb9b8329c2..fb2bbaaa85c1 100644
--- a/Documentation/translations/zh_CN/doc-guide/kernel-doc.rst
+++ b/Documentation/translations/zh_CN/doc-guide/kernel-doc.rst
@@ -43,7 +43,7 @@ kernel-doc注释用 ``/**`` 作为开始标记。 ``kernel-doc`` 工具将提取
 用详细模式和不生成实际输出来运行 ``kernel-doc`` 工具，可以验证文档注释的格式
 是否正确。例如::
 
-	scripts/kernel-doc -v -none drivers/foo/bar.c
+	tools/docs/kernel-doc -v -none drivers/foo/bar.c
 
 当请求执行额外的gcc检查时，内核构建将验证文档格式::
 
@@ -473,7 +473,7 @@ doc: *title*
 如果没有选项，kernel-doc指令将包含源文件中的所有文档注释。
 
 kernel-doc扩展包含在内核源代码树中，位于 ``Documentation/sphinx/kerneldoc.py`` 。
-在内部，它使用 ``scripts/kernel-doc`` 脚本从源代码中提取文档注释。
+在内部，它使用 ``tools/docs/kernel-doc`` 脚本从源代码中提取文档注释。
 
 .. _kernel_doc_zh:
 
@@ -482,18 +482,18 @@ kernel-doc扩展包含在内核源代码树中，位于 ``Documentation/sphinx/k
 
 如果您只想使用kernel-doc生成手册页，可以从内核git树这样做::
 
-  $ scripts/kernel-doc -man \
+  $ tools/docs/kernel-doc -man \
     $(git grep -l '/\*\*' -- :^Documentation :^tools) \
     | scripts/split-man.pl /tmp/man
 
 一些旧版本的git不支持路径排除语法的某些变体。
 以下命令之一可能适用于这些版本::
 
-  $ scripts/kernel-doc -man \
+  $ tools/docs/kernel-doc -man \
     $(git grep -l '/\*\*' -- . ':!Documentation' ':!tools') \
     | scripts/split-man.pl /tmp/man
 
-  $ scripts/kernel-doc -man \
+  $ tools/docs/kernel-doc -man \
     $(git grep -l '/\*\*' -- . ":(exclude)Documentation" ":(exclude)tools") \
     | scripts/split-man.pl /tmp/man
 
diff --git a/Documentation/translations/zh_CN/kbuild/kbuild.rst b/Documentation/translations/zh_CN/kbuild/kbuild.rst
index 57f5cf5b2cdd..a477b4b08958 100644
--- a/Documentation/translations/zh_CN/kbuild/kbuild.rst
+++ b/Documentation/translations/zh_CN/kbuild/kbuild.rst
@@ -174,7 +174,7 @@ UTS_MACHINE 变量（在某些架构中还包括内核配置）来猜测正确
 KDOCFLAGS
 ---------
 指定在构建过程中用于 kernel-doc 检查的额外（警告/错误）标志，查看
-scripts/kernel-doc 了解支持的标志。请注意，这目前不适用于文档构建。
+tools/docs/kernel-doc 了解支持的标志。请注意，这目前不适用于文档构建。
 
 ARCH
 ----
diff --git a/Documentation/translations/zh_CN/process/coding-style.rst b/Documentation/translations/zh_CN/process/coding-style.rst
index 0484d0c65c25..5a342a024c01 100644
--- a/Documentation/translations/zh_CN/process/coding-style.rst
+++ b/Documentation/translations/zh_CN/process/coding-style.rst
@@ -545,7 +545,7 @@ Linux 里这是提倡的做法，因为这样可以很简单的给读者提供
 也可以加上它做这些事情的原因。
 
 当注释内核 API 函数时，请使用 kernel-doc 格式。详见
-Documentation/translations/zh_CN/doc-guide/index.rst 和 scripts/kernel-doc 。
+Documentation/translations/zh_CN/doc-guide/index.rst 和 tools/docs/kernel-doc 。
 
 长 (多行) 注释的首选风格是：
 
diff --git a/Documentation/translations/zh_TW/process/coding-style.rst b/Documentation/translations/zh_TW/process/coding-style.rst
index 311c6f6bad0b..e2ba97b3d8bb 100644
--- a/Documentation/translations/zh_TW/process/coding-style.rst
+++ b/Documentation/translations/zh_TW/process/coding-style.rst
@@ -548,7 +548,7 @@ Linux 裏這是提倡的做法，因爲這樣可以很簡單的給讀者提供
 也可以加上它做這些事情的原因。
 
 當註釋內核 API 函數時，請使用 kernel-doc 格式。詳見
-Documentation/translations/zh_CN/doc-guide/index.rst 和 scripts/kernel-doc 。
+Documentation/translations/zh_CN/doc-guide/index.rst 和 tools/docs/kernel-doc 。
 
 長 (多行) 註釋的首選風格是：
 
diff --git a/MAINTAINERS b/MAINTAINERS
index 02ec226dd571..d009e2da2215 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7523,7 +7523,6 @@ S:	Maintained
 P:	Documentation/doc-guide/maintainer-profile.rst
 T:	git git://git.lwn.net/linux.git docs-next
 F:	Documentation/
-F:	scripts/kernel-doc*
 F:	tools/lib/python/*
 F:	tools/docs/
 F:	tools/net/ynl/pyynl/lib/doc_generator.py
@@ -7561,7 +7560,6 @@ M:	Mauro Carvalho Chehab <mchehab@kernel.org>
 L:	linux-doc@vger.kernel.org
 S:	Maintained
 F:	Documentation/sphinx/
-F:	scripts/kernel-doc*
 F:	tools/lib/python/*
 F:	tools/docs/
 
diff --git a/Makefile b/Makefile
index 3cd00b62cde9..81a4ab11256c 100644
--- a/Makefile
+++ b/Makefile
@@ -460,7 +460,7 @@ HOSTPKG_CONFIG	= pkg-config
 
 # the KERNELDOC macro needs to be exported, as scripts/Makefile.build
 # has a logic to call it
-KERNELDOC       = $(srctree)/scripts/kernel-doc.py
+KERNELDOC       = $(srctree)/tools/docs/kernel-doc
 export KERNELDOC
 
 KBUILD_USERHOSTCFLAGS := -Wall -Wmissing-prototypes -Wstrict-prototypes \
diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
index 4db24050edb0..c979c579de66 100644
--- a/drivers/gpu/drm/i915/Makefile
+++ b/drivers/gpu/drm/i915/Makefile
@@ -443,7 +443,7 @@ always-$(CONFIG_DRM_I915_WERROR) += \
 
 quiet_cmd_hdrtest = HDRTEST $(patsubst %.hdrtest,%.h,$@)
       cmd_hdrtest = $(CC) $(filter-out $(CFLAGS_GCOV), $(c_flags)) -S -o /dev/null -x c /dev/null -include $<; \
-		$(srctree)/scripts/kernel-doc -none -Werror $<; touch $@
+		$(KERNELDOC) -none -Werror $<; touch $@
 
 $(obj)/%.hdrtest: $(src)/%.h FORCE
 	$(call if_changed_dep,hdrtest)
diff --git a/scripts/kernel-doc b/scripts/kernel-doc
deleted file mode 120000
index 3b6ef807791a..000000000000
--- a/scripts/kernel-doc
+++ /dev/null
@@ -1 +0,0 @@
-kernel-doc.py
\ No newline at end of file
diff --git a/scripts/kernel-doc.py b/scripts/kernel-doc.py
deleted file mode 100755
index 4e3b9cfe3fd7..000000000000
--- a/scripts/kernel-doc.py
+++ /dev/null
@@ -1,360 +0,0 @@
-#!/usr/bin/env python3
-# SPDX-License-Identifier: GPL-2.0
-# Copyright(c) 2025: Mauro Carvalho Chehab <mchehab@kernel.org>.
-#
-# pylint: disable=C0103,R0912,R0914,R0915
-#
-# NOTE: While kernel-doc requires at least version 3.6 to run, the
-#       command line should work with Python 3.2+ (tested with 3.4).
-#       The rationale is that it shall fail gracefully during Kernel
-#       compilation with older Kernel versions. Due to that:
-#       - encoding line is needed here;
-#       - f-strings cannot be used in this file.
-#       - libraries that require newer versions can only be included
-#         after the Python version has been checked.
-#
-# Converted from the kernel-doc script originally written in Perl
-# under GPLv2, copyrighted since 1998 by the following authors:
-#
-#    Aditya Srivastava <yashsri421@gmail.com>
-#    Akira Yokosawa <akiyks@gmail.com>
-#    Alexander A. Klimov <grandmaster@al2klimov.de>
-#    Alexander Lobakin <aleksander.lobakin@intel.com>
-#    André Almeida <andrealmeid@igalia.com>
-#    Andy Shevchenko <andriy.shevchenko@linux.intel.com>
-#    Anna-Maria Behnsen <anna-maria@linutronix.de>
-#    Armin Kuster <akuster@mvista.com>
-#    Bart Van Assche <bart.vanassche@sandisk.com>
-#    Ben Hutchings <ben@decadent.org.uk>
-#    Borislav Petkov <bbpetkov@yahoo.de>
-#    Chen-Yu Tsai <wenst@chromium.org>
-#    Coco Li <lixiaoyan@google.com>
-#    Conchúr Navid <conchur@web.de>
-#    Daniel Santos <daniel.santos@pobox.com>
-#    Danilo Cesar Lemes de Paula <danilo.cesar@collabora.co.uk>
-#    Dan Luedtke <mail@danrl.de>
-#    Donald Hunter <donald.hunter@gmail.com>
-#    Gabriel Krisman Bertazi <krisman@collabora.co.uk>
-#    Greg Kroah-Hartman <gregkh@linuxfoundation.org>
-#    Harvey Harrison <harvey.harrison@gmail.com>
-#    Horia Geanta <horia.geanta@freescale.com>
-#    Ilya Dryomov <idryomov@gmail.com>
-#    Jakub Kicinski <kuba@kernel.org>
-#    Jani Nikula <jani.nikula@intel.com>
-#    Jason Baron <jbaron@redhat.com>
-#    Jason Gunthorpe <jgg@nvidia.com>
-#    Jérémy Bobbio <lunar@debian.org>
-#    Johannes Berg <johannes.berg@intel.com>
-#    Johannes Weiner <hannes@cmpxchg.org>
-#    Jonathan Cameron <Jonathan.Cameron@huawei.com>
-#    Jonathan Corbet <corbet@lwn.net>
-#    Jonathan Neuschäfer <j.neuschaefer@gmx.net>
-#    Kamil Rytarowski <n54@gmx.com>
-#    Kees Cook <kees@kernel.org>
-#    Laurent Pinchart <laurent.pinchart@ideasonboard.com>
-#    Levin, Alexander (Sasha Levin) <alexander.levin@verizon.com>
-#    Linus Torvalds <torvalds@linux-foundation.org>
-#    Lucas De Marchi <lucas.demarchi@profusion.mobi>
-#    Mark Rutland <mark.rutland@arm.com>
-#    Markus Heiser <markus.heiser@darmarit.de>
-#    Martin Waitz <tali@admingilde.org>
-#    Masahiro Yamada <masahiroy@kernel.org>
-#    Matthew Wilcox <willy@infradead.org>
-#    Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
-#    Michal Wajdeczko <michal.wajdeczko@intel.com>
-#    Michael Zucchi
-#    Mike Rapoport <rppt@linux.ibm.com>
-#    Niklas Söderlund <niklas.soderlund@corigine.com>
-#    Nishanth Menon <nm@ti.com>
-#    Paolo Bonzini <pbonzini@redhat.com>
-#    Pavan Kumar Linga <pavan.kumar.linga@intel.com>
-#    Pavel Pisa <pisa@cmp.felk.cvut.cz>
-#    Peter Maydell <peter.maydell@linaro.org>
-#    Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
-#    Randy Dunlap <rdunlap@infradead.org>
-#    Richard Kennedy <richard@rsk.demon.co.uk>
-#    Rich Walker <rw@shadow.org.uk>
-#    Rolf Eike Beer <eike-kernel@sf-tec.de>
-#    Sakari Ailus <sakari.ailus@linux.intel.com>
-#    Silvio Fricke <silvio.fricke@gmail.com>
-#    Simon Huggins
-#    Tim Waugh <twaugh@redhat.com>
-#    Tomasz Warniełło <tomasz.warniello@gmail.com>
-#    Utkarsh Tripathi <utripathi2002@gmail.com>
-#    valdis.kletnieks@vt.edu <valdis.kletnieks@vt.edu>
-#    Vegard Nossum <vegard.nossum@oracle.com>
-#    Will Deacon <will.deacon@arm.com>
-#    Yacine Belkadi <yacine.belkadi.1@gmail.com>
-#    Yujie Liu <yujie.liu@intel.com>
-
-"""
-Print formatted kernel documentation to stdout.
-
-Read C language source or header FILEs, extract embedded
-documentation comments, and print formatted documentation
-to standard output.
-
-The documentation comments are identified by the ``/**``
-opening comment mark.
-
-See Documentation/doc-guide/kernel-doc.rst for the
-documentation comment syntax.
-"""
-
-import argparse
-import logging
-import os
-import sys
-
-# Import Python modules
-
-LIB_DIR = "../tools/lib/python"
-SRC_DIR = os.path.dirname(os.path.realpath(__file__))
-
-sys.path.insert(0, os.path.join(SRC_DIR, LIB_DIR))
-
-WERROR_RETURN_CODE = 3
-
-DESC = """
-Read C language source or header FILEs, extract embedded documentation comments,
-and print formatted documentation to standard output.
-
-The documentation comments are identified by the "/**" opening comment mark.
-
-See Documentation/doc-guide/kernel-doc.rst for the documentation comment syntax.
-"""
-
-EXPORT_FILE_DESC = """
-Specify an additional FILE in which to look for EXPORT_SYMBOL information.
-
-May be used multiple times.
-"""
-
-EXPORT_DESC = """
-Only output documentation for symbols that have been
-exported using EXPORT_SYMBOL() and related macros in any input
-FILE or -export-file FILE.
-"""
-
-INTERNAL_DESC = """
-Only output documentation for symbols that have NOT been
-exported using EXPORT_SYMBOL() and related macros in any input
-FILE or -export-file FILE.
-"""
-
-FUNCTION_DESC = """
-Only output documentation for the given function or DOC: section
-title. All other functions and DOC: sections are ignored.
-
-May be used multiple times.
-"""
-
-NOSYMBOL_DESC = """
-Exclude the specified symbol from the output documentation.
-
-May be used multiple times.
-"""
-
-FILES_DESC = """
-Header and C source files to be parsed.
-"""
-
-WARN_CONTENTS_BEFORE_SECTIONS_DESC = """
-Warn if there are contents before sections (deprecated).
-
-This option is kept just for backward-compatibility, but it does nothing,
-neither here nor at the original Perl script.
-"""
-
-
-class MsgFormatter(logging.Formatter):
-    """Helper class to format warnings in a similar way to kernel-doc.pl."""
-
-    def format(self, record):
-        record.levelname = record.levelname.capitalize()
-        return logging.Formatter.format(self, record)
-
-def main():
-    """
-    Main program.
-
-    By default, the return value is:
-
-    - 0: success or Python version is not compatible with
-      kernel-doc.  If -Werror is not used, it will also
-      return 0 if there are issues at kernel-doc markups;
-
-    - 1: an abnormal condition happened;
-
-    - 2: argparse issued an error;
-
-    - 3: -Werror is used, and one or more unfiltered parse warnings happened.
-    """
-
-    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter,
-                                     description=DESC)
-
-    #
-    # Normal arguments
-    #
-    parser.add_argument("-v", "-verbose", "--verbose", action="store_true",
-                        help="Verbose output, more warnings and other information.")
-
-    parser.add_argument("-d", "-debug", "--debug", action="store_true",
-                        help="Enable debug messages")
-
-    parser.add_argument("-M", "-modulename", "--modulename",
-                        default="Kernel API",
-                        help="Allow setting a module name at the output.")
-
-    parser.add_argument("-l", "-enable-lineno", "--enable_lineno",
-                        action="store_true",
-                        help="Enable line number output (only in ReST mode)")
-
-    #
-    # Arguments to control the warning behavior
-    #
-    parser.add_argument("-Wreturn", "--wreturn", action="store_true",
-                        help="Warns about the lack of a return markup on functions.")
-
-    parser.add_argument("-Wshort-desc", "-Wshort-description", "--wshort-desc",
-                        action="store_true",
-                        help="Warns if initial short description is missing")
-
-    parser.add_argument("-Wcontents-before-sections",
-                        "--wcontents-before-sections", action="store_true",
-                        help=WARN_CONTENTS_BEFORE_SECTIONS_DESC)
-
-    parser.add_argument("-Wall", "--wall", action="store_true",
-                        help="Enable all types of warnings")
-
-    parser.add_argument("-Werror", "--werror", action="store_true",
-                        help="Treat warnings as errors.")
-
-    parser.add_argument("-export-file", "--export-file", action='append',
-                        help=EXPORT_FILE_DESC)
-
-    #
-    # Output format mutually-exclusive group
-    #
-    out_group = parser.add_argument_group("Output format selection (mutually exclusive)")
-
-    out_fmt = out_group.add_mutually_exclusive_group()
-
-    out_fmt.add_argument("-m", "-man", "--man", action="store_true",
-                         help="Output troff manual page format.")
-    out_fmt.add_argument("-r", "-rst", "--rst", action="store_true",
-                         help="Output reStructuredText format (default).")
-    out_fmt.add_argument("-N", "-none", "--none", action="store_true",
-                         help="Do not output documentation, only warnings.")
-
-    #
-    # Output selection mutually-exclusive group
-    #
-    sel_group = parser.add_argument_group("Output selection (mutually exclusive)")
-    sel_mut = sel_group.add_mutually_exclusive_group()
-
-    sel_mut.add_argument("-e", "-export", "--export", action='store_true',
-                         help=EXPORT_DESC)
-
-    sel_mut.add_argument("-i", "-internal", "--internal", action='store_true',
-                         help=INTERNAL_DESC)
-
-    sel_mut.add_argument("-s", "-function", "--symbol", action='append',
-                         help=FUNCTION_DESC)
-
-    #
-    # Those are valid for all 3 types of filter
-    #
-    parser.add_argument("-n", "-nosymbol", "--nosymbol", action='append',
-                        help=NOSYMBOL_DESC)
-
-    parser.add_argument("-D", "-no-doc-sections", "--no-doc-sections",
-                        action='store_true', help="Don't output DOC sections")
-
-    parser.add_argument("files", metavar="FILE",
-                        nargs="+", help=FILES_DESC)
-
-    args = parser.parse_args()
-
-    if args.wall:
-        args.wreturn = True
-        args.wshort_desc = True
-        args.wcontents_before_sections = True
-
-    logger = logging.getLogger()
-
-    if not args.debug:
-        logger.setLevel(logging.INFO)
-    else:
-        logger.setLevel(logging.DEBUG)
-
-    formatter = MsgFormatter('%(levelname)s: %(message)s')
-
-    handler = logging.StreamHandler()
-    handler.setFormatter(formatter)
-
-    logger.addHandler(handler)
-
-    python_ver = sys.version_info[:2]
-    if python_ver < (3,6):
-        #
-        # Depending on the Kernel configuration, kernel-doc --none is called at
-        # build time. As we don't want to break compilation due to the
-        # usage of an old Python version, return 0 here.
-        #
-        if args.none:
-            logger.error("Python 3.6 or later is required by kernel-doc. Skipping checks")
-            sys.exit(0)
-
-        sys.exit("Python 3.6 or later is required by kernel-doc. Aborting.")
-
-    if python_ver < (3,7):
-        logger.warning("Python 3.7 or later is required for correct results")
-
-    #
-    # Import kernel-doc libraries only after checking the Python version
-    #
-    from kdoc.kdoc_files import KernelFiles             # pylint: disable=C0415
-    from kdoc.kdoc_output import RestFormat, ManFormat  # pylint: disable=C0415
-
-    if args.man:
-        out_style = ManFormat(modulename=args.modulename)
-    elif args.none:
-        out_style = None
-    else:
-        out_style = RestFormat()
-
-    kfiles = KernelFiles(verbose=args.verbose,
-                         out_style=out_style, werror=args.werror,
-                         wreturn=args.wreturn, wshort_desc=args.wshort_desc,
-                         wcontents_before_sections=args.wcontents_before_sections)
-
-    kfiles.parse(args.files, export_file=args.export_file)
-
-    for t in kfiles.msg(enable_lineno=args.enable_lineno, export=args.export,
-                        internal=args.internal, symbol=args.symbol,
-                        nosymbol=args.nosymbol, export_file=args.export_file,
-                        no_doc_sections=args.no_doc_sections):
-        msg = t[1]
-        if msg:
-            print(msg)
-
-    error_count = kfiles.errors
-    if not error_count:
-        sys.exit(0)
-
-    if args.werror:
-        print("%s warnings as errors" % error_count)    # pylint: disable=C0209
-        sys.exit(WERROR_RETURN_CODE)
-
-    if args.verbose:
-        print("%s errors" % error_count)                # pylint: disable=C0209
-
-    sys.exit(0)
-
-#
-# Call main method
-#
-if __name__ == "__main__":
-    main()
diff --git a/tools/docs/find-unused-docs.sh b/tools/docs/find-unused-docs.sh
index ca4e607ec3f7..53514c759dc1 100755
--- a/tools/docs/find-unused-docs.sh
+++ b/tools/docs/find-unused-docs.sh
@@ -54,7 +54,7 @@ for file in `find $1 -name '*.c'`; do
 	if [[ ${FILES_INCLUDED[$file]+_} ]]; then
 	continue;
 	fi
-	str=$(PYTHONDONTWRITEBYTECODE=1 scripts/kernel-doc -export "$file" 2>/dev/null)
+	str=$(PYTHONDONTWRITEBYTECODE=1 tools/docs/kernel-doc -export "$file" 2>/dev/null)
 	if [[ -n "$str" ]]; then
 	echo "$file"
 	fi
diff --git a/tools/docs/kernel-doc b/tools/docs/kernel-doc
new file mode 100755
index 000000000000..a19a92566780
--- /dev/null
+++ b/tools/docs/kernel-doc
@@ -0,0 +1,360 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: GPL-2.0
+# Copyright(c) 2025: Mauro Carvalho Chehab <mchehab@kernel.org>.
+#
+# pylint: disable=C0103,R0912,R0914,R0915
+#
+# NOTE: While kernel-doc requires at least version 3.6 to run, the
+#       command line should work with Python 3.2+ (tested with 3.4).
+#       The rationale is that it shall fail gracefully during Kernel
+#       compilation with older Kernel versions. Due to that:
+#       - encoding line is needed here;
+#       - f-strings cannot be used in this file.
+#       - libraries that require newer versions can only be included
+#         after the Python version has been checked.
+#
+# Converted from the kernel-doc script originally written in Perl
+# under GPLv2, copyrighted since 1998 by the following authors:
+#
+#    Aditya Srivastava <yashsri421@gmail.com>
+#    Akira Yokosawa <akiyks@gmail.com>
+#    Alexander A. Klimov <grandmaster@al2klimov.de>
+#    Alexander Lobakin <aleksander.lobakin@intel.com>
+#    André Almeida <andrealmeid@igalia.com>
+#    Andy Shevchenko <andriy.shevchenko@linux.intel.com>
+#    Anna-Maria Behnsen <anna-maria@linutronix.de>
+#    Armin Kuster <akuster@mvista.com>
+#    Bart Van Assche <bart.vanassche@sandisk.com>
+#    Ben Hutchings <ben@decadent.org.uk>
+#    Borislav Petkov <bbpetkov@yahoo.de>
+#    Chen-Yu Tsai <wenst@chromium.org>
+#    Coco Li <lixiaoyan@google.com>
+#    Conchúr Navid <conchur@web.de>
+#    Daniel Santos <daniel.santos@pobox.com>
+#    Danilo Cesar Lemes de Paula <danilo.cesar@collabora.co.uk>
+#    Dan Luedtke <mail@danrl.de>
+#    Donald Hunter <donald.hunter@gmail.com>
+#    Gabriel Krisman Bertazi <krisman@collabora.co.uk>
+#    Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+#    Harvey Harrison <harvey.harrison@gmail.com>
+#    Horia Geanta <horia.geanta@freescale.com>
+#    Ilya Dryomov <idryomov@gmail.com>
+#    Jakub Kicinski <kuba@kernel.org>
+#    Jani Nikula <jani.nikula@intel.com>
+#    Jason Baron <jbaron@redhat.com>
+#    Jason Gunthorpe <jgg@nvidia.com>
+#    Jérémy Bobbio <lunar@debian.org>
+#    Johannes Berg <johannes.berg@intel.com>
+#    Johannes Weiner <hannes@cmpxchg.org>
+#    Jonathan Cameron <Jonathan.Cameron@huawei.com>
+#    Jonathan Corbet <corbet@lwn.net>
+#    Jonathan Neuschäfer <j.neuschaefer@gmx.net>
+#    Kamil Rytarowski <n54@gmx.com>
+#    Kees Cook <kees@kernel.org>
+#    Laurent Pinchart <laurent.pinchart@ideasonboard.com>
+#    Levin, Alexander (Sasha Levin) <alexander.levin@verizon.com>
+#    Linus Torvalds <torvalds@linux-foundation.org>
+#    Lucas De Marchi <lucas.demarchi@profusion.mobi>
+#    Mark Rutland <mark.rutland@arm.com>
+#    Markus Heiser <markus.heiser@darmarit.de>
+#    Martin Waitz <tali@admingilde.org>
+#    Masahiro Yamada <masahiroy@kernel.org>
+#    Matthew Wilcox <willy@infradead.org>
+#    Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
+#    Michal Wajdeczko <michal.wajdeczko@intel.com>
+#    Michael Zucchi
+#    Mike Rapoport <rppt@linux.ibm.com>
+#    Niklas Söderlund <niklas.soderlund@corigine.com>
+#    Nishanth Menon <nm@ti.com>
+#    Paolo Bonzini <pbonzini@redhat.com>
+#    Pavan Kumar Linga <pavan.kumar.linga@intel.com>
+#    Pavel Pisa <pisa@cmp.felk.cvut.cz>
+#    Peter Maydell <peter.maydell@linaro.org>
+#    Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
+#    Randy Dunlap <rdunlap@infradead.org>
+#    Richard Kennedy <richard@rsk.demon.co.uk>
+#    Rich Walker <rw@shadow.org.uk>
+#    Rolf Eike Beer <eike-kernel@sf-tec.de>
+#    Sakari Ailus <sakari.ailus@linux.intel.com>
+#    Silvio Fricke <silvio.fricke@gmail.com>
+#    Simon Huggins
+#    Tim Waugh <twaugh@redhat.com>
+#    Tomasz Warniełło <tomasz.warniello@gmail.com>
+#    Utkarsh Tripathi <utripathi2002@gmail.com>
+#    valdis.kletnieks@vt.edu <valdis.kletnieks@vt.edu>
+#    Vegard Nossum <vegard.nossum@oracle.com>
+#    Will Deacon <will.deacon@arm.com>
+#    Yacine Belkadi <yacine.belkadi.1@gmail.com>
+#    Yujie Liu <yujie.liu@intel.com>
+
+"""
+Print formatted kernel documentation to stdout.
+
+Read C language source or header FILEs, extract embedded
+documentation comments, and print formatted documentation
+to standard output.
+
+The documentation comments are identified by the ``/**``
+opening comment mark.
+
+See Documentation/doc-guide/kernel-doc.rst for the
+documentation comment syntax.
+"""
+
+import argparse
+import logging
+import os
+import sys
+
+# Import Python modules
+
+LIB_DIR = "../lib/python"
+SRC_DIR = os.path.dirname(os.path.realpath(__file__))
+
+sys.path.insert(0, os.path.join(SRC_DIR, LIB_DIR))
+
+WERROR_RETURN_CODE = 3
+
+DESC = """
+Read C language source or header FILEs, extract embedded documentation comments,
+and print formatted documentation to standard output.
+
+The documentation comments are identified by the "/**" opening comment mark.
+
+See Documentation/doc-guide/kernel-doc.rst for the documentation comment syntax.
+"""
+
+EXPORT_FILE_DESC = """
+Specify an additional FILE in which to look for EXPORT_SYMBOL information.
+
+May be used multiple times.
+"""
+
+EXPORT_DESC = """
+Only output documentation for symbols that have been
+exported using EXPORT_SYMBOL() and related macros in any input
+FILE or -export-file FILE.
+"""
+
+INTERNAL_DESC = """
+Only output documentation for symbols that have NOT been
+exported using EXPORT_SYMBOL() and related macros in any input
+FILE or -export-file FILE.
+"""
+
+FUNCTION_DESC = """
+Only output documentation for the given function or DOC: section
+title. All other functions and DOC: sections are ignored.
+
+May be used multiple times.
+"""
+
+NOSYMBOL_DESC = """
+Exclude the specified symbol from the output documentation.
+
+May be used multiple times.
+"""
+
+FILES_DESC = """
+Header and C source files to be parsed.
+"""
+
+WARN_CONTENTS_BEFORE_SECTIONS_DESC = """
+Warn if there are contents before sections (deprecated).
+
+This option is kept just for backward-compatibility, but it does nothing,
+neither here nor at the original Perl script.
+"""
+
+
+class MsgFormatter(logging.Formatter):
+    """Helper class to format warnings in a similar way to kernel-doc.pl."""
+
+    def format(self, record):
+        record.levelname = record.levelname.capitalize()
+        return logging.Formatter.format(self, record)
+
+def main():
+    """
+    Main program.
+
+    By default, the return value is:
+
+    - 0: success or Python version is not compatible with
+      kernel-doc.  If -Werror is not used, it will also
+      return 0 if there are issues at kernel-doc markups;
+
+    - 1: an abnormal condition happened;
+
+    - 2: argparse issued an error;
+
+    - 3: -Werror is used, and one or more unfiltered parse warnings happened.
+    """
+
+    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter,
+                                     description=DESC)
+
+    #
+    # Normal arguments
+    #
+    parser.add_argument("-v", "-verbose", "--verbose", action="store_true",
+                        help="Verbose output, more warnings and other information.")
+
+    parser.add_argument("-d", "-debug", "--debug", action="store_true",
+                        help="Enable debug messages")
+
+    parser.add_argument("-M", "-modulename", "--modulename",
+                        default="Kernel API",
+                        help="Allow setting a module name at the output.")
+
+    parser.add_argument("-l", "-enable-lineno", "--enable_lineno",
+                        action="store_true",
+                        help="Enable line number output (only in ReST mode)")
+
+    #
+    # Arguments to control the warning behavior
+    #
+    parser.add_argument("-Wreturn", "--wreturn", action="store_true",
+                        help="Warns about the lack of a return markup on functions.")
+
+    parser.add_argument("-Wshort-desc", "-Wshort-description", "--wshort-desc",
+                        action="store_true",
+                        help="Warns if initial short description is missing")
+
+    parser.add_argument("-Wcontents-before-sections",
+                        "--wcontents-before-sections", action="store_true",
+                        help=WARN_CONTENTS_BEFORE_SECTIONS_DESC)
+
+    parser.add_argument("-Wall", "--wall", action="store_true",
+                        help="Enable all types of warnings")
+
+    parser.add_argument("-Werror", "--werror", action="store_true",
+                        help="Treat warnings as errors.")
+
+    parser.add_argument("-export-file", "--export-file", action='append',
+                        help=EXPORT_FILE_DESC)
+
+    #
+    # Output format mutually-exclusive group
+    #
+    out_group = parser.add_argument_group("Output format selection (mutually exclusive)")
+
+    out_fmt = out_group.add_mutually_exclusive_group()
+
+    out_fmt.add_argument("-m", "-man", "--man", action="store_true",
+                         help="Output troff manual page format.")
+    out_fmt.add_argument("-r", "-rst", "--rst", action="store_true",
+                         help="Output reStructuredText format (default).")
+    out_fmt.add_argument("-N", "-none", "--none", action="store_true",
+                         help="Do not output documentation, only warnings.")
+
+    #
+    # Output selection mutually-exclusive group
+    #
+    sel_group = parser.add_argument_group("Output selection (mutually exclusive)")
+    sel_mut = sel_group.add_mutually_exclusive_group()
+
+    sel_mut.add_argument("-e", "-export", "--export", action='store_true',
+                         help=EXPORT_DESC)
+
+    sel_mut.add_argument("-i", "-internal", "--internal", action='store_true',
+                         help=INTERNAL_DESC)
+
+    sel_mut.add_argument("-s", "-function", "--symbol", action='append',
+                         help=FUNCTION_DESC)
+
+    #
+    # Those are valid for all 3 types of filter
+    #
+    parser.add_argument("-n", "-nosymbol", "--nosymbol", action='append',
+                        help=NOSYMBOL_DESC)
+
+    parser.add_argument("-D", "-no-doc-sections", "--no-doc-sections",
+                        action='store_true', help="Don't output DOC sections")
+
+    parser.add_argument("files", metavar="FILE",
+                        nargs="+", help=FILES_DESC)
+
+    args = parser.parse_args()
+
+    if args.wall:
+        args.wreturn = True
+        args.wshort_desc = True
+        args.wcontents_before_sections = True
+
+    logger = logging.getLogger()
+
+    if not args.debug:
+        logger.setLevel(logging.INFO)
+    else:
+        logger.setLevel(logging.DEBUG)
+
+    formatter = MsgFormatter('%(levelname)s: %(message)s')
+
+    handler = logging.StreamHandler()
+    handler.setFormatter(formatter)
+
+    logger.addHandler(handler)
+
+    python_ver = sys.version_info[:2]
+    if python_ver < (3,6):
+        #
+        # Depending on the Kernel configuration, kernel-doc --none is called at
+        # build time. As we don't want to break compilation due to the
+        # usage of an old Python version, return 0 here.
+        #
+        if args.none:
+            logger.error("Python 3.6 or later is required by kernel-doc. Skipping checks")
+            sys.exit(0)
+
+        sys.exit("Python 3.6 or later is required by kernel-doc. Aborting.")
+
+    if python_ver < (3,7):
+        logger.warning("Python 3.7 or later is required for correct results")
+
+    #
+    # Import kernel-doc libraries only after checking the Python version
+    #
+    from kdoc.kdoc_files import KernelFiles             # pylint: disable=C0415
+    from kdoc.kdoc_output import RestFormat, ManFormat  # pylint: disable=C0415
+
+    if args.man:
+        out_style = ManFormat(modulename=args.modulename)
+    elif args.none:
+        out_style = None
+    else:
+        out_style = RestFormat()
+
+    kfiles = KernelFiles(verbose=args.verbose,
+                         out_style=out_style, werror=args.werror,
+                         wreturn=args.wreturn, wshort_desc=args.wshort_desc,
+                         wcontents_before_sections=args.wcontents_before_sections)
+
+    kfiles.parse(args.files, export_file=args.export_file)
+
+    for t in kfiles.msg(enable_lineno=args.enable_lineno, export=args.export,
+                        internal=args.internal, symbol=args.symbol,
+                        nosymbol=args.nosymbol, export_file=args.export_file,
+                        no_doc_sections=args.no_doc_sections):
+        msg = t[1]
+        if msg:
+            print(msg)
+
+    error_count = kfiles.errors
+    if not error_count:
+        sys.exit(0)
+
+    if args.werror:
+        print("%s warnings as errors" % error_count)    # pylint: disable=C0209
+        sys.exit(WERROR_RETURN_CODE)
+
+    if args.verbose:
+        print("%s errors" % error_count)                # pylint: disable=C0209
+
+    sys.exit(0)
+
+#
+# Call main method
+#
+if __name__ == "__main__":
+    main()
diff --git a/tools/docs/sphinx-build-wrapper b/tools/docs/sphinx-build-wrapper
index 7a5fcef25429..cb2a5005e633 100755
--- a/tools/docs/sphinx-build-wrapper
+++ b/tools/docs/sphinx-build-wrapper
@@ -246,7 +246,7 @@ class SphinxBuilder:
         #
         self.sphinxbuild = os.environ.get("SPHINXBUILD", "sphinx-build")
         self.kerneldoc = self.get_path(os.environ.get("KERNELDOC",
-                                                      "scripts/kernel-doc.py"))
+                                                      "tools/docs/kernel-doc"))
         self.builddir = self.get_path(builddir, use_cwd=True, abs_path=True)
 
         #
-- 
cgit v1.2.3


From 32e9a42440a230b14c438099bc5fccb5012a638a Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Date: Mon, 19 Jan 2026 13:05:02 +0100
Subject: docs: kdoc: move the return values to the helper message

It makes sense to describe what kernel-doc is expected to return
on its help message. Move such messages to argparse epilog.

Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
Message-ID: <3bcfa48016770929fcd073376515e3ff0b777ea8.1768823489.git.mchehab+huawei@kernel.org>
---
 tools/docs/kernel-doc | 27 +++++++++++++++------------
 1 file changed, 15 insertions(+), 12 deletions(-)

(limited to 'tools')

diff --git a/tools/docs/kernel-doc b/tools/docs/kernel-doc
index a19a92566780..902397804e80 100755
--- a/tools/docs/kernel-doc
+++ b/tools/docs/kernel-doc
@@ -166,6 +166,20 @@ This option is kept just for backward-compatibility, but it does nothing,
 neither here nor at the original Perl script.
 """
 
+EPILOG = """
+The return value is:
+
+- 0: success or Python version is not compatible with
+kernel-doc.  If -Werror is not used, it will also
+return 0 if there are issues at kernel-doc markups;
+
+- 1: an abnormal condition happened;
+
+- 2: argparse issued an error;
+
+- 3: When -Werror is used, it means that one or more unfiltered parse
+     warnings happened.
+"""
 
 class MsgFormatter(logging.Formatter):
     """Helper class to format warnings in a similar way to kernel-doc.pl."""
@@ -178,21 +192,10 @@ def main():
     """
     Main program.
 
-    By default, the return value is:
-
-    - 0: success or Python version is not compatible with
-      kernel-doc.  If -Werror is not used, it will also
-      return 0 if there are issues at kernel-doc markups;
-
-    - 1: an abnormal condition happened;
-
-    - 2: argparse issued an error;
-
-    - 3: -Werror is used, and one or more unfiltered parse warnings happened.
     """
 
     parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter,
-                                     description=DESC)
+                                     description=DESC, epilog=EPILOG)
 
     #
     # Normal arguments
-- 
cgit v1.2.3


From 35c0f975ef4a96cb488bcb5fca6e852fc347bc49 Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Date: Mon, 19 Jan 2026 13:05:03 +0100
Subject: docs: kdoc: improve description of MsgFormatter

The description there is quite vague. Make it clearer.

Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
Message-ID: <94269990e2d665bec08a1b6f4d28d84939cb9d83.1768823489.git.mchehab+huawei@kernel.org>
---
 tools/docs/kernel-doc | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/docs/kernel-doc b/tools/docs/kernel-doc
index 902397804e80..aed09f9a54dd 100755
--- a/tools/docs/kernel-doc
+++ b/tools/docs/kernel-doc
@@ -182,7 +182,10 @@ return 0 if there are issues at kernel-doc markups;
 """
 
 class MsgFormatter(logging.Formatter):
-    """Helper class to format warnings in a similar way to kernel-doc.pl."""
+    """
+    Helper class to capitalize errors and warnings, the same way
+    the venerable (now retired) kernel-doc.pl used to do.
+    """
 
     def format(self, record):
         record.levelname = record.levelname.capitalize()
-- 
cgit v1.2.3


From 4a3efd128f7da996b677151790d043ec44a00561 Mon Sep 17 00:00:00 2001
From: Jonathan Corbet <corbet@lwn.net>
Date: Tue, 20 Jan 2026 15:50:38 -0700
Subject: docs: sphinx-build-wrapper: stop setting kerneldoc_bin for Sphinx

Now that the Sphinx build does not use the kerneldoc_bin configuration
variable, we shouldn't try to set it in the build wrapper or we get a nifty
warning:

  WARNING: unknown config value 'kerneldoc_bin' in override, ignoring

Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 tools/docs/sphinx-build-wrapper | 1 -
 1 file changed, 1 deletion(-)

(limited to 'tools')

diff --git a/tools/docs/sphinx-build-wrapper b/tools/docs/sphinx-build-wrapper
index cb2a5005e633..9f1ae1485f84 100755
--- a/tools/docs/sphinx-build-wrapper
+++ b/tools/docs/sphinx-build-wrapper
@@ -750,7 +750,6 @@ class SphinxBuilder:
 
             build_args = args + [
                 "-d", doctree_dir,
-                "-D", f"kerneldoc_bin={kerneldoc}",
                 "-D", f"version={self.kernelversion}",
                 "-D", f"release={self.kernelrelease}",
                 "-D", f"kerneldoc_srctree={self.srctree}",
-- 
cgit v1.2.3


From b2664a90c171846fcd572d93f6f21459721a1d2e Mon Sep 17 00:00:00 2001
From: Jonathan Corbet <corbet@lwn.net>
Date: Mon, 12 Jan 2026 09:19:49 -0700
Subject: jobserver: Split up the big try: block

The parsing of jobserver options is done in a massive try: block that hides
problems and (perhaps) bugs.  Split up that block and make the logic
explicit by moving the initial parsing of MAKEFLAGS out of that block.  Add
warnings in the places things can go wrong.

Reviewed-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 tools/lib/python/jobserver.py | 143 ++++++++++++++++++++++++++----------------
 1 file changed, 90 insertions(+), 53 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/python/jobserver.py b/tools/lib/python/jobserver.py
index 616411087725..a7c70ff4c375 100755
--- a/tools/lib/python/jobserver.py
+++ b/tools/lib/python/jobserver.py
@@ -35,6 +35,9 @@ import os
 import subprocess
 import sys
 
+def warn(text, *args):
+    print(f'WARNING: {text}', *args, file = sys.stderr)
+
 class JobserverExec:
     """
     Claim all slots from make using POSIX Jobserver.
@@ -58,64 +61,98 @@ class JobserverExec:
 
         if self.is_open:
             return
-
-        try:
-            # Fetch the make environment options.
-            flags = os.environ["MAKEFLAGS"]
-            # Look for "--jobserver=R,W"
-            # Note that GNU Make has used --jobserver-fds and --jobserver-auth
-            # so this handles all of them.
-            opts = [x for x in flags.split(" ") if x.startswith("--jobserver")]
-
-            # Parse out R,W file descriptor numbers and set them nonblocking.
-            # If the MAKEFLAGS variable contains multiple instances of the
-            # --jobserver-auth= option, the last one is relevant.
-            fds = opts[-1].split("=", 1)[1]
-
-            # Starting with GNU Make 4.4, named pipes are used for reader
-            # and writer.
-            # Example argument: --jobserver-auth=fifo:/tmp/GMfifo8134
-            _, _, path = fds.partition("fifo:")
-
-            if path:
+        self.is_open = True  # We only try once
+        self.claim = None
+        #
+        # Check the make flags for "--jobserver=R,W"
+        # Note that GNU Make has used --jobserver-fds and --jobserver-auth
+        # so this handles all of them.
+        #
+        flags = os.environ.get('MAKEFLAGS', '')
+        opts = [x for x in flags.split(" ") if x.startswith("--jobserver")]
+        if not opts:
+            return
+        #
+        # Separate out the provided file descriptors
+        #
+        split_opt = opts[-1].split('=', 1)
+        if len(split_opt) != 2:
+            warn('unparseable option:', opts[-1])
+            return
+        fds = split_opt[1]
+        #
+        # As of GNU Make 4.4, we'll be looking for a named pipe
+        # identified as fifo:path
+        #
+        if fds.startswith('fifo:'):
+            path = fds[len('fifo:'):]
+            try:
                 self.reader = os.open(path, os.O_RDONLY | os.O_NONBLOCK)
                 self.writer = os.open(path, os.O_WRONLY)
-            else:
-                self.reader, self.writer = [int(x) for x in fds.split(",", 1)]
+            except (OSError, IOError):
+                warn('unable to open jobserver pipe', path)
+                return
+        #
+        # Otherwise look for integer file-descriptor numbers.
+        #
+        else:
+            split_fds = fds.split(',')
+            if len(split_fds) != 2:
+                warn('malformed jobserver file descriptors:', fds)
+                return
+            try:
+                self.reader = int(split_fds[0])
+                self.writer = int(split_fds[1])
+            except ValueError:
+                warn('non-integer jobserver file-descriptors:', fds)
+                return
+            try:
+                #
                 # Open a private copy of reader to avoid setting nonblocking
                 # on an unexpecting process with the same reader fd.
-                self.reader = os.open("/proc/self/fd/%d" % (self.reader),
+                #
+                self.reader = os.open(f"/proc/self/fd/{self.reader}",
                                       os.O_RDONLY | os.O_NONBLOCK)
-
-            # Read out as many jobserver slots as possible
-            while True:
-                try:
-                    slot = os.read(self.reader, 8)
-                    if not slot:
-                        # Clear self.jobs to prevent us from probably writing incorrect file.
-                        self.jobs = b""
-                        raise ValueError("unexpected empty token from jobserver fd, invalid '--jobserver-auth=' setting?")
-                    self.jobs += slot
-                except (OSError, IOError) as e:
-                    if e.errno == errno.EWOULDBLOCK:
-                        # Stop at the end of the jobserver queue.
-                        break
-                    # If something went wrong, give back the jobs.
-                    if self.jobs:
-                        os.write(self.writer, self.jobs)
-                    raise e
-
-            # Add a bump for our caller's reserveration, since we're just going
-            # to sit here blocked on our child.
-            self.claim = len(self.jobs) + 1
-
-        except (KeyError, IndexError, ValueError, OSError, IOError) as e:
-            print(f"jobserver: warning: {repr(e)}", file=sys.stderr)
-            # Any missing environment strings or bad fds should result in just
-            # not being parallel.
-            self.claim = None
-
-        self.is_open = True
+            except (IOError, OSError) as e:
+                warn('Unable to reopen jobserver read-side pipe:', repr(e))
+                return
+        #
+        # OK, we have the channel to the job server; read out as many jobserver
+        # slots as possible.
+        #
+        while True:
+            try:
+                slot = os.read(self.reader, 8)
+                if not slot:
+                    #
+                    # Something went wrong.  Clear self.jobs to avoid writing
+                    # weirdness back to the jobserver and give up.
+                    self.jobs = b""
+                    warn("unexpected empty token from jobserver;"
+                         " possible invalid '--jobserver-auth=' setting")
+                    self.claim = None
+                    return
+            except (OSError, IOError) as e:
+                #
+                # If there is nothing more to read then we are done.
+                #
+                if e.errno == errno.EWOULDBLOCK:
+                    break
+                #
+                # Anything else says that something went weird; give back
+                # the jobs and give up.
+                #
+                if self.jobs:
+                    os.write(self.writer, self.jobs)
+                    self.claim = None
+                    warn('error reading from jobserver pipe', repr(e))
+                    return
+            self.jobs += slot
+        #
+        # Add a bump for our caller's reserveration, since we're just going
+        # to sit here blocked on our child.
+        #
+        self.claim = len(self.jobs) + 1
 
     def close(self):
         """Return all reserved slots to Jobserver"""
-- 
cgit v1.2.3


From 01ea38942bdcd28a7962d49d6f3a602979b81009 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 20 Jan 2026 17:47:09 -0300
Subject: perf tests sw-clock: Mark the volatile tmp variable as __maybe_unused

As it is just used to waste some cycles, not being used as all, to
silence some compilers.

Noticed with gcc version 16.0.1 20260115 on fedora 44:

    tests/sw-clock.c: In function '__test__sw_clock_freq':
    tests/sw-clock.c:31:22: error: variable 'tmp' set but not used [-Werror=unused-but-set-variable=]
       31 |         volatile int tmp = 0;
          |                      ^~~

Reviewed-by: Ian Rogers <irogers@google.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/tests/sw-clock.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/perf/tests/sw-clock.c b/tools/perf/tests/sw-clock.c
index 4a2ad7176fa0..b6e46975379c 100644
--- a/tools/perf/tests/sw-clock.c
+++ b/tools/perf/tests/sw-clock.c
@@ -5,6 +5,7 @@
 #include <stdlib.h>
 #include <signal.h>
 #include <sys/mman.h>
+#include <linux/compiler.h>
 #include <linux/string.h>
 
 #include "tests.h"
@@ -28,7 +29,7 @@
 static int __test__sw_clock_freq(enum perf_sw_ids clock_id)
 {
 	int i, err = -1;
-	volatile int tmp = 0;
+	volatile int tmp __maybe_unused = 0;
 	u64 total_periods = 0;
 	int nr_samples = 0;
 	char sbuf[STRERR_BUFSIZE];
-- 
cgit v1.2.3


From 2c850606a46b319d5128bda59f67b1fc642d94ef Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 20 Jan 2026 17:57:43 -0300
Subject: perf trace: Deal with compiler const checks

The strchr() function these days return const/non-const based on the arg
it receives, and sometimes we need to use casts when we're dealing with
variables that are used in code that needs to safely change the returned
value and sometimes not (as it points to really const areas).

Tweak one such case.

Reviewed-by: Ian Rogers <irogers@google.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-trace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
index 58a32adafddf..8df5ca44e4f9 100644
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -5190,7 +5190,7 @@ static int trace__parse_events_option(const struct option *opt, const char *str,
 	}
 
 	while (1) {
-		if ((sep = strchr(s, ',')) != NULL)
+		if ((sep = strchr((char *)s, ',')) != NULL)
 			*sep = '\0';
 
 		list = 0;
-- 
cgit v1.2.3


From 2583e81fd8852e6cf6730c4463b6580deb7a8aad Mon Sep 17 00:00:00 2001
From: Ihor Solodrai <ihor.solodrai@linux.dev>
Date: Tue, 20 Jan 2026 14:26:29 -0800
Subject: resolve_btfids: Introduce finalize_btf() step

Since recently [1][2] resolve_btfids executes final adjustments to the
kernel/module BTF before it's embedded into the target binary.

To keep the implementation simple, a clear and stable "pipeline" of
how BTF data flows through resolve_btfids would be helpful. Some BTF
modifications may change the ids of the types, so it is important to
maintain correct order of operations with respect to .BTF_ids
resolution too.

This patch refactors the BTF handling to establish the following
sequence:
  - load target ELF sections
  - load .BTF_ids symbols
    - this will be a dependency of btf2btf transformations in
      subsequent patches
  - load BTF and its base as is
  - (*) btf2btf transformations will happen here
  - finalize_btf(), introduced in this patch
    - does distill base and sort BTF
  - resolve and patch .BTF_ids

This approach helps to avoid fixups in .BTF_ids data in case the ids
change at any point of BTF processing, because symbol resolution
happens on the finalized, ready to dump, BTF data.

This also gives flexibility in BTF transformations, because they will
happen on BTF that is not distilled and/or sorted yet, allowing to
freely add, remove and modify BTF types.

[1] https://lore.kernel.org/bpf/20251219181321.1283664-1-ihor.solodrai@linux.dev/
[2] https://lore.kernel.org/bpf/20260109130003.3313716-1-dolinux.peng@gmail.com/

Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: Ihor Solodrai <ihor.solodrai@linux.dev>
Link: https://lore.kernel.org/r/20260120222638.3976562-5-ihor.solodrai@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/bpf/resolve_btfids/main.c | 69 ++++++++++++++++++++++++++++-------------
 1 file changed, 48 insertions(+), 21 deletions(-)

(limited to 'tools')

diff --git a/tools/bpf/resolve_btfids/main.c b/tools/bpf/resolve_btfids/main.c
index 343d08050116..1fcf37af6764 100644
--- a/tools/bpf/resolve_btfids/main.c
+++ b/tools/bpf/resolve_btfids/main.c
@@ -563,19 +563,6 @@ static int load_btf(struct object *obj)
 	obj->base_btf = base_btf;
 	obj->btf = btf;
 
-	if (obj->base_btf && obj->distill_base) {
-		err = btf__distill_base(obj->btf, &base_btf, &btf);
-		if (err) {
-			pr_err("FAILED to distill base BTF: %s\n", strerror(errno));
-			goto out_err;
-		}
-
-		btf__free(obj->base_btf);
-		btf__free(obj->btf);
-		obj->base_btf = base_btf;
-		obj->btf = btf;
-	}
-
 	return 0;
 
 out_err:
@@ -911,6 +898,41 @@ out:
 	return err;
 }
 
+static int finalize_btf(struct object *obj)
+{
+	struct btf *base_btf = obj->base_btf, *btf = obj->btf;
+	int err;
+
+	if (obj->base_btf && obj->distill_base) {
+		err = btf__distill_base(obj->btf, &base_btf, &btf);
+		if (err) {
+			pr_err("FAILED to distill base BTF: %s\n", strerror(errno));
+			goto out_err;
+		}
+
+		btf__free(obj->base_btf);
+		btf__free(obj->btf);
+		obj->base_btf = base_btf;
+		obj->btf = btf;
+	}
+
+	err = sort_btf_by_name(obj->btf);
+	if (err) {
+		pr_err("FAILED to sort BTF: %s\n", strerror(errno));
+		goto out_err;
+	}
+
+	return 0;
+
+out_err:
+	btf__free(base_btf);
+	btf__free(btf);
+	obj->base_btf = NULL;
+	obj->btf = NULL;
+
+	return err;
+}
+
 static inline int make_out_path(char *buf, u32 buf_sz, const char *in_path, const char *suffix)
 {
 	int len = snprintf(buf, buf_sz, "%s%s", in_path, suffix);
@@ -1054,6 +1076,7 @@ int main(int argc, const char **argv)
 	};
 	const char *btfids_path = NULL;
 	bool fatal_warnings = false;
+	bool resolve_btfids = true;
 	char out_path[PATH_MAX];
 
 	struct option btfid_options[] = {
@@ -1083,12 +1106,6 @@ int main(int argc, const char **argv)
 	if (btfids_path)
 		return patch_btfids(btfids_path, obj.path);
 
-	if (load_btf(&obj))
-		goto out;
-
-	if (sort_btf_by_name(obj.btf))
-		goto out;
-
 	if (elf_collect(&obj))
 		goto out;
 
@@ -1099,12 +1116,22 @@ int main(int argc, const char **argv)
 	if (obj.efile.idlist_shndx == -1 ||
 	    obj.efile.symbols_shndx == -1) {
 		pr_debug("Cannot find .BTF_ids or symbols sections, skip symbols resolution\n");
-		goto dump_btf;
+		resolve_btfids = false;
 	}
 
-	if (symbols_collect(&obj))
+	if (resolve_btfids)
+		if (symbols_collect(&obj))
+			goto out;
+
+	if (load_btf(&obj))
 		goto out;
 
+	if (finalize_btf(&obj))
+		goto out;
+
+	if (!resolve_btfids)
+		goto dump_btf;
+
 	if (symbols_resolve(&obj))
 		goto out;
 
-- 
cgit v1.2.3


From 9d199965990c0b21160c565076b93725d6638c28 Mon Sep 17 00:00:00 2001
From: Ihor Solodrai <ihor.solodrai@linux.dev>
Date: Tue, 20 Jan 2026 14:26:30 -0800
Subject: resolve_btfids: Support for KF_IMPLICIT_ARGS

Implement BTF modifications in resolve_btfids to support BPF kernel
functions with implicit arguments.

For a kfunc marked with KF_IMPLICIT_ARGS flag, a new function
prototype is added to BTF that does not have implicit arguments. The
kfunc's prototype is then updated to a new one in BTF. This prototype
is the intended interface for the BPF programs.

A <func_name>_impl function is added to BTF to make the original kfunc
prototype searchable for the BPF verifier. If a <func_name>_impl
function already exists in BTF, its interpreted as a legacy case, and
this step is skipped.

Whether an argument is implicit is determined by its type:
currently only `struct bpf_prog_aux *` is supported.

As a result, the BTF associated with kfunc is changed from

    __bpf_kfunc bpf_foo(int arg1, struct bpf_prog_aux *aux);

into

    bpf_foo_impl(int arg1, struct bpf_prog_aux *aux);
    __bpf_kfunc bpf_foo(int arg1);

For more context see previous discussions and patches [1][2].

[1] https://lore.kernel.org/dwarves/ba1650aa-fafd-49a8-bea4-bdddee7c38c9@linux.dev/
[2] https://lore.kernel.org/bpf/20251029190113.3323406-1-ihor.solodrai@linux.dev/

Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: Ihor Solodrai <ihor.solodrai@linux.dev>
Link: https://lore.kernel.org/r/20260120222638.3976562-6-ihor.solodrai@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/bpf/resolve_btfids/main.c | 382 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 382 insertions(+)

(limited to 'tools')

diff --git a/tools/bpf/resolve_btfids/main.c b/tools/bpf/resolve_btfids/main.c
index 1fcf37af6764..db8d1554bdcc 100644
--- a/tools/bpf/resolve_btfids/main.c
+++ b/tools/bpf/resolve_btfids/main.c
@@ -152,6 +152,25 @@ struct object {
 	int nr_typedefs;
 };
 
+#define KF_IMPLICIT_ARGS (1 << 16)
+#define KF_IMPL_SUFFIX "_impl"
+
+struct kfunc {
+	const char *name;
+	u32 btf_id;
+	u32 flags;
+};
+
+struct btf2btf_context {
+	struct btf *btf;
+	u32 *decl_tags;
+	u32 nr_decl_tags;
+	u32 max_decl_tags;
+	struct kfunc *kfuncs;
+	u32 nr_kfuncs;
+	u32 max_kfuncs;
+};
+
 static int verbose;
 static int warnings;
 
@@ -837,6 +856,366 @@ static int dump_raw_btf(struct btf *btf, const char *out_path)
 	return 0;
 }
 
+static const struct btf_type *btf_type_skip_qualifiers(const struct btf *btf, s32 type_id)
+{
+	const struct btf_type *t = btf__type_by_id(btf, type_id);
+
+	while (btf_is_mod(t))
+		t = btf__type_by_id(btf, t->type);
+
+	return t;
+}
+
+static int push_decl_tag_id(struct btf2btf_context *ctx, u32 decl_tag_id)
+{
+	u32 *arr = ctx->decl_tags;
+	u32 cap = ctx->max_decl_tags;
+
+	if (ctx->nr_decl_tags + 1 > cap) {
+		cap = max(cap + 256, cap * 2);
+		arr = realloc(arr, sizeof(u32) * cap);
+		if (!arr)
+			return -ENOMEM;
+		ctx->max_decl_tags = cap;
+		ctx->decl_tags = arr;
+	}
+
+	ctx->decl_tags[ctx->nr_decl_tags++] = decl_tag_id;
+
+	return 0;
+}
+
+static int push_kfunc(struct btf2btf_context *ctx, struct kfunc *kfunc)
+{
+	struct kfunc *arr = ctx->kfuncs;
+	u32 cap = ctx->max_kfuncs;
+
+	if (ctx->nr_kfuncs + 1 > cap) {
+		cap = max(cap + 256, cap * 2);
+		arr = realloc(arr, sizeof(struct kfunc) * cap);
+		if (!arr)
+			return -ENOMEM;
+		ctx->max_kfuncs = cap;
+		ctx->kfuncs = arr;
+	}
+
+	ctx->kfuncs[ctx->nr_kfuncs++] = *kfunc;
+
+	return 0;
+}
+
+static int collect_decl_tags(struct btf2btf_context *ctx)
+{
+	const u32 type_cnt = btf__type_cnt(ctx->btf);
+	struct btf *btf = ctx->btf;
+	const struct btf_type *t;
+	int err;
+
+	for (u32 id = 1; id < type_cnt; id++) {
+		t = btf__type_by_id(btf, id);
+		if (!btf_is_decl_tag(t))
+			continue;
+		err = push_decl_tag_id(ctx, id);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+/*
+ * To find the kfunc flags having its struct btf_id (with ELF addresses)
+ * we need to find the address that is in range of a set8.
+ * If a set8 is found, then the flags are located at addr + 4 bytes.
+ * Return 0 (no flags!) if not found.
+ */
+static u32 find_kfunc_flags(struct object *obj, struct btf_id *kfunc_id)
+{
+	const u32 *elf_data_ptr = obj->efile.idlist->d_buf;
+	u64 set_lower_addr, set_upper_addr, addr;
+	struct btf_id *set_id;
+	struct rb_node *next;
+	u32 flags;
+	u64 idx;
+
+	for (next = rb_first(&obj->sets); next; next = rb_next(next)) {
+		set_id = rb_entry(next, struct btf_id, rb_node);
+		if (set_id->kind != BTF_ID_KIND_SET8 || set_id->addr_cnt != 1)
+			continue;
+
+		set_lower_addr = set_id->addr[0];
+		set_upper_addr = set_lower_addr + set_id->cnt * sizeof(u64);
+
+		for (u32 i = 0; i < kfunc_id->addr_cnt; i++) {
+			addr = kfunc_id->addr[i];
+			/*
+			 * Lower bound is exclusive to skip the 8-byte header of the set.
+			 * Upper bound is inclusive to capture the last entry at offset 8*cnt.
+			 */
+			if (set_lower_addr < addr && addr <= set_upper_addr) {
+				pr_debug("found kfunc %s in BTF_ID_FLAGS %s\n",
+					 kfunc_id->name, set_id->name);
+				idx = addr - obj->efile.idlist_addr;
+				idx = idx / sizeof(u32) + 1;
+				flags = elf_data_ptr[idx];
+
+				return flags;
+			}
+		}
+	}
+
+	return 0;
+}
+
+static int collect_kfuncs(struct object *obj, struct btf2btf_context *ctx)
+{
+	const char *tag_name, *func_name;
+	struct btf *btf = ctx->btf;
+	const struct btf_type *t;
+	u32 flags, func_id;
+	struct kfunc kfunc;
+	struct btf_id *id;
+	int err;
+
+	if (ctx->nr_decl_tags == 0)
+		return 0;
+
+	for (u32 i = 0; i < ctx->nr_decl_tags; i++) {
+		t = btf__type_by_id(btf, ctx->decl_tags[i]);
+		if (btf_kflag(t) || btf_decl_tag(t)->component_idx != -1)
+			continue;
+
+		tag_name = btf__name_by_offset(btf, t->name_off);
+		if (strcmp(tag_name, "bpf_kfunc") != 0)
+			continue;
+
+		func_id = t->type;
+		t = btf__type_by_id(btf, func_id);
+		if (!btf_is_func(t))
+			continue;
+
+		func_name = btf__name_by_offset(btf, t->name_off);
+		if (!func_name)
+			continue;
+
+		id = btf_id__find(&obj->funcs, func_name);
+		if (!id || id->kind != BTF_ID_KIND_SYM)
+			continue;
+
+		flags = find_kfunc_flags(obj, id);
+
+		kfunc.name = id->name;
+		kfunc.btf_id = func_id;
+		kfunc.flags = flags;
+
+		err = push_kfunc(ctx, &kfunc);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+static int build_btf2btf_context(struct object *obj, struct btf2btf_context *ctx)
+{
+	int err;
+
+	ctx->btf = obj->btf;
+
+	err = collect_decl_tags(ctx);
+	if (err) {
+		pr_err("ERROR: resolve_btfids: failed to collect decl tags from BTF\n");
+		return err;
+	}
+
+	err = collect_kfuncs(obj, ctx);
+	if (err) {
+		pr_err("ERROR: resolve_btfids: failed to collect kfuncs from BTF\n");
+		return err;
+	}
+
+	return 0;
+}
+
+
+/* Implicit BPF kfunc arguments can only be of particular types */
+static bool is_kf_implicit_arg(const struct btf *btf, const struct btf_param *p)
+{
+	static const char *const kf_implicit_arg_types[] = {
+		"bpf_prog_aux",
+	};
+	const struct btf_type *t;
+	const char *name;
+
+	t = btf_type_skip_qualifiers(btf, p->type);
+	if (!btf_is_ptr(t))
+		return false;
+
+	t = btf_type_skip_qualifiers(btf, t->type);
+	if (!btf_is_struct(t))
+		return false;
+
+	name = btf__name_by_offset(btf, t->name_off);
+	if (!name)
+		return false;
+
+	for (int i = 0; i < ARRAY_SIZE(kf_implicit_arg_types); i++)
+		if (strcmp(name, kf_implicit_arg_types[i]) == 0)
+			return true;
+
+	return false;
+}
+
+/*
+ * For a kfunc with KF_IMPLICIT_ARGS we do the following:
+ *   1. Add a new function with _impl suffix in the name, with the prototype
+ *      of the original kfunc.
+ *   2. Add all decl tags except "bpf_kfunc" for the _impl func.
+ *   3. Add a new function prototype with modified list of arguments:
+ *      omitting implicit args.
+ *   4. Change the prototype of the original kfunc to the new one.
+ *
+ * This way we transform the BTF associated with the kfunc from
+ *	__bpf_kfunc bpf_foo(int arg1, void *implicit_arg);
+ * into
+ *	bpf_foo_impl(int arg1, void *implicit_arg);
+ *	__bpf_kfunc bpf_foo(int arg1);
+ *
+ * If a kfunc with KF_IMPLICIT_ARGS already has an _impl counterpart
+ * in BTF, then it's a legacy case: an _impl function is declared in the
+ * source code. In this case, we can skip adding an _impl function, but we
+ * still have to add a func prototype that omits implicit args.
+ */
+static int process_kfunc_with_implicit_args(struct btf2btf_context *ctx, struct kfunc *kfunc)
+{
+	s32 idx, new_proto_id, new_func_id, proto_id;
+	const char *param_name, *tag_name;
+	const struct btf_param *params;
+	enum btf_func_linkage linkage;
+	char tmp_name[KSYM_NAME_LEN];
+	struct btf *btf = ctx->btf;
+	int err, len, nr_params;
+	struct btf_type *t;
+
+	t = (struct btf_type *)btf__type_by_id(btf, kfunc->btf_id);
+	if (!t || !btf_is_func(t)) {
+		pr_err("ERROR: resolve_btfids: btf id %d is not a function\n", kfunc->btf_id);
+		return -EINVAL;
+	}
+
+	linkage = btf_vlen(t);
+
+	proto_id = t->type;
+	t = (struct btf_type *)btf__type_by_id(btf, proto_id);
+	if (!t || !btf_is_func_proto(t)) {
+		pr_err("ERROR: resolve_btfids: btf id %d is not a function prototype\n", proto_id);
+		return -EINVAL;
+	}
+
+	len = snprintf(tmp_name, sizeof(tmp_name), "%s%s", kfunc->name, KF_IMPL_SUFFIX);
+	if (len < 0 || len >= sizeof(tmp_name)) {
+		pr_err("ERROR: function name is too long: %s%s\n", kfunc->name, KF_IMPL_SUFFIX);
+		return -E2BIG;
+	}
+
+	if (btf__find_by_name_kind(btf, tmp_name, BTF_KIND_FUNC) > 0) {
+		pr_debug("resolve_btfids: function %s already exists in BTF\n", tmp_name);
+		goto add_new_proto;
+	}
+
+	/* Add a new function with _impl suffix and original prototype */
+	new_func_id = btf__add_func(btf, tmp_name, linkage, proto_id);
+	if (new_func_id < 0) {
+		pr_err("ERROR: resolve_btfids: failed to add func %s to BTF\n", tmp_name);
+		return new_func_id;
+	}
+
+	/* Copy all decl tags except "bpf_kfunc" from the original kfunc to the new one */
+	for (int i = 0; i < ctx->nr_decl_tags; i++) {
+		t = (struct btf_type *)btf__type_by_id(btf, ctx->decl_tags[i]);
+		if (t->type != kfunc->btf_id)
+			continue;
+
+		tag_name = btf__name_by_offset(btf, t->name_off);
+		if (strcmp(tag_name, "bpf_kfunc") == 0)
+			continue;
+
+		idx = btf_decl_tag(t)->component_idx;
+
+		if (btf_kflag(t))
+			err = btf__add_decl_attr(btf, tag_name, new_func_id, idx);
+		else
+			err = btf__add_decl_tag(btf, tag_name, new_func_id, idx);
+
+		if (err < 0) {
+			pr_err("ERROR: resolve_btfids: failed to add decl tag %s for %s\n",
+			       tag_name, tmp_name);
+			return -EINVAL;
+		}
+	}
+
+add_new_proto:
+	t = (struct btf_type *)btf__type_by_id(btf, proto_id);
+	new_proto_id = btf__add_func_proto(btf, t->type);
+	if (new_proto_id < 0) {
+		pr_err("ERROR: resolve_btfids: failed to add func proto for %s\n", kfunc->name);
+		return new_proto_id;
+	}
+
+	/* Add non-implicit args to the new prototype */
+	t = (struct btf_type *)btf__type_by_id(btf, proto_id);
+	nr_params = btf_vlen(t);
+	for (int i = 0; i < nr_params; i++) {
+		params = btf_params(t);
+		if (is_kf_implicit_arg(btf, &params[i]))
+			break;
+		param_name = btf__name_by_offset(btf, params[i].name_off);
+		err = btf__add_func_param(btf, param_name, params[i].type);
+		if (err < 0) {
+			pr_err("ERROR: resolve_btfids: failed to add param %s for %s\n",
+			       param_name, kfunc->name);
+			return err;
+		}
+		t = (struct btf_type *)btf__type_by_id(btf, proto_id);
+	}
+
+	/* Finally change the prototype of the original kfunc to the new one */
+	t = (struct btf_type *)btf__type_by_id(btf, kfunc->btf_id);
+	t->type = new_proto_id;
+
+	pr_debug("resolve_btfids: updated BTF for kfunc with implicit args %s\n", kfunc->name);
+
+	return 0;
+}
+
+static int btf2btf(struct object *obj)
+{
+	struct btf2btf_context ctx = {};
+	int err;
+
+	err = build_btf2btf_context(obj, &ctx);
+	if (err)
+		goto out;
+
+	for (u32 i = 0; i < ctx.nr_kfuncs; i++) {
+		struct kfunc *kfunc = &ctx.kfuncs[i];
+
+		if (!(kfunc->flags & KF_IMPLICIT_ARGS))
+			continue;
+
+		err = process_kfunc_with_implicit_args(&ctx, kfunc);
+		if (err)
+			goto out;
+	}
+
+	err = 0;
+out:
+	free(ctx.decl_tags);
+	free(ctx.kfuncs);
+
+	return err;
+}
+
 /*
  * Sort types by name in ascending order resulting in all
  * anonymous types being placed before named types.
@@ -1126,6 +1505,9 @@ int main(int argc, const char **argv)
 	if (load_btf(&obj))
 		goto out;
 
+	if (btf2btf(&obj))
+		goto out;
+
 	if (finalize_btf(&obj))
 		goto out;
 
-- 
cgit v1.2.3


From e939f3d16d77a88e5f363394ef73db4c898c4107 Mon Sep 17 00:00:00 2001
From: Ihor Solodrai <ihor.solodrai@linux.dev>
Date: Tue, 20 Jan 2026 14:26:31 -0800
Subject: selftests/bpf: Add tests for KF_IMPLICIT_ARGS

Add trivial end-to-end tests to validate that KF_IMPLICIT_ARGS flag is
properly handled by both resolve_btfids and the verifier.

Declare kfuncs in bpf_testmod. Check that bpf_prog_aux pointer is set
in the kfunc implementation. Verify that calls with implicit args and
a legacy case all work.

Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: Ihor Solodrai <ihor.solodrai@linux.dev>
Link: https://lore.kernel.org/r/20260120222638.3976562-7-ihor.solodrai@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 .../selftests/bpf/prog_tests/kfunc_implicit_args.c | 10 ++++++
 .../selftests/bpf/progs/kfunc_implicit_args.c      | 41 ++++++++++++++++++++++
 .../testing/selftests/bpf/test_kmods/bpf_testmod.c | 26 ++++++++++++++
 3 files changed, 77 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/kfunc_implicit_args.c
 create mode 100644 tools/testing/selftests/bpf/progs/kfunc_implicit_args.c

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/kfunc_implicit_args.c b/tools/testing/selftests/bpf/prog_tests/kfunc_implicit_args.c
new file mode 100644
index 000000000000..5e4793c9c29a
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/kfunc_implicit_args.c
@@ -0,0 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */
+
+#include <test_progs.h>
+#include "kfunc_implicit_args.skel.h"
+
+void test_kfunc_implicit_args(void)
+{
+	RUN_TESTS(kfunc_implicit_args);
+}
diff --git a/tools/testing/selftests/bpf/progs/kfunc_implicit_args.c b/tools/testing/selftests/bpf/progs/kfunc_implicit_args.c
new file mode 100644
index 000000000000..89b6a47e22dd
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/kfunc_implicit_args.c
@@ -0,0 +1,41 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */
+
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+#include "bpf_misc.h"
+
+extern int bpf_kfunc_implicit_arg(int a) __weak __ksym;
+extern int bpf_kfunc_implicit_arg_impl(int a, struct bpf_prog_aux *aux) __weak __ksym; /* illegal */
+extern int bpf_kfunc_implicit_arg_legacy(int a, int b) __weak __ksym;
+extern int bpf_kfunc_implicit_arg_legacy_impl(int a, int b, struct bpf_prog_aux *aux) __weak __ksym;
+
+char _license[] SEC("license") = "GPL";
+
+SEC("syscall")
+__retval(5)
+int test_kfunc_implicit_arg(void *ctx)
+{
+	return bpf_kfunc_implicit_arg(5);
+}
+
+SEC("syscall")
+__failure __msg("cannot find address for kernel function bpf_kfunc_implicit_arg_impl")
+int test_kfunc_implicit_arg_impl_illegal(void *ctx)
+{
+	return bpf_kfunc_implicit_arg_impl(5, NULL);
+}
+
+SEC("syscall")
+__retval(7)
+int test_kfunc_implicit_arg_legacy(void *ctx)
+{
+	return bpf_kfunc_implicit_arg_legacy(3, 4);
+}
+
+SEC("syscall")
+__retval(11)
+int test_kfunc_implicit_arg_legacy_impl(void *ctx)
+{
+	return bpf_kfunc_implicit_arg_legacy_impl(5, 6, NULL);
+}
diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c
index bc07ce9d5477..a996b816ecc4 100644
--- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c
+++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c
@@ -1142,6 +1142,10 @@ __bpf_kfunc int bpf_kfunc_st_ops_inc10(struct st_ops_args *args)
 __bpf_kfunc int bpf_kfunc_multi_st_ops_test_1(struct st_ops_args *args, u32 id);
 __bpf_kfunc int bpf_kfunc_multi_st_ops_test_1_impl(struct st_ops_args *args, void *aux_prog);
 
+__bpf_kfunc int bpf_kfunc_implicit_arg(int a, struct bpf_prog_aux *aux);
+__bpf_kfunc int bpf_kfunc_implicit_arg_legacy(int a, int b, struct bpf_prog_aux *aux);
+__bpf_kfunc int bpf_kfunc_implicit_arg_legacy_impl(int a, int b, struct bpf_prog_aux *aux);
+
 BTF_KFUNCS_START(bpf_testmod_check_kfunc_ids)
 BTF_ID_FLAGS(func, bpf_testmod_test_mod_kfunc)
 BTF_ID_FLAGS(func, bpf_kfunc_call_test1)
@@ -1184,6 +1188,9 @@ BTF_ID_FLAGS(func, bpf_kfunc_st_ops_test_pro_epilogue, KF_SLEEPABLE)
 BTF_ID_FLAGS(func, bpf_kfunc_st_ops_inc10)
 BTF_ID_FLAGS(func, bpf_kfunc_multi_st_ops_test_1)
 BTF_ID_FLAGS(func, bpf_kfunc_multi_st_ops_test_1_impl)
+BTF_ID_FLAGS(func, bpf_kfunc_implicit_arg, KF_IMPLICIT_ARGS)
+BTF_ID_FLAGS(func, bpf_kfunc_implicit_arg_legacy, KF_IMPLICIT_ARGS)
+BTF_ID_FLAGS(func, bpf_kfunc_implicit_arg_legacy_impl)
 BTF_KFUNCS_END(bpf_testmod_check_kfunc_ids)
 
 static int bpf_testmod_ops_init(struct btf *btf)
@@ -1675,6 +1682,25 @@ int bpf_kfunc_multi_st_ops_test_1_impl(struct st_ops_args *args, void *aux__prog
 	return ret;
 }
 
+int bpf_kfunc_implicit_arg(int a, struct bpf_prog_aux *aux)
+{
+	if (aux && a > 0)
+		return a;
+	return -EINVAL;
+}
+
+int bpf_kfunc_implicit_arg_legacy(int a, int b, struct bpf_prog_aux *aux)
+{
+	if (aux)
+		return a + b;
+	return -EINVAL;
+}
+
+int bpf_kfunc_implicit_arg_legacy_impl(int a, int b, struct bpf_prog_aux *aux)
+{
+	return bpf_kfunc_implicit_arg_legacy(a, b, aux);
+}
+
 static int multi_st_ops_reg(void *kdata, struct bpf_link *link)
 {
 	struct bpf_testmod_multi_st_ops *st_ops =
-- 
cgit v1.2.3


From b97931a25a4bc74076ffb5c3d1a534c71ade4d55 Mon Sep 17 00:00:00 2001
From: Ihor Solodrai <ihor.solodrai@linux.dev>
Date: Tue, 20 Jan 2026 14:26:32 -0800
Subject: bpf: Migrate bpf_wq_set_callback_impl() to KF_IMPLICIT_ARGS

Implement bpf_wq_set_callback() with an implicit bpf_prog_aux
argument, and remove bpf_wq_set_callback_impl().

Update special kfunc checks in the verifier accordingly.

Reviewed-by: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: Ihor Solodrai <ihor.solodrai@linux.dev>
Link: https://lore.kernel.org/r/20260120222638.3976562-8-ihor.solodrai@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/helpers.c                                     | 11 +++++------
 kernel/bpf/verifier.c                                    | 16 ++++++++--------
 tools/testing/selftests/bpf/bpf_experimental.h           |  5 -----
 .../selftests/bpf/progs/verifier_async_cb_context.c      |  4 ++--
 tools/testing/selftests/bpf/progs/wq_failures.c          |  4 ++--
 5 files changed, 17 insertions(+), 23 deletions(-)

(limited to 'tools')

diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 9eaa4185e0a7..c76a9003b221 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -3120,12 +3120,11 @@ __bpf_kfunc int bpf_wq_start(struct bpf_wq *wq, unsigned int flags)
 	return 0;
 }
 
-__bpf_kfunc int bpf_wq_set_callback_impl(struct bpf_wq *wq,
-					 int (callback_fn)(void *map, int *key, void *value),
-					 unsigned int flags,
-					 void *aux__prog)
+__bpf_kfunc int bpf_wq_set_callback(struct bpf_wq *wq,
+				    int (callback_fn)(void *map, int *key, void *value),
+				    unsigned int flags,
+				    struct bpf_prog_aux *aux)
 {
-	struct bpf_prog_aux *aux = (struct bpf_prog_aux *)aux__prog;
 	struct bpf_async_kern *async = (struct bpf_async_kern *)wq;
 
 	if (flags)
@@ -4488,7 +4487,7 @@ BTF_ID_FLAGS(func, bpf_dynptr_memset)
 BTF_ID_FLAGS(func, bpf_modify_return_test_tp)
 #endif
 BTF_ID_FLAGS(func, bpf_wq_init)
-BTF_ID_FLAGS(func, bpf_wq_set_callback_impl)
+BTF_ID_FLAGS(func, bpf_wq_set_callback, KF_IMPLICIT_ARGS)
 BTF_ID_FLAGS(func, bpf_wq_start)
 BTF_ID_FLAGS(func, bpf_preempt_disable)
 BTF_ID_FLAGS(func, bpf_preempt_enable)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index adc24a2ce5b6..51e8c9f70868 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -520,7 +520,7 @@ static bool is_async_callback_calling_kfunc(u32 btf_id);
 static bool is_callback_calling_kfunc(u32 btf_id);
 static bool is_bpf_throw_kfunc(struct bpf_insn *insn);
 
-static bool is_bpf_wq_set_callback_impl_kfunc(u32 btf_id);
+static bool is_bpf_wq_set_callback_kfunc(u32 btf_id);
 static bool is_task_work_add_kfunc(u32 func_id);
 
 static bool is_sync_callback_calling_function(enum bpf_func_id func_id)
@@ -562,7 +562,7 @@ static bool is_async_cb_sleepable(struct bpf_verifier_env *env, struct bpf_insn
 
 	/* bpf_wq and bpf_task_work callbacks are always sleepable. */
 	if (bpf_pseudo_kfunc_call(insn) && insn->off == 0 &&
-	    (is_bpf_wq_set_callback_impl_kfunc(insn->imm) || is_task_work_add_kfunc(insn->imm)))
+	    (is_bpf_wq_set_callback_kfunc(insn->imm) || is_task_work_add_kfunc(insn->imm)))
 		return true;
 
 	verifier_bug(env, "unhandled async callback in is_async_cb_sleepable");
@@ -12437,7 +12437,7 @@ enum special_kfunc_type {
 	KF_bpf_percpu_obj_new_impl,
 	KF_bpf_percpu_obj_drop_impl,
 	KF_bpf_throw,
-	KF_bpf_wq_set_callback_impl,
+	KF_bpf_wq_set_callback,
 	KF_bpf_preempt_disable,
 	KF_bpf_preempt_enable,
 	KF_bpf_iter_css_task_new,
@@ -12501,7 +12501,7 @@ BTF_ID(func, bpf_dynptr_clone)
 BTF_ID(func, bpf_percpu_obj_new_impl)
 BTF_ID(func, bpf_percpu_obj_drop_impl)
 BTF_ID(func, bpf_throw)
-BTF_ID(func, bpf_wq_set_callback_impl)
+BTF_ID(func, bpf_wq_set_callback)
 BTF_ID(func, bpf_preempt_disable)
 BTF_ID(func, bpf_preempt_enable)
 #ifdef CONFIG_CGROUPS
@@ -12994,7 +12994,7 @@ static bool is_sync_callback_calling_kfunc(u32 btf_id)
 
 static bool is_async_callback_calling_kfunc(u32 btf_id)
 {
-	return btf_id == special_kfunc_list[KF_bpf_wq_set_callback_impl] ||
+	return is_bpf_wq_set_callback_kfunc(btf_id) ||
 	       is_task_work_add_kfunc(btf_id);
 }
 
@@ -13004,9 +13004,9 @@ static bool is_bpf_throw_kfunc(struct bpf_insn *insn)
 	       insn->imm == special_kfunc_list[KF_bpf_throw];
 }
 
-static bool is_bpf_wq_set_callback_impl_kfunc(u32 btf_id)
+static bool is_bpf_wq_set_callback_kfunc(u32 btf_id)
 {
-	return btf_id == special_kfunc_list[KF_bpf_wq_set_callback_impl];
+	return btf_id == special_kfunc_list[KF_bpf_wq_set_callback];
 }
 
 static bool is_callback_calling_kfunc(u32 btf_id)
@@ -14085,7 +14085,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 		meta.r0_rdonly = false;
 	}
 
-	if (is_bpf_wq_set_callback_impl_kfunc(meta.func_id)) {
+	if (is_bpf_wq_set_callback_kfunc(meta.func_id)) {
 		err = push_callback_call(env, insn, insn_idx, meta.subprogno,
 					 set_timer_callback_state);
 		if (err) {
diff --git a/tools/testing/selftests/bpf/bpf_experimental.h b/tools/testing/selftests/bpf/bpf_experimental.h
index 2cd9165c7348..68a49b1f77ae 100644
--- a/tools/testing/selftests/bpf/bpf_experimental.h
+++ b/tools/testing/selftests/bpf/bpf_experimental.h
@@ -580,11 +580,6 @@ extern void bpf_iter_css_destroy(struct bpf_iter_css *it) __weak __ksym;
 
 extern int bpf_wq_init(struct bpf_wq *wq, void *p__map, unsigned int flags) __weak __ksym;
 extern int bpf_wq_start(struct bpf_wq *wq, unsigned int flags) __weak __ksym;
-extern int bpf_wq_set_callback_impl(struct bpf_wq *wq,
-		int (callback_fn)(void *map, int *key, void *value),
-		unsigned int flags__k, void *aux__ign) __ksym;
-#define bpf_wq_set_callback(timer, cb, flags) \
-	bpf_wq_set_callback_impl(timer, cb, flags, NULL)
 
 struct bpf_iter_kmem_cache;
 extern int bpf_iter_kmem_cache_new(struct bpf_iter_kmem_cache *it) __weak __ksym;
diff --git a/tools/testing/selftests/bpf/progs/verifier_async_cb_context.c b/tools/testing/selftests/bpf/progs/verifier_async_cb_context.c
index 7efa9521105e..5d5e1cd4d51d 100644
--- a/tools/testing/selftests/bpf/progs/verifier_async_cb_context.c
+++ b/tools/testing/selftests/bpf/progs/verifier_async_cb_context.c
@@ -96,7 +96,7 @@ int wq_non_sleepable_prog(void *ctx)
 
 	if (bpf_wq_init(&val->w, &wq_map, 0) != 0)
 		return 0;
-	if (bpf_wq_set_callback_impl(&val->w, wq_cb, 0, NULL) != 0)
+	if (bpf_wq_set_callback(&val->w, wq_cb, 0) != 0)
 		return 0;
 	return 0;
 }
@@ -114,7 +114,7 @@ int wq_sleepable_prog(void *ctx)
 
 	if (bpf_wq_init(&val->w, &wq_map, 0) != 0)
 		return 0;
-	if (bpf_wq_set_callback_impl(&val->w, wq_cb, 0, NULL) != 0)
+	if (bpf_wq_set_callback(&val->w, wq_cb, 0) != 0)
 		return 0;
 	return 0;
 }
diff --git a/tools/testing/selftests/bpf/progs/wq_failures.c b/tools/testing/selftests/bpf/progs/wq_failures.c
index d06f6d40594a..3767f5595bbc 100644
--- a/tools/testing/selftests/bpf/progs/wq_failures.c
+++ b/tools/testing/selftests/bpf/progs/wq_failures.c
@@ -97,7 +97,7 @@ __failure
 /* check that the first argument of bpf_wq_set_callback()
  * is a correct bpf_wq pointer.
  */
-__msg(": (85) call bpf_wq_set_callback_impl#") /* anchor message */
+__msg(": (85) call bpf_wq_set_callback#") /* anchor message */
 __msg("arg#0 doesn't point to a map value")
 long test_wrong_wq_pointer(void *ctx)
 {
@@ -123,7 +123,7 @@ __failure
 /* check that the first argument of bpf_wq_set_callback()
  * is a correct bpf_wq pointer.
  */
-__msg(": (85) call bpf_wq_set_callback_impl#") /* anchor message */
+__msg(": (85) call bpf_wq_set_callback#") /* anchor message */
 __msg("off 1 doesn't point to 'struct bpf_wq' that is at 0")
 long test_wrong_wq_pointer_offset(void *ctx)
 {
-- 
cgit v1.2.3


From 8157cc739ad301b7fb6dfc4cfc5497cedd33df4e Mon Sep 17 00:00:00 2001
From: Ihor Solodrai <ihor.solodrai@linux.dev>
Date: Tue, 20 Jan 2026 14:26:33 -0800
Subject: HID: Use bpf_wq_set_callback kernel function

Remove extern declaration of bpf_wq_set_callback_impl() from
hid_bpf_helpers.h and replace bpf_wq_set_callback macro with a
corresponding new declaration.

Tested with:
  # append tools/testing/selftests/hid/config and build the kernel
  $ make -C tools/testing/selftests/hid
  # in built kernel
  $ ./tools/testing/selftests/hid/hid_bpf -t test_multiply_events_wq

  TAP version 13
  1..1
  # Starting 1 tests from 1 test cases.
  #  RUN           hid_bpf.test_multiply_events_wq ...
  [    2.575520] hid-generic 0003:0001:0A36.0001: hidraw0: USB HID v0.00 Device [test-uhid-device-138] on 138
  #            OK  hid_bpf.test_multiply_events_wq
  ok 1 hid_bpf.test_multiply_events_wq
  # PASSED: 1 / 1 tests passed.
  # Totals: pass:1 fail:0 xfail:0 xpass:0 skip:0 error:0
  PASS

Acked-by: Benjamin Tissoires <bentiss@kernel.org>
Signed-off-by: Ihor Solodrai <ihor.solodrai@linux.dev>
Link: https://lore.kernel.org/r/20260120222638.3976562-9-ihor.solodrai@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 drivers/hid/bpf/progs/hid_bpf_helpers.h             | 8 +++-----
 tools/testing/selftests/hid/progs/hid_bpf_helpers.h | 8 +++-----
 2 files changed, 6 insertions(+), 10 deletions(-)

(limited to 'tools')

diff --git a/drivers/hid/bpf/progs/hid_bpf_helpers.h b/drivers/hid/bpf/progs/hid_bpf_helpers.h
index bf19785a6b06..228f8d787567 100644
--- a/drivers/hid/bpf/progs/hid_bpf_helpers.h
+++ b/drivers/hid/bpf/progs/hid_bpf_helpers.h
@@ -33,11 +33,9 @@ extern int hid_bpf_try_input_report(struct hid_bpf_ctx *ctx,
 /* bpf_wq implementation */
 extern int bpf_wq_init(struct bpf_wq *wq, void *p__map, unsigned int flags) __weak __ksym;
 extern int bpf_wq_start(struct bpf_wq *wq, unsigned int flags) __weak __ksym;
-extern int bpf_wq_set_callback_impl(struct bpf_wq *wq,
-		int (callback_fn)(void *map, int *key, void *value),
-		unsigned int flags__k, void *aux__ign) __ksym;
-#define bpf_wq_set_callback(wq, cb, flags) \
-	bpf_wq_set_callback_impl(wq, cb, flags, NULL)
+extern int bpf_wq_set_callback(struct bpf_wq *wq,
+		int (*callback_fn)(void *, int *, void *),
+		unsigned int flags) __weak __ksym;
 
 #define HID_MAX_DESCRIPTOR_SIZE	4096
 #define HID_IGNORE_EVENT	-1
diff --git a/tools/testing/selftests/hid/progs/hid_bpf_helpers.h b/tools/testing/selftests/hid/progs/hid_bpf_helpers.h
index 531228b849da..80ab60905865 100644
--- a/tools/testing/selftests/hid/progs/hid_bpf_helpers.h
+++ b/tools/testing/selftests/hid/progs/hid_bpf_helpers.h
@@ -116,10 +116,8 @@ extern int hid_bpf_try_input_report(struct hid_bpf_ctx *ctx,
 /* bpf_wq implementation */
 extern int bpf_wq_init(struct bpf_wq *wq, void *p__map, unsigned int flags) __weak __ksym;
 extern int bpf_wq_start(struct bpf_wq *wq, unsigned int flags) __weak __ksym;
-extern int bpf_wq_set_callback_impl(struct bpf_wq *wq,
-		int (callback_fn)(void *map, int *key, void *wq),
-		unsigned int flags__k, void *aux__ign) __weak __ksym;
-#define bpf_wq_set_callback(timer, cb, flags) \
-	bpf_wq_set_callback_impl(timer, cb, flags, NULL)
+extern int bpf_wq_set_callback(struct bpf_wq *wq,
+		int (*callback_fn)(void *, int *, void *),
+		unsigned int flags) __weak __ksym;
 
 #endif /* __HID_BPF_HELPERS_H */
-- 
cgit v1.2.3


From 6e663ffdf7600168338fdfa2fd1eed83395d58a3 Mon Sep 17 00:00:00 2001
From: Ihor Solodrai <ihor.solodrai@linux.dev>
Date: Tue, 20 Jan 2026 14:26:34 -0800
Subject: bpf: Migrate bpf_task_work_schedule_* kfuncs to KF_IMPLICIT_ARGS

Implement bpf_task_work_schedule_* with an implicit bpf_prog_aux
argument, and remove corresponding _impl funcs from the kernel.

Update special kfunc checks in the verifier accordingly.

Update the selftests to use the new API with implicit argument.

Reviewed-by: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: Ihor Solodrai <ihor.solodrai@linux.dev>
Link: https://lore.kernel.org/r/20260120222638.3976562-10-ihor.solodrai@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/helpers.c                               | 30 ++++++++++------------
 kernel/bpf/verifier.c                              | 12 ++++-----
 tools/testing/selftests/bpf/progs/file_reader.c    |  2 +-
 tools/testing/selftests/bpf/progs/task_work.c      |  7 +++--
 tools/testing/selftests/bpf/progs/task_work_fail.c |  8 +++---
 .../testing/selftests/bpf/progs/task_work_stress.c |  4 +--
 .../bpf/progs/verifier_async_cb_context.c          |  4 +--
 7 files changed, 32 insertions(+), 35 deletions(-)

(limited to 'tools')

diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index c76a9003b221..f2f974b5fb3b 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -4274,41 +4274,39 @@ release_prog:
 }
 
 /**
- * bpf_task_work_schedule_signal_impl - Schedule BPF callback using task_work_add with TWA_SIGNAL
+ * bpf_task_work_schedule_signal - Schedule BPF callback using task_work_add with TWA_SIGNAL
  * mode
  * @task: Task struct for which callback should be scheduled
  * @tw: Pointer to struct bpf_task_work in BPF map value for internal bookkeeping
  * @map__map: bpf_map that embeds struct bpf_task_work in the values
  * @callback: pointer to BPF subprogram to call
- * @aux__prog: user should pass NULL
+ * @aux: pointer to bpf_prog_aux of the caller BPF program, implicitly set by the verifier
  *
  * Return: 0 if task work has been scheduled successfully, negative error code otherwise
  */
-__bpf_kfunc int bpf_task_work_schedule_signal_impl(struct task_struct *task,
-						   struct bpf_task_work *tw, void *map__map,
-						   bpf_task_work_callback_t callback,
-						   void *aux__prog)
+__bpf_kfunc int bpf_task_work_schedule_signal(struct task_struct *task, struct bpf_task_work *tw,
+					      void *map__map, bpf_task_work_callback_t callback,
+					      struct bpf_prog_aux *aux)
 {
-	return bpf_task_work_schedule(task, tw, map__map, callback, aux__prog, TWA_SIGNAL);
+	return bpf_task_work_schedule(task, tw, map__map, callback, aux, TWA_SIGNAL);
 }
 
 /**
- * bpf_task_work_schedule_resume_impl - Schedule BPF callback using task_work_add with TWA_RESUME
+ * bpf_task_work_schedule_resume - Schedule BPF callback using task_work_add with TWA_RESUME
  * mode
  * @task: Task struct for which callback should be scheduled
  * @tw: Pointer to struct bpf_task_work in BPF map value for internal bookkeeping
  * @map__map: bpf_map that embeds struct bpf_task_work in the values
  * @callback: pointer to BPF subprogram to call
- * @aux__prog: user should pass NULL
+ * @aux: pointer to bpf_prog_aux of the caller BPF program, implicitly set by the verifier
  *
  * Return: 0 if task work has been scheduled successfully, negative error code otherwise
  */
-__bpf_kfunc int bpf_task_work_schedule_resume_impl(struct task_struct *task,
-						   struct bpf_task_work *tw, void *map__map,
-						   bpf_task_work_callback_t callback,
-						   void *aux__prog)
+__bpf_kfunc int bpf_task_work_schedule_resume(struct task_struct *task, struct bpf_task_work *tw,
+					      void *map__map, bpf_task_work_callback_t callback,
+					      struct bpf_prog_aux *aux)
 {
-	return bpf_task_work_schedule(task, tw, map__map, callback, aux__prog, TWA_RESUME);
+	return bpf_task_work_schedule(task, tw, map__map, callback, aux, TWA_RESUME);
 }
 
 static int make_file_dynptr(struct file *file, u32 flags, bool may_sleep,
@@ -4536,8 +4534,8 @@ BTF_ID_FLAGS(func, bpf_strncasestr);
 BTF_ID_FLAGS(func, bpf_cgroup_read_xattr, KF_RCU)
 #endif
 BTF_ID_FLAGS(func, bpf_stream_vprintk_impl)
-BTF_ID_FLAGS(func, bpf_task_work_schedule_signal_impl)
-BTF_ID_FLAGS(func, bpf_task_work_schedule_resume_impl)
+BTF_ID_FLAGS(func, bpf_task_work_schedule_signal, KF_IMPLICIT_ARGS)
+BTF_ID_FLAGS(func, bpf_task_work_schedule_resume, KF_IMPLICIT_ARGS)
 BTF_ID_FLAGS(func, bpf_dynptr_from_file)
 BTF_ID_FLAGS(func, bpf_dynptr_file_discard)
 BTF_KFUNCS_END(common_btf_ids)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 51e8c9f70868..8e8570e9d167 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -12457,8 +12457,8 @@ enum special_kfunc_type {
 	KF_bpf_dynptr_from_file,
 	KF_bpf_dynptr_file_discard,
 	KF___bpf_trap,
-	KF_bpf_task_work_schedule_signal_impl,
-	KF_bpf_task_work_schedule_resume_impl,
+	KF_bpf_task_work_schedule_signal,
+	KF_bpf_task_work_schedule_resume,
 	KF_bpf_arena_alloc_pages,
 	KF_bpf_arena_free_pages,
 	KF_bpf_arena_reserve_pages,
@@ -12534,16 +12534,16 @@ BTF_ID(func, bpf_res_spin_unlock_irqrestore)
 BTF_ID(func, bpf_dynptr_from_file)
 BTF_ID(func, bpf_dynptr_file_discard)
 BTF_ID(func, __bpf_trap)
-BTF_ID(func, bpf_task_work_schedule_signal_impl)
-BTF_ID(func, bpf_task_work_schedule_resume_impl)
+BTF_ID(func, bpf_task_work_schedule_signal)
+BTF_ID(func, bpf_task_work_schedule_resume)
 BTF_ID(func, bpf_arena_alloc_pages)
 BTF_ID(func, bpf_arena_free_pages)
 BTF_ID(func, bpf_arena_reserve_pages)
 
 static bool is_task_work_add_kfunc(u32 func_id)
 {
-	return func_id == special_kfunc_list[KF_bpf_task_work_schedule_signal_impl] ||
-	       func_id == special_kfunc_list[KF_bpf_task_work_schedule_resume_impl];
+	return func_id == special_kfunc_list[KF_bpf_task_work_schedule_signal] ||
+	       func_id == special_kfunc_list[KF_bpf_task_work_schedule_resume];
 }
 
 static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta)
diff --git a/tools/testing/selftests/bpf/progs/file_reader.c b/tools/testing/selftests/bpf/progs/file_reader.c
index 4d756b623557..462712ff3b8a 100644
--- a/tools/testing/selftests/bpf/progs/file_reader.c
+++ b/tools/testing/selftests/bpf/progs/file_reader.c
@@ -77,7 +77,7 @@ int on_open_validate_file_read(void *c)
 		err = 1;
 		return 0;
 	}
-	bpf_task_work_schedule_signal_impl(task, &work->tw, &arrmap, task_work_callback, NULL);
+	bpf_task_work_schedule_signal(task, &work->tw, &arrmap, task_work_callback);
 	return 0;
 }
 
diff --git a/tools/testing/selftests/bpf/progs/task_work.c b/tools/testing/selftests/bpf/progs/task_work.c
index 663a80990f8f..a6009d105158 100644
--- a/tools/testing/selftests/bpf/progs/task_work.c
+++ b/tools/testing/selftests/bpf/progs/task_work.c
@@ -65,8 +65,7 @@ int oncpu_hash_map(struct pt_regs *args)
 	work = bpf_map_lookup_elem(&hmap, &key);
 	if (!work)
 		return 0;
-
-	bpf_task_work_schedule_resume_impl(task, &work->tw, &hmap, process_work, NULL);
+	bpf_task_work_schedule_resume(task, &work->tw, &hmap, process_work);
 	return 0;
 }
 
@@ -80,7 +79,7 @@ int oncpu_array_map(struct pt_regs *args)
 	work = bpf_map_lookup_elem(&arrmap, &key);
 	if (!work)
 		return 0;
-	bpf_task_work_schedule_signal_impl(task, &work->tw, &arrmap, process_work, NULL);
+	bpf_task_work_schedule_signal(task, &work->tw, &arrmap, process_work);
 	return 0;
 }
 
@@ -102,6 +101,6 @@ int oncpu_lru_map(struct pt_regs *args)
 	work = bpf_map_lookup_elem(&lrumap, &key);
 	if (!work || work->data[0])
 		return 0;
-	bpf_task_work_schedule_resume_impl(task, &work->tw, &lrumap, process_work, NULL);
+	bpf_task_work_schedule_resume(task, &work->tw, &lrumap, process_work);
 	return 0;
 }
diff --git a/tools/testing/selftests/bpf/progs/task_work_fail.c b/tools/testing/selftests/bpf/progs/task_work_fail.c
index 1270953fd092..82e4b8913333 100644
--- a/tools/testing/selftests/bpf/progs/task_work_fail.c
+++ b/tools/testing/selftests/bpf/progs/task_work_fail.c
@@ -53,7 +53,7 @@ int mismatch_map(struct pt_regs *args)
 	work = bpf_map_lookup_elem(&arrmap, &key);
 	if (!work)
 		return 0;
-	bpf_task_work_schedule_resume_impl(task, &work->tw, &hmap, process_work, NULL);
+	bpf_task_work_schedule_resume(task, &work->tw, &hmap, process_work);
 	return 0;
 }
 
@@ -65,7 +65,7 @@ int no_map_task_work(struct pt_regs *args)
 	struct bpf_task_work tw;
 
 	task = bpf_get_current_task_btf();
-	bpf_task_work_schedule_resume_impl(task, &tw, &hmap, process_work, NULL);
+	bpf_task_work_schedule_resume(task, &tw, &hmap, process_work);
 	return 0;
 }
 
@@ -76,7 +76,7 @@ int task_work_null(struct pt_regs *args)
 	struct task_struct *task;
 
 	task = bpf_get_current_task_btf();
-	bpf_task_work_schedule_resume_impl(task, NULL, &hmap, process_work, NULL);
+	bpf_task_work_schedule_resume(task, NULL, &hmap, process_work);
 	return 0;
 }
 
@@ -91,6 +91,6 @@ int map_null(struct pt_regs *args)
 	work = bpf_map_lookup_elem(&arrmap, &key);
 	if (!work)
 		return 0;
-	bpf_task_work_schedule_resume_impl(task, &work->tw, NULL, process_work, NULL);
+	bpf_task_work_schedule_resume(task, &work->tw, NULL, process_work);
 	return 0;
 }
diff --git a/tools/testing/selftests/bpf/progs/task_work_stress.c b/tools/testing/selftests/bpf/progs/task_work_stress.c
index 55e555f7f41b..1d4378f351ef 100644
--- a/tools/testing/selftests/bpf/progs/task_work_stress.c
+++ b/tools/testing/selftests/bpf/progs/task_work_stress.c
@@ -51,8 +51,8 @@ int schedule_task_work(void *ctx)
 		if (!work)
 			return 0;
 	}
-	err = bpf_task_work_schedule_signal_impl(bpf_get_current_task_btf(), &work->tw, &hmap,
-						 process_work, NULL);
+	err = bpf_task_work_schedule_signal(bpf_get_current_task_btf(), &work->tw, &hmap,
+					    process_work);
 	if (err)
 		__sync_fetch_and_add(&schedule_error, 1);
 	else
diff --git a/tools/testing/selftests/bpf/progs/verifier_async_cb_context.c b/tools/testing/selftests/bpf/progs/verifier_async_cb_context.c
index 5d5e1cd4d51d..39aff82549c9 100644
--- a/tools/testing/selftests/bpf/progs/verifier_async_cb_context.c
+++ b/tools/testing/selftests/bpf/progs/verifier_async_cb_context.c
@@ -156,7 +156,7 @@ int task_work_non_sleepable_prog(void *ctx)
 	if (!task)
 		return 0;
 
-	bpf_task_work_schedule_resume_impl(task, &val->tw, &task_work_map, task_work_cb, NULL);
+	bpf_task_work_schedule_resume(task, &val->tw, &task_work_map, task_work_cb);
 	return 0;
 }
 
@@ -176,6 +176,6 @@ int task_work_sleepable_prog(void *ctx)
 	if (!task)
 		return 0;
 
-	bpf_task_work_schedule_resume_impl(task, &val->tw, &task_work_map, task_work_cb, NULL);
+	bpf_task_work_schedule_resume(task, &val->tw, &task_work_map, task_work_cb);
 	return 0;
 }
-- 
cgit v1.2.3


From d806f3101276a1ed18d963944580e1ee1c7a3d26 Mon Sep 17 00:00:00 2001
From: Ihor Solodrai <ihor.solodrai@linux.dev>
Date: Tue, 20 Jan 2026 14:26:35 -0800
Subject: bpf: Migrate bpf_stream_vprintk() to KF_IMPLICIT_ARGS

Implement bpf_stream_vprintk with an implicit bpf_prog_aux argument,
and remote bpf_stream_vprintk_impl from the kernel.

Update the selftests to use the new API with implicit argument.

bpf_stream_vprintk macro is changed to use the new bpf_stream_vprintk
kfunc, and the extern definition of bpf_stream_vprintk_impl is
replaced accordingly.

Reviewed-by: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: Ihor Solodrai <ihor.solodrai@linux.dev>
Link: https://lore.kernel.org/r/20260120222638.3976562-11-ihor.solodrai@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/helpers.c                            | 2 +-
 kernel/bpf/stream.c                             | 5 ++---
 tools/lib/bpf/bpf_helpers.h                     | 6 +++---
 tools/testing/selftests/bpf/progs/stream_fail.c | 6 +++---
 4 files changed, 9 insertions(+), 10 deletions(-)

(limited to 'tools')

diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index f2f974b5fb3b..f8aa1320e2f7 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -4533,7 +4533,7 @@ BTF_ID_FLAGS(func, bpf_strncasestr);
 #if defined(CONFIG_BPF_LSM) && defined(CONFIG_CGROUPS)
 BTF_ID_FLAGS(func, bpf_cgroup_read_xattr, KF_RCU)
 #endif
-BTF_ID_FLAGS(func, bpf_stream_vprintk_impl)
+BTF_ID_FLAGS(func, bpf_stream_vprintk, KF_IMPLICIT_ARGS)
 BTF_ID_FLAGS(func, bpf_task_work_schedule_signal, KF_IMPLICIT_ARGS)
 BTF_ID_FLAGS(func, bpf_task_work_schedule_resume, KF_IMPLICIT_ARGS)
 BTF_ID_FLAGS(func, bpf_dynptr_from_file)
diff --git a/kernel/bpf/stream.c b/kernel/bpf/stream.c
index 0b6bc3f30335..24730df55e69 100644
--- a/kernel/bpf/stream.c
+++ b/kernel/bpf/stream.c
@@ -212,14 +212,13 @@ __bpf_kfunc_start_defs();
  * Avoid using enum bpf_stream_id so that kfunc users don't have to pull in the
  * enum in headers.
  */
-__bpf_kfunc int bpf_stream_vprintk_impl(int stream_id, const char *fmt__str, const void *args,
-					u32 len__sz, void *aux__prog)
+__bpf_kfunc int bpf_stream_vprintk(int stream_id, const char *fmt__str, const void *args,
+				   u32 len__sz, struct bpf_prog_aux *aux)
 {
 	struct bpf_bprintf_data data = {
 		.get_bin_args	= true,
 		.get_buf	= true,
 	};
-	struct bpf_prog_aux *aux = aux__prog;
 	u32 fmt_size = strlen(fmt__str) + 1;
 	struct bpf_stream *stream;
 	u32 data_len = len__sz;
diff --git a/tools/lib/bpf/bpf_helpers.h b/tools/lib/bpf/bpf_helpers.h
index d4e4e388e625..c145da05a67c 100644
--- a/tools/lib/bpf/bpf_helpers.h
+++ b/tools/lib/bpf/bpf_helpers.h
@@ -315,8 +315,8 @@ enum libbpf_tristate {
 			  ___param, sizeof(___param));		\
 })
 
-extern int bpf_stream_vprintk_impl(int stream_id, const char *fmt__str, const void *args,
-				   __u32 len__sz, void *aux__prog) __weak __ksym;
+extern int bpf_stream_vprintk(int stream_id, const char *fmt__str, const void *args,
+			      __u32 len__sz) __weak __ksym;
 
 #define bpf_stream_printk(stream_id, fmt, args...)					\
 ({											\
@@ -328,7 +328,7 @@ extern int bpf_stream_vprintk_impl(int stream_id, const char *fmt__str, const vo
 	___bpf_fill(___param, args);							\
 	_Pragma("GCC diagnostic pop")							\
 											\
-	bpf_stream_vprintk_impl(stream_id, ___fmt, ___param, sizeof(___param), NULL);	\
+	bpf_stream_vprintk(stream_id, ___fmt, ___param, sizeof(___param));		\
 })
 
 /* Use __bpf_printk when bpf_printk call has 3 or fewer fmt args
diff --git a/tools/testing/selftests/bpf/progs/stream_fail.c b/tools/testing/selftests/bpf/progs/stream_fail.c
index 3662515f0107..8e8249f3521c 100644
--- a/tools/testing/selftests/bpf/progs/stream_fail.c
+++ b/tools/testing/selftests/bpf/progs/stream_fail.c
@@ -10,7 +10,7 @@ SEC("syscall")
 __failure __msg("Possibly NULL pointer passed")
 int stream_vprintk_null_arg(void *ctx)
 {
-	bpf_stream_vprintk_impl(BPF_STDOUT, "", NULL, 0, NULL);
+	bpf_stream_vprintk(BPF_STDOUT, "", NULL, 0);
 	return 0;
 }
 
@@ -18,7 +18,7 @@ SEC("syscall")
 __failure __msg("R3 type=scalar expected=")
 int stream_vprintk_scalar_arg(void *ctx)
 {
-	bpf_stream_vprintk_impl(BPF_STDOUT, "", (void *)46, 0, NULL);
+	bpf_stream_vprintk(BPF_STDOUT, "", (void *)46, 0);
 	return 0;
 }
 
@@ -26,7 +26,7 @@ SEC("syscall")
 __failure __msg("arg#1 doesn't point to a const string")
 int stream_vprintk_string_arg(void *ctx)
 {
-	bpf_stream_vprintk_impl(BPF_STDOUT, ctx, NULL, 0, NULL);
+	bpf_stream_vprintk(BPF_STDOUT, ctx, NULL, 0);
 	return 0;
 }
 
-- 
cgit v1.2.3


From bd06b977e02d80fe0dec303cc219d007121a4526 Mon Sep 17 00:00:00 2001
From: Ihor Solodrai <ihor.solodrai@linux.dev>
Date: Tue, 20 Jan 2026 14:26:36 -0800
Subject: selftests/bpf: Migrate struct_ops_assoc test to KF_IMPLICIT_ARGS

A test kfunc named bpf_kfunc_multi_st_ops_test_1_impl() is a user of
__prog suffix. Subsequent patch removes __prog support in favor of
KF_IMPLICIT_ARGS, so migrate this kfunc to use implicit argument.

Signed-off-by: Ihor Solodrai <ihor.solodrai@linux.dev>
Link: https://lore.kernel.org/r/20260120222638.3976562-12-ihor.solodrai@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/progs/struct_ops_assoc.c          | 8 ++++----
 tools/testing/selftests/bpf/progs/struct_ops_assoc_in_timer.c | 4 ++--
 tools/testing/selftests/bpf/progs/struct_ops_assoc_reuse.c    | 6 +++---
 tools/testing/selftests/bpf/test_kmods/bpf_testmod.c          | 9 ++++-----
 tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h    | 6 ++++--
 5 files changed, 17 insertions(+), 16 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/progs/struct_ops_assoc.c b/tools/testing/selftests/bpf/progs/struct_ops_assoc.c
index 8f1097903e22..68842e3f936b 100644
--- a/tools/testing/selftests/bpf/progs/struct_ops_assoc.c
+++ b/tools/testing/selftests/bpf/progs/struct_ops_assoc.c
@@ -32,7 +32,7 @@ int BPF_PROG(sys_enter_prog_a, struct pt_regs *regs, long id)
 	if (!test_pid || task->pid != test_pid)
 		return 0;
 
-	ret = bpf_kfunc_multi_st_ops_test_1_impl(&args, NULL);
+	ret = bpf_kfunc_multi_st_ops_test_1_assoc(&args);
 	if (ret != MAP_A_MAGIC)
 		test_err_a++;
 
@@ -45,7 +45,7 @@ int syscall_prog_a(void *ctx)
 	struct st_ops_args args = {};
 	int ret;
 
-	ret = bpf_kfunc_multi_st_ops_test_1_impl(&args, NULL);
+	ret = bpf_kfunc_multi_st_ops_test_1_assoc(&args);
 	if (ret != MAP_A_MAGIC)
 		test_err_a++;
 
@@ -79,7 +79,7 @@ int BPF_PROG(sys_enter_prog_b, struct pt_regs *regs, long id)
 	if (!test_pid || task->pid != test_pid)
 		return 0;
 
-	ret = bpf_kfunc_multi_st_ops_test_1_impl(&args, NULL);
+	ret = bpf_kfunc_multi_st_ops_test_1_assoc(&args);
 	if (ret != MAP_B_MAGIC)
 		test_err_b++;
 
@@ -92,7 +92,7 @@ int syscall_prog_b(void *ctx)
 	struct st_ops_args args = {};
 	int ret;
 
-	ret = bpf_kfunc_multi_st_ops_test_1_impl(&args, NULL);
+	ret = bpf_kfunc_multi_st_ops_test_1_assoc(&args);
 	if (ret != MAP_B_MAGIC)
 		test_err_b++;
 
diff --git a/tools/testing/selftests/bpf/progs/struct_ops_assoc_in_timer.c b/tools/testing/selftests/bpf/progs/struct_ops_assoc_in_timer.c
index d5a2ea934284..0bed49e9f217 100644
--- a/tools/testing/selftests/bpf/progs/struct_ops_assoc_in_timer.c
+++ b/tools/testing/selftests/bpf/progs/struct_ops_assoc_in_timer.c
@@ -31,7 +31,7 @@ __noinline static int timer_cb(void *map, int *key, struct bpf_timer *timer)
 	struct st_ops_args args = {};
 
 	recur++;
-	timer_test_1_ret = bpf_kfunc_multi_st_ops_test_1_impl(&args, NULL);
+	timer_test_1_ret = bpf_kfunc_multi_st_ops_test_1_assoc(&args);
 	recur--;
 
 	timer_cb_run++;
@@ -64,7 +64,7 @@ int syscall_prog(void *ctx)
 	struct st_ops_args args = {};
 	int ret;
 
-	ret = bpf_kfunc_multi_st_ops_test_1_impl(&args, NULL);
+	ret = bpf_kfunc_multi_st_ops_test_1_assoc(&args);
 	if (ret != MAP_MAGIC)
 		test_err++;
 
diff --git a/tools/testing/selftests/bpf/progs/struct_ops_assoc_reuse.c b/tools/testing/selftests/bpf/progs/struct_ops_assoc_reuse.c
index 5bb6ebf5eed4..396b3e58c729 100644
--- a/tools/testing/selftests/bpf/progs/struct_ops_assoc_reuse.c
+++ b/tools/testing/selftests/bpf/progs/struct_ops_assoc_reuse.c
@@ -23,7 +23,7 @@ int BPF_PROG(test_1_a, struct st_ops_args *args)
 
 	if (!recur) {
 		recur++;
-		ret = bpf_kfunc_multi_st_ops_test_1_impl(args, NULL);
+		ret = bpf_kfunc_multi_st_ops_test_1_assoc(args);
 		if (ret != -1)
 			test_err_a++;
 		recur--;
@@ -40,7 +40,7 @@ int syscall_prog_a(void *ctx)
 	struct st_ops_args args = {};
 	int ret;
 
-	ret = bpf_kfunc_multi_st_ops_test_1_impl(&args, NULL);
+	ret = bpf_kfunc_multi_st_ops_test_1_assoc(&args);
 	if (ret != MAP_A_MAGIC)
 		test_err_a++;
 
@@ -62,7 +62,7 @@ int syscall_prog_b(void *ctx)
 	struct st_ops_args args = {};
 	int ret;
 
-	ret = bpf_kfunc_multi_st_ops_test_1_impl(&args, NULL);
+	ret = bpf_kfunc_multi_st_ops_test_1_assoc(&args);
 	if (ret != MAP_A_MAGIC)
 		test_err_b++;
 
diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c
index a996b816ecc4..0d542ba64365 100644
--- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c
+++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c
@@ -1140,7 +1140,7 @@ __bpf_kfunc int bpf_kfunc_st_ops_inc10(struct st_ops_args *args)
 }
 
 __bpf_kfunc int bpf_kfunc_multi_st_ops_test_1(struct st_ops_args *args, u32 id);
-__bpf_kfunc int bpf_kfunc_multi_st_ops_test_1_impl(struct st_ops_args *args, void *aux_prog);
+__bpf_kfunc int bpf_kfunc_multi_st_ops_test_1_assoc(struct st_ops_args *args, struct bpf_prog_aux *aux);
 
 __bpf_kfunc int bpf_kfunc_implicit_arg(int a, struct bpf_prog_aux *aux);
 __bpf_kfunc int bpf_kfunc_implicit_arg_legacy(int a, int b, struct bpf_prog_aux *aux);
@@ -1187,7 +1187,7 @@ BTF_ID_FLAGS(func, bpf_kfunc_st_ops_test_epilogue, KF_SLEEPABLE)
 BTF_ID_FLAGS(func, bpf_kfunc_st_ops_test_pro_epilogue, KF_SLEEPABLE)
 BTF_ID_FLAGS(func, bpf_kfunc_st_ops_inc10)
 BTF_ID_FLAGS(func, bpf_kfunc_multi_st_ops_test_1)
-BTF_ID_FLAGS(func, bpf_kfunc_multi_st_ops_test_1_impl)
+BTF_ID_FLAGS(func, bpf_kfunc_multi_st_ops_test_1_assoc, KF_IMPLICIT_ARGS)
 BTF_ID_FLAGS(func, bpf_kfunc_implicit_arg, KF_IMPLICIT_ARGS)
 BTF_ID_FLAGS(func, bpf_kfunc_implicit_arg_legacy, KF_IMPLICIT_ARGS)
 BTF_ID_FLAGS(func, bpf_kfunc_implicit_arg_legacy_impl)
@@ -1669,13 +1669,12 @@ int bpf_kfunc_multi_st_ops_test_1(struct st_ops_args *args, u32 id)
 }
 
 /* Call test_1() of the associated struct_ops map */
-int bpf_kfunc_multi_st_ops_test_1_impl(struct st_ops_args *args, void *aux__prog)
+int bpf_kfunc_multi_st_ops_test_1_assoc(struct st_ops_args *args, struct bpf_prog_aux *aux)
 {
-	struct bpf_prog_aux *prog_aux = (struct bpf_prog_aux *)aux__prog;
 	struct bpf_testmod_multi_st_ops *st_ops;
 	int ret = -1;
 
-	st_ops = (struct bpf_testmod_multi_st_ops *)bpf_prog_get_assoc_struct_ops(prog_aux);
+	st_ops = (struct bpf_testmod_multi_st_ops *)bpf_prog_get_assoc_struct_ops(aux);
 	if (st_ops)
 		ret = st_ops->test_1(args);
 
diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h b/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h
index 2357a0340ffe..225ea30c4e3d 100644
--- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h
+++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h
@@ -161,7 +161,9 @@ void bpf_kfunc_rcu_task_test(struct task_struct *ptr) __ksym;
 struct task_struct *bpf_kfunc_ret_rcu_test(void) __ksym;
 int *bpf_kfunc_ret_rcu_test_nostruct(int rdonly_buf_size) __ksym;
 
-int bpf_kfunc_multi_st_ops_test_1(struct st_ops_args *args, u32 id) __ksym;
-int bpf_kfunc_multi_st_ops_test_1_impl(struct st_ops_args *args, void *aux__prog) __ksym;
+#ifndef __KERNEL__
+extern int bpf_kfunc_multi_st_ops_test_1(struct st_ops_args *args, u32 id) __weak __ksym;
+extern int bpf_kfunc_multi_st_ops_test_1_assoc(struct st_ops_args *args) __weak __ksym;
+#endif
 
 #endif /* _BPF_TESTMOD_KFUNC_H */
-- 
cgit v1.2.3


From 44fdd581d27366092e162b42f025d75d5a16c851 Mon Sep 17 00:00:00 2001
From: Yazhou Tang <tangyazhou518@outlook.com>
Date: Mon, 19 Jan 2026 16:54:57 +0800
Subject: bpf: Add range tracking for BPF_DIV and BPF_MOD

This patch implements range tracking (interval analysis) for BPF_DIV and
BPF_MOD operations when the divisor is a constant, covering both signed
and unsigned variants.

While LLVM typically optimizes integer division and modulo by constants
into multiplication and shift sequences, this optimization is less
effective for the BPF target when dealing with 64-bit arithmetic.

Currently, the verifier does not track bounds for scalar division or
modulo, treating the result as "unbounded". This leads to false positive
rejections for safe code patterns.

For example, the following code (compiled with -O2):

```c
int test(struct pt_regs *ctx) {
    char buffer[6] = {1};
    __u64 x = bpf_ktime_get_ns();
    __u64 res = x % sizeof(buffer);
    char value = buffer[res];
    bpf_printk("res = %llu, val = %d", res, value);
    return 0;
}
```

Generates a raw `BPF_MOD64` instruction:

```asm
;     __u64 res = x % sizeof(buffer);
       1:	97 00 00 00 06 00 00 00	r0 %= 0x6
;     char value = buffer[res];
       2:	18 01 00 00 00 00 00 00 00 00 00 00 00 00 00 00	r1 = 0x0 ll
       4:	0f 01 00 00 00 00 00 00	r1 += r0
       5:	91 14 00 00 00 00 00 00	r4 = *(s8 *)(r1 + 0x0)
```

Without this patch, the verifier fails with "math between map_value
pointer and register with unbounded min value is not allowed" because
it cannot deduce that `r0` is within [0, 5].

According to the BPF instruction set[1], the instruction's offset field
(`insn->off`) is used to distinguish between signed (`off == 1`) and
unsigned division (`off == 0`). Moreover, we also follow the BPF division
and modulo runtime behavior (semantics) to handle special cases, such as
division by zero and signed division overflow.

- UDIV: dst = (src != 0) ? (dst / src) : 0
- SDIV: dst = (src == 0) ? 0 : ((src == -1 && dst == LLONG_MIN) ? LLONG_MIN : (dst / src))
- UMOD: dst = (src != 0) ? (dst % src) : dst
- SMOD: dst = (src == 0) ? dst : ((src == -1 && dst == LLONG_MIN) ? 0: (dst s% src))

Here is the overview of the changes made in this patch (See the code comments
for more details and examples):

1. For BPF_DIV: Firstly check whether the divisor is zero. If so, set the
   destination register to zero (matching runtime behavior).

   For non-zero constant divisors: goto `scalar(32)?_min_max_(u|s)div` functions.
   - General cases: compute the new range by dividing max_dividend and
     min_dividend by the constant divisor.
   - Overflow case (SIGNED_MIN / -1) in signed division: mark the result
     as unbounded if the dividend is not a single number.

2. For BPF_MOD: Firstly check whether the divisor is zero. If so, leave the
   destination register unchanged (matching runtime behavior).

   For non-zero constant divisors: goto `scalar(32)?_min_max_(u|s)mod` functions.
   - General case: For signed modulo, the result's sign matches the
     dividend's sign. And the result's absolute value is strictly bounded
     by `min(abs(dividend), abs(divisor) - 1)`.
     - Special care is taken when the divisor is SIGNED_MIN. By casting
       to unsigned before negation and subtracting 1, we avoid signed
       overflow and correctly calculate the maximum possible magnitude
       (`res_max_abs` in the code).
   - "Small dividend" case: If the dividend is already within the possible
     result range (e.g., [-2, 5] % 10), the operation is an identity
     function, and the destination register remains unchanged.

3. In `scalar(32)?_min_max_(u|s)(div|mod)` functions: After updating current
   range, reset other ranges and tnum to unbounded/unknown.

   e.g., in `scalar_min_max_sdiv`, signed 64-bit range is updated. Then reset
   unsigned 64-bit range and 32-bit range to unbounded, and tnum to unknown.

   Exception: in BPF_MOD's "small dividend" case, since the result remains
   unchanged, we do not reset other ranges/tnum.

4. Also updated existing selftests based on the expected BPF_DIV and
   BPF_MOD behavior.

[1] https://www.kernel.org/doc/Documentation/bpf/standardization/instruction-set.rst

Co-developed-by: Shenghao Yuan <shenghaoyuan0928@163.com>
Signed-off-by: Shenghao Yuan <shenghaoyuan0928@163.com>
Co-developed-by: Tianci Cao <ziye@zju.edu.cn>
Signed-off-by: Tianci Cao <ziye@zju.edu.cn>
Signed-off-by: Yazhou Tang <tangyazhou518@outlook.com>
Tested-by: syzbot@syzkaller.appspotmail.com
Link: https://lore.kernel.org/r/20260119085458.182221-2-tangyazhou@zju.edu.cn
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c                              | 299 +++++++++++++++++++++
 .../bpf/progs/verifier_value_illegal_alu.c         |   7 +-
 tools/testing/selftests/bpf/verifier/precise.c     |   4 +-
 3 files changed, 305 insertions(+), 5 deletions(-)

(limited to 'tools')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 919556614505..f11dc5366e5b 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2349,6 +2349,18 @@ static void __mark_reg32_unbounded(struct bpf_reg_state *reg)
 	reg->u32_max_value = U32_MAX;
 }
 
+static void reset_reg64_and_tnum(struct bpf_reg_state *reg)
+{
+	__mark_reg64_unbounded(reg);
+	reg->var_off = tnum_unknown;
+}
+
+static void reset_reg32_and_tnum(struct bpf_reg_state *reg)
+{
+	__mark_reg32_unbounded(reg);
+	reg->var_off = tnum_unknown;
+}
+
 static void __update_reg32_bounds(struct bpf_reg_state *reg)
 {
 	struct tnum var32_off = tnum_subreg(reg->var_off);
@@ -15159,6 +15171,252 @@ static void scalar_min_max_mul(struct bpf_reg_state *dst_reg,
 	}
 }
 
+static void scalar32_min_max_udiv(struct bpf_reg_state *dst_reg,
+				  struct bpf_reg_state *src_reg)
+{
+	u32 *dst_umin = &dst_reg->u32_min_value;
+	u32 *dst_umax = &dst_reg->u32_max_value;
+	u32 src_val = src_reg->u32_min_value; /* non-zero, const divisor */
+
+	*dst_umin = *dst_umin / src_val;
+	*dst_umax = *dst_umax / src_val;
+
+	/* Reset other ranges/tnum to unbounded/unknown. */
+	dst_reg->s32_min_value = S32_MIN;
+	dst_reg->s32_max_value = S32_MAX;
+	reset_reg64_and_tnum(dst_reg);
+}
+
+static void scalar_min_max_udiv(struct bpf_reg_state *dst_reg,
+				struct bpf_reg_state *src_reg)
+{
+	u64 *dst_umin = &dst_reg->umin_value;
+	u64 *dst_umax = &dst_reg->umax_value;
+	u64 src_val = src_reg->umin_value; /* non-zero, const divisor */
+
+	*dst_umin = div64_u64(*dst_umin, src_val);
+	*dst_umax = div64_u64(*dst_umax, src_val);
+
+	/* Reset other ranges/tnum to unbounded/unknown. */
+	dst_reg->smin_value = S64_MIN;
+	dst_reg->smax_value = S64_MAX;
+	reset_reg32_and_tnum(dst_reg);
+}
+
+static void scalar32_min_max_sdiv(struct bpf_reg_state *dst_reg,
+				  struct bpf_reg_state *src_reg)
+{
+	s32 *dst_smin = &dst_reg->s32_min_value;
+	s32 *dst_smax = &dst_reg->s32_max_value;
+	s32 src_val = src_reg->s32_min_value; /* non-zero, const divisor */
+	s32 res1, res2;
+
+	/* BPF div specification: S32_MIN / -1 = S32_MIN */
+	if (*dst_smin == S32_MIN && src_val == -1) {
+		/*
+		 * If the dividend range contains more than just S32_MIN,
+		 * we cannot precisely track the result, so it becomes unbounded.
+		 * e.g., [S32_MIN, S32_MIN+10]/(-1),
+		 *     = {S32_MIN} U [-(S32_MIN+10), -(S32_MIN+1)]
+		 *     = {S32_MIN} U [S32_MAX-9, S32_MAX] = [S32_MIN, S32_MAX]
+		 * Otherwise (if dividend is exactly S32_MIN), result remains S32_MIN.
+		 */
+		if (*dst_smax != S32_MIN) {
+			*dst_smin = S32_MIN;
+			*dst_smax = S32_MAX;
+		}
+		goto reset;
+	}
+
+	res1 = *dst_smin / src_val;
+	res2 = *dst_smax / src_val;
+	*dst_smin = min(res1, res2);
+	*dst_smax = max(res1, res2);
+
+reset:
+	/* Reset other ranges/tnum to unbounded/unknown. */
+	dst_reg->u32_min_value = 0;
+	dst_reg->u32_max_value = U32_MAX;
+	reset_reg64_and_tnum(dst_reg);
+}
+
+static void scalar_min_max_sdiv(struct bpf_reg_state *dst_reg,
+				struct bpf_reg_state *src_reg)
+{
+	s64 *dst_smin = &dst_reg->smin_value;
+	s64 *dst_smax = &dst_reg->smax_value;
+	s64 src_val = src_reg->smin_value; /* non-zero, const divisor */
+	s64 res1, res2;
+
+	/* BPF div specification: S64_MIN / -1 = S64_MIN */
+	if (*dst_smin == S64_MIN && src_val == -1) {
+		/*
+		 * If the dividend range contains more than just S64_MIN,
+		 * we cannot precisely track the result, so it becomes unbounded.
+		 * e.g., [S64_MIN, S64_MIN+10]/(-1),
+		 *     = {S64_MIN} U [-(S64_MIN+10), -(S64_MIN+1)]
+		 *     = {S64_MIN} U [S64_MAX-9, S64_MAX] = [S64_MIN, S64_MAX]
+		 * Otherwise (if dividend is exactly S64_MIN), result remains S64_MIN.
+		 */
+		if (*dst_smax != S64_MIN) {
+			*dst_smin = S64_MIN;
+			*dst_smax = S64_MAX;
+		}
+		goto reset;
+	}
+
+	res1 = div64_s64(*dst_smin, src_val);
+	res2 = div64_s64(*dst_smax, src_val);
+	*dst_smin = min(res1, res2);
+	*dst_smax = max(res1, res2);
+
+reset:
+	/* Reset other ranges/tnum to unbounded/unknown. */
+	dst_reg->umin_value = 0;
+	dst_reg->umax_value = U64_MAX;
+	reset_reg32_and_tnum(dst_reg);
+}
+
+static void scalar32_min_max_umod(struct bpf_reg_state *dst_reg,
+				  struct bpf_reg_state *src_reg)
+{
+	u32 *dst_umin = &dst_reg->u32_min_value;
+	u32 *dst_umax = &dst_reg->u32_max_value;
+	u32 src_val = src_reg->u32_min_value; /* non-zero, const divisor */
+	u32 res_max = src_val - 1;
+
+	/*
+	 * If dst_umax <= res_max, the result remains unchanged.
+	 * e.g., [2, 5] % 10 = [2, 5].
+	 */
+	if (*dst_umax <= res_max)
+		return;
+
+	*dst_umin = 0;
+	*dst_umax = min(*dst_umax, res_max);
+
+	/* Reset other ranges/tnum to unbounded/unknown. */
+	dst_reg->s32_min_value = S32_MIN;
+	dst_reg->s32_max_value = S32_MAX;
+	reset_reg64_and_tnum(dst_reg);
+}
+
+static void scalar_min_max_umod(struct bpf_reg_state *dst_reg,
+				struct bpf_reg_state *src_reg)
+{
+	u64 *dst_umin = &dst_reg->umin_value;
+	u64 *dst_umax = &dst_reg->umax_value;
+	u64 src_val = src_reg->umin_value; /* non-zero, const divisor */
+	u64 res_max = src_val - 1;
+
+	/*
+	 * If dst_umax <= res_max, the result remains unchanged.
+	 * e.g., [2, 5] % 10 = [2, 5].
+	 */
+	if (*dst_umax <= res_max)
+		return;
+
+	*dst_umin = 0;
+	*dst_umax = min(*dst_umax, res_max);
+
+	/* Reset other ranges/tnum to unbounded/unknown. */
+	dst_reg->smin_value = S64_MIN;
+	dst_reg->smax_value = S64_MAX;
+	reset_reg32_and_tnum(dst_reg);
+}
+
+static void scalar32_min_max_smod(struct bpf_reg_state *dst_reg,
+				  struct bpf_reg_state *src_reg)
+{
+	s32 *dst_smin = &dst_reg->s32_min_value;
+	s32 *dst_smax = &dst_reg->s32_max_value;
+	s32 src_val = src_reg->s32_min_value; /* non-zero, const divisor */
+
+	/*
+	 * Safe absolute value calculation:
+	 * If src_val == S32_MIN (-2147483648), src_abs becomes 2147483648.
+	 * Here use unsigned integer to avoid overflow.
+	 */
+	u32 src_abs = (src_val > 0) ? (u32)src_val : -(u32)src_val;
+
+	/*
+	 * Calculate the maximum possible absolute value of the result.
+	 * Even if src_abs is 2147483648 (S32_MIN), subtracting 1 gives
+	 * 2147483647 (S32_MAX), which fits perfectly in s32.
+	 */
+	s32 res_max_abs = src_abs - 1;
+
+	/*
+	 * If the dividend is already within the result range,
+	 * the result remains unchanged. e.g., [-2, 5] % 10 = [-2, 5].
+	 */
+	if (*dst_smin >= -res_max_abs && *dst_smax <= res_max_abs)
+		return;
+
+	/* General case: result has the same sign as the dividend. */
+	if (*dst_smin >= 0) {
+		*dst_smin = 0;
+		*dst_smax = min(*dst_smax, res_max_abs);
+	} else if (*dst_smax <= 0) {
+		*dst_smax = 0;
+		*dst_smin = max(*dst_smin, -res_max_abs);
+	} else {
+		*dst_smin = -res_max_abs;
+		*dst_smax = res_max_abs;
+	}
+
+	/* Reset other ranges/tnum to unbounded/unknown. */
+	dst_reg->u32_min_value = 0;
+	dst_reg->u32_max_value = U32_MAX;
+	reset_reg64_and_tnum(dst_reg);
+}
+
+static void scalar_min_max_smod(struct bpf_reg_state *dst_reg,
+				struct bpf_reg_state *src_reg)
+{
+	s64 *dst_smin = &dst_reg->smin_value;
+	s64 *dst_smax = &dst_reg->smax_value;
+	s64 src_val = src_reg->smin_value; /* non-zero, const divisor */
+
+	/*
+	 * Safe absolute value calculation:
+	 * If src_val == S64_MIN (-2^63), src_abs becomes 2^63.
+	 * Here use unsigned integer to avoid overflow.
+	 */
+	u64 src_abs = (src_val > 0) ? (u64)src_val : -(u64)src_val;
+
+	/*
+	 * Calculate the maximum possible absolute value of the result.
+	 * Even if src_abs is 2^63 (S64_MIN), subtracting 1 gives
+	 * 2^63 - 1 (S64_MAX), which fits perfectly in s64.
+	 */
+	s64 res_max_abs = src_abs - 1;
+
+	/*
+	 * If the dividend is already within the result range,
+	 * the result remains unchanged. e.g., [-2, 5] % 10 = [-2, 5].
+	 */
+	if (*dst_smin >= -res_max_abs && *dst_smax <= res_max_abs)
+		return;
+
+	/* General case: result has the same sign as the dividend. */
+	if (*dst_smin >= 0) {
+		*dst_smin = 0;
+		*dst_smax = min(*dst_smax, res_max_abs);
+	} else if (*dst_smax <= 0) {
+		*dst_smax = 0;
+		*dst_smin = max(*dst_smin, -res_max_abs);
+	} else {
+		*dst_smin = -res_max_abs;
+		*dst_smax = res_max_abs;
+	}
+
+	/* Reset other ranges/tnum to unbounded/unknown. */
+	dst_reg->umin_value = 0;
+	dst_reg->umax_value = U64_MAX;
+	reset_reg32_and_tnum(dst_reg);
+}
+
 static void scalar32_min_max_and(struct bpf_reg_state *dst_reg,
 				 struct bpf_reg_state *src_reg)
 {
@@ -15564,6 +15822,14 @@ static bool is_safe_to_compute_dst_reg_range(struct bpf_insn *insn,
 	case BPF_MUL:
 		return true;
 
+	/*
+	 * Division and modulo operators range is only safe to compute when the
+	 * divisor is a constant.
+	 */
+	case BPF_DIV:
+	case BPF_MOD:
+		return src_is_const;
+
 	/* Shift operators range is only computable if shift dimension operand
 	 * is a constant. Shifts greater than 31 or 63 are undefined. This
 	 * includes shifts by a negative number.
@@ -15616,6 +15882,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
 				      struct bpf_reg_state src_reg)
 {
 	u8 opcode = BPF_OP(insn->code);
+	s16 off = insn->off;
 	bool alu32 = (BPF_CLASS(insn->code) != BPF_ALU64);
 	int ret;
 
@@ -15667,6 +15934,38 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
 		scalar32_min_max_mul(dst_reg, &src_reg);
 		scalar_min_max_mul(dst_reg, &src_reg);
 		break;
+	case BPF_DIV:
+		/* BPF div specification: x / 0 = 0 */
+		if ((alu32 && src_reg.u32_min_value == 0) || (!alu32 && src_reg.umin_value == 0)) {
+			___mark_reg_known(dst_reg, 0);
+			break;
+		}
+		if (alu32)
+			if (off == 1)
+				scalar32_min_max_sdiv(dst_reg, &src_reg);
+			else
+				scalar32_min_max_udiv(dst_reg, &src_reg);
+		else
+			if (off == 1)
+				scalar_min_max_sdiv(dst_reg, &src_reg);
+			else
+				scalar_min_max_udiv(dst_reg, &src_reg);
+		break;
+	case BPF_MOD:
+		/* BPF mod specification: x % 0 = x */
+		if ((alu32 && src_reg.u32_min_value == 0) || (!alu32 && src_reg.umin_value == 0))
+			break;
+		if (alu32)
+			if (off == 1)
+				scalar32_min_max_smod(dst_reg, &src_reg);
+			else
+				scalar32_min_max_umod(dst_reg, &src_reg);
+		else
+			if (off == 1)
+				scalar_min_max_smod(dst_reg, &src_reg);
+			else
+				scalar_min_max_umod(dst_reg, &src_reg);
+		break;
 	case BPF_AND:
 		if (tnum_is_const(src_reg.var_off)) {
 			ret = maybe_fork_scalars(env, insn, dst_reg);
diff --git a/tools/testing/selftests/bpf/progs/verifier_value_illegal_alu.c b/tools/testing/selftests/bpf/progs/verifier_value_illegal_alu.c
index 2129e4353fd9..4d8273c258d5 100644
--- a/tools/testing/selftests/bpf/progs/verifier_value_illegal_alu.c
+++ b/tools/testing/selftests/bpf/progs/verifier_value_illegal_alu.c
@@ -173,14 +173,15 @@ __naked void flow_keys_illegal_variable_offset_alu(void)
 	asm volatile("					\
 	r6 = r1;					\
 	r7 = *(u64*)(r6 + %[flow_keys_off]);		\
-	r8 = 8;						\
-	r8 /= 1;					\
+	call %[bpf_get_prandom_u32];			\
+	r8 = r0;					\
 	r8 &= 8;					\
 	r7 += r8;					\
 	r0 = *(u64*)(r7 + 0);				\
 	exit;						\
 "	:
-	: __imm_const(flow_keys_off, offsetof(struct __sk_buff, flow_keys))
+	: __imm_const(flow_keys_off, offsetof(struct __sk_buff, flow_keys)),
+	  __imm(bpf_get_prandom_u32)
 	: __clobber_all);
 }
 
diff --git a/tools/testing/selftests/bpf/verifier/precise.c b/tools/testing/selftests/bpf/verifier/precise.c
index 59a020c35647..061d98f6e9bb 100644
--- a/tools/testing/selftests/bpf/verifier/precise.c
+++ b/tools/testing/selftests/bpf/verifier/precise.c
@@ -229,11 +229,11 @@
 {
 	"precise: program doesn't prematurely prune branches",
 	.insns = {
-		BPF_ALU64_IMM(BPF_MOV, BPF_REG_6, 0x400),
+		BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_prandom_u32),
+		BPF_ALU64_REG(BPF_MOV, BPF_REG_6, BPF_REG_0),
 		BPF_ALU64_IMM(BPF_MOV, BPF_REG_7, 0),
 		BPF_ALU64_IMM(BPF_MOV, BPF_REG_8, 0),
 		BPF_ALU64_IMM(BPF_MOV, BPF_REG_9, 0x80000000),
-		BPF_ALU64_IMM(BPF_MOD, BPF_REG_6, 0x401),
 		BPF_JMP_IMM(BPF_JA, 0, 0, 0),
 		BPF_JMP_REG(BPF_JLE, BPF_REG_6, BPF_REG_9, 2),
 		BPF_ALU64_IMM(BPF_MOD, BPF_REG_6, 1),
-- 
cgit v1.2.3


From c9e440bf25a712d906c40ba3ef831f3f0ccc6a1b Mon Sep 17 00:00:00 2001
From: Yazhou Tang <tangyazhou518@outlook.com>
Date: Mon, 19 Jan 2026 16:54:58 +0800
Subject: selftests/bpf: Add tests for BPF_DIV and BPF_MOD range tracking

Now BPF_DIV has range tracking support via interval analysis. This patch
adds selftests to cover various cases of BPF_DIV and BPF_MOD operations
when the divisor is a constant, also covering both signed and unsigned variants.

This patch includes several types of tests in 32-bit and 64-bit variants:

1. For UDIV
   - positive divisor
   - zero divisor

2. For SDIV
   - positive divisor, positive dividend
   - positive divisor, negative dividend
   - positive divisor, mixed sign dividend
   - negative divisor, positive dividend
   - negative divisor, negative dividend
   - negative divisor, mixed sign dividend
   - zero divisor
   - overflow (SIGNED_MIN/-1), normal dividend
   - overflow (SIGNED_MIN/-1), constant dividend

3. For UMOD
   - positive divisor
   - positive divisor, small dividend
   - zero divisor

4. For SMOD
   - positive divisor, positive dividend
   - positive divisor, negative dividend
   - positive divisor, mixed sign dividend
   - positive divisor, mixed sign dividend, small dividend
   - negative divisor, positive dividend
   - negative divisor, negative dividend
   - negative divisor, mixed sign dividend
   - negative divisor, mixed sign dividend, small dividend
   - zero divisor
   - overflow (SIGNED_MIN/-1), normal dividend
   - overflow (SIGNED_MIN/-1), constant dividend

Specifically, these selftests are based on dead code elimination:
If the BPF verifier can precisely analyze the result of BPF_DIV/BPF_MOD
instruction, it can prune the path that leads to an error (here we use
invalid memory access as the error case), allowing the program to pass
verification.

Co-developed-by: Shenghao Yuan <shenghaoyuan0928@163.com>
Signed-off-by: Shenghao Yuan <shenghaoyuan0928@163.com>
Co-developed-by: Tianci Cao <ziye@zju.edu.cn>
Signed-off-by: Tianci Cao <ziye@zju.edu.cn>
Signed-off-by: Yazhou Tang <tangyazhou518@outlook.com>
Link: https://lore.kernel.org/r/20260119085458.182221-3-tangyazhou@zju.edu.cn
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/prog_tests/verifier.c  |    2 +
 .../selftests/bpf/progs/verifier_div_mod_bounds.c  | 1149 ++++++++++++++++++++
 2 files changed, 1151 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/progs/verifier_div_mod_bounds.c

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/verifier.c b/tools/testing/selftests/bpf/prog_tests/verifier.c
index 38c5ba70100c..fa9e506cc36f 100644
--- a/tools/testing/selftests/bpf/prog_tests/verifier.c
+++ b/tools/testing/selftests/bpf/prog_tests/verifier.c
@@ -33,6 +33,7 @@
 #include "verifier_direct_packet_access.skel.h"
 #include "verifier_direct_stack_access_wraparound.skel.h"
 #include "verifier_div0.skel.h"
+#include "verifier_div_mod_bounds.skel.h"
 #include "verifier_div_overflow.skel.h"
 #include "verifier_global_subprogs.skel.h"
 #include "verifier_global_ptr_args.skel.h"
@@ -175,6 +176,7 @@ void test_verifier_d_path(void)               { RUN(verifier_d_path); }
 void test_verifier_direct_packet_access(void) { RUN(verifier_direct_packet_access); }
 void test_verifier_direct_stack_access_wraparound(void) { RUN(verifier_direct_stack_access_wraparound); }
 void test_verifier_div0(void)                 { RUN(verifier_div0); }
+void test_verifier_div_mod_bounds(void)       { RUN(verifier_div_mod_bounds); }
 void test_verifier_div_overflow(void)         { RUN(verifier_div_overflow); }
 void test_verifier_global_subprogs(void)      { RUN(verifier_global_subprogs); }
 void test_verifier_global_ptr_args(void)      { RUN(verifier_global_ptr_args); }
diff --git a/tools/testing/selftests/bpf/progs/verifier_div_mod_bounds.c b/tools/testing/selftests/bpf/progs/verifier_div_mod_bounds.c
new file mode 100644
index 000000000000..4672af0b3268
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/verifier_div_mod_bounds.c
@@ -0,0 +1,1149 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/bpf.h>
+#include <limits.h>
+#include <bpf/bpf_helpers.h>
+#include "bpf_misc.h"
+
+/* This file contains unit tests for signed/unsigned division and modulo
+ * operations (with divisor as a constant), focusing on verifying whether
+ * BPF verifier's range tracking module soundly and precisely computes
+ * the results.
+ */
+
+SEC("socket")
+__description("UDIV32, positive divisor")
+__success __retval(0) __log_level(2)
+__msg("w1 /= 3 {{.*}}; R1=scalar(smin=smin32=0,smax=umax=smax32=umax32=3,var_off=(0x0; 0x3))")
+__naked void udiv32_pos_divisor(void)
+{
+	asm volatile ("					\
+	call %[bpf_get_prandom_u32];			\
+	w1 = w0;					\
+	w1 &= 8;					\
+	w1 |= 1;					\
+	w1 /= 3;					\
+	if w1 > 3 goto l0_%=;				\
+	r0 = 0;						\
+	exit;						\
+l0_%=:	r0 = *(u64 *)(r1 + 0);				\
+	exit;						\
+"	:
+	: __imm(bpf_get_prandom_u32)
+	: __clobber_all);
+}
+
+SEC("socket")
+__description("UDIV32, zero divisor")
+__success __retval(0) __log_level(2)
+__msg("w1 /= w2 {{.*}}; R1=0 R2=0")
+__naked void udiv32_zero_divisor(void)
+{
+	asm volatile ("					\
+	call %[bpf_get_prandom_u32];			\
+	w1 = w0;					\
+	w1 &= 8;					\
+	w1 |= 1;					\
+	w2 = 0;						\
+	w1 /= w2;					\
+	if w1 != 0 goto l0_%=;				\
+	r0 = 0;						\
+	exit;						\
+l0_%=:	r0 = *(u64 *)(r1 + 0);				\
+	exit;						\
+"	:
+	: __imm(bpf_get_prandom_u32)
+	: __clobber_all);
+}
+
+SEC("socket")
+__description("UDIV64, positive divisor")
+__success __retval(0) __log_level(2)
+__msg("r1 /= 3 {{.*}}; R1=scalar(smin=smin32=0,smax=umax=smax32=umax32=3,var_off=(0x0; 0x3))")
+__naked void udiv64_pos_divisor(void)
+{
+	asm volatile ("					\
+	call %[bpf_get_prandom_u32];			\
+	r1 = r0;					\
+	r1 &= 8;					\
+	r1 |= 1;					\
+	r1 /= 3;					\
+	if r1 > 3 goto l0_%=;				\
+	r0 = 0;						\
+	exit;						\
+l0_%=:	r0 = *(u64 *)(r1 + 0);				\
+	exit;						\
+"	:
+	: __imm(bpf_get_prandom_u32)
+	: __clobber_all);
+}
+
+SEC("socket")
+__description("UDIV64, zero divisor")
+__success __retval(0) __log_level(2)
+__msg("r1 /= r2 {{.*}}; R1=0 R2=0")
+__naked void udiv64_zero_divisor(void)
+{
+	asm volatile ("					\
+	call %[bpf_get_prandom_u32];			\
+	r1 = r0;					\
+	r1 &= 8;					\
+	r1 |= 1;					\
+	r2 = 0;						\
+	r1 /= r2;					\
+	if r1 != 0 goto l0_%=;				\
+	r0 = 0;						\
+	exit;						\
+l0_%=:	r0 = *(u64 *)(r1 + 0);				\
+	exit;						\
+"	:
+	: __imm(bpf_get_prandom_u32)
+	: __clobber_all);
+}
+
+SEC("socket")
+__description("SDIV32, positive divisor, positive dividend")
+__success __retval(0) __log_level(2)
+__msg("w1 s/= 3 {{.*}}; R1=scalar(smin=umin=smin32=umin32=2,smax=umax=smax32=umax32=3,var_off=(0x2; 0x1))")
+__naked void sdiv32_pos_divisor_1(void)
+{
+	asm volatile ("					\
+	call %[bpf_get_prandom_u32];			\
+	w1 = w0;					\
+	if w1 s< 8 goto l0_%=;				\
+	if w1 s> 10 goto l0_%=;				\
+	w1 s/= 3;					\
+	if w1 s< 2 goto l1_%=;				\
+	if w1 s> 3 goto l1_%=;				\
+l0_%=:	r0 = 0;						\
+	exit;						\
+l1_%=:	r0 = *(u64 *)(r1 + 0);				\
+	exit;						\
+"	:
+	: __imm(bpf_get_prandom_u32)
+	: __clobber_all);
+}
+
+SEC("socket")
+__description("SDIV32, positive divisor, negative dividend")
+__success __retval(0) __log_level(2)
+__msg("w1 s/= 3 {{.*}}; R1=scalar(smin=umin=umin32=0xfffffffd,smax=umax=umax32=0xfffffffe,smin32=-3,smax32=-2,var_off=(0xfffffffc; 0x3))")
+__naked void sdiv32_pos_divisor_2(void)
+{
+	asm volatile ("					\
+	call %[bpf_get_prandom_u32];			\
+	w1 = w0;					\
+	if w1 s> -8 goto l0_%=;				\
+	if w1 s< -10 goto l0_%=;			\
+	w1 s/= 3;					\
+	if w1 s< -3 goto l1_%=;				\
+	if w1 s> -2 goto l1_%=;				\
+l0_%=:	r0 = 0;						\
+	exit;						\
+l1_%=:	r0 = *(u64 *)(r1 + 0);				\
+	exit;						\
+"	:
+	: __imm(bpf_get_prandom_u32)
+	: __clobber_all);
+}
+
+SEC("socket")
+__description("SDIV32, positive divisor, mixed sign dividend")
+__success __retval(0) __log_level(2)
+__msg("w1 s/= 3 {{.*}}; R1=scalar(smin=0,smax=umax=0xffffffff,smin32=-2,smax32=3,var_off=(0x0; 0xffffffff))")
+__naked void sdiv32_pos_divisor_3(void)
+{
+	asm volatile ("					\
+	call %[bpf_get_prandom_u32];			\
+	w1 = w0;					\
+	if w1 s< -8 goto l0_%=;				\
+	if w1 s> 10 goto l0_%=;				\
+	w1 s/= 3;					\
+	if w1 s< -2 goto l1_%=;				\
+	if w1 s> 3 goto l1_%=;				\
+l0_%=:	r0 = 0;						\
+	exit;						\
+l1_%=:	r0 = *(u64 *)(r1 + 0);				\
+	exit;						\
+"	:
+	: __imm(bpf_get_prandom_u32)
+	: __clobber_all);
+}
+
+SEC("socket")
+__description("SDIV32, negative divisor, positive dividend")
+__success __retval(0) __log_level(2)
+__msg("w1 s/= -3 {{.*}}; R1=scalar(smin=umin=umin32=0xfffffffd,smax=umax=umax32=0xfffffffe,smin32=-3,smax32=-2,var_off=(0xfffffffc; 0x3))")
+__naked void sdiv32_neg_divisor_1(void)
+{
+	asm volatile ("					\
+	call %[bpf_get_prandom_u32];			\
+	w1 = w0;					\
+	if w1 s< 8 goto l0_%=;				\
+	if w1 s> 10 goto l0_%=;				\
+	w1 s/= -3;					\
+	if w1 s< -3 goto l1_%=;				\
+	if w1 s> -2 goto l1_%=;				\
+l0_%=:	r0 = 0;						\
+	exit;						\
+l1_%=:	r0 = *(u64 *)(r1 + 0);				\
+	exit;						\
+"	:
+	: __imm(bpf_get_prandom_u32)
+	: __clobber_all);
+}
+
+SEC("socket")
+__description("SDIV32, negative divisor, positive dividend")
+__success __retval(0) __log_level(2)
+__msg("w1 s/= -3 {{.*}}; R1=scalar(smin=umin=smin32=umin32=2,smax=umax=smax32=umax32=3,var_off=(0x2; 0x1))")
+__naked void sdiv32_neg_divisor_2(void)
+{
+	asm volatile ("					\
+	call %[bpf_get_prandom_u32];			\
+	w1 = w0;					\
+	if w1 s> -8 goto l0_%=;				\
+	if w1 s< -10 goto l0_%=;			\
+	w1 s/= -3;					\
+	if w1 s< 2 goto l1_%=;				\
+	if w1 s> 3 goto l1_%=;				\
+l0_%=:	r0 = 0;						\
+	exit;						\
+l1_%=:	r0 = *(u64 *)(r1 + 0);				\
+	exit;						\
+"	:
+	: __imm(bpf_get_prandom_u32)
+	: __clobber_all);
+}
+
+SEC("socket")
+__description("SDIV32, negative divisor, mixed sign dividend")
+__success __retval(0) __log_level(2)
+__msg("w1 s/= -3 {{.*}}; R1=scalar(smin=0,smax=umax=0xffffffff,smin32=-3,smax32=2,var_off=(0x0; 0xffffffff))")
+__naked void sdiv32_neg_divisor_3(void)
+{
+	asm volatile ("					\
+	call %[bpf_get_prandom_u32];			\
+	w1 = w0;					\
+	if w1 s< -8 goto l0_%=;				\
+	if w1 s> 10 goto l0_%=;				\
+	w1 s/= -3;					\
+	if w1 s< -3 goto l1_%=;				\
+	if w1 s> 2 goto l1_%=;				\
+l0_%=:	r0 = 0;						\
+	exit;						\
+l1_%=:	r0 = *(u64 *)(r1 + 0);				\
+	exit;						\
+"	:
+	: __imm(bpf_get_prandom_u32)
+	: __clobber_all);
+}
+
+SEC("socket")
+__description("SDIV32, zero divisor")
+__success __retval(0) __log_level(2)
+__msg("w1 s/= w2 {{.*}}; R1=0 R2=0")
+__naked void sdiv32_zero_divisor(void)
+{
+	asm volatile ("					\
+	call %[bpf_get_prandom_u32];			\
+	w1 = w0;					\
+	w1 &= 8;					\
+	w1 |= 1;					\
+	w2 = 0;						\
+	w1 s/= w2;					\
+	if w1 != 0 goto l0_%=;				\
+	r0 = 0;						\
+	exit;						\
+l0_%=:	r0 = *(u64 *)(r1 + 0);				\
+	exit;						\
+"	:
+	: __imm(bpf_get_prandom_u32)
+	: __clobber_all);
+}
+
+SEC("socket")
+__description("SDIV32, overflow (S32_MIN/-1)")
+__success __retval(0) __log_level(2)
+__msg("w1 s/= -1 {{.*}}; R1=scalar(smin=0,smax=umax=0xffffffff,var_off=(0x0; 0xffffffff))")
+__naked void sdiv32_overflow_1(void)
+{
+	asm volatile ("					\
+	call %[bpf_get_prandom_u32];			\
+	w1 = w0;					\
+	w2 = %[int_min];				\
+	w2 += 10;					\
+	if w1 s> w2 goto l0_%=;				\
+	w1 s/= -1;					\
+l0_%=:	r0 = 0;						\
+	exit;						\
+"	:
+	: __imm_const(int_min, INT_MIN),
+	  __imm(bpf_get_prandom_u32)
+	: __clobber_all);
+}
+
+SEC("socket")
+__description("SDIV32, overflow (S32_MIN/-1), constant dividend")
+__success __retval(0) __log_level(2)
+__msg("w1 s/= -1 {{.*}}; R1=0x80000000")
+__naked void sdiv32_overflow_2(void)
+{
+	asm volatile ("					\
+	w1 = %[int_min];				\
+	w1 s/= -1;					\
+	if w1 != %[int_min] goto l0_%=;			\
+	r0 = 0;						\
+	exit;						\
+l0_%=:	r0 = *(u64 *)(r1 + 0);				\
+	exit;						\
+"	:
+	: __imm_const(int_min, INT_MIN)
+	: __clobber_all);
+}
+
+SEC("socket")
+__description("SDIV64, positive divisor, positive dividend")
+__success __retval(0) __log_level(2)
+__msg("r1 s/= 3 {{.*}}; R1=scalar(smin=umin=smin32=umin32=2,smax=umax=smax32=umax32=3,var_off=(0x2; 0x1))")
+__naked void sdiv64_pos_divisor_1(void)
+{
+	asm volatile ("					\
+	call %[bpf_get_prandom_u32];			\
+	r1 = r0;					\
+	if r1 s< 8 goto l0_%=;				\
+	if r1 s> 10 goto l0_%=;				\
+	r1 s/= 3;					\
+	if r1 s< 2 goto l1_%=;				\
+	if r1 s> 3 goto l1_%=;				\
+l0_%=:	r0 = 0;						\
+	exit;						\
+l1_%=:	r0 = *(u64 *)(r1 + 0);				\
+	exit;						\
+"	:
+	: __imm(bpf_get_prandom_u32)
+	: __clobber_all);
+}
+
+SEC("socket")
+__description("SDIV64, positive divisor, negative dividend")
+__success __retval(0) __log_level(2)
+__msg("r1 s/= 3 {{.*}}; R1=scalar(smin=smin32=-3,smax=smax32=-2,umin=0xfffffffffffffffd,umax=0xfffffffffffffffe,umin32=0xfffffffd,umax32=0xfffffffe,var_off=(0xfffffffffffffffc; 0x3))")
+__naked void sdiv64_pos_divisor_2(void)
+{
+	asm volatile ("					\
+	call %[bpf_get_prandom_u32];			\
+	r1 = r0;					\
+	if r1 s> -8 goto l0_%=;				\
+	if r1 s< -10 goto l0_%=;			\
+	r1 s/= 3;					\
+	if r1 s< -3 goto l1_%=;				\
+	if r1 s> -2 goto l1_%=;				\
+l0_%=:	r0 = 0;						\
+	exit;						\
+l1_%=:	r0 = *(u64 *)(r1 + 0);				\
+	exit;						\
+"	:
+	: __imm(bpf_get_prandom_u32)
+	: __clobber_all);
+}
+
+SEC("socket")
+__description("SDIV64, positive divisor, mixed sign dividend")
+__success __retval(0) __log_level(2)
+__msg("r1 s/= 3 {{.*}}; R1=scalar(smin=smin32=-2,smax=smax32=3)")
+__naked void sdiv64_pos_divisor_3(void)
+{
+	asm volatile ("					\
+	call %[bpf_get_prandom_u32];			\
+	r1 = r0;					\
+	if r1 s< -8 goto l0_%=;				\
+	if r1 s> 10 goto l0_%=;				\
+	r1 s/= 3;					\
+	if r1 s< -2 goto l1_%=;				\
+	if r1 s> 3 goto l1_%=;				\
+l0_%=:	r0 = 0;						\
+	exit;						\
+l1_%=:	r0 = *(u64 *)(r1 + 0);				\
+	exit;						\
+"	:
+	: __imm(bpf_get_prandom_u32)
+	: __clobber_all);
+}
+
+SEC("socket")
+__description("SDIV64, negative divisor, positive dividend")
+__success __retval(0) __log_level(2)
+__msg("r1 s/= -3 {{.*}}; R1=scalar(smin=smin32=-3,smax=smax32=-2,umin=0xfffffffffffffffd,umax=0xfffffffffffffffe,umin32=0xfffffffd,umax32=0xfffffffe,var_off=(0xfffffffffffffffc; 0x3))")
+__naked void sdiv64_neg_divisor_1(void)
+{
+	asm volatile ("					\
+	call %[bpf_get_prandom_u32];			\
+	r1 = r0;					\
+	if r1 s< 8 goto l0_%=;				\
+	if r1 s> 10 goto l0_%=;				\
+	r1 s/= -3;					\
+	if r1 s< -3 goto l1_%=;				\
+	if r1 s> -2 goto l1_%=;				\
+l0_%=:	r0 = 0;						\
+	exit;						\
+l1_%=:	r0 = *(u64 *)(r1 + 0);				\
+	exit;						\
+"	:
+	: __imm(bpf_get_prandom_u32)
+	: __clobber_all);
+}
+
+SEC("socket")
+__description("SDIV64, negative divisor, positive dividend")
+__success __retval(0) __log_level(2)
+__msg("r1 s/= -3 {{.*}}; R1=scalar(smin=umin=smin32=umin32=2,smax=umax=smax32=umax32=3,var_off=(0x2; 0x1))")
+__naked void sdiv64_neg_divisor_2(void)
+{
+	asm volatile ("					\
+	call %[bpf_get_prandom_u32];			\
+	r1 = r0;					\
+	if r1 s> -8 goto l0_%=;				\
+	if r1 s< -10 goto l0_%=;			\
+	r1 s/= -3;					\
+	if r1 s< 2 goto l1_%=;				\
+	if r1 s> 3 goto l1_%=;				\
+l0_%=:	r0 = 0;						\
+	exit;						\
+l1_%=:	r0 = *(u64 *)(r1 + 0);				\
+	exit;						\
+"	:
+	: __imm(bpf_get_prandom_u32)
+	: __clobber_all);
+}
+
+SEC("socket")
+__description("SDIV64, negative divisor, mixed sign dividend")
+__success __retval(0) __log_level(2)
+__msg("r1 s/= -3 {{.*}}; R1=scalar(smin=smin32=-3,smax=smax32=2)")
+__naked void sdiv64_neg_divisor_3(void)
+{
+	asm volatile ("					\
+	call %[bpf_get_prandom_u32];			\
+	r1 = r0;					\
+	if r1 s< -8 goto l0_%=;				\
+	if r1 s> 10 goto l0_%=;				\
+	r1 s/= -3;					\
+	if r1 s< -3 goto l1_%=;				\
+	if r1 s> 2 goto l1_%=;				\
+l0_%=:	r0 = 0;						\
+	exit;						\
+l1_%=:	r0 = *(u64 *)(r1 + 0);				\
+	exit;						\
+"	:
+	: __imm(bpf_get_prandom_u32)
+	: __clobber_all);
+}
+
+SEC("socket")
+__description("SDIV64, zero divisor")
+__success __retval(0) __log_level(2)
+__msg("r1 s/= r2 {{.*}}; R1=0 R2=0")
+__naked void sdiv64_zero_divisor(void)
+{
+	asm volatile ("					\
+	call %[bpf_get_prandom_u32];			\
+	r1 = r0;					\
+	r1 &= 8;					\
+	r1 |= 1;					\
+	r2 = 0;						\
+	r1 s/= r2;					\
+	if r1 != 0 goto l0_%=;				\
+	r0 = 0;						\
+	exit;						\
+l0_%=:	r0 = *(u64 *)(r1 + 0);				\
+	exit;						\
+"	:
+	: __imm(bpf_get_prandom_u32)
+	: __clobber_all);
+}
+
+SEC("socket")
+__description("SDIV64, overflow (S64_MIN/-1)")
+__success __retval(0) __log_level(2)
+__msg("r1 s/= -1 {{.*}}; R1=scalar()")
+__naked void sdiv64_overflow_1(void)
+{
+	asm volatile ("					\
+	call %[bpf_ktime_get_ns];			\
+	r1 = r0;					\
+	r2 = %[llong_min] ll;				\
+	r2 += 10;					\
+	if r1 s> r2 goto l0_%=;				\
+	r1 s/= -1;					\
+l0_%=:	r0 = 0;						\
+	exit;						\
+"	:
+	: __imm_const(llong_min, LLONG_MIN),
+	  __imm(bpf_ktime_get_ns)
+	: __clobber_all);
+}
+
+SEC("socket")
+__description("SDIV64, overflow (S64_MIN/-1), constant dividend")
+__success __retval(0) __log_level(2)
+__msg("r1 s/= -1 {{.*}}; R1=0x8000000000000000")
+__naked void sdiv64_overflow_2(void)
+{
+	asm volatile ("					\
+	r1 = %[llong_min] ll;				\
+	r1 s/= -1;					\
+	r2 = %[llong_min] ll;				\
+	if r1 != r2 goto l0_%=;				\
+	r0 = 0;						\
+	exit;						\
+l0_%=:	r0 = *(u64 *)(r1 + 0);				\
+	exit;						\
+"	:
+	: __imm_const(llong_min, LLONG_MIN)
+	: __clobber_all);
+}
+
+SEC("socket")
+__description("UMOD32, positive divisor")
+__success __retval(0) __log_level(2)
+__msg("w1 %= 3 {{.*}}; R1=scalar(smin=smin32=0,smax=umax=smax32=umax32=2,var_off=(0x0; 0x3))")
+__naked void umod32_pos_divisor(void)
+{
+	asm volatile ("					\
+	call %[bpf_get_prandom_u32];			\
+	w1 = w0;					\
+	w1 &= 8;					\
+	w1 |= 1;					\
+	w1 %%= 3;					\
+	if w1 > 3 goto l0_%=;				\
+	r0 = 0;						\
+	exit;						\
+l0_%=:	r0 = *(u64 *)(r1 + 0);				\
+	exit;						\
+"	:
+	: __imm(bpf_get_prandom_u32)
+	: __clobber_all);
+}
+
+SEC("socket")
+__description("UMOD32, positive divisor, small dividend")
+__success __retval(0) __log_level(2)
+__msg("w1 %= 10 {{.*}}; R1=scalar(smin=umin=smin32=umin32=1,smax=umax=smax32=umax32=9,var_off=(0x1; 0x8))")
+__naked void umod32_pos_divisor_unchanged(void)
+{
+	asm volatile ("					\
+	call %[bpf_get_prandom_u32];			\
+	w1 = w0;					\
+	w1 &= 8;					\
+	w1 |= 1;					\
+	w1 %%= 10;					\
+	if w1 < 1 goto l0_%=;				\
+	if w1 > 9 goto l0_%=;				\
+	if w1 & 1 != 1 goto l0_%=;			\
+	r0 = 0;						\
+	exit;						\
+l0_%=:	r0 = *(u64 *)(r1 + 0);				\
+	exit;						\
+"	:
+	: __imm(bpf_get_prandom_u32)
+	: __clobber_all);
+}
+
+SEC("socket")
+__description("UMOD32, zero divisor")
+__success __retval(0) __log_level(2)
+__msg("w1 %= w2 {{.*}}; R1=scalar(smin=umin=smin32=umin32=1,smax=umax=smax32=umax32=9,var_off=(0x1; 0x8)) R2=0")
+__naked void umod32_zero_divisor(void)
+{
+	asm volatile ("					\
+	call %[bpf_get_prandom_u32];			\
+	w1 = w0;					\
+	w1 &= 8;					\
+	w1 |= 1;					\
+	w2 = 0;						\
+	w1 %%= w2;					\
+	if w1 < 1 goto l0_%=;				\
+	if w1 > 9 goto l0_%=;				\
+	if w1 & 1 != 1 goto l0_%=;			\
+	r0 = 0;						\
+	exit;						\
+l0_%=:	r0 = *(u64 *)(r1 + 0);				\
+	exit;						\
+"	:
+	: __imm(bpf_get_prandom_u32)
+	: __clobber_all);
+}
+
+SEC("socket")
+__description("UMOD64, positive divisor")
+__success __retval(0) __log_level(2)
+__msg("r1 %= 3 {{.*}}; R1=scalar(smin=smin32=0,smax=umax=smax32=umax32=2,var_off=(0x0; 0x3))")
+__naked void umod64_pos_divisor(void)
+{
+	asm volatile ("					\
+	call %[bpf_get_prandom_u32];			\
+	r1 = r0;					\
+	r1 &= 8;					\
+	r1 |= 1;					\
+	r1 %%= 3;					\
+	if r1 > 3 goto l0_%=;				\
+	r0 = 0;						\
+	exit;						\
+l0_%=:	r0 = *(u64 *)(r1 + 0);				\
+	exit;						\
+"	:
+	: __imm(bpf_get_prandom_u32)
+	: __clobber_all);
+}
+
+SEC("socket")
+__description("UMOD64, positive divisor, small dividend")
+__success __retval(0) __log_level(2)
+__msg("r1 %= 10 {{.*}}; R1=scalar(smin=umin=smin32=umin32=1,smax=umax=smax32=umax32=9,var_off=(0x1; 0x8))")
+__naked void umod64_pos_divisor_unchanged(void)
+{
+	asm volatile ("					\
+	call %[bpf_get_prandom_u32];			\
+	r1 = r0;					\
+	r1 &= 8;					\
+	r1 |= 1;					\
+	r1 %%= 10;					\
+	if r1 < 1 goto l0_%=;				\
+	if r1 > 9 goto l0_%=;				\
+	if r1 & 1 != 1 goto l0_%=;			\
+	r0 = 0;						\
+	exit;						\
+l0_%=:	r0 = *(u64 *)(r1 + 0);				\
+	exit;						\
+"	:
+	: __imm(bpf_get_prandom_u32)
+	: __clobber_all);
+}
+
+SEC("socket")
+__description("UMOD64, zero divisor")
+__success __retval(0) __log_level(2)
+__msg("r1 %= r2 {{.*}}; R1=scalar(smin=umin=smin32=umin32=1,smax=umax=smax32=umax32=9,var_off=(0x1; 0x8)) R2=0")
+__naked void umod64_zero_divisor(void)
+{
+	asm volatile ("					\
+	call %[bpf_get_prandom_u32];			\
+	r1 = r0;					\
+	r1 &= 8;					\
+	r1 |= 1;					\
+	r2 = 0;						\
+	r1 %%= r2;					\
+	if r1 < 1 goto l0_%=;				\
+	if r1 > 9 goto l0_%=;				\
+	if r1 & 1 != 1 goto l0_%=;			\
+	r0 = 0;						\
+	exit;						\
+l0_%=:	r0 = *(u64 *)(r1 + 0);				\
+	exit;						\
+"	:
+	: __imm(bpf_get_prandom_u32)
+	: __clobber_all);
+}
+
+SEC("socket")
+__description("SMOD32, positive divisor, positive dividend")
+__success __retval(0) __log_level(2)
+__msg("w1 s%= 3 {{.*}}; R1=scalar(smin=smin32=0,smax=umax=smax32=umax32=2,var_off=(0x0; 0x3))")
+__naked void smod32_pos_divisor_1(void)
+{
+	asm volatile ("					\
+	call %[bpf_get_prandom_u32];			\
+	w1 = w0;					\
+	if w1 s< 8 goto l0_%=;				\
+	if w1 s> 10 goto l0_%=;				\
+	w1 s%%= 3;					\
+	if w1 s< 0 goto l1_%=;				\
+	if w1 s> 2 goto l1_%=;				\
+l0_%=:	r0 = 0;						\
+	exit;						\
+l1_%=:	r0 = *(u64 *)(r1 + 0);				\
+	exit;						\
+"	:
+	: __imm(bpf_get_prandom_u32)
+	: __clobber_all);
+}
+
+SEC("socket")
+__description("SMOD32, positive divisor, negative dividend")
+__success __retval(0) __log_level(2)
+__msg("w1 s%= 3 {{.*}}; R1=scalar(smin=0,smax=umax=0xffffffff,smin32=-2,smax32=0,var_off=(0x0; 0xffffffff))")
+__naked void smod32_pos_divisor_2(void)
+{
+	asm volatile ("					\
+	call %[bpf_get_prandom_u32];			\
+	w1 = w0;					\
+	if w1 s> -8 goto l0_%=;				\
+	if w1 s< -10 goto l0_%=;			\
+	w1 s%%= 3;					\
+	if w1 s< -2 goto l1_%=;				\
+	if w1 s> 0 goto l1_%=;				\
+l0_%=:	r0 = 0;						\
+	exit;						\
+l1_%=:	r0 = *(u64 *)(r1 + 0);				\
+	exit;						\
+"	:
+	: __imm(bpf_get_prandom_u32)
+	: __clobber_all);
+}
+
+SEC("socket")
+__description("SMOD32, positive divisor, mixed sign dividend")
+__success __retval(0) __log_level(2)
+__msg("w1 s%= 3 {{.*}}; R1=scalar(smin=0,smax=umax=0xffffffff,smin32=-2,smax32=2,var_off=(0x0; 0xffffffff))")
+__naked void smod32_pos_divisor_3(void)
+{
+	asm volatile ("					\
+	call %[bpf_get_prandom_u32];			\
+	w1 = w0;					\
+	if w1 s< -8 goto l0_%=;				\
+	if w1 s> 10 goto l0_%=;				\
+	w1 s%%= 3;					\
+	if w1 s< -2 goto l1_%=;				\
+	if w1 s> 2 goto l1_%=;				\
+l0_%=:	r0 = 0;						\
+	exit;						\
+l1_%=:	r0 = *(u64 *)(r1 + 0);				\
+	exit;						\
+"	:
+	: __imm(bpf_get_prandom_u32)
+	: __clobber_all);
+}
+
+SEC("socket")
+__description("SMOD32, positive divisor, small dividend")
+__success __retval(0) __log_level(2)
+__msg("w1 s%= 11 {{.*}}; R1=scalar(smin=0,smax=umax=0xffffffff,smin32=-8,smax32=10,var_off=(0x0; 0xffffffff))")
+__naked void smod32_pos_divisor_unchanged(void)
+{
+	asm volatile ("					\
+	call %[bpf_get_prandom_u32];			\
+	w1 = w0;					\
+	if w1 s< -8 goto l0_%=;				\
+	if w1 s> 10 goto l0_%=;				\
+	w1 s%%= 11;					\
+	if w1 s< -8 goto l1_%=;				\
+	if w1 s> 10 goto l1_%=;				\
+l0_%=:	r0 = 0;						\
+	exit;						\
+l1_%=:	r0 = *(u64 *)(r1 + 0);				\
+	exit;						\
+"	:
+	: __imm(bpf_get_prandom_u32)
+	: __clobber_all);
+}
+
+SEC("socket")
+__description("SMOD32, negative divisor, positive dividend")
+__success __retval(0) __log_level(2)
+__msg("w1 s%= -3 {{.*}}; R1=scalar(smin=smin32=0,smax=umax=smax32=umax32=2,var_off=(0x0; 0x3))")
+__naked void smod32_neg_divisor_1(void)
+{
+	asm volatile ("					\
+	call %[bpf_get_prandom_u32];			\
+	w1 = w0;					\
+	if w1 s< 8 goto l0_%=;				\
+	if w1 s> 10 goto l0_%=;				\
+	w1 s%%= -3;					\
+	if w1 s< 0 goto l1_%=;				\
+	if w1 s> 2 goto l1_%=;				\
+l0_%=:	r0 = 0;						\
+	exit;						\
+l1_%=:	r0 = *(u64 *)(r1 + 0);				\
+	exit;						\
+"	:
+	: __imm(bpf_get_prandom_u32)
+	: __clobber_all);
+}
+
+SEC("socket")
+__description("SMOD32, negative divisor, negative dividend")
+__success __retval(0) __log_level(2)
+__msg("w1 s%= -3 {{.*}}; R1=scalar(smin=0,smax=umax=0xffffffff,smin32=-2,smax32=0,var_off=(0x0; 0xffffffff))")
+__naked void smod32_neg_divisor_2(void)
+{
+	asm volatile ("					\
+	call %[bpf_get_prandom_u32];			\
+	w1 = w0;					\
+	if w1 s> -8 goto l0_%=;				\
+	if w1 s< -10 goto l0_%=;			\
+	w1 s%%= -3;					\
+	if w1 s< -2 goto l1_%=;				\
+	if w1 s> 0 goto l1_%=;				\
+l0_%=:	r0 = 0;						\
+	exit;						\
+l1_%=:	r0 = *(u64 *)(r1 + 0);				\
+	exit;						\
+"	:
+	: __imm(bpf_get_prandom_u32)
+	: __clobber_all);
+}
+
+SEC("socket")
+__description("SMOD32, negative divisor, mixed sign dividend")
+__success __retval(0) __log_level(2)
+__msg("w1 s%= -3 {{.*}}; R1=scalar(smin=0,smax=umax=0xffffffff,smin32=-2,smax32=2,var_off=(0x0; 0xffffffff))")
+__naked void smod32_neg_divisor_3(void)
+{
+	asm volatile ("					\
+	call %[bpf_get_prandom_u32];			\
+	w1 = w0;					\
+	if w1 s< -8 goto l0_%=;				\
+	if w1 s> 10 goto l0_%=;				\
+	w1 s%%= -3;					\
+	if w1 s< -2 goto l1_%=;				\
+	if w1 s> 2 goto l1_%=;				\
+l0_%=:	r0 = 0;						\
+	exit;						\
+l1_%=:	r0 = *(u64 *)(r1 + 0);				\
+	exit;						\
+"	:
+	: __imm(bpf_get_prandom_u32)
+	: __clobber_all);
+}
+
+SEC("socket")
+__description("SMOD32, negative divisor, small dividend")
+__success __retval(0) __log_level(2)
+__msg("w1 s%= -11 {{.*}}; R1=scalar(smin=0,smax=umax=0xffffffff,smin32=-8,smax32=10,var_off=(0x0; 0xffffffff))")
+__naked void smod32_neg_divisor_unchanged(void)
+{
+	asm volatile ("					\
+	call %[bpf_get_prandom_u32];			\
+	w1 = w0;					\
+	if w1 s< -8 goto l0_%=;				\
+	if w1 s> 10 goto l0_%=;				\
+	w1 s%%= -11;					\
+	if w1 s< -8 goto l1_%=;				\
+	if w1 s> 10 goto l1_%=;				\
+l0_%=:	r0 = 0;						\
+	exit;						\
+l1_%=:	r0 = *(u64 *)(r1 + 0);				\
+	exit;						\
+"	:
+	: __imm(bpf_get_prandom_u32)
+	: __clobber_all);
+}
+
+SEC("socket")
+__description("SMOD32, zero divisor")
+__success __retval(0) __log_level(2)
+__msg("w1 s%= w2 {{.*}}; R1=scalar(smin=0,smax=umax=0xffffffff,smin32=-8,smax32=10,var_off=(0x0; 0xffffffff)) R2=0")
+__naked void smod32_zero_divisor(void)
+{
+	asm volatile ("					\
+	call %[bpf_get_prandom_u32];			\
+	w1 = w0;					\
+	if w1 s< -8 goto l0_%=;				\
+	if w1 s> 10 goto l0_%=;				\
+	w2 = 0;						\
+	w1 s%%= w2;					\
+	if w1 s< -8 goto l1_%=;				\
+	if w1 s> 10 goto l1_%=;				\
+l0_%=:	r0 = 0;						\
+	exit;						\
+l1_%=:	r0 = *(u64 *)(r1 + 0);				\
+	exit;						\
+"	:
+	: __imm(bpf_get_prandom_u32)
+	: __clobber_all);
+}
+
+SEC("socket")
+__description("SMOD32, overflow (S32_MIN%-1)")
+__success __retval(0) __log_level(2)
+__msg("w1 s%= -1 {{.*}}; R1=0")
+__naked void smod32_overflow_1(void)
+{
+	asm volatile ("					\
+	call %[bpf_get_prandom_u32];			\
+	w1 = w0;					\
+	w2 = %[int_min];				\
+	w2 += 10;					\
+	if w1 s> w2 goto l0_%=;				\
+	w1 s%%= -1;					\
+	if w1 != 0 goto l1_%=;				\
+l0_%=:	r0 = 0;						\
+	exit;						\
+l1_%=:	r0 = *(u64 *)(r1 + 0);				\
+	exit;						\
+"	:
+	: __imm_const(int_min, INT_MIN),
+	  __imm(bpf_get_prandom_u32)
+	: __clobber_all);
+}
+
+SEC("socket")
+__description("SMOD32, overflow (S32_MIN%-1), constant dividend")
+__success __retval(0) __log_level(2)
+__msg("w1 s%= -1 {{.*}}; R1=0")
+__naked void smod32_overflow_2(void)
+{
+	asm volatile ("					\
+	w1 = %[int_min];				\
+	w1 s%%= -1;					\
+	if w1 != 0 goto l0_%=;				\
+	r0 = 0;						\
+	exit;						\
+l0_%=:	r0 = *(u64 *)(r1 + 0);				\
+	exit;						\
+"	:
+	: __imm_const(int_min, INT_MIN)
+	: __clobber_all);
+}
+
+SEC("socket")
+__description("SMOD64, positive divisor, positive dividend")
+__success __retval(0) __log_level(2)
+__msg("r1 s%= 3 {{.*}}; R1=scalar(smin=smin32=0,smax=umax=smax32=umax32=2,var_off=(0x0; 0x3))")
+__naked void smod64_pos_divisor_1(void)
+{
+	asm volatile ("					\
+	call %[bpf_get_prandom_u32];			\
+	r1 = r0;					\
+	if r1 s< 8 goto l0_%=;				\
+	if r1 s> 10 goto l0_%=;				\
+	r1 s%%= 3;					\
+	if r1 s< 0 goto l1_%=;				\
+	if r1 s> 2 goto l1_%=;				\
+l0_%=:	r0 = 0;						\
+	exit;						\
+l1_%=:	r0 = *(u64 *)(r1 + 0);				\
+	exit;						\
+"	:
+	: __imm(bpf_get_prandom_u32)
+	: __clobber_all);
+}
+
+SEC("socket")
+__description("SMOD64, positive divisor, negative dividend")
+__success __retval(0) __log_level(2)
+__msg("r1 s%= 3 {{.*}}; R1=scalar(smin=smin32=-2,smax=smax32=0)")
+__naked void smod64_pos_divisor_2(void)
+{
+	asm volatile ("					\
+	call %[bpf_get_prandom_u32];			\
+	r1 = r0;					\
+	if r1 s> -8 goto l0_%=;				\
+	if r1 s< -10 goto l0_%=;			\
+	r1 s%%= 3;					\
+	if r1 s< -2 goto l1_%=;				\
+	if r1 s> 0 goto l1_%=;				\
+l0_%=:	r0 = 0;						\
+	exit;						\
+l1_%=:	r0 = *(u64 *)(r1 + 0);				\
+	exit;						\
+"	:
+	: __imm(bpf_get_prandom_u32)
+	: __clobber_all);
+}
+
+SEC("socket")
+__description("SMOD64, positive divisor, mixed sign dividend")
+__success __retval(0) __log_level(2)
+__msg("r1 s%= 3 {{.*}}; R1=scalar(smin=smin32=-2,smax=smax32=2)")
+__naked void smod64_pos_divisor_3(void)
+{
+	asm volatile ("					\
+	call %[bpf_get_prandom_u32];			\
+	r1 = r0;					\
+	if r1 s< -8 goto l0_%=;				\
+	if r1 s> 10 goto l0_%=;				\
+	r1 s%%= 3;					\
+	if r1 s< -2 goto l1_%=;				\
+	if r1 s> 2 goto l1_%=;				\
+l0_%=:	r0 = 0;						\
+	exit;						\
+l1_%=:	r0 = *(u64 *)(r1 + 0);				\
+	exit;						\
+"	:
+	: __imm(bpf_get_prandom_u32)
+	: __clobber_all);
+}
+
+SEC("socket")
+__description("SMOD64, positive divisor, small dividend")
+__success __retval(0) __log_level(2)
+__msg("r1 s%= 11 {{.*}}; R1=scalar(smin=smin32=-8,smax=smax32=10)")
+__naked void smod64_pos_divisor_unchanged(void)
+{
+	asm volatile ("					\
+	call %[bpf_get_prandom_u32];			\
+	r1 = r0;					\
+	if r1 s< -8 goto l0_%=;				\
+	if r1 s> 10 goto l0_%=;				\
+	r1 s%%= 11;					\
+	if r1 s< -8 goto l1_%=;				\
+	if r1 s> 10 goto l1_%=;				\
+l0_%=:	r0 = 0;						\
+	exit;						\
+l1_%=:	r0 = *(u64 *)(r1 + 0);				\
+	exit;						\
+"	:
+	: __imm(bpf_get_prandom_u32)
+	: __clobber_all);
+}
+
+SEC("socket")
+__description("SMOD64, negative divisor, positive dividend")
+__success __retval(0) __log_level(2)
+__msg("r1 s%= -3 {{.*}}; R1=scalar(smin=smin32=0,smax=umax=smax32=umax32=2,var_off=(0x0; 0x3))")
+__naked void smod64_neg_divisor_1(void)
+{
+	asm volatile ("					\
+	call %[bpf_get_prandom_u32];			\
+	r1 = r0;					\
+	if r1 s< 8 goto l0_%=;				\
+	if r1 s> 10 goto l0_%=;				\
+	r1 s%%= -3;					\
+	if r1 s< 0 goto l1_%=;				\
+	if r1 s> 2 goto l1_%=;				\
+l0_%=:	r0 = 0;						\
+	exit;						\
+l1_%=:	r0 = *(u64 *)(r1 + 0);				\
+	exit;						\
+"	:
+	: __imm(bpf_get_prandom_u32)
+	: __clobber_all);
+}
+
+SEC("socket")
+__description("SMOD64, negative divisor, negative dividend")
+__success __retval(0) __log_level(2)
+__msg("r1 s%= -3 {{.*}}; R1=scalar(smin=smin32=-2,smax=smax32=0)")
+__naked void smod64_neg_divisor_2(void)
+{
+	asm volatile ("					\
+	call %[bpf_get_prandom_u32];			\
+	r1 = r0;					\
+	if r1 s> -8 goto l0_%=;				\
+	if r1 s< -10 goto l0_%=;			\
+	r1 s%%= -3;					\
+	if r1 s< -2 goto l1_%=;				\
+	if r1 s> 0 goto l1_%=;				\
+l0_%=:	r0 = 0;						\
+	exit;						\
+l1_%=:	r0 = *(u64 *)(r1 + 0);				\
+	exit;						\
+"	:
+	: __imm(bpf_get_prandom_u32)
+	: __clobber_all);
+}
+
+SEC("socket")
+__description("SMOD64, negative divisor, mixed sign dividend")
+__success __retval(0) __log_level(2)
+__msg("r1 s%= -3 {{.*}}; R1=scalar(smin=smin32=-2,smax=smax32=2)")
+__naked void smod64_neg_divisor_3(void)
+{
+	asm volatile ("					\
+	call %[bpf_get_prandom_u32];			\
+	r1 = r0;					\
+	if r1 s< -8 goto l0_%=;				\
+	if r1 s> 10 goto l0_%=;				\
+	r1 s%%= -3;					\
+	if r1 s< -2 goto l1_%=;				\
+	if r1 s> 2 goto l1_%=;				\
+l0_%=:	r0 = 0;						\
+	exit;						\
+l1_%=:	r0 = *(u64 *)(r1 + 0);				\
+	exit;						\
+"	:
+	: __imm(bpf_get_prandom_u32)
+	: __clobber_all);
+}
+
+SEC("socket")
+__description("SMOD64, negative divisor, small dividend")
+__success __retval(0) __log_level(2)
+__msg("r1 s%= -11 {{.*}}; R1=scalar(smin=smin32=-8,smax=smax32=10)")
+__naked void smod64_neg_divisor_unchanged(void)
+{
+	asm volatile ("					\
+	call %[bpf_get_prandom_u32];			\
+	r1 = r0;					\
+	if r1 s< -8 goto l0_%=;				\
+	if r1 s> 10 goto l0_%=;				\
+	r1 s%%= -11;					\
+	if r1 s< -8 goto l1_%=;				\
+	if r1 s> 10 goto l1_%=;				\
+l0_%=:	r0 = 0;						\
+	exit;						\
+l1_%=:	r0 = *(u64 *)(r1 + 0);				\
+	exit;						\
+"	:
+	: __imm(bpf_get_prandom_u32)
+	: __clobber_all);
+}
+
+SEC("socket")
+__description("SMOD64, zero divisor")
+__success __retval(0) __log_level(2)
+__msg("r1 s%= r2 {{.*}}; R1=scalar(smin=smin32=-8,smax=smax32=10) R2=0")
+__naked void smod64_zero_divisor(void)
+{
+	asm volatile ("					\
+	call %[bpf_get_prandom_u32];			\
+	r1 = r0;					\
+	if r1 s< -8 goto l0_%=;				\
+	if r1 s> 10 goto l0_%=;				\
+	r2 = 0;						\
+	r1 s%%= r2;					\
+	if r1 s< -8 goto l1_%=;				\
+	if r1 s> 10 goto l1_%=;				\
+l0_%=:	r0 = 0;						\
+	exit;						\
+l1_%=:	r0 = *(u64 *)(r1 + 0);				\
+	exit;						\
+"	:
+	: __imm(bpf_get_prandom_u32)
+	: __clobber_all);
+}
+
+SEC("socket")
+__description("SMOD64, overflow (S64_MIN%-1)")
+__success __retval(0) __log_level(2)
+__msg("r1 s%= -1 {{.*}}; R1=0")
+__naked void smod64_overflow_1(void)
+{
+	asm volatile ("					\
+	call %[bpf_ktime_get_ns];			\
+	r1 = r0;					\
+	r2 = %[llong_min] ll;				\
+	r2 += 10;					\
+	if r1 s> r2 goto l0_%=;				\
+	r1 s%%= -1;					\
+	if r1 != 0 goto l1_%=;				\
+l0_%=:	r0 = 0;						\
+	exit;						\
+l1_%=:	r0 = *(u64 *)(r1 + 0);				\
+	exit;						\
+"	:
+	: __imm_const(llong_min, LLONG_MIN),
+	  __imm(bpf_ktime_get_ns)
+	: __clobber_all);
+}
+
+SEC("socket")
+__description("SMOD64, overflow (S64_MIN%-1), constant dividend")
+__success __retval(0) __log_level(2)
+__msg("r1 s%= -1 {{.*}}; R1=0")
+__naked void smod64_overflow_2(void)
+{
+	asm volatile ("					\
+	r1 = %[llong_min] ll;				\
+	r1 s%%= -1;					\
+	if r1 != 0 goto l0_%=;				\
+	r0 = 0;						\
+	exit;						\
+l0_%=:	r0 = *(u64 *)(r1 + 0);				\
+	exit;						\
+"	:
+	: __imm_const(llong_min, LLONG_MIN)
+	: __clobber_all);
+}
-- 
cgit v1.2.3


From a917cd0a23fae160a85b0e8a0dd1d548c5d5242e Mon Sep 17 00:00:00 2001
From: Michel Lind <michel@michel-slm.name>
Date: Fri, 16 Jan 2026 21:21:58 +0000
Subject: tools/net/ynl: Makefile's install target now installs ynltool

This tool is built by default, but was not being installed by default
when running `make install`. Fix this by calling ynltool's install
target.

Signed-off-by: Michel Lind <michel@michel-slm.name>
Link: https://patch.msgid.link/aWqr9gUT4hWZwwcI@mbp-m3-fedora.vm
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/net/ynl/Makefile | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/net/ynl/Makefile b/tools/net/ynl/Makefile
index c2f3e8b3f2ac..9b692f368be7 100644
--- a/tools/net/ynl/Makefile
+++ b/tools/net/ynl/Makefile
@@ -41,7 +41,7 @@ clean distclean:
 	rm -rf pyynl.egg-info
 	rm -rf build
 
-install: libynl.a lib/*.h
+install: libynl.a lib/*.h ynltool
 	@echo -e "\tINSTALL libynl.a"
 	@$(INSTALL) -d $(DESTDIR)$(libdir)
 	@$(INSTALL) -m 0644 libynl.a $(DESTDIR)$(libdir)/libynl.a
@@ -51,6 +51,7 @@ install: libynl.a lib/*.h
 	@echo -e "\tINSTALL pyynl"
 	@pip install --prefix=$(DESTDIR)$(prefix) .
 	@make -C generated install
+	@make -C ynltool install
 
 run_tests:
 	@$(MAKE) -C tests run_tests
-- 
cgit v1.2.3


From dd341eacdba360d035c9d4de66d3c80a89d77c84 Mon Sep 17 00:00:00 2001
From: Matt Bobrowski <mattbobrowski@google.com>
Date: Tue, 20 Jan 2026 09:16:30 +0000
Subject: selftests/bpf: update verifier test for default trusted pointer
 semantics

Replace the verifier test for default trusted pointer semantics, which
previously relied on BPF kfunc bpf_get_root_mem_cgroup(), with a new
test utilizing dedicated BPF kfuncs defined within the bpf_testmod.

bpf_get_root_mem_cgroup() was modified such that it again relies on
KF_ACQUIRE semantics, therefore no longer making it a suitable
candidate to test BPF verifier default trusted pointer semantics
against.

Link: https://lore.kernel.org/bpf/20260113083949.2502978-2-mattbobrowski@google.com
Signed-off-by: Matt Bobrowski <mattbobrowski@google.com>
Link: https://lore.kernel.org/r/20260120091630.3420452-1-mattbobrowski@google.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/prog_tests/verifier.c  |  4 +--
 .../bpf/progs/verifier_default_trusted_ptr.c       | 29 ++++++++++++++++++++
 .../selftests/bpf/progs/verifier_memcontrol.c      | 32 ----------------------
 .../testing/selftests/bpf/test_kmods/bpf_testmod.c | 18 ++++++++++++
 .../selftests/bpf/test_kmods/bpf_testmod_kfunc.h   |  3 ++
 5 files changed, 52 insertions(+), 34 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/progs/verifier_default_trusted_ptr.c
 delete mode 100644 tools/testing/selftests/bpf/progs/verifier_memcontrol.c

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/verifier.c b/tools/testing/selftests/bpf/prog_tests/verifier.c
index fa9e506cc36f..b6a1e79709be 100644
--- a/tools/testing/selftests/bpf/prog_tests/verifier.c
+++ b/tools/testing/selftests/bpf/prog_tests/verifier.c
@@ -30,6 +30,7 @@
 #include "verifier_ctx.skel.h"
 #include "verifier_ctx_sk_msg.skel.h"
 #include "verifier_d_path.skel.h"
+#include "verifier_default_trusted_ptr.skel.h"
 #include "verifier_direct_packet_access.skel.h"
 #include "verifier_direct_stack_access_wraparound.skel.h"
 #include "verifier_div0.skel.h"
@@ -62,7 +63,6 @@
 #include "verifier_masking.skel.h"
 #include "verifier_may_goto_1.skel.h"
 #include "verifier_may_goto_2.skel.h"
-#include "verifier_memcontrol.skel.h"
 #include "verifier_meta_access.skel.h"
 #include "verifier_movsx.skel.h"
 #include "verifier_mtu.skel.h"
@@ -173,6 +173,7 @@ void test_verifier_const_or(void)             { RUN(verifier_const_or); }
 void test_verifier_ctx(void)                  { RUN(verifier_ctx); }
 void test_verifier_ctx_sk_msg(void)           { RUN(verifier_ctx_sk_msg); }
 void test_verifier_d_path(void)               { RUN(verifier_d_path); }
+void test_verifier_default_trusted_ptr(void)  { RUN_TESTS(verifier_default_trusted_ptr); }
 void test_verifier_direct_packet_access(void) { RUN(verifier_direct_packet_access); }
 void test_verifier_direct_stack_access_wraparound(void) { RUN(verifier_direct_stack_access_wraparound); }
 void test_verifier_div0(void)                 { RUN(verifier_div0); }
@@ -205,7 +206,6 @@ void test_verifier_map_ret_val(void)          { RUN(verifier_map_ret_val); }
 void test_verifier_masking(void)              { RUN(verifier_masking); }
 void test_verifier_may_goto_1(void)           { RUN(verifier_may_goto_1); }
 void test_verifier_may_goto_2(void)           { RUN(verifier_may_goto_2); }
-void test_verifier_memcontrol(void)	      { RUN(verifier_memcontrol); }
 void test_verifier_meta_access(void)          { RUN(verifier_meta_access); }
 void test_verifier_movsx(void)                 { RUN(verifier_movsx); }
 void test_verifier_mul(void)                  { RUN(verifier_mul); }
diff --git a/tools/testing/selftests/bpf/progs/verifier_default_trusted_ptr.c b/tools/testing/selftests/bpf/progs/verifier_default_trusted_ptr.c
new file mode 100644
index 000000000000..fa3b656ad4fb
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/verifier_default_trusted_ptr.c
@@ -0,0 +1,29 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2026 Google LLC.
+ */
+
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+#include "bpf_misc.h"
+#include "../test_kmods/bpf_testmod_kfunc.h"
+
+SEC("syscall")
+__success __retval(0)
+int test_default_trusted_ptr(void *ctx)
+{
+	struct prog_test_member *trusted_ptr;
+
+	trusted_ptr = bpf_kfunc_get_default_trusted_ptr_test();
+	/*
+	 * Test BPF kfunc bpf_get_default_trusted_ptr_test() returns a
+	 * PTR_TO_BTF_ID | PTR_TRUSTED, therefore it should be accepted when
+	 * passed to a BPF kfunc only accepting KF_TRUSTED_ARGS.
+	 */
+	bpf_kfunc_put_default_trusted_ptr_test(trusted_ptr);
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/verifier_memcontrol.c b/tools/testing/selftests/bpf/progs/verifier_memcontrol.c
deleted file mode 100644
index 13564956f621..000000000000
--- a/tools/testing/selftests/bpf/progs/verifier_memcontrol.c
+++ /dev/null
@@ -1,32 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright 2026 Google LLC.
- */
-
-#include <vmlinux.h>
-#include <bpf/bpf_helpers.h>
-#include <bpf/bpf_tracing.h>
-#include "bpf_misc.h"
-
-SEC("syscall")
-__success __retval(0)
-int root_mem_cgroup_default_trusted(void *ctx)
-{
-	unsigned long usage;
-	struct mem_cgroup *root_mem_cgroup;
-
-	root_mem_cgroup = bpf_get_root_mem_cgroup();
-	if (!root_mem_cgroup)
-		return 1;
-
-	/*
-	 * BPF kfunc bpf_get_root_mem_cgroup() returns a PTR_TO_BTF_ID |
-	 * PTR_TRUSTED | PTR_MAYBE_NULL, therefore it should be accepted when
-	 * passed to a BPF kfunc only accepting KF_TRUSTED_ARGS.
-	 */
-	usage = bpf_mem_cgroup_usage(root_mem_cgroup);
-	__sink(usage);
-	return 0;
-}
-
-char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c
index 0d542ba64365..d425034b72d3 100644
--- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c
+++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c
@@ -254,6 +254,22 @@ __bpf_kfunc int *bpf_kfunc_ret_rcu_test_nostruct(int rdonly_buf_size)
 	return NULL;
 }
 
+static struct prog_test_member trusted_ptr;
+
+__bpf_kfunc struct prog_test_member *bpf_kfunc_get_default_trusted_ptr_test(void)
+{
+	return &trusted_ptr;
+}
+
+__bpf_kfunc void bpf_kfunc_put_default_trusted_ptr_test(struct prog_test_member *trusted_ptr)
+{
+	/*
+	 * This BPF kfunc doesn't actually have any put/KF_ACQUIRE
+	 * semantics. We're simply wanting to simulate a BPF kfunc that takes a
+	 * struct prog_test_member pointer as an argument.
+	 */
+}
+
 __bpf_kfunc struct bpf_testmod_ctx *
 bpf_testmod_ctx_create(int *err)
 {
@@ -709,6 +725,8 @@ BTF_ID_FLAGS(func, bpf_testmod_ctx_create, KF_ACQUIRE | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_testmod_ctx_release, KF_RELEASE)
 BTF_ID_FLAGS(func, bpf_testmod_ops3_call_test_1)
 BTF_ID_FLAGS(func, bpf_testmod_ops3_call_test_2)
+BTF_ID_FLAGS(func, bpf_kfunc_get_default_trusted_ptr_test);
+BTF_ID_FLAGS(func, bpf_kfunc_put_default_trusted_ptr_test);
 BTF_KFUNCS_END(bpf_testmod_common_kfunc_ids)
 
 BTF_ID_LIST(bpf_testmod_dtor_ids)
diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h b/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h
index 225ea30c4e3d..10f89f06245f 100644
--- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h
+++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h
@@ -166,4 +166,7 @@ extern int bpf_kfunc_multi_st_ops_test_1(struct st_ops_args *args, u32 id) __wea
 extern int bpf_kfunc_multi_st_ops_test_1_assoc(struct st_ops_args *args) __weak __ksym;
 #endif
 
+struct prog_test_member *bpf_kfunc_get_default_trusted_ptr_test(void) __ksym;
+void bpf_kfunc_put_default_trusted_ptr_test(struct prog_test_member *trusted_ptr) __ksym;
+
 #endif /* _BPF_TESTMOD_KFUNC_H */
-- 
cgit v1.2.3


From 8766d61a1d33cb5f15bfdd6ce9832bbe1fc649c2 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Tue, 20 Jan 2026 18:04:55 -0800
Subject: Revert "Merge branch
 'netkit-support-for-io_uring-zero-copy-and-af_xdp'"

This reverts commit 77b9c4a438fc66e2ab004c411056b3fb71a54f2c, reversing
changes made to 4515ec4ad58a37e70a9e1256c0b993958c9b7497:

 931420a2fc36 ("selftests/net: Add netkit container tests")
 ab771c938d9a ("selftests/net: Make NetDrvContEnv support queue leasing")
 6be87fbb2776 ("selftests/net: Add env for container based tests")
 61d99ce3dfc2 ("selftests/net: Add bpf skb forwarding program")
 920da3634194 ("netkit: Add xsk support for af_xdp applications")
 eef51113f8af ("netkit: Add netkit notifier to check for unregistering devices")
 b5ef109d22d4 ("netkit: Implement rtnl_link_ops->alloc and ndo_queue_create")
 b5c3fa4a0b16 ("netkit: Add single device mode for netkit")
 0073d2fd679d ("xsk: Proxy pool management for leased queues")
 1ecea95dd3b5 ("xsk: Extend xsk_rcv_check validation")
 804bf334d08a ("net: Proxy netdev_queue_get_dma_dev for leased queues")
 0caa9a8ddec3 ("net: Proxy net_mp_{open,close}_rxq for leased queues")
 ff8889ff9107 ("net, ethtool: Disallow leased real rxqs to be resized")
 9e2103f36110 ("net: Add lease info to queue-get response")
 31127deddef4 ("net: Implement netdev_nl_queue_create_doit")
 a5546e18f77c ("net: Add queue-create operation")

The series will conflict with io_uring work, and the code needs more
polish.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/netlink/specs/netdev.yaml            |  44 ---
 drivers/net/netkit.c                               | 360 ++++-----------------
 include/linux/netdevice.h                          |   6 -
 include/net/netdev_queues.h                        |  19 +-
 include/net/netdev_rx_queue.h                      |  21 +-
 include/net/page_pool/memory_provider.h            |   4 +-
 include/net/xdp_sock_drv.h                         |   2 +-
 include/uapi/linux/if_link.h                       |   6 -
 include/uapi/linux/netdev.h                        |  11 -
 net/core/dev.c                                     |   7 -
 net/core/dev.h                                     |   2 -
 net/core/netdev-genl-gen.c                         |  20 --
 net/core/netdev-genl-gen.h                         |   2 -
 net/core/netdev-genl.c                             | 185 -----------
 net/core/netdev_queues.c                           |  74 +----
 net/core/netdev_rx_queue.c                         | 169 ++--------
 net/ethtool/channels.c                             |  12 +-
 net/ethtool/ioctl.c                                |   9 +-
 net/xdp/xsk.c                                      |  79 +----
 tools/include/uapi/linux/netdev.h                  |  11 -
 tools/testing/selftests/drivers/net/README.rst     |   7 -
 tools/testing/selftests/drivers/net/hw/Makefile    |   2 -
 .../selftests/drivers/net/hw/lib/py/__init__.py    |   7 +-
 .../selftests/drivers/net/hw/nk_forward.bpf.c      |  49 ---
 tools/testing/selftests/drivers/net/hw/nk_netns.py |  23 --
 .../testing/selftests/drivers/net/hw/nk_qlease.py  |  55 ----
 .../selftests/drivers/net/lib/py/__init__.py       |   7 +-
 tools/testing/selftests/drivers/net/lib/py/env.py  | 157 ---------
 28 files changed, 117 insertions(+), 1233 deletions(-)
 delete mode 100644 tools/testing/selftests/drivers/net/hw/nk_forward.bpf.c
 delete mode 100755 tools/testing/selftests/drivers/net/hw/nk_netns.py
 delete mode 100755 tools/testing/selftests/drivers/net/hw/nk_qlease.py

(limited to 'tools')

diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml
index b86db8656eac..596c306ce52b 100644
--- a/Documentation/netlink/specs/netdev.yaml
+++ b/Documentation/netlink/specs/netdev.yaml
@@ -339,15 +339,6 @@ attribute-sets:
         doc: XSK information for this queue, if any.
         type: nest
         nested-attributes: xsk-info
-      -
-        name: lease
-        doc: |
-          A queue from a virtual device can have a lease which refers to
-          another queue from a physical device. This is useful for memory
-          providers and AF_XDP operations which take an ifindex and queue id
-          to allow applications to bind against virtual devices in containers.
-        type: nest
-        nested-attributes: lease
   -
     name: qstats
     doc: |
@@ -546,24 +537,6 @@ attribute-sets:
         name: id
       -
         name: type
-  -
-    name: lease
-    attributes:
-      -
-        name: ifindex
-        doc: The netdev ifindex to lease the queue from.
-        type: u32
-        checks:
-          min: 1
-      -
-        name: queue
-        doc: The netdev queue to lease from.
-        type: nest
-        nested-attributes: queue-id
-      -
-        name: netns-id
-        doc: The network namespace id of the netdev.
-        type: s32
   -
     name: dmabuf
     attributes:
@@ -713,7 +686,6 @@ operations:
             - dmabuf
             - io-uring
             - xsk
-            - lease
       dump:
         request:
           attributes:
@@ -825,22 +797,6 @@ operations:
         reply:
           attributes:
             - id
-    -
-      name: queue-create
-      doc: |
-        Create a new queue for the given netdevice. Whether this operation
-        is supported depends on the device and the driver.
-      attribute-set: queue
-      flags: [admin-perm]
-      do:
-        request:
-          attributes:
-            - ifindex
-            - type
-            - lease
-        reply: &queue-create-op
-          attributes:
-            - id
 
 kernel-family:
   headers: ["net/netdev_netlink.h"]
diff --git a/drivers/net/netkit.c b/drivers/net/netkit.c
index 0519f855d062..0a2fef7caccb 100644
--- a/drivers/net/netkit.c
+++ b/drivers/net/netkit.c
@@ -9,21 +9,11 @@
 #include <linux/bpf_mprog.h>
 #include <linux/indirect_call_wrapper.h>
 
-#include <net/netdev_lock.h>
-#include <net/netdev_queues.h>
-#include <net/netdev_rx_queue.h>
-#include <net/xdp_sock_drv.h>
 #include <net/netkit.h>
 #include <net/dst.h>
 #include <net/tcx.h>
 
-#define NETKIT_DRV_NAME	"netkit"
-
-#define NETKIT_NUM_RX_QUEUES_MAX  1024
-#define NETKIT_NUM_TX_QUEUES_MAX  1
-
-#define NETKIT_NUM_RX_QUEUES_REAL 1
-#define NETKIT_NUM_TX_QUEUES_REAL 1
+#define DRV_NAME "netkit"
 
 struct netkit {
 	__cacheline_group_begin(netkit_fastpath);
@@ -36,7 +26,6 @@ struct netkit {
 
 	__cacheline_group_begin(netkit_slowpath);
 	enum netkit_mode mode;
-	enum netkit_pairing pair;
 	bool primary;
 	u32 headroom;
 	__cacheline_group_end(netkit_slowpath);
@@ -47,8 +36,6 @@ struct netkit_link {
 	struct net_device *dev;
 };
 
-static struct rtnl_link_ops netkit_link_ops;
-
 static __always_inline int
 netkit_run(const struct bpf_mprog_entry *entry, struct sk_buff *skb,
 	   enum netkit_action ret)
@@ -148,10 +135,6 @@ static int netkit_open(struct net_device *dev)
 	struct netkit *nk = netkit_priv(dev);
 	struct net_device *peer = rtnl_dereference(nk->peer);
 
-	if (nk->pair == NETKIT_DEVICE_SINGLE) {
-		netif_carrier_on(dev);
-		return 0;
-	}
 	if (!peer)
 		return -ENOTCONN;
 	if (peer->flags & IFF_UP) {
@@ -236,86 +219,9 @@ static void netkit_get_stats(struct net_device *dev,
 	stats->tx_dropped = DEV_STATS_READ(dev, tx_dropped);
 }
 
-static bool netkit_xsk_supported_at_phys(const struct net_device *dev)
-{
-	if (!dev->netdev_ops->ndo_bpf ||
-	    !dev->netdev_ops->ndo_xdp_xmit ||
-	    !dev->netdev_ops->ndo_xsk_wakeup)
-		return false;
-	if ((dev->xdp_features & NETDEV_XDP_ACT_XSK) != NETDEV_XDP_ACT_XSK)
-		return false;
-	return true;
-}
-
-static int netkit_xsk(struct net_device *dev, struct netdev_bpf *xdp)
-{
-	struct netkit *nk = netkit_priv(dev);
-	struct netdev_bpf xdp_lower;
-	struct netdev_rx_queue *rxq;
-	struct net_device *phys;
-	int ret = -EBUSY;
-
-	switch (xdp->command) {
-	case XDP_SETUP_XSK_POOL:
-		if (nk->pair == NETKIT_DEVICE_PAIR)
-			return -EOPNOTSUPP;
-		if (xdp->xsk.queue_id >= dev->real_num_rx_queues)
-			return -EINVAL;
-
-		rxq = __netif_get_rx_queue(dev, xdp->xsk.queue_id);
-		if (!rxq->lease)
-			return -EOPNOTSUPP;
-
-		phys = rxq->lease->dev;
-		if (!netkit_xsk_supported_at_phys(phys))
-			return -EOPNOTSUPP;
-
-		memcpy(&xdp_lower, xdp, sizeof(xdp_lower));
-		xdp_lower.xsk.queue_id = get_netdev_rx_queue_index(rxq->lease);
-		break;
-	case XDP_SETUP_PROG:
-		return -EPERM;
-	default:
-		return -EINVAL;
-	}
-
-	netdev_lock(phys);
-	if (!dev_get_min_mp_channel_count(phys))
-		ret = phys->netdev_ops->ndo_bpf(phys, &xdp_lower);
-	netdev_unlock(phys);
-	return ret;
-}
-
-static int netkit_xsk_wakeup(struct net_device *dev, u32 queue_id, u32 flags)
-{
-	struct netdev_rx_queue *rxq;
-	struct net_device *phys;
-
-	if (queue_id >= dev->real_num_rx_queues)
-		return -EINVAL;
-
-	rxq = __netif_get_rx_queue(dev, queue_id);
-	if (!rxq->lease)
-		return -EOPNOTSUPP;
-
-	phys = rxq->lease->dev;
-	if (!netkit_xsk_supported_at_phys(phys))
-		return -EOPNOTSUPP;
-
-	return phys->netdev_ops->ndo_xsk_wakeup(phys,
-			get_netdev_rx_queue_index(rxq->lease), flags);
-}
-
-static int netkit_init(struct net_device *dev)
-{
-	netdev_lockdep_set_classes(dev);
-	return 0;
-}
-
 static void netkit_uninit(struct net_device *dev);
 
 static const struct net_device_ops netkit_netdev_ops = {
-	.ndo_init		= netkit_init,
 	.ndo_open		= netkit_open,
 	.ndo_stop		= netkit_close,
 	.ndo_start_xmit		= netkit_xmit,
@@ -326,95 +232,19 @@ static const struct net_device_ops netkit_netdev_ops = {
 	.ndo_get_peer_dev	= netkit_peer_dev,
 	.ndo_get_stats64	= netkit_get_stats,
 	.ndo_uninit		= netkit_uninit,
-	.ndo_bpf		= netkit_xsk,
-	.ndo_xsk_wakeup		= netkit_xsk_wakeup,
 	.ndo_features_check	= passthru_features_check,
 };
 
 static void netkit_get_drvinfo(struct net_device *dev,
 			       struct ethtool_drvinfo *info)
 {
-	strscpy(info->driver, NETKIT_DRV_NAME, sizeof(info->driver));
+	strscpy(info->driver, DRV_NAME, sizeof(info->driver));
 }
 
 static const struct ethtool_ops netkit_ethtool_ops = {
 	.get_drvinfo		= netkit_get_drvinfo,
 };
 
-static int netkit_queue_create(struct net_device *dev)
-{
-	struct netkit *nk = netkit_priv(dev);
-	u32 rxq_count_old, rxq_count_new;
-	int err;
-
-	rxq_count_old = dev->real_num_rx_queues;
-	rxq_count_new = rxq_count_old + 1;
-
-	/* Only allow to lease a queue in single device mode or to
-	 * lease against the peer device which then ends up in the
-	 * target netns.
-	 */
-	if (nk->pair == NETKIT_DEVICE_PAIR && nk->primary)
-		return -EOPNOTSUPP;
-
-	if (netif_running(dev))
-		netif_carrier_off(dev);
-	err = netif_set_real_num_rx_queues(dev, rxq_count_new);
-	if (netif_running(dev))
-		netif_carrier_on(dev);
-
-	return err ? : rxq_count_old;
-}
-
-static const struct netdev_queue_mgmt_ops netkit_queue_mgmt_ops = {
-	.ndo_queue_create	= netkit_queue_create,
-};
-
-static struct net_device *netkit_alloc(struct nlattr *tb[],
-				       const char *ifname,
-				       unsigned char name_assign_type,
-				       unsigned int num_tx_queues,
-				       unsigned int num_rx_queues)
-{
-	const struct rtnl_link_ops *ops = &netkit_link_ops;
-	struct net_device *dev;
-
-	if (num_tx_queues > NETKIT_NUM_TX_QUEUES_MAX ||
-	    num_rx_queues > NETKIT_NUM_RX_QUEUES_MAX)
-		return ERR_PTR(-EOPNOTSUPP);
-
-	dev = alloc_netdev_mqs(ops->priv_size, ifname,
-			       name_assign_type, ops->setup,
-			       num_tx_queues, num_rx_queues);
-	if (dev) {
-		dev->real_num_tx_queues = NETKIT_NUM_TX_QUEUES_REAL;
-		dev->real_num_rx_queues = NETKIT_NUM_RX_QUEUES_REAL;
-	}
-	return dev;
-}
-
-static void netkit_queue_unlease(struct net_device *dev)
-{
-	struct netdev_rx_queue *rxq, *rxq_lease;
-	struct net_device *dev_lease;
-	int i;
-
-	if (dev->real_num_rx_queues == 1)
-		return;
-
-	netdev_lock(dev);
-	for (i = 1; i < dev->real_num_rx_queues; i++) {
-		rxq = __netif_get_rx_queue(dev, i);
-		rxq_lease = rxq->lease;
-		dev_lease = rxq_lease->dev;
-
-		netdev_lock(dev_lease);
-		netdev_rx_queue_unlease(rxq, rxq_lease);
-		netdev_unlock(dev_lease);
-	}
-	netdev_unlock(dev);
-}
-
 static void netkit_setup(struct net_device *dev)
 {
 	static const netdev_features_t netkit_features_hw_vlan =
@@ -445,20 +275,18 @@ static void netkit_setup(struct net_device *dev)
 	dev->priv_flags |= IFF_DISABLE_NETPOLL;
 	dev->lltx = true;
 
-	dev->netdev_ops     = &netkit_netdev_ops;
-	dev->ethtool_ops    = &netkit_ethtool_ops;
-	dev->queue_mgmt_ops = &netkit_queue_mgmt_ops;
+	dev->ethtool_ops = &netkit_ethtool_ops;
+	dev->netdev_ops  = &netkit_netdev_ops;
 
 	dev->features |= netkit_features;
 	dev->hw_features = netkit_features;
 	dev->hw_enc_features = netkit_features;
 	dev->mpls_features = NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE;
 	dev->vlan_features = dev->features & ~netkit_features_hw_vlan;
+
 	dev->needs_free_netdev = true;
 
 	netif_set_tso_max_size(dev, GSO_MAX_SIZE);
-
-	xdp_set_features_flag(dev, NETDEV_XDP_ACT_XSK);
 }
 
 static struct net *netkit_get_link_net(const struct net_device *dev)
@@ -497,6 +325,8 @@ static int netkit_validate(struct nlattr *tb[], struct nlattr *data[],
 	return 0;
 }
 
+static struct rtnl_link_ops netkit_link_ops;
+
 static int netkit_new_link(struct net_device *dev,
 			   struct rtnl_newlink_params *params,
 			   struct netlink_ext_ack *extack)
@@ -505,7 +335,6 @@ static int netkit_new_link(struct net_device *dev,
 	enum netkit_scrub scrub_prim = NETKIT_SCRUB_DEFAULT;
 	enum netkit_scrub scrub_peer = NETKIT_SCRUB_DEFAULT;
 	struct nlattr *peer_tb[IFLA_MAX + 1], **tbp, *attr;
-	enum netkit_pairing pair = NETKIT_DEVICE_PAIR;
 	enum netkit_action policy_prim = NETKIT_PASS;
 	enum netkit_action policy_peer = NETKIT_PASS;
 	struct nlattr **data = params->data;
@@ -514,8 +343,7 @@ static int netkit_new_link(struct net_device *dev,
 	struct nlattr **tb = params->tb;
 	u16 headroom = 0, tailroom = 0;
 	struct ifinfomsg *ifmp = NULL;
-	struct net_device *peer = NULL;
-	bool seen_peer = false;
+	struct net_device *peer;
 	char ifname[IFNAMSIZ];
 	struct netkit *nk;
 	int err;
@@ -552,12 +380,6 @@ static int netkit_new_link(struct net_device *dev,
 			headroom = nla_get_u16(data[IFLA_NETKIT_HEADROOM]);
 		if (data[IFLA_NETKIT_TAILROOM])
 			tailroom = nla_get_u16(data[IFLA_NETKIT_TAILROOM]);
-		if (data[IFLA_NETKIT_PAIRING])
-			pair = nla_get_u32(data[IFLA_NETKIT_PAIRING]);
-
-		seen_peer = data[IFLA_NETKIT_PEER_INFO] ||
-			    data[IFLA_NETKIT_PEER_SCRUB] ||
-			    data[IFLA_NETKIT_PEER_POLICY];
 	}
 
 	if (ifmp && tbp[IFLA_IFNAME]) {
@@ -570,46 +392,45 @@ static int netkit_new_link(struct net_device *dev,
 	if (mode != NETKIT_L2 &&
 	    (tb[IFLA_ADDRESS] || tbp[IFLA_ADDRESS]))
 		return -EOPNOTSUPP;
-	if (pair == NETKIT_DEVICE_SINGLE &&
-	    (tb != tbp || seen_peer || policy_prim != NETKIT_PASS))
-		return -EOPNOTSUPP;
 
-	if (pair == NETKIT_DEVICE_PAIR) {
-		peer = rtnl_create_link(peer_net, ifname, ifname_assign_type,
-					&netkit_link_ops, tbp, extack);
-		if (IS_ERR(peer))
-			return PTR_ERR(peer);
-
-		netif_inherit_tso_max(peer, dev);
-		if (headroom)
-			peer->needed_headroom = headroom;
-		if (tailroom)
-			peer->needed_tailroom = tailroom;
-		if (mode == NETKIT_L2 && !(ifmp && tbp[IFLA_ADDRESS]))
-			eth_hw_addr_random(peer);
-		if (ifmp && dev->ifindex)
-			peer->ifindex = ifmp->ifi_index;
+	peer = rtnl_create_link(peer_net, ifname, ifname_assign_type,
+				&netkit_link_ops, tbp, extack);
+	if (IS_ERR(peer))
+		return PTR_ERR(peer);
 
-		nk = netkit_priv(peer);
-		nk->primary = false;
-		nk->policy = policy_peer;
-		nk->scrub = scrub_peer;
-		nk->mode = mode;
-		nk->pair = pair;
-		nk->headroom = headroom;
-		bpf_mprog_bundle_init(&nk->bundle);
-
-		err = register_netdevice(peer);
-		if (err < 0)
-			goto err_register_peer;
-		netif_carrier_off(peer);
-		if (mode == NETKIT_L2)
-			dev_change_flags(peer, peer->flags & ~IFF_NOARP, NULL);
-
-		err = rtnl_configure_link(peer, NULL, 0, NULL);
-		if (err < 0)
-			goto err_configure_peer;
+	netif_inherit_tso_max(peer, dev);
+	if (headroom) {
+		peer->needed_headroom = headroom;
+		dev->needed_headroom = headroom;
 	}
+	if (tailroom) {
+		peer->needed_tailroom = tailroom;
+		dev->needed_tailroom = tailroom;
+	}
+
+	if (mode == NETKIT_L2 && !(ifmp && tbp[IFLA_ADDRESS]))
+		eth_hw_addr_random(peer);
+	if (ifmp && dev->ifindex)
+		peer->ifindex = ifmp->ifi_index;
+
+	nk = netkit_priv(peer);
+	nk->primary = false;
+	nk->policy = policy_peer;
+	nk->scrub = scrub_peer;
+	nk->mode = mode;
+	nk->headroom = headroom;
+	bpf_mprog_bundle_init(&nk->bundle);
+
+	err = register_netdevice(peer);
+	if (err < 0)
+		goto err_register_peer;
+	netif_carrier_off(peer);
+	if (mode == NETKIT_L2)
+		dev_change_flags(peer, peer->flags & ~IFF_NOARP, NULL);
+
+	err = rtnl_configure_link(peer, NULL, 0, NULL);
+	if (err < 0)
+		goto err_configure_peer;
 
 	if (mode == NETKIT_L2 && !tb[IFLA_ADDRESS])
 		eth_hw_addr_random(dev);
@@ -617,17 +438,12 @@ static int netkit_new_link(struct net_device *dev,
 		nla_strscpy(dev->name, tb[IFLA_IFNAME], IFNAMSIZ);
 	else
 		strscpy(dev->name, "nk%d", IFNAMSIZ);
-	if (headroom)
-		dev->needed_headroom = headroom;
-	if (tailroom)
-		dev->needed_tailroom = tailroom;
 
 	nk = netkit_priv(dev);
 	nk->primary = true;
 	nk->policy = policy_prim;
 	nk->scrub = scrub_prim;
 	nk->mode = mode;
-	nk->pair = pair;
 	nk->headroom = headroom;
 	bpf_mprog_bundle_init(&nk->bundle);
 
@@ -639,12 +455,10 @@ static int netkit_new_link(struct net_device *dev,
 		dev_change_flags(dev, dev->flags & ~IFF_NOARP, NULL);
 
 	rcu_assign_pointer(netkit_priv(dev)->peer, peer);
-	if (peer)
-		rcu_assign_pointer(netkit_priv(peer)->peer, dev);
+	rcu_assign_pointer(netkit_priv(peer)->peer, dev);
 	return 0;
 err_configure_peer:
-	if (peer)
-		unregister_netdevice(peer);
+	unregister_netdevice(peer);
 	return err;
 err_register_peer:
 	free_netdev(peer);
@@ -704,8 +518,6 @@ static struct net_device *netkit_dev_fetch(struct net *net, u32 ifindex, u32 whi
 	nk = netkit_priv(dev);
 	if (!nk->primary)
 		return ERR_PTR(-EACCES);
-	if (nk->pair == NETKIT_DEVICE_SINGLE)
-		return ERR_PTR(-EOPNOTSUPP);
 	if (which == BPF_NETKIT_PEER) {
 		dev = rcu_dereference_rtnl(nk->peer);
 		if (!dev)
@@ -1032,7 +844,6 @@ static void netkit_release_all(struct net_device *dev)
 static void netkit_uninit(struct net_device *dev)
 {
 	netkit_release_all(dev);
-	netkit_queue_unlease(dev);
 }
 
 static void netkit_del_link(struct net_device *dev, struct list_head *head)
@@ -1068,7 +879,6 @@ static int netkit_change_link(struct net_device *dev, struct nlattr *tb[],
 		{ IFLA_NETKIT_PEER_INFO,  "peer info" },
 		{ IFLA_NETKIT_HEADROOM,   "headroom" },
 		{ IFLA_NETKIT_TAILROOM,   "tailroom" },
-		{ IFLA_NETKIT_PAIRING,    "pairing" },
 	};
 
 	if (!nk->primary) {
@@ -1088,11 +898,9 @@ static int netkit_change_link(struct net_device *dev, struct nlattr *tb[],
 	}
 
 	if (data[IFLA_NETKIT_POLICY]) {
-		err = -EOPNOTSUPP;
 		attr = data[IFLA_NETKIT_POLICY];
 		policy = nla_get_u32(attr);
-		if (nk->pair == NETKIT_DEVICE_PAIR)
-			err = netkit_check_policy(policy, attr, extack);
+		err = netkit_check_policy(policy, attr, extack);
 		if (err)
 			return err;
 		WRITE_ONCE(nk->policy, policy);
@@ -1113,48 +921,6 @@ static int netkit_change_link(struct net_device *dev, struct nlattr *tb[],
 	return 0;
 }
 
-static void netkit_check_lease_unregister(struct net_device *dev)
-{
-	LIST_HEAD(list_kill);
-	u32 q_idx;
-
-	if (READ_ONCE(dev->reg_state) != NETREG_UNREGISTERING ||
-	    !dev->dev.parent)
-		return;
-
-	netdev_lock_ops(dev);
-	for (q_idx = 0; q_idx < dev->real_num_rx_queues; q_idx++) {
-		struct net_device *tmp = dev;
-		u32 tmp_q_idx = q_idx;
-
-		if (netif_rx_queue_lease_get_owner(&tmp, &tmp_q_idx)) {
-			if (tmp->netdev_ops != &netkit_netdev_ops)
-				continue;
-			/* A single phys device can have multiple queues leased
-			 * to one netkit device. We can only queue that netkit
-			 * device once to the list_kill. Queues of that phys
-			 * device can be leased with different individual netkit
-			 * devices, hence we batch via list_kill.
-			 */
-			if (unregister_netdevice_queued(tmp))
-				continue;
-			netkit_del_link(tmp, &list_kill);
-		}
-	}
-	netdev_unlock_ops(dev);
-	unregister_netdevice_many(&list_kill);
-}
-
-static int netkit_notifier(struct notifier_block *this,
-			   unsigned long event, void *ptr)
-{
-	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
-
-	if (event == NETDEV_UNREGISTER)
-		netkit_check_lease_unregister(dev);
-	return NOTIFY_DONE;
-}
-
 static size_t netkit_get_size(const struct net_device *dev)
 {
 	return nla_total_size(sizeof(u32)) + /* IFLA_NETKIT_POLICY */
@@ -1165,7 +931,6 @@ static size_t netkit_get_size(const struct net_device *dev)
 	       nla_total_size(sizeof(u8))  + /* IFLA_NETKIT_PRIMARY */
 	       nla_total_size(sizeof(u16)) + /* IFLA_NETKIT_HEADROOM */
 	       nla_total_size(sizeof(u16)) + /* IFLA_NETKIT_TAILROOM */
-	       nla_total_size(sizeof(u32)) + /* IFLA_NETKIT_PAIRING */
 	       0;
 }
 
@@ -1186,8 +951,6 @@ static int netkit_fill_info(struct sk_buff *skb, const struct net_device *dev)
 		return -EMSGSIZE;
 	if (nla_put_u16(skb, IFLA_NETKIT_TAILROOM, dev->needed_tailroom))
 		return -EMSGSIZE;
-	if (nla_put_u32(skb, IFLA_NETKIT_PAIRING, nk->pair))
-		return -EMSGSIZE;
 
 	if (peer) {
 		nk = netkit_priv(peer);
@@ -1209,15 +972,13 @@ static const struct nla_policy netkit_policy[IFLA_NETKIT_MAX + 1] = {
 	[IFLA_NETKIT_TAILROOM]		= { .type = NLA_U16 },
 	[IFLA_NETKIT_SCRUB]		= NLA_POLICY_MAX(NLA_U32, NETKIT_SCRUB_DEFAULT),
 	[IFLA_NETKIT_PEER_SCRUB]	= NLA_POLICY_MAX(NLA_U32, NETKIT_SCRUB_DEFAULT),
-	[IFLA_NETKIT_PAIRING]		= NLA_POLICY_MAX(NLA_U32, NETKIT_DEVICE_SINGLE),
 	[IFLA_NETKIT_PRIMARY]		= { .type = NLA_REJECT,
 					    .reject_message = "Primary attribute is read-only" },
 };
 
 static struct rtnl_link_ops netkit_link_ops = {
-	.kind		= NETKIT_DRV_NAME,
+	.kind		= DRV_NAME,
 	.priv_size	= sizeof(struct netkit),
-	.alloc		= netkit_alloc,
 	.setup		= netkit_setup,
 	.newlink	= netkit_new_link,
 	.dellink	= netkit_del_link,
@@ -1231,39 +992,26 @@ static struct rtnl_link_ops netkit_link_ops = {
 	.maxtype	= IFLA_NETKIT_MAX,
 };
 
-static struct notifier_block netkit_netdev_notifier = {
-	.notifier_call	= netkit_notifier,
-};
-
-static __init int netkit_mod_init(void)
+static __init int netkit_init(void)
 {
-	int ret;
-
 	BUILD_BUG_ON((int)NETKIT_NEXT != (int)TCX_NEXT ||
 		     (int)NETKIT_PASS != (int)TCX_PASS ||
 		     (int)NETKIT_DROP != (int)TCX_DROP ||
 		     (int)NETKIT_REDIRECT != (int)TCX_REDIRECT);
 
-	ret = rtnl_link_register(&netkit_link_ops);
-	if (ret)
-		return ret;
-	ret = register_netdevice_notifier(&netkit_netdev_notifier);
-	if (ret)
-		rtnl_link_unregister(&netkit_link_ops);
-	return ret;
+	return rtnl_link_register(&netkit_link_ops);
 }
 
-static __exit void netkit_mod_exit(void)
+static __exit void netkit_exit(void)
 {
-	unregister_netdevice_notifier(&netkit_netdev_notifier);
 	rtnl_link_unregister(&netkit_link_ops);
 }
 
-module_init(netkit_mod_init);
-module_exit(netkit_mod_exit);
+module_init(netkit_init);
+module_exit(netkit_exit);
 
 MODULE_DESCRIPTION("BPF-programmable network device");
 MODULE_AUTHOR("Daniel Borkmann <daniel@iogearbox.net>");
 MODULE_AUTHOR("Nikolay Aleksandrov <razor@blackwall.org>");
 MODULE_LICENSE("GPL");
-MODULE_ALIAS_RTNL_LINK(NETKIT_DRV_NAME);
+MODULE_ALIAS_RTNL_LINK(DRV_NAME);
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 4d146c000e21..d99b0fbc1942 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3400,17 +3400,11 @@ static inline int dev_direct_xmit(struct sk_buff *skb, u16 queue_id)
 int register_netdevice(struct net_device *dev);
 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head);
 void unregister_netdevice_many(struct list_head *head);
-
 static inline void unregister_netdevice(struct net_device *dev)
 {
 	unregister_netdevice_queue(dev, NULL);
 }
 
-static inline bool unregister_netdevice_queued(const struct net_device *dev)
-{
-	return !list_empty(&dev->unreg_list);
-}
-
 int netdev_refcnt_read(const struct net_device *dev);
 void free_netdev(struct net_device *dev);
 
diff --git a/include/net/netdev_queues.h b/include/net/netdev_queues.h
index 81dc7cb2360c..b55d3b9cb9c2 100644
--- a/include/net/netdev_queues.h
+++ b/include/net/netdev_queues.h
@@ -130,11 +130,6 @@ void netdev_stat_queue_sum(struct net_device *netdev,
  * @ndo_queue_get_dma_dev: Get dma device for zero-copy operations to be used
  *			   for this queue. Return NULL on error.
  *
- * @ndo_queue_create: Create a new RX queue which can be leased to another queue.
- *		      Ops on this queue are redirected to the leased queue e.g.
- *		      when opening a memory provider. Return the new queue id on
- *		      success. Return negative error code on failure.
- *
  * Note that @ndo_queue_mem_alloc and @ndo_queue_mem_free may be called while
  * the interface is closed. @ndo_queue_start and @ndo_queue_stop will only
  * be called for an interface which is open.
@@ -154,12 +149,9 @@ struct netdev_queue_mgmt_ops {
 						  int idx);
 	struct device *		(*ndo_queue_get_dma_dev)(struct net_device *dev,
 							 int idx);
-	int			(*ndo_queue_create)(struct net_device *dev);
 };
 
-bool netif_rxq_has_unreadable_mp(struct net_device *dev, unsigned int rxq_idx);
-bool netif_rxq_has_mp(struct net_device *dev, unsigned int rxq_idx);
-bool netif_rxq_is_leased(struct net_device *dev, unsigned int rxq_idx);
+bool netif_rxq_has_unreadable_mp(struct net_device *dev, int idx);
 
 /**
  * DOC: Lockless queue stopping / waking helpers.
@@ -348,10 +340,5 @@ static inline unsigned int netif_xmit_timeout_ms(struct netdev_queue *txq)
 	})
 
 struct device *netdev_queue_get_dma_dev(struct net_device *dev, int idx);
-bool netdev_can_create_queue(const struct net_device *dev,
-			     struct netlink_ext_ack *extack);
-bool netdev_can_lease_queue(const struct net_device *dev,
-			    struct netlink_ext_ack *extack);
-bool netdev_queue_busy(struct net_device *dev, int idx,
-		       struct netlink_ext_ack *extack);
-#endif /* _LINUX_NET_QUEUES_H */
+
+#endif
diff --git a/include/net/netdev_rx_queue.h b/include/net/netdev_rx_queue.h
index 508d11afaecb..8cdcd138b33f 100644
--- a/include/net/netdev_rx_queue.h
+++ b/include/net/netdev_rx_queue.h
@@ -28,8 +28,6 @@ struct netdev_rx_queue {
 #endif
 	struct napi_struct		*napi;
 	struct pp_memory_provider_params mp_params;
-	struct netdev_rx_queue		*lease;
-	netdevice_tracker		lease_tracker;
 } ____cacheline_aligned_in_smp;
 
 /*
@@ -59,22 +57,5 @@ get_netdev_rx_queue_index(struct netdev_rx_queue *queue)
 }
 
 int netdev_rx_queue_restart(struct net_device *dev, unsigned int rxq);
-void netdev_rx_queue_lease(struct netdev_rx_queue *rxq_dst,
-			   struct netdev_rx_queue *rxq_src);
-void netdev_rx_queue_unlease(struct netdev_rx_queue *rxq_dst,
-			     struct netdev_rx_queue *rxq_src);
-bool netif_rx_queue_lease_get_owner(struct net_device **dev, unsigned int *rxq);
 
-enum netif_lease_dir {
-	NETIF_VIRT_TO_PHYS,
-	NETIF_PHYS_TO_VIRT,
-};
-
-struct netdev_rx_queue *
-__netif_get_rx_queue_lease(struct net_device **dev, unsigned int *rxq,
-			   enum netif_lease_dir dir);
-struct netdev_rx_queue *
-netif_get_rx_queue_lease_locked(struct net_device **dev, unsigned int *rxq);
-void netif_put_rx_queue_lease_locked(struct net_device *orig_dev,
-				     struct net_device *dev);
-#endif /* _LINUX_NETDEV_RX_QUEUE_H */
+#endif
diff --git a/include/net/page_pool/memory_provider.h b/include/net/page_pool/memory_provider.h
index b6f811c3416b..ada4f968960a 100644
--- a/include/net/page_pool/memory_provider.h
+++ b/include/net/page_pool/memory_provider.h
@@ -23,12 +23,12 @@ bool net_mp_niov_set_dma_addr(struct net_iov *niov, dma_addr_t addr);
 void net_mp_niov_set_page_pool(struct page_pool *pool, struct net_iov *niov);
 void net_mp_niov_clear_page_pool(struct net_iov *niov);
 
-int net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx,
+int net_mp_open_rxq(struct net_device *dev, unsigned ifq_idx,
 		    struct pp_memory_provider_params *p);
 int __net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx,
 		      const struct pp_memory_provider_params *p,
 		      struct netlink_ext_ack *extack);
-void net_mp_close_rxq(struct net_device *dev, unsigned int rxq_idx,
+void net_mp_close_rxq(struct net_device *dev, unsigned ifq_idx,
 		      struct pp_memory_provider_params *old_p);
 void __net_mp_close_rxq(struct net_device *dev, unsigned int rxq_idx,
 			const struct pp_memory_provider_params *old_p);
diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h
index c07cfb431eac..242e34f771cc 100644
--- a/include/net/xdp_sock_drv.h
+++ b/include/net/xdp_sock_drv.h
@@ -28,7 +28,7 @@ void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries);
 bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc);
 u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, u32 max);
 void xsk_tx_release(struct xsk_buff_pool *pool);
-struct xsk_buff_pool *xsk_get_pool_from_qid(const struct net_device *dev,
+struct xsk_buff_pool *xsk_get_pool_from_qid(struct net_device *dev,
 					    u16 queue_id);
 void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool);
 void xsk_set_tx_need_wakeup(struct xsk_buff_pool *pool);
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index bbd565757298..3b491d96e52e 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -1296,11 +1296,6 @@ enum netkit_mode {
 	NETKIT_L3,
 };
 
-enum netkit_pairing {
-	NETKIT_DEVICE_PAIR,
-	NETKIT_DEVICE_SINGLE,
-};
-
 /* NETKIT_SCRUB_NONE leaves clearing skb->{mark,priority} up to
  * the BPF program if attached. This also means the latter can
  * consume the two fields if they were populated earlier.
@@ -1325,7 +1320,6 @@ enum {
 	IFLA_NETKIT_PEER_SCRUB,
 	IFLA_NETKIT_HEADROOM,
 	IFLA_NETKIT_TAILROOM,
-	IFLA_NETKIT_PAIRING,
 	__IFLA_NETKIT_MAX,
 };
 #define IFLA_NETKIT_MAX	(__IFLA_NETKIT_MAX - 1)
diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h
index 7df1056a35fd..e0b579a1df4f 100644
--- a/include/uapi/linux/netdev.h
+++ b/include/uapi/linux/netdev.h
@@ -160,7 +160,6 @@ enum {
 	NETDEV_A_QUEUE_DMABUF,
 	NETDEV_A_QUEUE_IO_URING,
 	NETDEV_A_QUEUE_XSK,
-	NETDEV_A_QUEUE_LEASE,
 
 	__NETDEV_A_QUEUE_MAX,
 	NETDEV_A_QUEUE_MAX = (__NETDEV_A_QUEUE_MAX - 1)
@@ -203,15 +202,6 @@ enum {
 	NETDEV_A_QSTATS_MAX = (__NETDEV_A_QSTATS_MAX - 1)
 };
 
-enum {
-	NETDEV_A_LEASE_IFINDEX = 1,
-	NETDEV_A_LEASE_QUEUE,
-	NETDEV_A_LEASE_NETNS_ID,
-
-	__NETDEV_A_LEASE_MAX,
-	NETDEV_A_LEASE_MAX = (__NETDEV_A_LEASE_MAX - 1)
-};
-
 enum {
 	NETDEV_A_DMABUF_IFINDEX = 1,
 	NETDEV_A_DMABUF_QUEUES,
@@ -238,7 +228,6 @@ enum {
 	NETDEV_CMD_BIND_RX,
 	NETDEV_CMD_NAPI_SET,
 	NETDEV_CMD_BIND_TX,
-	NETDEV_CMD_QUEUE_CREATE,
 
 	__NETDEV_CMD_MAX,
 	NETDEV_CMD_MAX = (__NETDEV_CMD_MAX - 1)
diff --git a/net/core/dev.c b/net/core/dev.c
index 13a3de63a825..2661b68f5be3 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1114,13 +1114,6 @@ netdev_get_by_index_lock_ops_compat(struct net *net, int ifindex)
 	return __netdev_put_lock_ops_compat(dev, net);
 }
 
-struct net_device *
-netdev_put_lock(struct net_device *dev, netdevice_tracker *tracker)
-{
-	netdev_tracker_free(dev, tracker);
-	return __netdev_put_lock(dev, dev_net(dev));
-}
-
 struct net_device *
 netdev_xa_find_lock(struct net *net, struct net_device *dev,
 		    unsigned long *index)
diff --git a/net/core/dev.h b/net/core/dev.h
index 9bcb76b325d0..da18536cbd35 100644
--- a/net/core/dev.h
+++ b/net/core/dev.h
@@ -30,8 +30,6 @@ netdev_napi_by_id_lock(struct net *net, unsigned int napi_id);
 struct net_device *dev_get_by_napi_id(unsigned int napi_id);
 
 struct net_device *__netdev_put_lock(struct net_device *dev, struct net *net);
-struct net_device *netdev_put_lock(struct net_device *dev,
-				   netdevice_tracker *tracker);
 struct net_device *
 netdev_xa_find_lock(struct net *net, struct net_device *dev,
 		    unsigned long *index);
diff --git a/net/core/netdev-genl-gen.c b/net/core/netdev-genl-gen.c
index 52ba99c019e7..ba673e81716f 100644
--- a/net/core/netdev-genl-gen.c
+++ b/net/core/netdev-genl-gen.c
@@ -28,12 +28,6 @@ static const struct netlink_range_validation netdev_a_napi_defer_hard_irqs_range
 };
 
 /* Common nested types */
-const struct nla_policy netdev_lease_nl_policy[NETDEV_A_LEASE_NETNS_ID + 1] = {
-	[NETDEV_A_LEASE_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1),
-	[NETDEV_A_LEASE_QUEUE] = NLA_POLICY_NESTED(netdev_queue_id_nl_policy),
-	[NETDEV_A_LEASE_NETNS_ID] = { .type = NLA_S32, },
-};
-
 const struct nla_policy netdev_page_pool_info_nl_policy[NETDEV_A_PAGE_POOL_IFINDEX + 1] = {
 	[NETDEV_A_PAGE_POOL_ID] = NLA_POLICY_FULL_RANGE(NLA_UINT, &netdev_a_page_pool_id_range),
 	[NETDEV_A_PAGE_POOL_IFINDEX] = NLA_POLICY_FULL_RANGE(NLA_U32, &netdev_a_page_pool_ifindex_range),
@@ -113,13 +107,6 @@ static const struct nla_policy netdev_bind_tx_nl_policy[NETDEV_A_DMABUF_FD + 1]
 	[NETDEV_A_DMABUF_FD] = { .type = NLA_U32, },
 };
 
-/* NETDEV_CMD_QUEUE_CREATE - do */
-static const struct nla_policy netdev_queue_create_nl_policy[NETDEV_A_QUEUE_LEASE + 1] = {
-	[NETDEV_A_QUEUE_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1),
-	[NETDEV_A_QUEUE_TYPE] = NLA_POLICY_MAX(NLA_U32, 1),
-	[NETDEV_A_QUEUE_LEASE] = NLA_POLICY_NESTED(netdev_lease_nl_policy),
-};
-
 /* Ops table for netdev */
 static const struct genl_split_ops netdev_nl_ops[] = {
 	{
@@ -218,13 +205,6 @@ static const struct genl_split_ops netdev_nl_ops[] = {
 		.maxattr	= NETDEV_A_DMABUF_FD,
 		.flags		= GENL_CMD_CAP_DO,
 	},
-	{
-		.cmd		= NETDEV_CMD_QUEUE_CREATE,
-		.doit		= netdev_nl_queue_create_doit,
-		.policy		= netdev_queue_create_nl_policy,
-		.maxattr	= NETDEV_A_QUEUE_LEASE,
-		.flags		= GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
-	},
 };
 
 static const struct genl_multicast_group netdev_nl_mcgrps[] = {
diff --git a/net/core/netdev-genl-gen.h b/net/core/netdev-genl-gen.h
index d71b435d72c1..cffc08517a41 100644
--- a/net/core/netdev-genl-gen.h
+++ b/net/core/netdev-genl-gen.h
@@ -14,7 +14,6 @@
 #include <net/netdev_netlink.h>
 
 /* Common nested types */
-extern const struct nla_policy netdev_lease_nl_policy[NETDEV_A_LEASE_NETNS_ID + 1];
 extern const struct nla_policy netdev_page_pool_info_nl_policy[NETDEV_A_PAGE_POOL_IFINDEX + 1];
 extern const struct nla_policy netdev_queue_id_nl_policy[NETDEV_A_QUEUE_TYPE + 1];
 
@@ -37,7 +36,6 @@ int netdev_nl_qstats_get_dumpit(struct sk_buff *skb,
 int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info);
 int netdev_nl_napi_set_doit(struct sk_buff *skb, struct genl_info *info);
 int netdev_nl_bind_tx_doit(struct sk_buff *skb, struct genl_info *info);
-int netdev_nl_queue_create_doit(struct sk_buff *skb, struct genl_info *info);
 
 enum {
 	NETDEV_NLGRP_MGMT,
diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c
index 51c830f88f10..470fabbeacd9 100644
--- a/net/core/netdev-genl.c
+++ b/net/core/netdev-genl.c
@@ -391,11 +391,8 @@ netdev_nl_queue_fill_one(struct sk_buff *rsp, struct net_device *netdev,
 			 u32 q_idx, u32 q_type, const struct genl_info *info)
 {
 	struct pp_memory_provider_params *params;
-	struct net_device *orig_netdev = netdev;
-	struct nlattr *nest_lease, *nest_queue;
 	struct netdev_rx_queue *rxq;
 	struct netdev_queue *txq;
-	u32 lease_q_idx = q_idx;
 	void *hdr;
 
 	hdr = genlmsg_iput(rsp, info);
@@ -413,37 +410,6 @@ netdev_nl_queue_fill_one(struct sk_buff *rsp, struct net_device *netdev,
 		if (nla_put_napi_id(rsp, rxq->napi))
 			goto nla_put_failure;
 
-		if (netif_rx_queue_lease_get_owner(&netdev, &lease_q_idx)) {
-			struct net *net, *peer_net;
-
-			nest_lease = nla_nest_start(rsp, NETDEV_A_QUEUE_LEASE);
-			if (!nest_lease)
-				goto nla_put_failure;
-			nest_queue = nla_nest_start(rsp, NETDEV_A_LEASE_QUEUE);
-			if (!nest_queue)
-				goto nla_put_failure;
-			if (nla_put_u32(rsp, NETDEV_A_QUEUE_ID, lease_q_idx))
-				goto nla_put_failure;
-			if (nla_put_u32(rsp, NETDEV_A_QUEUE_TYPE, q_type))
-				goto nla_put_failure;
-			nla_nest_end(rsp, nest_queue);
-			if (nla_put_u32(rsp, NETDEV_A_LEASE_IFINDEX,
-					READ_ONCE(netdev->ifindex)))
-				goto nla_put_failure;
-			rcu_read_lock();
-			peer_net = dev_net_rcu(netdev);
-			net = dev_net_rcu(orig_netdev);
-			if (!net_eq(net, peer_net)) {
-				s32 id = peernet2id_alloc(net, peer_net, GFP_ATOMIC);
-
-				if (nla_put_s32(rsp, NETDEV_A_LEASE_NETNS_ID, id))
-					goto nla_put_failure_unlock;
-			}
-			rcu_read_unlock();
-			nla_nest_end(rsp, nest_lease);
-			netdev = orig_netdev;
-		}
-
 		params = &rxq->mp_params;
 		if (params->mp_ops &&
 		    params->mp_ops->nl_fill(params->mp_priv, rsp, rxq))
@@ -471,8 +437,6 @@ netdev_nl_queue_fill_one(struct sk_buff *rsp, struct net_device *netdev,
 
 	return 0;
 
-nla_put_failure_unlock:
-	rcu_read_unlock();
 nla_put_failure:
 	genlmsg_cancel(rsp, hdr);
 	return -EMSGSIZE;
@@ -1156,155 +1120,6 @@ err_genlmsg_free:
 	return err;
 }
 
-int netdev_nl_queue_create_doit(struct sk_buff *skb, struct genl_info *info)
-{
-	const int qmaxtype = ARRAY_SIZE(netdev_queue_id_nl_policy) - 1;
-	const int lmaxtype = ARRAY_SIZE(netdev_lease_nl_policy) - 1;
-	int err, ifindex, ifindex_lease, queue_id, queue_id_lease;
-	struct nlattr *qtb[ARRAY_SIZE(netdev_queue_id_nl_policy)];
-	struct nlattr *ltb[ARRAY_SIZE(netdev_lease_nl_policy)];
-	struct netdev_rx_queue *rxq, *rxq_lease;
-	struct net_device *dev, *dev_lease;
-	netdevice_tracker dev_tracker;
-	struct nlattr *nest;
-	struct sk_buff *rsp;
-	void *hdr;
-
-	if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_QUEUE_IFINDEX) ||
-	    GENL_REQ_ATTR_CHECK(info, NETDEV_A_QUEUE_TYPE) ||
-	    GENL_REQ_ATTR_CHECK(info, NETDEV_A_QUEUE_LEASE))
-		return -EINVAL;
-	if (nla_get_u32(info->attrs[NETDEV_A_QUEUE_TYPE]) !=
-	    NETDEV_QUEUE_TYPE_RX) {
-		NL_SET_BAD_ATTR(info->extack, info->attrs[NETDEV_A_QUEUE_TYPE]);
-		return -EINVAL;
-	}
-
-	ifindex = nla_get_u32(info->attrs[NETDEV_A_QUEUE_IFINDEX]);
-
-	nest = info->attrs[NETDEV_A_QUEUE_LEASE];
-	err = nla_parse_nested(ltb, lmaxtype, nest,
-			       netdev_lease_nl_policy, info->extack);
-	if (err < 0)
-		return err;
-	if (NL_REQ_ATTR_CHECK(info->extack, nest, ltb, NETDEV_A_LEASE_IFINDEX) ||
-	    NL_REQ_ATTR_CHECK(info->extack, nest, ltb, NETDEV_A_LEASE_QUEUE))
-		return -EINVAL;
-	if (ltb[NETDEV_A_LEASE_NETNS_ID]) {
-		NL_SET_BAD_ATTR(info->extack, ltb[NETDEV_A_LEASE_NETNS_ID]);
-		return -EINVAL;
-	}
-
-	ifindex_lease = nla_get_u32(ltb[NETDEV_A_LEASE_IFINDEX]);
-
-	nest = ltb[NETDEV_A_LEASE_QUEUE];
-	err = nla_parse_nested(qtb, qmaxtype, nest,
-			       netdev_queue_id_nl_policy, info->extack);
-	if (err < 0)
-		return err;
-	if (NL_REQ_ATTR_CHECK(info->extack, nest, qtb, NETDEV_A_QUEUE_ID) ||
-	    NL_REQ_ATTR_CHECK(info->extack, nest, qtb, NETDEV_A_QUEUE_TYPE))
-		return -EINVAL;
-	if (nla_get_u32(qtb[NETDEV_A_QUEUE_TYPE]) != NETDEV_QUEUE_TYPE_RX) {
-		NL_SET_BAD_ATTR(info->extack, qtb[NETDEV_A_QUEUE_TYPE]);
-		return -EINVAL;
-	}
-	if (ifindex == ifindex_lease) {
-		NL_SET_ERR_MSG(info->extack,
-			       "Lease ifindex cannot be the same as queue creation ifindex");
-		return -EINVAL;
-	}
-
-	queue_id_lease = nla_get_u32(qtb[NETDEV_A_QUEUE_ID]);
-
-	rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
-	if (!rsp)
-		return -ENOMEM;
-
-	hdr = genlmsg_iput(rsp, info);
-	if (!hdr) {
-		err = -EMSGSIZE;
-		goto err_genlmsg_free;
-	}
-
-	/* Locking order is always from the virtual to the physical device
-	 * since this is also the same order when applications open the
-	 * memory provider later on.
-	 */
-	dev = netdev_get_by_index_lock(genl_info_net(info), ifindex);
-	if (!dev) {
-		err = -ENODEV;
-		goto err_genlmsg_free;
-	}
-	if (!netdev_can_create_queue(dev, info->extack)) {
-		err = -EINVAL;
-		goto err_unlock_dev;
-	}
-
-	dev_lease = netdev_get_by_index(genl_info_net(info), ifindex_lease,
-					&dev_tracker, GFP_KERNEL);
-	if (!dev_lease) {
-		err = -ENODEV;
-		goto err_unlock_dev;
-	}
-	if (!netdev_can_lease_queue(dev_lease, info->extack)) {
-		netdev_put(dev_lease, &dev_tracker);
-		err = -EINVAL;
-		goto err_unlock_dev;
-	}
-
-	dev_lease = netdev_put_lock(dev_lease, &dev_tracker);
-	if (!dev_lease) {
-		err = -ENODEV;
-		goto err_unlock_dev;
-	}
-	if (queue_id_lease >= dev_lease->real_num_rx_queues) {
-		err = -ERANGE;
-		NL_SET_BAD_ATTR(info->extack, qtb[NETDEV_A_QUEUE_ID]);
-		goto err_unlock_dev_lease;
-	}
-	if (netdev_queue_busy(dev_lease, queue_id_lease, info->extack)) {
-		err = -EBUSY;
-		goto err_unlock_dev_lease;
-	}
-
-	rxq_lease = __netif_get_rx_queue(dev_lease, queue_id_lease);
-	rxq = __netif_get_rx_queue(dev, dev->real_num_rx_queues - 1);
-
-	if (rxq->lease && rxq->lease->dev != dev_lease) {
-		err = -EOPNOTSUPP;
-		NL_SET_ERR_MSG(info->extack,
-			       "Leasing multiple queues from different devices not supported");
-		goto err_unlock_dev_lease;
-	}
-
-	err = queue_id = dev->queue_mgmt_ops->ndo_queue_create(dev);
-	if (err < 0) {
-		NL_SET_ERR_MSG(info->extack,
-			       "Device is unable to create a new queue");
-		goto err_unlock_dev_lease;
-	}
-
-	rxq = __netif_get_rx_queue(dev, queue_id);
-	netdev_rx_queue_lease(rxq, rxq_lease);
-
-	nla_put_u32(rsp, NETDEV_A_QUEUE_ID, queue_id);
-	genlmsg_end(rsp, hdr);
-
-	netdev_unlock(dev_lease);
-	netdev_unlock(dev);
-
-	return genlmsg_reply(rsp, info);
-
-err_unlock_dev_lease:
-	netdev_unlock(dev_lease);
-err_unlock_dev:
-	netdev_unlock(dev);
-err_genlmsg_free:
-	nlmsg_free(rsp);
-	return err;
-}
-
 void netdev_nl_sock_priv_init(struct netdev_nl_sock *priv)
 {
 	INIT_LIST_HEAD(&priv->bindings);
diff --git a/net/core/netdev_queues.c b/net/core/netdev_queues.c
index 97acf6440829..251f27a8307f 100644
--- a/net/core/netdev_queues.c
+++ b/net/core/netdev_queues.c
@@ -1,37 +1,22 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
 
 #include <net/netdev_queues.h>
-#include <net/netdev_rx_queue.h>
-#include <net/xdp_sock_drv.h>
 
 /**
  * netdev_queue_get_dma_dev() - get dma device for zero-copy operations
  * @dev:	net_device
  * @idx:	queue index
  *
- * Get dma device for zero-copy operations to be used for this queue. If the
- * queue is leased to a physical queue, we retrieve the latter's dma device.
+ * Get dma device for zero-copy operations to be used for this queue.
  * When such device is not available or valid, the function will return NULL.
  *
  * Return: Device or NULL on error
  */
 struct device *netdev_queue_get_dma_dev(struct net_device *dev, int idx)
 {
-	const struct netdev_queue_mgmt_ops *queue_ops;
+	const struct netdev_queue_mgmt_ops *queue_ops = dev->queue_mgmt_ops;
 	struct device *dma_dev;
 
-	if (idx < dev->real_num_rx_queues) {
-		struct netdev_rx_queue *rxq = __netif_get_rx_queue(dev, idx);
-
-		if (rxq->lease) {
-			rxq = rxq->lease;
-			dev = rxq->dev;
-			idx = get_netdev_rx_queue_index(rxq);
-		}
-	}
-
-	queue_ops = dev->queue_mgmt_ops;
-
 	if (queue_ops && queue_ops->ndo_queue_get_dma_dev)
 		dma_dev = queue_ops->ndo_queue_get_dma_dev(dev, idx);
 	else
@@ -40,58 +25,3 @@ struct device *netdev_queue_get_dma_dev(struct net_device *dev, int idx)
 	return dma_dev && dma_dev->dma_mask ? dma_dev : NULL;
 }
 
-bool netdev_can_create_queue(const struct net_device *dev,
-			     struct netlink_ext_ack *extack)
-{
-	if (dev->dev.parent) {
-		NL_SET_ERR_MSG(extack, "Device is not a virtual device");
-		return false;
-	}
-	if (!dev->queue_mgmt_ops ||
-	    !dev->queue_mgmt_ops->ndo_queue_create) {
-		NL_SET_ERR_MSG(extack, "Device does not support queue creation");
-		return false;
-	}
-	if (dev->real_num_rx_queues < 1 ||
-	    dev->real_num_tx_queues < 1) {
-		NL_SET_ERR_MSG(extack, "Device must have at least one real queue");
-		return false;
-	}
-	return true;
-}
-
-bool netdev_can_lease_queue(const struct net_device *dev,
-			    struct netlink_ext_ack *extack)
-{
-	if (!dev->dev.parent) {
-		NL_SET_ERR_MSG(extack, "Lease device is a virtual device");
-		return false;
-	}
-	if (!netif_device_present(dev)) {
-		NL_SET_ERR_MSG(extack, "Lease device has been removed from the system");
-		return false;
-	}
-	if (!dev->queue_mgmt_ops) {
-		NL_SET_ERR_MSG(extack, "Lease device does not support queue management operations");
-		return false;
-	}
-	return true;
-}
-
-bool netdev_queue_busy(struct net_device *dev, int idx,
-		       struct netlink_ext_ack *extack)
-{
-	if (netif_rxq_is_leased(dev, idx)) {
-		NL_SET_ERR_MSG(extack, "Lease device queue is already leased");
-		return true;
-	}
-	if (xsk_get_pool_from_qid(dev, idx)) {
-		NL_SET_ERR_MSG(extack, "Lease device queue in use by AF_XDP");
-		return true;
-	}
-	if (netif_rxq_has_mp(dev, idx)) {
-		NL_SET_ERR_MSG(extack, "Lease device queue in use by memory provider");
-		return true;
-	}
-	return false;
-}
diff --git a/net/core/netdev_rx_queue.c b/net/core/netdev_rx_queue.c
index 75c7a68cb90d..c7d9341b7630 100644
--- a/net/core/netdev_rx_queue.c
+++ b/net/core/netdev_rx_queue.c
@@ -9,120 +9,14 @@
 
 #include "page_pool_priv.h"
 
-void netdev_rx_queue_lease(struct netdev_rx_queue *rxq_dst,
-			   struct netdev_rx_queue *rxq_src)
-{
-	netdev_assert_locked(rxq_src->dev);
-	netdev_assert_locked(rxq_dst->dev);
-
-	netdev_hold(rxq_src->dev, &rxq_src->lease_tracker, GFP_KERNEL);
-
-	WRITE_ONCE(rxq_src->lease, rxq_dst);
-	WRITE_ONCE(rxq_dst->lease, rxq_src);
-}
-
-void netdev_rx_queue_unlease(struct netdev_rx_queue *rxq_dst,
-			     struct netdev_rx_queue *rxq_src)
-{
-	netdev_assert_locked(rxq_dst->dev);
-	netdev_assert_locked(rxq_src->dev);
-
-	WRITE_ONCE(rxq_src->lease, NULL);
-	WRITE_ONCE(rxq_dst->lease, NULL);
-
-	netdev_put(rxq_src->dev, &rxq_src->lease_tracker);
-}
-
-bool netif_rxq_is_leased(struct net_device *dev, unsigned int rxq_idx)
-{
-	if (rxq_idx < dev->real_num_rx_queues)
-		return READ_ONCE(__netif_get_rx_queue(dev, rxq_idx)->lease);
-	return false;
-}
-
-static bool netif_lease_dir_ok(const struct net_device *dev,
-			       enum netif_lease_dir dir)
-{
-	if (dir == NETIF_VIRT_TO_PHYS && !dev->dev.parent)
-		return true;
-	if (dir == NETIF_PHYS_TO_VIRT && dev->dev.parent)
-		return true;
-	return false;
-}
-
-struct netdev_rx_queue *
-__netif_get_rx_queue_lease(struct net_device **dev, unsigned int *rxq_idx,
-			   enum netif_lease_dir dir)
-{
-	struct net_device *orig_dev = *dev;
-	struct netdev_rx_queue *rxq = __netif_get_rx_queue(orig_dev, *rxq_idx);
-
-	if (rxq->lease) {
-		if (!netif_lease_dir_ok(orig_dev, dir))
-			return NULL;
-		rxq = rxq->lease;
-		*rxq_idx = get_netdev_rx_queue_index(rxq);
-		*dev = rxq->dev;
-	}
-	return rxq;
-}
-
-struct netdev_rx_queue *
-netif_get_rx_queue_lease_locked(struct net_device **dev, unsigned int *rxq_idx)
-{
-	struct net_device *orig_dev = *dev;
-	struct netdev_rx_queue *rxq;
-
-	/* Locking order is always from the virtual to the physical device
-	 * see netdev_nl_queue_create_doit().
-	 */
-	netdev_ops_assert_locked(orig_dev);
-	rxq = __netif_get_rx_queue_lease(dev, rxq_idx, NETIF_VIRT_TO_PHYS);
-	if (rxq && orig_dev != *dev)
-		netdev_lock(*dev);
-	return rxq;
-}
-
-void netif_put_rx_queue_lease_locked(struct net_device *orig_dev,
-				     struct net_device *dev)
-{
-	if (orig_dev != dev)
-		netdev_unlock(dev);
-}
-
-bool netif_rx_queue_lease_get_owner(struct net_device **dev,
-				    unsigned int *rxq_idx)
-{
-	struct net_device *orig_dev = *dev;
-	struct netdev_rx_queue *rxq;
-
-	/* The physical device needs to be locked. If there is indeed a lease,
-	 * then the virtual device holds a reference on the physical device
-	 * and the lease stays active until the virtual device is torn down.
-	 * When queues get {un,}leased both devices are always locked.
-	 */
-	netdev_ops_assert_locked(orig_dev);
-	rxq = __netif_get_rx_queue_lease(dev, rxq_idx, NETIF_PHYS_TO_VIRT);
-	if (rxq && orig_dev != *dev)
-		return true;
-	return false;
-}
-
 /* See also page_pool_is_unreadable() */
-bool netif_rxq_has_unreadable_mp(struct net_device *dev, unsigned int rxq_idx)
+bool netif_rxq_has_unreadable_mp(struct net_device *dev, int idx)
 {
-	if (rxq_idx < dev->real_num_rx_queues)
-		return __netif_get_rx_queue(dev, rxq_idx)->mp_params.mp_ops;
-	return false;
-}
-EXPORT_SYMBOL(netif_rxq_has_unreadable_mp);
+	struct netdev_rx_queue *rxq = __netif_get_rx_queue(dev, idx);
 
-bool netif_rxq_has_mp(struct net_device *dev, unsigned int rxq_idx)
-{
-	if (rxq_idx < dev->real_num_rx_queues)
-		return __netif_get_rx_queue(dev, rxq_idx)->mp_params.mp_priv;
-	return false;
+	return !!rxq->mp_params.mp_ops;
 }
+EXPORT_SYMBOL(netif_rxq_has_unreadable_mp);
 
 int netdev_rx_queue_restart(struct net_device *dev, unsigned int rxq_idx)
 {
@@ -206,63 +100,49 @@ int __net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx,
 		      const struct pp_memory_provider_params *p,
 		      struct netlink_ext_ack *extack)
 {
-	struct net_device *orig_dev = dev;
 	struct netdev_rx_queue *rxq;
 	int ret;
 
 	if (!netdev_need_ops_lock(dev))
 		return -EOPNOTSUPP;
+
 	if (rxq_idx >= dev->real_num_rx_queues) {
 		NL_SET_ERR_MSG(extack, "rx queue index out of range");
 		return -ERANGE;
 	}
-
 	rxq_idx = array_index_nospec(rxq_idx, dev->real_num_rx_queues);
-	rxq = netif_get_rx_queue_lease_locked(&dev, &rxq_idx);
-	if (!rxq) {
-		NL_SET_ERR_MSG(extack, "rx queue peered to a virtual netdev");
-		return -EBUSY;
-	}
-	if (!dev->dev.parent) {
-		NL_SET_ERR_MSG(extack, "rx queue is mapped to a virtual netdev");
-		ret = -EBUSY;
-		goto out;
-	}
+
 	if (dev->cfg->hds_config != ETHTOOL_TCP_DATA_SPLIT_ENABLED) {
 		NL_SET_ERR_MSG(extack, "tcp-data-split is disabled");
-		ret = -EINVAL;
-		goto out;
+		return -EINVAL;
 	}
 	if (dev->cfg->hds_thresh) {
 		NL_SET_ERR_MSG(extack, "hds-thresh is not zero");
-		ret = -EINVAL;
-		goto out;
+		return -EINVAL;
 	}
 	if (dev_xdp_prog_count(dev)) {
 		NL_SET_ERR_MSG(extack, "unable to custom memory provider to device with XDP program attached");
-		ret = -EEXIST;
-		goto out;
+		return -EEXIST;
 	}
+
+	rxq = __netif_get_rx_queue(dev, rxq_idx);
 	if (rxq->mp_params.mp_ops) {
 		NL_SET_ERR_MSG(extack, "designated queue already memory provider bound");
-		ret = -EEXIST;
-		goto out;
+		return -EEXIST;
 	}
 #ifdef CONFIG_XDP_SOCKETS
 	if (rxq->pool) {
 		NL_SET_ERR_MSG(extack, "designated queue already in use by AF_XDP");
-		ret = -EBUSY;
-		goto out;
+		return -EBUSY;
 	}
 #endif
+
 	rxq->mp_params = *p;
 	ret = netdev_rx_queue_restart(dev, rxq_idx);
 	if (ret) {
 		rxq->mp_params.mp_ops = NULL;
 		rxq->mp_params.mp_priv = NULL;
 	}
-out:
-	netif_put_rx_queue_lease_locked(orig_dev, dev);
 	return ret;
 }
 
@@ -277,43 +157,38 @@ int net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx,
 	return ret;
 }
 
-void __net_mp_close_rxq(struct net_device *dev, unsigned int rxq_idx,
+void __net_mp_close_rxq(struct net_device *dev, unsigned int ifq_idx,
 			const struct pp_memory_provider_params *old_p)
 {
-	struct net_device *orig_dev = dev;
 	struct netdev_rx_queue *rxq;
 	int err;
 
-	if (WARN_ON_ONCE(rxq_idx >= dev->real_num_rx_queues))
+	if (WARN_ON_ONCE(ifq_idx >= dev->real_num_rx_queues))
 		return;
 
-	rxq = netif_get_rx_queue_lease_locked(&dev, &rxq_idx);
-	if (WARN_ON_ONCE(!rxq))
-		return;
+	rxq = __netif_get_rx_queue(dev, ifq_idx);
 
 	/* Callers holding a netdev ref may get here after we already
 	 * went thru shutdown via dev_memory_provider_uninstall().
 	 */
 	if (dev->reg_state > NETREG_REGISTERED &&
 	    !rxq->mp_params.mp_ops)
-		goto out;
+		return;
 
 	if (WARN_ON_ONCE(rxq->mp_params.mp_ops != old_p->mp_ops ||
 			 rxq->mp_params.mp_priv != old_p->mp_priv))
-		goto out;
+		return;
 
 	rxq->mp_params.mp_ops = NULL;
 	rxq->mp_params.mp_priv = NULL;
-	err = netdev_rx_queue_restart(dev, rxq_idx);
+	err = netdev_rx_queue_restart(dev, ifq_idx);
 	WARN_ON(err && err != -ENETDOWN);
-out:
-	netif_put_rx_queue_lease_locked(orig_dev, dev);
 }
 
-void net_mp_close_rxq(struct net_device *dev, unsigned int rxq_idx,
+void net_mp_close_rxq(struct net_device *dev, unsigned ifq_idx,
 		      struct pp_memory_provider_params *old_p)
 {
 	netdev_lock(dev);
-	__net_mp_close_rxq(dev, rxq_idx, old_p);
+	__net_mp_close_rxq(dev, ifq_idx, old_p);
 	netdev_unlock(dev);
 }
diff --git a/net/ethtool/channels.c b/net/ethtool/channels.c
index 797d2a08c515..ca4f80282448 100644
--- a/net/ethtool/channels.c
+++ b/net/ethtool/channels.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0-only
 
-#include <net/netdev_queues.h>
+#include <net/xdp_sock_drv.h>
 
 #include "netlink.h"
 #include "common.h"
@@ -169,16 +169,14 @@ ethnl_set_channels(struct ethnl_req_info *req_info, struct genl_info *info)
 	if (ret)
 		return ret;
 
-	/* ensure channels are not busy at the moment */
+	/* Disabling channels, query zero-copy AF_XDP sockets */
 	from_channel = channels.combined_count +
 		       min(channels.rx_count, channels.tx_count);
-	for (i = from_channel; i < old_total; i++) {
-		if (netdev_queue_busy(dev, i, NULL)) {
-			GENL_SET_ERR_MSG(info,
-					 "requested channel counts are too low due to busy queues (AF_XDP or queue leasing)");
+	for (i = from_channel; i < old_total; i++)
+		if (xsk_get_pool_from_qid(dev, i)) {
+			GENL_SET_ERR_MSG(info, "requested channel counts are too low for existing zerocopy AF_XDP sockets");
 			return -EINVAL;
 		}
-	}
 
 	ret = dev->ethtool_ops->set_channels(dev, &channels);
 	return ret < 0 ? ret : 1;
diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c
index 02a3454234d6..9431e305b233 100644
--- a/net/ethtool/ioctl.c
+++ b/net/ethtool/ioctl.c
@@ -27,13 +27,12 @@
 #include <linux/net.h>
 #include <linux/pm_runtime.h>
 #include <linux/utsname.h>
-#include <linux/ethtool_netlink.h>
 #include <net/devlink.h>
 #include <net/ipv6.h>
+#include <net/xdp_sock_drv.h>
 #include <net/flow_offload.h>
 #include <net/netdev_lock.h>
-#include <net/netdev_queues.h>
-
+#include <linux/ethtool_netlink.h>
 #include "common.h"
 
 /* State held across locks and calls for commands which have devlink fallback */
@@ -2283,12 +2282,12 @@ static noinline_for_stack int ethtool_set_channels(struct net_device *dev,
 	if (ret)
 		return ret;
 
-	/* Disabling channels, query busy queues (AF_XDP, queue leasing) */
+	/* Disabling channels, query zero-copy AF_XDP sockets */
 	from_channel = channels.combined_count +
 		min(channels.rx_count, channels.tx_count);
 	to_channel = curr.combined_count + max(curr.rx_count, curr.tx_count);
 	for (i = from_channel; i < to_channel; i++)
-		if (netdev_queue_busy(dev, i, NULL))
+		if (xsk_get_pool_from_qid(dev, i))
 			return -EINVAL;
 
 	ret = dev->ethtool_ops->set_channels(dev, &channels);
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 92f791433725..3b46bc635c43 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -23,8 +23,6 @@
 #include <linux/netdevice.h>
 #include <linux/rculist.h>
 #include <linux/vmalloc.h>
-
-#include <net/netdev_queues.h>
 #include <net/xdp_sock_drv.h>
 #include <net/busy_poll.h>
 #include <net/netdev_lock.h>
@@ -105,7 +103,7 @@ bool xsk_uses_need_wakeup(struct xsk_buff_pool *pool)
 }
 EXPORT_SYMBOL(xsk_uses_need_wakeup);
 
-struct xsk_buff_pool *xsk_get_pool_from_qid(const struct net_device *dev,
+struct xsk_buff_pool *xsk_get_pool_from_qid(struct net_device *dev,
 					    u16 queue_id)
 {
 	if (queue_id < dev->real_num_rx_queues)
@@ -119,18 +117,10 @@ EXPORT_SYMBOL(xsk_get_pool_from_qid);
 
 void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id)
 {
-	struct net_device *orig_dev = dev;
-	unsigned int id = queue_id;
-
-	if (id < dev->real_num_rx_queues)
-		WARN_ON_ONCE(!netif_get_rx_queue_lease_locked(&dev, &id));
-
-	if (id < dev->real_num_rx_queues)
-		dev->_rx[id].pool = NULL;
-	if (id < dev->real_num_tx_queues)
-		dev->_tx[id].pool = NULL;
-
-	netif_put_rx_queue_lease_locked(orig_dev, dev);
+	if (queue_id < dev->num_rx_queues)
+		dev->_rx[queue_id].pool = NULL;
+	if (queue_id < dev->num_tx_queues)
+		dev->_tx[queue_id].pool = NULL;
 }
 
 /* The buffer pool is stored both in the _rx struct and the _tx struct as we do
@@ -140,29 +130,17 @@ void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id)
 int xsk_reg_pool_at_qid(struct net_device *dev, struct xsk_buff_pool *pool,
 			u16 queue_id)
 {
-	struct net_device *orig_dev = dev;
-	unsigned int id = queue_id;
-	int ret = 0;
-
-	if (id >= max(dev->real_num_rx_queues,
-		      dev->real_num_tx_queues))
+	if (queue_id >= max_t(unsigned int,
+			      dev->real_num_rx_queues,
+			      dev->real_num_tx_queues))
 		return -EINVAL;
-	if (id < dev->real_num_rx_queues) {
-		if (!netif_get_rx_queue_lease_locked(&dev, &id))
-			return -EBUSY;
-		if (xsk_get_pool_from_qid(dev, id)) {
-			ret = -EBUSY;
-			goto out;
-		}
-	}
 
-	if (id < dev->real_num_rx_queues)
-		dev->_rx[id].pool = pool;
-	if (id < dev->real_num_tx_queues)
-		dev->_tx[id].pool = pool;
-out:
-	netif_put_rx_queue_lease_locked(orig_dev, dev);
-	return ret;
+	if (queue_id < dev->real_num_rx_queues)
+		dev->_rx[queue_id].pool = pool;
+	if (queue_id < dev->real_num_tx_queues)
+		dev->_tx[queue_id].pool = pool;
+
+	return 0;
 }
 
 static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff_xsk *xskb, u32 len,
@@ -346,37 +324,14 @@ static bool xsk_is_bound(struct xdp_sock *xs)
 	return false;
 }
 
-static bool xsk_dev_queue_valid(const struct xdp_sock *xs,
-				const struct xdp_rxq_info *info)
-{
-	struct net_device *dev = xs->dev;
-	u32 queue_index = xs->queue_id;
-	struct netdev_rx_queue *rxq;
-
-	if (info->dev == dev &&
-	    info->queue_index == queue_index)
-		return true;
-
-	if (queue_index < dev->real_num_rx_queues) {
-		rxq = READ_ONCE(__netif_get_rx_queue(dev, queue_index)->lease);
-		if (!rxq)
-			return false;
-
-		dev = rxq->dev;
-		queue_index = get_netdev_rx_queue_index(rxq);
-
-		return info->dev == dev &&
-		       info->queue_index == queue_index;
-	}
-	return false;
-}
-
 static int xsk_rcv_check(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
 {
 	if (!xsk_is_bound(xs))
 		return -ENXIO;
-	if (!xsk_dev_queue_valid(xs, xdp->rxq))
+
+	if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
 		return -EINVAL;
+
 	if (len > xsk_pool_get_rx_frame_size(xs->pool) && !xs->sg) {
 		xs->rx_dropped++;
 		return -ENOSPC;
diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h
index 7df1056a35fd..e0b579a1df4f 100644
--- a/tools/include/uapi/linux/netdev.h
+++ b/tools/include/uapi/linux/netdev.h
@@ -160,7 +160,6 @@ enum {
 	NETDEV_A_QUEUE_DMABUF,
 	NETDEV_A_QUEUE_IO_URING,
 	NETDEV_A_QUEUE_XSK,
-	NETDEV_A_QUEUE_LEASE,
 
 	__NETDEV_A_QUEUE_MAX,
 	NETDEV_A_QUEUE_MAX = (__NETDEV_A_QUEUE_MAX - 1)
@@ -203,15 +202,6 @@ enum {
 	NETDEV_A_QSTATS_MAX = (__NETDEV_A_QSTATS_MAX - 1)
 };
 
-enum {
-	NETDEV_A_LEASE_IFINDEX = 1,
-	NETDEV_A_LEASE_QUEUE,
-	NETDEV_A_LEASE_NETNS_ID,
-
-	__NETDEV_A_LEASE_MAX,
-	NETDEV_A_LEASE_MAX = (__NETDEV_A_LEASE_MAX - 1)
-};
-
 enum {
 	NETDEV_A_DMABUF_IFINDEX = 1,
 	NETDEV_A_DMABUF_QUEUES,
@@ -238,7 +228,6 @@ enum {
 	NETDEV_CMD_BIND_RX,
 	NETDEV_CMD_NAPI_SET,
 	NETDEV_CMD_BIND_TX,
-	NETDEV_CMD_QUEUE_CREATE,
 
 	__NETDEV_CMD_MAX,
 	NETDEV_CMD_MAX = (__NETDEV_CMD_MAX - 1)
diff --git a/tools/testing/selftests/drivers/net/README.rst b/tools/testing/selftests/drivers/net/README.rst
index b94e81c2e030..eb838ae94844 100644
--- a/tools/testing/selftests/drivers/net/README.rst
+++ b/tools/testing/selftests/drivers/net/README.rst
@@ -62,13 +62,6 @@ LOCAL_V4, LOCAL_V6, REMOTE_V4, REMOTE_V6
 
 Local and remote endpoint IP addresses.
 
-LOCAL_PREFIX_V4, LOCAL_PREFIX_V6
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Local IP prefix/subnet which can be used to allocate extra IP addresses (for
-network name spaces behind macvlan, veth, netkit devices). DUT must be
-reachable using these addresses from the endpoint.
-
 REMOTE_TYPE
 ~~~~~~~~~~~
 
diff --git a/tools/testing/selftests/drivers/net/hw/Makefile b/tools/testing/selftests/drivers/net/hw/Makefile
index 39ad86d693b3..9c163ba6feee 100644
--- a/tools/testing/selftests/drivers/net/hw/Makefile
+++ b/tools/testing/selftests/drivers/net/hw/Makefile
@@ -32,8 +32,6 @@ TEST_PROGS = \
 	irq.py \
 	loopback.sh \
 	nic_timestamp.py \
-	nk_netns.py \
-	nk_qlease.py \
 	pp_alloc_fail.py \
 	rss_api.py \
 	rss_ctx.py \
diff --git a/tools/testing/selftests/drivers/net/hw/lib/py/__init__.py b/tools/testing/selftests/drivers/net/hw/lib/py/__init__.py
index 022008249313..d5d247eca6b7 100644
--- a/tools/testing/selftests/drivers/net/hw/lib/py/__init__.py
+++ b/tools/testing/selftests/drivers/net/hw/lib/py/__init__.py
@@ -3,7 +3,6 @@
 """
 Driver test environment (hardware-only tests).
 NetDrvEnv and NetDrvEpEnv are the main environment classes.
-NetDrvContEnv extends NetDrvEpEnv with netkit container support.
 Former is for local host only tests, latter creates / connects
 to a remote endpoint. See NIPA wiki for more information about
 running and writing driver tests.
@@ -30,7 +29,7 @@ try:
     from net.lib.py import ksft_eq, ksft_ge, ksft_in, ksft_is, ksft_lt, \
         ksft_ne, ksft_not_in, ksft_raises, ksft_true, ksft_gt, ksft_not_none
     from drivers.net.lib.py import GenerateTraffic, Remote, Iperf3Runner
-    from drivers.net.lib.py import NetDrvEnv, NetDrvEpEnv, NetDrvContEnv
+    from drivers.net.lib.py import NetDrvEnv, NetDrvEpEnv
 
     __all__ = ["NetNS", "NetNSEnter", "NetdevSimDev",
                "EthtoolFamily", "NetdevFamily", "NetshaperFamily",
@@ -45,8 +44,8 @@ try:
                "ksft_eq", "ksft_ge", "ksft_in", "ksft_is", "ksft_lt",
                "ksft_ne", "ksft_not_in", "ksft_raises", "ksft_true", "ksft_gt",
                "ksft_not_none", "ksft_not_none",
-               "NetDrvEnv", "NetDrvEpEnv", "NetDrvContEnv", "GenerateTraffic",
-               "Remote", "Iperf3Runner"]
+               "NetDrvEnv", "NetDrvEpEnv", "GenerateTraffic", "Remote",
+               "Iperf3Runner"]
 except ModuleNotFoundError as e:
     print("Failed importing `net` library from kernel sources")
     print(str(e))
diff --git a/tools/testing/selftests/drivers/net/hw/nk_forward.bpf.c b/tools/testing/selftests/drivers/net/hw/nk_forward.bpf.c
deleted file mode 100644
index 86ebfc1445b6..000000000000
--- a/tools/testing/selftests/drivers/net/hw/nk_forward.bpf.c
+++ /dev/null
@@ -1,49 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <linux/bpf.h>
-#include <linux/pkt_cls.h>
-#include <linux/if_ether.h>
-#include <linux/ipv6.h>
-#include <linux/in6.h>
-#include <bpf/bpf_endian.h>
-#include <bpf/bpf_helpers.h>
-
-#define TC_ACT_OK 0
-#define ETH_P_IPV6 0x86DD
-
-#define ctx_ptr(field)		((void *)(long)(field))
-
-#define v6_p64_equal(a, b)	(a.s6_addr32[0] == b.s6_addr32[0] && \
-				 a.s6_addr32[1] == b.s6_addr32[1])
-
-volatile __u32 netkit_ifindex;
-volatile __u8 ipv6_prefix[16];
-
-SEC("tc/ingress")
-int tc_redirect_peer(struct __sk_buff *skb)
-{
-	void *data_end = ctx_ptr(skb->data_end);
-	void *data = ctx_ptr(skb->data);
-	struct in6_addr *peer_addr;
-	struct ipv6hdr *ip6h;
-	struct ethhdr *eth;
-
-	peer_addr = (struct in6_addr *)ipv6_prefix;
-
-	if (skb->protocol != bpf_htons(ETH_P_IPV6))
-		return TC_ACT_OK;
-
-	eth = data;
-	if ((void *)(eth + 1) > data_end)
-		return TC_ACT_OK;
-
-	ip6h = data + sizeof(struct ethhdr);
-	if ((void *)(ip6h + 1) > data_end)
-		return TC_ACT_OK;
-
-	if (!v6_p64_equal(ip6h->daddr, (*peer_addr)))
-		return TC_ACT_OK;
-
-	return bpf_redirect_peer(netkit_ifindex, 0);
-}
-
-char __license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/drivers/net/hw/nk_netns.py b/tools/testing/selftests/drivers/net/hw/nk_netns.py
deleted file mode 100755
index afa8638195d8..000000000000
--- a/tools/testing/selftests/drivers/net/hw/nk_netns.py
+++ /dev/null
@@ -1,23 +0,0 @@
-#!/usr/bin/env python3
-# SPDX-License-Identifier: GPL-2.0
-
-from lib.py import ksft_run, ksft_exit
-from lib.py import NetDrvContEnv
-from lib.py import cmd
-
-
-def test_ping(cfg) -> None:
-    cfg.require_ipver("6")
-
-    cmd(f"ping -c 1 -W5 {cfg.nk_guest_ipv6}", host=cfg.remote)
-    cmd(f"ping -c 1 -W5 {cfg.remote_addr_v['6']}", ns=cfg.netns)
-
-
-def main() -> None:
-    with NetDrvContEnv(__file__) as cfg:
-        ksft_run([test_ping], args=(cfg,))
-    ksft_exit()
-
-
-if __name__ == "__main__":
-    main()
diff --git a/tools/testing/selftests/drivers/net/hw/nk_qlease.py b/tools/testing/selftests/drivers/net/hw/nk_qlease.py
deleted file mode 100755
index 738a46d2d20c..000000000000
--- a/tools/testing/selftests/drivers/net/hw/nk_qlease.py
+++ /dev/null
@@ -1,55 +0,0 @@
-#!/usr/bin/env python3
-# SPDX-License-Identifier: GPL-2.0
-
-import re
-from os import path
-from lib.py import ksft_run, ksft_exit
-from lib.py import NetDrvContEnv
-from lib.py import bkg, cmd, defer, ethtool, rand_port, wait_port_listen
-
-
-def create_rss_ctx(cfg):
-    output = ethtool(f"-X {cfg.ifname} context new start {cfg.src_queue} equal 1").stdout
-    values = re.search(r'New RSS context is (\d+)', output).group(1)
-    return int(values)
-
-
-def set_flow_rule(cfg):
-    output = ethtool(f"-N {cfg.ifname} flow-type tcp6 dst-port {cfg.port} action {cfg.src_queue}").stdout
-    values = re.search(r'ID (\d+)', output).group(1)
-    return int(values)
-
-
-def set_flow_rule_rss(cfg, rss_ctx_id):
-    output = ethtool(f"-N {cfg.ifname} flow-type tcp6 dst-port {cfg.port} context {rss_ctx_id}").stdout
-    values = re.search(r'ID (\d+)', output).group(1)
-    return int(values)
-
-
-def test_iou_zcrx(cfg) -> None:
-    cfg.require_ipver('6')
-
-    ethtool(f"-X {cfg.ifname} equal {cfg.src_queue}")
-    defer(ethtool, f"-X {cfg.ifname} default")
-
-    flow_rule_id = set_flow_rule(cfg)
-    defer(ethtool, f"-N {cfg.ifname} delete {flow_rule_id}")
-
-    rx_cmd = f"ip netns exec {cfg.netns.name} {cfg.bin_local} -s -p {cfg.port} -i {cfg._nk_guest_ifname} -q {cfg.nk_queue}"
-    tx_cmd = f"{cfg.bin_remote} -c -h {cfg.nk_guest_ipv6} -p {cfg.port} -l 12840"
-    with bkg(rx_cmd, exit_wait=True):
-        wait_port_listen(cfg.port, proto="tcp", ns=cfg.netns)
-        cmd(tx_cmd, host=cfg.remote)
-
-
-def main() -> None:
-    with NetDrvContEnv(__file__, lease=True) as cfg:
-        cfg.bin_local = path.abspath(path.dirname(__file__) + "/../../../drivers/net/hw/iou-zcrx")
-        cfg.bin_remote = cfg.remote.deploy(cfg.bin_local)
-        cfg.port = rand_port()
-        ksft_run([test_iou_zcrx], args=(cfg,))
-    ksft_exit()
-
-
-if __name__ == "__main__":
-    main()
diff --git a/tools/testing/selftests/drivers/net/lib/py/__init__.py b/tools/testing/selftests/drivers/net/lib/py/__init__.py
index be3a8a936882..8b75faa9af6d 100644
--- a/tools/testing/selftests/drivers/net/lib/py/__init__.py
+++ b/tools/testing/selftests/drivers/net/lib/py/__init__.py
@@ -3,7 +3,6 @@
 """
 Driver test environment.
 NetDrvEnv and NetDrvEpEnv are the main environment classes.
-NetDrvContEnv extends NetDrvEpEnv with netkit container support.
 Former is for local host only tests, latter creates / connects
 to a remote endpoint. See NIPA wiki for more information about
 running and writing driver tests.
@@ -44,12 +43,12 @@ try:
                "ksft_ne", "ksft_not_in", "ksft_raises", "ksft_true", "ksft_gt",
                "ksft_not_none", "ksft_not_none"]
 
-    from .env import NetDrvEnv, NetDrvEpEnv, NetDrvContEnv
+    from .env import NetDrvEnv, NetDrvEpEnv
     from .load import GenerateTraffic, Iperf3Runner
     from .remote import Remote
 
-    __all__ += ["NetDrvEnv", "NetDrvEpEnv", "NetDrvContEnv", "GenerateTraffic",
-                "Remote", "Iperf3Runner"]
+    __all__ += ["NetDrvEnv", "NetDrvEpEnv", "GenerateTraffic", "Remote",
+                "Iperf3Runner"]
 except ModuleNotFoundError as e:
     print("Failed importing `net` library from kernel sources")
     print(str(e))
diff --git a/tools/testing/selftests/drivers/net/lib/py/env.py b/tools/testing/selftests/drivers/net/lib/py/env.py
index 7066d78395c6..41cc248ac848 100644
--- a/tools/testing/selftests/drivers/net/lib/py/env.py
+++ b/tools/testing/selftests/drivers/net/lib/py/env.py
@@ -1,17 +1,13 @@
 # SPDX-License-Identifier: GPL-2.0
 
-import ipaddress
 import os
-import re
 import time
 from pathlib import Path
 from lib.py import KsftSkipEx, KsftXfailEx
 from lib.py import ksft_setup, wait_file
 from lib.py import cmd, ethtool, ip, CmdExitFailure
 from lib.py import NetNS, NetdevSimDev
-from lib.py import NetdevFamily, EthtoolFamily
 from .remote import Remote
-from . import bpftool
 
 
 class NetDrvEnvBase:
@@ -293,156 +289,3 @@ class NetDrvEpEnv(NetDrvEnvBase):
                 data.get('stats-block-usecs', 0) / 1000 / 1000
 
         time.sleep(self._stats_settle_time)
-
-
-class NetDrvContEnv(NetDrvEpEnv):
-    """
-    Class for an environment with a netkit pair setup for forwarding traffic
-    between the physical interface and a network namespace.
-    """
-
-    def __init__(self, src_path, lease=False, **kwargs):
-        super().__init__(src_path, **kwargs)
-
-        self.require_ipver("6")
-        local_prefix = self.env.get("LOCAL_PREFIX_V6")
-        if not local_prefix:
-            raise KsftSkipEx("LOCAL_PREFIX_V6 required")
-
-        self.netdevnl = NetdevFamily()
-        self.ethnl = EthtoolFamily()
-
-        local_prefix = local_prefix.rstrip("/64").rstrip("::").rstrip(":")
-        self.ipv6_prefix = f"{local_prefix}::"
-        self.nk_host_ipv6 = f"{local_prefix}::2:1"
-        self.nk_guest_ipv6 = f"{local_prefix}::2:2"
-
-        self.netns = None
-        self._nk_host_ifname = None
-        self._nk_guest_ifname = None
-        self._tc_attached = False
-        self._bpf_prog_pref = None
-        self._bpf_prog_id = None
-        self._leased = False
-
-        nk_rxqueues = 1
-        if lease:
-            nk_rxqueues = 2
-        ip(f"link add type netkit mode l2 forward peer forward numrxqueues {nk_rxqueues}")
-
-        all_links = ip("-d link show", json=True)
-        netkit_links = [link for link in all_links
-                        if link.get('linkinfo', {}).get('info_kind') == 'netkit'
-                        and 'UP' not in link.get('flags', [])]
-
-        if len(netkit_links) != 2:
-            raise KsftSkipEx("Failed to create netkit pair")
-
-        netkit_links.sort(key=lambda x: x['ifindex'])
-        self._nk_host_ifname = netkit_links[1]['ifname']
-        self._nk_guest_ifname = netkit_links[0]['ifname']
-        self.nk_host_ifindex = netkit_links[1]['ifindex']
-        self.nk_guest_ifindex = netkit_links[0]['ifindex']
-
-        if lease:
-            self._lease_queues()
-
-        self._setup_ns()
-        self._attach_bpf()
-
-    def __del__(self):
-        if self._tc_attached:
-            cmd(f"tc filter del dev {self.ifname} ingress pref {self._bpf_prog_pref}")
-            self._tc_attached = False
-
-        if self._nk_host_ifname:
-            cmd(f"ip link del dev {self._nk_host_ifname}")
-            self._nk_host_ifname = None
-            self._nk_guest_ifname = None
-
-        if self.netns:
-            del self.netns
-            self.netns = None
-
-        if self._leased:
-            self.ethnl.rings_set({'header': {'dev-index': self.ifindex},
-                                  'tcp-data-split': 'unknown',
-                                  'hds-thresh': self._hds_thresh,
-                                  'rx': self._rx_rings})
-            self._leased = False
-
-        super().__del__()
-
-    def _lease_queues(self):
-        channels = self.ethnl.channels_get({'header': {'dev-index': self.ifindex}})
-        channels = channels['combined-count']
-        if channels < 2:
-            raise KsftSkipEx('Test requires NETIF with at least 2 combined channels')
-
-        rings = self.ethnl.rings_get({'header': {'dev-index': self.ifindex}})
-        self._rx_rings = rings['rx']
-        self._hds_thresh = rings.get('hds-thresh', 0)
-        self.ethnl.rings_set({'header': {'dev-index': self.ifindex},
-                            'tcp-data-split': 'enabled',
-                            'hds-thresh': 0,
-                            'rx': 64})
-        self.src_queue = channels - 1
-        bind_result = self.netdevnl.queue_create(
-            {
-                "ifindex": self.nk_guest_ifindex,
-                "type": "rx",
-                "lease": {
-                    "ifindex": self.ifindex,
-                    "queue": {"id": self.src_queue, "type": "rx"},
-                },
-            }
-        )
-        self.nk_queue = bind_result['id']
-        self._leased = True
-
-    def _setup_ns(self):
-        self.netns = NetNS()
-        ip(f"link set dev {self._nk_guest_ifname} netns {self.netns.name}")
-        ip(f"link set dev {self._nk_host_ifname} up")
-        ip(f"-6 addr add fe80::1/64 dev {self._nk_host_ifname} nodad")
-        ip(f"-6 route add {self.nk_guest_ipv6}/128 via fe80::2 dev {self._nk_host_ifname}")
-
-        ip("link set lo up", ns=self.netns)
-        ip(f"link set dev {self._nk_guest_ifname} up", ns=self.netns)
-        ip(f"-6 addr add fe80::2/64 dev {self._nk_guest_ifname}", ns=self.netns)
-        ip(f"-6 addr add {self.nk_guest_ipv6}/64 dev {self._nk_guest_ifname} nodad", ns=self.netns)
-        ip(f"-6 route add default via fe80::1 dev {self._nk_guest_ifname}", ns=self.netns)
-
-    def _attach_bpf(self):
-        bpf_obj = self.test_dir / "nk_forward.bpf.o"
-        if not bpf_obj.exists():
-            raise KsftSkipEx("BPF prog not found")
-
-        cmd(f"tc filter add dev {self.ifname} ingress bpf obj {bpf_obj} sec tc/ingress direct-action")
-        self._tc_attached = True
-
-        tc_info = cmd(f"tc filter show dev {self.ifname} ingress").stdout
-        match = re.search(r'pref (\d+).*nk_forward\.bpf.*id (\d+)', tc_info)
-        if not match:
-            raise Exception("Failed to get BPF prog ID")
-        self._bpf_prog_pref = int(match.group(1))
-        self._bpf_prog_id = int(match.group(2))
-
-        prog_info = bpftool(f"prog show id {self._bpf_prog_id}", json=True)
-        map_ids = prog_info.get("map_ids", [])
-
-        bss_map_id = None
-        for map_id in map_ids:
-            map_info = bpftool(f"map show id {map_id}", json=True)
-            if map_info.get("name").endswith("bss"):
-                bss_map_id = map_id
-
-        if bss_map_id is None:
-            raise Exception("Failed to find .bss map")
-
-        ipv6_addr = ipaddress.IPv6Address(self.ipv6_prefix)
-        ipv6_bytes = ipv6_addr.packed
-        ifindex_bytes = self.nk_host_ifindex.to_bytes(4, byteorder='little')
-        value = ipv6_bytes + ifindex_bytes
-        value_hex = ' '.join(f'{b:02x}' for b in value)
-        bpftool(f"map update id {bss_map_id} key hex 00 00 00 00 value hex {value_hex}")
-- 
cgit v1.2.3


From 49743f27268ffbe2029d9c8fdfbd04d0869bd51d Mon Sep 17 00:00:00 2001
From: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Date: Fri, 16 Jan 2026 06:21:21 +0000
Subject: selftests: drv-net: extend HW timestamp test with ioctl

Extend HW timestamp tests to check that ioctl interface is not broken
and configuration setups and requests are equal to netlink interface.
Some linter warnings are disabled because of ctypes classes.

Reviewed-by: Kory Maincent <kory.maincent@bootlin.com>
Signed-off-by: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Link: https://patch.msgid.link/20260116062121.1230184-2-vadim.fedorenko@linux.dev
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../selftests/drivers/net/hw/nic_timestamp.py      | 128 +++++++++++++++++++--
 1 file changed, 120 insertions(+), 8 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/drivers/net/hw/nic_timestamp.py b/tools/testing/selftests/drivers/net/hw/nic_timestamp.py
index c1e943d53f19..c632b41e7a23 100755
--- a/tools/testing/selftests/drivers/net/hw/nic_timestamp.py
+++ b/tools/testing/selftests/drivers/net/hw/nic_timestamp.py
@@ -1,15 +1,38 @@
 #!/usr/bin/env python3
 # SPDX-License-Identifier: GPL-2.0
+# pylint: disable=locally-disabled, invalid-name, attribute-defined-outside-init, too-few-public-methods
 
 """
 Tests related to configuration of HW timestamping
 """
 
 import errno
+import ctypes
+import fcntl
+import socket
 from lib.py import ksft_run, ksft_exit, ksft_ge, ksft_eq, KsftSkipEx
 from lib.py import NetDrvEnv, EthtoolFamily, NlError
 
 
+SIOCSHWTSTAMP = 0x89b0
+SIOCGHWTSTAMP = 0x89b1
+class hwtstamp_config(ctypes.Structure):
+    """ Python copy of struct hwtstamp_config """
+    _fields_ = [
+        ("flags", ctypes.c_int),
+        ("tx_type", ctypes.c_int),
+        ("rx_filter", ctypes.c_int),
+    ]
+
+
+class ifreq(ctypes.Structure):
+    """ Python copy of struct ifreq """
+    _fields_ = [
+        ("ifr_name", ctypes.c_char * 16),
+        ("ifr_data", ctypes.POINTER(hwtstamp_config)),
+    ]
+
+
 def __get_hwtimestamp_support(cfg):
     """ Retrieve supported configuration information """
 
@@ -31,8 +54,29 @@ def __get_hwtimestamp_support(cfg):
     return ctx
 
 
+def __get_hwtimestamp_config_ioctl(cfg):
+    """ Retrieve current TS configuration information (via ioctl) """
+
+    config = hwtstamp_config()
+
+    req = ifreq()
+    req.ifr_name = cfg.ifname.encode()
+    req.ifr_data = ctypes.pointer(config)
+
+    try:
+        sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
+        fcntl.ioctl(sock.fileno(), SIOCGHWTSTAMP, req)
+        sock.close()
+
+    except OSError as e:
+        if e.errno == errno.EOPNOTSUPP:
+            raise KsftSkipEx("timestamping configuration is not supported via ioctl") from e
+        raise
+    return config
+
+
 def __get_hwtimestamp_config(cfg):
-    """ Retrieve current TS configuration information """
+    """ Retrieve current TS configuration information (via netLink) """
 
     try:
         tscfg = cfg.ethnl.tsconfig_get({'header': {'dev-name': cfg.ifname}})
@@ -43,8 +87,27 @@ def __get_hwtimestamp_config(cfg):
     return tscfg
 
 
+def __set_hwtimestamp_config_ioctl(cfg, ts):
+    """ Setup new TS configuration information (via ioctl) """
+    config = hwtstamp_config()
+    config.rx_filter = ts['rx-filters']['bits']['bit'][0]['index']
+    config.tx_type = ts['tx-types']['bits']['bit'][0]['index']
+    req = ifreq()
+    req.ifr_name = cfg.ifname.encode()
+    req.ifr_data = ctypes.pointer(config)
+    try:
+        sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
+        fcntl.ioctl(sock.fileno(), SIOCSHWTSTAMP, req)
+        sock.close()
+
+    except OSError as e:
+        if e.errno == errno.EOPNOTSUPP:
+            raise KsftSkipEx("timestamping configuration is not supported via ioctl") from e
+        raise
+
+
 def __set_hwtimestamp_config(cfg, ts):
-    """ Setup new TS configuration information """
+    """ Setup new TS configuration information (via netlink) """
 
     ts['header'] = {'dev-name': cfg.ifname}
     try:
@@ -56,9 +119,9 @@ def __set_hwtimestamp_config(cfg, ts):
     return res
 
 
-def test_hwtstamp_tx(cfg):
+def __perform_hwtstamp_tx(cfg, is_ioctl):
     """
-    Test TX timestamp configuration.
+    Test TX timestamp configuration via either netlink or ioctl.
     The driver should apply provided config and report back proper state.
     """
 
@@ -66,16 +129,37 @@ def test_hwtstamp_tx(cfg):
     ts = __get_hwtimestamp_support(cfg)
     tx = ts['tx']
     for t in tx:
+        res = None
         tscfg = orig_tscfg
         tscfg['tx-types']['bits']['bit'] = [t]
-        res = __set_hwtimestamp_config(cfg, tscfg)
+        if is_ioctl:
+            __set_hwtimestamp_config_ioctl(cfg, tscfg)
+        else:
+            res = __set_hwtimestamp_config(cfg, tscfg)
         if res is None:
             res = __get_hwtimestamp_config(cfg)
+        resioctl = __get_hwtimestamp_config_ioctl(cfg)
         ksft_eq(res['tx-types']['bits']['bit'], [t])
+        ksft_eq(resioctl.tx_type, t['index'])
     __set_hwtimestamp_config(cfg, orig_tscfg)
 
+def test_hwtstamp_tx_netlink(cfg):
+    """
+    Test TX timestamp configuration setup via netlink.
+    The driver should apply provided config and report back proper state.
+    """
+    __perform_hwtstamp_tx(cfg, False)
+
+
+def test_hwtstamp_tx_ioctl(cfg):
+    """
+    Test TX timestamp configuration setup via ioctl.
+    The driver should apply provided config and report back proper state.
+    """
+    __perform_hwtstamp_tx(cfg, True)
+
 
-def test_hwtstamp_rx(cfg):
+def __perform_hwtstamp_rx(cfg, is_ioctl):
     """
     Test RX timestamp configuration.
     The filter configuration is taken from the list of supported filters.
@@ -87,11 +171,17 @@ def test_hwtstamp_rx(cfg):
     ts = __get_hwtimestamp_support(cfg)
     rx = ts['rx']
     for r in rx:
+        res = None
         tscfg = orig_tscfg
         tscfg['rx-filters']['bits']['bit'] = [r]
-        res = __set_hwtimestamp_config(cfg, tscfg)
+        if is_ioctl:
+            __set_hwtimestamp_config_ioctl(cfg, tscfg)
+        else:
+            res = __set_hwtimestamp_config(cfg, tscfg)
         if res is None:
             res = __get_hwtimestamp_config(cfg)
+        resioctl = __get_hwtimestamp_config_ioctl(cfg)
+        ksft_eq(resioctl.rx_filter, res['rx-filters']['bits']['bit'][0]['index'])
         if r['index'] == 0 or r['index'] == 1:
             ksft_eq(res['rx-filters']['bits']['bit'][0]['index'], r['index'])
         else:
@@ -100,12 +190,34 @@ def test_hwtstamp_rx(cfg):
     __set_hwtimestamp_config(cfg, orig_tscfg)
 
 
+def test_hwtstamp_rx_netlink(cfg):
+    """
+    Test RX timestamp configuration via netlink.
+    The filter configuration is taken from the list of supported filters.
+    The driver should apply the config without error and report back proper state.
+    Some extension of the timestamping scope is allowed for PTP filters.
+    """
+    __perform_hwtstamp_rx(cfg, False)
+
+
+def test_hwtstamp_rx_ioctl(cfg):
+    """
+    Test RX timestamp configuration via ioctl.
+    The filter configuration is taken from the list of supported filters.
+    The driver should apply the config without error and report back proper state.
+    Some extension of the timestamping scope is allowed for PTP filters.
+    """
+    __perform_hwtstamp_rx(cfg, True)
+
+
 def main() -> None:
     """ Ksft boiler plate main """
 
     with NetDrvEnv(__file__, nsim_test=False) as cfg:
         cfg.ethnl = EthtoolFamily()
-        ksft_run([test_hwtstamp_tx, test_hwtstamp_rx], args=(cfg,))
+        ksft_run([test_hwtstamp_tx_ioctl, test_hwtstamp_tx_netlink,
+                  test_hwtstamp_rx_ioctl, test_hwtstamp_rx_netlink],
+                 args=(cfg,))
         ksft_exit()
 
 
-- 
cgit v1.2.3


From 1802e9079f65cbe47d90d048b06df650a91407f4 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Tue, 20 Jan 2026 10:03:19 -0800
Subject: selftests: drv-net: fix missing include in ncdevmem
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Commit ca9d74eb5f6a ("uapi: add INT_MAX and INT_MIN constants")
recently removed some includes of limits.h in uAPI headers.
ncdevmem.c was depending on them:

  ncdevmem.c: In function ‘ethtool_add_flow’:
  ncdevmem.c:369:60: error: ‘INT_MAX’ undeclared (first use in this function)
  369 |         if (endptr == id_start || flow_id < 0 || flow_id > INT_MAX)
      |                                                            ^~~~~~~
  ncdevmem.c:77:1: note: ‘INT_MAX’ is defined in header ‘<limits.h>’; did you forget to ‘#include <limits.h>’?

Reviewed-by: Mina Almasry <almasrymina@google.com>
Link: https://patch.msgid.link/20260120180319.1673271-1-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/drivers/net/hw/ncdevmem.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/drivers/net/hw/ncdevmem.c b/tools/testing/selftests/drivers/net/hw/ncdevmem.c
index 3288ed04ce08..16864c844108 100644
--- a/tools/testing/selftests/drivers/net/hw/ncdevmem.c
+++ b/tools/testing/selftests/drivers/net/hw/ncdevmem.c
@@ -48,6 +48,7 @@
 #include <errno.h>
 #define __iovec_defined
 #include <fcntl.h>
+#include <limits.h>
 #include <malloc.h>
 #include <error.h>
 #include <poll.h>
-- 
cgit v1.2.3


From 9f5edd785da3cf373285259928d7f1f08c9ce758 Mon Sep 17 00:00:00 2001
From: Kaushlendra Kumar <kaushlendra.kumar@intel.com>
Date: Tue, 9 Dec 2025 08:47:45 +0530
Subject: tools/mm/thp_swap_allocator_test: fix small folio alignment

Use ALIGNMENT_SMALLFOLIO instead of ALIGNMENT_MTHP when allocating small
folios to ensure correct memory alignment for the test case.

Before: test allocates small folios with 64KB alignment
(ALIGNMENT_MTHP) when only 4KB alignment (ALIGNMENT_SMALLFOLIO) is
needed.  This wastes address space and may cause allocation failures on
systems with fragmented memory.

Worst-case impact: this only affects thp_swap_allocator_test tool
behavior.

Link: https://lkml.kernel.org/r/20251209031745.2723120-1-kaushlendra.kumar@intel.com
Signed-off-by: Kaushlendra Kumar <kaushlendra.kumar@intel.com>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/mm/thp_swap_allocator_test.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/mm/thp_swap_allocator_test.c b/tools/mm/thp_swap_allocator_test.c
index 83afc52275a5..d4434df3dcff 100644
--- a/tools/mm/thp_swap_allocator_test.c
+++ b/tools/mm/thp_swap_allocator_test.c
@@ -142,7 +142,7 @@ int main(int argc, char *argv[])
 	}
 
 	if (use_small_folio) {
-		mem2 = aligned_alloc_mem(MEMSIZE_SMALLFOLIO, ALIGNMENT_MTHP);
+		mem2 = aligned_alloc_mem(MEMSIZE_SMALLFOLIO, ALIGNMENT_SMALLFOLIO);
 		if (mem2 == NULL) {
 			fprintf(stderr, "Failed to allocate small folios memory\n");
 			free(mem1);
-- 
cgit v1.2.3


From 8b8017d7c411403731ee4d502cdbd76e9425f0e1 Mon Sep 17 00:00:00 2001
From: Kaushlendra Kumar <kaushlendra.kumar@intel.com>
Date: Mon, 8 Dec 2025 16:22:40 +0530
Subject: tools/mm/slabinfo: fix --partial long option mapping

The long option "--partial" was incorrectly mapped to lowercase 'p' in the
opts[] array, but the getopt string and switch case handle uppercase 'P'.
This mismatch caused --partial to be rejected.

Fix the long_options mapping to use 'P' so --partial works correctly
alongside the existing -P short option.

Link: https://lkml.kernel.org/r/20251208105240.2719773-1-kaushlendra.kumar@intel.com
Signed-off-by: Kaushlendra Kumar <kaushlendra.kumar@intel.com>
Reviewed-by: SeongJae Park <sj@kernel.org>
Tested-by: SeongJae Park <sj@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/mm/slabinfo.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/mm/slabinfo.c b/tools/mm/slabinfo.c
index 80cdbd3db82d..54c7265ab52d 100644
--- a/tools/mm/slabinfo.c
+++ b/tools/mm/slabinfo.c
@@ -1405,7 +1405,7 @@ struct option opts[] = {
 	{ "numa", no_argument, NULL, 'n' },
 	{ "lines", required_argument, NULL, 'N'},
 	{ "ops", no_argument, NULL, 'o' },
-	{ "partial", no_argument, NULL, 'p'},
+	{ "partial", no_argument, NULL, 'P'},
 	{ "report", no_argument, NULL, 'r' },
 	{ "shrink", no_argument, NULL, 's' },
 	{ "Size", no_argument, NULL, 'S'},
-- 
cgit v1.2.3


From a98ec863fdedf4940447f32ceda7d937bebd06a2 Mon Sep 17 00:00:00 2001
From: Audra Mitchell <audra@redhat.com>
Date: Mon, 1 Dec 2025 13:18:48 -0500
Subject: lib/test_vmalloc.c: minor fixes to test_vmalloc.c

If PAGE_SIZE is larger than 4k and if you have a system with a large
number of CPUs, this test can require a very large amount of memory
leading to oom-killer firing.  Given the type of allocation, the kernel
won't have anything to kill, causing the system to stall.

Add a parameter to the test_vmalloc driver to represent the number of
times a percpu object will be allocated.  Calculate this in
test_vmalloc.sh to be 90% of available memory or the current default of
35000, whichever is smaller.

Link: https://lkml.kernel.org/r/20251201181848.1216197-1-audra@redhat.com
Signed-off-by: Audra Mitchell <audra@redhat.com>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Rafael Aquini <raquini@redhat.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 lib/test_vmalloc.c                         | 11 +++++++----
 tools/testing/selftests/mm/test_vmalloc.sh | 31 +++++++++++++++++++++++++++---
 2 files changed, 35 insertions(+), 7 deletions(-)

(limited to 'tools')

diff --git a/lib/test_vmalloc.c b/lib/test_vmalloc.c
index 6521c05c7816..270b6f7ca807 100644
--- a/lib/test_vmalloc.c
+++ b/lib/test_vmalloc.c
@@ -58,6 +58,9 @@ __param(int, run_test_mask, 7,
 		/* Add a new test case description here. */
 );
 
+__param(int, nr_pcpu_objects, 35000,
+	"Number of pcpu objects to allocate for pcpu_alloc_test");
+
 /*
  * This is for synchronization of setup phase.
  */
@@ -317,24 +320,24 @@ pcpu_alloc_test(void)
 	size_t size, align;
 	int i;
 
-	pcpu = vmalloc(sizeof(void __percpu *) * 35000);
+	pcpu = vmalloc(sizeof(void __percpu *) * nr_pcpu_objects);
 	if (!pcpu)
 		return -1;
 
-	for (i = 0; i < 35000; i++) {
+	for (i = 0; i < nr_pcpu_objects; i++) {
 		size = get_random_u32_inclusive(1, PAGE_SIZE / 4);
 
 		/*
 		 * Maximum PAGE_SIZE
 		 */
-		align = 1 << get_random_u32_inclusive(1, 11);
+		align = 1 << get_random_u32_inclusive(1, PAGE_SHIFT - 1);
 
 		pcpu[i] = __alloc_percpu(size, align);
 		if (!pcpu[i])
 			rv = -1;
 	}
 
-	for (i = 0; i < 35000; i++)
+	for (i = 0; i < nr_pcpu_objects; i++)
 		free_percpu(pcpu[i]);
 
 	vfree(pcpu);
diff --git a/tools/testing/selftests/mm/test_vmalloc.sh b/tools/testing/selftests/mm/test_vmalloc.sh
index d39096723fca..b23d705bf570 100755
--- a/tools/testing/selftests/mm/test_vmalloc.sh
+++ b/tools/testing/selftests/mm/test_vmalloc.sh
@@ -13,6 +13,9 @@ TEST_NAME="vmalloc"
 DRIVER="test_${TEST_NAME}"
 NUM_CPUS=`grep -c ^processor /proc/cpuinfo`
 
+# Default number of times we allocate percpu objects:
+NR_PCPU_OBJECTS=35000
+
 # 1 if fails
 exitcode=1
 
@@ -27,6 +30,8 @@ PERF_PARAM="sequential_test_order=1 test_repeat_count=3"
 SMOKE_PARAM="test_loop_count=10000 test_repeat_count=10"
 STRESS_PARAM="nr_threads=$NUM_CPUS test_repeat_count=20"
 
+PCPU_OBJ_PARAM="nr_pcpu_objects=$NR_PCPU_OBJECTS"
+
 check_test_requirements()
 {
 	uid=$(id -u)
@@ -47,12 +52,30 @@ check_test_requirements()
 	fi
 }
 
+check_memory_requirement()
+{
+	# The pcpu_alloc_test allocates nr_pcpu_objects per cpu. If the
+	# PAGE_SIZE is on the larger side it is easier to set a value
+	# that can cause oom events during testing. Since we are
+	# testing the functionality of vmalloc and not the oom-killer,
+	# calculate what is 90% of available memory and divide it by
+	# the number of online CPUs.
+	pages=$(($(getconf _AVPHYS_PAGES) * 90 / 100 / $NUM_CPUS))
+
+	if (($pages < $NR_PCPU_OBJECTS)); then
+		echo "Updated nr_pcpu_objects to 90% of available memory."
+		echo "nr_pcpu_objects is now set to: $pages."
+		PCPU_OBJ_PARAM="nr_pcpu_objects=$pages"
+	fi
+}
+
 run_performance_check()
 {
 	echo "Run performance tests to evaluate how fast vmalloc allocation is."
 	echo "It runs all test cases on one single CPU with sequential order."
 
-	modprobe $DRIVER $PERF_PARAM > /dev/null 2>&1
+	check_memory_requirement
+	modprobe $DRIVER $PERF_PARAM $PCPU_OBJ_PARAM > /dev/null 2>&1
 	echo "Done."
 	echo "Check the kernel message buffer to see the summary."
 }
@@ -63,7 +86,8 @@ run_stability_check()
 	echo "available test cases are run by NUM_CPUS workers simultaneously."
 	echo "It will take time, so be patient."
 
-	modprobe $DRIVER $STRESS_PARAM > /dev/null 2>&1
+	check_memory_requirement
+	modprobe $DRIVER $STRESS_PARAM $PCPU_OBJ_PARAM > /dev/null 2>&1
 	echo "Done."
 	echo "Check the kernel ring buffer to see the summary."
 }
@@ -74,7 +98,8 @@ run_smoke_check()
 	echo "Please check $0 output how it can be used"
 	echo "for deep performance analysis as well as stress testing."
 
-	modprobe $DRIVER $SMOKE_PARAM > /dev/null 2>&1
+	check_memory_requirement
+	modprobe $DRIVER $SMOKE_PARAM $PCPU_OBJ_PARAM > /dev/null 2>&1
 	echo "Done."
 	echo "Check the kernel ring buffer to see the summary."
 }
-- 
cgit v1.2.3


From 8e46adb62fae98a866baa7c23f6ed3bfe02e6f88 Mon Sep 17 00:00:00 2001
From: Li Wang <liwang@redhat.com>
Date: Sun, 21 Dec 2025 20:26:37 +0800
Subject: selftests/mm/write_to_hugetlbfs: parse -s as size_t

Patch series "selftests/mm: hugetlb cgroup charging: robustness fixes", v3.

This series fixes a few issues in the hugetlb cgroup charging selftests
(write_to_hugetlbfs.c + charge_reserved_hugetlb.sh) that show up on
systems with large hugepages (e.g.  512MB) and when failures cause the
test to wait indefinitely.

On an aarch64 64k page kernel with 512MB hugepages, the test consistently
fails in write_to_hugetlbfs with ENOMEM and then hangs waiting for the
expected usage values.  The root cause is that charge_reserved_hugetlb.sh
mounts hugetlbfs with a fixed size=256M, which is smaller than a single
hugepage, resulting in a mount with size=0 capacity.

In addition, write_to_hugetlbfs previously parsed -s via atoi() into an
int, which can overflow and print negative sizes.

Reproducer / environment:
  - Kernel: 6.12.0-xxx.el10.aarch64+64k
  - Hugepagesize: 524288 kB (512MB)
  - ./charge_reserved_hugetlb.sh -cgroup-v2
  - Observed mount: pagesize=512M,size=0 before this series

After applying the series, the test completes successfully on the above
setup.


This patch (of 3):

write_to_hugetlbfs currently parses the -s size argument with atoi() into
an int.  This silently accepts malformed input, cannot report overflow,
and can truncate large sizes.

=== Error log ===
 # uname -r
 6.12.0-xxx.el10.aarch64+64k

 # ls /sys/kernel/mm/hugepages/hugepages-*
 hugepages-16777216kB/  hugepages-2048kB/  hugepages-524288kB/

 #./charge_reserved_hugetlb.sh -cgroup-v2
 # -----------------------------------------
 ...
 # nr hugepages = 10
 # writing cgroup limit: 5368709120
 # writing reseravation limit: 5368709120
 ...
 # Writing to this path: /mnt/huge/test
 # Writing this size: -1610612736        <--------

Switch the size variable to size_t and parse -s with sscanf("%zu", ...).
Also print the size using %zu.

This avoids incorrect behavior with large -s values and makes the utility
more robust.

Link: https://lkml.kernel.org/r/20251221122639.3168038-1-liwang@redhat.com
Link: https://lkml.kernel.org/r/20251221122639.3168038-2-liwang@redhat.com
Signed-off-by: Li Wang <liwang@redhat.com>
Acked-by: David Hildenbrand (Red Hat) <david@kernel.org>
Acked-by: Waiman Long <longman@redhat.com>
Cc: David Hildenbrand <david@kernel.org>
Cc: Mark Brown <broonie@kernel.org>
Cc: Shuah Khan <shuah@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/testing/selftests/mm/write_to_hugetlbfs.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/mm/write_to_hugetlbfs.c b/tools/testing/selftests/mm/write_to_hugetlbfs.c
index 34c91f7e6128..ecb5f7619960 100644
--- a/tools/testing/selftests/mm/write_to_hugetlbfs.c
+++ b/tools/testing/selftests/mm/write_to_hugetlbfs.c
@@ -68,7 +68,7 @@ int main(int argc, char **argv)
 	int key = 0;
 	int *ptr = NULL;
 	int c = 0;
-	int size = 0;
+	size_t size = 0;
 	char path[256] = "";
 	enum method method = MAX_METHOD;
 	int want_sleep = 0, private = 0;
@@ -86,7 +86,10 @@ int main(int argc, char **argv)
 	while ((c = getopt(argc, argv, "s:p:m:owlrn")) != -1) {
 		switch (c) {
 		case 's':
-			size = atoi(optarg);
+			if (sscanf(optarg, "%zu", &size) != 1) {
+				perror("Invalid -s.");
+				exit_usage();
+			}
 			break;
 		case 'p':
 			strncpy(path, optarg, sizeof(path) - 1);
@@ -131,7 +134,7 @@ int main(int argc, char **argv)
 	}
 
 	if (size != 0) {
-		printf("Writing this size: %d\n", size);
+		printf("Writing this size: %zu\n", size);
 	} else {
 		errno = EINVAL;
 		perror("size not found");
-- 
cgit v1.2.3


From 1aa1dd9cc595917882fb6db67725442956f79607 Mon Sep 17 00:00:00 2001
From: Li Wang <liwang@redhat.com>
Date: Sun, 21 Dec 2025 20:26:38 +0800
Subject: selftests/mm/charge_reserved_hugetlb: drop mount size for hugetlbfs

charge_reserved_hugetlb.sh mounts a hugetlbfs instance at /mnt/huge with a
fixed size of 256M.  On systems with large base hugepages (e.g.  512MB),
this is smaller than a single hugepage, so the hugetlbfs mount ends up
with zero capacity (often visible as size=0 in mount output).

As a result, write_to_hugetlbfs fails with ENOMEM and the test can hang
waiting for progress.

=== Error log ===
  # uname -r
  6.12.0-xxx.el10.aarch64+64k

  #./charge_reserved_hugetlb.sh -cgroup-v2
  # -----------------------------------------
  ...
  # nr hugepages = 10
  # writing cgroup limit: 5368709120
  # writing reseravation limit: 5368709120
  ...
  # write_to_hugetlbfs: Error mapping the file: Cannot allocate memory
  # Waiting for hugetlb memory reservation to reach size 2684354560.
  # 0
  # Waiting for hugetlb memory reservation to reach size 2684354560.
  # 0
  ...

  # mount |grep /mnt/huge
  none on /mnt/huge type hugetlbfs (rw,relatime,seclabel,pagesize=512M,size=0)

  # grep -i huge /proc/meminfo
  ...
  HugePages_Total:      10
  HugePages_Free:       10
  HugePages_Rsvd:        0
  HugePages_Surp:        0
  Hugepagesize:     524288 kB
  Hugetlb:         5242880 kB

Drop the mount args with 'size=256M', so the filesystem capacity is sufficient
regardless of HugeTLB page size.

Link: https://lkml.kernel.org/r/20251221122639.3168038-3-liwang@redhat.com
Fixes: 29750f71a9b4 ("hugetlb_cgroup: add hugetlb_cgroup reservation tests")
Signed-off-by: Li Wang <liwang@redhat.com>
Acked-by: David Hildenbrand (Red Hat) <david@kernel.org>
Acked-by: Waiman Long <longman@redhat.com>
Cc: Mark Brown <broonie@kernel.org>
Cc: Shuah Khan <shuah@kernel.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/testing/selftests/mm/charge_reserved_hugetlb.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/mm/charge_reserved_hugetlb.sh b/tools/testing/selftests/mm/charge_reserved_hugetlb.sh
index e1fe16bcbbe8..fa6713892d82 100755
--- a/tools/testing/selftests/mm/charge_reserved_hugetlb.sh
+++ b/tools/testing/selftests/mm/charge_reserved_hugetlb.sh
@@ -290,7 +290,7 @@ function run_test() {
   setup_cgroup "hugetlb_cgroup_test" "$cgroup_limit" "$reservation_limit"
 
   mkdir -p /mnt/huge
-  mount -t hugetlbfs -o pagesize=${MB}M,size=256M none /mnt/huge
+  mount -t hugetlbfs -o pagesize=${MB}M none /mnt/huge
 
   write_hugetlbfs_and_get_usage "hugetlb_cgroup_test" "$size" "$populate" \
     "$write" "/mnt/huge/test" "$method" "$private" "$expect_failure" \
@@ -344,7 +344,7 @@ function run_multiple_cgroup_test() {
   setup_cgroup "hugetlb_cgroup_test2" "$cgroup_limit2" "$reservation_limit2"
 
   mkdir -p /mnt/huge
-  mount -t hugetlbfs -o pagesize=${MB}M,size=256M none /mnt/huge
+  mount -t hugetlbfs -o pagesize=${MB}M none /mnt/huge
 
   write_hugetlbfs_and_get_usage "hugetlb_cgroup_test1" "$size1" \
     "$populate1" "$write1" "/mnt/huge/test1" "$method" "$private" \
-- 
cgit v1.2.3


From b618876f2e7055160cc5b98b4ff5cd8917e7b49e Mon Sep 17 00:00:00 2001
From: Li Wang <liwang@redhat.com>
Date: Sun, 21 Dec 2025 20:26:39 +0800
Subject: selftests/mm/charge_reserved_hugetlb.sh: add waits with timeout
 helper

The hugetlb cgroup usage wait loops in charge_reserved_hugetlb.sh were
unbounded and could hang forever if the expected cgroup file value never
appears (e.g.  due to write_to_hugetlbfs in Error mapping).

=== Error log ===
  # uname -r
  6.12.0-xxx.el10.aarch64+64k

  # ls /sys/kernel/mm/hugepages/hugepages-*
  hugepages-16777216kB/  hugepages-2048kB/  hugepages-524288kB/

  #./charge_reserved_hugetlb.sh -cgroup-v2
  # -----------------------------------------
  ...
  # nr hugepages = 10
  # writing cgroup limit: 5368709120
  # writing reseravation limit: 5368709120
  ...
  # write_to_hugetlbfs: Error mapping the file: Cannot allocate memory
  # Waiting for hugetlb memory reservation to reach size 2684354560.
  # 0
  # Waiting for hugetlb memory reservation to reach size 2684354560.
  # 0
  # Waiting for hugetlb memory reservation to reach size 2684354560.
  # 0
  # Waiting for hugetlb memory reservation to reach size 2684354560.
  # 0
  # Waiting for hugetlb memory reservation to reach size 2684354560.
  # 0
  # Waiting for hugetlb memory reservation to reach size 2684354560.
  # 0
  ...

Introduce a small helper, wait_for_file_value(), and use it for:
  - waiting for reservation usage to drop to 0,
  - waiting for reservation usage to reach a given size,
  - waiting for fault usage to reach a given size.

This makes the waits consistent and adds a hard timeout (60 tries with 1s
sleep) so the test fails instead of stalling indefinitely.

Link: https://lkml.kernel.org/r/20251221122639.3168038-4-liwang@redhat.com
Signed-off-by: Li Wang <liwang@redhat.com>
Acked-by: David Hildenbrand (Red Hat) <david@kernel.org>
Cc: Mark Brown <broonie@kernel.org>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Waiman Long <longman@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 .../selftests/mm/charge_reserved_hugetlb.sh        | 51 +++++++++++++---------
 1 file changed, 30 insertions(+), 21 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/mm/charge_reserved_hugetlb.sh b/tools/testing/selftests/mm/charge_reserved_hugetlb.sh
index fa6713892d82..447769657634 100755
--- a/tools/testing/selftests/mm/charge_reserved_hugetlb.sh
+++ b/tools/testing/selftests/mm/charge_reserved_hugetlb.sh
@@ -100,7 +100,7 @@ function setup_cgroup() {
   echo writing cgroup limit: "$cgroup_limit"
   echo "$cgroup_limit" >$cgroup_path/$name/hugetlb.${MB}MB.$fault_limit_file
 
-  echo writing reseravation limit: "$reservation_limit"
+  echo writing reservation limit: "$reservation_limit"
   echo "$reservation_limit" > \
     $cgroup_path/$name/hugetlb.${MB}MB.$reservation_limit_file
 
@@ -112,41 +112,50 @@ function setup_cgroup() {
   fi
 }
 
+function wait_for_file_value() {
+  local path="$1"
+  local expect="$2"
+  local max_tries=60
+
+  if [[ ! -r "$path" ]]; then
+    echo "ERROR: cannot read '$path', missing or permission denied"
+    return 1
+  fi
+
+  for ((i=1; i<=max_tries; i++)); do
+    local cur="$(cat "$path")"
+    if [[ "$cur" == "$expect" ]]; then
+      return 0
+    fi
+    echo "Waiting for $path to become '$expect' (current: '$cur') (try $i/$max_tries)"
+    sleep 1
+  done
+
+  echo "ERROR: timeout waiting for $path to become '$expect'"
+  return 1
+}
+
 function wait_for_hugetlb_memory_to_get_depleted() {
   local cgroup="$1"
   local path="$cgroup_path/$cgroup/hugetlb.${MB}MB.$reservation_usage_file"
-  # Wait for hugetlbfs memory to get depleted.
-  while [ $(cat $path) != 0 ]; do
-    echo Waiting for hugetlb memory to get depleted.
-    cat $path
-    sleep 0.5
-  done
+
+  wait_for_file_value "$path" "0"
 }
 
 function wait_for_hugetlb_memory_to_get_reserved() {
   local cgroup="$1"
   local size="$2"
-
   local path="$cgroup_path/$cgroup/hugetlb.${MB}MB.$reservation_usage_file"
-  # Wait for hugetlbfs memory to get written.
-  while [ $(cat $path) != $size ]; do
-    echo Waiting for hugetlb memory reservation to reach size $size.
-    cat $path
-    sleep 0.5
-  done
+
+  wait_for_file_value "$path" "$size"
 }
 
 function wait_for_hugetlb_memory_to_get_written() {
   local cgroup="$1"
   local size="$2"
-
   local path="$cgroup_path/$cgroup/hugetlb.${MB}MB.$fault_usage_file"
-  # Wait for hugetlbfs memory to get written.
-  while [ $(cat $path) != $size ]; do
-    echo Waiting for hugetlb memory to reach size $size.
-    cat $path
-    sleep 0.5
-  done
+
+  wait_for_file_value "$path" "$size"
 }
 
 function write_hugetlbfs_and_get_usage() {
-- 
cgit v1.2.3


From b47beff129c6193df3dd406f2db2628fcc09d1eb Mon Sep 17 00:00:00 2001
From: Chunyu Hu <chuhu@redhat.com>
Date: Sun, 21 Dec 2025 12:00:21 +0800
Subject: selftests/mm: fix va_high_addr_switch.sh return value

Patch series "Fix va_high_addr_switch.sh test failure - again", v2.

The series address several issues exist for the va_high_addr_switch test:
1) the test return value is ignored in va_high_addr_switch.sh.
2) the va_high_addr_switch test requires 6 hugepages not 5.
3) the reurn value of the first test in va_high_addr_switch.c can be
   overridden by the second test.
4) the nr_hugepages setup in run_vmtests.sh for arm64 can be done in
   va_high_addr_switch.sh too.
5) update a comment for check_test_requirements.


This patch: (of 5)

The return value should be return value of va_high_addr_switch, otherwise
a test failure would be silently ignored.

Link: https://lkml.kernel.org/r/20251221040025.3159990-1-chuhu@redhat.com
Fixes: d9d957bd7b61 ("selftests/mm: alloc hugepages in va_high_addr_switch test")
Signed-off-by: Chunyu Hu <chuhu@redhat.com>
Reviewed-by: Luiz Capitulino <luizcap@redhat.com>
Cc: Luiz Capitulino <luizcap@redhat.com>
Cc: "David Hildenbrand (Red Hat)" <david@kernel.org>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/testing/selftests/mm/va_high_addr_switch.sh | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/mm/va_high_addr_switch.sh b/tools/testing/selftests/mm/va_high_addr_switch.sh
index a7d4b02b21dd..f89fe078a8e6 100755
--- a/tools/testing/selftests/mm/va_high_addr_switch.sh
+++ b/tools/testing/selftests/mm/va_high_addr_switch.sh
@@ -114,4 +114,6 @@ save_nr_hugepages
 # 4 keep_mapped pages, and one for tmp usage
 setup_nr_hugepages 5
 ./va_high_addr_switch --run-hugetlb
+retcode=$?
 restore_nr_hugepages
+exit $retcode
-- 
cgit v1.2.3


From b1f031e33cb5ae4be039a17613ad8da84c777e70 Mon Sep 17 00:00:00 2001
From: Chunyu Hu <chuhu@redhat.com>
Date: Sun, 21 Dec 2025 12:00:22 +0800
Subject: selftests/mm: allocate 6 hugepages in va_high_addr_switch.sh

The va_high_addr_switch test requires 6 hugepages, not 5. If running the
test directly by: ./va_high_addr_switch.sh, the test will hit a mmap 'FAIL'
caused by not enough hugepages:

  mmap(addr_switch_hint - hugepagesize, 2*hugepagesize, MAP_HUGETLB): 0x7f330f800000 - OK
  mmap(addr_switch_hint , 2*hugepagesize, MAP_FIXED | MAP_HUGETLB): 0xffffffffffffffff - FAILED

The failure can't be hit if run the tests by running 'run_vmtests.sh -t
hugevm' because the nr_hugepages is set to 128 at the beginning of
run_vmtests.sh and va_high_addr_switch.sh skip the setup of nr_hugepages
because already enough.

Link: https://lkml.kernel.org/r/20251221040025.3159990-2-chuhu@redhat.com
Fixes: d9d957bd7b61 ("selftests/mm: alloc hugepages in va_high_addr_switch test")
Signed-off-by: Chunyu Hu <chuhu@redhat.com>
Reviewed-by: Luiz Capitulino <luizcap@redhat.com>
Cc: "David Hildenbrand (Red Hat)" <david@kernel.org>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/testing/selftests/mm/va_high_addr_switch.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/mm/va_high_addr_switch.sh b/tools/testing/selftests/mm/va_high_addr_switch.sh
index f89fe078a8e6..a0c93d348b11 100755
--- a/tools/testing/selftests/mm/va_high_addr_switch.sh
+++ b/tools/testing/selftests/mm/va_high_addr_switch.sh
@@ -111,8 +111,8 @@ setup_nr_hugepages()
 
 check_test_requirements
 save_nr_hugepages
-# 4 keep_mapped pages, and one for tmp usage
-setup_nr_hugepages 5
+# The HugeTLB tests require 6 pages
+setup_nr_hugepages 6
 ./va_high_addr_switch --run-hugetlb
 retcode=$?
 restore_nr_hugepages
-- 
cgit v1.2.3


From 7544d7969d84c1c4a078d1c5a7d4117fbf6f385c Mon Sep 17 00:00:00 2001
From: Chunyu Hu <chuhu@redhat.com>
Date: Sun, 21 Dec 2025 12:00:23 +0800
Subject: selftests/mm: remove arm64 nr_hugepages setup for va_high_addr_switch
 test

arm64 and x86_64 has the same nr_hugepages requriement for running the
va_high_addr_switch test.  Since commit d9d957bd7b61 ("selftests/mm: alloc
hugepages in va_high_addr_switch test"), the setup can be done in
va_high_addr_switch.sh.  So remove the duplicated setup.

Link: https://lkml.kernel.org/r/20251221040025.3159990-3-chuhu@redhat.com
Signed-off-by: Chunyu Hu <chuhu@redhat.com>
Reviewed-by: Luiz Capitulino <luizcap@redhat.com>
Cc: Luiz Capitulino <luizcap@redhat.com>
Cc: "David Hildenbrand (Red Hat)" <david@kernel.org>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/testing/selftests/mm/run_vmtests.sh | 8 --------
 1 file changed, 8 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh
index d9173f2312b7..2dadbfc6e535 100755
--- a/tools/testing/selftests/mm/run_vmtests.sh
+++ b/tools/testing/selftests/mm/run_vmtests.sh
@@ -412,15 +412,7 @@ if [ $VADDR64 -ne 0 ]; then
 	fi
 
 	# va high address boundary switch test
-	ARCH_ARM64="arm64"
-	prev_nr_hugepages=$(cat /proc/sys/vm/nr_hugepages)
-	if [ "$ARCH" == "$ARCH_ARM64" ]; then
-		echo 6 > /proc/sys/vm/nr_hugepages
-	fi
 	CATEGORY="hugevm" run_test bash ./va_high_addr_switch.sh
-	if [ "$ARCH" == "$ARCH_ARM64" ]; then
-		echo $prev_nr_hugepages > /proc/sys/vm/nr_hugepages
-	fi
 fi # VADDR64
 
 # vmalloc stability smoke test
-- 
cgit v1.2.3


From dd0202a0bd81c33096f3d473c296cad997baba5b Mon Sep 17 00:00:00 2001
From: Chunyu Hu <chuhu@redhat.com>
Date: Sun, 21 Dec 2025 12:00:24 +0800
Subject: selftests/mm: va_high_addr_switch return fail when either test failed

When the first test failed, and the hugetlb test passed, the result would
be pass, but we expect a fail.  Fix this issue by returning fail if either
is not KSFT_PASS.

Link: https://lkml.kernel.org/r/20251221040025.3159990-4-chuhu@redhat.com
Signed-off-by: Chunyu Hu <chuhu@redhat.com>
Reviewed-by: Luiz Capitulino <luizcap@redhat.com>
Cc: "David Hildenbrand (Red Hat)" <david@kernel.org>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/testing/selftests/mm/va_high_addr_switch.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/mm/va_high_addr_switch.c b/tools/testing/selftests/mm/va_high_addr_switch.c
index 02f290a69132..51401e081b20 100644
--- a/tools/testing/selftests/mm/va_high_addr_switch.c
+++ b/tools/testing/selftests/mm/va_high_addr_switch.c
@@ -322,7 +322,7 @@ static int supported_arch(void)
 
 int main(int argc, char **argv)
 {
-	int ret;
+	int ret, hugetlb_ret = KSFT_PASS;
 
 	if (!supported_arch())
 		return KSFT_SKIP;
@@ -331,6 +331,10 @@ int main(int argc, char **argv)
 
 	ret = run_test(testcases, sz_testcases);
 	if (argc == 2 && !strcmp(argv[1], "--run-hugetlb"))
-		ret = run_test(hugetlb_testcases, sz_hugetlb_testcases);
-	return ret;
+		hugetlb_ret = run_test(hugetlb_testcases, sz_hugetlb_testcases);
+
+	if (ret == KSFT_PASS && hugetlb_ret == KSFT_PASS)
+		return KSFT_PASS;
+	else
+		return KSFT_FAIL;
 }
-- 
cgit v1.2.3


From 6319c4f44234c3849fdb2c3f72c45353aa428d3f Mon Sep 17 00:00:00 2001
From: Chunyu Hu <chuhu@redhat.com>
Date: Sun, 21 Dec 2025 12:00:25 +0800
Subject: selftests/mm: fix comment for check_test_requirements

The test supports arm64 as well so the comment is incorrect.  And there's
a check for arm64 in va_high_addr_switch.c.

Link: https://lkml.kernel.org/r/20251221040025.3159990-5-chuhu@redhat.com
Fixes: 983e760bcdb6 ("selftest/mm: va_high_addr_switch: add ppc64 support check")
Fixes: f556acc2facd ("selftests/mm: skip test for non-LPA2 and non-LVA systems")
Signed-off-by: Chunyu Hu <chuhu@redhat.com>
Reviewed-by: Luiz Capitulino <luizcap@redhat.com>
Cc: "David Hildenbrand (Red Hat)" <david@kernel.org>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/testing/selftests/mm/va_high_addr_switch.sh | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/mm/va_high_addr_switch.sh b/tools/testing/selftests/mm/va_high_addr_switch.sh
index a0c93d348b11..9492c2d72634 100755
--- a/tools/testing/selftests/mm/va_high_addr_switch.sh
+++ b/tools/testing/selftests/mm/va_high_addr_switch.sh
@@ -61,9 +61,9 @@ check_supported_ppc64()
 
 check_test_requirements()
 {
-	# The test supports x86_64 and powerpc64. We currently have no useful
-	# eligibility check for powerpc64, and the test itself will reject other
-	# architectures.
+	# The test supports x86_64, powerpc64 and arm64. There's check for arm64
+	# in va_high_addr_switch.c. The test itself will reject other architectures.
+
 	case `uname -m` in
 		"x86_64")
 			check_supported_x86_64
-- 
cgit v1.2.3


From e700f5d1560798aacf0e56fdcc70ee2c20bf56ec Mon Sep 17 00:00:00 2001
From: Li RongQing <lirongqing@baidu.com>
Date: Tue, 16 Dec 2025 02:45:21 -0500
Subject: watchdog: softlockup: panic when lockup duration exceeds N thresholds

The softlockup_panic sysctl is currently a binary option: panic
immediately or never panic on soft lockups.

Panicking on any soft lockup, regardless of duration, can be overly
aggressive for brief stalls that may be caused by legitimate operations.
Conversely, never panicking may allow severe system hangs to persist
undetected.

Extend softlockup_panic to accept an integer threshold, allowing the
kernel to panic only when the normalized lockup duration exceeds N
watchdog threshold periods.  This provides finer-grained control to
distinguish between transient delays and persistent system failures.

The accepted values are:
- 0: Don't panic (unchanged)
- 1: Panic when duration >= 1 * threshold (20s default, original behavior)
- N > 1: Panic when duration >= N * threshold (e.g., 2 = 40s, 3 = 60s.)

The original behavior is preserved for values 0 and 1, maintaining full
backward compatibility while allowing systems to tolerate brief lockups
while still catching severe, persistent hangs.

[lirongqing@baidu.com: v2]
  Link: https://lkml.kernel.org/r/20251218074300.4080-1-lirongqing@baidu.com
Link: https://lkml.kernel.org/r/20251216074521.2796-1-lirongqing@baidu.com
Signed-off-by: Li RongQing <lirongqing@baidu.com>
Cc: Eduard Zingerman <eddyz87@gmail.com>
Cc: Hao Luo <haoluo@google.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Fastabend <john.fastabend@gmail.com>
Cc: KP Singh <kpsingh@kernel.org>
Cc: Lance Yang <lance.yang@linux.dev>
Cc: Martin KaFai Lau <martin.lau@linux.dev>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Song Liu <song@kernel.org>
Cc: Stanislav Fomichev <sdf@fomichev.me>
Cc: Yonghong Song <yonghong.song@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/admin-guide/kernel-parameters.txt      | 10 +++++-----
 arch/arm/configs/aspeed_g5_defconfig                 |  2 +-
 arch/arm/configs/pxa3xx_defconfig                    |  2 +-
 arch/openrisc/configs/or1klitex_defconfig            |  2 +-
 arch/powerpc/configs/skiroot_defconfig               |  2 +-
 drivers/gpu/drm/ci/arm.config                        |  2 +-
 drivers/gpu/drm/ci/arm64.config                      |  2 +-
 drivers/gpu/drm/ci/x86_64.config                     |  2 +-
 kernel/configs/debug.config                          |  2 +-
 kernel/watchdog.c                                    | 10 ++++++----
 lib/Kconfig.debug                                    | 13 +++++++------
 tools/testing/selftests/bpf/config                   |  2 +-
 tools/testing/selftests/wireguard/qemu/kernel.config |  2 +-
 13 files changed, 28 insertions(+), 25 deletions(-)

(limited to 'tools')

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 1058f2a6d6a8..73d846211144 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -6969,12 +6969,12 @@ Kernel parameters
 
 	softlockup_panic=
 			[KNL] Should the soft-lockup detector generate panics.
-			Format: 0 | 1
+			Format: <int>
 
-			A value of 1 instructs the soft-lockup detector
-			to panic the machine when a soft-lockup occurs. It is
-			also controlled by the kernel.softlockup_panic sysctl
-			and CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC, which is the
+			A value of non-zero instructs the soft-lockup detector
+			to panic the machine when a soft-lockup duration exceeds
+			N thresholds. It is also controlled by the kernel.softlockup_panic
+			sysctl and CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC, which is the
 			respective build-time switch to that functionality.
 
 	softlockup_all_cpu_backtrace=
diff --git a/arch/arm/configs/aspeed_g5_defconfig b/arch/arm/configs/aspeed_g5_defconfig
index 2e6ea13c1e9b..ec558e57d081 100644
--- a/arch/arm/configs/aspeed_g5_defconfig
+++ b/arch/arm/configs/aspeed_g5_defconfig
@@ -306,7 +306,7 @@ CONFIG_SCHED_STACK_END_CHECK=y
 CONFIG_PANIC_ON_OOPS=y
 CONFIG_PANIC_TIMEOUT=-1
 CONFIG_SOFTLOCKUP_DETECTOR=y
-CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y
+CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=1
 CONFIG_BOOTPARAM_HUNG_TASK_PANIC=1
 CONFIG_WQ_WATCHDOG=y
 # CONFIG_SCHED_DEBUG is not set
diff --git a/arch/arm/configs/pxa3xx_defconfig b/arch/arm/configs/pxa3xx_defconfig
index 07d422f0ff34..fb272e3a2337 100644
--- a/arch/arm/configs/pxa3xx_defconfig
+++ b/arch/arm/configs/pxa3xx_defconfig
@@ -100,7 +100,7 @@ CONFIG_PRINTK_TIME=y
 CONFIG_DEBUG_KERNEL=y
 CONFIG_MAGIC_SYSRQ=y
 CONFIG_DEBUG_SHIRQ=y
-CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y
+CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=1
 # CONFIG_SCHED_DEBUG is not set
 CONFIG_DEBUG_SPINLOCK=y
 CONFIG_DEBUG_SPINLOCK_SLEEP=y
diff --git a/arch/openrisc/configs/or1klitex_defconfig b/arch/openrisc/configs/or1klitex_defconfig
index fb1eb9a68bd6..984b0e3b2768 100644
--- a/arch/openrisc/configs/or1klitex_defconfig
+++ b/arch/openrisc/configs/or1klitex_defconfig
@@ -52,5 +52,5 @@ CONFIG_LSM="lockdown,yama,loadpin,safesetid,integrity,bpf"
 CONFIG_PRINTK_TIME=y
 CONFIG_PANIC_ON_OOPS=y
 CONFIG_SOFTLOCKUP_DETECTOR=y
-CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y
+CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=1
 CONFIG_BUG_ON_DATA_CORRUPTION=y
diff --git a/arch/powerpc/configs/skiroot_defconfig b/arch/powerpc/configs/skiroot_defconfig
index 2b71a6dc399e..a4114fca5a39 100644
--- a/arch/powerpc/configs/skiroot_defconfig
+++ b/arch/powerpc/configs/skiroot_defconfig
@@ -289,7 +289,7 @@ CONFIG_SCHED_STACK_END_CHECK=y
 CONFIG_DEBUG_STACKOVERFLOW=y
 CONFIG_PANIC_ON_OOPS=y
 CONFIG_SOFTLOCKUP_DETECTOR=y
-CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y
+CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=1
 CONFIG_HARDLOCKUP_DETECTOR=y
 CONFIG_BOOTPARAM_HARDLOCKUP_PANIC=y
 CONFIG_WQ_WATCHDOG=y
diff --git a/drivers/gpu/drm/ci/arm.config b/drivers/gpu/drm/ci/arm.config
index 411e814819a8..d7c51670da2f 100644
--- a/drivers/gpu/drm/ci/arm.config
+++ b/drivers/gpu/drm/ci/arm.config
@@ -52,7 +52,7 @@ CONFIG_TMPFS=y
 CONFIG_PROVE_LOCKING=n
 CONFIG_DEBUG_LOCKDEP=n
 CONFIG_SOFTLOCKUP_DETECTOR=n
-CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=n
+CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=0
 
 CONFIG_FW_LOADER_COMPRESS=y
 
diff --git a/drivers/gpu/drm/ci/arm64.config b/drivers/gpu/drm/ci/arm64.config
index fddfbd4d2493..ea0e30737c4d 100644
--- a/drivers/gpu/drm/ci/arm64.config
+++ b/drivers/gpu/drm/ci/arm64.config
@@ -161,7 +161,7 @@ CONFIG_TMPFS=y
 CONFIG_PROVE_LOCKING=n
 CONFIG_DEBUG_LOCKDEP=n
 CONFIG_SOFTLOCKUP_DETECTOR=y
-CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y
+CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=1
 
 CONFIG_DETECT_HUNG_TASK=y
 
diff --git a/drivers/gpu/drm/ci/x86_64.config b/drivers/gpu/drm/ci/x86_64.config
index 8eaba388b141..7ac98a78691e 100644
--- a/drivers/gpu/drm/ci/x86_64.config
+++ b/drivers/gpu/drm/ci/x86_64.config
@@ -47,7 +47,7 @@ CONFIG_TMPFS=y
 CONFIG_PROVE_LOCKING=n
 CONFIG_DEBUG_LOCKDEP=n
 CONFIG_SOFTLOCKUP_DETECTOR=y
-CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y
+CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=1
 
 CONFIG_DETECT_HUNG_TASK=y
 
diff --git a/kernel/configs/debug.config b/kernel/configs/debug.config
index 9f6ab7dabf67..774702591d26 100644
--- a/kernel/configs/debug.config
+++ b/kernel/configs/debug.config
@@ -84,7 +84,7 @@ CONFIG_SLUB_DEBUG_ON=y
 # Debug Oops, Lockups and Hangs
 #
 CONFIG_BOOTPARAM_HUNG_TASK_PANIC=0
-# CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC is not set
+CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=0
 CONFIG_DEBUG_ATOMIC_SLEEP=y
 CONFIG_DETECT_HUNG_TASK=y
 CONFIG_PANIC_ON_OOPS=y
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 366122f4a0f8..b4d5fbdb933a 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -363,7 +363,7 @@ static struct cpumask watchdog_allowed_mask __read_mostly;
 
 /* Global variables, exported for sysctl */
 unsigned int __read_mostly softlockup_panic =
-			IS_ENABLED(CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC);
+			CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC;
 
 static bool softlockup_initialized __read_mostly;
 static u64 __read_mostly sample_period;
@@ -774,8 +774,8 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
 {
 	unsigned long touch_ts, period_ts, now;
 	struct pt_regs *regs = get_irq_regs();
-	int duration;
 	int softlockup_all_cpu_backtrace;
+	int duration, thresh_count;
 	unsigned long flags;
 
 	if (!watchdog_enabled)
@@ -879,7 +879,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
 
 		add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
 		sys_info(softlockup_si_mask & ~SYS_INFO_ALL_BT);
-		if (softlockup_panic)
+		thresh_count = duration / get_softlockup_thresh();
+
+		if (softlockup_panic && thresh_count >= softlockup_panic)
 			panic("softlockup: hung tasks");
 	}
 
@@ -1228,7 +1230,7 @@ static const struct ctl_table watchdog_sysctls[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= SYSCTL_ZERO,
-		.extra2		= SYSCTL_ONE,
+		.extra2		= SYSCTL_INT_MAX,
 	},
 	{
 		.procname	= "softlockup_sys_info",
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 4bfca37f313e..947e62e92da8 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1110,13 +1110,14 @@ config SOFTLOCKUP_DETECTOR_INTR_STORM
 	  the CPU stats and the interrupt counts during the "soft lockups".
 
 config BOOTPARAM_SOFTLOCKUP_PANIC
-	bool "Panic (Reboot) On Soft Lockups"
+	int "Panic (Reboot) On Soft Lockups"
 	depends on SOFTLOCKUP_DETECTOR
+	default 0
 	help
-	  Say Y here to enable the kernel to panic on "soft lockups",
-	  which are bugs that cause the kernel to loop in kernel
-	  mode for more than 20 seconds (configurable using the watchdog_thresh
-	  sysctl), without giving other tasks a chance to run.
+	  Set to a non-zero value N to enable the kernel to panic on "soft
+	  lockups", which are bugs that cause the kernel to loop in kernel
+	  mode for more than (N * 20 seconds) (configurable using the
+	  watchdog_thresh sysctl), without giving other tasks a chance to run.
 
 	  The panic can be used in combination with panic_timeout,
 	  to cause the system to reboot automatically after a
@@ -1124,7 +1125,7 @@ config BOOTPARAM_SOFTLOCKUP_PANIC
 	  high-availability systems that have uptime guarantees and
 	  where a lockup must be resolved ASAP.
 
-	  Say N if unsure.
+	  Say 0 if unsure.
 
 config HAVE_HARDLOCKUP_DETECTOR_BUDDY
 	bool
diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config
index 558839e3c185..24855381290d 100644
--- a/tools/testing/selftests/bpf/config
+++ b/tools/testing/selftests/bpf/config
@@ -1,6 +1,6 @@
 CONFIG_BLK_DEV_LOOP=y
 CONFIG_BOOTPARAM_HARDLOCKUP_PANIC=y
-CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y
+CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=1
 CONFIG_BPF=y
 CONFIG_BPF_EVENTS=y
 CONFIG_BPF_JIT=y
diff --git a/tools/testing/selftests/wireguard/qemu/kernel.config b/tools/testing/selftests/wireguard/qemu/kernel.config
index 0504c11c2de6..bb89d2dfaa2a 100644
--- a/tools/testing/selftests/wireguard/qemu/kernel.config
+++ b/tools/testing/selftests/wireguard/qemu/kernel.config
@@ -80,7 +80,7 @@ CONFIG_HARDLOCKUP_DETECTOR=y
 CONFIG_WQ_WATCHDOG=y
 CONFIG_DETECT_HUNG_TASK=y
 CONFIG_BOOTPARAM_HARDLOCKUP_PANIC=y
-CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y
+CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=1
 CONFIG_BOOTPARAM_HUNG_TASK_PANIC=1
 CONFIG_PANIC_TIMEOUT=-1
 CONFIG_STACKTRACE=y
-- 
cgit v1.2.3


From 4fca95095cdcd81bd4a8c8c7008fb3c175a3a5d5 Mon Sep 17 00:00:00 2001
From: Menglong Dong <menglong8.dong@gmail.com>
Date: Tue, 20 Jan 2026 15:05:55 +0800
Subject: selftests/bpf: test the jited inline of bpf_get_current_task

Add the testcase for the jited inline of bpf_get_current_task().

Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20260120070555.233486-3-dongml2@chinatelecom.cn
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/prog_tests/verifier.c    |  2 ++
 .../selftests/bpf/progs/verifier_jit_inline.c        | 20 ++++++++++++++++++++
 2 files changed, 22 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/progs/verifier_jit_inline.c

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/verifier.c b/tools/testing/selftests/bpf/prog_tests/verifier.c
index b6a1e79709be..302286a80154 100644
--- a/tools/testing/selftests/bpf/prog_tests/verifier.c
+++ b/tools/testing/selftests/bpf/prog_tests/verifier.c
@@ -112,6 +112,7 @@
 #include "verifier_xdp_direct_packet_access.skel.h"
 #include "verifier_bits_iter.skel.h"
 #include "verifier_lsm.skel.h"
+#include "verifier_jit_inline.skel.h"
 #include "irq.skel.h"
 
 #define MAX_ENTRIES 11
@@ -255,6 +256,7 @@ void test_verifier_bits_iter(void) { RUN(verifier_bits_iter); }
 void test_verifier_lsm(void)                  { RUN(verifier_lsm); }
 void test_irq(void)			      { RUN(irq); }
 void test_verifier_mtu(void)		      { RUN(verifier_mtu); }
+void test_verifier_jit_inline(void)               { RUN(verifier_jit_inline); }
 
 static int init_test_val_map(struct bpf_object *obj, char *map_name)
 {
diff --git a/tools/testing/selftests/bpf/progs/verifier_jit_inline.c b/tools/testing/selftests/bpf/progs/verifier_jit_inline.c
new file mode 100644
index 000000000000..4ea254063646
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/verifier_jit_inline.c
@@ -0,0 +1,20 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+#include "bpf_misc.h"
+
+SEC("fentry/bpf_fentry_test1")
+__success __retval(0)
+__arch_x86_64
+__jited("	addq	%gs:{{.*}}, %rax")
+__arch_arm64
+__jited("	mrs	x7, SP_EL0")
+int inline_bpf_get_current_task(void)
+{
+	bpf_get_current_task();
+
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
-- 
cgit v1.2.3


From 9a0a5b5ac4372da84394dc329f763d6b7d384a86 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 20 Jan 2026 18:26:44 -0300
Subject: perf list: Signal changing const memory is ok

In this case its a temp list that is created just for listing events and
will be deleted at the end, so just cast it to get rid of the compiler
warning.

Reviewed-by: Ian Rogers <irogers@google.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/print-events.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/perf/util/print-events.c b/tools/perf/util/print-events.c
index 8f3ed83853a9..4bbcdbf05b84 100644
--- a/tools/perf/util/print-events.c
+++ b/tools/perf/util/print-events.c
@@ -86,7 +86,7 @@ void print_sdt_events(const struct print_callbacks *print_cb, void *print_state)
 
 	strlist__for_each_entry(sdt_name, sdtlist) {
 		bool show_detail = false;
-		char *bid = strchr(sdt_name->s, '@');
+		char *bid = (char *)strchr(sdt_name->s, '@');
 		char *evt_name = NULL;
 
 		if (bid)
-- 
cgit v1.2.3


From 29132d16965e66fed0bf7b38242e7e57df294ba0 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 20 Jan 2026 18:16:09 -0300
Subject: perf list: Don't write to const memory

Something now detected on fedora 44, where strchr() returns const if it
is passed a const pointer:

  util/print-events.c: In function 'print_sdt_events':
  util/print-events.c:89:29: error: initialization discards 'const' qualifier from pointer target type [-Werror=discarded-qualifiers]
     89 |                 char *bid = strchr(sdt_name->s, '@');
        |                             ^~~~~~

Fix it by using strchrnul() + strncmp() instead of temporarily scrubbing
it with '\0'.

Reviewed-by: Ian Rogers <irogers@google.com>
Suggested-by: David Laight <david.laight.linux@gmail.com>
Link: https://lore.kernel.org/r/20260121112536.27fd5d11@pumpkin
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/print-events.c | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/print-events.c b/tools/perf/util/print-events.c
index 4bbcdbf05b84..cb27e2898aa0 100644
--- a/tools/perf/util/print-events.c
+++ b/tools/perf/util/print-events.c
@@ -97,14 +97,9 @@ void print_sdt_events(const struct print_callbacks *print_cb, void *print_state)
 		} else {
 			next_sdt_name = strlist__next(sdt_name);
 			if (next_sdt_name) {
-				char *bid2 = strchr(next_sdt_name->s, '@');
-
-				if (bid2)
-					*bid2 = '\0';
-				if (strcmp(sdt_name->s, next_sdt_name->s) == 0)
-					show_detail = true;
-				if (bid2)
-					*bid2 = '@';
+				const char *bid2 = strchrnul(next_sdt_name->s, '@');
+
+				show_detail = strncmp(sdt_name->s, next_sdt_name->s, bid2 - next_sdt_name->s) == 0;
 			}
 		}
 		last_sdt_name = sdt_name->s;
-- 
cgit v1.2.3


From 75aad5ffe099a1b1a342257236dc260493917ed2 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Tue, 13 Jan 2026 16:58:00 +0800
Subject: selftests/ublk: fix IO thread idle check

Include cmd_inflight in ublk_thread_is_done() check. Without this,
the thread may exit before all FETCH commands are completed, which
may cause device deletion to hang.

Fixes: 6aecda00b7d1 ("selftests: ublk: add kernel selftests for ublk")
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 tools/testing/selftests/ublk/kublk.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c
index 185ba553686a..f52431fe9b6c 100644
--- a/tools/testing/selftests/ublk/kublk.c
+++ b/tools/testing/selftests/ublk/kublk.c
@@ -753,7 +753,7 @@ static int ublk_thread_is_idle(struct ublk_thread *t)
 
 static int ublk_thread_is_done(struct ublk_thread *t)
 {
-	return (t->state & UBLKS_T_STOPPING) && ublk_thread_is_idle(t);
+	return (t->state & UBLKS_T_STOPPING) && ublk_thread_is_idle(t) && !t->cmd_inflight;
 }
 
 static inline void ublksrv_handle_tgt_cqe(struct ublk_thread *t,
-- 
cgit v1.2.3


From 23e62cf75518825aac12e9a22bdc40f062428898 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Tue, 13 Jan 2026 16:58:01 +0800
Subject: selftests/ublk: fix error handling for starting device

Fix error handling in ublk_start_daemon() when start_dev fails:

1. Call ublk_ctrl_stop_dev() to cancel inflight uring_cmd before
   cleanup. Without this, the device deletion may hang waiting for
   I/O completion that will never happen.

2. Add fail_start label so that pthread_join() is called on the
   error path. This ensures proper thread cleanup when startup fails.

Fixes: 6aecda00b7d1 ("selftests: ublk: add kernel selftests for ublk")
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Caleb Sander Mateos <csander@purestorage.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 tools/testing/selftests/ublk/kublk.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c
index f52431fe9b6c..65f59e7b6972 100644
--- a/tools/testing/selftests/ublk/kublk.c
+++ b/tools/testing/selftests/ublk/kublk.c
@@ -1054,7 +1054,9 @@ static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev)
 	}
 	if (ret < 0) {
 		ublk_err("%s: ublk_ctrl_start_dev failed: %d\n", __func__, ret);
-		goto fail;
+		/* stop device so that inflight uring_cmd can be cancelled */
+		ublk_ctrl_stop_dev(dev);
+		goto fail_start;
 	}
 
 	ublk_ctrl_get_info(dev);
@@ -1062,7 +1064,7 @@ static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev)
 		ublk_ctrl_dump(dev);
 	else
 		ublk_send_dev_event(ctx, dev, dev->dev_info.dev_id);
-
+fail_start:
 	/* wait until we are terminated */
 	for (i = 0; i < dev->nthreads; i++)
 		pthread_join(tinfo[i].thread, &thread_ret);
-- 
cgit v1.2.3


From e7e1cc18f120a415646be12470169a978a1adcd9 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Tue, 13 Jan 2026 16:58:02 +0800
Subject: selftests/ublk: fix garbage output in foreground mode

Initialize _evtfd to -1 in struct dev_ctx to prevent garbage output
when running kublk in foreground mode. Without this, _evtfd is
zero-initialized to 0 (stdin), and ublk_send_dev_event() writes
binary data to stdin which appears as garbage on the terminal.

Also fix debug message format string.

Fixes: 6aecda00b7d1 ("selftests: ublk: add kernel selftests for ublk")
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Caleb Sander Mateos <csander@purestorage.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 tools/testing/selftests/ublk/kublk.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c
index 65f59e7b6972..f197ad9cc262 100644
--- a/tools/testing/selftests/ublk/kublk.c
+++ b/tools/testing/selftests/ublk/kublk.c
@@ -1274,7 +1274,7 @@ static int __cmd_dev_add(const struct dev_ctx *ctx)
 	}
 
 	ret = ublk_start_daemon(ctx, dev);
-	ublk_dbg(UBLK_DBG_DEV, "%s: daemon exit %d\b", ret);
+	ublk_dbg(UBLK_DBG_DEV, "%s: daemon exit %d\n", __func__, ret);
 	if (ret < 0)
 		ublk_ctrl_del_dev(dev);
 
@@ -1620,6 +1620,7 @@ int main(int argc, char *argv[])
 	int option_idx, opt;
 	const char *cmd = argv[1];
 	struct dev_ctx ctx = {
+		._evtfd         =       -1,
 		.queue_depth	=	128,
 		.nr_hw_queues	=	2,
 		.dev_id		=	-1,
-- 
cgit v1.2.3


From 73061dbeca783aaf311e1af9610f8cba1c1176cd Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Tue, 20 Jan 2026 21:11:44 +0000
Subject: selftests/io_uring: add io_uring_queue_init_params

Add a ring init variant taking struct io_uring_params, which mimicks
liburing API.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 tools/include/io_uring/mini_liburing.h | 25 +++++++++++++++++--------
 1 file changed, 17 insertions(+), 8 deletions(-)

(limited to 'tools')

diff --git a/tools/include/io_uring/mini_liburing.h b/tools/include/io_uring/mini_liburing.h
index 9ccb16074eb5..a55407b09dbb 100644
--- a/tools/include/io_uring/mini_liburing.h
+++ b/tools/include/io_uring/mini_liburing.h
@@ -126,21 +126,18 @@ static inline int io_uring_enter(int fd, unsigned int to_submit,
 		       flags, sig, _NSIG / 8);
 }
 
-static inline int io_uring_queue_init(unsigned int entries,
-				      struct io_uring *ring,
-				      unsigned int flags)
+static inline int io_uring_queue_init_params(unsigned int entries,
+					     struct io_uring *ring,
+					     struct io_uring_params *p)
 {
-	struct io_uring_params p;
 	int fd, ret;
 
 	memset(ring, 0, sizeof(*ring));
-	memset(&p, 0, sizeof(p));
-	p.flags = flags;
 
-	fd = io_uring_setup(entries, &p);
+	fd = io_uring_setup(entries, p);
 	if (fd < 0)
 		return fd;
-	ret = io_uring_mmap(fd, &p, &ring->sq, &ring->cq);
+	ret = io_uring_mmap(fd, p, &ring->sq, &ring->cq);
 	if (!ret)
 		ring->ring_fd = fd;
 	else
@@ -148,6 +145,18 @@ static inline int io_uring_queue_init(unsigned int entries,
 	return ret;
 }
 
+static inline int io_uring_queue_init(unsigned int entries,
+				      struct io_uring *ring,
+				      unsigned int flags)
+{
+	struct io_uring_params p;
+
+	memset(&p, 0, sizeof(p));
+	p.flags = flags;
+
+	return io_uring_queue_init_params(entries, ring, &p);
+}
+
 /* Get a sqe */
 static inline struct io_uring_sqe *io_uring_get_sqe(struct io_uring *ring)
 {
-- 
cgit v1.2.3


From 145e0074392587606aa5df353d0e761f0b8357d5 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Tue, 20 Jan 2026 21:11:45 +0000
Subject: selftests/io_uring: support NO_SQARRAY in miniliburing

Add support for IORING_SETUP_NO_SQARRAY in miniliburing.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 tools/include/io_uring/mini_liburing.h | 34 +++++++++++++++++++++++++++-------
 1 file changed, 27 insertions(+), 7 deletions(-)

(limited to 'tools')

diff --git a/tools/include/io_uring/mini_liburing.h b/tools/include/io_uring/mini_liburing.h
index a55407b09dbb..44be4446feda 100644
--- a/tools/include/io_uring/mini_liburing.h
+++ b/tools/include/io_uring/mini_liburing.h
@@ -6,6 +6,7 @@
 #include <stdio.h>
 #include <string.h>
 #include <unistd.h>
+#include <sys/uio.h>
 
 struct io_sq_ring {
 	unsigned int *head;
@@ -55,6 +56,7 @@ struct io_uring {
 	struct io_uring_sq sq;
 	struct io_uring_cq cq;
 	int ring_fd;
+	unsigned flags;
 };
 
 #if defined(__x86_64) || defined(__i386__)
@@ -72,7 +74,14 @@ static inline int io_uring_mmap(int fd, struct io_uring_params *p,
 	void *ptr;
 	int ret;
 
-	sq->ring_sz = p->sq_off.array + p->sq_entries * sizeof(unsigned int);
+	if (p->flags & IORING_SETUP_NO_SQARRAY) {
+		sq->ring_sz = p->cq_off.cqes;
+		sq->ring_sz += p->cq_entries * sizeof(struct io_uring_cqe);
+	} else {
+		sq->ring_sz = p->sq_off.array;
+		sq->ring_sz += p->sq_entries * sizeof(unsigned int);
+	}
+
 	ptr = mmap(0, sq->ring_sz, PROT_READ | PROT_WRITE,
 		   MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQ_RING);
 	if (ptr == MAP_FAILED)
@@ -83,7 +92,8 @@ static inline int io_uring_mmap(int fd, struct io_uring_params *p,
 	sq->kring_entries = ptr + p->sq_off.ring_entries;
 	sq->kflags = ptr + p->sq_off.flags;
 	sq->kdropped = ptr + p->sq_off.dropped;
-	sq->array = ptr + p->sq_off.array;
+	if (!(p->flags & IORING_SETUP_NO_SQARRAY))
+		sq->array = ptr + p->sq_off.array;
 
 	size = p->sq_entries * sizeof(struct io_uring_sqe);
 	sq->sqes = mmap(0, size, PROT_READ | PROT_WRITE,
@@ -138,10 +148,12 @@ static inline int io_uring_queue_init_params(unsigned int entries,
 	if (fd < 0)
 		return fd;
 	ret = io_uring_mmap(fd, p, &ring->sq, &ring->cq);
-	if (!ret)
+	if (!ret) {
 		ring->ring_fd = fd;
-	else
+		ring->flags = p->flags;
+	} else {
 		close(fd);
+	}
 	return ret;
 }
 
@@ -208,10 +220,18 @@ static inline int io_uring_submit(struct io_uring *ring)
 
 	ktail = *sq->ktail;
 	to_submit = sq->sqe_tail - sq->sqe_head;
-	for (submitted = 0; submitted < to_submit; submitted++) {
-		read_barrier();
-		sq->array[ktail++ & mask] = sq->sqe_head++ & mask;
+
+	if (!(ring->flags & IORING_SETUP_NO_SQARRAY)) {
+		for (submitted = 0; submitted < to_submit; submitted++) {
+			read_barrier();
+			sq->array[ktail++ & mask] = sq->sqe_head++ & mask;
+		}
+	} else {
+		ktail += to_submit;
+		sq->sqe_head += to_submit;
+		submitted = to_submit;
 	}
+
 	if (!submitted)
 		return 0;
 
-- 
cgit v1.2.3


From 1ed797764315496a115cd0568450a7f72da80df6 Mon Sep 17 00:00:00 2001
From: Menglong Dong <menglong8.dong@gmail.com>
Date: Wed, 21 Jan 2026 12:43:48 +0800
Subject: selftests/bpf: test bpf_get_func_arg() for tp_btf

Test bpf_get_func_arg() and bpf_get_func_arg_cnt() for tp_btf. The code
is most copied from test1 and test2.

Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
Acked-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/r/20260121044348.113201-3-dongml2@chinatelecom.cn
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 .../selftests/bpf/prog_tests/get_func_args_test.c  |  3 ++
 .../selftests/bpf/progs/get_func_args_test.c       | 44 ++++++++++++++++++++++
 .../selftests/bpf/test_kmods/bpf_testmod-events.h  | 10 +++++
 .../testing/selftests/bpf/test_kmods/bpf_testmod.c |  4 ++
 4 files changed, 61 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/get_func_args_test.c b/tools/testing/selftests/bpf/prog_tests/get_func_args_test.c
index 64a9c95d4acf..fadee95d3ae8 100644
--- a/tools/testing/selftests/bpf/prog_tests/get_func_args_test.c
+++ b/tools/testing/selftests/bpf/prog_tests/get_func_args_test.c
@@ -33,11 +33,14 @@ void test_get_func_args_test(void)
 
 	ASSERT_EQ(topts.retval >> 16, 1, "test_run");
 	ASSERT_EQ(topts.retval & 0xffff, 1234 + 29, "test_run");
+	ASSERT_OK(trigger_module_test_read(1), "trigger_read");
 
 	ASSERT_EQ(skel->bss->test1_result, 1, "test1_result");
 	ASSERT_EQ(skel->bss->test2_result, 1, "test2_result");
 	ASSERT_EQ(skel->bss->test3_result, 1, "test3_result");
 	ASSERT_EQ(skel->bss->test4_result, 1, "test4_result");
+	ASSERT_EQ(skel->bss->test5_result, 1, "test5_result");
+	ASSERT_EQ(skel->bss->test6_result, 1, "test6_result");
 
 cleanup:
 	get_func_args_test__destroy(skel);
diff --git a/tools/testing/selftests/bpf/progs/get_func_args_test.c b/tools/testing/selftests/bpf/progs/get_func_args_test.c
index e0f34a55e697..5b7233afef05 100644
--- a/tools/testing/selftests/bpf/progs/get_func_args_test.c
+++ b/tools/testing/selftests/bpf/progs/get_func_args_test.c
@@ -121,3 +121,47 @@ int BPF_PROG(fexit_test, int _a, int *_b, int _ret)
 	test4_result &= err == 0 && ret == 1234;
 	return 0;
 }
+
+__u64 test5_result = 0;
+SEC("tp_btf/bpf_testmod_fentry_test1_tp")
+int BPF_PROG(tp_test1)
+{
+	__u64 cnt = bpf_get_func_arg_cnt(ctx);
+	__u64 a = 0, z = 0;
+	__s64 err;
+
+	test5_result = cnt == 1;
+
+	err = bpf_get_func_arg(ctx, 0, &a);
+	test5_result &= err == 0 && ((int) a == 1);
+
+	/* not valid argument */
+	err = bpf_get_func_arg(ctx, 1, &z);
+	test5_result &= err == -EINVAL;
+
+	return 0;
+}
+
+__u64 test6_result = 0;
+SEC("tp_btf/bpf_testmod_fentry_test2_tp")
+int BPF_PROG(tp_test2)
+{
+	__u64 cnt = bpf_get_func_arg_cnt(ctx);
+	__u64 a = 0, b = 0, z = 0;
+	__s64 err;
+
+	test6_result = cnt == 2;
+
+	/* valid arguments */
+	err = bpf_get_func_arg(ctx, 0, &a);
+	test6_result &= err == 0 && (int) a == 2;
+
+	err = bpf_get_func_arg(ctx, 1, &b);
+	test6_result &= err == 0 && b == 3;
+
+	/* not valid argument */
+	err = bpf_get_func_arg(ctx, 2, &z);
+	test6_result &= err == -EINVAL;
+
+	return 0;
+}
diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod-events.h b/tools/testing/selftests/bpf/test_kmods/bpf_testmod-events.h
index aeef86b3da74..45a5e41f3a92 100644
--- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod-events.h
+++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod-events.h
@@ -63,6 +63,16 @@ BPF_TESTMOD_DECLARE_TRACE(bpf_testmod_test_writable_bare,
 	sizeof(struct bpf_testmod_test_writable_ctx)
 );
 
+DECLARE_TRACE(bpf_testmod_fentry_test1,
+	TP_PROTO(int a),
+	TP_ARGS(a)
+);
+
+DECLARE_TRACE(bpf_testmod_fentry_test2,
+	TP_PROTO(int a, u64 b),
+	TP_ARGS(a, b)
+);
+
 #endif /* _BPF_TESTMOD_EVENTS_H */
 
 #undef TRACE_INCLUDE_PATH
diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c
index d425034b72d3..77a81fa8ec6a 100644
--- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c
+++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c
@@ -412,11 +412,15 @@ __weak noinline struct file *bpf_testmod_return_ptr(int arg)
 
 noinline int bpf_testmod_fentry_test1(int a)
 {
+	trace_bpf_testmod_fentry_test1_tp(a);
+
 	return a + 1;
 }
 
 noinline int bpf_testmod_fentry_test2(int a, u64 b)
 {
+	trace_bpf_testmod_fentry_test2_tp(a, b);
+
 	return a + b;
 }
 
-- 
cgit v1.2.3


From f4924ad0b13fd4ca4f0c7117dc143bf372224aec Mon Sep 17 00:00:00 2001
From: Yuzuki Ishiyama <ishiyama@hpc.is.uec.ac.jp>
Date: Wed, 21 Jan 2026 12:33:28 +0900
Subject: selftests/bpf: Test kfunc bpf_strncasecmp

Add testsuites for kfunc bpf_strncasecmp.

Signed-off-by: Yuzuki Ishiyama <ishiyama@hpc.is.uec.ac.jp>
Acked-by: Viktor Malik <vmalik@redhat.com>
Link: https://lore.kernel.org/r/20260121033328.1850010-3-ishiyama@hpc.is.uec.ac.jp
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/prog_tests/string_kfuncs.c     | 1 +
 tools/testing/selftests/bpf/progs/string_kfuncs_failure1.c | 6 ++++++
 tools/testing/selftests/bpf/progs/string_kfuncs_failure2.c | 1 +
 tools/testing/selftests/bpf/progs/string_kfuncs_success.c  | 7 +++++++
 4 files changed, 15 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/string_kfuncs.c b/tools/testing/selftests/bpf/prog_tests/string_kfuncs.c
index 0f3bf594e7a5..300032a19445 100644
--- a/tools/testing/selftests/bpf/prog_tests/string_kfuncs.c
+++ b/tools/testing/selftests/bpf/prog_tests/string_kfuncs.c
@@ -9,6 +9,7 @@
 static const char * const test_cases[] = {
 	"strcmp",
 	"strcasecmp",
+	"strncasecmp",
 	"strchr",
 	"strchrnul",
 	"strnchr",
diff --git a/tools/testing/selftests/bpf/progs/string_kfuncs_failure1.c b/tools/testing/selftests/bpf/progs/string_kfuncs_failure1.c
index 826e6b6aff7e..bddc4e8579d2 100644
--- a/tools/testing/selftests/bpf/progs/string_kfuncs_failure1.c
+++ b/tools/testing/selftests/bpf/progs/string_kfuncs_failure1.c
@@ -33,6 +33,8 @@ SEC("syscall") __retval(USER_PTR_ERR) int test_strcmp_null1(void *ctx) { return
 SEC("syscall")  __retval(USER_PTR_ERR)int test_strcmp_null2(void *ctx) { return bpf_strcmp("hello", NULL); }
 SEC("syscall") __retval(USER_PTR_ERR) int test_strcasecmp_null1(void *ctx) { return bpf_strcasecmp(NULL, "HELLO"); }
 SEC("syscall")  __retval(USER_PTR_ERR)int test_strcasecmp_null2(void *ctx) { return bpf_strcasecmp("HELLO", NULL); }
+SEC("syscall") __retval(USER_PTR_ERR)int test_strncasecmp_null1(void *ctx) { return bpf_strncasecmp(NULL, "HELLO", 5); }
+SEC("syscall")  __retval(USER_PTR_ERR)int test_strncasecmp_null2(void *ctx) { return bpf_strncasecmp("HELLO", NULL, 5);	 }
 SEC("syscall")  __retval(USER_PTR_ERR)int test_strchr_null(void *ctx) { return bpf_strchr(NULL, 'a'); }
 SEC("syscall")  __retval(USER_PTR_ERR)int test_strchrnul_null(void *ctx) { return bpf_strchrnul(NULL, 'a'); }
 SEC("syscall")  __retval(USER_PTR_ERR)int test_strnchr_null(void *ctx) { return bpf_strnchr(NULL, 1, 'a'); }
@@ -57,6 +59,8 @@ SEC("syscall") __retval(USER_PTR_ERR) int test_strcmp_user_ptr1(void *ctx) { ret
 SEC("syscall") __retval(USER_PTR_ERR) int test_strcmp_user_ptr2(void *ctx) { return bpf_strcmp("hello", user_ptr); }
 SEC("syscall") __retval(USER_PTR_ERR) int test_strcasecmp_user_ptr1(void *ctx) { return bpf_strcasecmp(user_ptr, "HELLO"); }
 SEC("syscall") __retval(USER_PTR_ERR) int test_strcasecmp_user_ptr2(void *ctx) { return bpf_strcasecmp("HELLO", user_ptr); }
+SEC("syscall") __retval(USER_PTR_ERR) int test_strncasecmp_user_ptr1(void *ctx) { return bpf_strncasecmp(user_ptr, "HELLO", 5); }
+SEC("syscall") __retval(USER_PTR_ERR) int test_strncasecmp_user_ptr2(void *ctx) { return bpf_strncasecmp("HELLO", user_ptr, 5);	 }
 SEC("syscall") __retval(USER_PTR_ERR) int test_strchr_user_ptr(void *ctx) { return bpf_strchr(user_ptr, 'a'); }
 SEC("syscall") __retval(USER_PTR_ERR) int test_strchrnul_user_ptr(void *ctx) { return bpf_strchrnul(user_ptr, 'a'); }
 SEC("syscall") __retval(USER_PTR_ERR) int test_strnchr_user_ptr(void *ctx) { return bpf_strnchr(user_ptr, 1, 'a'); }
@@ -83,6 +87,8 @@ SEC("syscall") __retval(-EFAULT) int test_strcmp_pagefault1(void *ctx) { return
 SEC("syscall") __retval(-EFAULT) int test_strcmp_pagefault2(void *ctx) { return bpf_strcmp("hello", invalid_kern_ptr); }
 SEC("syscall") __retval(-EFAULT) int test_strcasecmp_pagefault1(void *ctx) { return bpf_strcasecmp(invalid_kern_ptr, "HELLO"); }
 SEC("syscall") __retval(-EFAULT) int test_strcasecmp_pagefault2(void *ctx) { return bpf_strcasecmp("HELLO", invalid_kern_ptr); }
+SEC("syscall") __retval(-EFAULT) int test_strncasecmp_pagefault1(void *ctx) { return bpf_strncasecmp(invalid_kern_ptr, "HELLO", 5); }
+SEC("syscall") __retval(-EFAULT) int test_strncasecmp_pagefault2(void *ctx) { return bpf_strncasecmp("HELLO", invalid_kern_ptr, 5);	 }
 SEC("syscall") __retval(-EFAULT) int test_strchr_pagefault(void *ctx) { return bpf_strchr(invalid_kern_ptr, 'a'); }
 SEC("syscall") __retval(-EFAULT) int test_strchrnul_pagefault(void *ctx) { return bpf_strchrnul(invalid_kern_ptr, 'a'); }
 SEC("syscall") __retval(-EFAULT) int test_strnchr_pagefault(void *ctx) { return bpf_strnchr(invalid_kern_ptr, 1, 'a'); }
diff --git a/tools/testing/selftests/bpf/progs/string_kfuncs_failure2.c b/tools/testing/selftests/bpf/progs/string_kfuncs_failure2.c
index 05e1da1f250f..412c53b87b18 100644
--- a/tools/testing/selftests/bpf/progs/string_kfuncs_failure2.c
+++ b/tools/testing/selftests/bpf/progs/string_kfuncs_failure2.c
@@ -8,6 +8,7 @@ char long_str[XATTR_SIZE_MAX + 1];
 
 SEC("syscall") int test_strcmp_too_long(void *ctx) { return bpf_strcmp(long_str, long_str); }
 SEC("syscall") int test_strcasecmp_too_long(void *ctx) { return bpf_strcasecmp(long_str, long_str); }
+SEC("syscall") int test_strncasecmp_too_long(void *ctx) { return bpf_strncasecmp(long_str, long_str, sizeof(long_str)); }
 SEC("syscall") int test_strchr_too_long(void *ctx) { return bpf_strchr(long_str, 'b'); }
 SEC("syscall") int test_strchrnul_too_long(void *ctx) { return bpf_strchrnul(long_str, 'b'); }
 SEC("syscall") int test_strnchr_too_long(void *ctx) { return bpf_strnchr(long_str, sizeof(long_str), 'b'); }
diff --git a/tools/testing/selftests/bpf/progs/string_kfuncs_success.c b/tools/testing/selftests/bpf/progs/string_kfuncs_success.c
index a8513964516b..f65b1226a81a 100644
--- a/tools/testing/selftests/bpf/progs/string_kfuncs_success.c
+++ b/tools/testing/selftests/bpf/progs/string_kfuncs_success.c
@@ -17,6 +17,13 @@ __test(0) int test_strcasecmp_eq2(void *ctx) { return bpf_strcasecmp(str, "HELLO
 __test(0) int test_strcasecmp_eq3(void *ctx) { return bpf_strcasecmp(str, "HELLO world"); }
 __test(1) int test_strcasecmp_neq1(void *ctx) { return bpf_strcasecmp(str, "hello"); }
 __test(1) int test_strcasecmp_neq2(void *ctx) { return bpf_strcasecmp(str, "HELLO"); }
+__test(0) int test_strncasecmp_eq1(void *ctx) { return bpf_strncasecmp(str, "hello world", 11); }
+__test(0) int test_strncasecmp_eq2(void *ctx) { return bpf_strncasecmp(str, "HELLO WORLD", 11); }
+__test(0) int test_strncasecmp_eq3(void *ctx) { return bpf_strncasecmp(str, "HELLO world", 11); }
+__test(0) int test_strncasecmp_eq4(void *ctx) { return bpf_strncasecmp(str, "hello", 5); }
+__test(0) int test_strncasecmp_eq5(void *ctx) { return bpf_strncasecmp(str, "hello world!", 11); }
+__test(-1) int test_strncasecmp_neq1(void *ctx) { return bpf_strncasecmp(str, "hello!", 6); }
+__test(1) int test_strncasecmp_neq2(void *ctx) { return bpf_strncasecmp(str, "abc", 3); }
 __test(1) int test_strchr_found(void *ctx) { return bpf_strchr(str, 'e'); }
 __test(11) int test_strchr_null(void *ctx) { return bpf_strchr(str, '\0'); }
 __test(-ENOENT) int test_strchr_notfound(void *ctx) { return bpf_strchr(str, 'x'); }
-- 
cgit v1.2.3


From 6d6ad32e22f028c525d5df471c5522616e645a6b Mon Sep 17 00:00:00 2001
From: Ziyu Chen <chenziyu@uniontech.com>
Date: Wed, 21 Jan 2026 17:41:47 +0800
Subject: selftests/pidfd: fix typo in comment
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix the typo "untill" → "until" in a comment in pidfd_info_test.c.

This typo is already listed in scripts/spelling.txt by commit
66b47b4a9dad ("checkpatch: look for common misspellings").

Link: https://lore.kernel.org/r/20260121094147.4187337-1-chenziyu@uniontech.com
Suggested-by: Cryolitia PukNgae <cryolitia@uniontech.com>
Signed-off-by: Ziyu Chen <chenziyu@uniontech.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/pidfd/pidfd_info_test.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/pidfd/pidfd_info_test.c b/tools/testing/selftests/pidfd/pidfd_info_test.c
index 6571e04acd88..8bed951e06a0 100644
--- a/tools/testing/selftests/pidfd/pidfd_info_test.c
+++ b/tools/testing/selftests/pidfd/pidfd_info_test.c
@@ -229,7 +229,7 @@ static void *pidfd_info_pause_thread(void *arg)
 
 	close(ipc_socket);
 
-	/* Sleep untill we're killed. */
+	/* Sleep until we're killed. */
 	pause();
 	return NULL;
 }
-- 
cgit v1.2.3


From 3f2de814c0597c97d5abe09a1635d8c4e2fddaf2 Mon Sep 17 00:00:00 2001
From: Sasha Levin <sashal@kernel.org>
Date: Wed, 21 Jan 2026 11:25:32 -0500
Subject: objtool: Fix libopcodes linking with static libraries

Commit 436326bc525d ("objtool: fix build failure due to missing libopcodes
check") tests for libopcodes using an empty main(), which passes even when
static libraries lack their dependencies. This causes undefined reference
errors (xmalloc, bfd_get_bits, etc.) when linking against static libopcodes
without its required libbfd and libiberty.

Fix by testing with an actual libopcodes symbol and trying increasingly
complete library combinations until one succeeds.

Fixes: 436326bc525d ("objtool: fix build failure due to missing libopcodes check")
Reported-by: Rafael J. Wysocki <rafael@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Rafael J. Wysocki (Intel) <rafael@kernel.org>
Link: https://patch.msgid.link/20260121162532.1596238-1-sashal@kernel.org
---
 tools/objtool/Makefile | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/objtool/Makefile b/tools/objtool/Makefile
index 9b4503113ce5..a40f30232929 100644
--- a/tools/objtool/Makefile
+++ b/tools/objtool/Makefile
@@ -77,8 +77,21 @@ HOST_OVERRIDES := CC="$(HOSTCC)" LD="$(HOSTLD)" AR="$(HOSTAR)"
 # We check using HOSTCC directly rather than the shared feature framework
 # because objtool is a host tool that links against host libraries.
 #
-HAVE_LIBOPCODES := $(shell echo 'int main(void) { return 0; }' | \
-			$(HOSTCC) -xc - -o /dev/null -lopcodes 2>/dev/null && echo y)
+# When using shared libraries, -lopcodes is sufficient as dependencies are
+# resolved automatically. With static libraries, we must explicitly link
+# against libopcodes' dependencies: libbfd, libiberty, and sometimes libz.
+# Try each combination and use the first one that succeeds.
+#
+LIBOPCODES_LIBS := $(shell \
+	for libs in "-lopcodes" \
+		    "-lopcodes -lbfd" \
+		    "-lopcodes -lbfd -liberty" \
+		    "-lopcodes -lbfd -liberty -lz"; do \
+		echo 'extern void disassemble_init_for_target(void *);' \
+		     'int main(void) { disassemble_init_for_target(0); return 0; }' | \
+			$(HOSTCC) -xc - -o /dev/null $$libs 2>/dev/null && \
+			echo "$$libs" && break; \
+	done)
 
 # Styled disassembler support requires binutils >= 2.39
 HAVE_DISASM_STYLED := $(shell echo '$(pound)include <dis-asm.h>' | \
@@ -86,10 +99,10 @@ HAVE_DISASM_STYLED := $(shell echo '$(pound)include <dis-asm.h>' | \
 
 BUILD_DISAS := n
 
-ifeq ($(HAVE_LIBOPCODES),y)
+ifneq ($(LIBOPCODES_LIBS),)
 	BUILD_DISAS := y
 	OBJTOOL_CFLAGS += -DDISAS -DPACKAGE='"objtool"'
-	OBJTOOL_LDFLAGS += -lopcodes
+	OBJTOOL_LDFLAGS += $(LIBOPCODES_LIBS)
 ifeq ($(HAVE_DISASM_STYLED),y)
 	OBJTOOL_CFLAGS += -DDISASM_INIT_STYLED
 endif
-- 
cgit v1.2.3


From a32ae2658471dd87a2f7a438388ed7d9a5767212 Mon Sep 17 00:00:00 2001
From: Kery Qi <qikeyu2017@gmail.com>
Date: Wed, 21 Jan 2026 17:41:16 +0800
Subject: selftests/bpf: Fix resource leak in serial_test_wq on attach failure

When wq__attach() fails, serial_test_wq() returns early without calling
wq__destroy(), leaking the skeleton resources allocated by
wq__open_and_load(). This causes ASAN leak reports in selftests runs.

Fix this by jumping to a common clean_up label that calls wq__destroy()
on all exit paths after successful open_and_load.

Note that the early return after wq__open_and_load() failure is correct
and doesn't need fixing, since that function returns NULL on failure
(after internally cleaning up any partial allocations).

Fixes: 8290dba51910 ("selftests/bpf: wq: add bpf_wq_start() checks")
Signed-off-by: Kery Qi <qikeyu2017@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/bpf/20260121094114.1801-3-qikeyu2017@gmail.com
---
 tools/testing/selftests/bpf/prog_tests/wq.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/wq.c b/tools/testing/selftests/bpf/prog_tests/wq.c
index 15c67d23128b..84831eecc935 100644
--- a/tools/testing/selftests/bpf/prog_tests/wq.c
+++ b/tools/testing/selftests/bpf/prog_tests/wq.c
@@ -16,12 +16,12 @@ void serial_test_wq(void)
 	/* re-run the success test to check if the timer was actually executed */
 
 	wq_skel = wq__open_and_load();
-	if (!ASSERT_OK_PTR(wq_skel, "wq_skel_load"))
+	if (!ASSERT_OK_PTR(wq_skel, "wq__open_and_load"))
 		return;
 
 	err = wq__attach(wq_skel);
 	if (!ASSERT_OK(err, "wq_attach"))
-		return;
+		goto clean_up;
 
 	prog_fd = bpf_program__fd(wq_skel->progs.test_syscall_array_sleepable);
 	err = bpf_prog_test_run_opts(prog_fd, &topts);
@@ -31,6 +31,7 @@ void serial_test_wq(void)
 	usleep(50); /* 10 usecs should be enough, but give it extra */
 
 	ASSERT_EQ(wq_skel->bss->ok_sleepable, (1 << 1), "ok_sleepable");
+clean_up:
 	wq__destroy(wq_skel);
 }
 
-- 
cgit v1.2.3


From cb68cba4453d3e021b27c2a08fcefdd1376a5ef0 Mon Sep 17 00:00:00 2001
From: Swapnil Sapkal <swapnil.sapkal@amd.com>
Date: Mon, 19 Jan 2026 17:58:23 +0000
Subject: tools/lib: Add list_is_first()

Add list_is_first() to check whether @list is the first entry in list @head

Signed-off-by: Swapnil Sapkal <swapnil.sapkal@amd.com>
Tested-by: Chen Yu <yu.c.chen@intel.com>
Acked-by: Ian Rogers <irogers@google.com>
Acked-by: Namhyung Kim <namhyung@kernel.org>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Anubhav Shelat <ashelat@redhat.com>
Cc: Ben Gainey <ben.gainey@arm.com>
Cc: Blake Jones <blakejones@google.com>
Cc: Chun-Tse Shao <ctshao@google.com>
Cc: David Vernet <void@manifault.com>
Cc: Dmitriy Vyukov <dvyukov@google.com>
Cc: Dr. David Alan Gilbert <linux@treblig.org>
Cc: Gautham Shenoy <gautham.shenoy@amd.com>
Cc: Graham Woodward <graham.woodward@arm.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@arm.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: K Prateek Nayak <kprateek.nayak@amd.com>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Leo Yan <leo.yan@arm.com>
Cc: Madadi Vineeth Reddy <vineethr@linux.ibm.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Ravi Bangoria <ravi.bangoria@amd.com>
Cc: Sandipan Das <sandipan.das@amd.com>
Cc: Santosh Shukla <santosh.shukla@amd.com>
Cc: Shrikanth Hegde <sshegde@linux.ibm.com>
Cc: Steven Rostedt (VMware) <rostedt@goodmis.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Thomas Falcon <thomas.falcon@intel.com>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Yang Jihong <yangjihong@bytedance.com>
Cc: Yujie Liu <yujie.liu@intel.com>
Cc: Zhongqiu Han <quic_zhonhan@quicinc.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/include/linux/list.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'tools')

diff --git a/tools/include/linux/list.h b/tools/include/linux/list.h
index a4dfb6a7cc6a..a692ff7aed5c 100644
--- a/tools/include/linux/list.h
+++ b/tools/include/linux/list.h
@@ -169,6 +169,16 @@ static inline void list_move_tail(struct list_head *list,
 	list_add_tail(list, head);
 }
 
+/**
+ * list_is_first -- tests whether @list is the first entry in list @head
+ * @list: the entry to test
+ * @head: the head of the list
+ */
+static inline int list_is_first(const struct list_head *list, const struct list_head *head)
+{
+	return list->prev == head;
+}
+
 /**
  * list_is_last - tests whether @list is the last entry in list @head
  * @list: the entry to test
-- 
cgit v1.2.3


From d40c68a49f69c9bdb4ca14b3e6a0422bbaeb5d8f Mon Sep 17 00:00:00 2001
From: Swapnil Sapkal <swapnil.sapkal@amd.com>
Date: Mon, 19 Jan 2026 17:58:24 +0000
Subject: perf header: Support CPU DOMAIN relation info

The '/proc/schedstat' file gives info about load balancing statistics
within a given domain.

It also contains the cpu_mask giving information about the sibling cpus
and domain names after schedstat version 17.

Storing this information in perf header will help tools like `perf sched
stats` for better analysis.

Signed-off-by: Swapnil Sapkal <swapnil.sapkal@amd.com>
Tested-by: Chen Yu <yu.c.chen@intel.com>
Acked-by: Ian Rogers <irogers@google.com>
Acked-by: Namhyung Kim <namhyung@kernel.org>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Anubhav Shelat <ashelat@redhat.com>
Cc: Ben Gainey <ben.gainey@arm.com>
Cc: Blake Jones <blakejones@google.com>
Cc: Chun-Tse Shao <ctshao@google.com>
Cc: David Vernet <void@manifault.com>
Cc: Dmitriy Vyukov <dvyukov@google.com>
Cc: Dr. David Alan Gilbert <linux@treblig.org>
Cc: Gautham Shenoy <gautham.shenoy@amd.com>
Cc: Graham Woodward <graham.woodward@arm.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@arm.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: K Prateek Nayak <kprateek.nayak@amd.com>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Leo Yan <leo.yan@arm.com>
Cc: Madadi Vineeth Reddy <vineethr@linux.ibm.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Ravi Bangoria <ravi.bangoria@amd.com>
Cc: Sandipan Das <sandipan.das@amd.com>
Cc: Santosh Shukla <santosh.shukla@amd.com>
Cc: Shrikanth Hegde <sshegde@linux.ibm.com>
Cc: Steven Rostedt (VMware) <rostedt@goodmis.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Thomas Falcon <thomas.falcon@intel.com>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Yang Jihong <yangjihong@bytedance.com>
Cc: Yujie Liu <yujie.liu@intel.com>
Cc: Zhongqiu Han <quic_zhonhan@quicinc.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/Documentation/perf.data-file-format.txt |  17 ++
 tools/perf/builtin-inject.c                        |   1 +
 tools/perf/util/env.c                              |  29 +++
 tools/perf/util/env.h                              |  17 ++
 tools/perf/util/header.c                           | 286 +++++++++++++++++++++
 tools/perf/util/header.h                           |   1 +
 tools/perf/util/util.c                             |  42 +++
 tools/perf/util/util.h                             |   3 +
 8 files changed, 396 insertions(+)

(limited to 'tools')

diff --git a/tools/perf/Documentation/perf.data-file-format.txt b/tools/perf/Documentation/perf.data-file-format.txt
index c9d4dec65344..0e4d0ecc9e12 100644
--- a/tools/perf/Documentation/perf.data-file-format.txt
+++ b/tools/perf/Documentation/perf.data-file-format.txt
@@ -447,6 +447,23 @@ struct {
 	} [nr_pmu];
 };
 
+	HEADER_CPU_DOMAIN_INFO = 32,
+
+List of cpu-domain relation info. The format of the data is as below.
+
+struct domain_info {
+	int domain;
+	char dname[];
+	char cpumask[];
+	char cpulist[];
+};
+
+struct cpu_domain_info {
+	int cpu;
+	int nr_domains;
+	struct domain_info domains[];
+};
+
 	other bits are reserved and should ignored for now
 	HEADER_FEAT_BITS	= 256,
 
diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c
index e2a653280e1b..c89ac85ec112 100644
--- a/tools/perf/builtin-inject.c
+++ b/tools/perf/builtin-inject.c
@@ -2133,6 +2133,7 @@ static bool keep_feat(struct perf_inject *inject, int feat)
 	case HEADER_CLOCK_DATA:
 	case HEADER_HYBRID_TOPOLOGY:
 	case HEADER_PMU_CAPS:
+	case HEADER_CPU_DOMAIN_INFO:
 		return true;
 	/* Information that can be updated */
 	case HEADER_BUILD_ID:
diff --git a/tools/perf/util/env.c b/tools/perf/util/env.c
index f1626d2032cd..93d475a80f14 100644
--- a/tools/perf/util/env.c
+++ b/tools/perf/util/env.c
@@ -216,6 +216,34 @@ static void perf_env__purge_bpf(struct perf_env *env __maybe_unused)
 }
 #endif // HAVE_LIBBPF_SUPPORT
 
+void free_cpu_domain_info(struct cpu_domain_map **cd_map, u32 schedstat_version, u32 nr)
+{
+	if (!cd_map)
+		return;
+
+	for (u32 i = 0; i < nr; i++) {
+		if (!cd_map[i])
+			continue;
+
+		for (u32 j = 0; j < cd_map[i]->nr_domains; j++) {
+			struct domain_info *d_info = cd_map[i]->domains[j];
+
+			if (!d_info)
+				continue;
+
+			if (schedstat_version >= 17)
+				zfree(&d_info->dname);
+
+			zfree(&d_info->cpumask);
+			zfree(&d_info->cpulist);
+			zfree(&d_info);
+		}
+		zfree(&cd_map[i]->domains);
+		zfree(&cd_map[i]);
+	}
+	zfree(&cd_map);
+}
+
 void perf_env__exit(struct perf_env *env)
 {
 	int i, j;
@@ -265,6 +293,7 @@ void perf_env__exit(struct perf_env *env)
 		zfree(&env->pmu_caps[i].pmu_name);
 	}
 	zfree(&env->pmu_caps);
+	free_cpu_domain_info(env->cpu_domain, env->schedstat_version, env->nr_cpus_avail);
 }
 
 void perf_env__init(struct perf_env *env)
diff --git a/tools/perf/util/env.h b/tools/perf/util/env.h
index 9977b85523a8..76ba1a36e9ff 100644
--- a/tools/perf/util/env.h
+++ b/tools/perf/util/env.h
@@ -54,6 +54,19 @@ struct pmu_caps {
 	char            *pmu_name;
 };
 
+struct domain_info {
+	u32	domain;
+	char	*dname;
+	char	*cpumask;
+	char	*cpulist;
+};
+
+struct cpu_domain_map {
+	u32			cpu;
+	u32			nr_domains;
+	struct domain_info	**domains;
+};
+
 typedef const char *(arch_syscalls__strerrno_t)(int err);
 
 struct perf_env {
@@ -70,6 +83,8 @@ struct perf_env {
 	unsigned int		max_branches;
 	unsigned int		br_cntr_nr;
 	unsigned int		br_cntr_width;
+	unsigned int		schedstat_version;
+	unsigned int		max_sched_domains;
 	int			kernel_is_64_bit;
 
 	int			nr_cmdline;
@@ -92,6 +107,7 @@ struct perf_env {
 	char			**cpu_pmu_caps;
 	struct cpu_topology_map	*cpu;
 	struct cpu_cache_level	*caches;
+	struct cpu_domain_map	**cpu_domain;
 	int			 caches_cnt;
 	u32			comp_ratio;
 	u32			comp_ver;
@@ -151,6 +167,7 @@ struct bpf_prog_info_node;
 struct btf_node;
 
 int perf_env__read_core_pmu_caps(struct perf_env *env);
+void free_cpu_domain_info(struct cpu_domain_map **cd_map, u32 schedstat_version, u32 nr);
 void perf_env__exit(struct perf_env *env);
 
 int perf_env__kernel_is_64_bit(struct perf_env *env);
diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index f5cad377c99e..673d53bb2a2c 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -1614,6 +1614,162 @@ static int write_pmu_caps(struct feat_fd *ff,
 	return 0;
 }
 
+static struct cpu_domain_map **build_cpu_domain_map(u32 *schedstat_version, u32 *max_sched_domains,
+						    u32 nr)
+{
+	struct domain_info *domain_info;
+	struct cpu_domain_map **cd_map;
+	char dname[16], cpumask[256];
+	char cpulist[1024];
+	char *line = NULL;
+	u32 cpu, domain;
+	u32 dcount = 0;
+	size_t len;
+	FILE *fp;
+
+	fp = fopen("/proc/schedstat", "r");
+	if (!fp) {
+		pr_err("Failed to open /proc/schedstat\n");
+		return NULL;
+	}
+
+	cd_map = zalloc(sizeof(*cd_map) * nr);
+	if (!cd_map)
+		goto out;
+
+	while (getline(&line, &len, fp) > 0) {
+		int retval;
+
+		if (strncmp(line, "version", 7) == 0) {
+			retval = sscanf(line, "version %d\n", schedstat_version);
+			if (retval != 1)
+				continue;
+
+		} else if (strncmp(line, "cpu", 3) == 0) {
+			retval = sscanf(line, "cpu%u %*s", &cpu);
+			if (retval == 1) {
+				cd_map[cpu] = zalloc(sizeof(*cd_map[cpu]));
+				if (!cd_map[cpu])
+					goto out_free_line;
+				cd_map[cpu]->cpu = cpu;
+			} else
+				continue;
+
+			dcount = 0;
+		} else if (strncmp(line, "domain", 6) == 0) {
+			struct domain_info **temp_domains;
+
+			dcount++;
+			temp_domains = realloc(cd_map[cpu]->domains, dcount * sizeof(domain_info));
+			if (!temp_domains)
+				goto out_free_line;
+			else
+				cd_map[cpu]->domains = temp_domains;
+
+			domain_info = zalloc(sizeof(*domain_info));
+			if (!domain_info)
+				goto out_free_line;
+
+			cd_map[cpu]->domains[dcount - 1] = domain_info;
+
+			if (*schedstat_version >= 17) {
+				retval = sscanf(line, "domain%u %s %s %*s", &domain, dname,
+						cpumask);
+				if (retval != 3)
+					continue;
+
+				domain_info->dname = strdup(dname);
+				if (!domain_info->dname)
+					goto out_free_line;
+			} else {
+				retval = sscanf(line, "domain%u %s %*s", &domain, cpumask);
+				if (retval != 2)
+					continue;
+			}
+
+			domain_info->domain = domain;
+			if (domain > *max_sched_domains)
+				*max_sched_domains = domain;
+
+			domain_info->cpumask = strdup(cpumask);
+			if (!domain_info->cpumask)
+				goto out_free_line;
+
+			cpumask_to_cpulist(cpumask, cpulist);
+			domain_info->cpulist = strdup(cpulist);
+			if (!domain_info->cpulist)
+				goto out_free_line;
+
+			cd_map[cpu]->nr_domains = dcount;
+		}
+	}
+
+out_free_line:
+	free(line);
+out:
+	fclose(fp);
+	return cd_map;
+}
+
+static int write_cpu_domain_info(struct feat_fd *ff,
+				 struct evlist *evlist __maybe_unused)
+{
+	u32 max_sched_domains = 0, schedstat_version = 0;
+	struct cpu_domain_map **cd_map;
+	u32 i, j, nr, ret;
+
+	nr = cpu__max_present_cpu().cpu;
+
+	cd_map = build_cpu_domain_map(&schedstat_version, &max_sched_domains, nr);
+	if (!cd_map)
+		return -1;
+
+	ret = do_write(ff, &schedstat_version, sizeof(u32));
+	if (ret < 0)
+		goto out;
+
+	max_sched_domains += 1;
+	ret = do_write(ff, &max_sched_domains, sizeof(u32));
+	if (ret < 0)
+		goto out;
+
+	for (i = 0; i < nr; i++) {
+		if (!cd_map[i])
+			continue;
+
+		ret = do_write(ff, &cd_map[i]->cpu, sizeof(u32));
+		if (ret < 0)
+			goto out;
+
+		ret = do_write(ff, &cd_map[i]->nr_domains, sizeof(u32));
+		if (ret < 0)
+			goto out;
+
+		for (j = 0; j < cd_map[i]->nr_domains; j++) {
+			ret = do_write(ff, &cd_map[i]->domains[j]->domain, sizeof(u32));
+			if (ret < 0)
+				goto out;
+			if (schedstat_version >= 17) {
+				ret = do_write_string(ff, cd_map[i]->domains[j]->dname);
+				if (ret < 0)
+					goto out;
+			}
+
+			ret = do_write_string(ff, cd_map[i]->domains[j]->cpumask);
+			if (ret < 0)
+				goto out;
+
+			ret = do_write_string(ff, cd_map[i]->domains[j]->cpulist);
+			if (ret < 0)
+				goto out;
+		}
+	}
+
+out:
+	free_cpu_domain_info(cd_map, schedstat_version, nr);
+	return ret;
+}
+
 static void print_hostname(struct feat_fd *ff, FILE *fp)
 {
 	fprintf(fp, "# hostname : %s\n", ff->ph->env.hostname);
@@ -2247,6 +2403,39 @@ static void print_mem_topology(struct feat_fd *ff, FILE *fp)
 	}
 }
 
+static void print_cpu_domain_info(struct feat_fd *ff, FILE *fp)
+{
+	struct cpu_domain_map **cd_map = ff->ph->env.cpu_domain;
+	u32 nr = ff->ph->env.nr_cpus_avail;
+	struct domain_info *d_info;
+	u32 i, j;
+
+	fprintf(fp, "# schedstat version	: %u\n", ff->ph->env.schedstat_version);
+	fprintf(fp, "# Maximum sched domains	: %u\n", ff->ph->env.max_sched_domains);
+
+	for (i = 0; i < nr; i++) {
+		if (!cd_map[i])
+			continue;
+
+		fprintf(fp, "# cpu		: %u\n", cd_map[i]->cpu);
+		fprintf(fp, "# nr_domains	: %u\n", cd_map[i]->nr_domains);
+
+		for (j = 0; j < cd_map[i]->nr_domains; j++) {
+			d_info = cd_map[i]->domains[j];
+			if (!d_info)
+				continue;
+
+			fprintf(fp, "# Domain		: %u\n", d_info->domain);
+
+			if (ff->ph->env.schedstat_version >= 17)
+				fprintf(fp, "# Domain name      : %s\n", d_info->dname);
+
+			fprintf(fp, "# Domain cpu map   : %s\n", d_info->cpumask);
+			fprintf(fp, "# Domain cpu list  : %s\n", d_info->cpulist);
+		}
+	}
+}
+
 static int __event_process_build_id(struct perf_record_header_build_id *bev,
 				    char *filename,
 				    struct perf_session *session)
@@ -3388,6 +3577,102 @@ err:
 	return ret;
 }
 
+static int process_cpu_domain_info(struct feat_fd *ff, void *data __maybe_unused)
+{
+	u32 schedstat_version, max_sched_domains, cpu, domain, nr_domains;
+	struct perf_env *env = &ff->ph->env;
+	char *dname, *cpumask, *cpulist;
+	struct cpu_domain_map **cd_map;
+	struct domain_info *d_info;
+	u32 nra, nr, i, j;
+	int ret;
+
+	nra = env->nr_cpus_avail;
+	nr = env->nr_cpus_online;
+
+	cd_map = zalloc(sizeof(*cd_map) * nra);
+	if (!cd_map)
+		return -1;
+
+	env->cpu_domain = cd_map;
+
+	ret = do_read_u32(ff, &schedstat_version);
+	if (ret)
+		return ret;
+
+	env->schedstat_version = schedstat_version;
+
+	ret = do_read_u32(ff, &max_sched_domains);
+	if (ret)
+		return ret;
+
+	env->max_sched_domains = max_sched_domains;
+
+	for (i = 0; i < nr; i++) {
+		if (do_read_u32(ff, &cpu))
+			return -1;
+
+		cd_map[cpu] = zalloc(sizeof(*cd_map[cpu]));
+		if (!cd_map[cpu])
+			return -1;
+
+		cd_map[cpu]->cpu = cpu;
+
+		if (do_read_u32(ff, &nr_domains))
+			return -1;
+
+		cd_map[cpu]->nr_domains = nr_domains;
+
+		cd_map[cpu]->domains = zalloc(sizeof(*d_info) * max_sched_domains);
+		if (!cd_map[cpu]->domains)
+			return -1;
+
+		for (j = 0; j < nr_domains; j++) {
+			if (do_read_u32(ff, &domain))
+				return -1;
+
+			d_info = zalloc(sizeof(*d_info));
+			if (!d_info)
+				return -1;
+
+			cd_map[cpu]->domains[domain] = d_info;
+			d_info->domain = domain;
+
+			if (schedstat_version >= 17) {
+				dname = do_read_string(ff);
+				if (!dname)
+					return -1;
+
+				d_info->dname = zalloc(strlen(dname) + 1);
+				if (!d_info->dname)
+					return -1;
+
+				d_info->dname = strdup(dname);
+			}
+
+			cpumask = do_read_string(ff);
+			if (!cpumask)
+				return -1;
+
+			d_info->cpumask = zalloc(strlen(cpumask) + 1);
+			if (!d_info->cpumask)
+				return -1;
+			d_info->cpumask = strdup(cpumask);
+
+			cpulist = do_read_string(ff);
+			if (!cpulist)
+				return -1;
+
+			d_info->cpulist = zalloc(strlen(cpulist) + 1);
+			if (!d_info->cpulist)
+				return -1;
+			d_info->cpulist = strdup(cpulist);
+		}
+	}
+
+	return ret;
+}
+
 #define FEAT_OPR(n, func, __full_only) \
 	[HEADER_##n] = {					\
 		.name	    = __stringify(n),			\
@@ -3453,6 +3738,7 @@ const struct perf_header_feature_ops feat_ops[HEADER_LAST_FEATURE] = {
 	FEAT_OPR(CLOCK_DATA,	clock_data,	false),
 	FEAT_OPN(HYBRID_TOPOLOGY,	hybrid_topology,	true),
 	FEAT_OPR(PMU_CAPS,	pmu_caps,	false),
+	FEAT_OPR(CPU_DOMAIN_INFO,	cpu_domain_info,	true),
 };
 
 struct header_print_data {
diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h
index c058021c3150..c62f3275a80f 100644
--- a/tools/perf/util/header.h
+++ b/tools/perf/util/header.h
@@ -53,6 +53,7 @@ enum {
 	HEADER_CLOCK_DATA,
 	HEADER_HYBRID_TOPOLOGY,
 	HEADER_PMU_CAPS,
+	HEADER_CPU_DOMAIN_INFO,
 	HEADER_LAST_FEATURE,
 	HEADER_FEAT_BITS	= 256,
 };
diff --git a/tools/perf/util/util.c b/tools/perf/util/util.c
index 0f031eb80b4c..b87ff96a9f45 100644
--- a/tools/perf/util/util.c
+++ b/tools/perf/util/util.c
@@ -257,6 +257,48 @@ static int rm_rf_kcore_dir(const char *path)
 	return 0;
 }
 
+void cpumask_to_cpulist(char *cpumask, char *cpulist)
+{
+	int i, j, bm_size, nbits;
+	int len = strlen(cpumask);
+	unsigned long *bm;
+	char cpus[1024];
+
+	for (i = 0; i < len; i++) {
+		if (cpumask[i] == ',') {
+			for (j = i; j < len; j++)
+				cpumask[j] = cpumask[j + 1];
+		}
+	}
+
+	len = strlen(cpumask);
+	bm_size = (len + 15) / 16;
+	nbits = bm_size * 64;
+	if (nbits <= 0)
+		return;
+
+	bm = calloc(bm_size, sizeof(unsigned long));
+	if (!cpumask)
+		goto free_bm;
+
+	for (i = 0; i < bm_size; i++) {
+		char blk[17];
+		int blklen = len > 16 ? 16 : len;
+
+		strncpy(blk, cpumask + len - blklen, blklen);
+		blk[blklen] = '\0';
+		bm[i] = strtoul(blk, NULL, 16);
+		cpumask[len - blklen] = '\0';
+		len = strlen(cpumask);
+	}
+
+	bitmap_scnprintf(bm, nbits, cpus, sizeof(cpus));
+	strcpy(cpulist, cpus);
+
+free_bm:
+	free(bm);
+}
+
 int rm_rf_perf_data(const char *path)
 {
 	const char *pat[] = {
diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h
index 3423778e39a5..1572c8cf04e5 100644
--- a/tools/perf/util/util.h
+++ b/tools/perf/util/util.h
@@ -11,6 +11,7 @@
 #include <stdbool.h>
 #include <stddef.h>
 #include <linux/compiler.h>
+#include <linux/bitmap.h>
 #include <sys/types.h>
 #ifndef __cplusplus
 #include <internal/cpumap.h>
@@ -48,6 +49,8 @@ bool sysctl__nmi_watchdog_enabled(void);
 
 int perf_tip(char **strp, const char *dirpath);
 
+void cpumask_to_cpulist(char *cpumask, char *cpulist);
+
 #ifndef HAVE_SCHED_GETCPU_SUPPORT
 int sched_getcpu(void);
 #endif
-- 
cgit v1.2.3


From 6ecc08329bab2c87f579cf1a8ab7799d8d88d9bc Mon Sep 17 00:00:00 2001
From: Andre Carvalho <asantostc@gmail.com>
Date: Sun, 18 Jan 2026 11:00:27 +0000
Subject: selftests: netconsole: validate target resume

Introduce a new netconsole selftest to validate that netconsole is able
to resume a deactivated target when the low level interface comes back.

The test setups the network using netdevsim, creates a netconsole target
and then remove/add netdevsim in order to bring the same interfaces
back. Afterwards, the test validates that the target works as expected.

Targets are created via cmdline parameters to the module to ensure that
we are able to resume targets that were bound by mac and interface name.

Reviewed-by: Breno Leitao <leitao@debian.org>
Signed-off-by: Andre Carvalho <asantostc@gmail.com>
Tested-by: Breno Leitao <leitao@debian.org>
Link: https://patch.msgid.link/20260118-netcons-retrigger-v11-7-4de36aebcf48@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/drivers/net/Makefile       |   1 +
 .../selftests/drivers/net/lib/sh/lib_netcons.sh    |  35 +++++-
 .../selftests/drivers/net/netcons_resume.sh        | 124 +++++++++++++++++++++
 3 files changed, 155 insertions(+), 5 deletions(-)
 create mode 100755 tools/testing/selftests/drivers/net/netcons_resume.sh

(limited to 'tools')

diff --git a/tools/testing/selftests/drivers/net/Makefile b/tools/testing/selftests/drivers/net/Makefile
index f5c71d993750..3eba569b3366 100644
--- a/tools/testing/selftests/drivers/net/Makefile
+++ b/tools/testing/selftests/drivers/net/Makefile
@@ -19,6 +19,7 @@ TEST_PROGS := \
 	netcons_cmdline.sh \
 	netcons_fragmented_msg.sh \
 	netcons_overflow.sh \
+	netcons_resume.sh \
 	netcons_sysdata.sh \
 	netcons_torture.sh \
 	netpoll_basic.py \
diff --git a/tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh b/tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh
index ae8abff4be40..b6093bcf2b06 100644
--- a/tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh
+++ b/tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh
@@ -203,19 +203,21 @@ function do_cleanup() {
 function cleanup_netcons() {
 	# delete netconsole dynamic reconfiguration
 	# do not fail if the target is already disabled
-	if [[ ! -d "${NETCONS_PATH}" ]]
+	local TARGET_PATH=${1:-${NETCONS_PATH}}
+
+	if [[ ! -d "${TARGET_PATH}" ]]
 	then
 		# in some cases this is called before netcons path is created
 		return
 	fi
-	if [[ $(cat "${NETCONS_PATH}"/enabled) != 0 ]]
+	if [[ $(cat "${TARGET_PATH}"/enabled) != 0 ]]
 	then
-		echo 0 > "${NETCONS_PATH}"/enabled || true
+		echo 0 > "${TARGET_PATH}"/enabled || true
 	fi
 	# Remove all the keys that got created during the selftest
-	find "${NETCONS_PATH}/userdata/" -mindepth 1 -type d -delete
+	find "${TARGET_PATH}/userdata/" -mindepth 1 -type d -delete
 	# Remove the configfs entry
-	rmdir "${NETCONS_PATH}"
+	rmdir "${TARGET_PATH}"
 }
 
 function cleanup() {
@@ -377,6 +379,29 @@ function check_netconsole_module() {
 	fi
 }
 
+function wait_target_state() {
+	local TARGET=${1}
+	local STATE=${2}
+	local TARGET_PATH="${NETCONS_CONFIGFS}"/"${TARGET}"
+	local ENABLED=0
+
+	if [ "${STATE}" == "enabled" ]
+	then
+		ENABLED=1
+	fi
+
+	if [ ! -d "$TARGET_PATH" ]; then
+		echo "FAIL: Target does not exist." >&2
+		exit "${ksft_fail}"
+	fi
+
+	local CHECK_CMD="grep \"$ENABLED\" \"$TARGET_PATH/enabled\""
+	slowwait 2 sh -c "test -n \"\$($CHECK_CMD)\"" || {
+		echo "FAIL: ${TARGET} is not ${STATE}." >&2
+		exit "${ksft_fail}"
+	}
+}
+
 # A wrapper to translate protocol version to udp version
 function wait_for_port() {
 	local NAMESPACE=${1}
diff --git a/tools/testing/selftests/drivers/net/netcons_resume.sh b/tools/testing/selftests/drivers/net/netcons_resume.sh
new file mode 100755
index 000000000000..fc5e5e3ad3d4
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/netcons_resume.sh
@@ -0,0 +1,124 @@
+#!/usr/bin/env bash
+# SPDX-License-Identifier: GPL-2.0
+
+# This test validates that netconsole is able to resume a target that was
+# deactivated when its interface was removed when the interface is brought
+# back up.
+#
+# The test configures a netconsole target and then removes netdevsim module to
+# cause the interface to disappear. Targets are configured via cmdline to ensure
+# targets bound by interface name and mac address can be resumed.
+# The test verifies that the target moved to disabled state before adding
+# netdevsim and the interface back.
+#
+# Finally, the test verifies that the target is re-enabled automatically and
+# the message is received on the destination interface.
+#
+# Author: Andre Carvalho <asantostc@gmail.com>
+
+set -euo pipefail
+
+SCRIPTDIR=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")")
+
+source "${SCRIPTDIR}"/lib/sh/lib_netcons.sh
+
+SAVED_SRCMAC="" # to be populated later
+SAVED_DSTMAC="" # to be populated later
+
+modprobe netdevsim 2> /dev/null || true
+rmmod netconsole 2> /dev/null || true
+
+check_netconsole_module
+
+function cleanup() {
+	cleanup_netcons "${NETCONS_CONFIGFS}/cmdline0"
+	do_cleanup
+	rmmod netconsole
+}
+
+function trigger_reactivation() {
+	# Add back low level module
+	modprobe netdevsim
+	# Recreate namespace and two interfaces
+	set_network
+	# Restore MACs
+	ip netns exec "${NAMESPACE}" ip link set "${DSTIF}" \
+		address "${SAVED_DSTMAC}"
+	if [ "${BINDMODE}" == "mac" ]; then
+		ip link set dev "${SRCIF}" down
+		ip link set dev "${SRCIF}" address "${SAVED_SRCMAC}"
+		# Rename device in order to trigger target resume, as initial
+		# when device was recreated it didn't have correct mac address.
+		ip link set dev "${SRCIF}" name "${TARGET}"
+	fi
+}
+
+function trigger_deactivation() {
+	# Start by storing mac addresses so we can be restored in reactivate
+	SAVED_DSTMAC=$(ip netns exec "${NAMESPACE}" \
+		cat /sys/class/net/"$DSTIF"/address)
+	SAVED_SRCMAC=$(mac_get "${SRCIF}")
+	# Remove low level module
+	rmmod netdevsim
+}
+
+trap cleanup EXIT
+
+# Run the test twice, with different cmdline parameters
+for BINDMODE in "ifname" "mac"
+do
+	echo "Running with bind mode: ${BINDMODE}" >&2
+	# Set current loglevel to KERN_INFO(6), and default to KERN_NOTICE(5)
+	echo "6 5" > /proc/sys/kernel/printk
+
+	# Create one namespace and two interfaces
+	set_network
+
+	# Create the command line for netconsole, with the configuration from
+	# the function above
+	CMDLINE=$(create_cmdline_str "${BINDMODE}")
+
+	# The content of kmsg will be save to the following file
+	OUTPUT_FILE="/tmp/${TARGET}-${BINDMODE}"
+
+	# Load the module, with the cmdline set
+	modprobe netconsole "${CMDLINE}"
+	# Expose cmdline target in configfs
+	mkdir "${NETCONS_CONFIGFS}/cmdline0"
+
+	# Target should be enabled
+	wait_target_state "cmdline0" "enabled"
+
+	# Trigger deactivation by unloading netdevsim module. Target should be
+	# disabled.
+	trigger_deactivation
+	wait_target_state "cmdline0" "disabled"
+
+	# Trigger reactivation by loading netdevsim, recreating the network and
+	# restoring mac addresses. Target should be re-enabled.
+	trigger_reactivation
+	wait_target_state "cmdline0" "enabled"
+
+	# Listen for netconsole port inside the namespace and destination
+	# interface
+	listen_port_and_save_to "${OUTPUT_FILE}" &
+	# Wait for socat to start and listen to the port.
+	wait_local_port_listen "${NAMESPACE}" "${PORT}" udp
+	# Send the message
+	echo "${MSG}: ${TARGET}" > /dev/kmsg
+	# Wait until socat saves the file to disk
+	busywait "${BUSYWAIT_TIMEOUT}" test -s "${OUTPUT_FILE}"
+	# Make sure the message was received in the dst part
+	# and exit
+	validate_msg "${OUTPUT_FILE}"
+
+	# kill socat in case it is still running
+	pkill_socat
+	# Cleanup & unload the module
+	cleanup
+
+	echo "${BINDMODE} : Test passed" >&2
+done
+
+trap - EXIT
+exit "${EXIT_STATUS}"
-- 
cgit v1.2.3


From 04708606fd7bdc34b69089a4ff848ff36d7088f9 Mon Sep 17 00:00:00 2001
From: Taehee Yoo <ap420073@gmail.com>
Date: Tue, 20 Jan 2026 13:39:30 +0000
Subject: selftests: net: amt: wait longer for connection before sending
 packets

Both send_mcast4() and send_mcast6() use sleep 2 to wait for the tunnel
connection between the gateway and the relay, and for the listener
socket to be created in the LISTENER namespace.

However, tests sometimes fail because packets are sent before the
connection is fully established.

Increase the waiting time to make the tests more reliable, and use
wait_local_port_listen() to explicitly wait for the listener socket.

Fixes: c08e8baea78e ("selftests: add amt interface selftest script")
Signed-off-by: Taehee Yoo <ap420073@gmail.com>
Link: https://patch.msgid.link/20260120133930.863845-1-ap420073@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/amt.sh | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/amt.sh b/tools/testing/selftests/net/amt.sh
index 3ef209cacb8e..663744305e52 100755
--- a/tools/testing/selftests/net/amt.sh
+++ b/tools/testing/selftests/net/amt.sh
@@ -73,6 +73,8 @@
 #       +------------------------+
 #==============================================================================
 
+source lib.sh
+
 readonly LISTENER=$(mktemp -u listener-XXXXXXXX)
 readonly GATEWAY=$(mktemp -u gateway-XXXXXXXX)
 readonly RELAY=$(mktemp -u relay-XXXXXXXX)
@@ -246,14 +248,15 @@ test_ipv6_forward()
 
 send_mcast4()
 {
-	sleep 2
+	sleep 5
+	wait_local_port_listen ${LISTENER} 4000 udp
 	ip netns exec "${SOURCE}" bash -c \
 		'printf "%s %128s" 172.17.0.2 | nc -w 1 -u 239.0.0.1 4000' &
 }
 
 send_mcast6()
 {
-	sleep 2
+	wait_local_port_listen ${LISTENER} 6000 udp
 	ip netns exec "${SOURCE}" bash -c \
 		'printf "%s %128s" 2001:db8:3::2 | nc -w 1 -u ff0e::5:6 6000' &
 }
-- 
cgit v1.2.3


From 830969e7821af377bdc1bb016929ff28c78490e8 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 15 Dec 2025 17:52:34 +0100
Subject: selftests/rseq: Implement time slice extension test

Provide an initial test case to evaluate the functionality. This needs to be
extended to cover the ABI violations and expose the race condition between
observing granted and arriving in rseq_slice_yield().

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20251215155709.320325431@linutronix.de
---
 tools/testing/selftests/rseq/.gitignore   |   1 +
 tools/testing/selftests/rseq/Makefile     |   5 +-
 tools/testing/selftests/rseq/rseq-abi.h   |  27 ++++
 tools/testing/selftests/rseq/slice_test.c | 219 ++++++++++++++++++++++++++++++
 4 files changed, 251 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/rseq/slice_test.c

(limited to 'tools')

diff --git a/tools/testing/selftests/rseq/.gitignore b/tools/testing/selftests/rseq/.gitignore
index 0fda241fa62b..ec01d164c1f0 100644
--- a/tools/testing/selftests/rseq/.gitignore
+++ b/tools/testing/selftests/rseq/.gitignore
@@ -10,3 +10,4 @@ param_test_mm_cid
 param_test_mm_cid_benchmark
 param_test_mm_cid_compare_twice
 syscall_errors_test
+slice_test
diff --git a/tools/testing/selftests/rseq/Makefile b/tools/testing/selftests/rseq/Makefile
index 0d0a5fae5954..4ef90823b652 100644
--- a/tools/testing/selftests/rseq/Makefile
+++ b/tools/testing/selftests/rseq/Makefile
@@ -17,7 +17,7 @@ OVERRIDE_TARGETS = 1
 TEST_GEN_PROGS = basic_test basic_percpu_ops_test basic_percpu_ops_mm_cid_test param_test \
 		param_test_benchmark param_test_compare_twice param_test_mm_cid \
 		param_test_mm_cid_benchmark param_test_mm_cid_compare_twice \
-		syscall_errors_test
+		syscall_errors_test slice_test
 
 TEST_GEN_PROGS_EXTENDED = librseq.so
 
@@ -59,3 +59,6 @@ $(OUTPUT)/param_test_mm_cid_compare_twice: param_test.c $(TEST_GEN_PROGS_EXTENDE
 $(OUTPUT)/syscall_errors_test: syscall_errors_test.c $(TEST_GEN_PROGS_EXTENDED) \
 					rseq.h rseq-*.h
 	$(CC) $(CFLAGS) $< $(LDLIBS) -lrseq -o $@
+
+$(OUTPUT)/slice_test: slice_test.c $(TEST_GEN_PROGS_EXTENDED) rseq.h rseq-*.h
+	$(CC) $(CFLAGS) $< $(LDLIBS) -lrseq -o $@
diff --git a/tools/testing/selftests/rseq/rseq-abi.h b/tools/testing/selftests/rseq/rseq-abi.h
index fb4ec8a75dd4..ecef315204b2 100644
--- a/tools/testing/selftests/rseq/rseq-abi.h
+++ b/tools/testing/selftests/rseq/rseq-abi.h
@@ -53,6 +53,27 @@ struct rseq_abi_cs {
 	__u64 abort_ip;
 } __attribute__((aligned(4 * sizeof(__u64))));
 
+/**
+ * rseq_abi_slice_ctrl - Time slice extension control structure
+ * @all:	Compound value
+ * @request:	Request for a time slice extension
+ * @granted:	Granted time slice extension
+ *
+ * @request is set by user space and can be cleared by user space or kernel
+ * space.  @granted is set and cleared by the kernel and must only be read
+ * by user space.
+ */
+struct rseq_abi_slice_ctrl {
+	union {
+		__u32		all;
+		struct {
+			__u8	request;
+			__u8	granted;
+			__u16	__reserved;
+		};
+	};
+};
+
 /*
  * struct rseq_abi is aligned on 4 * 8 bytes to ensure it is always
  * contained within a single cache-line.
@@ -164,6 +185,12 @@ struct rseq_abi {
 	 */
 	__u32 mm_cid;
 
+	/*
+	 * Time slice extension control structure. CPU local updates from
+	 * kernel and user space.
+	 */
+	struct rseq_abi_slice_ctrl slice_ctrl;
+
 	/*
 	 * Flexible array member at end of structure, after last feature field.
 	 */
diff --git a/tools/testing/selftests/rseq/slice_test.c b/tools/testing/selftests/rseq/slice_test.c
new file mode 100644
index 000000000000..357122dcb487
--- /dev/null
+++ b/tools/testing/selftests/rseq/slice_test.c
@@ -0,0 +1,219 @@
+// SPDX-License-Identifier: LGPL-2.1
+#define _GNU_SOURCE
+#include <assert.h>
+#include <pthread.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <string.h>
+#include <syscall.h>
+#include <unistd.h>
+
+#include <linux/prctl.h>
+#include <sys/prctl.h>
+#include <sys/time.h>
+
+#include "rseq.h"
+
+#include "../kselftest_harness.h"
+
+#ifndef __NR_rseq_slice_yield
+# define __NR_rseq_slice_yield	471
+#endif
+
+#define BITS_PER_INT	32
+#define BITS_PER_BYTE	8
+
+#ifndef PR_RSEQ_SLICE_EXTENSION
+# define PR_RSEQ_SLICE_EXTENSION		79
+#  define PR_RSEQ_SLICE_EXTENSION_GET		1
+#  define PR_RSEQ_SLICE_EXTENSION_SET		2
+#  define PR_RSEQ_SLICE_EXT_ENABLE		0x01
+#endif
+
+#ifndef RSEQ_SLICE_EXT_REQUEST_BIT
+# define RSEQ_SLICE_EXT_REQUEST_BIT	0
+# define RSEQ_SLICE_EXT_GRANTED_BIT	1
+#endif
+
+#ifndef asm_inline
+# define asm_inline	asm __inline
+#endif
+
+#define NSEC_PER_SEC	1000000000L
+#define NSEC_PER_USEC	      1000L
+
+struct noise_params {
+	int64_t	noise_nsecs;
+	int64_t	sleep_nsecs;
+	int64_t	run;
+};
+
+FIXTURE(slice_ext)
+{
+	pthread_t		noise_thread;
+	struct noise_params	noise_params;
+};
+
+FIXTURE_VARIANT(slice_ext)
+{
+	int64_t	total_nsecs;
+	int64_t	slice_nsecs;
+	int64_t	noise_nsecs;
+	int64_t	sleep_nsecs;
+	bool	no_yield;
+};
+
+FIXTURE_VARIANT_ADD(slice_ext, n2_2_50)
+{
+	.total_nsecs	=  5LL * NSEC_PER_SEC,
+	.slice_nsecs	=  2LL * NSEC_PER_USEC,
+	.noise_nsecs    =  2LL * NSEC_PER_USEC,
+	.sleep_nsecs	= 50LL * NSEC_PER_USEC,
+};
+
+FIXTURE_VARIANT_ADD(slice_ext, n50_2_50)
+{
+	.total_nsecs	=  5LL * NSEC_PER_SEC,
+	.slice_nsecs	= 50LL * NSEC_PER_USEC,
+	.noise_nsecs    =  2LL * NSEC_PER_USEC,
+	.sleep_nsecs	= 50LL * NSEC_PER_USEC,
+};
+
+FIXTURE_VARIANT_ADD(slice_ext, n2_2_50_no_yield)
+{
+	.total_nsecs	=  5LL * NSEC_PER_SEC,
+	.slice_nsecs	=  2LL * NSEC_PER_USEC,
+	.noise_nsecs    =  2LL * NSEC_PER_USEC,
+	.sleep_nsecs	= 50LL * NSEC_PER_USEC,
+	.no_yield	= true,
+};
+
+
+static inline bool elapsed(struct timespec *start, struct timespec *now,
+			   int64_t span)
+{
+	int64_t delta = now->tv_sec - start->tv_sec;
+
+	delta *= NSEC_PER_SEC;
+	delta += now->tv_nsec - start->tv_nsec;
+	return delta >= span;
+}
+
+static void *noise_thread(void *arg)
+{
+	struct noise_params *p = arg;
+
+	while (RSEQ_READ_ONCE(p->run)) {
+		struct timespec ts_start, ts_now;
+
+		clock_gettime(CLOCK_MONOTONIC, &ts_start);
+		do {
+			clock_gettime(CLOCK_MONOTONIC, &ts_now);
+		} while (!elapsed(&ts_start, &ts_now, p->noise_nsecs));
+
+		ts_start.tv_sec = 0;
+		ts_start.tv_nsec = p->sleep_nsecs;
+		clock_nanosleep(CLOCK_MONOTONIC, 0, &ts_start, NULL);
+	}
+	return NULL;
+}
+
+FIXTURE_SETUP(slice_ext)
+{
+	cpu_set_t affinity;
+
+	ASSERT_EQ(sched_getaffinity(0, sizeof(affinity), &affinity), 0);
+
+	/* Pin it on a single CPU. Avoid CPU 0 */
+	for (int i = 1; i < CPU_SETSIZE; i++) {
+		if (!CPU_ISSET(i, &affinity))
+			continue;
+
+		CPU_ZERO(&affinity);
+		CPU_SET(i, &affinity);
+		ASSERT_EQ(sched_setaffinity(0, sizeof(affinity), &affinity), 0);
+		break;
+	}
+
+	ASSERT_EQ(rseq_register_current_thread(), 0);
+
+	ASSERT_EQ(prctl(PR_RSEQ_SLICE_EXTENSION, PR_RSEQ_SLICE_EXTENSION_SET,
+			PR_RSEQ_SLICE_EXT_ENABLE, 0, 0), 0);
+
+	self->noise_params.noise_nsecs = variant->noise_nsecs;
+	self->noise_params.sleep_nsecs = variant->sleep_nsecs;
+	self->noise_params.run = 1;
+
+	ASSERT_EQ(pthread_create(&self->noise_thread, NULL, noise_thread, &self->noise_params), 0);
+}
+
+FIXTURE_TEARDOWN(slice_ext)
+{
+	self->noise_params.run = 0;
+	pthread_join(self->noise_thread, NULL);
+}
+
+TEST_F(slice_ext, slice_test)
+{
+	unsigned long success = 0, yielded = 0, scheduled = 0, raced = 0;
+	unsigned long total = 0, aborted = 0;
+	struct rseq_abi *rs = rseq_get_abi();
+	struct timespec ts_start, ts_now;
+
+	ASSERT_NE(rs, NULL);
+
+	clock_gettime(CLOCK_MONOTONIC, &ts_start);
+	do {
+		struct timespec ts_cs;
+		bool req = false;
+
+		clock_gettime(CLOCK_MONOTONIC, &ts_cs);
+
+		total++;
+		RSEQ_WRITE_ONCE(rs->slice_ctrl.request, 1);
+		do {
+			clock_gettime(CLOCK_MONOTONIC, &ts_now);
+		} while (!elapsed(&ts_cs, &ts_now, variant->slice_nsecs));
+
+		/*
+		 * request can be cleared unconditionally, but for making
+		 * the stats work this is actually checking it first
+		 */
+		if (RSEQ_READ_ONCE(rs->slice_ctrl.request)) {
+			RSEQ_WRITE_ONCE(rs->slice_ctrl.request, 0);
+			/* Race between check and clear! */
+			req = true;
+			success++;
+		}
+
+		if (RSEQ_READ_ONCE(rs->slice_ctrl.granted)) {
+			/* The above raced against a late grant */
+			if (req)
+				success--;
+			if (variant->no_yield) {
+				syscall(__NR_getpid);
+				aborted++;
+			} else {
+				yielded++;
+				if (!syscall(__NR_rseq_slice_yield))
+					raced++;
+			}
+		} else {
+			if (!req)
+				scheduled++;
+		}
+
+		clock_gettime(CLOCK_MONOTONIC, &ts_now);
+	} while (!elapsed(&ts_start, &ts_now, variant->total_nsecs));
+
+	printf("# Total     %12ld\n", total);
+	printf("# Success   %12ld\n", success);
+	printf("# Yielded   %12ld\n", yielded);
+	printf("# Aborted   %12ld\n", aborted);
+	printf("# Scheduled %12ld\n", scheduled);
+	printf("# Raced     %12ld\n", raced);
+}
+
+TEST_HARNESS_MAIN
-- 
cgit v1.2.3


From bb332a9e5a057d2cb9b90e307b26cce9b1f6f660 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 21 Jan 2026 15:10:29 +0100
Subject: selftests/rseq: Add rseq slice histogram script

A script that processes trace-cmd data and generates a histogram of
rseq slice_ext durations for the recorded workload.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20260121143208.340549136@infradead.org
---
 Documentation/userspace-api/rseq.rst            |   3 +
 tools/testing/selftests/rseq/rseq-slice-hist.py | 132 ++++++++++++++++++++++++
 2 files changed, 135 insertions(+)
 create mode 100644 tools/testing/selftests/rseq/rseq-slice-hist.py

(limited to 'tools')

diff --git a/Documentation/userspace-api/rseq.rst b/Documentation/userspace-api/rseq.rst
index 468f6bbe0e25..3cd27a3c7c7e 100644
--- a/Documentation/userspace-api/rseq.rst
+++ b/Documentation/userspace-api/rseq.rst
@@ -83,6 +83,9 @@ determined by debugfs:rseq/slice_ext_nsec. The default value is 5 usec; which
 is the minimum value. It can be incremented to 50 usecs, however doing so
 can/will affect the minimum scheduling latency.
 
+Any proposed changes to this default will have to come with a selftest and
+rseq-slice-hist.py output that shows the new value has merrit.
+
 The kernel indicates the grant by clearing rseq::slice_ctrl::request and
 setting rseq::slice_ctrl::granted to 1. If there is a reschedule of the
 thread after granting the extension, the kernel clears the granted bit to
diff --git a/tools/testing/selftests/rseq/rseq-slice-hist.py b/tools/testing/selftests/rseq/rseq-slice-hist.py
new file mode 100644
index 000000000000..b7933eeaefb9
--- /dev/null
+++ b/tools/testing/selftests/rseq/rseq-slice-hist.py
@@ -0,0 +1,132 @@
+#!/usr/bin/python3
+
+#
+# trace-cmd record -e hrtimer_start -e hrtimer_cancel -e hrtimer_expire_entry -- $cmd
+#
+
+from tracecmd import *
+
+def load_kallsyms(file_path='/proc/kallsyms'):
+    """
+    Parses /proc/kallsyms into a dictionary.
+    Returns: { address_int: symbol_name }
+    """
+    kallsyms_map = {}
+
+    try:
+        with open(file_path, 'r') as f:
+            for line in f:
+                # The format is: [address] [type] [name] [module]
+                parts = line.split()
+                if len(parts) < 3:
+                    continue
+
+                addr = int(parts[0], 16)
+                name = parts[2]
+
+                kallsyms_map[addr] = name
+
+    except PermissionError:
+        print(f"Error: Permission denied reading {file_path}. Try running with sudo.")
+    except FileNotFoundError:
+        print(f"Error: {file_path} not found.")
+
+    return kallsyms_map
+
+ksyms = load_kallsyms()
+
+# pending[timer_ptr] = {'ts': timestamp, 'comm': comm}
+pending = {}
+
+# histograms[comm][bucket] = count
+histograms = {}
+
+class OnlineHarmonicMean:
+    def __init__(self):
+        self.n = 0          # Count of elements
+        self.S = 0.0        # Cumulative sum of reciprocals
+
+    def update(self, x):
+        if x == 0:
+            raise ValueError("Harmonic mean is undefined for zero.")
+
+        self.n += 1
+        self.S += 1.0 / x
+        return self.n / self.S
+
+    @property
+    def mean(self):
+        return self.n / self.S if self.n > 0 else 0
+
+ohms = {}
+
+def handle_start(record):
+    func_name = ksyms[record.num_field("function")]
+    if "rseq_slice_expired" in func_name:
+        timer_ptr = record.num_field("hrtimer")
+        pending[timer_ptr] = {
+            'ts': record.ts,
+            'comm': record.comm
+        }
+    return None
+
+def handle_cancel(record):
+    timer_ptr = record.num_field("hrtimer")
+
+    if timer_ptr in pending:
+        start_data = pending.pop(timer_ptr)
+        duration_ns = record.ts - start_data['ts']
+        duration_us = duration_ns // 1000
+
+        comm = start_data['comm']
+
+        if comm not in ohms:
+            ohms[comm] = OnlineHarmonicMean()
+
+        ohms[comm].update(duration_ns)
+
+        if comm not in histograms:
+            histograms[comm] = {}
+
+        histograms[comm][duration_us] = histograms[comm].get(duration_us, 0) + 1
+    return None
+
+def handle_expire(record):
+    timer_ptr = record.num_field("hrtimer")
+
+    if timer_ptr in pending:
+        start_data = pending.pop(timer_ptr)
+        comm = start_data['comm']
+
+        if comm not in histograms:
+            histograms[comm] = {}
+
+        # Record -1 bucket for expired (failed to cancel)
+        histograms[comm][-1] = histograms[comm].get(-1, 0) + 1
+    return None
+
+if __name__ == "__main__":
+    t = Trace("trace.dat")
+    for cpu in range(0, t.cpus):
+        ev = t.read_event(cpu)
+        while ev:
+            if "hrtimer_start" in ev.name:
+                handle_start(ev)
+            if "hrtimer_cancel" in ev.name:
+                handle_cancel(ev)
+            if "hrtimer_expire_entry" in ev.name:
+                handle_expire(ev)
+
+            ev = t.read_event(cpu)
+
+    print("\n" + "="*40)
+    print("RSEQ SLICE HISTOGRAM (us)")
+    print("="*40)
+    for comm, buckets in histograms.items():
+        print(f"\nTask: {comm}    Mean: {ohms[comm].mean:.3f} ns")
+        print(f"  {'Latency (us)':<15} | {'Count'}")
+        print(f"  {'-'*30}")
+        # Sort buckets numerically, putting -1 at the top
+        for bucket in sorted(buckets.keys()):
+            label = "EXPIRED" if bucket == -1 else f"{bucket} us"
+            print(f"  {label:<15} | {buckets[bucket]}")
-- 
cgit v1.2.3


From 2a369c4942489aeab799a7509b7cc721eecafa8a Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <linux@weissschuh.net>
Date: Sat, 17 Jan 2026 13:10:51 +0100
Subject: kselftest/arm64: Use syscall() macro over nolibc my_syscall()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The my_syscall*() macros are internal implementation details of nolibc.
Nolibc also provides the regular syscall(2), which is also a macro
and directly expands to the correct my_syscall().

Use syscall() instead.

As a side-effect this fixes some return value checks, as my_syscall()
returns the raw value as set by the kernel and does not set errno.

Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
Acked-by: Mark Brown <broonie@kernel.org>
Signed-off-by: Will Deacon <will@kernel.org>
---
 tools/testing/selftests/arm64/abi/tpidr2.c    |  3 +-
 tools/testing/selftests/arm64/gcs/basic-gcs.c | 40 +++++++++++----------------
 2 files changed, 17 insertions(+), 26 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/arm64/abi/tpidr2.c b/tools/testing/selftests/arm64/abi/tpidr2.c
index 1703543fb7c7..ce4550fb7224 100644
--- a/tools/testing/selftests/arm64/abi/tpidr2.c
+++ b/tools/testing/selftests/arm64/abi/tpidr2.c
@@ -128,8 +128,7 @@ static int sys_clone(unsigned long clone_flags, unsigned long newsp,
 		     int *parent_tidptr, unsigned long tls,
 		     int *child_tidptr)
 {
-	return my_syscall5(__NR_clone, clone_flags, newsp, parent_tidptr, tls,
-			   child_tidptr);
+	return syscall(__NR_clone, clone_flags, newsp, parent_tidptr, tls, child_tidptr);
 }
 
 #define __STACK_SIZE (8 * 1024 * 1024)
diff --git a/tools/testing/selftests/arm64/gcs/basic-gcs.c b/tools/testing/selftests/arm64/gcs/basic-gcs.c
index 250977abc398..ae4cce6afe2b 100644
--- a/tools/testing/selftests/arm64/gcs/basic-gcs.c
+++ b/tools/testing/selftests/arm64/gcs/basic-gcs.c
@@ -22,7 +22,7 @@ static size_t page_size = 65536;
 static  __attribute__((noinline)) void valid_gcs_function(void)
 {
 	/* Do something the compiler can't optimise out */
-	my_syscall1(__NR_prctl, PR_SVE_GET_VL);
+	syscall(__NR_prctl, PR_SVE_GET_VL);
 }
 
 static inline int gcs_set_status(unsigned long mode)
@@ -36,12 +36,10 @@ static inline int gcs_set_status(unsigned long mode)
 	 * other 3 values passed in registers to the syscall are zero
 	 * since the kernel validates them.
 	 */
-	ret = my_syscall5(__NR_prctl, PR_SET_SHADOW_STACK_STATUS, mode,
-			  0, 0, 0);
+	ret = syscall(__NR_prctl, PR_SET_SHADOW_STACK_STATUS, mode, 0, 0, 0);
 
 	if (ret == 0) {
-		ret = my_syscall5(__NR_prctl, PR_GET_SHADOW_STACK_STATUS,
-				  &new_mode, 0, 0, 0);
+		ret = syscall(__NR_prctl, PR_GET_SHADOW_STACK_STATUS, &new_mode, 0, 0, 0);
 		if (ret == 0) {
 			if (new_mode != mode) {
 				ksft_print_msg("Mode set to %lx not %lx\n",
@@ -49,7 +47,7 @@ static inline int gcs_set_status(unsigned long mode)
 				ret = -EINVAL;
 			}
 		} else {
-			ksft_print_msg("Failed to validate mode: %d\n", ret);
+			ksft_print_msg("Failed to validate mode: %d\n", errno);
 		}
 
 		if (enabling != chkfeat_gcs()) {
@@ -69,10 +67,9 @@ static bool read_status(void)
 	unsigned long state;
 	int ret;
 
-	ret = my_syscall5(__NR_prctl, PR_GET_SHADOW_STACK_STATUS,
-			  &state, 0, 0, 0);
+	ret = syscall(__NR_prctl, PR_GET_SHADOW_STACK_STATUS, &state, 0, 0, 0);
 	if (ret != 0) {
-		ksft_print_msg("Failed to read state: %d\n", ret);
+		ksft_print_msg("Failed to read state: %d\n", errno);
 		return false;
 	}
 
@@ -188,9 +185,8 @@ static bool map_guarded_stack(void)
 	int elem;
 	bool pass = true;
 
-	buf = (void *)my_syscall3(__NR_map_shadow_stack, 0, page_size,
-				  SHADOW_STACK_SET_MARKER |
-				  SHADOW_STACK_SET_TOKEN);
+	buf = (void *)syscall(__NR_map_shadow_stack, 0, page_size,
+			      SHADOW_STACK_SET_MARKER | SHADOW_STACK_SET_TOKEN);
 	if (buf == MAP_FAILED) {
 		ksft_print_msg("Failed to map %lu byte GCS: %d\n",
 			       page_size, errno);
@@ -257,8 +253,7 @@ static bool test_fork(void)
 		valid_gcs_function();
 		get_gcspr();
 
-		ret = my_syscall5(__NR_prctl, PR_GET_SHADOW_STACK_STATUS,
-				  &child_mode, 0, 0, 0);
+		ret = syscall(__NR_prctl, PR_GET_SHADOW_STACK_STATUS, &child_mode, 0, 0, 0);
 		if (ret == 0 && !(child_mode & PR_SHADOW_STACK_ENABLE)) {
 			ksft_print_msg("GCS not enabled in child\n");
 			ret = -EINVAL;
@@ -321,8 +316,7 @@ static bool test_vfork(void)
 		valid_gcs_function();
 		get_gcspr();
 
-		ret = my_syscall5(__NR_prctl, PR_GET_SHADOW_STACK_STATUS,
-				  &child_mode, 0, 0, 0);
+		ret = syscall(__NR_prctl, PR_GET_SHADOW_STACK_STATUS, &child_mode, 0, 0, 0);
 		if (ret == 0 && !(child_mode & PR_SHADOW_STACK_ENABLE)) {
 			ksft_print_msg("GCS not enabled in child\n");
 			ret = EXIT_FAILURE;
@@ -390,17 +384,15 @@ int main(void)
 	if (!(getauxval(AT_HWCAP) & HWCAP_GCS))
 		ksft_exit_skip("SKIP GCS not supported\n");
 
-	ret = my_syscall5(__NR_prctl, PR_GET_SHADOW_STACK_STATUS,
-			  &gcs_mode, 0, 0, 0);
+	ret = syscall(__NR_prctl, PR_GET_SHADOW_STACK_STATUS, &gcs_mode, 0, 0, 0);
 	if (ret != 0)
-		ksft_exit_fail_msg("Failed to read GCS state: %d\n", ret);
+		ksft_exit_fail_msg("Failed to read GCS state: %d\n", errno);
 
 	if (!(gcs_mode & PR_SHADOW_STACK_ENABLE)) {
 		gcs_mode = PR_SHADOW_STACK_ENABLE;
-		ret = my_syscall5(__NR_prctl, PR_SET_SHADOW_STACK_STATUS,
-				  gcs_mode, 0, 0, 0);
+		ret = syscall(__NR_prctl, PR_SET_SHADOW_STACK_STATUS, gcs_mode, 0, 0, 0);
 		if (ret != 0)
-			ksft_exit_fail_msg("Failed to enable GCS: %d\n", ret);
+			ksft_exit_fail_msg("Failed to enable GCS: %d\n", errno);
 	}
 
 	ksft_set_plan(ARRAY_SIZE(tests));
@@ -410,9 +402,9 @@ int main(void)
 	}
 
 	/* One last test: disable GCS, we can do this one time */
-	ret = my_syscall5(__NR_prctl, PR_SET_SHADOW_STACK_STATUS, 0, 0, 0, 0);
+	ret = syscall(__NR_prctl, PR_SET_SHADOW_STACK_STATUS, 0, 0, 0, 0);
 	if (ret != 0)
-		ksft_print_msg("Failed to disable GCS: %d\n", ret);
+		ksft_print_msg("Failed to disable GCS: %d\n", errno);
 
 	ksft_finished();
 
-- 
cgit v1.2.3


From 57a96356bb6942e16283138d0a42baad29169ed8 Mon Sep 17 00:00:00 2001
From: Yicong Yang <yangyicong@hisilicon.com>
Date: Mon, 19 Jan 2026 10:29:28 +0800
Subject: kselftest/arm64: Add HWCAP test for FEAT_LS64

Add tests for FEAT_LS64. Issue related instructions if feature
presents, no SIGILL should be received. When such instructions
operate on Device memory or non-cacheable memory, we may received
a SIGBUS during the test (w/o FEAT_LS64WB). Just ignore it since
we only tested whether the instruction itself can be issued as
expected on platforms declaring the support of such features.

Acked-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Oliver Upton <oupton@kernel.org>
Signed-off-by: Yicong Yang <yangyicong@hisilicon.com>
Signed-off-by: Zhou Wang <wangzhou1@hisilicon.com>
Signed-off-by: Will Deacon <will@kernel.org>
---
 tools/testing/selftests/arm64/abi/hwcap.c | 49 +++++++++++++++++++++++++++++++
 1 file changed, 49 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/arm64/abi/hwcap.c b/tools/testing/selftests/arm64/abi/hwcap.c
index c41640f18e4e..9d2df1f3e6bb 100644
--- a/tools/testing/selftests/arm64/abi/hwcap.c
+++ b/tools/testing/selftests/arm64/abi/hwcap.c
@@ -11,6 +11,8 @@
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
+#include <linux/auxvec.h>
+#include <linux/compiler.h>
 #include <sys/auxv.h>
 #include <sys/prctl.h>
 #include <asm/hwcap.h>
@@ -595,6 +597,45 @@ static void lrcpc3_sigill(void)
 	              : "=r" (data0), "=r" (data1) : "r" (src) :);
 }
 
+static void ignore_signal(int sig, siginfo_t *info, void *context)
+{
+	ucontext_t *uc = context;
+
+	uc->uc_mcontext.pc += 4;
+}
+
+static void ls64_sigill(void)
+{
+	struct sigaction ign, old;
+	char src[64] __aligned(64) = { 1 };
+
+	/*
+	 * LS64 requires target memory to be Device/Non-cacheable (if
+	 * FEAT_LS64WB not supported) and the completer supports these
+	 * instructions, otherwise we'll receive a SIGBUS. Since we are only
+	 * testing the ABI here, so just ignore the SIGBUS and see if we can
+	 * execute the instructions without receiving a SIGILL. Restore the
+	 * handler of SIGBUS after this test.
+	 */
+	ign.sa_sigaction = ignore_signal;
+	ign.sa_flags = SA_SIGINFO | SA_RESTART;
+	sigemptyset(&ign.sa_mask);
+	sigaction(SIGBUS, &ign, &old);
+
+	register void *xn asm ("x8") = src;
+	register u64 xt_1 asm ("x0");
+
+	/* LD64B x0, [x8] */
+	asm volatile(".inst 0xf83fd100" : "=r" (xt_1) : "r" (xn)
+		     : "x1", "x2", "x3", "x4", "x5", "x6", "x7");
+
+	/* ST64B x0, [x8] */
+	asm volatile(".inst 0xf83f9100" : : "r" (xt_1), "r" (xn)
+		     : "x1", "x2", "x3", "x4", "x5", "x6", "x7");
+
+	sigaction(SIGBUS, &old, NULL);
+}
+
 static const struct hwcap_data {
 	const char *name;
 	unsigned long at_hwcap;
@@ -1134,6 +1175,14 @@ static const struct hwcap_data {
 		.hwcap_bit = HWCAP3_MTE_STORE_ONLY,
 		.cpuinfo = "mtestoreonly",
 	},
+	{
+		.name = "LS64",
+		.at_hwcap = AT_HWCAP3,
+		.hwcap_bit = HWCAP3_LS64,
+		.cpuinfo = "ls64",
+		.sigill_fn = ls64_sigill,
+		.sigill_reliable = true,
+	},
 };
 
 typedef void (*sighandler_fn)(int, siginfo_t *, void *);
-- 
cgit v1.2.3


From 0a98de80136968bab7db37b16282b37f044694d3 Mon Sep 17 00:00:00 2001
From: Stefano Garzarella <sgarzare@redhat.com>
Date: Wed, 21 Jan 2026 10:36:26 +0100
Subject: vsock/test: fix seqpacket message bounds test

The test requires the sender (client) to send all messages before waking
up the receiver (server).
Since virtio-vsock had a bug and did not respect the size of the TX
buffer, this test worked, but now that we are going to fix the bug, the
test hangs because the sender would fill the TX buffer before waking up
the receiver.

Set the buffer size in the sender (client) as well, as we already do for
the receiver (server).

Fixes: 5c338112e48a ("test/vsock: rework message bounds test")
Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
Link: https://patch.msgid.link/20260121093628.9941-3-sgarzare@redhat.com
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 tools/testing/vsock/vsock_test.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c
index 27e39354499a..668fbe9eb3cc 100644
--- a/tools/testing/vsock/vsock_test.c
+++ b/tools/testing/vsock/vsock_test.c
@@ -351,6 +351,7 @@ static void test_stream_msg_peek_server(const struct test_opts *opts)
 
 static void test_seqpacket_msg_bounds_client(const struct test_opts *opts)
 {
+	unsigned long long sock_buf_size;
 	unsigned long curr_hash;
 	size_t max_msg_size;
 	int page_size;
@@ -363,6 +364,16 @@ static void test_seqpacket_msg_bounds_client(const struct test_opts *opts)
 		exit(EXIT_FAILURE);
 	}
 
+	sock_buf_size = SOCK_BUF_SIZE;
+
+	setsockopt_ull_check(fd, AF_VSOCK, SO_VM_SOCKETS_BUFFER_MAX_SIZE,
+			     sock_buf_size,
+			     "setsockopt(SO_VM_SOCKETS_BUFFER_MAX_SIZE)");
+
+	setsockopt_ull_check(fd, AF_VSOCK, SO_VM_SOCKETS_BUFFER_SIZE,
+			     sock_buf_size,
+			     "setsockopt(SO_VM_SOCKETS_BUFFER_SIZE)");
+
 	/* Wait, until receiver sets buffer size. */
 	control_expectln("SRVREADY");
 
-- 
cgit v1.2.3


From 2a689f76edd04a53137bd01d4618343f4cdd7e23 Mon Sep 17 00:00:00 2001
From: Melbin K Mathew <mlbnkm1@gmail.com>
Date: Wed, 21 Jan 2026 10:36:28 +0100
Subject: vsock/test: add stream TX credit bounds test

Add a regression test for the TX credit bounds fix. The test verifies
that a sender with a small local buffer size cannot queue excessive
data even when the peer advertises a large receive buffer.

The client:
  - Sets a small buffer size (64 KiB)
  - Connects to server (which advertises 2 MiB buffer)
  - Sends in non-blocking mode until EAGAIN
  - Verifies total queued data is bounded

This guards against the original vulnerability where a remote peer
could cause unbounded kernel memory allocation by advertising a large
buffer and reading slowly.

Suggested-by: Stefano Garzarella <sgarzare@redhat.com>
Signed-off-by: Melbin K Mathew <mlbnkm1@gmail.com>
[Stefano: use sock_buf_size to check the bytes sent + small fixes]
Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
Link: https://patch.msgid.link/20260121093628.9941-5-sgarzare@redhat.com
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 tools/testing/vsock/vsock_test.c | 101 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 101 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c
index 668fbe9eb3cc..5bd20ccd9335 100644
--- a/tools/testing/vsock/vsock_test.c
+++ b/tools/testing/vsock/vsock_test.c
@@ -347,6 +347,7 @@ static void test_stream_msg_peek_server(const struct test_opts *opts)
 }
 
 #define SOCK_BUF_SIZE (2 * 1024 * 1024)
+#define SOCK_BUF_SIZE_SMALL (64 * 1024)
 #define MAX_MSG_PAGES 4
 
 static void test_seqpacket_msg_bounds_client(const struct test_opts *opts)
@@ -2230,6 +2231,101 @@ static void test_stream_accepted_setsockopt_server(const struct test_opts *opts)
 	close(fd);
 }
 
+static void test_stream_tx_credit_bounds_client(const struct test_opts *opts)
+{
+	unsigned long long sock_buf_size;
+	size_t total = 0;
+	char buf[4096];
+	int fd;
+
+	memset(buf, 'A', sizeof(buf));
+
+	fd = vsock_stream_connect(opts->peer_cid, opts->peer_port);
+	if (fd < 0) {
+		perror("connect");
+		exit(EXIT_FAILURE);
+	}
+
+	sock_buf_size = SOCK_BUF_SIZE_SMALL;
+
+	setsockopt_ull_check(fd, AF_VSOCK, SO_VM_SOCKETS_BUFFER_MAX_SIZE,
+			     sock_buf_size,
+			     "setsockopt(SO_VM_SOCKETS_BUFFER_MAX_SIZE)");
+
+	setsockopt_ull_check(fd, AF_VSOCK, SO_VM_SOCKETS_BUFFER_SIZE,
+			     sock_buf_size,
+			     "setsockopt(SO_VM_SOCKETS_BUFFER_SIZE)");
+
+	if (fcntl(fd, F_SETFL, fcntl(fd, F_GETFL, 0) | O_NONBLOCK) < 0) {
+		perror("fcntl(F_SETFL)");
+		exit(EXIT_FAILURE);
+	}
+
+	control_expectln("SRVREADY");
+
+	for (;;) {
+		ssize_t sent = send(fd, buf, sizeof(buf), 0);
+
+		if (sent == 0) {
+			fprintf(stderr, "unexpected EOF while sending bytes\n");
+			exit(EXIT_FAILURE);
+		}
+
+		if (sent < 0) {
+			if (errno == EINTR)
+				continue;
+
+			if (errno == EAGAIN || errno == EWOULDBLOCK)
+				break;
+
+			perror("send");
+			exit(EXIT_FAILURE);
+		}
+
+		total += sent;
+	}
+
+	control_writeln("CLIDONE");
+	close(fd);
+
+	/* We should not be able to send more bytes than the value set as
+	 * local buffer size.
+	 */
+	if (total > sock_buf_size) {
+		fprintf(stderr,
+			"TX credit too large: queued %zu bytes (expected <= %llu)\n",
+			total, sock_buf_size);
+		exit(EXIT_FAILURE);
+	}
+}
+
+static void test_stream_tx_credit_bounds_server(const struct test_opts *opts)
+{
+	unsigned long long sock_buf_size;
+	int fd;
+
+	fd = vsock_stream_accept(VMADDR_CID_ANY, opts->peer_port, NULL);
+	if (fd < 0) {
+		perror("accept");
+		exit(EXIT_FAILURE);
+	}
+
+	sock_buf_size = SOCK_BUF_SIZE;
+
+	setsockopt_ull_check(fd, AF_VSOCK, SO_VM_SOCKETS_BUFFER_MAX_SIZE,
+			     sock_buf_size,
+			     "setsockopt(SO_VM_SOCKETS_BUFFER_MAX_SIZE)");
+
+	setsockopt_ull_check(fd, AF_VSOCK, SO_VM_SOCKETS_BUFFER_SIZE,
+			     sock_buf_size,
+			     "setsockopt(SO_VM_SOCKETS_BUFFER_SIZE)");
+
+	control_writeln("SRVREADY");
+	control_expectln("CLIDONE");
+
+	close(fd);
+}
+
 static struct test_case test_cases[] = {
 	{
 		.name = "SOCK_STREAM connection reset",
@@ -2419,6 +2515,11 @@ static struct test_case test_cases[] = {
 		.run_client = test_stream_msgzcopy_mangle_client,
 		.run_server = test_stream_msgzcopy_mangle_server,
 	},
+	{
+		.name = "SOCK_STREAM TX credit bounds",
+		.run_client = test_stream_tx_credit_bounds_client,
+		.run_server = test_stream_tx_credit_bounds_server,
+	},
 	{},
 };
 
-- 
cgit v1.2.3


From c3030995f23b3d35f94b9bc4375706ec5916fd55 Mon Sep 17 00:00:00 2001
From: Swapnil Sapkal <swapnil.sapkal@amd.com>
Date: Mon, 19 Jan 2026 17:58:25 +0000
Subject: perf sched stats: Add record and rawdump support

Define new, perf tool only, sample types and their layouts. Add logic
to parse /proc/schedstat, convert it to perf sample format and save
samples to perf.data file with `perf sched stats record` command.

Also add logic to read perf.data file, interpret schedstat samples and
print rawdump of samples with `perf script -D`.

Note that, /proc/schedstat file output is standardized with version
number. The patch supports v15 but older or newer version can be added
easily.

Co-developed-by: Ravi Bangoria <ravi.bangoria@amd.com>
Signed-off-by: Ravi Bangoria <ravi.bangoria@amd.com>
Signed-off-by: Swapnil Sapkal <swapnil.sapkal@amd.com>
Tested-by: Chen Yu <yu.c.chen@intel.com>
Tested-by: James Clark <james.clark@linaro.org>
Acked-by: Ian Rogers <irogers@google.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Anubhav Shelat <ashelat@redhat.com>
Cc: Ben Gainey <ben.gainey@arm.com>
Cc: Blake Jones <blakejones@google.com>
Cc: Chun-Tse Shao <ctshao@google.com>
Cc: David Vernet <void@manifault.com>
Cc: Dmitriy Vyukov <dvyukov@google.com>
Cc: Dr. David Alan Gilbert <linux@treblig.org>
Cc: Gautham Shenoy <gautham.shenoy@amd.com>
Cc: Graham Woodward <graham.woodward@arm.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@arm.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: K Prateek Nayak <kprateek.nayak@amd.com>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Leo Yan <leo.yan@arm.com>
Cc: Madadi Vineeth Reddy <vineethr@linux.ibm.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Sandipan Das <sandipan.das@amd.com>
Cc: Santosh Shukla <santosh.shukla@amd.com>
Cc: Shrikanth Hegde <sshegde@linux.ibm.com>
Cc: Steven Rostedt (VMware) <rostedt@goodmis.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Thomas Falcon <thomas.falcon@intel.com>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Yang Jihong <yangjihong@bytedance.com>
Cc: Yujie Liu <yujie.liu@intel.com>
Cc: Zhongqiu Han <quic_zhonhan@quicinc.com>
[ PRIu64 needs uint64_t, not 'unsigned long' to work on both 32-bit and 64-bit ]
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/lib/perf/Documentation/libperf.txt    |   2 +
 tools/lib/perf/Makefile                     |   1 +
 tools/lib/perf/include/perf/event.h         |  41 +++++
 tools/lib/perf/include/perf/schedstat-v15.h | 146 ++++++++++++++++++
 tools/perf/builtin-inject.c                 |   2 +
 tools/perf/builtin-sched.c                  | 222 +++++++++++++++++++++++++++-
 tools/perf/util/event.c                     |  40 +++++
 tools/perf/util/event.h                     |   2 +
 tools/perf/util/session.c                   |  22 +++
 tools/perf/util/synthetic-events.c          | 179 ++++++++++++++++++++++
 tools/perf/util/synthetic-events.h          |   3 +
 tools/perf/util/tool.c                      |  20 +++
 tools/perf/util/tool.h                      |   4 +-
 13 files changed, 682 insertions(+), 2 deletions(-)
 create mode 100644 tools/lib/perf/include/perf/schedstat-v15.h

(limited to 'tools')

diff --git a/tools/lib/perf/Documentation/libperf.txt b/tools/lib/perf/Documentation/libperf.txt
index 4072bc9b7670..576ecc5fc312 100644
--- a/tools/lib/perf/Documentation/libperf.txt
+++ b/tools/lib/perf/Documentation/libperf.txt
@@ -211,6 +211,8 @@ SYNOPSIS
   struct perf_record_header_feature;
   struct perf_record_compressed;
   struct perf_record_compressed2;
+  struct perf_record_schedstat_cpu;
+  struct perf_record_schedstat_domain;
 --
 
 DESCRIPTION
diff --git a/tools/lib/perf/Makefile b/tools/lib/perf/Makefile
index 7fbb50b74c00..9fa28e512ca8 100644
--- a/tools/lib/perf/Makefile
+++ b/tools/lib/perf/Makefile
@@ -179,6 +179,7 @@ install_lib: libs
 		cp -fpR $(LIBPERF_ALL) $(DESTDIR)$(libdir_SQ)
 
 HDRS := bpf_perf.h core.h cpumap.h threadmap.h evlist.h evsel.h event.h mmap.h
+HDRS += schedstat-v15.h
 INTERNAL_HDRS := cpumap.h evlist.h evsel.h lib.h mmap.h rc_check.h threadmap.h xyarray.h
 
 INSTALL_HDRS_PFX := $(DESTDIR)$(prefix)/include/perf
diff --git a/tools/lib/perf/include/perf/event.h b/tools/lib/perf/include/perf/event.h
index 43a8cb04994f..ce04fed7cefc 100644
--- a/tools/lib/perf/include/perf/event.h
+++ b/tools/lib/perf/include/perf/event.h
@@ -496,6 +496,43 @@ struct perf_record_bpf_metadata {
 	struct perf_record_bpf_metadata_entry entries[];
 };
 
+struct perf_record_schedstat_cpu_v15 {
+#define CPU_FIELD(_type, _name, _desc, _format, _is_pct, _pct_of, _ver)		_type _name
+#include "schedstat-v15.h"
+#undef CPU_FIELD
+};
+
+struct perf_record_schedstat_cpu {
+	struct perf_event_header header;
+	__u64			 timestamp;
+	__u32			 cpu;
+	__u16			 version;
+	/* Padding */
+	char			 __pad[2];
+	union {
+		struct perf_record_schedstat_cpu_v15 v15;
+	};
+};
+
+struct perf_record_schedstat_domain_v15 {
+#define DOMAIN_FIELD(_type, _name, _desc, _format, _is_jiffies, _ver)		_type _name
+#include "schedstat-v15.h"
+#undef DOMAIN_FIELD
+};
+
+#define DOMAIN_NAME_LEN		16
+
+struct perf_record_schedstat_domain {
+	struct perf_event_header header;
+	__u64			 timestamp;
+	__u32			 cpu;
+	__u16			 version;
+	__u16			 domain;
+	union {
+		struct perf_record_schedstat_domain_v15 v15;
+	};
+};
+
 enum perf_user_event_type { /* above any possible kernel type */
 	PERF_RECORD_USER_TYPE_START		= 64,
 	PERF_RECORD_HEADER_ATTR			= 64,
@@ -519,6 +556,8 @@ enum perf_user_event_type { /* above any possible kernel type */
 	PERF_RECORD_FINISHED_INIT		= 82,
 	PERF_RECORD_COMPRESSED2			= 83,
 	PERF_RECORD_BPF_METADATA		= 84,
+	PERF_RECORD_SCHEDSTAT_CPU		= 85,
+	PERF_RECORD_SCHEDSTAT_DOMAIN		= 86,
 	PERF_RECORD_HEADER_MAX
 };
 
@@ -562,6 +601,8 @@ union perf_event {
 	struct perf_record_compressed		pack;
 	struct perf_record_compressed2		pack2;
 	struct perf_record_bpf_metadata		bpf_metadata;
+	struct perf_record_schedstat_cpu	schedstat_cpu;
+	struct perf_record_schedstat_domain	schedstat_domain;
 };
 
 #endif /* __LIBPERF_EVENT_H */
diff --git a/tools/lib/perf/include/perf/schedstat-v15.h b/tools/lib/perf/include/perf/schedstat-v15.h
new file mode 100644
index 000000000000..639458df05f8
--- /dev/null
+++ b/tools/lib/perf/include/perf/schedstat-v15.h
@@ -0,0 +1,146 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifdef CPU_FIELD
+CPU_FIELD(__u32, yld_count, "sched_yield() count",
+	  "%11u", false, yld_count, v15);
+CPU_FIELD(__u32, array_exp, "Legacy counter can be ignored",
+	  "%11u", false, array_exp, v15);
+CPU_FIELD(__u32, sched_count, "schedule() called",
+	  "%11u", false, sched_count, v15);
+CPU_FIELD(__u32, sched_goidle, "schedule() left the processor idle",
+	  "%11u", true, sched_count, v15);
+CPU_FIELD(__u32, ttwu_count, "try_to_wake_up() was called",
+	  "%11u", false, ttwu_count, v15);
+CPU_FIELD(__u32, ttwu_local, "try_to_wake_up() was called to wake up the local cpu",
+	  "%11u", true, ttwu_count, v15);
+CPU_FIELD(__u64, rq_cpu_time, "total runtime by tasks on this processor (in jiffies)",
+	  "%11llu", false, rq_cpu_time, v15);
+CPU_FIELD(__u64, run_delay, "total waittime by tasks on this processor (in jiffies)",
+	  "%11llu", true, rq_cpu_time, v15);
+CPU_FIELD(__u64, pcount, "total timeslices run on this cpu",
+	  "%11llu", false, pcount, v15);
+#endif
+
+#ifdef DOMAIN_FIELD
+#ifdef DOMAIN_CATEGORY
+DOMAIN_CATEGORY(" <Category idle> ");
+#endif
+DOMAIN_FIELD(__u32, idle_lb_count,
+	     "load_balance() count on cpu idle", "%11u", true, v15);
+DOMAIN_FIELD(__u32, idle_lb_balanced,
+	     "load_balance() found balanced on cpu idle", "%11u", true, v15);
+DOMAIN_FIELD(__u32, idle_lb_failed,
+	     "load_balance() move task failed on cpu idle", "%11u", true, v15);
+DOMAIN_FIELD(__u32, idle_lb_imbalance,
+	     "imbalance sum on cpu idle", "%11u", false, v15);
+DOMAIN_FIELD(__u32, idle_lb_gained,
+	     "pull_task() count on cpu idle", "%11u", false, v15);
+DOMAIN_FIELD(__u32, idle_lb_hot_gained,
+	     "pull_task() when target task was cache-hot on cpu idle", "%11u", false, v15);
+DOMAIN_FIELD(__u32, idle_lb_nobusyq,
+	     "load_balance() failed to find busier queue on cpu idle", "%11u", true, v15);
+DOMAIN_FIELD(__u32, idle_lb_nobusyg,
+	     "load_balance() failed to find busier group on cpu idle", "%11u", true, v15);
+#ifdef DERIVED_CNT_FIELD
+DERIVED_CNT_FIELD(idle_lb_success_count, "load_balance() success count on cpu idle", "%11u",
+		  idle_lb_count, idle_lb_balanced, idle_lb_failed, v15);
+#endif
+#ifdef DERIVED_AVG_FIELD
+DERIVED_AVG_FIELD(idle_lb_avg_pulled,
+		  "avg task pulled per successful lb attempt (cpu idle)", "%11.2Lf",
+		  idle_lb_count, idle_lb_balanced, idle_lb_failed, idle_lb_gained, v15);
+#endif
+#ifdef DOMAIN_CATEGORY
+DOMAIN_CATEGORY(" <Category busy> ");
+#endif
+DOMAIN_FIELD(__u32, busy_lb_count,
+	     "load_balance() count on cpu busy", "%11u", true, v15);
+DOMAIN_FIELD(__u32, busy_lb_balanced,
+	     "load_balance() found balanced on cpu busy", "%11u", true, v15);
+DOMAIN_FIELD(__u32, busy_lb_failed,
+	     "load_balance() move task failed on cpu busy", "%11u", true, v15);
+DOMAIN_FIELD(__u32, busy_lb_imbalance,
+	     "imbalance sum on cpu busy", "%11u", false, v15);
+DOMAIN_FIELD(__u32, busy_lb_gained,
+	     "pull_task() count on cpu busy", "%11u", false, v15);
+DOMAIN_FIELD(__u32, busy_lb_hot_gained,
+	     "pull_task() when target task was cache-hot on cpu busy", "%11u", false, v15);
+DOMAIN_FIELD(__u32, busy_lb_nobusyq,
+	     "load_balance() failed to find busier queue on cpu busy", "%11u", true, v15);
+DOMAIN_FIELD(__u32, busy_lb_nobusyg,
+	     "load_balance() failed to find busier group on cpu busy", "%11u", true, v15);
+#ifdef DERIVED_CNT_FIELD
+DERIVED_CNT_FIELD(busy_lb_success_count, "load_balance() success count on cpu busy", "%11u",
+		  busy_lb_count, busy_lb_balanced, busy_lb_failed, v15);
+#endif
+#ifdef DERIVED_AVG_FIELD
+DERIVED_AVG_FIELD(busy_lb_avg_pulled,
+		  "avg task pulled per successful lb attempt (cpu busy)", "%11.2Lf",
+		  busy_lb_count, busy_lb_balanced, busy_lb_failed, busy_lb_gained, v15);
+#endif
+#ifdef DOMAIN_CATEGORY
+DOMAIN_CATEGORY(" <Category newidle> ");
+#endif
+DOMAIN_FIELD(__u32, newidle_lb_count,
+	     "load_balance() count on cpu newly idle", "%11u", true, v15);
+DOMAIN_FIELD(__u32, newidle_lb_balanced,
+	     "load_balance() found balanced on cpu newly idle", "%11u", true, v15);
+DOMAIN_FIELD(__u32, newidle_lb_failed,
+	     "load_balance() move task failed on cpu newly idle", "%11u", true, v15);
+DOMAIN_FIELD(__u32, newidle_lb_imbalance,
+	     "imbalance sum on cpu newly idle", "%11u", false, v15);
+DOMAIN_FIELD(__u32, newidle_lb_gained,
+	     "pull_task() count on cpu newly idle", "%11u", false, v15);
+DOMAIN_FIELD(__u32, newidle_lb_hot_gained,
+	     "pull_task() when target task was cache-hot on cpu newly idle", "%11u", false, v15);
+DOMAIN_FIELD(__u32, newidle_lb_nobusyq,
+	     "load_balance() failed to find busier queue on cpu newly idle", "%11u", true, v15);
+DOMAIN_FIELD(__u32, newidle_lb_nobusyg,
+	     "load_balance() failed to find busier group on cpu newly idle", "%11u", true, v15);
+#ifdef DERIVED_CNT_FIELD
+DERIVED_CNT_FIELD(newidle_lb_success_count,
+		  "load_balance() success count on cpu newly idle", "%11u",
+		  newidle_lb_count, newidle_lb_balanced, newidle_lb_failed, v15);
+#endif
+#ifdef DERIVED_AVG_FIELD
+DERIVED_AVG_FIELD(newidle_lb_avg_pulled,
+		  "avg task pulled per successful lb attempt (cpu newly idle)", "%11.2Lf",
+		  newidle_lb_count, newidle_lb_balanced, newidle_lb_failed, newidle_lb_gained, v15);
+#endif
+#ifdef DOMAIN_CATEGORY
+DOMAIN_CATEGORY(" <Category active_load_balance()> ");
+#endif
+DOMAIN_FIELD(__u32, alb_count,
+	     "active_load_balance() count", "%11u", false, v15);
+DOMAIN_FIELD(__u32, alb_failed,
+	     "active_load_balance() move task failed", "%11u", false, v15);
+DOMAIN_FIELD(__u32, alb_pushed,
+	     "active_load_balance() successfully moved a task", "%11u", false, v15);
+#ifdef DOMAIN_CATEGORY
+DOMAIN_CATEGORY(" <Category sched_balance_exec()> ");
+#endif
+DOMAIN_FIELD(__u32, sbe_count,
+	     "sbe_count is not used", "%11u", false, v15);
+DOMAIN_FIELD(__u32, sbe_balanced,
+	     "sbe_balanced is not used", "%11u", false, v15);
+DOMAIN_FIELD(__u32, sbe_pushed,
+	     "sbe_pushed is not used", "%11u", false, v15);
+#ifdef DOMAIN_CATEGORY
+DOMAIN_CATEGORY(" <Category sched_balance_fork()> ");
+#endif
+DOMAIN_FIELD(__u32, sbf_count,
+	     "sbf_count is not used", "%11u", false, v15);
+DOMAIN_FIELD(__u32, sbf_balanced,
+	     "sbf_balanced is not used", "%11u", false, v15);
+DOMAIN_FIELD(__u32, sbf_pushed,
+	     "sbf_pushed is not used", "%11u", false, v15);
+#ifdef DOMAIN_CATEGORY
+DOMAIN_CATEGORY(" <Wakeup Info> ");
+#endif
+DOMAIN_FIELD(__u32, ttwu_wake_remote,
+	     "try_to_wake_up() awoke a task that last ran on a diff cpu", "%11u", false, v15);
+DOMAIN_FIELD(__u32, ttwu_move_affine,
+	     "try_to_wake_up() moved task because cache-cold on own cpu", "%11u", false, v15);
+DOMAIN_FIELD(__u32, ttwu_move_balance,
+	     "try_to_wake_up() started passive balancing", "%11u", false, v15);
+#endif /* DOMAIN_FIELD */
diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c
index c89ac85ec112..2c9456614cde 100644
--- a/tools/perf/builtin-inject.c
+++ b/tools/perf/builtin-inject.c
@@ -2657,6 +2657,8 @@ int cmd_inject(int argc, const char **argv)
 	inject.tool.compressed		= perf_event__repipe_op4_synth;
 	inject.tool.auxtrace		= perf_event__repipe_auxtrace;
 	inject.tool.bpf_metadata	= perf_event__repipe_op2_synth;
+	inject.tool.schedstat_cpu	= perf_event__repipe_op2_synth;
+	inject.tool.schedstat_domain	= perf_event__repipe_op2_synth;
 	inject.tool.dont_split_sample_group = true;
 	inject.tool.merge_deferred_callchains = false;
 	inject.session = __perf_session__new(&data, &inject.tool,
diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index eca3b1c58c4b..ee3b4e42156e 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -28,6 +28,8 @@
 #include "util/debug.h"
 #include "util/event.h"
 #include "util/util.h"
+#include "util/synthetic-events.h"
+#include "util/target.h"
 
 #include <linux/kernel.h>
 #include <linux/log2.h>
@@ -55,6 +57,7 @@
 #define MAX_PRIO		140
 
 static const char *cpu_list;
+static struct perf_cpu_map *user_requested_cpus;
 static DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS);
 
 struct sched_atom;
@@ -236,6 +239,9 @@ struct perf_sched {
 	volatile bool   thread_funcs_exit;
 	const char	*prio_str;
 	DECLARE_BITMAP(prio_bitmap, MAX_PRIO);
+
+	struct perf_session *session;
+	struct perf_data *data;
 };
 
 /* per thread run time data */
@@ -3734,6 +3740,195 @@ static void setup_sorting(struct perf_sched *sched, const struct option *options
 	sort_dimension__add("pid", &sched->cmp_pid);
 }
 
+static int process_synthesized_schedstat_event(const struct perf_tool *tool,
+					       union perf_event *event,
+					       struct perf_sample *sample __maybe_unused,
+					       struct machine *machine __maybe_unused)
+{
+	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
+
+	if (perf_data__write(sched->data, event, event->header.size) <= 0) {
+		pr_err("failed to write perf data, error: %m\n");
+		return -1;
+	}
+
+	sched->session->header.data_size += event->header.size;
+	return 0;
+}
+
+static void sighandler(int sig __maybe_unused)
+{
+}
+
+static int enable_sched_schedstats(int *reset)
+{
+	char path[PATH_MAX];
+	FILE *fp;
+	char ch;
+
+	snprintf(path, PATH_MAX, "%s/sys/kernel/sched_schedstats", procfs__mountpoint());
+	fp = fopen(path, "w+");
+	if (!fp) {
+		pr_err("Failed to open %s\n", path);
+		return -1;
+	}
+
+	ch = getc(fp);
+	if (ch == '0') {
+		*reset = 1;
+		rewind(fp);
+		putc('1', fp);
+		fclose(fp);
+	}
+	return 0;
+}
+
+static int disable_sched_schedstat(void)
+{
+	char path[PATH_MAX];
+	FILE *fp;
+
+	snprintf(path, PATH_MAX, "%s/sys/kernel/sched_schedstats", procfs__mountpoint());
+	fp = fopen(path, "w");
+	if (!fp) {
+		pr_err("Failed to open %s\n", path);
+		return -1;
+	}
+
+	putc('0', fp);
+	fclose(fp);
+	return 0;
+}
+
+/* perf.data or any other output file name used by stats subcommand (only). */
+const char *output_name;
+
+static int perf_sched__schedstat_record(struct perf_sched *sched,
+					int argc, const char **argv)
+{
+	struct perf_session *session;
+	struct target target = {};
+	struct evlist *evlist;
+	int reset = 0;
+	int err = 0;
+	int fd;
+	struct perf_data data = {
+		.path  = output_name,
+		.mode  = PERF_DATA_MODE_WRITE,
+	};
+
+	signal(SIGINT, sighandler);
+	signal(SIGCHLD, sighandler);
+	signal(SIGTERM, sighandler);
+
+	evlist = evlist__new();
+	if (!evlist)
+		return -ENOMEM;
+
+	session = perf_session__new(&data, &sched->tool);
+	if (IS_ERR(session)) {
+		pr_err("Perf session creation failed.\n");
+		evlist__delete(evlist);
+		return PTR_ERR(session);
+	}
+
+	session->evlist = evlist;
+
+	sched->session = session;
+	sched->data = &data;
+
+	fd = perf_data__fd(&data);
+
+	/*
+	 * Capture all important metadata about the system. Although they are
+	 * not used by `perf sched stats` tool directly, they provide useful
+	 * information about profiled environment.
+	 */
+	perf_header__set_feat(&session->header, HEADER_HOSTNAME);
+	perf_header__set_feat(&session->header, HEADER_OSRELEASE);
+	perf_header__set_feat(&session->header, HEADER_VERSION);
+	perf_header__set_feat(&session->header, HEADER_ARCH);
+	perf_header__set_feat(&session->header, HEADER_NRCPUS);
+	perf_header__set_feat(&session->header, HEADER_CPUDESC);
+	perf_header__set_feat(&session->header, HEADER_CPUID);
+	perf_header__set_feat(&session->header, HEADER_TOTAL_MEM);
+	perf_header__set_feat(&session->header, HEADER_CMDLINE);
+	perf_header__set_feat(&session->header, HEADER_CPU_TOPOLOGY);
+	perf_header__set_feat(&session->header, HEADER_NUMA_TOPOLOGY);
+	perf_header__set_feat(&session->header, HEADER_CACHE);
+	perf_header__set_feat(&session->header, HEADER_MEM_TOPOLOGY);
+	perf_header__set_feat(&session->header, HEADER_HYBRID_TOPOLOGY);
+	perf_header__set_feat(&session->header, HEADER_CPU_DOMAIN_INFO);
+
+	err = perf_session__write_header(session, evlist, fd, false);
+	if (err < 0)
+		goto out;
+
+	/*
+	 * `perf sched stats` does not support workload profiling (-p pid)
+	 * since /proc/schedstat file contains cpu specific data only. Hence, a
+	 * profile target is either set of cpus or systemwide, never a process.
+	 * Note that, although `-- <workload>` is supported, profile data are
+	 * still cpu/systemwide.
+	 */
+	if (cpu_list)
+		target.cpu_list = cpu_list;
+	else
+		target.system_wide = true;
+
+	if (argc) {
+		err = evlist__prepare_workload(evlist, &target, argv, false, NULL);
+		if (err)
+			goto out;
+	}
+
+	err = evlist__create_maps(evlist, &target);
+	if (err < 0)
+		goto out;
+
+	user_requested_cpus = evlist->core.user_requested_cpus;
+
+	err = perf_event__synthesize_schedstat(&(sched->tool),
+					       process_synthesized_schedstat_event,
+					       user_requested_cpus);
+	if (err < 0)
+		goto out;
+
+	err = enable_sched_schedstats(&reset);
+	if (err < 0)
+		goto out;
+
+	if (argc)
+		evlist__start_workload(evlist);
+
+	/* wait for signal */
+	pause();
+
+	if (reset) {
+		err = disable_sched_schedstat();
+		if (err < 0)
+			goto out;
+	}
+
+	err = perf_event__synthesize_schedstat(&(sched->tool),
+					       process_synthesized_schedstat_event,
+					       user_requested_cpus);
+	if (err < 0)
+		goto out;
+
+	err = perf_session__write_header(session, evlist, fd, true);
+
+out:
+	if (!err)
+		fprintf(stderr, "[ perf sched stats: Wrote samples to %s ]\n", data.path);
+	else
+		fprintf(stderr, "[ perf sched stats: Failed !! ]\n");
+
+	evlist__delete(evlist);
+	close(fd);
+	return err;
+}
+
 static bool schedstat_events_exposed(void)
 {
 	/*
@@ -3910,6 +4105,12 @@ int cmd_sched(int argc, const char **argv)
 	OPT_BOOLEAN('P', "pre-migrations", &sched.pre_migrations, "Show pre-migration wait time"),
 	OPT_PARENT(sched_options)
 	};
+	const struct option stats_options[] = {
+	OPT_STRING('o', "output", &output_name, "file",
+		   "`stats record` with output filename"),
+	OPT_STRING('C', "cpu", &cpu_list, "cpu", "list of cpus to profile"),
+	OPT_END()
+	};
 
 	const char * const latency_usage[] = {
 		"perf sched latency [<options>]",
@@ -3927,9 +4128,13 @@ int cmd_sched(int argc, const char **argv)
 		"perf sched timehist [<options>]",
 		NULL
 	};
+	const char *stats_usage[] = {
+		"perf sched stats {record} [<options>]",
+		NULL
+	};
 	const char *const sched_subcommands[] = { "record", "latency", "map",
 						  "replay", "script",
-						  "timehist", NULL };
+						  "timehist", "stats", NULL };
 	const char *sched_usage[] = {
 		NULL,
 		NULL
@@ -4027,6 +4232,21 @@ int cmd_sched(int argc, const char **argv)
 		ret = symbol__validate_sym_arguments();
 		if (!ret)
 			ret = perf_sched__timehist(&sched);
+	} else if (!strcmp(argv[0], "stats")) {
+		const char *const stats_subcommands[] = {"record", NULL};
+
+		argc = parse_options_subcommand(argc, argv, stats_options,
+						stats_subcommands,
+						stats_usage,
+						PARSE_OPT_STOP_AT_NON_OPTION);
+
+		if (argv[0] && !strcmp(argv[0], "record")) {
+			if (argc)
+				argc = parse_options(argc, argv, stats_options,
+						     stats_usage, 0);
+			return perf_sched__schedstat_record(&sched, argc, argv);
+		}
+		usage_with_options(stats_usage, stats_options);
 	} else {
 		usage_with_options(sched_usage, sched_options);
 	}
diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c
index 4c92cc1a952c..5a98c16e1092 100644
--- a/tools/perf/util/event.c
+++ b/tools/perf/util/event.c
@@ -83,6 +83,8 @@ static const char *perf_event__names[] = {
 	[PERF_RECORD_FINISHED_INIT]		= "FINISHED_INIT",
 	[PERF_RECORD_COMPRESSED2]		= "COMPRESSED2",
 	[PERF_RECORD_BPF_METADATA]		= "BPF_METADATA",
+	[PERF_RECORD_SCHEDSTAT_CPU]		= "SCHEDSTAT_CPU",
+	[PERF_RECORD_SCHEDSTAT_DOMAIN]		= "SCHEDSTAT_DOMAIN",
 };
 
 const char *perf_event__name(unsigned int id)
@@ -571,6 +573,44 @@ size_t perf_event__fprintf_text_poke(union perf_event *event, struct machine *ma
 	return ret;
 }
 
+size_t perf_event__fprintf_schedstat_cpu(union perf_event *event, FILE *fp)
+{
+	struct perf_record_schedstat_cpu *cs = &event->schedstat_cpu;
+	size_t size = fprintf(fp, "\ncpu%u ", cs->cpu);
+	__u16 version = cs->version;
+
+#define CPU_FIELD(_type, _name, _desc, _format, _is_pct, _pct_of, _ver)		\
+	size += fprintf(fp, "%" PRIu64 " ", (uint64_t)cs->_ver._name)
+
+	if (version == 15) {
+#include <perf/schedstat-v15.h>
+		return size;
+	}
+#undef CPU_FIELD
+
+	return fprintf(fp, "Unsupported /proc/schedstat version %d.\n",
+		       event->schedstat_cpu.version);
+}
+
+size_t perf_event__fprintf_schedstat_domain(union perf_event *event, FILE *fp)
+{
+	struct perf_record_schedstat_domain *ds = &event->schedstat_domain;
+	__u16 version = ds->version;
+	size_t size = fprintf(fp, "\ndomain%u ", ds->domain);
+
+#define DOMAIN_FIELD(_type, _name, _desc, _format, _is_jiffies, _ver)		\
+	size += fprintf(fp, "%" PRIu64 " ", (uint64_t)ds->_ver._name)
+
+	if (version == 15) {
+#include <perf/schedstat-v15.h>
+		return size;
+	}
+#undef DOMAIN_FIELD
+
+	return fprintf(fp, "Unsupported /proc/schedstat version %d.\n",
+		       event->schedstat_domain.version);
+}
+
 size_t perf_event__fprintf(union perf_event *event, struct machine *machine, FILE *fp)
 {
 	size_t ret = fprintf(fp, "PERF_RECORD_%s",
diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h
index 64c63b59d617..2ea83fdf8a03 100644
--- a/tools/perf/util/event.h
+++ b/tools/perf/util/event.h
@@ -392,6 +392,8 @@ size_t perf_event__fprintf_ksymbol(union perf_event *event, FILE *fp);
 size_t perf_event__fprintf_bpf(union perf_event *event, FILE *fp);
 size_t perf_event__fprintf_bpf_metadata(union perf_event *event, FILE *fp);
 size_t perf_event__fprintf_text_poke(union perf_event *event, struct machine *machine,FILE *fp);
+size_t perf_event__fprintf_schedstat_cpu(union perf_event *event, FILE *fp);
+size_t perf_event__fprintf_schedstat_domain(union perf_event *event, FILE *fp);
 size_t perf_event__fprintf(union perf_event *event, struct machine *machine, FILE *fp);
 
 int kallsyms__get_function_start(const char *kallsyms_filename,
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index d7b28cb4e672..c0231bc000e7 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -698,6 +698,20 @@ static void perf_event__time_conv_swap(union perf_event *event,
 	}
 }
 
+static void
+perf_event__schedstat_cpu_swap(union perf_event *event __maybe_unused,
+			       bool sample_id_all __maybe_unused)
+{
+	/* FIXME */
+}
+
+static void
+perf_event__schedstat_domain_swap(union perf_event *event __maybe_unused,
+				  bool sample_id_all __maybe_unused)
+{
+	/* FIXME */
+}
+
 typedef void (*perf_event__swap_op)(union perf_event *event,
 				    bool sample_id_all);
 
@@ -737,6 +751,8 @@ static perf_event__swap_op perf_event__swap_ops[] = {
 	[PERF_RECORD_STAT_ROUND]	  = perf_event__stat_round_swap,
 	[PERF_RECORD_EVENT_UPDATE]	  = perf_event__event_update_swap,
 	[PERF_RECORD_TIME_CONV]		  = perf_event__time_conv_swap,
+	[PERF_RECORD_SCHEDSTAT_CPU]	  = perf_event__schedstat_cpu_swap,
+	[PERF_RECORD_SCHEDSTAT_DOMAIN]	  = perf_event__schedstat_domain_swap,
 	[PERF_RECORD_HEADER_MAX]	  = NULL,
 };
 
@@ -1667,6 +1683,12 @@ static s64 perf_session__process_user_event(struct perf_session *session,
 	case PERF_RECORD_BPF_METADATA:
 		err = tool->bpf_metadata(tool, session, event);
 		break;
+	case PERF_RECORD_SCHEDSTAT_CPU:
+		err = tool->schedstat_cpu(tool, session, event);
+		break;
+	case PERF_RECORD_SCHEDSTAT_DOMAIN:
+		err = tool->schedstat_domain(tool, session, event);
+		break;
 	default:
 		err = -EINVAL;
 		break;
diff --git a/tools/perf/util/synthetic-events.c b/tools/perf/util/synthetic-events.c
index 2ba9fa25e00a..5366ea921e70 100644
--- a/tools/perf/util/synthetic-events.c
+++ b/tools/perf/util/synthetic-events.c
@@ -2529,3 +2529,182 @@ int parse_synth_opt(char *synth)
 
 	return ret;
 }
+
+static union perf_event *__synthesize_schedstat_cpu(struct io *io, __u16 version,
+						    __u64 *cpu, __u64 timestamp)
+{
+	struct perf_record_schedstat_cpu *cs;
+	union perf_event *event;
+	size_t size;
+	char ch;
+
+	size = sizeof(*cs);
+	size = PERF_ALIGN(size, sizeof(u64));
+	event = zalloc(size);
+
+	if (!event)
+		return NULL;
+
+	cs = &event->schedstat_cpu;
+	cs->header.type = PERF_RECORD_SCHEDSTAT_CPU;
+	cs->header.size = size;
+	cs->timestamp = timestamp;
+
+	if (io__get_char(io) != 'p' || io__get_char(io) != 'u')
+		goto out_cpu;
+
+	if (io__get_dec(io, (__u64 *)cpu) != ' ')
+		goto out_cpu;
+
+#define CPU_FIELD(_type, _name, _desc, _format, _is_pct, _pct_of, _ver)	\
+	do {								\
+		__u64 _tmp;						\
+		ch = io__get_dec(io, &_tmp);				\
+		if (ch != ' ' && ch != '\n')				\
+			goto out_cpu;					\
+		cs->_ver._name = _tmp;					\
+	} while (0)
+
+	if (version == 15) {
+#include <perf/schedstat-v15.h>
+	}
+#undef CPU_FIELD
+
+	cs->cpu = *cpu;
+	cs->version = version;
+
+	return event;
+out_cpu:
+	free(event);
+	return NULL;
+}
+
+static union perf_event *__synthesize_schedstat_domain(struct io *io, __u16 version,
+						       __u64 cpu, __u64 timestamp)
+{
+	struct perf_record_schedstat_domain *ds;
+	union perf_event *event = NULL;
+	__u64 d_num;
+	size_t size;
+	char ch;
+
+	if (io__get_char(io) != 'o' || io__get_char(io) != 'm' || io__get_char(io) != 'a' ||
+	    io__get_char(io) != 'i' || io__get_char(io) != 'n')
+		return NULL;
+
+	ch = io__get_dec(io, &d_num);
+
+	/* Skip cpumask as it can be extracted from perf header */
+	while (io__get_char(io) != ' ')
+		continue;
+
+	size = sizeof(*ds);
+	size = PERF_ALIGN(size, sizeof(u64));
+	event = zalloc(size);
+
+	ds = &event->schedstat_domain;
+	ds->header.type = PERF_RECORD_SCHEDSTAT_DOMAIN;
+	ds->header.size = size;
+	ds->version = version;
+	ds->timestamp = timestamp;
+	ds->domain = d_num;
+
+#define DOMAIN_FIELD(_type, _name, _desc, _format, _is_jiffies, _ver)	\
+	do {								\
+		__u64 _tmp;						\
+		ch = io__get_dec(io, &_tmp);				\
+		if (ch != ' ' && ch != '\n')				\
+			goto out_domain;				\
+		ds->_ver._name = _tmp;					\
+	} while (0)
+
+	if (version == 15) {
+#include <perf/schedstat-v15.h>
+	}
+#undef DOMAIN_FIELD
+
+	ds->cpu = cpu;
+	goto out;
+
+out_domain:
+	free(event);
+	event = NULL;
+out:
+	return event;
+}
+
+int perf_event__synthesize_schedstat(const struct perf_tool *tool,
+				     perf_event__handler_t process,
+				     struct perf_cpu_map *user_requested_cpus)
+{
+	char *line = NULL, path[PATH_MAX];
+	union perf_event *event = NULL;
+	size_t line_len = 0;
+	char bf[BUFSIZ];
+	__u64 timestamp;
+	__u64 cpu = -1;
+	__u16 version;
+	struct io io;
+	int ret = -1;
+	char ch;
+
+	snprintf(path, PATH_MAX, "%s/schedstat", procfs__mountpoint());
+	io.fd = open(path, O_RDONLY, 0);
+	if (io.fd < 0) {
+		pr_err("Failed to open %s. Possibly CONFIG_SCHEDSTAT is disabled.\n", path);
+		return -1;
+	}
+	io__init(&io, io.fd, bf, sizeof(bf));
+
+	if (io__getline(&io, &line, &line_len) < 0 || !line_len)
+		goto out;
+
+	if (!strcmp(line, "version 15\n")) {
+		version = 15;
+	} else {
+		pr_err("Unsupported %s version: %s", path, line + 8);
+		goto out_free_line;
+	}
+
+	if (io__getline(&io, &line, &line_len) < 0 || !line_len)
+		goto out_free_line;
+	timestamp = atol(line + 10);
+
+	/*
+	 * FIXME: Can be optimized a bit by not synthesizing domain samples
+	 * for filtered out cpus.
+	 */
+	for (ch = io__get_char(&io); !io.eof; ch = io__get_char(&io)) {
+		struct perf_cpu this_cpu;
+
+		if (ch == 'c') {
+			event = __synthesize_schedstat_cpu(&io, version,
+							   &cpu, timestamp);
+		} else if (ch == 'd') {
+			event = __synthesize_schedstat_domain(&io, version,
+							      cpu, timestamp);
+		}
+		if (!event)
+			goto out_free_line;
+
+		this_cpu.cpu = cpu;
+
+		if (user_requested_cpus && !perf_cpu_map__has(user_requested_cpus, this_cpu))
+			continue;
+
+		if (process(tool, event, NULL, NULL) < 0) {
+			free(event);
+			goto out_free_line;
+		}
+
+		free(event);
+	}
+
+	ret = 0;
+
+out_free_line:
+	free(line);
+out:
+	close(io.fd);
+	return ret;
+}
diff --git a/tools/perf/util/synthetic-events.h b/tools/perf/util/synthetic-events.h
index f8588b6cf11a..b0edad0c3100 100644
--- a/tools/perf/util/synthetic-events.h
+++ b/tools/perf/util/synthetic-events.h
@@ -128,4 +128,7 @@ int perf_event__synthesize_for_pipe(const struct perf_tool *tool,
 				    struct perf_data *data,
 				    perf_event__handler_t process);
 
+int perf_event__synthesize_schedstat(const struct perf_tool *tool,
+				     perf_event__handler_t process,
+				     struct perf_cpu_map *user_requested_cpu);
 #endif // __PERF_SYNTHETIC_EVENTS_H
diff --git a/tools/perf/util/tool.c b/tools/perf/util/tool.c
index 27ba5849c74a..013c7839e2cf 100644
--- a/tools/perf/util/tool.c
+++ b/tools/perf/util/tool.c
@@ -253,7 +253,25 @@ static int perf_event__process_bpf_metadata_stub(const struct perf_tool *tool __
 {
 	if (dump_trace)
 		perf_event__fprintf_bpf_metadata(event, stdout);
+	dump_printf(": unhandled!\n");
+	return 0;
+}
+static int process_schedstat_cpu_stub(const struct perf_tool *tool __maybe_unused,
+				      struct perf_session *perf_session __maybe_unused,
+				      union perf_event *event)
+{
+	if (dump_trace)
+		perf_event__fprintf_schedstat_cpu(event, stdout);
+	dump_printf(": unhandled!\n");
+	return 0;
+}
 
+static int process_schedstat_domain_stub(const struct perf_tool *tool __maybe_unused,
+					 struct perf_session *perf_session __maybe_unused,
+					 union perf_event *event)
+{
+	if (dump_trace)
+		perf_event__fprintf_schedstat_domain(event, stdout);
 	dump_printf(": unhandled!\n");
 	return 0;
 }
@@ -317,6 +335,8 @@ void perf_tool__init(struct perf_tool *tool, bool ordered_events)
 #endif
 	tool->finished_init = process_event_op2_stub;
 	tool->bpf_metadata = perf_event__process_bpf_metadata_stub;
+	tool->schedstat_cpu = process_schedstat_cpu_stub;
+	tool->schedstat_domain = process_schedstat_domain_stub;
 }
 
 bool perf_tool__compressed_is_stub(const struct perf_tool *tool)
diff --git a/tools/perf/util/tool.h b/tools/perf/util/tool.h
index e96b69d25a5b..2d9a4b1ca9d0 100644
--- a/tools/perf/util/tool.h
+++ b/tools/perf/util/tool.h
@@ -81,7 +81,9 @@ struct perf_tool {
 			stat_round,
 			feature,
 			finished_init,
-			bpf_metadata;
+			bpf_metadata,
+			schedstat_cpu,
+			schedstat_domain;
 	event_op4	compressed;
 	event_op3	auxtrace;
 	bool		ordered_events;
-- 
cgit v1.2.3


From 55657d7ac8caa98c7c0ef241bf64e176db899b4d Mon Sep 17 00:00:00 2001
From: Swapnil Sapkal <swapnil.sapkal@amd.com>
Date: Mon, 19 Jan 2026 17:58:26 +0000
Subject: perf sched stats: Add schedstat v16 support

The /proc/schedstat file output is standardized with version number.

Add support to record and raw dump v16 version layout.

Version 16 of schedstats changed the order of definitions within 'enum
cpu_idle_type', which changed the order of [CPU_MAX_IDLE_TYPES] columns
in show_schedstat().

In particular the position of CPU_IDLE and __CPU_NOT_IDLE changed
places.

Co-developed-by: Ravi Bangoria <ravi.bangoria@amd.com>
Signed-off-by: Ravi Bangoria <ravi.bangoria@amd.com>
Signed-off-by: Swapnil Sapkal <swapnil.sapkal@amd.com>
Tested-by: Chen Yu <yu.c.chen@intel.com>
Tested-by: James Clark <james.clark@linaro.org>
Acked-by: Ian Rogers <irogers@google.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Anubhav Shelat <ashelat@redhat.com>
Cc: Ben Gainey <ben.gainey@arm.com>
Cc: Blake Jones <blakejones@google.com>
Cc: Chun-Tse Shao <ctshao@google.com>
Cc: David Vernet <void@manifault.com>
Cc: Dmitriy Vyukov <dvyukov@google.com>
Cc: Dr. David Alan Gilbert <linux@treblig.org>
Cc: Gautham Shenoy <gautham.shenoy@amd.com>
Cc: Graham Woodward <graham.woodward@arm.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@arm.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: K Prateek Nayak <kprateek.nayak@amd.com>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Leo Yan <leo.yan@arm.com>
Cc: Madadi Vineeth Reddy <vineethr@linux.ibm.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Sandipan Das <sandipan.das@amd.com>
Cc: Santosh Shukla <santosh.shukla@amd.com>
Cc: Shrikanth Hegde <sshegde@linux.ibm.com>
Cc: Steven Rostedt (VMware) <rostedt@goodmis.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Thomas Falcon <thomas.falcon@intel.com>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Yang Jihong <yangjihong@bytedance.com>
Cc: Yujie Liu <yujie.liu@intel.com>
Cc: Zhongqiu Han <quic_zhonhan@quicinc.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/lib/perf/Makefile                     |   2 +-
 tools/lib/perf/include/perf/event.h         |  14 +++
 tools/lib/perf/include/perf/schedstat-v16.h | 146 ++++++++++++++++++++++++++++
 tools/perf/util/event.c                     |   6 ++
 tools/perf/util/synthetic-events.c          |   6 ++
 5 files changed, 173 insertions(+), 1 deletion(-)
 create mode 100644 tools/lib/perf/include/perf/schedstat-v16.h

(limited to 'tools')

diff --git a/tools/lib/perf/Makefile b/tools/lib/perf/Makefile
index 9fa28e512ca8..965e066fd780 100644
--- a/tools/lib/perf/Makefile
+++ b/tools/lib/perf/Makefile
@@ -179,7 +179,7 @@ install_lib: libs
 		cp -fpR $(LIBPERF_ALL) $(DESTDIR)$(libdir_SQ)
 
 HDRS := bpf_perf.h core.h cpumap.h threadmap.h evlist.h evsel.h event.h mmap.h
-HDRS += schedstat-v15.h
+HDRS += schedstat-v15.h schedstat-v16.h
 INTERNAL_HDRS := cpumap.h evlist.h evsel.h lib.h mmap.h rc_check.h threadmap.h xyarray.h
 
 INSTALL_HDRS_PFX := $(DESTDIR)$(prefix)/include/perf
diff --git a/tools/lib/perf/include/perf/event.h b/tools/lib/perf/include/perf/event.h
index ce04fed7cefc..bd4d507ea8ab 100644
--- a/tools/lib/perf/include/perf/event.h
+++ b/tools/lib/perf/include/perf/event.h
@@ -502,6 +502,12 @@ struct perf_record_schedstat_cpu_v15 {
 #undef CPU_FIELD
 };
 
+struct perf_record_schedstat_cpu_v16 {
+#define CPU_FIELD(_type, _name, _desc, _format, _is_pct, _pct_of, _ver)		_type _name
+#include "schedstat-v16.h"
+#undef CPU_FIELD
+};
+
 struct perf_record_schedstat_cpu {
 	struct perf_event_header header;
 	__u64			 timestamp;
@@ -511,6 +517,7 @@ struct perf_record_schedstat_cpu {
 	char			 __pad[2];
 	union {
 		struct perf_record_schedstat_cpu_v15 v15;
+		struct perf_record_schedstat_cpu_v16 v16;
 	};
 };
 
@@ -520,6 +527,12 @@ struct perf_record_schedstat_domain_v15 {
 #undef DOMAIN_FIELD
 };
 
+struct perf_record_schedstat_domain_v16 {
+#define DOMAIN_FIELD(_type, _name, _desc, _format, _is_jiffies, _ver)		_type _name
+#include "schedstat-v16.h"
+#undef DOMAIN_FIELD
+};
+
 #define DOMAIN_NAME_LEN		16
 
 struct perf_record_schedstat_domain {
@@ -530,6 +543,7 @@ struct perf_record_schedstat_domain {
 	__u16			 domain;
 	union {
 		struct perf_record_schedstat_domain_v15 v15;
+		struct perf_record_schedstat_domain_v16 v16;
 	};
 };
 
diff --git a/tools/lib/perf/include/perf/schedstat-v16.h b/tools/lib/perf/include/perf/schedstat-v16.h
new file mode 100644
index 000000000000..3462b79c29af
--- /dev/null
+++ b/tools/lib/perf/include/perf/schedstat-v16.h
@@ -0,0 +1,146 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifdef CPU_FIELD
+CPU_FIELD(__u32, yld_count, "sched_yield() count",
+	  "%11u", false, yld_count, v16);
+CPU_FIELD(__u32, array_exp, "Legacy counter can be ignored",
+	  "%11u", false, array_exp, v16);
+CPU_FIELD(__u32, sched_count, "schedule() called",
+	  "%11u", false, sched_count, v16);
+CPU_FIELD(__u32, sched_goidle, "schedule() left the processor idle",
+	  "%11u", true, sched_count, v16);
+CPU_FIELD(__u32, ttwu_count, "try_to_wake_up() was called",
+	  "%11u", false, ttwu_count, v16);
+CPU_FIELD(__u32, ttwu_local, "try_to_wake_up() was called to wake up the local cpu",
+	  "%11u", true, ttwu_count, v16);
+CPU_FIELD(__u64, rq_cpu_time, "total runtime by tasks on this processor (in jiffies)",
+	  "%11llu", false, rq_cpu_time, v16);
+CPU_FIELD(__u64, run_delay, "total waittime by tasks on this processor (in jiffies)",
+	  "%11llu", true, rq_cpu_time, v16);
+CPU_FIELD(__u64, pcount, "total timeslices run on this cpu",
+	  "%11llu", false, pcount, v16);
+#endif /* CPU_FIELD */
+
+#ifdef DOMAIN_FIELD
+#ifdef DOMAIN_CATEGORY
+DOMAIN_CATEGORY(" <Category busy> ");
+#endif
+DOMAIN_FIELD(__u32, busy_lb_count,
+	     "load_balance() count on cpu busy", "%11u", true, v16);
+DOMAIN_FIELD(__u32, busy_lb_balanced,
+	     "load_balance() found balanced on cpu busy", "%11u", true, v16);
+DOMAIN_FIELD(__u32, busy_lb_failed,
+	     "load_balance() move task failed on cpu busy", "%11u", true, v16);
+DOMAIN_FIELD(__u32, busy_lb_imbalance,
+	     "imbalance sum on cpu busy", "%11u", false, v16);
+DOMAIN_FIELD(__u32, busy_lb_gained,
+	     "pull_task() count on cpu busy", "%11u", false, v16);
+DOMAIN_FIELD(__u32, busy_lb_hot_gained,
+	     "pull_task() when target task was cache-hot on cpu busy", "%11u", false, v16);
+DOMAIN_FIELD(__u32, busy_lb_nobusyq,
+	     "load_balance() failed to find busier queue on cpu busy", "%11u", true, v16);
+DOMAIN_FIELD(__u32, busy_lb_nobusyg,
+	     "load_balance() failed to find busier group on cpu busy", "%11u", true, v16);
+#ifdef DERIVED_CNT_FIELD
+DERIVED_CNT_FIELD(busy_lb_success_count, "load_balance() success count on cpu busy", "%11u",
+		  busy_lb_count, busy_lb_balanced, busy_lb_failed, v16);
+#endif
+#ifdef DERIVED_AVG_FIELD
+DERIVED_AVG_FIELD(busy_lb_avg_pulled,
+		  "avg task pulled per successful lb attempt (cpu busy)", "%11.2Lf",
+		  busy_lb_count, busy_lb_balanced, busy_lb_failed, busy_lb_gained, v16);
+#endif
+#ifdef DOMAIN_CATEGORY
+DOMAIN_CATEGORY(" <Category idle> ");
+#endif
+DOMAIN_FIELD(__u32, idle_lb_count,
+	     "load_balance() count on cpu idle", "%11u", true, v16);
+DOMAIN_FIELD(__u32, idle_lb_balanced,
+	     "load_balance() found balanced on cpu idle", "%11u", true, v16);
+DOMAIN_FIELD(__u32, idle_lb_failed,
+	     "load_balance() move task failed on cpu idle", "%11u", true, v16);
+DOMAIN_FIELD(__u32, idle_lb_imbalance,
+	     "imbalance sum on cpu idle", "%11u", false, v16);
+DOMAIN_FIELD(__u32, idle_lb_gained,
+	     "pull_task() count on cpu idle", "%11u", false, v16);
+DOMAIN_FIELD(__u32, idle_lb_hot_gained,
+	     "pull_task() when target task was cache-hot on cpu idle", "%11u", false, v16);
+DOMAIN_FIELD(__u32, idle_lb_nobusyq,
+	     "load_balance() failed to find busier queue on cpu idle", "%11u", true, v16);
+DOMAIN_FIELD(__u32, idle_lb_nobusyg,
+	     "load_balance() failed to find busier group on cpu idle", "%11u", true, v16);
+#ifdef DERIVED_CNT_FIELD
+DERIVED_CNT_FIELD(idle_lb_success_count, "load_balance() success count on cpu idle", "%11u",
+		  idle_lb_count, idle_lb_balanced, idle_lb_failed, v16);
+#endif
+#ifdef DERIVED_AVG_FIELD
+DERIVED_AVG_FIELD(idle_lb_avg_pulled,
+		  "avg task pulled per successful lb attempt (cpu idle)", "%11.2Lf",
+		  idle_lb_count, idle_lb_balanced, idle_lb_failed, idle_lb_gained, v16);
+#endif
+#ifdef DOMAIN_CATEGORY
+DOMAIN_CATEGORY(" <Category newidle> ");
+#endif
+DOMAIN_FIELD(__u32, newidle_lb_count,
+	     "load_balance() count on cpu newly idle", "%11u", true, v16);
+DOMAIN_FIELD(__u32, newidle_lb_balanced,
+	     "load_balance() found balanced on cpu newly idle", "%11u", true, v16);
+DOMAIN_FIELD(__u32, newidle_lb_failed,
+	     "load_balance() move task failed on cpu newly idle", "%11u", true, v16);
+DOMAIN_FIELD(__u32, newidle_lb_imbalance,
+	     "imbalance sum on cpu newly idle", "%11u", false, v16);
+DOMAIN_FIELD(__u32, newidle_lb_gained,
+	     "pull_task() count on cpu newly idle", "%11u", false, v16);
+DOMAIN_FIELD(__u32, newidle_lb_hot_gained,
+	     "pull_task() when target task was cache-hot on cpu newly idle", "%11u", false, v16);
+DOMAIN_FIELD(__u32, newidle_lb_nobusyq,
+	     "load_balance() failed to find busier queue on cpu newly idle", "%11u", true, v16);
+DOMAIN_FIELD(__u32, newidle_lb_nobusyg,
+	     "load_balance() failed to find busier group on cpu newly idle", "%11u", true, v16);
+#ifdef DERIVED_CNT_FIELD
+DERIVED_CNT_FIELD(newidle_lb_success_count,
+		  "load_balance() success count on cpu newly idle", "%11u",
+		  newidle_lb_count, newidle_lb_balanced, newidle_lb_failed, v16);
+#endif
+#ifdef DERIVED_AVG_FIELD
+DERIVED_AVG_FIELD(newidle_lb_avg_count,
+		  "avg task pulled per successful lb attempt (cpu newly idle)", "%11.2Lf",
+		  newidle_lb_count, newidle_lb_balanced, newidle_lb_failed, newidle_lb_gained, v16);
+#endif
+#ifdef DOMAIN_CATEGORY
+DOMAIN_CATEGORY(" <Category active_load_balance()> ");
+#endif
+DOMAIN_FIELD(__u32, alb_count,
+	     "active_load_balance() count", "%11u", false, v16);
+DOMAIN_FIELD(__u32, alb_failed,
+	     "active_load_balance() move task failed", "%11u", false, v16);
+DOMAIN_FIELD(__u32, alb_pushed,
+	     "active_load_balance() successfully moved a task", "%11u", false, v16);
+#ifdef DOMAIN_CATEGORY
+DOMAIN_CATEGORY(" <Category sched_balance_exec()> ");
+#endif
+DOMAIN_FIELD(__u32, sbe_count,
+	     "sbe_count is not used", "%11u", false, v16);
+DOMAIN_FIELD(__u32, sbe_balanced,
+	     "sbe_balanced is not used", "%11u", false, v16);
+DOMAIN_FIELD(__u32, sbe_pushed,
+	     "sbe_pushed is not used", "%11u", false, v16);
+#ifdef DOMAIN_CATEGORY
+DOMAIN_CATEGORY(" <Category sched_balance_fork()> ");
+#endif
+DOMAIN_FIELD(__u32, sbf_count,
+	     "sbf_count is not used", "%11u", false, v16);
+DOMAIN_FIELD(__u32, sbf_balanced,
+	     "sbf_balanced is not used", "%11u", false, v16);
+DOMAIN_FIELD(__u32, sbf_pushed,
+	     "sbf_pushed is not used", "%11u", false, v16);
+#ifdef DOMAIN_CATEGORY
+DOMAIN_CATEGORY(" <Wakeup Info> ");
+#endif
+DOMAIN_FIELD(__u32, ttwu_wake_remote,
+	     "try_to_wake_up() awoke a task that last ran on a diff cpu", "%11u", false, v16);
+DOMAIN_FIELD(__u32, ttwu_move_affine,
+	     "try_to_wake_up() moved task because cache-cold on own cpu", "%11u", false, v16);
+DOMAIN_FIELD(__u32, ttwu_move_balance,
+	     "try_to_wake_up() started passive balancing", "%11u", false, v16);
+#endif /* DOMAIN_FIELD */
diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c
index 5a98c16e1092..730021cec161 100644
--- a/tools/perf/util/event.c
+++ b/tools/perf/util/event.c
@@ -585,6 +585,9 @@ size_t perf_event__fprintf_schedstat_cpu(union perf_event *event, FILE *fp)
 	if (version == 15) {
 #include <perf/schedstat-v15.h>
 		return size;
+	} else if (version == 16) {
+#include <perf/schedstat-v16.h>
+		return size;
 	}
 #undef CPU_FIELD
 
@@ -604,6 +607,9 @@ size_t perf_event__fprintf_schedstat_domain(union perf_event *event, FILE *fp)
 	if (version == 15) {
 #include <perf/schedstat-v15.h>
 		return size;
+	} else if (version == 16) {
+#include <perf/schedstat-v16.h>
+		return size;
 	}
 #undef DOMAIN_FIELD
 
diff --git a/tools/perf/util/synthetic-events.c b/tools/perf/util/synthetic-events.c
index 5366ea921e70..4ce37357db05 100644
--- a/tools/perf/util/synthetic-events.c
+++ b/tools/perf/util/synthetic-events.c
@@ -2567,6 +2567,8 @@ static union perf_event *__synthesize_schedstat_cpu(struct io *io, __u16 version
 
 	if (version == 15) {
 #include <perf/schedstat-v15.h>
+	} else if (version == 16) {
+#include <perf/schedstat-v16.h>
 	}
 #undef CPU_FIELD
 
@@ -2620,6 +2622,8 @@ static union perf_event *__synthesize_schedstat_domain(struct io *io, __u16 vers
 
 	if (version == 15) {
 #include <perf/schedstat-v15.h>
+	} else if (version == 16) {
+#include <perf/schedstat-v16.h>
 	}
 #undef DOMAIN_FIELD
 
@@ -2661,6 +2665,8 @@ int perf_event__synthesize_schedstat(const struct perf_tool *tool,
 
 	if (!strcmp(line, "version 15\n")) {
 		version = 15;
+	} else if (!strcmp(line, "version 16\n")) {
+		version = 16;
 	} else {
 		pr_err("Unsupported %s version: %s", path, line + 8);
 		goto out_free_line;
-- 
cgit v1.2.3


From 805da27252a290984782abfdb313a78e7c157369 Mon Sep 17 00:00:00 2001
From: Swapnil Sapkal <swapnil.sapkal@amd.com>
Date: Mon, 19 Jan 2026 17:58:27 +0000
Subject: perf sched stats: Add schedstat v17 support

The /proc/schedstat file output is standardized with version number.

Add support to record and raw dump v17 version layout.

Version 17 of schedstats removed 'lb_imbalance' field as it has no
significance anymore and instead added more relevant fields namely
'lb_imbalance_load', 'lb_imbalance_util', 'lb_imbalance_task' and
'lb_imbalance_misfit'.

The domain field prints the name of the corresponding sched domain from
this version onwards.

Co-developed-by: Ravi Bangoria <ravi.bangoria@amd.com>
Signed-off-by: Ravi Bangoria <ravi.bangoria@amd.com>
Signed-off-by: Swapnil Sapkal <swapnil.sapkal@amd.com>
Tested-by: Chen Yu <yu.c.chen@intel.com>
Acked-by: Ian Rogers <irogers@google.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Anubhav Shelat <ashelat@redhat.com>
Cc: Ben Gainey <ben.gainey@arm.com>
Cc: Blake Jones <blakejones@google.com>
Cc: Chun-Tse Shao <ctshao@google.com>
Cc: David Vernet <void@manifault.com>
Cc: Dmitriy Vyukov <dvyukov@google.com>
Cc: Dr. David Alan Gilbert <linux@treblig.org>
Cc: Gautham Shenoy <gautham.shenoy@amd.com>
Cc: Graham Woodward <graham.woodward@arm.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@arm.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: K Prateek Nayak <kprateek.nayak@amd.com>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Leo Yan <leo.yan@arm.com>
Cc: Madadi Vineeth Reddy <vineethr@linux.ibm.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Sandipan Das <sandipan.das@amd.com>
Cc: Santosh Shukla <santosh.shukla@amd.com>
Cc: Shrikanth Hegde <sshegde@linux.ibm.com>
Cc: Steven Rostedt (VMware) <rostedt@goodmis.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Thomas Falcon <thomas.falcon@intel.com>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Yang Jihong <yangjihong@bytedance.com>
Cc: Yujie Liu <yujie.liu@intel.com>
Cc: Zhongqiu Han <quic_zhonhan@quicinc.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/lib/perf/Makefile                     |   2 +-
 tools/lib/perf/include/perf/event.h         |  14 +++
 tools/lib/perf/include/perf/schedstat-v17.h | 164 ++++++++++++++++++++++++++++
 tools/perf/util/event.c                     |   6 +
 tools/perf/util/synthetic-events.c          |  11 ++
 5 files changed, 196 insertions(+), 1 deletion(-)
 create mode 100644 tools/lib/perf/include/perf/schedstat-v17.h

(limited to 'tools')

diff --git a/tools/lib/perf/Makefile b/tools/lib/perf/Makefile
index 965e066fd780..27e6490f64dc 100644
--- a/tools/lib/perf/Makefile
+++ b/tools/lib/perf/Makefile
@@ -179,7 +179,7 @@ install_lib: libs
 		cp -fpR $(LIBPERF_ALL) $(DESTDIR)$(libdir_SQ)
 
 HDRS := bpf_perf.h core.h cpumap.h threadmap.h evlist.h evsel.h event.h mmap.h
-HDRS += schedstat-v15.h schedstat-v16.h
+HDRS += schedstat-v15.h schedstat-v16.h schedstat-v17.h
 INTERNAL_HDRS := cpumap.h evlist.h evsel.h lib.h mmap.h rc_check.h threadmap.h xyarray.h
 
 INSTALL_HDRS_PFX := $(DESTDIR)$(prefix)/include/perf
diff --git a/tools/lib/perf/include/perf/event.h b/tools/lib/perf/include/perf/event.h
index bd4d507ea8ab..9043dc72b5d6 100644
--- a/tools/lib/perf/include/perf/event.h
+++ b/tools/lib/perf/include/perf/event.h
@@ -508,6 +508,12 @@ struct perf_record_schedstat_cpu_v16 {
 #undef CPU_FIELD
 };
 
+struct perf_record_schedstat_cpu_v17 {
+#define CPU_FIELD(_type, _name, _desc, _format, _is_pct, _pct_of, _ver)		_type _name
+#include "schedstat-v17.h"
+#undef CPU_FIELD
+};
+
 struct perf_record_schedstat_cpu {
 	struct perf_event_header header;
 	__u64			 timestamp;
@@ -518,6 +524,7 @@ struct perf_record_schedstat_cpu {
 	union {
 		struct perf_record_schedstat_cpu_v15 v15;
 		struct perf_record_schedstat_cpu_v16 v16;
+		struct perf_record_schedstat_cpu_v17 v17;
 	};
 };
 
@@ -533,6 +540,12 @@ struct perf_record_schedstat_domain_v16 {
 #undef DOMAIN_FIELD
 };
 
+struct perf_record_schedstat_domain_v17 {
+#define DOMAIN_FIELD(_type, _name, _desc, _format, _is_jiffies, _ver)		_type _name
+#include "schedstat-v17.h"
+#undef DOMAIN_FIELD
+};
+
 #define DOMAIN_NAME_LEN		16
 
 struct perf_record_schedstat_domain {
@@ -544,6 +557,7 @@ struct perf_record_schedstat_domain {
 	union {
 		struct perf_record_schedstat_domain_v15 v15;
 		struct perf_record_schedstat_domain_v16 v16;
+		struct perf_record_schedstat_domain_v17 v17;
 	};
 };
 
diff --git a/tools/lib/perf/include/perf/schedstat-v17.h b/tools/lib/perf/include/perf/schedstat-v17.h
new file mode 100644
index 000000000000..865dc7c1039c
--- /dev/null
+++ b/tools/lib/perf/include/perf/schedstat-v17.h
@@ -0,0 +1,164 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifdef CPU_FIELD
+CPU_FIELD(__u32, yld_count, "sched_yield() count",
+	  "%11u", false, yld_count, v17);
+CPU_FIELD(__u32, array_exp, "Legacy counter can be ignored",
+	  "%11u", false, array_exp, v17);
+CPU_FIELD(__u32, sched_count, "schedule() called",
+	  "%11u", false, sched_count, v17);
+CPU_FIELD(__u32, sched_goidle, "schedule() left the processor idle",
+	  "%11u", true, sched_count, v17);
+CPU_FIELD(__u32, ttwu_count, "try_to_wake_up() was called",
+	  "%11u", false, ttwu_count, v17);
+CPU_FIELD(__u32, ttwu_local, "try_to_wake_up() was called to wake up the local cpu",
+	  "%11u", true, ttwu_count, v17);
+CPU_FIELD(__u64, rq_cpu_time, "total runtime by tasks on this processor (in jiffies)",
+	  "%11llu", false, rq_cpu_time, v17);
+CPU_FIELD(__u64, run_delay, "total waittime by tasks on this processor (in jiffies)",
+	  "%11llu", true, rq_cpu_time, v17);
+CPU_FIELD(__u64, pcount, "total timeslices run on this cpu",
+	  "%11llu", false, pcount, v17);
+#endif /* CPU_FIELD */
+
+#ifdef DOMAIN_FIELD
+#ifdef DOMAIN_CATEGORY
+DOMAIN_CATEGORY(" <Category busy> ");
+#endif
+DOMAIN_FIELD(__u32, busy_lb_count,
+	     "load_balance() count on cpu busy", "%11u", true, v17);
+DOMAIN_FIELD(__u32, busy_lb_balanced,
+	     "load_balance() found balanced on cpu busy", "%11u", true, v17);
+DOMAIN_FIELD(__u32, busy_lb_failed,
+	     "load_balance() move task failed on cpu busy", "%11u", true, v17);
+DOMAIN_FIELD(__u32, busy_lb_imbalance_load,
+	     "imbalance in load on cpu busy", "%11u", false, v17);
+DOMAIN_FIELD(__u32, busy_lb_imbalance_util,
+	     "imbalance in utilization on cpu busy", "%11u", false, v17);
+DOMAIN_FIELD(__u32, busy_lb_imbalance_task,
+	     "imbalance in number of tasks on cpu busy", "%11u", false, v17);
+DOMAIN_FIELD(__u32, busy_lb_imbalance_misfit,
+	     "imbalance in misfit tasks on cpu busy", "%11u", false, v17);
+DOMAIN_FIELD(__u32, busy_lb_gained,
+	     "pull_task() count on cpu busy", "%11u", false, v17);
+DOMAIN_FIELD(__u32, busy_lb_hot_gained,
+	     "pull_task() when target task was cache-hot on cpu busy", "%11u", false, v17);
+DOMAIN_FIELD(__u32, busy_lb_nobusyq,
+	     "load_balance() failed to find busier queue on cpu busy", "%11u", true, v17);
+DOMAIN_FIELD(__u32, busy_lb_nobusyg,
+	     "load_balance() failed to find busier group on cpu busy", "%11u", true, v17);
+#ifdef DERIVED_CNT_FIELD
+DERIVED_CNT_FIELD(busy_lb_success_count, "load_balance() success count on cpu busy", "%11u",
+		  busy_lb_count, busy_lb_balanced, busy_lb_failed, v17);
+#endif
+#ifdef DERIVED_AVG_FIELD
+DERIVED_AVG_FIELD(busy_lb_avg_pulled,
+		  "avg task pulled per successful lb attempt (cpu busy)", "%11.2Lf",
+		  busy_lb_count, busy_lb_balanced, busy_lb_failed, busy_lb_gained, v17);
+#endif
+#ifdef DOMAIN_CATEGORY
+DOMAIN_CATEGORY(" <Category idle> ");
+#endif
+DOMAIN_FIELD(__u32, idle_lb_count,
+	     "load_balance() count on cpu idle", "%11u", true, v17);
+DOMAIN_FIELD(__u32, idle_lb_balanced,
+	     "load_balance() found balanced on cpu idle", "%11u", true, v17);
+DOMAIN_FIELD(__u32, idle_lb_failed,
+	     "load_balance() move task failed on cpu idle", "%11u", true, v17);
+DOMAIN_FIELD(__u32, idle_lb_imbalance_load,
+	     "imbalance in load on cpu idle", "%11u", false, v17);
+DOMAIN_FIELD(__u32, idle_lb_imbalance_util,
+	     "imbalance in utilization on cpu idle", "%11u", false, v17);
+DOMAIN_FIELD(__u32, idle_lb_imbalance_task,
+	     "imbalance in number of tasks on cpu idle", "%11u", false, v17);
+DOMAIN_FIELD(__u32, idle_lb_imbalance_misfit,
+	     "imbalance in misfit tasks on cpu idle", "%11u", false, v17);
+DOMAIN_FIELD(__u32, idle_lb_gained,
+	     "pull_task() count on cpu idle", "%11u", false, v17);
+DOMAIN_FIELD(__u32, idle_lb_hot_gained,
+	     "pull_task() when target task was cache-hot on cpu idle", "%11u", false, v17);
+DOMAIN_FIELD(__u32, idle_lb_nobusyq,
+	     "load_balance() failed to find busier queue on cpu idle", "%11u", true, v17);
+DOMAIN_FIELD(__u32, idle_lb_nobusyg,
+	     "load_balance() failed to find busier group on cpu idle", "%11u", true, v17);
+#ifdef DERIVED_CNT_FIELD
+DERIVED_CNT_FIELD(idle_lb_success_count, "load_balance() success count on cpu idle", "%11u",
+		  idle_lb_count, idle_lb_balanced, idle_lb_failed, v17);
+#endif
+#ifdef DERIVED_AVG_FIELD
+DERIVED_AVG_FIELD(idle_lb_avg_pulled,
+		  "avg task pulled per successful lb attempt (cpu idle)", "%11.2Lf",
+		  idle_lb_count, idle_lb_balanced, idle_lb_failed, idle_lb_gained, v17);
+#endif
+#ifdef DOMAIN_CATEGORY
+DOMAIN_CATEGORY(" <Category newidle> ");
+#endif
+DOMAIN_FIELD(__u32, newidle_lb_count,
+	     "load_balance() count on cpu newly idle", "%11u", true, v17);
+DOMAIN_FIELD(__u32, newidle_lb_balanced,
+	     "load_balance() found balanced on cpu newly idle", "%11u", true, v17);
+DOMAIN_FIELD(__u32, newidle_lb_failed,
+	     "load_balance() move task failed on cpu newly idle", "%11u", true, v17);
+DOMAIN_FIELD(__u32, newidle_lb_imbalance_load,
+	     "imbalance in load on cpu newly idle", "%11u", false, v17);
+DOMAIN_FIELD(__u32, newidle_lb_imbalance_util,
+	     "imbalance in utilization on cpu newly idle", "%11u", false, v17);
+DOMAIN_FIELD(__u32, newidle_lb_imbalance_task,
+	     "imbalance in number of tasks on cpu newly idle", "%11u", false, v17);
+DOMAIN_FIELD(__u32, newidle_lb_imbalance_misfit,
+	     "imbalance in misfit tasks on cpu newly idle", "%11u", false, v17);
+DOMAIN_FIELD(__u32, newidle_lb_gained,
+	     "pull_task() count on cpu newly idle", "%11u", false, v17);
+DOMAIN_FIELD(__u32, newidle_lb_hot_gained,
+	     "pull_task() when target task was cache-hot on cpu newly idle", "%11u", false, v17);
+DOMAIN_FIELD(__u32, newidle_lb_nobusyq,
+	     "load_balance() failed to find busier queue on cpu newly idle", "%11u", true, v17);
+DOMAIN_FIELD(__u32, newidle_lb_nobusyg,
+	     "load_balance() failed to find busier group on cpu newly idle", "%11u", true, v17);
+#ifdef DERIVED_CNT_FIELD
+DERIVED_CNT_FIELD(newidle_lb_success_count,
+		  "load_balance() success count on cpu newly idle", "%11u",
+		  newidle_lb_count, newidle_lb_balanced, newidle_lb_failed, v17);
+#endif
+#ifdef DERIVED_AVG_FIELD
+DERIVED_AVG_FIELD(newidle_lb_avg_pulled,
+		  "avg task pulled per successful lb attempt (cpu newly idle)", "%11.2Lf",
+		  newidle_lb_count, newidle_lb_balanced, newidle_lb_failed, newidle_lb_gained, v17);
+#endif
+#ifdef DOMAIN_CATEGORY
+DOMAIN_CATEGORY(" <Category active_load_balance()> ");
+#endif
+DOMAIN_FIELD(__u32, alb_count,
+	     "active_load_balance() count", "%11u", false, v17);
+DOMAIN_FIELD(__u32, alb_failed,
+	     "active_load_balance() move task failed", "%11u", false, v17);
+DOMAIN_FIELD(__u32, alb_pushed,
+	     "active_load_balance() successfully moved a task", "%11u", false, v17);
+#ifdef DOMAIN_CATEGORY
+DOMAIN_CATEGORY(" <Category sched_balance_exec()> ");
+#endif
+DOMAIN_FIELD(__u32, sbe_count,
+	     "sbe_count is not used", "%11u", false, v17);
+DOMAIN_FIELD(__u32, sbe_balanced,
+	     "sbe_balanced is not used", "%11u", false, v17);
+DOMAIN_FIELD(__u32, sbe_pushed,
+	     "sbe_pushed is not used", "%11u", false, v17);
+#ifdef DOMAIN_CATEGORY
+DOMAIN_CATEGORY(" <Category sched_balance_fork()> ");
+#endif
+DOMAIN_FIELD(__u32, sbf_count,
+	     "sbf_count is not used", "%11u", false, v17);
+DOMAIN_FIELD(__u32, sbf_balanced,
+	     "sbf_balanced is not used", "%11u", false, v17);
+DOMAIN_FIELD(__u32, sbf_pushed,
+	     "sbf_pushed is not used", "%11u", false, v17);
+#ifdef DOMAIN_CATEGORY
+DOMAIN_CATEGORY(" <Wakeup Info> ");
+#endif
+DOMAIN_FIELD(__u32, ttwu_wake_remote,
+	     "try_to_wake_up() awoke a task that last ran on a diff cpu", "%11u", false, v17);
+DOMAIN_FIELD(__u32, ttwu_move_affine,
+	     "try_to_wake_up() moved task because cache-cold on own cpu", "%11u", false, v17);
+DOMAIN_FIELD(__u32, ttwu_move_balance,
+	     "try_to_wake_up() started passive balancing", "%11u", false, v17);
+#endif /* DOMAIN_FIELD */
diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c
index 730021cec161..2dde1044b5a7 100644
--- a/tools/perf/util/event.c
+++ b/tools/perf/util/event.c
@@ -588,6 +588,9 @@ size_t perf_event__fprintf_schedstat_cpu(union perf_event *event, FILE *fp)
 	} else if (version == 16) {
 #include <perf/schedstat-v16.h>
 		return size;
+	} else if (version == 17) {
+#include <perf/schedstat-v17.h>
+		return size;
 	}
 #undef CPU_FIELD
 
@@ -610,6 +613,9 @@ size_t perf_event__fprintf_schedstat_domain(union perf_event *event, FILE *fp)
 	} else if (version == 16) {
 #include <perf/schedstat-v16.h>
 		return size;
+	} else if (version == 17) {
+#include <perf/schedstat-v17.h>
+		return size;
 	}
 #undef DOMAIN_FIELD
 
diff --git a/tools/perf/util/synthetic-events.c b/tools/perf/util/synthetic-events.c
index 4ce37357db05..ef79433ebc3a 100644
--- a/tools/perf/util/synthetic-events.c
+++ b/tools/perf/util/synthetic-events.c
@@ -2569,6 +2569,8 @@ static union perf_event *__synthesize_schedstat_cpu(struct io *io, __u16 version
 #include <perf/schedstat-v15.h>
 	} else if (version == 16) {
 #include <perf/schedstat-v16.h>
+	} else if (version == 17) {
+#include <perf/schedstat-v17.h>
 	}
 #undef CPU_FIELD
 
@@ -2595,6 +2597,11 @@ static union perf_event *__synthesize_schedstat_domain(struct io *io, __u16 vers
 		return NULL;
 
 	ch = io__get_dec(io, &d_num);
+	if (version >= 17) {
+		/* Skip domain name as it can be extracted from perf header */
+		while (io__get_char(io) != ' ')
+			continue;
+	}
 
 	/* Skip cpumask as it can be extracted from perf header */
 	while (io__get_char(io) != ' ')
@@ -2624,6 +2631,8 @@ static union perf_event *__synthesize_schedstat_domain(struct io *io, __u16 vers
 #include <perf/schedstat-v15.h>
 	} else if (version == 16) {
 #include <perf/schedstat-v16.h>
+	} else if (version == 17) {
+#include <perf/schedstat-v17.h>
 	}
 #undef DOMAIN_FIELD
 
@@ -2667,6 +2676,8 @@ int perf_event__synthesize_schedstat(const struct perf_tool *tool,
 		version = 15;
 	} else if (!strcmp(line, "version 16\n")) {
 		version = 16;
+	} else if (!strcmp(line, "version 17\n")) {
+		version = 17;
 	} else {
 		pr_err("Unsupported %s version: %s", path, line + 8);
 		goto out_free_line;
-- 
cgit v1.2.3


From 5a357ae6ad63fd101a4f20d081f8893b51cc0790 Mon Sep 17 00:00:00 2001
From: Swapnil Sapkal <swapnil.sapkal@amd.com>
Date: Mon, 19 Jan 2026 17:58:28 +0000
Subject: perf sched stats: Add support for report subcommand

`perf sched stats record` captures two sets of samples. For workload
profile, first set right before workload starts and second set after
workload finishes. For the systemwide profile, first set at the
beginning of profile and second set on receiving SIGINT signal.

Add `perf sched stats report` subcommand that will read both the set
of samples, get the diff and render a final report. Final report prints
scheduler stat at cpu granularity as well as sched domain granularity.

Example usage:

  # ./perf sched stats record -- true
  [ perf sched stats: Wrote samples to perf.data ]

  # perf sched stats report
  Description
  ----------------------------------------------------------------------------------------------------
  DESC                          -> Description of the field
  COUNT                         -> Value of the field
  PCT_CHANGE                    -> Percent change with corresponding base value
  AVG_JIFFIES                   -> Avg time in jiffies between two consecutive occurrence of event
  ----------------------------------------------------------------------------------------------------

  Time elapsed (in jiffies)                                        :           1
  ----------------------------------------------------------------------------------------------------
  CPU: <ALL CPUS SUMMARY>
  ----------------------------------------------------------------------------------------------------
  DESC                                                                     COUNT   PCT_CHANGE
  ----------------------------------------------------------------------------------------------------
  yld_count                                                        :           0
  array_exp                                                        :           0
  sched_count                                                      :           0
  sched_goidle                                                     :           0  (     0.00% )
  ttwu_count                                                       :           0
  ttwu_local                                                       :           0  (     0.00% )
  rq_cpu_time                                                      :       33525
  run_delay                                                        :         436  (     1.30% )
  pcount                                                           :           0
  ----------------------------------------------------------------------------------------------------
  CPU: <ALL CPUS SUMMARY> | DOMAIN: SMT
  ----------------------------------------------------------------------------------------------------
  DESC                                                                     COUNT    AVG_JIFFIES
  ----------------------------------------- <Category busy> ------------------------------------------
  busy_lb_count                                                    :           0  $        0.00 $
  busy_lb_balanced                                                 :           0  $        0.00 $
  busy_lb_failed                                                   :           0  $        0.00 $
  busy_lb_imbalance_load                                           :           0
  busy_lb_imbalance_util                                           :           0
  busy_lb_imbalance_task                                           :           0
  busy_lb_imbalance_misfit                                         :           0
  busy_lb_gained                                                   :           0
  busy_lb_hot_gained                                               :           0
  busy_lb_nobusyq                                                  :           0  $        0.00 $
  busy_lb_nobusyg                                                  :           0  $        0.00 $
  *busy_lb_success_count                                           :           0
  *busy_lb_avg_pulled                                              :        0.00

  ... and so on. Output shows similar data for all the cpus in the
system.

Co-developed-by: Ravi Bangoria <ravi.bangoria@amd.com>
Signed-off-by: Ravi Bangoria <ravi.bangoria@amd.com>
Signed-off-by: Swapnil Sapkal <swapnil.sapkal@amd.com>
Tested-by: Chen Yu <yu.c.chen@intel.com>
Tested-by: James Clark <james.clark@linaro.org>
Acked-by: Ian Rogers <irogers@google.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Anubhav Shelat <ashelat@redhat.com>
Cc: Ben Gainey <ben.gainey@arm.com>
Cc: Blake Jones <blakejones@google.com>
Cc: Chun-Tse Shao <ctshao@google.com>
Cc: David Vernet <void@manifault.com>
Cc: Dmitriy Vyukov <dvyukov@google.com>
Cc: Dr. David Alan Gilbert <linux@treblig.org>
Cc: Gautham Shenoy <gautham.shenoy@amd.com>
Cc: Graham Woodward <graham.woodward@arm.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@arm.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: K Prateek Nayak <kprateek.nayak@amd.com>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Leo Yan <leo.yan@arm.com>
Cc: Madadi Vineeth Reddy <vineethr@linux.ibm.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Sandipan Das <sandipan.das@amd.com>
Cc: Santosh Shukla <santosh.shukla@amd.com>
Cc: Shrikanth Hegde <sshegde@linux.ibm.com>
Cc: Steven Rostedt (VMware) <rostedt@goodmis.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Thomas Falcon <thomas.falcon@intel.com>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Yang Jihong <yangjihong@bytedance.com>
Cc: Yujie Liu <yujie.liu@intel.com>
Cc: Zhongqiu Han <quic_zhonhan@quicinc.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-sched.c | 509 ++++++++++++++++++++++++++++++++++++++++++++-
 tools/perf/util/util.c     |   6 +
 tools/perf/util/util.h     |   2 +
 3 files changed, 515 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index ee3b4e42156e..c6b054b9b12a 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -3929,6 +3929,503 @@ out:
 	return err;
 }
 
+struct schedstat_domain {
+	struct list_head domain_list;
+	struct perf_record_schedstat_domain *domain_data;
+};
+
+struct schedstat_cpu {
+	struct list_head cpu_list;
+	struct list_head domain_head;
+	struct perf_record_schedstat_cpu *cpu_data;
+};
+
+static struct list_head cpu_head = LIST_HEAD_INIT(cpu_head);
+static struct schedstat_cpu *cpu_second_pass;
+static struct schedstat_domain *domain_second_pass;
+static bool after_workload_flag;
+static bool verbose_field;
+
+static void store_schedtstat_cpu_diff(struct schedstat_cpu *after_workload)
+{
+	struct perf_record_schedstat_cpu *before = cpu_second_pass->cpu_data;
+	struct perf_record_schedstat_cpu *after = after_workload->cpu_data;
+	__u16 version = after_workload->cpu_data->version;
+
+#define CPU_FIELD(_type, _name, _desc, _format, _is_pct, _pct_of, _ver)	\
+	(before->_ver._name = after->_ver._name - before->_ver._name)
+
+	if (version == 15) {
+#include <perf/schedstat-v15.h>
+	} else if (version == 16) {
+#include <perf/schedstat-v16.h>
+	} else if (version == 17) {
+#include <perf/schedstat-v17.h>
+	}
+
+#undef CPU_FIELD
+}
+
+static void store_schedstat_domain_diff(struct schedstat_domain *after_workload)
+{
+	struct perf_record_schedstat_domain *before = domain_second_pass->domain_data;
+	struct perf_record_schedstat_domain *after = after_workload->domain_data;
+	__u16 version = after_workload->domain_data->version;
+
+#define DOMAIN_FIELD(_type, _name, _desc, _format, _is_jiffies, _ver)	\
+	(before->_ver._name = after->_ver._name - before->_ver._name)
+
+	if (version == 15) {
+#include <perf/schedstat-v15.h>
+	} else if (version == 16) {
+#include <perf/schedstat-v16.h>
+	} else if (version == 17) {
+#include <perf/schedstat-v17.h>
+	}
+#undef DOMAIN_FIELD
+}
+
+static inline void print_cpu_stats(struct perf_record_schedstat_cpu *cs)
+{
+	printf("%-65s %12s %12s\n", "DESC", "COUNT", "PCT_CHANGE");
+	printf("%.*s\n", 100, graph_dotted_line);
+
+#define CALC_PCT(_x, _y)	((_y) ? ((double)(_x) / (_y)) * 100 : 0.0)
+
+#define CPU_FIELD(_type, _name, _desc, _format, _is_pct, _pct_of, _ver)		\
+	do {									\
+		printf("%-65s: " _format, verbose_field ? _desc : #_name,	\
+		       cs->_ver._name);						\
+		if (_is_pct) {							\
+			printf("  ( %8.2lf%% )",				\
+			       CALC_PCT(cs->_ver._name, cs->_ver._pct_of));	\
+		}								\
+		printf("\n");							\
+	} while (0)
+
+	if (cs->version == 15) {
+#include <perf/schedstat-v15.h>
+	} else if (cs->version == 16) {
+#include <perf/schedstat-v16.h>
+	} else if (cs->version == 17) {
+#include <perf/schedstat-v17.h>
+	}
+
+#undef CPU_FIELD
+#undef CALC_PCT
+}
+
+static inline void print_domain_stats(struct perf_record_schedstat_domain *ds,
+				      __u64 jiffies)
+{
+	printf("%-65s %12s %14s\n", "DESC", "COUNT", "AVG_JIFFIES");
+
+#define DOMAIN_CATEGORY(_desc)							\
+	do {									\
+		size_t _len = strlen(_desc);					\
+		size_t _pre_dash_cnt = (100 - _len) / 2;			\
+		size_t _post_dash_cnt = 100 - _len - _pre_dash_cnt;		\
+		print_separator2((int)_pre_dash_cnt, _desc, (int)_post_dash_cnt);\
+	} while (0)
+
+#define CALC_AVG(_x, _y)	((_y) ? (long double)(_x) / (_y) : 0.0)
+
+#define DOMAIN_FIELD(_type, _name, _desc, _format, _is_jiffies, _ver)		\
+	do {									\
+		printf("%-65s: " _format, verbose_field ? _desc : #_name,	\
+		       ds->_ver._name);						\
+		if (_is_jiffies) {						\
+			printf("  $ %11.2Lf $",					\
+			       CALC_AVG(jiffies, ds->_ver._name));		\
+		}								\
+		printf("\n");							\
+	} while (0)
+
+#define DERIVED_CNT_FIELD(_name, _desc, _format, _x, _y, _z, _ver)		\
+	printf("*%-64s: " _format "\n", verbose_field ? _desc : #_name,		\
+	       (ds->_ver._x) - (ds->_ver._y) - (ds->_ver._z))
+
+#define DERIVED_AVG_FIELD(_name, _desc, _format, _x, _y, _z, _w, _ver)		\
+	printf("*%-64s: " _format "\n", verbose_field ? _desc : #_name,		\
+	       CALC_AVG(ds->_ver._w,						\
+			((ds->_ver._x) - (ds->_ver._y) - (ds->_ver._z))))
+
+	if (ds->version == 15) {
+#include <perf/schedstat-v15.h>
+	} else if (ds->version == 16) {
+#include <perf/schedstat-v16.h>
+	} else if (ds->version == 17) {
+#include <perf/schedstat-v17.h>
+	}
+
+#undef DERIVED_AVG_FIELD
+#undef DERIVED_CNT_FIELD
+#undef DOMAIN_FIELD
+#undef CALC_AVG
+#undef DOMAIN_CATEGORY
+}
+
+static void summarize_schedstat_cpu(struct schedstat_cpu *summary_cpu,
+				    struct schedstat_cpu *cptr,
+				    int cnt, bool is_last)
+{
+	struct perf_record_schedstat_cpu *summary_cs = summary_cpu->cpu_data,
+					 *temp_cs = cptr->cpu_data;
+
+#define CPU_FIELD(_type, _name, _desc, _format, _is_pct, _pct_of, _ver)		\
+	do {									\
+		summary_cs->_ver._name += temp_cs->_ver._name;			\
+		if (is_last)							\
+			summary_cs->_ver._name /= cnt;				\
+	} while (0)
+
+	if (cptr->cpu_data->version == 15) {
+#include <perf/schedstat-v15.h>
+	} else if (cptr->cpu_data->version == 16) {
+#include <perf/schedstat-v16.h>
+	} else if (cptr->cpu_data->version == 17) {
+#include <perf/schedstat-v17.h>
+	}
+#undef CPU_FIELD
+}
+
+static void summarize_schedstat_domain(struct schedstat_domain *summary_domain,
+				       struct schedstat_domain *dptr,
+				       int cnt, bool is_last)
+{
+	struct perf_record_schedstat_domain *summary_ds = summary_domain->domain_data,
+					    *temp_ds = dptr->domain_data;
+
+#define DOMAIN_FIELD(_type, _name, _desc, _format, _is_jiffies, _ver)		\
+	do {									\
+		summary_ds->_ver._name += temp_ds->_ver._name;			\
+		if (is_last)							\
+			summary_ds->_ver._name /= cnt;				\
+	} while (0)
+
+	if (dptr->domain_data->version == 15) {
+#include <perf/schedstat-v15.h>
+	} else if (dptr->domain_data->version == 16) {
+#include <perf/schedstat-v16.h>
+	} else if (dptr->domain_data->version == 17) {
+#include <perf/schedstat-v17.h>
+	}
+#undef DOMAIN_FIELD
+}
+
+/*
+ * get_all_cpu_stats() appends the summary to the head of the list.
+ */
+static int get_all_cpu_stats(struct list_head *head)
+{
+	struct schedstat_cpu *cptr = list_first_entry(head, struct schedstat_cpu, cpu_list);
+	struct schedstat_cpu *summary_head = NULL;
+	struct perf_record_schedstat_domain *ds;
+	struct perf_record_schedstat_cpu *cs;
+	struct schedstat_domain *dptr, *tdptr;
+	bool is_last = false;
+	int cnt = 1;
+	int ret = 0;
+
+	if (cptr) {
+		summary_head = zalloc(sizeof(*summary_head));
+		if (!summary_head)
+			return -ENOMEM;
+
+		summary_head->cpu_data = zalloc(sizeof(*cs));
+		memcpy(summary_head->cpu_data, cptr->cpu_data, sizeof(*cs));
+
+		INIT_LIST_HEAD(&summary_head->domain_head);
+
+		list_for_each_entry(dptr, &cptr->domain_head, domain_list) {
+			tdptr = zalloc(sizeof(*tdptr));
+			if (!tdptr)
+				return -ENOMEM;
+
+			tdptr->domain_data = zalloc(sizeof(*ds));
+			if (!tdptr->domain_data)
+				return -ENOMEM;
+
+			memcpy(tdptr->domain_data, dptr->domain_data, sizeof(*ds));
+			list_add_tail(&tdptr->domain_list, &summary_head->domain_head);
+		}
+	}
+
+	list_for_each_entry(cptr, head, cpu_list) {
+		if (list_is_first(&cptr->cpu_list, head))
+			continue;
+
+		if (list_is_last(&cptr->cpu_list, head))
+			is_last = true;
+
+		cnt++;
+		summarize_schedstat_cpu(summary_head, cptr, cnt, is_last);
+		tdptr = list_first_entry(&summary_head->domain_head, struct schedstat_domain,
+					 domain_list);
+
+		list_for_each_entry(dptr, &cptr->domain_head, domain_list) {
+			summarize_schedstat_domain(tdptr, dptr, cnt, is_last);
+			tdptr = list_next_entry(tdptr, domain_list);
+		}
+	}
+
+	list_add(&summary_head->cpu_list, head);
+	return ret;
+}
+
+static int show_schedstat_data(struct list_head *head, struct cpu_domain_map **cd_map)
+{
+	struct schedstat_cpu *cptr = list_first_entry(head, struct schedstat_cpu, cpu_list);
+	__u64 jiffies = cptr->cpu_data->timestamp;
+	struct perf_record_schedstat_domain *ds;
+	struct perf_record_schedstat_cpu *cs;
+	struct schedstat_domain *dptr;
+	bool is_summary = true;
+	int ret = 0;
+
+	printf("Description\n");
+	print_separator2(100, "", 0);
+	printf("%-30s-> %s\n", "DESC", "Description of the field");
+	printf("%-30s-> %s\n", "COUNT", "Value of the field");
+	printf("%-30s-> %s\n", "PCT_CHANGE", "Percent change with corresponding base value");
+	printf("%-30s-> %s\n", "AVG_JIFFIES",
+	       "Avg time in jiffies between two consecutive occurrence of event");
+
+	print_separator2(100, "", 0);
+	printf("\n");
+
+	printf("%-65s: %11llu\n", "Time elapsed (in jiffies)", jiffies);
+
+	ret = get_all_cpu_stats(head);
+
+	list_for_each_entry(cptr, head, cpu_list) {
+		cs = cptr->cpu_data;
+		print_separator2(100, "", 0);
+
+		if (is_summary)
+			printf("CPU: <ALL CPUS SUMMARY>\n");
+		else
+			printf("CPU: %d\n", cs->cpu);
+
+		print_separator2(100, "", 0);
+		print_cpu_stats(cs);
+		print_separator2(100, "", 0);
+
+		list_for_each_entry(dptr, &cptr->domain_head, domain_list) {
+			struct domain_info *dinfo;
+
+			ds = dptr->domain_data;
+			dinfo = cd_map[ds->cpu]->domains[ds->domain];
+			if (is_summary) {
+				if (dinfo->dname)
+					printf("CPU: <ALL CPUS SUMMARY> | DOMAIN: %s\n",
+					       dinfo->dname);
+				else
+					printf("CPU: <ALL CPUS SUMMARY> | DOMAIN: %d\n",
+					       dinfo->domain);
+			} else {
+				if (dinfo->dname)
+					printf("CPU: %d | DOMAIN: %s | DOMAIN_CPUS: ",
+					       cs->cpu, dinfo->dname);
+				else
+					printf("CPU: %d | DOMAIN: %d | DOMAIN_CPUS: ",
+					       cs->cpu, dinfo->domain);
+
+				printf("%s\n", dinfo->cpulist);
+			}
+			print_separator2(100, "", 0);
+			print_domain_stats(ds, jiffies);
+			print_separator2(100, "", 0);
+		}
+		is_summary = false;
+	}
+	return ret;
+}
+
+/*
+ * Creates a linked list of cpu_data and domain_data. Below represents the structure of the linked
+ * list where CPU0,CPU1,CPU2, ..., CPU(N-1) stores the cpu_data. Here N is the total number of cpus.
+ * Each of the CPU points to the list of domain_data. Here DOMAIN0, DOMAIN1, DOMAIN2, ... represents
+ * the domain_data. Here D0, D1, D2, ..., Dm are the number of domains in the respective cpus.
+ *
+ *	+----------+
+ *	| CPU_HEAD |
+ *	+----------+
+ *	      |
+ *	      v
+ *	+----------+    +---------+    +---------+    +---------+	    +--------------+
+ *	|   CPU0   | -> | DOMAIN0 | -> | DOMAIN1 | -> | DOMAIN2 | -> ... -> | DOMAIN(D0-1) |
+ *	+----------+    +---------+    +---------+    +---------+           +--------------+
+ *	      |
+ *	      v
+ *	+----------+    +---------+    +---------+    +---------+           +--------------+
+ *	|   CPU1   | -> | DOMAIN0 | -> | DOMAIN1 | -> | DOMAIN2 | -> ... -> | DOMAIN(D1-1) |
+ *	+----------+    +---------+    +---------+    +---------+           +--------------+
+ *	      |
+ *	      v
+ *	+----------+    +---------+    +---------+    +---------+           +--------------+
+ *	|   CPU2   | -> | DOMAIN0 | -> | DOMAIN1 | -> | DOMAIN2 | -> ... -> | DOMAIN(D2-1) |
+ *	+----------+    +---------+    +---------+    +---------+           +--------------+
+ *	      |
+ *	      v
+ *	     ...
+ *	      |
+ *	      v
+ *	+----------+    +---------+    +---------+    +---------+           +--------------+
+ *	| CPU(N-1) | -> | DOMAIN0 | -> | DOMAIN1 | -> | DOMAIN2 | -> ... -> | DOMAIN(Dm-1) |
+ *	+----------+    +---------+    +---------+    +---------+           +--------------+
+ *
+ * Each cpu as well as domain has 2 enties in the event list one before the workload starts and
+ * other after completion of the workload. The above linked list stores the diff of the cpu and
+ * domain statistics.
+ */
+static int perf_sched__process_schedstat(const struct perf_tool *tool __maybe_unused,
+					 struct perf_session *session __maybe_unused,
+					 union perf_event *event)
+{
+	struct perf_cpu this_cpu;
+	static __u32 initial_cpu;
+
+	switch (event->header.type) {
+	case PERF_RECORD_SCHEDSTAT_CPU:
+		this_cpu.cpu = event->schedstat_cpu.cpu;
+		break;
+	case PERF_RECORD_SCHEDSTAT_DOMAIN:
+		this_cpu.cpu = event->schedstat_domain.cpu;
+		break;
+	default:
+		return 0;
+	}
+
+	if (user_requested_cpus && !perf_cpu_map__has(user_requested_cpus, this_cpu))
+		return 0;
+
+	if (event->header.type == PERF_RECORD_SCHEDSTAT_CPU) {
+		struct schedstat_cpu *temp = zalloc(sizeof(*temp));
+
+		if (!temp)
+			return -ENOMEM;
+
+		temp->cpu_data = zalloc(sizeof(*temp->cpu_data));
+		if (!temp->cpu_data)
+			return -ENOMEM;
+
+		memcpy(temp->cpu_data, &event->schedstat_cpu, sizeof(*temp->cpu_data));
+
+		if (!list_empty(&cpu_head) && temp->cpu_data->cpu == initial_cpu)
+			after_workload_flag = true;
+
+		if (!after_workload_flag) {
+			if (list_empty(&cpu_head))
+				initial_cpu = temp->cpu_data->cpu;
+
+			list_add_tail(&temp->cpu_list, &cpu_head);
+			INIT_LIST_HEAD(&temp->domain_head);
+		} else {
+			if (temp->cpu_data->cpu == initial_cpu) {
+				cpu_second_pass = list_first_entry(&cpu_head, struct schedstat_cpu,
+								   cpu_list);
+				cpu_second_pass->cpu_data->timestamp =
+					temp->cpu_data->timestamp - cpu_second_pass->cpu_data->timestamp;
+			} else {
+				cpu_second_pass = list_next_entry(cpu_second_pass, cpu_list);
+			}
+			domain_second_pass = list_first_entry(&cpu_second_pass->domain_head,
+							      struct schedstat_domain, domain_list);
+			store_schedtstat_cpu_diff(temp);
+		}
+	} else if (event->header.type == PERF_RECORD_SCHEDSTAT_DOMAIN) {
+		struct schedstat_cpu *cpu_tail;
+		struct schedstat_domain *temp = zalloc(sizeof(*temp));
+
+		if (!temp)
+			return -ENOMEM;
+
+		temp->domain_data = zalloc(sizeof(*temp->domain_data));
+		if (!temp->domain_data)
+			return -ENOMEM;
+
+		memcpy(temp->domain_data, &event->schedstat_domain, sizeof(*temp->domain_data));
+
+		if (!after_workload_flag) {
+			cpu_tail = list_last_entry(&cpu_head, struct schedstat_cpu, cpu_list);
+			list_add_tail(&temp->domain_list, &cpu_tail->domain_head);
+		} else {
+			store_schedstat_domain_diff(temp);
+			domain_second_pass = list_next_entry(domain_second_pass, domain_list);
+		}
+	}
+
+	return 0;
+}
+
+static void free_schedstat(struct list_head *head)
+{
+	struct schedstat_domain *dptr, *n1;
+	struct schedstat_cpu *cptr, *n2;
+
+	list_for_each_entry_safe(cptr, n2, head, cpu_list) {
+		list_for_each_entry_safe(dptr, n1, &cptr->domain_head, domain_list) {
+			list_del_init(&dptr->domain_list);
+			free(dptr);
+		}
+		list_del_init(&cptr->cpu_list);
+		free(cptr);
+	}
+}
+
+static int perf_sched__schedstat_report(struct perf_sched *sched)
+{
+	struct cpu_domain_map **cd_map;
+	struct perf_session *session;
+	struct target target = {};
+	struct perf_data data = {
+		.path  = input_name,
+		.mode  = PERF_DATA_MODE_READ,
+	};
+	int err = 0;
+
+	sched->tool.schedstat_cpu = perf_sched__process_schedstat;
+	sched->tool.schedstat_domain = perf_sched__process_schedstat;
+
+	session = perf_session__new(&data, &sched->tool);
+	if (IS_ERR(session)) {
+		pr_err("Perf session creation failed.\n");
+		return PTR_ERR(session);
+	}
+
+	if (cpu_list)
+		target.cpu_list = cpu_list;
+	else
+		target.system_wide = true;
+
+	err = evlist__create_maps(session->evlist, &target);
+	if (err < 0)
+		goto out;
+
+	user_requested_cpus = session->evlist->core.user_requested_cpus;
+
+	err = perf_session__process_events(session);
+
+	if (!err) {
+		setup_pager();
+
+		if (list_empty(&cpu_head)) {
+			pr_err("Data is not available\n");
+			err = -1;
+			goto out;
+		}
+
+		cd_map = session->header.env.cpu_domain;
+		err = show_schedstat_data(&cpu_head, cd_map);
+	}
+
+out:
+	free_schedstat(&cpu_head);
+	perf_session__delete(session);
+	return err;
+}
+
 static bool schedstat_events_exposed(void)
 {
 	/*
@@ -4106,9 +4603,12 @@ int cmd_sched(int argc, const char **argv)
 	OPT_PARENT(sched_options)
 	};
 	const struct option stats_options[] = {
+	OPT_STRING('i', "input", &input_name, "file",
+		   "`stats report` with input filename"),
 	OPT_STRING('o', "output", &output_name, "file",
 		   "`stats record` with output filename"),
 	OPT_STRING('C', "cpu", &cpu_list, "cpu", "list of cpus to profile"),
+	OPT_BOOLEAN('v', "verbose", &verbose_field, "Show explanation for fields in the report"),
 	OPT_END()
 	};
 
@@ -4129,7 +4629,7 @@ int cmd_sched(int argc, const char **argv)
 		NULL
 	};
 	const char *stats_usage[] = {
-		"perf sched stats {record} [<options>]",
+		"perf sched stats {record|report} [<options>]",
 		NULL
 	};
 	const char *const sched_subcommands[] = { "record", "latency", "map",
@@ -4233,7 +4733,7 @@ int cmd_sched(int argc, const char **argv)
 		if (!ret)
 			ret = perf_sched__timehist(&sched);
 	} else if (!strcmp(argv[0], "stats")) {
-		const char *const stats_subcommands[] = {"record", NULL};
+		const char *const stats_subcommands[] = {"record", "report", NULL};
 
 		argc = parse_options_subcommand(argc, argv, stats_options,
 						stats_subcommands,
@@ -4245,6 +4745,11 @@ int cmd_sched(int argc, const char **argv)
 				argc = parse_options(argc, argv, stats_options,
 						     stats_usage, 0);
 			return perf_sched__schedstat_record(&sched, argc, argv);
+		} else if (argv[0] && !strcmp(argv[0], "report")) {
+			if (argc)
+				argc = parse_options(argc, argv, stats_options,
+						     stats_usage, 0);
+			return perf_sched__schedstat_report(&sched);
 		}
 		usage_with_options(stats_usage, stats_options);
 	} else {
diff --git a/tools/perf/util/util.c b/tools/perf/util/util.c
index b87ff96a9f45..03a603fbcd7d 100644
--- a/tools/perf/util/util.c
+++ b/tools/perf/util/util.c
@@ -299,6 +299,12 @@ free_bm:
 	free(bm);
 }
 
+void print_separator2(int pre_dash_cnt, const char *s, int post_dash_cnt)
+{
+	printf("%.*s%s%.*s\n", pre_dash_cnt, graph_dotted_line, s, post_dash_cnt,
+	       graph_dotted_line);
+}
+
 int rm_rf_perf_data(const char *path)
 {
 	const char *pat[] = {
diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h
index 1572c8cf04e5..394dbfa944ac 100644
--- a/tools/perf/util/util.h
+++ b/tools/perf/util/util.h
@@ -51,6 +51,8 @@ int perf_tip(char **strp, const char *dirpath);
 
 void cpumask_to_cpulist(char *cpumask, char *cpulist);
 
+void print_separator2(int pre_dash_cnt, const char *s, int post_dash_cnt);
+
 #ifndef HAVE_SCHED_GETCPU_SUPPORT
 int sched_getcpu(void);
 #endif
-- 
cgit v1.2.3


From 00093b3133984ffe80697b5d2e7f204983660dd9 Mon Sep 17 00:00:00 2001
From: Swapnil Sapkal <swapnil.sapkal@amd.com>
Date: Mon, 19 Jan 2026 17:58:29 +0000
Subject: perf sched stats: Add support for live mode

The live mode works similar to simple `perf stat` command, by profiling
the target and printing results on the terminal as soon as the target
finishes.

Example usage:

  # perf sched stats -- true
  Description
  ----------------------------------------------------------------------------------------------------
  DESC                          -> Description of the field
  COUNT                         -> Value of the field
  PCT_CHANGE                    -> Percent change with corresponding base value
  AVG_JIFFIES                   -> Avg time in jiffies between two consecutive occurrence of event
  ----------------------------------------------------------------------------------------------------

  Time elapsed (in jiffies)                                        :           1
  ----------------------------------------------------------------------------------------------------
  CPU: <ALL CPUS SUMMARY>
  ----------------------------------------------------------------------------------------------------
  DESC                                                                     COUNT   PCT_CHANGE
  ----------------------------------------------------------------------------------------------------
  yld_count                                                        :           0
  array_exp                                                        :           0
  sched_count                                                      :           0
  sched_goidle                                                     :           0  (     0.00% )
  ttwu_count                                                       :           0
  ttwu_local                                                       :           0  (     0.00% )
  rq_cpu_time                                                      :       27875
  run_delay                                                        :           0  (     0.00% )
  pcount                                                           :           0
  ----------------------------------------------------------------------------------------------------
  CPU: <ALL CPUS SUMMARY> | DOMAIN: SMT
  ----------------------------------------------------------------------------------------------------
  DESC                                                                     COUNT    AVG_JIFFIES
  ----------------------------------------- <Category busy> ------------------------------------------
  busy_lb_count                                                    :           0  $        0.00 $
  busy_lb_balanced                                                 :           0  $        0.00 $
  busy_lb_failed                                                   :           0  $        0.00 $
  busy_lb_imbalance_load                                           :           0
  busy_lb_imbalance_util                                           :           0
  busy_lb_imbalance_task                                           :           0
  busy_lb_imbalance_misfit                                         :           0
  busy_lb_gained                                                   :           0
  busy_lb_hot_gained                                               :           0
  busy_lb_nobusyq                                                  :           0  $        0.00 $
  busy_lb_nobusyg                                                  :           0  $        0.00 $
  *busy_lb_success_count                                           :           0
  *busy_lb_avg_pulled                                              :        0.00

  ... and so on. Output will show similar data for all the cpus in the
system.

Co-developed-by: Ravi Bangoria <ravi.bangoria@amd.com>
Signed-off-by: Ravi Bangoria <ravi.bangoria@amd.com>
Signed-off-by: Swapnil Sapkal <swapnil.sapkal@amd.com>
Tested-by: Chen Yu <yu.c.chen@intel.com>
Tested-by: James Clark <james.clark@linaro.org>
Acked-by: Ian Rogers <irogers@google.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Anubhav Shelat <ashelat@redhat.com>
Cc: Ben Gainey <ben.gainey@arm.com>
Cc: Blake Jones <blakejones@google.com>
Cc: Chun-Tse Shao <ctshao@google.com>
Cc: David Vernet <void@manifault.com>
Cc: Dmitriy Vyukov <dvyukov@google.com>
Cc: Dr. David Alan Gilbert <linux@treblig.org>
Cc: Gautham Shenoy <gautham.shenoy@amd.com>
Cc: Graham Woodward <graham.woodward@arm.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@arm.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: K Prateek Nayak <kprateek.nayak@amd.com>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Leo Yan <leo.yan@arm.com>
Cc: Madadi Vineeth Reddy <vineethr@linux.ibm.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Sandipan Das <sandipan.das@amd.com>
Cc: Santosh Shukla <santosh.shukla@amd.com>
Cc: Shrikanth Hegde <sshegde@linux.ibm.com>
Cc: Steven Rostedt (VMware) <rostedt@goodmis.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Thomas Falcon <thomas.falcon@intel.com>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Yang Jihong <yangjihong@bytedance.com>
Cc: Yujie Liu <yujie.liu@intel.com>
Cc: Zhongqiu Han <quic_zhonhan@quicinc.com>
[ Avoid potentially using 'sv' uninitialized by calling free_cpu_domain_info() only when build_cpu_domain_map() is called ]
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-sched.c | 99 +++++++++++++++++++++++++++++++++++++++++++++-
 tools/perf/util/header.c   |  3 +-
 tools/perf/util/header.h   |  3 ++
 3 files changed, 102 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index c6b054b9b12a..ec9fa29196b2 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -4426,6 +4426,103 @@ out:
 	return err;
 }
 
+static int process_synthesized_event_live(const struct perf_tool *tool __maybe_unused,
+					  union perf_event *event,
+					  struct perf_sample *sample __maybe_unused,
+					  struct machine *machine __maybe_unused)
+{
+	return perf_sched__process_schedstat(tool, NULL, event);
+}
+
+static int perf_sched__schedstat_live(struct perf_sched *sched,
+				      int argc, const char **argv)
+{
+	struct cpu_domain_map **cd_map = NULL;
+	struct target target = {};
+	u32 __maybe_unused md;
+	struct evlist *evlist;
+	u32 nr = 0, sv;
+	int reset = 0;
+	int err = 0;
+
+	signal(SIGINT, sighandler);
+	signal(SIGCHLD, sighandler);
+	signal(SIGTERM, sighandler);
+
+	evlist = evlist__new();
+	if (!evlist)
+		return -ENOMEM;
+
+	/*
+	 * `perf sched schedstat` does not support workload profiling (-p pid)
+	 * since /proc/schedstat file contains cpu specific data only. Hence, a
+	 * profile target is either set of cpus or systemwide, never a process.
+	 * Note that, although `-- <workload>` is supported, profile data are
+	 * still cpu/systemwide.
+	 */
+	if (cpu_list)
+		target.cpu_list = cpu_list;
+	else
+		target.system_wide = true;
+
+	if (argc) {
+		err = evlist__prepare_workload(evlist, &target, argv, false, NULL);
+		if (err)
+			goto out;
+	}
+
+	err = evlist__create_maps(evlist, &target);
+	if (err < 0)
+		goto out;
+
+	user_requested_cpus = evlist->core.user_requested_cpus;
+
+	err = perf_event__synthesize_schedstat(&(sched->tool),
+					       process_synthesized_event_live,
+					       user_requested_cpus);
+	if (err < 0)
+		goto out;
+
+	err = enable_sched_schedstats(&reset);
+	if (err < 0)
+		goto out;
+
+	if (argc)
+		evlist__start_workload(evlist);
+
+	/* wait for signal */
+	pause();
+
+	if (reset) {
+		err = disable_sched_schedstat();
+		if (err < 0)
+			goto out;
+	}
+
+	err = perf_event__synthesize_schedstat(&(sched->tool),
+					       process_synthesized_event_live,
+					       user_requested_cpus);
+	if (err)
+		goto out;
+
+	setup_pager();
+
+	if (list_empty(&cpu_head)) {
+		pr_err("Data is not available\n");
+		err = -1;
+		goto out;
+	}
+
+	nr = cpu__max_present_cpu().cpu;
+	cd_map = build_cpu_domain_map(&sv, &md, nr);
+	show_schedstat_data(&cpu_head, cd_map);
+	free_cpu_domain_info(cd_map, sv, nr);
+out:
+	free_schedstat(&cpu_head);
+	evlist__delete(evlist);
+	return err;
+}
+
 static bool schedstat_events_exposed(void)
 {
 	/*
@@ -4751,7 +4848,7 @@ int cmd_sched(int argc, const char **argv)
 						     stats_usage, 0);
 			return perf_sched__schedstat_report(&sched);
 		}
-		usage_with_options(stats_usage, stats_options);
+		return perf_sched__schedstat_live(&sched, argc, argv);
 	} else {
 		usage_with_options(sched_usage, sched_options);
 	}
diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index 673d53bb2a2c..9a15dd4b7640 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -1614,8 +1614,7 @@ static int write_pmu_caps(struct feat_fd *ff,
 	return 0;
 }
 
-static struct cpu_domain_map **build_cpu_domain_map(u32 *schedstat_version, u32 *max_sched_domains,
-						    u32 nr)
+struct cpu_domain_map **build_cpu_domain_map(u32 *schedstat_version, u32 *max_sched_domains, u32 nr)
 {
 	struct domain_info *domain_info;
 	struct cpu_domain_map **cd_map;
diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h
index c62f3275a80f..36cc74e2d14d 100644
--- a/tools/perf/util/header.h
+++ b/tools/perf/util/header.h
@@ -211,4 +211,7 @@ char *get_cpuid_str(struct perf_cpu cpu);
 char *get_cpuid_allow_env_override(struct perf_cpu cpu);
 
 int strcmp_cpuid_str(const char *s1, const char *s2);
+
+struct cpu_domain_map **build_cpu_domain_map(u32 *schedstat_version, u32 *max_sched_domains,
+					     u32 nr);
 #endif /* __PERF_HEADER_H */
-- 
cgit v1.2.3


From 064790a3d4a89536d00a61d7a02de67ad319bdc5 Mon Sep 17 00:00:00 2001
From: Swapnil Sapkal <swapnil.sapkal@amd.com>
Date: Mon, 19 Jan 2026 17:58:30 +0000
Subject: perf sched stats: Add support for diff subcommand

`perf sched stats diff` subcommand will take two perf.data files as an
input and it will print the diff between the two perf.data files. The
default input to this subcommnd is perf.data.old and perf.data.

Example usage:

 # perf sched stats diff sample1.data sample2.data
 Description
 ----------------------------------------------------------------------------------------------------
 DESC                          -> Description of the field
 COUNT                         -> Value of the field
 PCT_CHANGE                    -> Percent change with corresponding base value
 AVG_JIFFIES                   -> Avg time in jiffies between two consecutive occurrence of event
 ----------------------------------------------------------------------------------------------------

 Time elapsed (in jiffies)                                        :           1,          1
 ----------------------------------------------------------------------------------------------------
 CPU: <ALL CPUS SUMMARY>
 ----------------------------------------------------------------------------------------------------
 DESC                                                                    COUNT1      COUNT2   PCT_CHANGE    PCT_CHANGE1 PCT_CHANGE2
 ----------------------------------------------------------------------------------------------------
 yld_count                                                        :           0,          0  |     0.00% |
 array_exp                                                        :           0,          0  |     0.00% |
 sched_count                                                      :           0,          0  |     0.00% |
 sched_goidle                                                     :           0,          0  |     0.00% |  (     0.00%,      0.00% )
 ttwu_count                                                       :           0,          0  |     0.00% |
 ttwu_local                                                       :           0,          0  |     0.00% |  (     0.00%,      0.00% )
 rq_cpu_time                                                      :       32565,      33525  |     2.95% |
 run_delay                                                        :           0,        436  |     0.00% |  (     0.00%,      1.30% )
 pcount                                                           :           0,          0  |     0.00% |
 ----------------------------------------------------------------------------------------------------
 CPU: <ALL CPUS SUMMARY> | DOMAIN: SMT
 ----------------------------------------------------------------------------------------------------
 DESC                                                                    COUNT1      COUNT2   PCT_CHANGE     AVG_JIFFIES1 AVG_JIFFIES2
 ----------------------------------------- <Category busy> ------------------------------------------
 busy_lb_count                                                    :           0,          0  |     0.00% |  $        0.00,        0.00 $
 busy_lb_balanced                                                 :           0,          0  |     0.00% |  $        0.00,        0.00 $
 busy_lb_failed                                                   :           0,          0  |     0.00% |  $        0.00,        0.00 $
 busy_lb_imbalance_load                                           :           0,          0  |     0.00% |
 busy_lb_imbalance_util                                           :           0,          0  |     0.00% |
 busy_lb_imbalance_task                                           :           0,          0  |     0.00% |
 busy_lb_imbalance_misfit                                         :           0,          0  |     0.00% |
 busy_lb_gained                                                   :           0,          0  |     0.00% |
 busy_lb_hot_gained                                               :           0,          0  |     0.00% |
 busy_lb_nobusyq                                                  :           0,          0  |     0.00% |  $        0.00,        0.00 $
 busy_lb_nobusyg                                                  :           0,          0  |     0.00% |  $        0.00,        0.00 $
 *busy_lb_success_count                                           :           0,          0  |     0.00% |
 *busy_lb_avg_pulled                                              :        0.00,       0.00  |     0.00% |

 ... and so on. Output contains the diff of aggregated data of all the
busy, idle and newidle categories for all the sched domains in the system.

Signed-off-by: Ravi Bangoria <ravi.bangoria@amd.com>
Signed-off-by: Swapnil Sapkal <swapnil.sapkal@amd.com>
Tested-by: Chen Yu <yu.c.chen@intel.com>
Acked-by: Ian Rogers <irogers@google.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Anubhav Shelat <ashelat@redhat.com>
Cc: Ben Gainey <ben.gainey@arm.com>
Cc: Blake Jones <blakejones@google.com>
Cc: Chun-Tse Shao <ctshao@google.com>
Cc: David Vernet <void@manifault.com>
Cc: Dmitriy Vyukov <dvyukov@google.com>
Cc: Dr. David Alan Gilbert <linux@treblig.org>
Cc: Gautham Shenoy <gautham.shenoy@amd.com>
Cc: Graham Woodward <graham.woodward@arm.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@arm.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: K Prateek Nayak <kprateek.nayak@amd.com>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Leo Yan <leo.yan@arm.com>
Cc: Madadi Vineeth Reddy <vineethr@linux.ibm.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Sandipan Das <sandipan.das@amd.com>
Cc: Santosh Shukla <santosh.shukla@amd.com>
Cc: Shrikanth Hegde <sshegde@linux.ibm.com>
Cc: Steven Rostedt (VMware) <rostedt@goodmis.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Thomas Falcon <thomas.falcon@intel.com>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Yang Jihong <yangjihong@bytedance.com>
Cc: Yujie Liu <yujie.liu@intel.com>
Cc: Zhongqiu Han <quic_zhonhan@quicinc.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-sched.c | 316 +++++++++++++++++++++++++++++++++++++--------
 1 file changed, 260 insertions(+), 56 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index ec9fa29196b2..b190e928117c 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -3985,29 +3985,46 @@ static void store_schedstat_domain_diff(struct schedstat_domain *after_workload)
 #undef DOMAIN_FIELD
 }
 
-static inline void print_cpu_stats(struct perf_record_schedstat_cpu *cs)
+#define PCT_CHNG(_x, _y)        ((_x) ? ((double)((double)(_y) - (_x)) / (_x)) * 100 : 0.0)
+static inline void print_cpu_stats(struct perf_record_schedstat_cpu *cs1,
+				   struct perf_record_schedstat_cpu *cs2)
 {
-	printf("%-65s %12s %12s\n", "DESC", "COUNT", "PCT_CHANGE");
-	printf("%.*s\n", 100, graph_dotted_line);
+	printf("%-65s ", "DESC");
+	if (!cs2)
+		printf("%12s %12s", "COUNT", "PCT_CHANGE");
+	else
+		printf("%12s %11s %12s %14s %10s", "COUNT1", "COUNT2", "PCT_CHANGE",
+		       "PCT_CHANGE1", "PCT_CHANGE2");
+
+	printf("\n");
+	print_separator2(100, "", 0);
 
 #define CALC_PCT(_x, _y)	((_y) ? ((double)(_x) / (_y)) * 100 : 0.0)
 
-#define CPU_FIELD(_type, _name, _desc, _format, _is_pct, _pct_of, _ver)		\
-	do {									\
-		printf("%-65s: " _format, verbose_field ? _desc : #_name,	\
-		       cs->_ver._name);						\
-		if (_is_pct) {							\
-			printf("  ( %8.2lf%% )",				\
-			       CALC_PCT(cs->_ver._name, cs->_ver._pct_of));	\
-		}								\
-		printf("\n");							\
+#define CPU_FIELD(_type, _name, _desc, _format, _is_pct, _pct_of, _ver)			\
+	do {										\
+		printf("%-65s: " _format, verbose_field ? _desc : #_name,		\
+		       cs1->_ver._name);						\
+		if (!cs2) {								\
+			if (_is_pct)							\
+				printf("  ( %8.2lf%% )",				\
+				       CALC_PCT(cs1->_ver._name, cs1->_ver._pct_of));	\
+		} else {								\
+			printf("," _format "  | %8.2lf%% |", cs2->_ver._name,		\
+			       PCT_CHNG(cs1->_ver._name, cs2->_ver._name));		\
+			if (_is_pct)							\
+				printf("  ( %8.2lf%%,  %8.2lf%% )",			\
+				       CALC_PCT(cs1->_ver._name, cs1->_ver._pct_of),	\
+				       CALC_PCT(cs2->_ver._name, cs2->_ver._pct_of));	\
+		}									\
+		printf("\n");								\
 	} while (0)
 
-	if (cs->version == 15) {
+	if (cs1->version == 15) {
 #include <perf/schedstat-v15.h>
-	} else if (cs->version == 16) {
+	} else if (cs1->version == 16) {
 #include <perf/schedstat-v16.h>
-	} else if (cs->version == 17) {
+	} else if (cs1->version == 17) {
 #include <perf/schedstat-v17.h>
 	}
 
@@ -4015,10 +4032,17 @@ static inline void print_cpu_stats(struct perf_record_schedstat_cpu *cs)
 #undef CALC_PCT
 }
 
-static inline void print_domain_stats(struct perf_record_schedstat_domain *ds,
-				      __u64 jiffies)
+static inline void print_domain_stats(struct perf_record_schedstat_domain *ds1,
+				      struct perf_record_schedstat_domain *ds2,
+				      __u64 jiffies1, __u64 jiffies2)
 {
-	printf("%-65s %12s %14s\n", "DESC", "COUNT", "AVG_JIFFIES");
+	printf("%-65s ", "DESC");
+	if (!ds2)
+		printf("%12s %14s", "COUNT", "AVG_JIFFIES");
+	else
+		printf("%12s %11s %12s %16s %12s", "COUNT1", "COUNT2", "PCT_CHANGE",
+		       "AVG_JIFFIES1", "AVG_JIFFIES2");
+	printf("\n");
 
 #define DOMAIN_CATEGORY(_desc)							\
 	do {									\
@@ -4033,28 +4057,54 @@ static inline void print_domain_stats(struct perf_record_schedstat_domain *ds,
 #define DOMAIN_FIELD(_type, _name, _desc, _format, _is_jiffies, _ver)		\
 	do {									\
 		printf("%-65s: " _format, verbose_field ? _desc : #_name,	\
-		       ds->_ver._name);						\
-		if (_is_jiffies) {						\
-			printf("  $ %11.2Lf $",					\
-			       CALC_AVG(jiffies, ds->_ver._name));		\
+		       ds1->_ver._name);					\
+		if (!ds2) {							\
+			if (_is_jiffies)					\
+				printf("  $ %11.2Lf $",				\
+				       CALC_AVG(jiffies1, ds1->_ver._name));	\
+		} else {							\
+			printf("," _format "  | %8.2lf%% |", ds2->_ver._name,	\
+			       PCT_CHNG(ds1->_ver._name, ds2->_ver._name));	\
+			if (_is_jiffies)					\
+				printf("  $ %11.2Lf, %11.2Lf $",		\
+				       CALC_AVG(jiffies1, ds1->_ver._name),	\
+				       CALC_AVG(jiffies2, ds2->_ver._name));	\
 		}								\
 		printf("\n");							\
 	} while (0)
 
 #define DERIVED_CNT_FIELD(_name, _desc, _format, _x, _y, _z, _ver)		\
-	printf("*%-64s: " _format "\n", verbose_field ? _desc : #_name,		\
-	       (ds->_ver._x) - (ds->_ver._y) - (ds->_ver._z))
+	do {									\
+		__u32 t1 = ds1->_ver._x - ds1->_ver._y - ds1->_ver._z;		\
+		printf("*%-64s: " _format, verbose_field ? _desc : #_name, t1);	\
+		if (ds2) {							\
+			__u32 t2 = ds2->_ver._x - ds2->_ver._y - ds2->_ver._z;	\
+			printf("," _format "  | %8.2lf%% |", t2,		\
+			       PCT_CHNG(t1, t2));				\
+		}								\
+		printf("\n");							\
+	} while (0)
 
 #define DERIVED_AVG_FIELD(_name, _desc, _format, _x, _y, _z, _w, _ver)		\
-	printf("*%-64s: " _format "\n", verbose_field ? _desc : #_name,		\
-	       CALC_AVG(ds->_ver._w,						\
-			((ds->_ver._x) - (ds->_ver._y) - (ds->_ver._z))))
+	do {									\
+		__u32 t1 = ds1->_ver._x - ds1->_ver._y - ds1->_ver._z;		\
+		printf("*%-64s: " _format, verbose_field ? _desc : #_name,	\
+		       CALC_AVG(ds1->_ver._w, t1));				\
+		if (ds2) {							\
+			__u32 t2 = ds2->_ver._x - ds2->_ver._y - ds2->_ver._z;	\
+			printf("," _format "  | %8.2Lf%% |",			\
+			       CALC_AVG(ds2->_ver._w, t2),			\
+			       PCT_CHNG(CALC_AVG(ds1->_ver._w, t1),		\
+					CALC_AVG(ds2->_ver._w, t2)));		\
+		}								\
+		printf("\n");							\
+	} while (0)
 
-	if (ds->version == 15) {
+	if (ds1->version == 15) {
 #include <perf/schedstat-v15.h>
-	} else if (ds->version == 16) {
+	} else if (ds1->version == 16) {
 #include <perf/schedstat-v16.h>
-	} else if (ds->version == 17) {
+	} else if (ds1->version == 17) {
 #include <perf/schedstat-v17.h>
 	}
 
@@ -4064,6 +4114,7 @@ static inline void print_domain_stats(struct perf_record_schedstat_domain *ds,
 #undef CALC_AVG
 #undef DOMAIN_CATEGORY
 }
+#undef PCT_CHNG
 
 static void summarize_schedstat_cpu(struct schedstat_cpu *summary_cpu,
 				    struct schedstat_cpu *cptr,
@@ -4173,13 +4224,16 @@ static int get_all_cpu_stats(struct list_head *head)
 	return ret;
 }
 
-static int show_schedstat_data(struct list_head *head, struct cpu_domain_map **cd_map)
+static int show_schedstat_data(struct list_head *head1, struct cpu_domain_map **cd_map1,
+			       struct list_head *head2, struct cpu_domain_map **cd_map2,
+			       bool summary_only)
 {
-	struct schedstat_cpu *cptr = list_first_entry(head, struct schedstat_cpu, cpu_list);
-	__u64 jiffies = cptr->cpu_data->timestamp;
-	struct perf_record_schedstat_domain *ds;
-	struct perf_record_schedstat_cpu *cs;
-	struct schedstat_domain *dptr;
+	struct schedstat_cpu *cptr1 = list_first_entry(head1, struct schedstat_cpu, cpu_list);
+	struct perf_record_schedstat_domain *ds1 = NULL, *ds2 = NULL;
+	struct perf_record_schedstat_cpu *cs1 = NULL, *cs2 = NULL;
+	struct schedstat_domain *dptr1 = NULL, *dptr2 = NULL;
+	struct schedstat_cpu *cptr2 = NULL;
+	__u64 jiffies1 = 0, jiffies2 = 0;
 	bool is_summary = true;
 	int ret = 0;
 
@@ -4194,49 +4248,100 @@ static int show_schedstat_data(struct list_head *head, struct cpu_domain_map **c
 	print_separator2(100, "", 0);
 	printf("\n");
 
-	printf("%-65s: %11llu\n", "Time elapsed (in jiffies)", jiffies);
+	printf("%-65s: ", "Time elapsed (in jiffies)");
+	jiffies1 = cptr1->cpu_data->timestamp;
+	printf("%11llu", jiffies1);
+	if (head2) {
+		cptr2 = list_first_entry(head2, struct schedstat_cpu, cpu_list);
+		jiffies2 = cptr2->cpu_data->timestamp;
+		printf(",%11llu", jiffies2);
+	}
+	printf("\n");
+
+	ret = get_all_cpu_stats(head1);
+	if (cptr2) {
+		ret = get_all_cpu_stats(head2);
+		cptr2 = list_first_entry(head2, struct schedstat_cpu, cpu_list);
+	}
 
-	ret = get_all_cpu_stats(head);
+	list_for_each_entry(cptr1, head1, cpu_list) {
+		struct cpu_domain_map *cd_info1 = NULL, *cd_info2 = NULL;
+
+		cs1 = cptr1->cpu_data;
+		cd_info1 = cd_map1[cs1->cpu];
+		if (cptr2) {
+			cs2 = cptr2->cpu_data;
+			cd_info2 = cd_map2[cs2->cpu];
+			dptr2 = list_first_entry(&cptr2->domain_head, struct schedstat_domain,
+						 domain_list);
+		}
+
+		if (cs2 && cs1->cpu != cs2->cpu) {
+			pr_err("Failed because matching cpus not found for diff\n");
+			return -1;
+		}
+
+		if (cd_info2 && cd_info1->nr_domains != cd_info2->nr_domains) {
+			pr_err("Failed because nr_domains is not same for cpus\n");
+			return -1;
+		}
 
-	list_for_each_entry(cptr, head, cpu_list) {
-		cs = cptr->cpu_data;
 		print_separator2(100, "", 0);
 
 		if (is_summary)
 			printf("CPU: <ALL CPUS SUMMARY>\n");
 		else
-			printf("CPU: %d\n", cs->cpu);
+			printf("CPU: %d\n", cs1->cpu);
 
 		print_separator2(100, "", 0);
-		print_cpu_stats(cs);
+		print_cpu_stats(cs1, cs2);
 		print_separator2(100, "", 0);
 
-		list_for_each_entry(dptr, &cptr->domain_head, domain_list) {
-			struct domain_info *dinfo;
+		list_for_each_entry(dptr1, &cptr1->domain_head, domain_list) {
+			struct domain_info *dinfo1 = NULL, *dinfo2 = NULL;
+
+			ds1 = dptr1->domain_data;
+			dinfo1 = cd_info1->domains[ds1->domain];
+			if (dptr2) {
+				ds2 = dptr2->domain_data;
+				dinfo2 = cd_info2->domains[ds2->domain];
+			}
+
+			if (dinfo2 && dinfo1->domain != dinfo2->domain) {
+				pr_err("Failed because matching domain not found for diff\n");
+				return -1;
+			}
 
-			ds = dptr->domain_data;
-			dinfo = cd_map[ds->cpu]->domains[ds->domain];
 			if (is_summary) {
-				if (dinfo->dname)
+				if (dinfo1->dname)
 					printf("CPU: <ALL CPUS SUMMARY> | DOMAIN: %s\n",
-					       dinfo->dname);
+					       dinfo1->dname);
 				else
 					printf("CPU: <ALL CPUS SUMMARY> | DOMAIN: %d\n",
-					       dinfo->domain);
+					       dinfo1->domain);
 			} else {
-				if (dinfo->dname)
+				if (dinfo1->dname)
 					printf("CPU: %d | DOMAIN: %s | DOMAIN_CPUS: ",
-					       cs->cpu, dinfo->dname);
+					       cs1->cpu, dinfo1->dname);
 				else
 					printf("CPU: %d | DOMAIN: %d | DOMAIN_CPUS: ",
-					       cs->cpu, dinfo->domain);
+					       cs1->cpu, dinfo1->domain);
 
-				printf("%s\n", dinfo->cpulist);
+				printf("%s\n", dinfo1->cpulist);
 			}
 			print_separator2(100, "", 0);
-			print_domain_stats(ds, jiffies);
+			print_domain_stats(ds1, ds2, jiffies1, jiffies2);
 			print_separator2(100, "", 0);
+
+			if (dptr2)
+				dptr2 = list_next_entry(dptr2, domain_list);
 		}
+		if (summary_only)
+			break;
+
+		if (cptr2)
+			cptr2 = list_next_entry(cptr2, cpu_list);
+
 		is_summary = false;
 	}
 	return ret;
@@ -4417,7 +4522,7 @@ static int perf_sched__schedstat_report(struct perf_sched *sched)
 		}
 
 		cd_map = session->header.env.cpu_domain;
-		err = show_schedstat_data(&cpu_head, cd_map);
+		err = show_schedstat_data(&cpu_head, cd_map, NULL, NULL, false);
 	}
 
 out:
@@ -4426,6 +4531,100 @@ out:
 	return err;
 }
 
+static int perf_sched__schedstat_diff(struct perf_sched *sched,
+				      int argc, const char **argv)
+{
+	struct cpu_domain_map **cd_map0 = NULL, **cd_map1 = NULL;
+	struct list_head cpu_head_ses0, cpu_head_ses1;
+	struct perf_session *session[2];
+	struct perf_data data[2];
+	int ret = 0, err = 0;
+	static const char *defaults[] = {
+		"perf.data.old",
+		"perf.data",
+	};
+
+	if (argc) {
+		if (argc == 1)
+			defaults[1] = argv[0];
+		else if (argc == 2) {
+			defaults[0] = argv[0];
+			defaults[1] = argv[1];
+		} else {
+			pr_err("perf sched stats diff is not supported with more than 2 files.\n");
+			goto out_ret;
+		}
+	}
+
+	INIT_LIST_HEAD(&cpu_head_ses0);
+	INIT_LIST_HEAD(&cpu_head_ses1);
+
+	sched->tool.schedstat_cpu = perf_sched__process_schedstat;
+	sched->tool.schedstat_domain = perf_sched__process_schedstat;
+
+	data[0].path = defaults[0];
+	data[0].mode  = PERF_DATA_MODE_READ;
+	session[0] = perf_session__new(&data[0], &sched->tool);
+	if (IS_ERR(session[0])) {
+		ret = PTR_ERR(session[0]);
+		pr_err("Failed to open %s\n", data[0].path);
+		goto out_delete_ses0;
+	}
+
+	err = perf_session__process_events(session[0]);
+	if (err)
+		goto out_delete_ses0;
+
+	cd_map0 = session[0]->header.env.cpu_domain;
+	list_replace_init(&cpu_head, &cpu_head_ses0);
+	after_workload_flag = false;
+
+	data[1].path = defaults[1];
+	data[1].mode  = PERF_DATA_MODE_READ;
+	session[1] = perf_session__new(&data[1], &sched->tool);
+	if (IS_ERR(session[1])) {
+		ret = PTR_ERR(session[1]);
+		pr_err("Failed to open %s\n", data[1].path);
+		goto out_delete_ses1;
+	}
+
+	err = perf_session__process_events(session[1]);
+	if (err)
+		goto out_delete_ses1;
+
+	cd_map1 = session[1]->header.env.cpu_domain;
+	list_replace_init(&cpu_head, &cpu_head_ses1);
+	after_workload_flag = false;
+	setup_pager();
+
+	if (list_empty(&cpu_head_ses1)) {
+		pr_err("Data is not available\n");
+		ret = -1;
+		goto out_delete_ses1;
+	}
+
+	if (list_empty(&cpu_head_ses0)) {
+		pr_err("Data is not available\n");
+		ret = -1;
+		goto out_delete_ses0;
+	}
+
+	show_schedstat_data(&cpu_head_ses0, cd_map0, &cpu_head_ses1, cd_map1, true);
+
+out_delete_ses1:
+	free_schedstat(&cpu_head_ses1);
+	if (!IS_ERR(session[1]))
+		perf_session__delete(session[1]);
+
+out_delete_ses0:
+	free_schedstat(&cpu_head_ses0);
+	if (!IS_ERR(session[0]))
+		perf_session__delete(session[0]);
+
+out_ret:
+	return ret;
+}
+
 static int process_synthesized_event_live(const struct perf_tool *tool __maybe_unused,
 					  union perf_event *event,
 					  struct perf_sample *sample __maybe_unused,
@@ -4515,7 +4714,7 @@ static int perf_sched__schedstat_live(struct perf_sched *sched,
 
 	nr = cpu__max_present_cpu().cpu;
 	cd_map = build_cpu_domain_map(&sv, &md, nr);
-	show_schedstat_data(&cpu_head, cd_map);
+	show_schedstat_data(&cpu_head, cd_map, NULL, NULL, false);
 	free_cpu_domain_info(cd_map, sv, nr);
 out:
 	free_schedstat(&cpu_head);
@@ -4847,6 +5046,11 @@ int cmd_sched(int argc, const char **argv)
 				argc = parse_options(argc, argv, stats_options,
 						     stats_usage, 0);
 			return perf_sched__schedstat_report(&sched);
+		} else if (argv[0] && !strcmp(argv[0], "diff")) {
+			if (argc)
+				argc = parse_options(argc, argv, stats_options,
+						     stats_usage, 0);
+			return perf_sched__schedstat_diff(&sched, argc, argv);
 		}
 		return perf_sched__schedstat_live(&sched, argc, argv);
 	} else {
-- 
cgit v1.2.3


From c6b1f5707509c2718832a8f79e1d1510c85bcc75 Mon Sep 17 00:00:00 2001
From: Swapnil Sapkal <swapnil.sapkal@amd.com>
Date: Mon, 19 Jan 2026 17:58:31 +0000
Subject: perf sched stats: Add basic 'perf sched stats' test

Add basic test for 'perf sched stats {record|report|diff}' subcommand.

Signed-off-by: Swapnil Sapkal <swapnil.sapkal@amd.com>
Tested-by: Chen Yu <yu.c.chen@intel.com>
Acked-by: Ian Rogers <irogers@google.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Anubhav Shelat <ashelat@redhat.com>
Cc: Ben Gainey <ben.gainey@arm.com>
Cc: Blake Jones <blakejones@google.com>
Cc: Chun-Tse Shao <ctshao@google.com>
Cc: David Vernet <void@manifault.com>
Cc: Dmitriy Vyukov <dvyukov@google.com>
Cc: Dr. David Alan Gilbert <linux@treblig.org>
Cc: Gautham Shenoy <gautham.shenoy@amd.com>
Cc: Graham Woodward <graham.woodward@arm.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@arm.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: K Prateek Nayak <kprateek.nayak@amd.com>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Leo Yan <leo.yan@arm.com>
Cc: Madadi Vineeth Reddy <vineethr@linux.ibm.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Ravi Bangoria <ravi.bangoria@amd.com>
Cc: Sandipan Das <sandipan.das@amd.com>
Cc: Santosh Shukla <santosh.shukla@amd.com>
Cc: Shrikanth Hegde <sshegde@linux.ibm.com>
Cc: Steven Rostedt (VMware) <rostedt@goodmis.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Thomas Falcon <thomas.falcon@intel.com>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Yang Jihong <yangjihong@bytedance.com>
Cc: Yujie Liu <yujie.liu@intel.com>
Cc: Zhongqiu Han <quic_zhonhan@quicinc.com>
Link: https://lore.kernel.org/r/20260119175833.340369-10-swapnil.sapkal@amd.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/tests/shell/perf_sched_stats.sh | 64 ++++++++++++++++++++++++++++++
 1 file changed, 64 insertions(+)
 create mode 100755 tools/perf/tests/shell/perf_sched_stats.sh

(limited to 'tools')

diff --git a/tools/perf/tests/shell/perf_sched_stats.sh b/tools/perf/tests/shell/perf_sched_stats.sh
new file mode 100755
index 000000000000..2b1410b050d0
--- /dev/null
+++ b/tools/perf/tests/shell/perf_sched_stats.sh
@@ -0,0 +1,64 @@
+#!/bin/sh
+# perf sched stats tests
+# SPDX-License-Identifier: GPL-2.0
+
+set -e
+
+err=0
+test_perf_sched_stats_record() {
+  echo "Basic perf sched stats record test"
+  if ! perf sched stats record true 2>&1 | \
+    grep -E -q "[ perf sched stats: Wrote samples to perf.data ]"
+  then
+    echo "Basic perf sched stats record test [Failed]"
+    err=1
+    return
+  fi
+  echo "Basic perf sched stats record test [Success]"
+}
+
+test_perf_sched_stats_report() {
+  echo "Basic perf sched stats report test"
+  perf sched stats record true > /dev/null
+  if ! perf sched stats report 2>&1 | grep -E -q "Description"
+  then
+    echo "Basic perf sched stats report test [Failed]"
+    err=1
+    rm perf.data
+    return
+  fi
+  rm perf.data
+  echo "Basic perf sched stats report test [Success]"
+}
+
+test_perf_sched_stats_live() {
+  echo "Basic perf sched stats live mode test"
+  if ! perf sched stats true 2>&1 | grep -E -q "Description"
+  then
+    echo "Basic perf sched stats live mode test [Failed]"
+    err=1
+    return
+  fi
+  echo "Basic perf sched stats live mode test [Success]"
+}
+
+test_perf_sched_stats_diff() {
+  echo "Basic perf sched stats diff test"
+  perf sched stats record true > /dev/null
+  perf sched stats record true > /dev/null
+  if ! perf sched stats diff > /dev/null
+  then
+    echo "Basic perf sched stats diff test [Failed]"
+    err=1
+    rm perf.data.old perf.data
+    return
+  fi
+  rm perf.data.old perf.data
+  echo "Basic perf sched stats diff test [Success]"
+}
+
+test_perf_sched_stats_record
+test_perf_sched_stats_report
+test_perf_sched_stats_live
+test_perf_sched_stats_diff
+exit $err
-- 
cgit v1.2.3


From 800af362d68945e589f73cda429d04bfe4287feb Mon Sep 17 00:00:00 2001
From: Swapnil Sapkal <swapnil.sapkal@amd.com>
Date: Mon, 19 Jan 2026 17:58:32 +0000
Subject: perf sched stats: Add details in man page

Document 'perf sched stats' purpose, usage examples and guide on how to
interpret the report data in the perf-sched man page.

Signed-off-by: Ravi Bangoria <ravi.bangoria@amd.com>
Signed-off-by: Swapnil Sapkal <swapnil.sapkal@amd.com>
Tested-by: Chen Yu <yu.c.chen@intel.com>
Acked-by: Ian Rogers <irogers@google.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Anubhav Shelat <ashelat@redhat.com>
Cc: Ben Gainey <ben.gainey@arm.com>
Cc: Blake Jones <blakejones@google.com>
Cc: Chun-Tse Shao <ctshao@google.com>
Cc: David Vernet <void@manifault.com>
Cc: Dmitriy Vyukov <dvyukov@google.com>
Cc: Dr. David Alan Gilbert <linux@treblig.org>
Cc: Gautham Shenoy <gautham.shenoy@amd.com>
Cc: Graham Woodward <graham.woodward@arm.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@arm.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: K Prateek Nayak <kprateek.nayak@amd.com>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Leo Yan <leo.yan@arm.com>
Cc: Madadi Vineeth Reddy <vineethr@linux.ibm.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Sandipan Das <sandipan.das@amd.com>
Cc: Santosh Shukla <santosh.shukla@amd.com>
Cc: Shrikanth Hegde <sshegde@linux.ibm.com>
Cc: Steven Rostedt (VMware) <rostedt@goodmis.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Thomas Falcon <thomas.falcon@intel.com>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Yang Jihong <yangjihong@bytedance.com>
Cc: Yujie Liu <yujie.liu@intel.com>
Cc: Zhongqiu Han <quic_zhonhan@quicinc.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/Documentation/perf-sched.txt | 261 +++++++++++++++++++++++++++++++-
 1 file changed, 260 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/perf/Documentation/perf-sched.txt b/tools/perf/Documentation/perf-sched.txt
index 6dbbddb6464d..5bfb7bb6c633 100644
--- a/tools/perf/Documentation/perf-sched.txt
+++ b/tools/perf/Documentation/perf-sched.txt
@@ -8,7 +8,7 @@ perf-sched - Tool to trace/measure scheduler properties (latencies)
 SYNOPSIS
 --------
 [verse]
-'perf sched' {record|latency|map|replay|script|timehist}
+'perf sched' {record|latency|map|replay|script|timehist|stats}
 
 DESCRIPTION
 -----------
@@ -80,8 +80,267 @@ There are several variants of 'perf sched':
     
    Times are in msec.usec.
 
+   'perf sched stats {record | report | diff} <command>' to capture, report the diff
+   in schedstat counters and show the difference between perf sched stats report
+   respectively. schedstat counters which are present in the linux kernel and are
+   exposed through the file ``/proc/schedstat``. These counters are enabled or disabled
+   via the sysctl governed by the file ``/proc/sys/kernel/sched_schedstats``. These
+   counters accounts for many scheduler events such as ``schedule()`` calls, load-balancing
+   events, ``try_to_wakeup()`` call among others. This is useful in understading the
+   scheduler behavior for the workload.
+
+   Note: The tool will not give correct results if there is topological reordering or
+         online/offline of cpus in between capturing snapshots of `/proc/schedstat`.
+
+    Example usage:
+        perf sched stats record -- sleep 1
+        perf sched stats report
+        perf sched stats diff
+
+   A detailed description of the schedstats can be found in the Kernel Documentation:
+   https://www.kernel.org/doc/html/latest/scheduler/sched-stats.html
+
+   The result can be interprested as follows:
+
+   The `perf sched stats report` starts with description of the columns present in
+   the report. These column names are given before cpu and domain stats to improve
+   the readability of the report.
+
+   ----------------------------------------------------------------------------------------------------
+   DESC                    -> Description of the field
+   COUNT                   -> Value of the field
+   PCT_CHANGE              -> Percent change with corresponding base value
+   AVG_JIFFIES             -> Avg time in jiffies between two consecutive occurrence of event
+   ----------------------------------------------------------------------------------------------------
+
+   Next is the total profiling time in terms of jiffies:
+
+   ----------------------------------------------------------------------------------------------------
+   Time elapsed (in jiffies)                                   :       24537
+   ----------------------------------------------------------------------------------------------------
+
+   Next is CPU scheduling statistics. These are simple diffs of /proc/schedstat CPU lines
+   along with description. The report also prints % relative to base stat.
+
+   In the example below, schedule() left the CPU0 idle 36.58% of the time. 0.45% of total
+   try_to_wake_up() was to wakeup local CPU. And, the total waittime by tasks on CPU0 is
+   48.70% of the total runtime by tasks on the same CPU.
+
+   ----------------------------------------------------------------------------------------------------
+   CPU 0
+   ----------------------------------------------------------------------------------------------------
+   DESC                                                                     COUNT   PCT_CHANGE
+   ----------------------------------------------------------------------------------------------------
+   yld_count                                                        :           0
+   array_exp                                                        :           0
+   sched_count                                                      :      402267
+   sched_goidle                                                     :      147161  (    36.58% )
+   ttwu_count                                                       :      236309
+   ttwu_local                                                       :        1062  (     0.45% )
+   rq_cpu_time                                                      :  7083791148
+   run_delay                                                        :  3449973971  (    48.70% )
+   pcount                                                           :      255035
+   ----------------------------------------------------------------------------------------------------
+
+   Next is load balancing statistics. For each of the sched domains
+   (eg: `SMT`, `MC`, `DIE`...), the scheduler computes statistics under
+   the following three categories:
+
+   1) Idle Load Balance: Load balancing performed on behalf of a long
+                         idling CPU by some other CPU.
+   2) Busy Load Balance: Load balancing performed when the CPU was busy.
+   3) New Idle Balance : Load balancing performed when a CPU just became
+                        idle.
+
+   Under each of these three categories, sched stats report provides
+   different load balancing statistics. Along with direct stats, the
+   report also contains derived metrics prefixed with *. Example:
+
+   ----------------------------------------------------------------------------------------------------
+   CPU 0, DOMAIN SMT CPUS 0,64
+   ----------------------------------------------------------------------------------------------------
+   DESC                                                                     COUNT    AVG_JIFFIES
+   ----------------------------------------- <Category busy> ------------------------------------------
+   busy_lb_count                                                    :         136  $       17.08 $
+   busy_lb_balanced                                                 :         131  $       17.73 $
+   busy_lb_failed                                                   :           0  $        0.00 $
+   busy_lb_imbalance_load                                           :          58
+   busy_lb_imbalance_util                                           :           0
+   busy_lb_imbalance_task                                           :           0
+   busy_lb_imbalance_misfit                                         :           0
+   busy_lb_gained                                                   :           7
+   busy_lb_hot_gained                                               :           0
+   busy_lb_nobusyq                                                  :           2  $     1161.50 $
+   busy_lb_nobusyg                                                  :         129  $       18.01 $
+   *busy_lb_success_count                                           :           5
+   *busy_lb_avg_pulled                                              :        1.40
+   ----------------------------------------- <Category idle> ------------------------------------------
+   idle_lb_count                                                    :         449  $        5.17 $
+   idle_lb_balanced                                                 :         382  $        6.08 $
+   idle_lb_failed                                                   :           3  $      774.33 $
+   idle_lb_imbalance_load                                           :           0
+   idle_lb_imbalance_util                                           :           0
+   idle_lb_imbalance_task                                           :          71
+   idle_lb_imbalance_misfit                                         :           0
+   idle_lb_gained                                                   :          67
+   idle_lb_hot_gained                                               :           0
+   idle_lb_nobusyq                                                  :           0  $        0.00 $
+   idle_lb_nobusyg                                                  :         382  $        6.08 $
+   *idle_lb_success_count                                           :          64
+   *idle_lb_avg_pulled                                              :        1.05
+   ---------------------------------------- <Category newidle> ----------------------------------------
+   newidle_lb_count                                                 :       30471  $        0.08 $
+   newidle_lb_balanced                                              :       28490  $        0.08 $
+   newidle_lb_failed                                                :         633  $        3.67 $
+   newidle_lb_imbalance_load                                        :           0
+   newidle_lb_imbalance_util                                        :           0
+   newidle_lb_imbalance_task                                        :        2040
+   newidle_lb_imbalance_misfit                                      :           0
+   newidle_lb_gained                                                :        1348
+   newidle_lb_hot_gained                                            :           0
+   newidle_lb_nobusyq                                               :           6  $      387.17 $
+   newidle_lb_nobusyg                                               :       26634  $        0.09 $
+   *newidle_lb_success_count                                        :        1348
+   *newidle_lb_avg_pulled                                           :        1.00
+   ----------------------------------------------------------------------------------------------------
+
+   Consider following line:
+
+   newidle_lb_balanced                                              :       28490  $        0.08 $
+
+   While profiling was active, the load-balancer found 28490 times the load
+   needs to be balanced on a newly idle CPU 0. Following value encapsulated
+   inside $ is average jiffies between two events (28490 / 24537 = 0.08).
+
+   Next are active_load_balance() stats. alb did not trigger while the
+   profiling was active, hence it's all 0s.
+
+   --------------------------------- <Category active_load_balance()> ---------------------------------
+   alb_count                                                        :           0
+   alb_failed                                                       :           0
+   alb_pushed                                                       :           0
+   ----------------------------------------------------------------------------------------------------
+
+   Next are sched_balance_exec() and sched_balance_fork() stats. They are
+   not used but we kept it in RFC just for legacy purpose. Unless opposed,
+   we plan to remove them in next revision.
+
+   Next are wakeup statistics. For every domain, the report also shows
+   task-wakeup statistics. Example:
+
+   ------------------------------------------ <Wakeup Info> -------------------------------------------
+   ttwu_wake_remote                                                 :        1590
+   ttwu_move_affine                                                 :          84
+   ttwu_move_balance                                                :           0
+   ----------------------------------------------------------------------------------------------------
+
+   Same set of stats are reported for each CPU and each domain level.
+
+   How to interpret the diff
+   ~~~~~~~~~~~~~~~~~~~~~~~~~
+
+   The `perf sched stats diff` will also start with explaining the columns
+   present in the diff. Then it will show the diff in time in terms of
+   jiffies. The order of the values depends on the order of input data
+   files. It will take `perf.data.old` and `perf.data` respectively as the
+   defaults for comparison. Example:
+
+   ----------------------------------------------------------------------------------------------------
+   Time elapsed (in jiffies)                                        :        2009,       2001
+   ----------------------------------------------------------------------------------------------------
+
+   Below is the sample representing the difference in cpu and domain stats of
+   two runs. Here third column or the values enclosed in `|...|` shows the
+   percent change between the two. Second and fourth columns shows the
+   side-by-side representions of the corresponding fields from `perf sched
+   stats report`.
+
+   ----------------------------------------------------------------------------------------------------
+   CPU <ALL CPUS SUMMARY>
+   ----------------------------------------------------------------------------------------------------
+   DESC                                                                    COUNT1      COUNT2   PCT_CHANG>
+   ----------------------------------------------------------------------------------------------------
+   yld_count                                                        :           0,          0  |     0.00>
+   array_exp                                                        :           0,          0  |     0.00>
+   sched_count                                                      :      528533,     412573  |   -21.94>
+   sched_goidle                                                     :      193426,     146082  |   -24.48>
+   ttwu_count                                                       :      313134,     385975  |    23.26>
+   ttwu_local                                                       :        1126,       1282  |    13.85>
+   rq_cpu_time                                                      :  8257200244, 8301250047  |     0.53>
+   run_delay                                                        :  4728347053, 3997100703  |   -15.47>
+   pcount                                                           :      335031,     266396  |   -20.49>
+   ----------------------------------------------------------------------------------------------------
+
+   Below is the sample of domain stats diff:
+
+   ----------------------------------------------------------------------------------------------------
+   CPU <ALL CPUS SUMMARY>, DOMAIN SMT
+   ----------------------------------------------------------------------------------------------------
+   DESC                                                                    COUNT1      COUNT2   PCT_CHANG>
+   ----------------------------------------- <Category busy> ------------------------------------------
+   busy_lb_count                                                    :         122,         80  |   -34.43>
+   busy_lb_balanced                                                 :         115,         76  |   -33.91>
+   busy_lb_failed                                                   :           1,          3  |   200.00>
+   busy_lb_imbalance_load                                           :          35,         49  |    40.00>
+   busy_lb_imbalance_util                                           :           0,          0  |     0.00>
+   busy_lb_imbalance_task                                           :           0,          0  |     0.00>
+   busy_lb_imbalance_misfit                                         :           0,          0  |     0.00>
+   busy_lb_gained                                                   :           7,          2  |   -71.43>
+   busy_lb_hot_gained                                               :           0,          0  |     0.00>
+   busy_lb_nobusyq                                                  :           0,          0  |     0.00>
+   busy_lb_nobusyg                                                  :         115,         76  |   -33.91>
+   *busy_lb_success_count                                           :           6,          1  |   -83.33>
+   *busy_lb_avg_pulled                                              :        1.17,       2.00  |    71.43>
+   ----------------------------------------- <Category idle> ------------------------------------------
+   idle_lb_count                                                    :         568,        620  |     9.15>
+   idle_lb_balanced                                                 :         462,        449  |    -2.81>
+   idle_lb_failed                                                   :          11,         21  |    90.91>
+   idle_lb_imbalance_load                                           :           0,          0  |     0.00>
+   idle_lb_imbalance_util                                           :           0,          0  |     0.00>
+   idle_lb_imbalance_task                                           :         115,        189  |    64.35>
+   idle_lb_imbalance_misfit                                         :           0,          0  |     0.00>
+   idle_lb_gained                                                   :         103,        169  |    64.08>
+   idle_lb_hot_gained                                               :           0,          0  |     0.00>
+   idle_lb_nobusyq                                                  :           0,          0  |     0.00>
+   idle_lb_nobusyg                                                  :         462,        449  |    -2.81>
+   *idle_lb_success_count                                           :          95,        150  |    57.89>
+   *idle_lb_avg_pulled                                              :        1.08,       1.13  |     3.92>
+   ---------------------------------------- <Category newidle> ----------------------------------------
+   newidle_lb_count                                                 :       16961,       3155  |   -81.40>
+   newidle_lb_balanced                                              :       15646,       2556  |   -83.66>
+   newidle_lb_failed                                                :         397,        142  |   -64.23>
+   newidle_lb_imbalance_load                                        :           0,          0  |     0.00>
+   newidle_lb_imbalance_util                                        :           0,          0  |     0.00>
+   newidle_lb_imbalance_task                                        :        1376,        655  |   -52.40>
+   newidle_lb_imbalance_misfit                                      :           0,          0  |     0.00>
+   newidle_lb_gained                                                :         917,        457  |   -50.16>
+   newidle_lb_hot_gained                                            :           0,          0  |     0.00>
+   newidle_lb_nobusyq                                               :           3,          1  |   -66.67>
+   newidle_lb_nobusyg                                               :       14480,       2103  |   -85.48>
+   *newidle_lb_success_count                                        :         918,        457  |   -50.22>
+   *newidle_lb_avg_pulled                                           :        1.00,       1.00  |     0.11>
+   --------------------------------- <Category active_load_balance()> ---------------------------------
+   alb_count                                                        :           0,          1  |     0.00>
+   alb_failed                                                       :           0,          0  |     0.00>
+   alb_pushed                                                       :           0,          1  |     0.00>
+   --------------------------------- <Category sched_balance_exec()> ----------------------------------
+   sbe_count                                                        :           0,          0  |     0.00>
+   sbe_balanced                                                     :           0,          0  |     0.00>
+   sbe_pushed                                                       :           0,          0  |     0.00>
+   --------------------------------- <Category sched_balance_fork()> ----------------------------------
+   sbf_count                                                        :           0,          0  |     0.00>
+   sbf_balanced                                                     :           0,          0  |     0.00>
+   sbf_pushed                                                       :           0,          0  |     0.00>
+   ------------------------------------------ <Wakeup Info> -------------------------------------------
+   ttwu_wake_remote                                                 :        2031,       2914  |    43.48>
+   ttwu_move_affine                                                 :          73,        124  |    69.86>
+   ttwu_move_balance                                                :           0,          0  |     0.00>
+   ----------------------------------------------------------------------------------------------------
+
 OPTIONS
 -------
+Applicable to {record|latency|map|replay|script}
+
 -i::
 --input=<file>::
         Input file name. (default: perf.data unless stdin is a fifo)
-- 
cgit v1.2.3


From 7ff8b1d60881c5f97b5ae426e14d2822917d3b69 Mon Sep 17 00:00:00 2001
From: Dave Jiang <dave.jiang@intel.com>
Date: Wed, 14 Jan 2026 12:20:28 -0600
Subject: cxl/pci: Remove CXL VH handling in CONFIG_PCIEAER_CXL conditional
 blocks from core/pci.c

Create new config CONFIG_CXL_RAS and put all CXL RAS items behind the
config. The config will depend on CPER and PCIE AER to build. Move the
related VH RAS code from core/pci.c to core/ras.c.

Restricted CXL host (RCH) RAS functions will be moved in a future patch.

Cc: Robert Richter <rrichter@amd.com>
Reviewed-by: Joshua Hahn <joshua.hahnjy@gmail.com>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
Reviewed-by: Alison Schofield <alison.schofield@intel.com>
Co-developed-by: Terry Bowman <terry.bowman@amd.com>
Signed-off-by: Terry Bowman <terry.bowman@amd.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Link: https://patch.msgid.link/20260114182055.46029-8-terry.bowman@amd.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
---
 drivers/cxl/Kconfig       |   4 +
 drivers/cxl/core/Makefile |   2 +-
 drivers/cxl/core/core.h   |  31 ++++++++
 drivers/cxl/core/pci.c    | 189 +---------------------------------------------
 drivers/cxl/core/ras.c    | 176 ++++++++++++++++++++++++++++++++++++++++++
 drivers/cxl/cxl.h         |   8 --
 drivers/cxl/cxlpci.h      |  16 ++++
 tools/testing/cxl/Kbuild  |   2 +-
 8 files changed, 233 insertions(+), 195 deletions(-)

(limited to 'tools')

diff --git a/drivers/cxl/Kconfig b/drivers/cxl/Kconfig
index 48b7314afdb8..217888992c88 100644
--- a/drivers/cxl/Kconfig
+++ b/drivers/cxl/Kconfig
@@ -233,4 +233,8 @@ config CXL_MCE
 	def_bool y
 	depends on X86_MCE && MEMORY_FAILURE
 
+config CXL_RAS
+	def_bool y
+	depends on ACPI_APEI_GHES && PCIEAER && CXL_PCI
+
 endif
diff --git a/drivers/cxl/core/Makefile b/drivers/cxl/core/Makefile
index 5ad8fef210b5..b2930cc54f8b 100644
--- a/drivers/cxl/core/Makefile
+++ b/drivers/cxl/core/Makefile
@@ -14,9 +14,9 @@ cxl_core-y += pci.o
 cxl_core-y += hdm.o
 cxl_core-y += pmu.o
 cxl_core-y += cdat.o
-cxl_core-y += ras.o
 cxl_core-$(CONFIG_TRACING) += trace.o
 cxl_core-$(CONFIG_CXL_REGION) += region.o
 cxl_core-$(CONFIG_CXL_MCE) += mce.o
 cxl_core-$(CONFIG_CXL_FEATURES) += features.o
 cxl_core-$(CONFIG_CXL_EDAC_MEM_FEATURES) += edac.o
+cxl_core-$(CONFIG_CXL_RAS) += ras.o
diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h
index 1fb66132b777..bc818de87ccc 100644
--- a/drivers/cxl/core/core.h
+++ b/drivers/cxl/core/core.h
@@ -144,8 +144,39 @@ int cxl_pci_get_bandwidth(struct pci_dev *pdev, struct access_coordinate *c);
 int cxl_port_get_switch_dport_bandwidth(struct cxl_port *port,
 					struct access_coordinate *c);
 
+#ifdef CONFIG_CXL_RAS
 int cxl_ras_init(void);
 void cxl_ras_exit(void);
+bool cxl_handle_ras(struct cxl_dev_state *cxlds, void __iomem *ras_base);
+void cxl_handle_cor_ras(struct cxl_dev_state *cxlds, void __iomem *ras_base);
+#else
+static inline int cxl_ras_init(void)
+{
+	return 0;
+}
+
+static inline void cxl_ras_exit(void)
+{
+}
+
+static inline bool cxl_handle_ras(struct cxl_dev_state *cxlds, void __iomem *ras_base)
+{
+	return false;
+}
+static inline void cxl_handle_cor_ras(struct cxl_dev_state *cxlds, void __iomem *ras_base) { }
+#endif /* CONFIG_CXL_RAS */
+
+/* Restricted CXL Host specific RAS functions */
+#ifdef CONFIG_CXL_RAS
+void cxl_dport_map_rch_aer(struct cxl_dport *dport);
+void cxl_disable_rch_root_ints(struct cxl_dport *dport);
+void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds);
+#else
+static inline void cxl_dport_map_rch_aer(struct cxl_dport *dport) { }
+static inline void cxl_disable_rch_root_ints(struct cxl_dport *dport) { }
+static inline void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds) { }
+#endif /* CONFIG_CXL_RAS */
+
 int cxl_gpf_port_setup(struct cxl_dport *dport);
 
 struct cxl_hdm;
diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
index 51bb0f372e40..e132fff80979 100644
--- a/drivers/cxl/core/pci.c
+++ b/drivers/cxl/core/pci.c
@@ -632,81 +632,8 @@ err:
 }
 EXPORT_SYMBOL_NS_GPL(read_cdat_data, "CXL");
 
-static void cxl_handle_cor_ras(struct cxl_dev_state *cxlds,
-			       void __iomem *ras_base)
-{
-	void __iomem *addr;
-	u32 status;
-
-	if (!ras_base)
-		return;
-
-	addr = ras_base + CXL_RAS_CORRECTABLE_STATUS_OFFSET;
-	status = readl(addr);
-	if (status & CXL_RAS_CORRECTABLE_STATUS_MASK) {
-		writel(status & CXL_RAS_CORRECTABLE_STATUS_MASK, addr);
-		trace_cxl_aer_correctable_error(cxlds->cxlmd, status);
-	}
-}
-
-/* CXL spec rev3.0 8.2.4.16.1 */
-static void header_log_copy(void __iomem *ras_base, u32 *log)
-{
-	void __iomem *addr;
-	u32 *log_addr;
-	int i, log_u32_size = CXL_HEADERLOG_SIZE / sizeof(u32);
-
-	addr = ras_base + CXL_RAS_HEADER_LOG_OFFSET;
-	log_addr = log;
-
-	for (i = 0; i < log_u32_size; i++) {
-		*log_addr = readl(addr);
-		log_addr++;
-		addr += sizeof(u32);
-	}
-}
-
-/*
- * Log the state of the RAS status registers and prepare them to log the
- * next error status. Return 1 if reset needed.
- */
-static bool cxl_handle_ras(struct cxl_dev_state *cxlds,
-			   void __iomem *ras_base)
-{
-	u32 hl[CXL_HEADERLOG_SIZE_U32];
-	void __iomem *addr;
-	u32 status;
-	u32 fe;
-
-	if (!ras_base)
-		return false;
-
-	addr = ras_base + CXL_RAS_UNCORRECTABLE_STATUS_OFFSET;
-	status = readl(addr);
-	if (!(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK))
-		return false;
-
-	/* If multiple errors, log header points to first error from ctrl reg */
-	if (hweight32(status) > 1) {
-		void __iomem *rcc_addr =
-			ras_base + CXL_RAS_CAP_CONTROL_OFFSET;
-
-		fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK,
-				   readl(rcc_addr)));
-	} else {
-		fe = status;
-	}
-
-	header_log_copy(ras_base, hl);
-	trace_cxl_aer_uncorrectable_error(cxlds->cxlmd, status, fe, hl);
-	writel(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK, addr);
-
-	return true;
-}
-
-#ifdef CONFIG_PCIEAER_CXL
-
-static void cxl_dport_map_rch_aer(struct cxl_dport *dport)
+#ifdef CONFIG_CXL_RAS
+void cxl_dport_map_rch_aer(struct cxl_dport *dport)
 {
 	resource_size_t aer_phys;
 	struct device *host;
@@ -721,19 +648,7 @@ static void cxl_dport_map_rch_aer(struct cxl_dport *dport)
 	}
 }
 
-static void cxl_dport_map_ras(struct cxl_dport *dport)
-{
-	struct cxl_register_map *map = &dport->reg_map;
-	struct device *dev = dport->dport_dev;
-
-	if (!map->component_map.ras.valid)
-		dev_dbg(dev, "RAS registers not found\n");
-	else if (cxl_map_component_regs(map, &dport->regs.component,
-					BIT(CXL_CM_CAP_CAP_ID_RAS)))
-		dev_dbg(dev, "Failed to map RAS capability.\n");
-}
-
-static void cxl_disable_rch_root_ints(struct cxl_dport *dport)
+void cxl_disable_rch_root_ints(struct cxl_dport *dport)
 {
 	void __iomem *aer_base = dport->regs.dport_aer;
 	u32 aer_cmd_mask, aer_cmd;
@@ -757,28 +672,6 @@ static void cxl_disable_rch_root_ints(struct cxl_dport *dport)
 	writel(aer_cmd, aer_base + PCI_ERR_ROOT_COMMAND);
 }
 
-/**
- * cxl_dport_init_ras_reporting - Setup CXL RAS report on this dport
- * @dport: the cxl_dport that needs to be initialized
- * @host: host device for devm operations
- */
-void cxl_dport_init_ras_reporting(struct cxl_dport *dport, struct device *host)
-{
-	dport->reg_map.host = host;
-	cxl_dport_map_ras(dport);
-
-	if (dport->rch) {
-		struct pci_host_bridge *host_bridge = to_pci_host_bridge(dport->dport_dev);
-
-		if (!host_bridge->native_aer)
-			return;
-
-		cxl_dport_map_rch_aer(dport);
-		cxl_disable_rch_root_ints(dport);
-	}
-}
-EXPORT_SYMBOL_NS_GPL(cxl_dport_init_ras_reporting, "CXL");
-
 /*
  * Copy the AER capability registers using 32 bit read accesses.
  * This is necessary because RCRB AER capability is MMIO mapped. Clear the
@@ -827,7 +720,7 @@ static bool cxl_rch_get_aer_severity(struct aer_capability_regs *aer_regs,
 	return false;
 }
 
-static void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds)
+void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds)
 {
 	struct pci_dev *pdev = to_pci_dev(cxlds->dev);
 	struct aer_capability_regs aer_regs;
@@ -852,82 +745,8 @@ static void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds)
 	else
 		cxl_handle_ras(cxlds, dport->regs.ras);
 }
-
-#else
-static void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds) { }
 #endif
 
-void cxl_cor_error_detected(struct pci_dev *pdev)
-{
-	struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
-	struct device *dev = &cxlds->cxlmd->dev;
-
-	scoped_guard(device, dev) {
-		if (!dev->driver) {
-			dev_warn(&pdev->dev,
-				 "%s: memdev disabled, abort error handling\n",
-				 dev_name(dev));
-			return;
-		}
-
-		if (cxlds->rcd)
-			cxl_handle_rdport_errors(cxlds);
-
-		cxl_handle_cor_ras(cxlds, cxlds->regs.ras);
-	}
-}
-EXPORT_SYMBOL_NS_GPL(cxl_cor_error_detected, "CXL");
-
-pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
-				    pci_channel_state_t state)
-{
-	struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
-	struct cxl_memdev *cxlmd = cxlds->cxlmd;
-	struct device *dev = &cxlmd->dev;
-	bool ue;
-
-	scoped_guard(device, dev) {
-		if (!dev->driver) {
-			dev_warn(&pdev->dev,
-				 "%s: memdev disabled, abort error handling\n",
-				 dev_name(dev));
-			return PCI_ERS_RESULT_DISCONNECT;
-		}
-
-		if (cxlds->rcd)
-			cxl_handle_rdport_errors(cxlds);
-		/*
-		 * A frozen channel indicates an impending reset which is fatal to
-		 * CXL.mem operation, and will likely crash the system. On the off
-		 * chance the situation is recoverable dump the status of the RAS
-		 * capability registers and bounce the active state of the memdev.
-		 */
-		ue = cxl_handle_ras(cxlds, cxlds->regs.ras);
-	}
-
-
-	switch (state) {
-	case pci_channel_io_normal:
-		if (ue) {
-			device_release_driver(dev);
-			return PCI_ERS_RESULT_NEED_RESET;
-		}
-		return PCI_ERS_RESULT_CAN_RECOVER;
-	case pci_channel_io_frozen:
-		dev_warn(&pdev->dev,
-			 "%s: frozen state error detected, disable CXL.mem\n",
-			 dev_name(dev));
-		device_release_driver(dev);
-		return PCI_ERS_RESULT_NEED_RESET;
-	case pci_channel_io_perm_failure:
-		dev_warn(&pdev->dev,
-			 "failure state error detected, request disconnect\n");
-		return PCI_ERS_RESULT_DISCONNECT;
-	}
-	return PCI_ERS_RESULT_NEED_RESET;
-}
-EXPORT_SYMBOL_NS_GPL(cxl_error_detected, "CXL");
-
 static int cxl_flit_size(struct pci_dev *pdev)
 {
 	if (cxl_pci_flit_256(pdev))
diff --git a/drivers/cxl/core/ras.c b/drivers/cxl/core/ras.c
index 2731ba3a0799..b933030b8e1e 100644
--- a/drivers/cxl/core/ras.c
+++ b/drivers/cxl/core/ras.c
@@ -5,6 +5,7 @@
 #include <linux/aer.h>
 #include <cxl/event.h>
 #include <cxlmem.h>
+#include <cxlpci.h>
 #include "trace.h"
 
 static void cxl_cper_trace_corr_port_prot_err(struct pci_dev *pdev,
@@ -124,3 +125,178 @@ void cxl_ras_exit(void)
 	cxl_cper_unregister_prot_err_work(&cxl_cper_prot_err_work);
 	cancel_work_sync(&cxl_cper_prot_err_work);
 }
+
+static void cxl_dport_map_ras(struct cxl_dport *dport)
+{
+	struct cxl_register_map *map = &dport->reg_map;
+	struct device *dev = dport->dport_dev;
+
+	if (!map->component_map.ras.valid)
+		dev_dbg(dev, "RAS registers not found\n");
+	else if (cxl_map_component_regs(map, &dport->regs.component,
+					BIT(CXL_CM_CAP_CAP_ID_RAS)))
+		dev_dbg(dev, "Failed to map RAS capability.\n");
+}
+
+/**
+ * cxl_dport_init_ras_reporting - Setup CXL RAS report on this dport
+ * @dport: the cxl_dport that needs to be initialized
+ * @host: host device for devm operations
+ */
+void cxl_dport_init_ras_reporting(struct cxl_dport *dport, struct device *host)
+{
+	dport->reg_map.host = host;
+	cxl_dport_map_ras(dport);
+
+	if (dport->rch) {
+		struct pci_host_bridge *host_bridge = to_pci_host_bridge(dport->dport_dev);
+
+		if (!host_bridge->native_aer)
+			return;
+
+		cxl_dport_map_rch_aer(dport);
+		cxl_disable_rch_root_ints(dport);
+	}
+}
+EXPORT_SYMBOL_NS_GPL(cxl_dport_init_ras_reporting, "CXL");
+
+void cxl_handle_cor_ras(struct cxl_dev_state *cxlds, void __iomem *ras_base)
+{
+	void __iomem *addr;
+	u32 status;
+
+	if (!ras_base)
+		return;
+
+	addr = ras_base + CXL_RAS_CORRECTABLE_STATUS_OFFSET;
+	status = readl(addr);
+	if (status & CXL_RAS_CORRECTABLE_STATUS_MASK) {
+		writel(status & CXL_RAS_CORRECTABLE_STATUS_MASK, addr);
+		trace_cxl_aer_correctable_error(cxlds->cxlmd, status);
+	}
+}
+
+/* CXL spec rev3.0 8.2.4.16.1 */
+static void header_log_copy(void __iomem *ras_base, u32 *log)
+{
+	void __iomem *addr;
+	u32 *log_addr;
+	int i, log_u32_size = CXL_HEADERLOG_SIZE / sizeof(u32);
+
+	addr = ras_base + CXL_RAS_HEADER_LOG_OFFSET;
+	log_addr = log;
+
+	for (i = 0; i < log_u32_size; i++) {
+		*log_addr = readl(addr);
+		log_addr++;
+		addr += sizeof(u32);
+	}
+}
+
+/*
+ * Log the state of the RAS status registers and prepare them to log the
+ * next error status. Return 1 if reset needed.
+ */
+bool cxl_handle_ras(struct cxl_dev_state *cxlds, void __iomem *ras_base)
+{
+	u32 hl[CXL_HEADERLOG_SIZE_U32];
+	void __iomem *addr;
+	u32 status;
+	u32 fe;
+
+	if (!ras_base)
+		return false;
+
+	addr = ras_base + CXL_RAS_UNCORRECTABLE_STATUS_OFFSET;
+	status = readl(addr);
+	if (!(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK))
+		return false;
+
+	/* If multiple errors, log header points to first error from ctrl reg */
+	if (hweight32(status) > 1) {
+		void __iomem *rcc_addr =
+			ras_base + CXL_RAS_CAP_CONTROL_OFFSET;
+
+		fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK,
+				   readl(rcc_addr)));
+	} else {
+		fe = status;
+	}
+
+	header_log_copy(ras_base, hl);
+	trace_cxl_aer_uncorrectable_error(cxlds->cxlmd, status, fe, hl);
+	writel(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK, addr);
+
+	return true;
+}
+
+void cxl_cor_error_detected(struct pci_dev *pdev)
+{
+	struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
+	struct device *dev = &cxlds->cxlmd->dev;
+
+	scoped_guard(device, dev) {
+		if (!dev->driver) {
+			dev_warn(&pdev->dev,
+				 "%s: memdev disabled, abort error handling\n",
+				 dev_name(dev));
+			return;
+		}
+
+		if (cxlds->rcd)
+			cxl_handle_rdport_errors(cxlds);
+
+		cxl_handle_cor_ras(cxlds, cxlds->regs.ras);
+	}
+}
+EXPORT_SYMBOL_NS_GPL(cxl_cor_error_detected, "CXL");
+
+pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
+				    pci_channel_state_t state)
+{
+	struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
+	struct cxl_memdev *cxlmd = cxlds->cxlmd;
+	struct device *dev = &cxlmd->dev;
+	bool ue;
+
+	scoped_guard(device, dev) {
+		if (!dev->driver) {
+			dev_warn(&pdev->dev,
+				 "%s: memdev disabled, abort error handling\n",
+				 dev_name(dev));
+			return PCI_ERS_RESULT_DISCONNECT;
+		}
+
+		if (cxlds->rcd)
+			cxl_handle_rdport_errors(cxlds);
+		/*
+		 * A frozen channel indicates an impending reset which is fatal to
+		 * CXL.mem operation, and will likely crash the system. On the off
+		 * chance the situation is recoverable dump the status of the RAS
+		 * capability registers and bounce the active state of the memdev.
+		 */
+		ue = cxl_handle_ras(cxlds, cxlds->regs.ras);
+	}
+
+
+	switch (state) {
+	case pci_channel_io_normal:
+		if (ue) {
+			device_release_driver(dev);
+			return PCI_ERS_RESULT_NEED_RESET;
+		}
+		return PCI_ERS_RESULT_CAN_RECOVER;
+	case pci_channel_io_frozen:
+		dev_warn(&pdev->dev,
+			 "%s: frozen state error detected, disable CXL.mem\n",
+			 dev_name(dev));
+		device_release_driver(dev);
+		return PCI_ERS_RESULT_NEED_RESET;
+	case pci_channel_io_perm_failure:
+		dev_warn(&pdev->dev,
+			 "failure state error detected, request disconnect\n");
+		return PCI_ERS_RESULT_DISCONNECT;
+	}
+	return PCI_ERS_RESULT_NEED_RESET;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_error_detected, "CXL");
diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
index ba17fa86d249..42a76a7a088f 100644
--- a/drivers/cxl/cxl.h
+++ b/drivers/cxl/cxl.h
@@ -803,14 +803,6 @@ struct cxl_dport *devm_cxl_add_rch_dport(struct cxl_port *port,
 					 struct device *dport_dev, int port_id,
 					 resource_size_t rcrb);
 
-#ifdef CONFIG_PCIEAER_CXL
-void cxl_setup_parent_dport(struct device *host, struct cxl_dport *dport);
-void cxl_dport_init_ras_reporting(struct cxl_dport *dport, struct device *host);
-#else
-static inline void cxl_dport_init_ras_reporting(struct cxl_dport *dport,
-						struct device *host) { }
-#endif
-
 struct cxl_decoder *to_cxl_decoder(struct device *dev);
 struct cxl_root_decoder *to_cxl_root_decoder(struct device *dev);
 struct cxl_switch_decoder *to_cxl_switch_decoder(struct device *dev);
diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h
index cdb7cf3dbcb4..6f9c78886fd9 100644
--- a/drivers/cxl/cxlpci.h
+++ b/drivers/cxl/cxlpci.h
@@ -76,7 +76,23 @@ static inline bool cxl_pci_flit_256(struct pci_dev *pdev)
 
 struct cxl_dev_state;
 void read_cdat_data(struct cxl_port *port);
+
+#ifdef CONFIG_CXL_RAS
 void cxl_cor_error_detected(struct pci_dev *pdev);
 pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
 				    pci_channel_state_t state);
+void cxl_dport_init_ras_reporting(struct cxl_dport *dport, struct device *host);
+#else
+static inline void cxl_cor_error_detected(struct pci_dev *pdev) { }
+
+static inline pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
+						  pci_channel_state_t state)
+{
+	return PCI_ERS_RESULT_NONE;
+}
+
+static inline void cxl_dport_init_ras_reporting(struct cxl_dport *dport,
+						struct device *host) { }
+#endif
+
 #endif /* __CXL_PCI_H__ */
diff --git a/tools/testing/cxl/Kbuild b/tools/testing/cxl/Kbuild
index 0e151d0572d1..b7ea66382f3b 100644
--- a/tools/testing/cxl/Kbuild
+++ b/tools/testing/cxl/Kbuild
@@ -57,12 +57,12 @@ cxl_core-y += $(CXL_CORE_SRC)/pci.o
 cxl_core-y += $(CXL_CORE_SRC)/hdm.o
 cxl_core-y += $(CXL_CORE_SRC)/pmu.o
 cxl_core-y += $(CXL_CORE_SRC)/cdat.o
-cxl_core-y += $(CXL_CORE_SRC)/ras.o
 cxl_core-$(CONFIG_TRACING) += $(CXL_CORE_SRC)/trace.o
 cxl_core-$(CONFIG_CXL_REGION) += $(CXL_CORE_SRC)/region.o
 cxl_core-$(CONFIG_CXL_MCE) += $(CXL_CORE_SRC)/mce.o
 cxl_core-$(CONFIG_CXL_FEATURES) += $(CXL_CORE_SRC)/features.o
 cxl_core-$(CONFIG_CXL_EDAC_MEM_FEATURES) += $(CXL_CORE_SRC)/edac.o
+cxl_core-$(CONFIG_CXL_RAS) += $(CXL_CORE_SRC)/ras.o
 cxl_core-y += config_check.o
 cxl_core-y += cxl_core_test.o
 cxl_core-y += cxl_core_exports.o
-- 
cgit v1.2.3


From 0ff60f2ec3e4043a442e805f80f8a2445113ec8f Mon Sep 17 00:00:00 2001
From: Terry Bowman <terry.bowman@amd.com>
Date: Wed, 14 Jan 2026 12:20:29 -0600
Subject: cxl/pci: Move CXL driver's RCH error handling into core/ras_rch.c

Restricted CXL Host (RCH) protocol error handling uses a procedure distinct
from the CXL Virtual Hierarchy (VH) handling. This is because of the
differences in the RCH and VH topologies. Improve the maintainability and
add ability to enable/disable RCH handling.

Move and combine the RCH handling code into a single block conditionally
compiled with the CONFIG_CXL_RCH_RAS kernel config.

Signed-off-by: Terry Bowman <terry.bowman@amd.com>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Reviewed-by: Dave Jiang <dave.jiang@intel.com>
Link: https://patch.msgid.link/20260114182055.46029-9-terry.bowman@amd.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
---
 drivers/cxl/core/Makefile  |   1 +
 drivers/cxl/core/core.h    |  11 ++---
 drivers/cxl/core/pci.c     | 115 ------------------------------------------
 drivers/cxl/core/ras_rch.c | 121 +++++++++++++++++++++++++++++++++++++++++++++
 tools/testing/cxl/Kbuild   |   1 +
 5 files changed, 126 insertions(+), 123 deletions(-)
 create mode 100644 drivers/cxl/core/ras_rch.c

(limited to 'tools')

diff --git a/drivers/cxl/core/Makefile b/drivers/cxl/core/Makefile
index b2930cc54f8b..b37f38d502d8 100644
--- a/drivers/cxl/core/Makefile
+++ b/drivers/cxl/core/Makefile
@@ -20,3 +20,4 @@ cxl_core-$(CONFIG_CXL_MCE) += mce.o
 cxl_core-$(CONFIG_CXL_FEATURES) += features.o
 cxl_core-$(CONFIG_CXL_EDAC_MEM_FEATURES) += edac.o
 cxl_core-$(CONFIG_CXL_RAS) += ras.o
+cxl_core-$(CONFIG_CXL_RAS) += ras_rch.o
diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h
index bc818de87ccc..724361195057 100644
--- a/drivers/cxl/core/core.h
+++ b/drivers/cxl/core/core.h
@@ -149,6 +149,9 @@ int cxl_ras_init(void);
 void cxl_ras_exit(void);
 bool cxl_handle_ras(struct cxl_dev_state *cxlds, void __iomem *ras_base);
 void cxl_handle_cor_ras(struct cxl_dev_state *cxlds, void __iomem *ras_base);
+void cxl_dport_map_rch_aer(struct cxl_dport *dport);
+void cxl_disable_rch_root_ints(struct cxl_dport *dport);
+void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds);
 #else
 static inline int cxl_ras_init(void)
 {
@@ -164,14 +167,6 @@ static inline bool cxl_handle_ras(struct cxl_dev_state *cxlds, void __iomem *ras
 	return false;
 }
 static inline void cxl_handle_cor_ras(struct cxl_dev_state *cxlds, void __iomem *ras_base) { }
-#endif /* CONFIG_CXL_RAS */
-
-/* Restricted CXL Host specific RAS functions */
-#ifdef CONFIG_CXL_RAS
-void cxl_dport_map_rch_aer(struct cxl_dport *dport);
-void cxl_disable_rch_root_ints(struct cxl_dport *dport);
-void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds);
-#else
 static inline void cxl_dport_map_rch_aer(struct cxl_dport *dport) { }
 static inline void cxl_disable_rch_root_ints(struct cxl_dport *dport) { }
 static inline void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds) { }
diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
index e132fff80979..b838c59d7a3c 100644
--- a/drivers/cxl/core/pci.c
+++ b/drivers/cxl/core/pci.c
@@ -632,121 +632,6 @@ err:
 }
 EXPORT_SYMBOL_NS_GPL(read_cdat_data, "CXL");
 
-#ifdef CONFIG_CXL_RAS
-void cxl_dport_map_rch_aer(struct cxl_dport *dport)
-{
-	resource_size_t aer_phys;
-	struct device *host;
-	u16 aer_cap;
-
-	aer_cap = cxl_rcrb_to_aer(dport->dport_dev, dport->rcrb.base);
-	if (aer_cap) {
-		host = dport->reg_map.host;
-		aer_phys = aer_cap + dport->rcrb.base;
-		dport->regs.dport_aer = devm_cxl_iomap_block(host, aer_phys,
-						sizeof(struct aer_capability_regs));
-	}
-}
-
-void cxl_disable_rch_root_ints(struct cxl_dport *dport)
-{
-	void __iomem *aer_base = dport->regs.dport_aer;
-	u32 aer_cmd_mask, aer_cmd;
-
-	if (!aer_base)
-		return;
-
-	/*
-	 * Disable RCH root port command interrupts.
-	 * CXL 3.0 12.2.1.1 - RCH Downstream Port-detected Errors
-	 *
-	 * This sequence may not be necessary. CXL spec states disabling
-	 * the root cmd register's interrupts is required. But, PCI spec
-	 * shows these are disabled by default on reset.
-	 */
-	aer_cmd_mask = (PCI_ERR_ROOT_CMD_COR_EN |
-			PCI_ERR_ROOT_CMD_NONFATAL_EN |
-			PCI_ERR_ROOT_CMD_FATAL_EN);
-	aer_cmd = readl(aer_base + PCI_ERR_ROOT_COMMAND);
-	aer_cmd &= ~aer_cmd_mask;
-	writel(aer_cmd, aer_base + PCI_ERR_ROOT_COMMAND);
-}
-
-/*
- * Copy the AER capability registers using 32 bit read accesses.
- * This is necessary because RCRB AER capability is MMIO mapped. Clear the
- * status after copying.
- *
- * @aer_base: base address of AER capability block in RCRB
- * @aer_regs: destination for copying AER capability
- */
-static bool cxl_rch_get_aer_info(void __iomem *aer_base,
-				 struct aer_capability_regs *aer_regs)
-{
-	int read_cnt = sizeof(struct aer_capability_regs) / sizeof(u32);
-	u32 *aer_regs_buf = (u32 *)aer_regs;
-	int n;
-
-	if (!aer_base)
-		return false;
-
-	/* Use readl() to guarantee 32-bit accesses */
-	for (n = 0; n < read_cnt; n++)
-		aer_regs_buf[n] = readl(aer_base + n * sizeof(u32));
-
-	writel(aer_regs->uncor_status, aer_base + PCI_ERR_UNCOR_STATUS);
-	writel(aer_regs->cor_status, aer_base + PCI_ERR_COR_STATUS);
-
-	return true;
-}
-
-/* Get AER severity. Return false if there is no error. */
-static bool cxl_rch_get_aer_severity(struct aer_capability_regs *aer_regs,
-				     int *severity)
-{
-	if (aer_regs->uncor_status & ~aer_regs->uncor_mask) {
-		if (aer_regs->uncor_status & PCI_ERR_ROOT_FATAL_RCV)
-			*severity = AER_FATAL;
-		else
-			*severity = AER_NONFATAL;
-		return true;
-	}
-
-	if (aer_regs->cor_status & ~aer_regs->cor_mask) {
-		*severity = AER_CORRECTABLE;
-		return true;
-	}
-
-	return false;
-}
-
-void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds)
-{
-	struct pci_dev *pdev = to_pci_dev(cxlds->dev);
-	struct aer_capability_regs aer_regs;
-	struct cxl_dport *dport;
-	int severity;
-
-	struct cxl_port *port __free(put_cxl_port) =
-		cxl_pci_find_port(pdev, &dport);
-	if (!port)
-		return;
-
-	if (!cxl_rch_get_aer_info(dport->regs.dport_aer, &aer_regs))
-		return;
-
-	if (!cxl_rch_get_aer_severity(&aer_regs, &severity))
-		return;
-
-	pci_print_aer(pdev, severity, &aer_regs);
-
-	if (severity == AER_CORRECTABLE)
-		cxl_handle_cor_ras(cxlds, dport->regs.ras);
-	else
-		cxl_handle_ras(cxlds, dport->regs.ras);
-}
-#endif
-
 static int cxl_flit_size(struct pci_dev *pdev)
 {
 	if (cxl_pci_flit_256(pdev))
diff --git a/drivers/cxl/core/ras_rch.c b/drivers/cxl/core/ras_rch.c
new file mode 100644
index 000000000000..ed58afd18ecc
--- /dev/null
+++ b/drivers/cxl/core/ras_rch.c
@@ -0,0 +1,121 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright(c) 2025 AMD Corporation. All rights reserved. */
+
+#include <linux/types.h>
+#include <linux/aer.h>
+#include "cxl.h"
+#include "core.h"
+#include "cxlmem.h"
+
+void cxl_dport_map_rch_aer(struct cxl_dport *dport)
+{
+	resource_size_t aer_phys;
+	struct device *host;
+	u16 aer_cap;
+
+	aer_cap = cxl_rcrb_to_aer(dport->dport_dev, dport->rcrb.base);
+	if (aer_cap) {
+		host = dport->reg_map.host;
+		aer_phys = aer_cap + dport->rcrb.base;
+		dport->regs.dport_aer =
+			devm_cxl_iomap_block(host, aer_phys,
+					     sizeof(struct aer_capability_regs));
+	}
+}
+
+void cxl_disable_rch_root_ints(struct cxl_dport *dport)
+{
+	void __iomem *aer_base = dport->regs.dport_aer;
+	u32 aer_cmd_mask, aer_cmd;
+
+	if (!aer_base)
+		return;
+
+	/*
+	 * Disable RCH root port command interrupts.
+	 * CXL 3.0 12.2.1.1 - RCH Downstream Port-detected Errors
+	 *
+	 * This sequence may not be necessary. CXL spec states disabling
+	 * the root cmd register's interrupts is required. But, PCI spec
+	 * shows these are disabled by default on reset.
+	 */
+	aer_cmd_mask = (PCI_ERR_ROOT_CMD_COR_EN |
+			PCI_ERR_ROOT_CMD_NONFATAL_EN |
+			PCI_ERR_ROOT_CMD_FATAL_EN);
+	aer_cmd = readl(aer_base + PCI_ERR_ROOT_COMMAND);
+	aer_cmd &= ~aer_cmd_mask;
+	writel(aer_cmd, aer_base + PCI_ERR_ROOT_COMMAND);
+}
+
+/*
+ * Copy the AER capability registers using 32 bit read accesses.
+ * This is necessary because RCRB AER capability is MMIO mapped. Clear the
+ * status after copying.
+ *
+ * @aer_base: base address of AER capability block in RCRB
+ * @aer_regs: destination for copying AER capability
+ */
+static bool cxl_rch_get_aer_info(void __iomem *aer_base,
+				 struct aer_capability_regs *aer_regs)
+{
+	int read_cnt = sizeof(struct aer_capability_regs) / sizeof(u32);
+	u32 *aer_regs_buf = (u32 *)aer_regs;
+	int n;
+
+	if (!aer_base)
+		return false;
+
+	/* Use readl() to guarantee 32-bit accesses */
+	for (n = 0; n < read_cnt; n++)
+		aer_regs_buf[n] = readl(aer_base + n * sizeof(u32));
+
+	writel(aer_regs->uncor_status, aer_base + PCI_ERR_UNCOR_STATUS);
+	writel(aer_regs->cor_status, aer_base + PCI_ERR_COR_STATUS);
+
+	return true;
+}
+
+/* Get AER severity. Return false if there is no error. */
+static bool cxl_rch_get_aer_severity(struct aer_capability_regs *aer_regs,
+				     int *severity)
+{
+	if (aer_regs->uncor_status & ~aer_regs->uncor_mask) {
+		if (aer_regs->uncor_status & PCI_ERR_ROOT_FATAL_RCV)
+			*severity = AER_FATAL;
+		else
+			*severity = AER_NONFATAL;
+		return true;
+	}
+
+	if (aer_regs->cor_status & ~aer_regs->cor_mask) {
+		*severity = AER_CORRECTABLE;
+		return true;
+	}
+
+	return false;
+}
+
+void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds)
+{
+	struct pci_dev *pdev = to_pci_dev(cxlds->dev);
+	struct aer_capability_regs aer_regs;
+	struct cxl_dport *dport;
+	int severity;
+
+	struct cxl_port *port __free(put_cxl_port) =
+		cxl_pci_find_port(pdev, &dport);
+	if (!port)
+		return;
+
+	if (!cxl_rch_get_aer_info(dport->regs.dport_aer, &aer_regs))
+		return;
+
+	if (!cxl_rch_get_aer_severity(&aer_regs, &severity))
+		return;
+
+	pci_print_aer(pdev, severity, &aer_regs);
+	if (severity == AER_CORRECTABLE)
+		cxl_handle_cor_ras(cxlds, dport->regs.ras);
+	else
+		cxl_handle_ras(cxlds, dport->regs.ras);
+}
diff --git a/tools/testing/cxl/Kbuild b/tools/testing/cxl/Kbuild
index b7ea66382f3b..6eceefefb0e0 100644
--- a/tools/testing/cxl/Kbuild
+++ b/tools/testing/cxl/Kbuild
@@ -63,6 +63,7 @@ cxl_core-$(CONFIG_CXL_MCE) += $(CXL_CORE_SRC)/mce.o
 cxl_core-$(CONFIG_CXL_FEATURES) += $(CXL_CORE_SRC)/features.o
 cxl_core-$(CONFIG_CXL_EDAC_MEM_FEATURES) += $(CXL_CORE_SRC)/edac.o
 cxl_core-$(CONFIG_CXL_RAS) += $(CXL_CORE_SRC)/ras.o
+cxl_core-$(CONFIG_CXL_RAS) += $(CXL_CORE_SRC)/ras_rch.o
 cxl_core-y += config_check.o
 cxl_core-y += cxl_core_test.o
 cxl_core-y += cxl_core_exports.o
-- 
cgit v1.2.3


From a081b5789255d27b76cd2cbab85676b2a31dbde1 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ardb@kernel.org>
Date: Fri, 16 Jan 2026 10:34:02 +0100
Subject: kallsyms: Get rid of kallsyms relative base

When the kallsyms relative base was introduced, per-CPU variable
references on x86_64 SMP were implemented as offsets into the respective
per-CPU region, rather than offsets relative to the location of the
variable's template in the kernel image, which is how other
architectures implement it.

This required kallsyms to reason about the difference between the two,
and the sign of the value in the kallsyms_offsets[] array was used to
distinguish them. This meant that negative offsets were not permitted
for ordinary variables, and so it was crucial that the relative base was
chosen such that all offsets were positive numbers.

This is no longer needed: instead, the offsets can simply be encoded as
values in the range -/+ 2 GiB, which is precisely what PC32 relocations
provide on most architectures. So it is possible to simplify the logic,
and just use _text as the anchor directly, and let the linker calculate
the final value based on the location of the entry itself.

Some architectures (nios2, extensa) do not support place-relative
relocations at all, but these are all 32-bit and non-relocatable, and so
there is no need for place-relative relocations in the first place, and
the actual symbol values can just be stored directly.

This makes all entries in the kallsyms_offsets[] array visible as
place-relative references in the ELF metadata, which will be important
when implementing ELF-based fg-kaslr.

Reviewed-by: Kees Cook <kees@kernel.org>
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Link: https://patch.msgid.link/20260116093359.2442297-6-ardb+git@google.com
Signed-off-by: Nathan Chancellor <nathan@kernel.org>
---
 kernel/kallsyms.c                   |  6 ++--
 kernel/kallsyms_internal.h          |  1 -
 kernel/vmcore_info.c                |  1 -
 scripts/kallsyms.c                  | 64 ++++++++++---------------------------
 scripts/link-vmlinux.sh             |  4 +++
 tools/perf/tests/vmlinux-kallsyms.c |  1 -
 6 files changed, 25 insertions(+), 52 deletions(-)

(limited to 'tools')

diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 049e296f586c..6125724aadb1 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -151,8 +151,10 @@ static unsigned int get_symbol_offset(unsigned long pos)
 
 unsigned long kallsyms_sym_address(int idx)
 {
-	/* values are unsigned offsets */
-	return kallsyms_relative_base + (u32)kallsyms_offsets[idx];
+	/* non-relocatable 32-bit kernels just embed the value directly */
+	if (!IS_ENABLED(CONFIG_64BIT) && !IS_ENABLED(CONFIG_RELOCATABLE))
+		return (u32)kallsyms_offsets[idx];
+	return (unsigned long)offset_to_ptr(kallsyms_offsets + idx);
 }
 
 static unsigned int get_symbol_seq(int index)
diff --git a/kernel/kallsyms_internal.h b/kernel/kallsyms_internal.h
index 9633782f8250..81a867dbe57d 100644
--- a/kernel/kallsyms_internal.h
+++ b/kernel/kallsyms_internal.h
@@ -8,7 +8,6 @@ extern const int kallsyms_offsets[];
 extern const u8 kallsyms_names[];
 
 extern const unsigned int kallsyms_num_syms;
-extern const unsigned long kallsyms_relative_base;
 
 extern const char kallsyms_token_table[];
 extern const u16 kallsyms_token_index[];
diff --git a/kernel/vmcore_info.c b/kernel/vmcore_info.c
index fe9bf8db1922..f114719f6cb5 100644
--- a/kernel/vmcore_info.c
+++ b/kernel/vmcore_info.c
@@ -238,7 +238,6 @@ static int __init crash_save_vmcoreinfo_init(void)
 	VMCOREINFO_SYMBOL(kallsyms_token_table);
 	VMCOREINFO_SYMBOL(kallsyms_token_index);
 	VMCOREINFO_SYMBOL(kallsyms_offsets);
-	VMCOREINFO_SYMBOL(kallsyms_relative_base);
 #endif /* CONFIG_KALLSYMS */
 
 	arch_crash_save_vmcoreinfo();
diff --git a/scripts/kallsyms.c b/scripts/kallsyms.c
index 4b0234e4b12f..37d5c095ad22 100644
--- a/scripts/kallsyms.c
+++ b/scripts/kallsyms.c
@@ -46,7 +46,6 @@ struct addr_range {
 };
 
 static unsigned long long _text;
-static unsigned long long relative_base;
 static struct addr_range text_ranges[] = {
 	{ "_stext",     "_etext"     },
 	{ "_sinittext", "_einittext" },
@@ -57,6 +56,7 @@ static struct addr_range text_ranges[] = {
 static struct sym_entry **table;
 static unsigned int table_size, table_cnt;
 static int all_symbols;
+static int pc_relative;
 
 static int token_profit[0x10000];
 
@@ -280,7 +280,7 @@ static void read_map(const char *in)
 static void output_label(const char *label)
 {
 	printf(".globl %s\n", label);
-	printf("\tALGN\n");
+	printf("\t.balign 4\n");
 	printf("%s:\n", label);
 }
 
@@ -343,15 +343,6 @@ static void write_src(void)
 	unsigned int *markers, markers_cnt;
 	char buf[KSYM_NAME_LEN];
 
-	printf("#include <asm/bitsperlong.h>\n");
-	printf("#if BITS_PER_LONG == 64\n");
-	printf("#define PTR .quad\n");
-	printf("#define ALGN .balign 8\n");
-	printf("#else\n");
-	printf("#define PTR .long\n");
-	printf("#define ALGN .balign 4\n");
-	printf("#endif\n");
-
 	printf("\t.section .rodata, \"a\"\n");
 
 	output_label("kallsyms_num_syms");
@@ -434,34 +425,24 @@ static void write_src(void)
 	output_label("kallsyms_offsets");
 
 	for (i = 0; i < table_cnt; i++) {
-		/*
-		 * Use the offset relative to the lowest value
-		 * encountered of all relative symbols, and emit
-		 * non-relocatable fixed offsets that will be fixed
-		 * up at runtime.
-		 */
-
-		long long offset;
-
-		offset = table[i]->addr - relative_base;
-		if (offset < 0 || offset > UINT_MAX) {
-			fprintf(stderr, "kallsyms failure: "
-				"relative symbol value %#llx out of range\n",
-				table[i]->addr);
-			exit(EXIT_FAILURE);
+		if (pc_relative) {
+			long long offset = table[i]->addr - _text;
+
+			if (offset < INT_MIN || offset > INT_MAX) {
+				fprintf(stderr, "kallsyms failure: "
+					"relative symbol value %#llx out of range\n",
+					table[i]->addr);
+				exit(EXIT_FAILURE);
+			}
+			printf("\t.long\t_text - . + (%d)\t/* %s */\n",
+			       (int)offset, table[i]->sym);
+		} else {
+			printf("\t.long\t%#x\t/* %s */\n",
+			       (unsigned int)table[i]->addr, table[i]->sym);
 		}
-		printf("\t.long\t%#x\t/* %s */\n", (int)offset, table[i]->sym);
 	}
 	printf("\n");
 
-	output_label("kallsyms_relative_base");
-	/* Provide proper symbols relocatability by their '_text' relativeness. */
-	if (_text <= relative_base)
-		printf("\tPTR\t_text + %#llx\n", relative_base - _text);
-	else
-		printf("\tPTR\t_text - %#llx\n", _text - relative_base);
-	printf("\n");
-
 	sort_symbols_by_name();
 	output_label("kallsyms_seqs_of_names");
 	for (i = 0; i < table_cnt; i++)
@@ -701,22 +682,12 @@ static void sort_symbols(void)
 	qsort(table, table_cnt, sizeof(table[0]), compare_symbols);
 }
 
-/* find the minimum non-absolute symbol address */
-static void record_relative_base(void)
-{
-	/*
-	 * The table is sorted by address.
-	 * Take the first symbol value.
-	 */
-	if (table_cnt)
-		relative_base = table[0]->addr;
-}
-
 int main(int argc, char **argv)
 {
 	while (1) {
 		static const struct option long_options[] = {
 			{"all-symbols",     no_argument, &all_symbols,     1},
+			{"pc-relative",     no_argument, &pc_relative,     1},
 			{},
 		};
 
@@ -734,7 +705,6 @@ int main(int argc, char **argv)
 	read_map(argv[optind]);
 	shrink_table();
 	sort_symbols();
-	record_relative_base();
 	optimize_token_table();
 	write_src();
 
diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh
index 4ab44c73da4d..73531cb63efc 100755
--- a/scripts/link-vmlinux.sh
+++ b/scripts/link-vmlinux.sh
@@ -143,6 +143,10 @@ kallsyms()
 		kallsymopt="${kallsymopt} --all-symbols"
 	fi
 
+	if is_enabled CONFIG_64BIT || is_enabled CONFIG_RELOCATABLE; then
+		kallsymopt="${kallsymopt} --pc-relative"
+	fi
+
 	info KSYMS "${2}.S"
 	scripts/kallsyms ${kallsymopt} "${1}" > "${2}.S"
 
diff --git a/tools/perf/tests/vmlinux-kallsyms.c b/tools/perf/tests/vmlinux-kallsyms.c
index 74cdbd2ce9d0..524d46478364 100644
--- a/tools/perf/tests/vmlinux-kallsyms.c
+++ b/tools/perf/tests/vmlinux-kallsyms.c
@@ -27,7 +27,6 @@ static bool is_ignored_symbol(const char *name, char type)
 		 * stable symbol list.
 		 */
 		"kallsyms_offsets",
-		"kallsyms_relative_base",
 		"kallsyms_num_syms",
 		"kallsyms_names",
 		"kallsyms_markers",
-- 
cgit v1.2.3


From d07d7c3dd9446b47507d15fdfbb835638a2f6f50 Mon Sep 17 00:00:00 2001
From: Danielle Ratson <danieller@nvidia.com>
Date: Wed, 21 Jan 2026 13:46:44 +0200
Subject: selftests: net: Add kernel selftest for RFC 4884

RFC 4884 extended certain ICMP messages with a length attribute that
encodes the length of the "original datagram" field. This is needed so
that new information could be appended to these messages without
applications thinking that it is part of the "original datagram" field.

In version 5.9, the kernel was extended with two new socket options
(SOL_IP/IP_RECVERR_4884 and SOL_IPV6/IPV6_RECVERR_RFC4884) that allow
user space to retrieve this length which is basically the offset to the
ICMP Extension Structure at the end of the ICMP message. This is
required by user space applications that need to parse the information
contained in the ICMP Extension Structure. For example, the RFC 5837
extension for tracepath.

Add a selftest that verifies correct handling of the RFC 4884 length
field for both IPv4 and IPv6, with and without extension structures,
and validates that malformed extensions are correctly reported as invalid.

For each address family, the test creates:
  - a raw socket used to send locally crafted ICMP error packets to the
    loopback address, and
  - a datagram socket used to receive the encapsulated original datagram
    and associated error metadata from the kernel error queue.

ICMP packets are constructed entirely in user space rather than relying
on kernel-generated errors. This allows the test to exercise invalid
scenarios (such as corrupted checksums and incorrect length fields) and
verify that the SO_EE_RFC4884_FLAG_INVALID flag is set as expected.

Output Example:

$ ./icmp_rfc4884
Starting 18 tests from 18 test cases.
  RUN           rfc4884.ipv4_ext_small_payload.rfc4884 ...
            OK  rfc4884.ipv4_ext_small_payload.rfc4884
ok 1 rfc4884.ipv4_ext_small_payload.rfc4884
  RUN           rfc4884.ipv4_ext.rfc4884 ...
            OK  rfc4884.ipv4_ext.rfc4884
ok 2 rfc4884.ipv4_ext.rfc4884
  RUN           rfc4884.ipv4_ext_large_payload.rfc4884 ...
            OK  rfc4884.ipv4_ext_large_payload.rfc4884
ok 3 rfc4884.ipv4_ext_large_payload.rfc4884
  RUN           rfc4884.ipv4_no_ext_small_payload.rfc4884 ...
            OK  rfc4884.ipv4_no_ext_small_payload.rfc4884
ok 4 rfc4884.ipv4_no_ext_small_payload.rfc4884
  RUN           rfc4884.ipv4_no_ext_min_payload.rfc4884 ...
            OK  rfc4884.ipv4_no_ext_min_payload.rfc4884
ok 5 rfc4884.ipv4_no_ext_min_payload.rfc4884
  RUN           rfc4884.ipv4_no_ext_large_payload.rfc4884 ...
            OK  rfc4884.ipv4_no_ext_large_payload.rfc4884
ok 6 rfc4884.ipv4_no_ext_large_payload.rfc4884
  RUN           rfc4884.ipv4_invalid_ext_checksum.rfc4884 ...
            OK  rfc4884.ipv4_invalid_ext_checksum.rfc4884
ok 7 rfc4884.ipv4_invalid_ext_checksum.rfc4884
  RUN           rfc4884.ipv4_invalid_ext_length_small.rfc4884 ...
            OK  rfc4884.ipv4_invalid_ext_length_small.rfc4884
ok 8 rfc4884.ipv4_invalid_ext_length_small.rfc4884
  RUN           rfc4884.ipv4_invalid_ext_length_large.rfc4884 ...
            OK  rfc4884.ipv4_invalid_ext_length_large.rfc4884
ok 9 rfc4884.ipv4_invalid_ext_length_large.rfc4884
  RUN           rfc4884.ipv6_ext_small_payload.rfc4884 ...
            OK  rfc4884.ipv6_ext_small_payload.rfc4884
ok 10 rfc4884.ipv6_ext_small_payload.rfc4884
  RUN           rfc4884.ipv6_ext.rfc4884 ...
            OK  rfc4884.ipv6_ext.rfc4884
ok 11 rfc4884.ipv6_ext.rfc4884
  RUN           rfc4884.ipv6_ext_large_payload.rfc4884 ...
            OK  rfc4884.ipv6_ext_large_payload.rfc4884
ok 12 rfc4884.ipv6_ext_large_payload.rfc4884
  RUN           rfc4884.ipv6_no_ext_small_payload.rfc4884 ...
            OK  rfc4884.ipv6_no_ext_small_payload.rfc4884
ok 13 rfc4884.ipv6_no_ext_small_payload.rfc4884
  RUN           rfc4884.ipv6_no_ext_min_payload.rfc4884 ...
            OK  rfc4884.ipv6_no_ext_min_payload.rfc4884
ok 14 rfc4884.ipv6_no_ext_min_payload.rfc4884
  RUN           rfc4884.ipv6_no_ext_large_payload.rfc4884 ...
            OK  rfc4884.ipv6_no_ext_large_payload.rfc4884
ok 15 rfc4884.ipv6_no_ext_large_payload.rfc4884
  RUN           rfc4884.ipv6_invalid_ext_checksum.rfc4884 ...
            OK  rfc4884.ipv6_invalid_ext_checksum.rfc4884
ok 16 rfc4884.ipv6_invalid_ext_checksum.rfc4884
  RUN           rfc4884.ipv6_invalid_ext_length_small.rfc4884 ...
            OK  rfc4884.ipv6_invalid_ext_length_small.rfc4884
ok 17 rfc4884.ipv6_invalid_ext_length_small.rfc4884
  RUN           rfc4884.ipv6_invalid_ext_length_large.rfc4884 ...
            OK  rfc4884.ipv6_invalid_ext_length_large.rfc4884
ok 18 rfc4884.ipv6_invalid_ext_length_large.rfc4884
 PASSED: 18 / 18 tests passed.
 Totals: pass:18 fail:0 xfail:0 xpass:0 skip:0 error:0

Signed-off-by: Danielle Ratson <danieller@nvidia.com>
Reviewed-by: Ido Schimmel <idosch@nvidia.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Link: https://patch.msgid.link/20260121114644.2863640-1-danieller@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/.gitignore     |   1 +
 tools/testing/selftests/net/Makefile       |   1 +
 tools/testing/selftests/net/icmp_rfc4884.c | 679 +++++++++++++++++++++++++++++
 3 files changed, 681 insertions(+)
 create mode 100644 tools/testing/selftests/net/icmp_rfc4884.c

(limited to 'tools')

diff --git a/tools/testing/selftests/net/.gitignore b/tools/testing/selftests/net/.gitignore
index 6930fe926c58..97ad4d551d44 100644
--- a/tools/testing/selftests/net/.gitignore
+++ b/tools/testing/selftests/net/.gitignore
@@ -7,6 +7,7 @@ cmsg_sender
 epoll_busy_poll
 fin_ack_lat
 hwtstamp_config
+icmp_rfc4884
 io_uring_zerocopy_tx
 ioam6_parser
 ip_defrag
diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile
index b66ba04f19d9..fe7937dc5f45 100644
--- a/tools/testing/selftests/net/Makefile
+++ b/tools/testing/selftests/net/Makefile
@@ -166,6 +166,7 @@ TEST_GEN_PROGS := \
 	bind_timewait \
 	bind_wildcard \
 	epoll_busy_poll \
+	icmp_rfc4884 \
 	ipv6_fragmentation \
 	proc_net_pktgen \
 	reuseaddr_conflict \
diff --git a/tools/testing/selftests/net/icmp_rfc4884.c b/tools/testing/selftests/net/icmp_rfc4884.c
new file mode 100644
index 000000000000..cd826b913557
--- /dev/null
+++ b/tools/testing/selftests/net/icmp_rfc4884.c
@@ -0,0 +1,679 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <arpa/inet.h>
+#include <error.h>
+#include <linux/errqueue.h>
+#include <linux/icmp.h>
+#include <linux/icmpv6.h>
+#include <linux/in6.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <netinet/in.h>
+#include <netinet/udp.h>
+#include <poll.h>
+#include <sched.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+
+#include "../kselftest_harness.h"
+
+static const unsigned short src_port = 44444;
+static const unsigned short dst_port = 55555;
+static const int min_orig_dgram_len = 128;
+static const int min_payload_len_v4 =
+	min_orig_dgram_len - sizeof(struct iphdr) - sizeof(struct udphdr);
+static const int min_payload_len_v6 =
+	min_orig_dgram_len - sizeof(struct ipv6hdr) - sizeof(struct udphdr);
+static const uint8_t orig_payload_byte =  0xAA;
+
+struct sockaddr_inet {
+	union {
+		struct sockaddr_in6 v6;
+		struct sockaddr_in v4;
+		struct sockaddr sa;
+	};
+	socklen_t len;
+};
+
+struct ip_case_info {
+	int	domain;
+	int	level;
+	int	opt1;
+	int	opt2;
+	int	proto;
+	int	(*build_func)(uint8_t *buf, ssize_t buflen, bool with_ext,
+			      int payload_len, bool bad_csum, bool bad_len,
+			      bool smaller_len);
+	int	min_payload;
+};
+
+static int bringup_loopback(void)
+{
+	struct ifreq ifr = {
+		.ifr_name = "lo"
+	};
+	int fd;
+
+	fd = socket(AF_INET, SOCK_DGRAM, 0);
+	if (fd < 0)
+		return -1;
+
+	if (ioctl(fd, SIOCGIFFLAGS, &ifr) < 0)
+		goto err;
+
+	ifr.ifr_flags = ifr.ifr_flags | IFF_UP;
+
+	if (ioctl(fd, SIOCSIFFLAGS, &ifr) < 0)
+		goto err;
+
+	close(fd);
+	return 0;
+
+err:
+	close(fd);
+	return -1;
+}
+
+static uint16_t csum(const void *buf, size_t len)
+{
+	const uint8_t *data = buf;
+	uint32_t sum = 0;
+
+	while (len > 1) {
+		sum += (data[0] << 8) | data[1];
+		data += 2;
+		len -= 2;
+	}
+
+	if (len == 1)
+		sum += data[0] << 8;
+
+	while (sum >> 16)
+		sum = (sum & 0xFFFF) + (sum >> 16);
+
+	return ~sum & 0xFFFF;
+}
+
+static int poll_err(int fd)
+{
+	struct pollfd pfd;
+
+	memset(&pfd, 0, sizeof(pfd));
+	pfd.fd = fd;
+
+	if (poll(&pfd, 1, 5000) != 1 || pfd.revents != POLLERR)
+		return -1;
+
+	return 0;
+}
+
+static void set_addr(struct sockaddr_inet *addr, int domain,
+		     unsigned short port)
+{
+	memset(addr, 0, sizeof(*addr));
+
+	switch (domain) {
+	case AF_INET:
+		addr->v4.sin_family = AF_INET;
+		addr->v4.sin_port = htons(port);
+		addr->v4.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
+		addr->len = sizeof(addr->v4);
+		break;
+	case AF_INET6:
+		addr->v6.sin6_family = AF_INET6;
+		addr->v6.sin6_port = htons(port);
+		addr->v6.sin6_addr = in6addr_loopback;
+		addr->len = sizeof(addr->v6);
+		break;
+	}
+}
+
+static int bind_and_setsockopt(int fd, const struct ip_case_info *info)
+{
+	struct sockaddr_inet addr;
+	int opt = 1;
+
+	set_addr(&addr, info->domain, src_port);
+
+	if (setsockopt(fd, info->level, info->opt1, &opt, sizeof(opt)) < 0)
+		return -1;
+
+	if (setsockopt(fd, info->level, info->opt2, &opt, sizeof(opt)) < 0)
+		return -1;
+
+	return bind(fd, &addr.sa, addr.len);
+}
+
+static int build_rfc4884_ext(uint8_t *buf, size_t buflen, bool bad_csum,
+			     bool bad_len, bool smaller_len)
+{
+	struct icmp_extobj_hdr *objh;
+	struct icmp_ext_hdr *exthdr;
+	size_t obj_len, ext_len;
+	uint16_t sum;
+
+	/* Use an object payload of 4 bytes */
+	obj_len = sizeof(*objh) + sizeof(uint32_t);
+	ext_len = sizeof(*exthdr) + obj_len;
+
+	if (ext_len > buflen)
+		return -EINVAL;
+
+	exthdr = (struct icmp_ext_hdr *)buf;
+	objh = (struct icmp_extobj_hdr *)(buf + sizeof(*exthdr));
+
+	exthdr->version = 2;
+	/* When encoding a bad object length, either encode a length too small
+	 * to fit the object header or too big to fit in the packet.
+	 */
+	if (bad_len)
+		obj_len = smaller_len ? sizeof(*objh) - 1 : obj_len * 2;
+	objh->length = htons(obj_len);
+
+	sum = csum(buf, ext_len);
+	exthdr->checksum = htons(bad_csum ? sum - 1 : sum);
+
+	return ext_len;
+}
+
+static int build_orig_dgram_v4(uint8_t *buf, ssize_t buflen, int payload_len)
+{
+	struct udphdr *udph;
+	struct iphdr *iph;
+	size_t len = 0;
+
+	len = sizeof(*iph) + sizeof(*udph) + payload_len;
+	if (len > buflen)
+		return -EINVAL;
+
+	iph = (struct iphdr *)buf;
+	udph = (struct udphdr *)(buf + sizeof(*iph));
+
+	iph->version = 4;
+	iph->ihl = 5;
+	iph->protocol = IPPROTO_UDP;
+	iph->saddr = htonl(INADDR_LOOPBACK);
+	iph->daddr = htonl(INADDR_LOOPBACK);
+	iph->tot_len = htons(len);
+	iph->check = htons(csum(iph, sizeof(*iph)));
+
+	udph->source = htons(src_port);
+	udph->dest = htons(dst_port);
+	udph->len = htons(sizeof(*udph) + payload_len);
+
+	memset(buf + sizeof(*iph) + sizeof(*udph), orig_payload_byte,
+	       payload_len);
+
+	return len;
+}
+
+static int build_orig_dgram_v6(uint8_t *buf, ssize_t buflen, int payload_len)
+{
+	struct udphdr *udph;
+	struct ipv6hdr *iph;
+	size_t len = 0;
+
+	len = sizeof(*iph) + sizeof(*udph) + payload_len;
+	if (len > buflen)
+		return -EINVAL;
+
+	iph = (struct ipv6hdr *)buf;
+	udph = (struct udphdr *)(buf + sizeof(*iph));
+
+	iph->version = 6;
+	iph->payload_len = htons(sizeof(*udph) + payload_len);
+	iph->nexthdr = IPPROTO_UDP;
+	iph->saddr = in6addr_loopback;
+	iph->daddr = in6addr_loopback;
+
+	udph->source = htons(src_port);
+	udph->dest = htons(dst_port);
+	udph->len = htons(sizeof(*udph) + payload_len);
+
+	memset(buf + sizeof(*iph) + sizeof(*udph), orig_payload_byte,
+	       payload_len);
+
+	return len;
+}
+
+static int build_icmpv4_pkt(uint8_t *buf, ssize_t buflen, bool with_ext,
+			    int payload_len, bool bad_csum, bool bad_len,
+			    bool smaller_len)
+{
+	struct icmphdr *icmph;
+	int len, ret;
+
+	len = sizeof(*icmph);
+	memset(buf, 0, buflen);
+
+	icmph = (struct icmphdr *)buf;
+	icmph->type = ICMP_DEST_UNREACH;
+	icmph->code = ICMP_PORT_UNREACH;
+	icmph->checksum = 0;
+
+	ret = build_orig_dgram_v4(buf + len, buflen - len, payload_len);
+	if (ret < 0)
+		return ret;
+
+	len += ret;
+
+	icmph->un.reserved[1] = (len - sizeof(*icmph)) / sizeof(uint32_t);
+
+	if (with_ext) {
+		ret = build_rfc4884_ext(buf + len, buflen - len,
+					bad_csum, bad_len, smaller_len);
+		if (ret < 0)
+			return ret;
+
+		len += ret;
+	}
+
+	icmph->checksum = htons(csum(icmph, len));
+	return len;
+}
+
+static int build_icmpv6_pkt(uint8_t *buf, ssize_t buflen, bool with_ext,
+			    int payload_len, bool bad_csum, bool bad_len,
+			    bool smaller_len)
+{
+	struct icmp6hdr *icmph;
+	int len, ret;
+
+	len = sizeof(*icmph);
+	memset(buf, 0, buflen);
+
+	icmph = (struct icmp6hdr *)buf;
+	icmph->icmp6_type = ICMPV6_DEST_UNREACH;
+	icmph->icmp6_code = ICMPV6_PORT_UNREACH;
+	icmph->icmp6_cksum = 0;
+
+	ret = build_orig_dgram_v6(buf + len, buflen - len, payload_len);
+	if (ret < 0)
+		return ret;
+
+	len += ret;
+
+	icmph->icmp6_datagram_len = (len - sizeof(*icmph)) / sizeof(uint64_t);
+
+	if (with_ext) {
+		ret = build_rfc4884_ext(buf + len, buflen - len,
+					bad_csum, bad_len, smaller_len);
+		if (ret < 0)
+			return ret;
+
+		len += ret;
+	}
+
+	icmph->icmp6_cksum = htons(csum(icmph, len));
+	return len;
+}
+
+FIXTURE(rfc4884) {};
+
+FIXTURE_SETUP(rfc4884)
+{
+	int ret;
+
+	ret = unshare(CLONE_NEWNET);
+	ASSERT_EQ(ret, 0) {
+		TH_LOG("unshare(CLONE_NEWNET) failed: %s", strerror(errno));
+	}
+
+	ret = bringup_loopback();
+	ASSERT_EQ(ret, 0) TH_LOG("Failed to bring up loopback interface");
+}
+
+FIXTURE_TEARDOWN(rfc4884)
+{
+}
+
+const struct ip_case_info ipv4_info = {
+	.domain		= AF_INET,
+	.level		= SOL_IP,
+	.opt1		= IP_RECVERR,
+	.opt2		= IP_RECVERR_RFC4884,
+	.proto		= IPPROTO_ICMP,
+	.build_func	= build_icmpv4_pkt,
+	.min_payload	= min_payload_len_v4,
+};
+
+const struct ip_case_info ipv6_info = {
+	.domain		= AF_INET6,
+	.level		= SOL_IPV6,
+	.opt1		= IPV6_RECVERR,
+	.opt2		= IPV6_RECVERR_RFC4884,
+	.proto		= IPPROTO_ICMPV6,
+	.build_func	= build_icmpv6_pkt,
+	.min_payload	= min_payload_len_v6,
+};
+
+FIXTURE_VARIANT(rfc4884) {
+	/* IPv4/v6 related information */
+	struct ip_case_info	info;
+	/* Whether to append an ICMP extension or not */
+	bool			with_ext;
+	/* UDP payload length */
+	int			payload_len;
+	/* Whether to generate a bad checksum in the ICMP extension structure */
+	bool			bad_csum;
+	/* Whether to generate a bad length in the ICMP object header */
+	bool			bad_len;
+	/* Whether it is too small to fit the object header or too big to fit
+	 * in the packet
+	 */
+	bool			smaller_len;
+};
+
+/* Tests that a valid ICMPv4 error message with extension and the original
+ * datagram is smaller than 128 bytes, generates an error with zero offset,
+ * and does not raise the SO_EE_RFC4884_FLAG_INVALID flag.
+ */
+FIXTURE_VARIANT_ADD(rfc4884, ipv4_ext_small_payload) {
+	.info		= ipv4_info,
+	.with_ext	= true,
+	.payload_len	= 64,
+	.bad_csum	= false,
+	.bad_len	= false,
+};
+
+/* Tests that a valid ICMPv4 error message with extension and 128 bytes original
+ * datagram, generates an error with the expected offset, and does not raise the
+ * SO_EE_RFC4884_FLAG_INVALID flag.
+ */
+FIXTURE_VARIANT_ADD(rfc4884, ipv4_ext) {
+	.info		= ipv4_info,
+	.with_ext	= true,
+	.payload_len	= min_payload_len_v4,
+	.bad_csum	= false,
+	.bad_len	= false,
+};
+
+/* Tests that a valid ICMPv4 error message with extension and the original
+ * datagram is larger than 128 bytes, generates an error with the expected
+ * offset, and does not raise the SO_EE_RFC4884_FLAG_INVALID flag.
+ */
+FIXTURE_VARIANT_ADD(rfc4884, ipv4_ext_large_payload) {
+	.info		= ipv4_info,
+	.with_ext	= true,
+	.payload_len	= 256,
+	.bad_csum	= false,
+	.bad_len	= false,
+};
+
+/* Tests that a valid ICMPv4 error message without extension and the original
+ * datagram is smaller than 128 bytes, generates an error with zero offset,
+ * and does not raise the SO_EE_RFC4884_FLAG_INVALID flag.
+ */
+FIXTURE_VARIANT_ADD(rfc4884, ipv4_no_ext_small_payload) {
+	.info		= ipv4_info,
+	.with_ext	= false,
+	.payload_len	= 64,
+	.bad_csum	= false,
+	.bad_len	= false,
+};
+
+/* Tests that a valid ICMPv4 error message without extension and 128 bytes
+ * original datagram, generates an error with zero offset, and does not raise
+ * the SO_EE_RFC4884_FLAG_INVALID flag.
+ */
+FIXTURE_VARIANT_ADD(rfc4884, ipv4_no_ext_min_payload) {
+	.info		= ipv4_info,
+	.with_ext	= false,
+	.payload_len	= min_payload_len_v4,
+	.bad_csum	= false,
+	.bad_len	= false,
+};
+
+/* Tests that a valid ICMPv4 error message without extension and the original
+ * datagram is larger than 128 bytes, generates an error with zero offset,
+ * and does not raise the SO_EE_RFC4884_FLAG_INVALID flag.
+ */
+FIXTURE_VARIANT_ADD(rfc4884, ipv4_no_ext_large_payload) {
+	.info		= ipv4_info,
+	.with_ext	= false,
+	.payload_len	= 256,
+	.bad_csum	= false,
+	.bad_len	= false,
+};
+
+/* Tests that an ICMPv4 error message with extension and an invalid checksum,
+ * generates an error with the expected offset, and raises the
+ * SO_EE_RFC4884_FLAG_INVALID flag.
+ */
+FIXTURE_VARIANT_ADD(rfc4884, ipv4_invalid_ext_checksum) {
+	.info		= ipv4_info,
+	.with_ext	= true,
+	.payload_len	= min_payload_len_v4,
+	.bad_csum	= true,
+	.bad_len	= false,
+};
+
+/* Tests that an ICMPv4 error message with extension and an object length
+ * smaller than the object header, generates an error with the expected offset,
+ * and raises the SO_EE_RFC4884_FLAG_INVALID flag.
+ */
+FIXTURE_VARIANT_ADD(rfc4884, ipv4_invalid_ext_length_small) {
+	.info		= ipv4_info,
+	.with_ext	= true,
+	.payload_len	= min_payload_len_v4,
+	.bad_csum	= false,
+	.bad_len	= true,
+	.smaller_len	= true,
+};
+
+/* Tests that an ICMPv4 error message with extension and an object length that
+ * is too big to fit in the packet, generates an error with the expected offset,
+ * and raises the SO_EE_RFC4884_FLAG_INVALID flag.
+ */
+FIXTURE_VARIANT_ADD(rfc4884, ipv4_invalid_ext_length_large) {
+	.info		= ipv4_info,
+	.with_ext	= true,
+	.payload_len	= min_payload_len_v4,
+	.bad_csum	= false,
+	.bad_len	= true,
+	.smaller_len	= false,
+};
+
+/* Tests that a valid ICMPv6 error message with extension and the original
+ * datagram is smaller than 128 bytes, generates an error with zero offset,
+ * and does not raise the SO_EE_RFC4884_FLAG_INVALID flag.
+ */
+FIXTURE_VARIANT_ADD(rfc4884, ipv6_ext_small_payload) {
+	.info		= ipv6_info,
+	.with_ext	= true,
+	.payload_len	= 64,
+	.bad_csum	= false,
+	.bad_len	= false,
+};
+
+/* Tests that a valid ICMPv6 error message with extension and 128 bytes original
+ * datagram, generates an error with the expected offset, and does not raise the
+ * SO_EE_RFC4884_FLAG_INVALID flag.
+ */
+FIXTURE_VARIANT_ADD(rfc4884, ipv6_ext) {
+	.info		= ipv6_info,
+	.with_ext	= true,
+	.payload_len	= min_payload_len_v6,
+	.bad_csum	= false,
+	.bad_len	= false,
+};
+
+/* Tests that a valid ICMPv6 error message with extension and the original
+ * datagram is larger than 128 bytes, generates an error with the expected
+ * offset, and does not raise the SO_EE_RFC4884_FLAG_INVALID flag.
+ */
+FIXTURE_VARIANT_ADD(rfc4884, ipv6_ext_large_payload) {
+	.info		= ipv6_info,
+	.with_ext	= true,
+	.payload_len	= 256,
+	.bad_csum	= false,
+	.bad_len	= false,
+};
+/* Tests that a valid ICMPv6 error message without extension and the original
+ * datagram is smaller than 128 bytes, generates an error with zero offset,
+ * and does not raise the SO_EE_RFC4884_FLAG_INVALID flag.
+ */
+FIXTURE_VARIANT_ADD(rfc4884, ipv6_no_ext_small_payload) {
+	.info		= ipv6_info,
+	.with_ext	= false,
+	.payload_len	= 64,
+	.bad_csum	= false,
+	.bad_len	= false,
+};
+
+/* Tests that a valid ICMPv6 error message without extension and 128 bytes
+ * original datagram, generates an error with zero offset, and does not
+ * raise the SO_EE_RFC4884_FLAG_INVALID flag.
+ */
+FIXTURE_VARIANT_ADD(rfc4884, ipv6_no_ext_min_payload) {
+	.info		= ipv6_info,
+	.with_ext	= false,
+	.payload_len	= min_payload_len_v6,
+	.bad_csum	= false,
+	.bad_len	= false,
+};
+
+/* Tests that a valid ICMPv6 error message without extension and the original
+ * datagram is larger than 128 bytes, generates an error with zero offset,
+ * and does not raise the SO_EE_RFC4884_FLAG_INVALID flag.
+ */
+FIXTURE_VARIANT_ADD(rfc4884, ipv6_no_ext_large_payload) {
+	.info		= ipv6_info,
+	.with_ext	= false,
+	.payload_len	= 256,
+	.bad_csum	= false,
+	.bad_len	= false,
+};
+
+/* Tests that an ICMPv6 error message with extension and an invalid checksum,
+ * generates an error with the expected offset, and raises the
+ * SO_EE_RFC4884_FLAG_INVALID flag.
+ */
+FIXTURE_VARIANT_ADD(rfc4884, ipv6_invalid_ext_checksum) {
+	.info		= ipv6_info,
+	.with_ext	= true,
+	.payload_len	= min_payload_len_v6,
+	.bad_csum	= true,
+	.bad_len	= false,
+};
+
+/* Tests that an ICMPv6 error message with extension and an object length
+ * smaller than the object header, generates an error with the expected offset,
+ * and raises the SO_EE_RFC4884_FLAG_INVALID flag.
+ */
+FIXTURE_VARIANT_ADD(rfc4884, ipv6_invalid_ext_length_small) {
+	.info		= ipv6_info,
+	.with_ext	= true,
+	.payload_len	= min_payload_len_v6,
+	.bad_csum	= false,
+	.bad_len	= true,
+	.smaller_len	= true,
+};
+
+/* Tests that an ICMPv6 error message with extension and an object length that
+ * is too big to fit in the packet, generates an error with the expected offset,
+ * and raises the SO_EE_RFC4884_FLAG_INVALID flag.
+ */
+FIXTURE_VARIANT_ADD(rfc4884, ipv6_invalid_ext_length_large) {
+	.info		= ipv6_info,
+	.with_ext	= true,
+	.payload_len	= min_payload_len_v6,
+	.bad_csum	= false,
+	.bad_len	= true,
+	.smaller_len	= false,
+};
+
+static void
+check_rfc4884_offset(struct __test_metadata *_metadata, int sock,
+		     const FIXTURE_VARIANT(rfc4884) *v)
+{
+	char rxbuf[1024];
+	char ctrl[1024];
+	struct iovec iov = {
+		.iov_base = rxbuf,
+		.iov_len = sizeof(rxbuf)
+	};
+	struct msghdr msg = {
+		.msg_iov = &iov,
+		.msg_iovlen = 1,
+		.msg_control = ctrl,
+		.msg_controllen = sizeof(ctrl),
+	};
+	struct cmsghdr *cmsg;
+	int recv;
+
+	ASSERT_EQ(poll_err(sock), 0);
+
+	recv = recvmsg(sock, &msg, MSG_ERRQUEUE);
+	ASSERT_GE(recv, 0) TH_LOG("recvmsg(MSG_ERRQUEUE) failed");
+
+	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
+		bool is_invalid, expected_invalid;
+		struct sock_extended_err *ee;
+		int expected_off;
+		uint16_t off;
+
+		if (cmsg->cmsg_level != v->info.level ||
+		    cmsg->cmsg_type != v->info.opt1) {
+			TH_LOG("Unrelated cmsgs were encountered in recvmsg()");
+			continue;
+		}
+
+		ee = (struct sock_extended_err *)CMSG_DATA(cmsg);
+		off = ee->ee_rfc4884.len;
+		is_invalid = ee->ee_rfc4884.flags & SO_EE_RFC4884_FLAG_INVALID;
+
+		expected_invalid = v->bad_csum || v->bad_len;
+		ASSERT_EQ(is_invalid, expected_invalid) {
+			TH_LOG("Expected invalidity flag to be %d, but got %d",
+			       expected_invalid, is_invalid);
+		}
+
+		expected_off =
+			(v->with_ext && v->payload_len >= v->info.min_payload) ?
+			v->payload_len : 0;
+		ASSERT_EQ(off, expected_off) {
+			TH_LOG("Expected RFC4884 offset %u, got %u",
+			       expected_off, off);
+		}
+		break;
+	}
+}
+
+TEST_F(rfc4884, rfc4884)
+{
+	const typeof(variant) v = variant;
+	struct sockaddr_inet addr;
+	uint8_t pkt[1024];
+	int dgram, raw;
+	int len, sent;
+	int err;
+
+	dgram = socket(v->info.domain, SOCK_DGRAM, 0);
+	ASSERT_GE(dgram, 0) TH_LOG("Opening datagram socket failed");
+
+	err = bind_and_setsockopt(dgram, &v->info);
+	ASSERT_EQ(err, 0) TH_LOG("Bind failed");
+
+	raw = socket(v->info.domain, SOCK_RAW, v->info.proto);
+	ASSERT_GE(raw, 0) TH_LOG("Opening raw socket failed");
+
+	len = v->info.build_func(pkt, sizeof(pkt), v->with_ext, v->payload_len,
+				 v->bad_csum, v->bad_len, v->smaller_len);
+	ASSERT_GT(len, 0) TH_LOG("Building packet failed");
+
+	set_addr(&addr, v->info.domain, 0);
+	sent = sendto(raw, pkt, len, 0, &addr.sa, addr.len);
+	ASSERT_EQ(len, sent) TH_LOG("Sending packet failed");
+
+	check_rfc4884_offset(_metadata, dgram, v);
+
+	close(dgram);
+	close(raw);
+}
+
+TEST_HARNESS_MAIN
-- 
cgit v1.2.3


From caf84294ff98bb7455722285f30f46c193ffccdd Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Fri, 16 Jan 2026 22:18:48 +0800
Subject: selftests: ublk: fix user_data truncation for tgt_data >= 256

The build_user_data() function packs multiple fields into a __u64
value using bit shifts. Without explicit __u64 casts before shifting,
the shift operations are performed on 32-bit unsigned integers before
being promoted to 64-bit, causing data loss.

Specifically, when tgt_data >= 256, the expression (tgt_data << 24)
shifts on a 32-bit value, truncating the upper 8 bits before promotion
to __u64. Since tgt_data can be up to 16 bits (assertion allows up to
65535), values >= 256 would have their high byte lost.

Add explicit __u64 casts to both op and tgt_data before shifting to
ensure the shift operations happen in 64-bit space, preserving all
bits of the input values.

user_data_to_tgt_data() is only used by stripe.c, in which the max
supported member disks are 4, so won't trigger this issue.

Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 tools/testing/selftests/ublk/kublk.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h
index cb757fd9bf9d..69fd5794f300 100644
--- a/tools/testing/selftests/ublk/kublk.h
+++ b/tools/testing/selftests/ublk/kublk.h
@@ -262,7 +262,7 @@ static inline __u64 build_user_data(unsigned tag, unsigned op,
 	_Static_assert(UBLK_MAX_QUEUES_SHIFT <= 7);
 	assert(!(tag >> 16) && !(op >> 8) && !(tgt_data >> 16) && !(q_id >> 7));
 
-	return tag | (op << 16) | (tgt_data << 24) |
+	return tag | ((__u64)op << 16) | ((__u64)tgt_data << 24) |
 		(__u64)q_id << 56 | (__u64)is_target_io << 63;
 }
 
-- 
cgit v1.2.3


From 584709ad5ce359f8b5773eb6af40070412652c51 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Fri, 16 Jan 2026 22:18:49 +0800
Subject: selftests: ublk: replace assert() with ublk_assert()

Replace assert() with ublk_assert() since it is often triggered in daemon,
and we may get nothing shown in terminal.

Add ublk_assert(), so we can log something to syslog when assert() is
triggered.

Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 tools/testing/selftests/ublk/common.c      |  2 +-
 tools/testing/selftests/ublk/file_backed.c |  2 +-
 tools/testing/selftests/ublk/kublk.c       |  2 +-
 tools/testing/selftests/ublk/kublk.h       |  2 +-
 tools/testing/selftests/ublk/stripe.c      | 10 +++++-----
 tools/testing/selftests/ublk/utils.h       | 10 ++++++++++
 6 files changed, 19 insertions(+), 9 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/ublk/common.c b/tools/testing/selftests/ublk/common.c
index d9873d4d50d0..530f9877c9dd 100644
--- a/tools/testing/selftests/ublk/common.c
+++ b/tools/testing/selftests/ublk/common.c
@@ -16,7 +16,7 @@ int backing_file_tgt_init(struct ublk_dev *dev, unsigned int nr_direct)
 {
 	int fd, i;
 
-	assert(dev->nr_fds == 1);
+	ublk_assert(dev->nr_fds == 1);
 
 	for (i = 0; i < dev->tgt.nr_backing_files; i++) {
 		char *file = dev->tgt.backing_file[i];
diff --git a/tools/testing/selftests/ublk/file_backed.c b/tools/testing/selftests/ublk/file_backed.c
index c3ce5ff72422..889047bd8fa3 100644
--- a/tools/testing/selftests/ublk/file_backed.c
+++ b/tools/testing/selftests/ublk/file_backed.c
@@ -10,7 +10,7 @@ static enum io_uring_op ublk_to_uring_op(const struct ublksrv_io_desc *iod, int
 		return zc ? IORING_OP_READ_FIXED : IORING_OP_READ;
 	else if (ublk_op == UBLK_IO_OP_WRITE)
 		return zc ? IORING_OP_WRITE_FIXED : IORING_OP_WRITE;
-	assert(0);
+	ublk_assert(0);
 }
 
 static int loop_queue_flush_io(struct ublk_thread *t, struct ublk_queue *q,
diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c
index 3472ce7426ba..e98999bea9b1 100644
--- a/tools/testing/selftests/ublk/kublk.c
+++ b/tools/testing/selftests/ublk/kublk.c
@@ -825,7 +825,7 @@ static void ublk_handle_uring_cmd(struct ublk_thread *t,
 	}
 
 	if (cqe->res == UBLK_IO_RES_OK) {
-		assert(tag < q->q_depth);
+		ublk_assert(tag < q->q_depth);
 
 		if (ublk_queue_use_user_copy(q))
 			ublk_user_copy(io, UBLK_IO_OP_WRITE);
diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h
index 69fd5794f300..48634d29c084 100644
--- a/tools/testing/selftests/ublk/kublk.h
+++ b/tools/testing/selftests/ublk/kublk.h
@@ -260,7 +260,7 @@ static inline __u64 build_user_data(unsigned tag, unsigned op,
 {
 	/* we only have 7 bits to encode q_id */
 	_Static_assert(UBLK_MAX_QUEUES_SHIFT <= 7);
-	assert(!(tag >> 16) && !(op >> 8) && !(tgt_data >> 16) && !(q_id >> 7));
+	ublk_assert(!(tag >> 16) && !(op >> 8) && !(tgt_data >> 16) && !(q_id >> 7));
 
 	return tag | ((__u64)op << 16) | ((__u64)tgt_data << 24) |
 		(__u64)q_id << 56 | (__u64)is_target_io << 63;
diff --git a/tools/testing/selftests/ublk/stripe.c b/tools/testing/selftests/ublk/stripe.c
index 2be1c36438e7..b967447fe591 100644
--- a/tools/testing/selftests/ublk/stripe.c
+++ b/tools/testing/selftests/ublk/stripe.c
@@ -96,12 +96,12 @@ static void calculate_stripe_array(const struct stripe_conf *conf,
 			this->seq = seq;
 			s->nr += 1;
 		} else {
-			assert(seq == this->seq);
-			assert(this->start + this->nr_sects == stripe_off);
+			ublk_assert(seq == this->seq);
+			ublk_assert(this->start + this->nr_sects == stripe_off);
 			this->nr_sects += nr_sects;
 		}
 
-		assert(this->nr_vec < this->cap);
+		ublk_assert(this->nr_vec < this->cap);
 		this->vec[this->nr_vec].iov_base = (void *)(base + done);
 		this->vec[this->nr_vec++].iov_len = nr_sects << 9;
 
@@ -120,7 +120,7 @@ static inline enum io_uring_op stripe_to_uring_op(
 		return zc ? IORING_OP_READV_FIXED : IORING_OP_READV;
 	else if (ublk_op == UBLK_IO_OP_WRITE)
 		return zc ? IORING_OP_WRITEV_FIXED : IORING_OP_WRITEV;
-	assert(0);
+	ublk_assert(0);
 }
 
 static int stripe_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q,
@@ -322,7 +322,7 @@ static int ublk_stripe_tgt_init(const struct dev_ctx *ctx, struct ublk_dev *dev)
 	if (!dev->tgt.nr_backing_files || dev->tgt.nr_backing_files > NR_STRIPE)
 		return -EINVAL;
 
-	assert(dev->nr_fds == dev->tgt.nr_backing_files + 1);
+	ublk_assert(dev->nr_fds == dev->tgt.nr_backing_files + 1);
 
 	for (i = 0; i < dev->tgt.nr_backing_files; i++)
 		dev->tgt.backing_file_size[i] &= ~((1 << chunk_shift) - 1);
diff --git a/tools/testing/selftests/ublk/utils.h b/tools/testing/selftests/ublk/utils.h
index a852e0b7153e..17eefed73690 100644
--- a/tools/testing/selftests/ublk/utils.h
+++ b/tools/testing/selftests/ublk/utils.h
@@ -43,6 +43,7 @@ static inline void ublk_err(const char *fmt, ...)
 
 	va_start(ap, fmt);
 	vfprintf(stderr, fmt, ap);
+	va_end(ap);
 }
 
 static inline void ublk_log(const char *fmt, ...)
@@ -52,6 +53,7 @@ static inline void ublk_log(const char *fmt, ...)
 
 		va_start(ap, fmt);
 		vfprintf(stdout, fmt, ap);
+		va_end(ap);
 	}
 }
 
@@ -62,7 +64,15 @@ static inline void ublk_dbg(int level, const char *fmt, ...)
 
 		va_start(ap, fmt);
 		vfprintf(stdout, fmt, ap);
+		va_end(ap);
 	}
 }
 
+#define ublk_assert(x)  do { \
+	if (!(x)) {     \
+		ublk_err("%s %d: assert!\n", __func__, __LINE__); \
+		assert(x);      \
+	}       \
+} while (0)
+
 #endif
-- 
cgit v1.2.3


From f1d621b5a04ea41ee90f177db084d00db57e6839 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Fri, 16 Jan 2026 22:18:50 +0800
Subject: selftests: ublk: add ublk_io_buf_idx() for returning io buffer index

Since UBLK_F_PER_IO_DAEMON is added, io buffer index may depend on current
thread because the common way is to use per-pthread io_ring_ctx for issuing
ublk uring_cmd.

Add one helper for returning io buffer index, so we can hide the buffer
index implementation details for target code.

Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 tools/testing/selftests/ublk/file_backed.c |  9 +++++----
 tools/testing/selftests/ublk/kublk.c       |  9 +++++----
 tools/testing/selftests/ublk/kublk.h       | 10 +++++++++-
 tools/testing/selftests/ublk/null.c        | 18 ++++++++++--------
 tools/testing/selftests/ublk/stripe.c      |  7 ++++---
 5 files changed, 33 insertions(+), 20 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/ublk/file_backed.c b/tools/testing/selftests/ublk/file_backed.c
index 889047bd8fa3..228af2580ac6 100644
--- a/tools/testing/selftests/ublk/file_backed.c
+++ b/tools/testing/selftests/ublk/file_backed.c
@@ -39,6 +39,7 @@ static int loop_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q,
 	__u32 len = iod->nr_sectors << 9;
 	struct io_uring_sqe *sqe[3];
 	void *addr = io->buf_addr;
+	unsigned short buf_index = ublk_io_buf_idx(t, q, tag);
 
 	if (iod->op_flags & UBLK_IO_F_INTEGRITY) {
 		ublk_io_alloc_sqes(t, sqe, 1);
@@ -62,7 +63,7 @@ static int loop_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q,
 				len,
 				offset);
 		if (auto_zc)
-			sqe[0]->buf_index = tag;
+			sqe[0]->buf_index = buf_index;
 		io_uring_sqe_set_flags(sqe[0], IOSQE_FIXED_FILE);
 		/* bit63 marks us as tgt io */
 		sqe[0]->user_data = build_user_data(tag, ublk_op, 0, q->q_id, 1);
@@ -71,7 +72,7 @@ static int loop_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q,
 
 	ublk_io_alloc_sqes(t, sqe, 3);
 
-	io_uring_prep_buf_register(sqe[0], q, tag, q->q_id, io->buf_index);
+	io_uring_prep_buf_register(sqe[0], q, tag, q->q_id, buf_index);
 	sqe[0]->flags |= IOSQE_CQE_SKIP_SUCCESS | IOSQE_IO_HARDLINK;
 	sqe[0]->user_data = build_user_data(tag,
 			ublk_cmd_op_nr(sqe[0]->cmd_op), 0, q->q_id, 1);
@@ -79,11 +80,11 @@ static int loop_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q,
 	io_uring_prep_rw(op, sqe[1], ublk_get_registered_fd(q, 1) /*fds[1]*/, 0,
 			len,
 			offset);
-	sqe[1]->buf_index = tag;
+	sqe[1]->buf_index = buf_index;
 	sqe[1]->flags |= IOSQE_FIXED_FILE | IOSQE_IO_HARDLINK;
 	sqe[1]->user_data = build_user_data(tag, ublk_op, 0, q->q_id, 1);
 
-	io_uring_prep_buf_unregister(sqe[2], q, tag, q->q_id, io->buf_index);
+	io_uring_prep_buf_unregister(sqe[2], q, tag, q->q_id, buf_index);
 	sqe[2]->user_data = build_user_data(tag, ublk_cmd_op_nr(sqe[2]->cmd_op), 0, q->q_id, 1);
 
 	return !!(iod->op_flags & UBLK_IO_F_INTEGRITY) + 2;
diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c
index e98999bea9b1..9b6f1cd04dc4 100644
--- a/tools/testing/selftests/ublk/kublk.c
+++ b/tools/testing/selftests/ublk/kublk.c
@@ -605,16 +605,17 @@ static void ublk_dev_unprep(struct ublk_dev *dev)
 	close(dev->fds[0]);
 }
 
-static void ublk_set_auto_buf_reg(const struct ublk_queue *q,
+static void ublk_set_auto_buf_reg(const struct ublk_thread *t,
+				  const struct ublk_queue *q,
 				  struct io_uring_sqe *sqe,
 				  unsigned short tag)
 {
 	struct ublk_auto_buf_reg buf = {};
 
 	if (q->tgt_ops->buf_index)
-		buf.index = q->tgt_ops->buf_index(q, tag);
+		buf.index = q->tgt_ops->buf_index(t, q, tag);
 	else
-		buf.index = q->ios[tag].buf_index;
+		buf.index = ublk_io_buf_idx(t, q, tag);
 
 	if (ublk_queue_auto_zc_fallback(q))
 		buf.flags = UBLK_AUTO_BUF_REG_FALLBACK;
@@ -730,7 +731,7 @@ int ublk_queue_io_cmd(struct ublk_thread *t, struct ublk_io *io)
 		cmd->addr	= 0;
 
 	if (ublk_queue_use_auto_zc(q))
-		ublk_set_auto_buf_reg(q, sqe[0], io->tag);
+		ublk_set_auto_buf_reg(t, q, sqe[0], io->tag);
 
 	user_data = build_user_data(io->tag, _IOC_NR(cmd_op), 0, q->q_id, 0);
 	io_uring_sqe_set_data64(sqe[0], user_data);
diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h
index 48634d29c084..311a75da9b21 100644
--- a/tools/testing/selftests/ublk/kublk.h
+++ b/tools/testing/selftests/ublk/kublk.h
@@ -150,7 +150,8 @@ struct ublk_tgt_ops {
 	void (*usage)(const struct ublk_tgt_ops *ops);
 
 	/* return buffer index for UBLK_F_AUTO_BUF_REG */
-	unsigned short (*buf_index)(const struct ublk_queue *, int tag);
+	unsigned short (*buf_index)(const struct ublk_thread *t,
+			const struct ublk_queue *, int tag);
 };
 
 struct ublk_tgt {
@@ -393,6 +394,13 @@ static inline void ublk_set_sqe_cmd_op(struct io_uring_sqe *sqe, __u32 cmd_op)
 	addr[1] = 0;
 }
 
+static inline unsigned short ublk_io_buf_idx(const struct ublk_thread *t,
+					     const struct ublk_queue *q,
+					     unsigned tag)
+{
+	return q->ios[tag].buf_index;
+}
+
 static inline struct ublk_io *ublk_get_io(struct ublk_queue *q, unsigned tag)
 {
 	return &q->ios[tag];
diff --git a/tools/testing/selftests/ublk/null.c b/tools/testing/selftests/ublk/null.c
index 3aa162f08476..7656888f4149 100644
--- a/tools/testing/selftests/ublk/null.c
+++ b/tools/testing/selftests/ublk/null.c
@@ -44,12 +44,12 @@ static int ublk_null_tgt_init(const struct dev_ctx *ctx, struct ublk_dev *dev)
 }
 
 static void __setup_nop_io(int tag, const struct ublksrv_io_desc *iod,
-		struct io_uring_sqe *sqe, int q_id)
+		struct io_uring_sqe *sqe, int q_id, unsigned buf_idx)
 {
 	unsigned ublk_op = ublksrv_get_op(iod);
 
 	io_uring_prep_nop(sqe);
-	sqe->buf_index = tag;
+	sqe->buf_index = buf_idx;
 	sqe->flags |= IOSQE_FIXED_FILE;
 	sqe->rw_flags = IORING_NOP_FIXED_BUFFER | IORING_NOP_INJECT_RESULT;
 	sqe->len = iod->nr_sectors << 9; 	/* injected result */
@@ -61,18 +61,19 @@ static int null_queue_zc_io(struct ublk_thread *t, struct ublk_queue *q,
 {
 	const struct ublksrv_io_desc *iod = ublk_get_iod(q, tag);
 	struct io_uring_sqe *sqe[3];
+	unsigned short buf_idx = ublk_io_buf_idx(t, q, tag);
 
 	ublk_io_alloc_sqes(t, sqe, 3);
 
-	io_uring_prep_buf_register(sqe[0], q, tag, q->q_id, ublk_get_io(q, tag)->buf_index);
+	io_uring_prep_buf_register(sqe[0], q, tag, q->q_id, buf_idx);
 	sqe[0]->user_data = build_user_data(tag,
 			ublk_cmd_op_nr(sqe[0]->cmd_op), 0, q->q_id, 1);
 	sqe[0]->flags |= IOSQE_CQE_SKIP_SUCCESS | IOSQE_IO_HARDLINK;
 
-	__setup_nop_io(tag, iod, sqe[1], q->q_id);
+	__setup_nop_io(tag, iod, sqe[1], q->q_id, buf_idx);
 	sqe[1]->flags |= IOSQE_IO_HARDLINK;
 
-	io_uring_prep_buf_unregister(sqe[2], q, tag, q->q_id, ublk_get_io(q, tag)->buf_index);
+	io_uring_prep_buf_unregister(sqe[2], q, tag, q->q_id, buf_idx);
 	sqe[2]->user_data = build_user_data(tag, ublk_cmd_op_nr(sqe[2]->cmd_op), 0, q->q_id, 1);
 
 	// buf register is marked as IOSQE_CQE_SKIP_SUCCESS
@@ -86,7 +87,7 @@ static int null_queue_auto_zc_io(struct ublk_thread *t, struct ublk_queue *q,
 	struct io_uring_sqe *sqe[1];
 
 	ublk_io_alloc_sqes(t, sqe, 1);
-	__setup_nop_io(tag, iod, sqe[0], q->q_id);
+	__setup_nop_io(tag, iod, sqe[0], q->q_id, ublk_io_buf_idx(t, q, tag));
 	return 1;
 }
 
@@ -137,11 +138,12 @@ static int ublk_null_queue_io(struct ublk_thread *t, struct ublk_queue *q,
  * return invalid buffer index for triggering auto buffer register failure,
  * then UBLK_IO_RES_NEED_REG_BUF handling is covered
  */
-static unsigned short ublk_null_buf_index(const struct ublk_queue *q, int tag)
+static unsigned short ublk_null_buf_index(const struct ublk_thread *t,
+		const struct ublk_queue *q, int tag)
 {
 	if (ublk_queue_auto_zc_fallback(q))
 		return (unsigned short)-1;
-	return q->ios[tag].buf_index;
+	return ublk_io_buf_idx(t, q, tag);
 }
 
 const struct ublk_tgt_ops null_tgt_ops = {
diff --git a/tools/testing/selftests/ublk/stripe.c b/tools/testing/selftests/ublk/stripe.c
index b967447fe591..dca819f5366e 100644
--- a/tools/testing/selftests/ublk/stripe.c
+++ b/tools/testing/selftests/ublk/stripe.c
@@ -135,6 +135,7 @@ static int stripe_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q,
 	struct ublk_io *io = ublk_get_io(q, tag);
 	int i, extra = zc ? 2 : 0;
 	void *base = io->buf_addr;
+	unsigned short buf_idx = ublk_io_buf_idx(t, q, tag);
 
 	io->private_data = s;
 	calculate_stripe_array(conf, iod, s, base);
@@ -142,7 +143,7 @@ static int stripe_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q,
 	ublk_io_alloc_sqes(t, sqe, s->nr + extra);
 
 	if (zc) {
-		io_uring_prep_buf_register(sqe[0], q, tag, q->q_id, io->buf_index);
+		io_uring_prep_buf_register(sqe[0], q, tag, q->q_id, buf_idx);
 		sqe[0]->flags |= IOSQE_CQE_SKIP_SUCCESS | IOSQE_IO_HARDLINK;
 		sqe[0]->user_data = build_user_data(tag,
 			ublk_cmd_op_nr(sqe[0]->cmd_op), 0, q->q_id, 1);
@@ -158,7 +159,7 @@ static int stripe_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q,
 				t->start << 9);
 		io_uring_sqe_set_flags(sqe[i], IOSQE_FIXED_FILE);
 		if (auto_zc || zc) {
-			sqe[i]->buf_index = tag;
+			sqe[i]->buf_index = buf_idx;
 			if (zc)
 				sqe[i]->flags |= IOSQE_IO_HARDLINK;
 		}
@@ -168,7 +169,7 @@ static int stripe_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q,
 	if (zc) {
 		struct io_uring_sqe *unreg = sqe[s->nr + 1];
 
-		io_uring_prep_buf_unregister(unreg, q, tag, q->q_id, io->buf_index);
+		io_uring_prep_buf_unregister(unreg, q, tag, q->q_id, buf_idx);
 		unreg->user_data = build_user_data(
 			tag, ublk_cmd_op_nr(unreg->cmd_op), 0, q->q_id, 1);
 	}
-- 
cgit v1.2.3


From dccbfa9d416424fbcbc83a46e84c604bad1db9d0 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Fri, 16 Jan 2026 22:18:51 +0800
Subject: selftests: ublk: add batch buffer management infrastructure

Add the foundational infrastructure for UBLK_F_BATCH_IO buffer
management including:

- Allocator utility functions for small sized per-thread allocation
- Batch buffer allocation and deallocation functions
- Buffer index management for commit buffers
- Thread state management for batch I/O mode
- Buffer size calculation based on device features

This prepares the groundwork for handling batch I/O commands by
establishing the buffer management layer needed for UBLK_U_IO_PREP_IO_CMDS
and UBLK_U_IO_COMMIT_IO_CMDS operations.

The allocator uses CPU sets for efficient per-thread buffer tracking,
and commit buffers are pre-allocated with 2 buffers per thread to handle
overlapping command operations.

Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 tools/testing/selftests/ublk/batch.c | 152 +++++++++++++++++++++++++++++++++++
 tools/testing/selftests/ublk/kublk.c |  26 +++++-
 tools/testing/selftests/ublk/kublk.h |  53 ++++++++++++
 tools/testing/selftests/ublk/utils.h |  54 +++++++++++++
 4 files changed, 282 insertions(+), 3 deletions(-)
 create mode 100644 tools/testing/selftests/ublk/batch.c

(limited to 'tools')

diff --git a/tools/testing/selftests/ublk/batch.c b/tools/testing/selftests/ublk/batch.c
new file mode 100644
index 000000000000..609e6073c9c0
--- /dev/null
+++ b/tools/testing/selftests/ublk/batch.c
@@ -0,0 +1,152 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Description: UBLK_F_BATCH_IO buffer management
+ */
+
+#include "kublk.h"
+
+static inline void *ublk_get_commit_buf(struct ublk_thread *t,
+					unsigned short buf_idx)
+{
+	unsigned idx;
+
+	if (buf_idx < t->commit_buf_start ||
+			buf_idx >= t->commit_buf_start + t->nr_commit_buf)
+		return NULL;
+	idx = buf_idx - t->commit_buf_start;
+	return t->commit_buf + idx * t->commit_buf_size;
+}
+
+/*
+ * Allocate one buffer for UBLK_U_IO_PREP_IO_CMDS or UBLK_U_IO_COMMIT_IO_CMDS
+ *
+ * Buffer index is returned.
+ */
+static inline unsigned short ublk_alloc_commit_buf(struct ublk_thread *t)
+{
+	int idx = allocator_get(&t->commit_buf_alloc);
+
+	if (idx >= 0)
+		return  idx + t->commit_buf_start;
+	return UBLKS_T_COMMIT_BUF_INV_IDX;
+}
+
+/*
+ * Free one commit buffer which is used by UBLK_U_IO_PREP_IO_CMDS or
+ * UBLK_U_IO_COMMIT_IO_CMDS
+ */
+static inline void ublk_free_commit_buf(struct ublk_thread *t,
+					 unsigned short i)
+{
+	unsigned short idx = i - t->commit_buf_start;
+
+	ublk_assert(idx < t->nr_commit_buf);
+	ublk_assert(allocator_get_val(&t->commit_buf_alloc, idx) != 0);
+
+	allocator_put(&t->commit_buf_alloc, idx);
+}
+
+static unsigned char ublk_commit_elem_buf_size(struct ublk_dev *dev)
+{
+	if (dev->dev_info.flags & (UBLK_F_SUPPORT_ZERO_COPY | UBLK_F_USER_COPY |
+				UBLK_F_AUTO_BUF_REG))
+		return 8;
+
+	/* one extra 8bytes for carrying buffer address */
+	return 16;
+}
+
+static unsigned ublk_commit_buf_size(struct ublk_thread *t)
+{
+	struct ublk_dev *dev = t->dev;
+	unsigned elem_size = ublk_commit_elem_buf_size(dev);
+	unsigned int total = elem_size * dev->dev_info.queue_depth;
+	unsigned int page_sz = getpagesize();
+
+	return round_up(total, page_sz);
+}
+
+static void free_batch_commit_buf(struct ublk_thread *t)
+{
+	if (t->commit_buf) {
+		unsigned buf_size = ublk_commit_buf_size(t);
+		unsigned int total = buf_size * t->nr_commit_buf;
+
+		munlock(t->commit_buf, total);
+		free(t->commit_buf);
+	}
+	allocator_deinit(&t->commit_buf_alloc);
+}
+
+static int alloc_batch_commit_buf(struct ublk_thread *t)
+{
+	unsigned buf_size = ublk_commit_buf_size(t);
+	unsigned int total = buf_size * t->nr_commit_buf;
+	unsigned int page_sz = getpagesize();
+	void *buf = NULL;
+	int ret;
+
+	allocator_init(&t->commit_buf_alloc, t->nr_commit_buf);
+
+	t->commit_buf = NULL;
+	ret = posix_memalign(&buf, page_sz, total);
+	if (ret || !buf)
+		goto fail;
+
+	t->commit_buf = buf;
+
+	/* lock commit buffer pages for fast access */
+	if (mlock(t->commit_buf, total))
+		ublk_err("%s: can't lock commit buffer %s\n", __func__,
+			strerror(errno));
+
+	return 0;
+
+fail:
+	free_batch_commit_buf(t);
+	return ret;
+}
+
+void ublk_batch_prepare(struct ublk_thread *t)
+{
+	/*
+	 * We only handle single device in this thread context.
+	 *
+	 * All queues have same feature flags, so use queue 0's for
+	 * calculate uring_cmd flags.
+	 *
+	 * This way looks not elegant, but it works so far.
+	 */
+	struct ublk_queue *q = &t->dev->q[0];
+
+	t->commit_buf_elem_size = ublk_commit_elem_buf_size(t->dev);
+	t->commit_buf_size = ublk_commit_buf_size(t);
+	t->commit_buf_start = t->nr_bufs;
+	t->nr_commit_buf = 2;
+	t->nr_bufs += t->nr_commit_buf;
+
+	t->cmd_flags = 0;
+	if (ublk_queue_use_auto_zc(q)) {
+		if (ublk_queue_auto_zc_fallback(q))
+			t->cmd_flags |= UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK;
+	} else if (!ublk_queue_no_buf(q))
+		t->cmd_flags |= UBLK_BATCH_F_HAS_BUF_ADDR;
+
+	t->state |= UBLKS_T_BATCH_IO;
+
+	ublk_log("%s: thread %d commit(nr_bufs %u, buf_size %u, start %u)\n",
+			__func__, t->idx,
+			t->nr_commit_buf, t->commit_buf_size,
+			t->nr_bufs);
+}
+
+int ublk_batch_alloc_buf(struct ublk_thread *t)
+{
+	ublk_assert(t->nr_commit_buf < 16);
+	return alloc_batch_commit_buf(t);
+}
+
+void ublk_batch_free_buf(struct ublk_thread *t)
+{
+	free_batch_commit_buf(t);
+}
diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c
index 9b6f1cd04dc4..3864f42e6c29 100644
--- a/tools/testing/selftests/ublk/kublk.c
+++ b/tools/testing/selftests/ublk/kublk.c
@@ -435,6 +435,8 @@ static void ublk_thread_deinit(struct ublk_thread *t)
 {
 	io_uring_unregister_buffers(&t->ring);
 
+	ublk_batch_free_buf(t);
+
 	io_uring_unregister_ring_fd(&t->ring);
 
 	if (t->ring.ring_fd > 0) {
@@ -531,15 +533,33 @@ static int ublk_thread_init(struct ublk_thread *t, unsigned long long extra_flag
 		unsigned nr_ios = dev->dev_info.queue_depth * dev->dev_info.nr_hw_queues;
 		unsigned max_nr_ios_per_thread = nr_ios / dev->nthreads;
 		max_nr_ios_per_thread += !!(nr_ios % dev->nthreads);
-		ret = io_uring_register_buffers_sparse(
-			&t->ring, max_nr_ios_per_thread);
+
+		t->nr_bufs = max_nr_ios_per_thread;
+	} else {
+		t->nr_bufs = 0;
+	}
+
+	if (ublk_dev_batch_io(dev))
+		 ublk_batch_prepare(t);
+
+	if (t->nr_bufs) {
+		ret = io_uring_register_buffers_sparse(&t->ring, t->nr_bufs);
 		if (ret) {
-			ublk_err("ublk dev %d thread %d register spare buffers failed %d",
+			ublk_err("ublk dev %d thread %d register spare buffers failed %d\n",
 					dev->dev_info.dev_id, t->idx, ret);
 			goto fail;
 		}
 	}
 
+	if (ublk_dev_batch_io(dev)) {
+		ret = ublk_batch_alloc_buf(t);
+		if (ret) {
+			ublk_err("ublk dev %d thread %d alloc batch buf failed %d\n",
+				dev->dev_info.dev_id, t->idx, ret);
+			goto fail;
+		}
+	}
+
 	io_uring_register_ring_fd(&t->ring);
 
 	if (flags & UBLKS_Q_NO_UBLK_FIXED_FD) {
diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h
index 311a75da9b21..424c333596ac 100644
--- a/tools/testing/selftests/ublk/kublk.h
+++ b/tools/testing/selftests/ublk/kublk.h
@@ -182,15 +182,40 @@ struct ublk_queue {
 	struct ublk_io ios[UBLK_QUEUE_DEPTH];
 };
 
+/* align with `ublk_elem_header` */
+struct ublk_batch_elem {
+	__u16 tag;
+	__u16 buf_index;
+	__s32 result;
+	__u64 buf_addr;
+};
+
 struct ublk_thread {
 	struct ublk_dev *dev;
 	unsigned idx;
 
 #define UBLKS_T_STOPPING	(1U << 0)
 #define UBLKS_T_IDLE	(1U << 1)
+#define UBLKS_T_BATCH_IO	(1U << 31) 	/* readonly */
 	unsigned state;
 	unsigned int cmd_inflight;
 	unsigned int io_inflight;
+
+	unsigned short nr_bufs;
+
+       /* followings are for BATCH_IO */
+	unsigned short commit_buf_start;
+	unsigned char  commit_buf_elem_size;
+       /*
+        * We just support single device, so pre-calculate commit/prep flags
+        */
+	unsigned short cmd_flags;
+	unsigned int   nr_commit_buf;
+	unsigned int   commit_buf_size;
+	void *commit_buf;
+#define UBLKS_T_COMMIT_BUF_INV_IDX  ((unsigned short)-1)
+	struct allocator commit_buf_alloc;
+
 	struct io_uring ring;
 };
 
@@ -211,6 +236,27 @@ struct ublk_dev {
 
 extern int ublk_queue_io_cmd(struct ublk_thread *t, struct ublk_io *io);
 
+static inline int __ublk_use_batch_io(__u64 flags)
+{
+	return flags & UBLK_F_BATCH_IO;
+}
+
+static inline int ublk_queue_batch_io(const struct ublk_queue *q)
+{
+	return __ublk_use_batch_io(q->flags);
+}
+
+static inline int ublk_dev_batch_io(const struct ublk_dev *dev)
+{
+	return __ublk_use_batch_io(dev->dev_info.flags);
+}
+
+/* only work for handle single device in this pthread context */
+static inline int ublk_thread_batch_io(const struct ublk_thread *t)
+{
+	return t->state & UBLKS_T_BATCH_IO;
+}
+
 static inline void ublk_set_integrity_params(const struct dev_ctx *ctx,
 					     struct ublk_params *params)
 {
@@ -465,6 +511,13 @@ static inline int ublk_queue_no_buf(const struct ublk_queue *q)
 	return ublk_queue_use_zc(q) || ublk_queue_use_auto_zc(q);
 }
 
+/* Initialize batch I/O state and calculate buffer parameters */
+void ublk_batch_prepare(struct ublk_thread *t);
+/* Allocate and register commit buffers for batch operations */
+int ublk_batch_alloc_buf(struct ublk_thread *t);
+/* Free commit buffers and cleanup batch allocator */
+void ublk_batch_free_buf(struct ublk_thread *t);
+
 extern const struct ublk_tgt_ops null_tgt_ops;
 extern const struct ublk_tgt_ops loop_tgt_ops;
 extern const struct ublk_tgt_ops stripe_tgt_ops;
diff --git a/tools/testing/selftests/ublk/utils.h b/tools/testing/selftests/ublk/utils.h
index 17eefed73690..aab522f26167 100644
--- a/tools/testing/selftests/ublk/utils.h
+++ b/tools/testing/selftests/ublk/utils.h
@@ -21,6 +21,60 @@
 #define round_up(val, rnd) \
 	(((val) + ((rnd) - 1)) & ~((rnd) - 1))
 
+/* small sized & per-thread allocator */
+struct allocator {
+	unsigned int size;
+	cpu_set_t *set;
+};
+
+static inline int allocator_init(struct allocator *a, unsigned size)
+{
+	a->set = CPU_ALLOC(size);
+	a->size = size;
+
+	if (a->set)
+		return 0;
+	return -ENOMEM;
+}
+
+static inline void allocator_deinit(struct allocator *a)
+{
+	CPU_FREE(a->set);
+	a->set = NULL;
+	a->size = 0;
+}
+
+static inline int allocator_get(struct allocator *a)
+{
+	int i;
+
+	for (i = 0; i < a->size; i += 1) {
+		size_t set_size = CPU_ALLOC_SIZE(a->size);
+
+		if (!CPU_ISSET_S(i, set_size, a->set)) {
+			CPU_SET_S(i, set_size, a->set);
+			return i;
+		}
+	}
+
+	return -1;
+}
+
+static inline void allocator_put(struct allocator *a, int i)
+{
+	size_t set_size = CPU_ALLOC_SIZE(a->size);
+
+	if (i >= 0 && i < a->size)
+		CPU_CLR_S(i, set_size, a->set);
+}
+
+static inline int allocator_get_val(struct allocator *a, int i)
+{
+	size_t set_size = CPU_ALLOC_SIZE(a->size);
+
+	return CPU_ISSET_S(i, set_size, a->set);
+}
+
 static inline unsigned int ilog2(unsigned int x)
 {
 	if (x == 0)
-- 
cgit v1.2.3


From d468930a019df71951a80fde20f6348136a2175d Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Fri, 16 Jan 2026 22:18:52 +0800
Subject: selftests: ublk: handle UBLK_U_IO_PREP_IO_CMDS

Implement support for UBLK_U_IO_PREP_IO_CMDS in the batch I/O framework:

- Add batch command initialization and setup functions
- Implement prep command queueing with proper buffer management
- Add command completion handling for prep and commit commands
- Integrate batch I/O setup into thread initialization
- Update CQE handling to support batch commands

The implementation uses the previously established buffer management
infrastructure to queue UBLK_U_IO_PREP_IO_CMDS commands. Commands are
prepared in the first thread context and use commit buffers for
efficient command batching.

Key changes:
- ublk_batch_queue_prep_io_cmds() prepares I/O command batches
- ublk_batch_compl_cmd() handles batch command completions
- Modified thread setup to use batch operations when enabled
- Enhanced buffer index calculation for batch mode

Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 tools/testing/selftests/ublk/batch.c | 114 +++++++++++++++++++++++++++++++++++
 tools/testing/selftests/ublk/kublk.c |  50 +++++++++++----
 tools/testing/selftests/ublk/kublk.h |  22 +++++++
 3 files changed, 174 insertions(+), 12 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/ublk/batch.c b/tools/testing/selftests/ublk/batch.c
index 609e6073c9c0..079cae77add1 100644
--- a/tools/testing/selftests/ublk/batch.c
+++ b/tools/testing/selftests/ublk/batch.c
@@ -150,3 +150,117 @@ void ublk_batch_free_buf(struct ublk_thread *t)
 {
 	free_batch_commit_buf(t);
 }
+
+static void ublk_init_batch_cmd(struct ublk_thread *t, __u16 q_id,
+				struct io_uring_sqe *sqe, unsigned op,
+				unsigned short elem_bytes,
+				unsigned short nr_elem,
+				unsigned short buf_idx)
+{
+	struct ublk_batch_io *cmd;
+	__u64 user_data;
+
+	cmd = (struct ublk_batch_io *)ublk_get_sqe_cmd(sqe);
+
+	ublk_set_sqe_cmd_op(sqe, op);
+
+	sqe->fd	= 0;	/* dev->fds[0] */
+	sqe->opcode	= IORING_OP_URING_CMD;
+	sqe->flags	= IOSQE_FIXED_FILE;
+
+	cmd->q_id	= q_id;
+	cmd->flags	= 0;
+	cmd->reserved 	= 0;
+	cmd->elem_bytes = elem_bytes;
+	cmd->nr_elem	= nr_elem;
+
+	user_data = build_user_data(buf_idx, _IOC_NR(op), 0, q_id, 0);
+	io_uring_sqe_set_data64(sqe, user_data);
+
+	t->cmd_inflight += 1;
+
+	ublk_dbg(UBLK_DBG_IO_CMD, "%s: thread %u qid %d cmd_op %x data %lx "
+			"nr_elem %u elem_bytes %u buf_size %u buf_idx %d "
+			"cmd_inflight %u\n",
+			__func__, t->idx, q_id, op, user_data,
+			cmd->nr_elem, cmd->elem_bytes,
+			nr_elem * elem_bytes, buf_idx, t->cmd_inflight);
+}
+
+static void ublk_setup_commit_sqe(struct ublk_thread *t,
+				  struct io_uring_sqe *sqe,
+				  unsigned short buf_idx)
+{
+	struct ublk_batch_io *cmd;
+
+	cmd = (struct ublk_batch_io *)ublk_get_sqe_cmd(sqe);
+
+	/* Use plain user buffer instead of fixed buffer */
+	cmd->flags |= t->cmd_flags;
+}
+
+int ublk_batch_queue_prep_io_cmds(struct ublk_thread *t, struct ublk_queue *q)
+{
+	unsigned short nr_elem = q->q_depth;
+	unsigned short buf_idx = ublk_alloc_commit_buf(t);
+	struct io_uring_sqe *sqe;
+	void *buf;
+	int i;
+
+	ublk_assert(buf_idx != UBLKS_T_COMMIT_BUF_INV_IDX);
+
+	ublk_io_alloc_sqes(t, &sqe, 1);
+
+	ublk_assert(nr_elem == q->q_depth);
+	buf = ublk_get_commit_buf(t, buf_idx);
+	for (i = 0; i < nr_elem; i++) {
+		struct ublk_batch_elem *elem = (struct ublk_batch_elem *)(
+				buf + i * t->commit_buf_elem_size);
+		struct ublk_io *io = &q->ios[i];
+
+		elem->tag = i;
+		elem->result = 0;
+
+		if (ublk_queue_use_auto_zc(q))
+			elem->buf_index = ublk_batch_io_buf_idx(t, q, i);
+		else if (!ublk_queue_no_buf(q))
+			elem->buf_addr = (__u64)io->buf_addr;
+	}
+
+	sqe->addr = (__u64)buf;
+	sqe->len = t->commit_buf_elem_size * nr_elem;
+
+	ublk_init_batch_cmd(t, q->q_id, sqe, UBLK_U_IO_PREP_IO_CMDS,
+			t->commit_buf_elem_size, nr_elem, buf_idx);
+	ublk_setup_commit_sqe(t, sqe, buf_idx);
+	return 0;
+}
+
+static void ublk_batch_compl_commit_cmd(struct ublk_thread *t,
+					const struct io_uring_cqe *cqe,
+					unsigned op)
+{
+	unsigned short buf_idx = user_data_to_tag(cqe->user_data);
+
+	if (op == _IOC_NR(UBLK_U_IO_PREP_IO_CMDS))
+		ublk_assert(cqe->res == 0);
+	else if (op == _IOC_NR(UBLK_U_IO_COMMIT_IO_CMDS))
+		;//assert(cqe->res == t->commit_buf_size);
+	else
+		ublk_assert(0);
+
+	ublk_free_commit_buf(t, buf_idx);
+}
+
+void ublk_batch_compl_cmd(struct ublk_thread *t,
+			  const struct io_uring_cqe *cqe)
+{
+	unsigned op = user_data_to_op(cqe->user_data);
+
+	if (op == _IOC_NR(UBLK_U_IO_PREP_IO_CMDS) ||
+			op == _IOC_NR(UBLK_U_IO_COMMIT_IO_CMDS)) {
+		t->cmd_inflight--;
+		ublk_batch_compl_commit_cmd(t, cqe, op);
+		return;
+	}
+}
diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c
index 3864f42e6c29..dba912a44eb3 100644
--- a/tools/testing/selftests/ublk/kublk.c
+++ b/tools/testing/selftests/ublk/kublk.c
@@ -840,6 +840,8 @@ static void ublk_handle_uring_cmd(struct ublk_thread *t,
 	unsigned tag = user_data_to_tag(cqe->user_data);
 	struct ublk_io *io = &q->ios[tag];
 
+	t->cmd_inflight--;
+
 	if (!fetch) {
 		t->state |= UBLKS_T_STOPPING;
 		io->flags &= ~UBLKS_IO_NEED_FETCH_RQ;
@@ -874,28 +876,30 @@ static void ublk_handle_cqe(struct ublk_thread *t,
 {
 	struct ublk_dev *dev = t->dev;
 	unsigned q_id = user_data_to_q_id(cqe->user_data);
-	struct ublk_queue *q = &dev->q[q_id];
 	unsigned cmd_op = user_data_to_op(cqe->user_data);
 
 	if (cqe->res < 0 && cqe->res != -ENODEV)
-		ublk_err("%s: res %d userdata %llx queue state %x\n", __func__,
-				cqe->res, cqe->user_data, q->flags);
+		ublk_err("%s: res %d userdata %llx thread state %x\n", __func__,
+				cqe->res, cqe->user_data, t->state);
 
-	ublk_dbg(UBLK_DBG_IO_CMD, "%s: res %d (qid %d tag %u cmd_op %u target %d/%d) stopping %d\n",
-			__func__, cqe->res, q->q_id, user_data_to_tag(cqe->user_data),
-			cmd_op, is_target_io(cqe->user_data),
+	ublk_dbg(UBLK_DBG_IO_CMD, "%s: res %d (thread %d qid %d tag %u cmd_op %x "
+			"data %lx target %d/%d) stopping %d\n",
+			__func__, cqe->res, t->idx, q_id,
+			user_data_to_tag(cqe->user_data),
+			cmd_op, cqe->user_data, is_target_io(cqe->user_data),
 			user_data_to_tgt_data(cqe->user_data),
 			(t->state & UBLKS_T_STOPPING));
 
 	/* Don't retrieve io in case of target io */
 	if (is_target_io(cqe->user_data)) {
-		ublksrv_handle_tgt_cqe(t, q, cqe);
+		ublksrv_handle_tgt_cqe(t, &dev->q[q_id], cqe);
 		return;
 	}
 
-	t->cmd_inflight--;
-
-	ublk_handle_uring_cmd(t, q, cqe);
+	if (ublk_thread_batch_io(t))
+		ublk_batch_compl_cmd(t, cqe);
+	else
+		ublk_handle_uring_cmd(t, &dev->q[q_id], cqe);
 }
 
 static int ublk_reap_events_uring(struct ublk_thread *t)
@@ -952,6 +956,22 @@ static void ublk_thread_set_sched_affinity(const struct ublk_thread_info *info)
 				info->dev->dev_info.dev_id, info->idx);
 }
 
+static void ublk_batch_setup_queues(struct ublk_thread *t)
+{
+	int i;
+
+	/* setup all queues in the 1st thread */
+	for (i = 0; i < t->dev->dev_info.nr_hw_queues; i++) {
+		struct ublk_queue *q = &t->dev->q[i];
+		int ret;
+
+		ret = ublk_batch_queue_prep_io_cmds(t, q);
+		ublk_assert(ret == 0);
+		ret = ublk_process_io(t);
+		ublk_assert(ret >= 0);
+	}
+}
+
 static __attribute__((noinline)) int __ublk_io_handler_fn(struct ublk_thread_info *info)
 {
 	struct ublk_thread t = {
@@ -972,8 +992,14 @@ static __attribute__((noinline)) int __ublk_io_handler_fn(struct ublk_thread_inf
 	ublk_dbg(UBLK_DBG_THREAD, "tid %d: ublk dev %d thread %u started\n",
 			gettid(), dev_id, t.idx);
 
-	/* submit all io commands to ublk driver */
-	ublk_submit_fetch_commands(&t);
+	if (!ublk_thread_batch_io(&t)) {
+		/* submit all io commands to ublk driver */
+		ublk_submit_fetch_commands(&t);
+	} else if (!t.idx) {
+		/* prepare all io commands in the 1st thread context */
+		ublk_batch_setup_queues(&t);
+	}
+
 	do {
 		if (ublk_process_io(&t) < 0)
 			break;
diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h
index 424c333596ac..08320d44c7c2 100644
--- a/tools/testing/selftests/ublk/kublk.h
+++ b/tools/testing/selftests/ublk/kublk.h
@@ -440,10 +440,16 @@ static inline void ublk_set_sqe_cmd_op(struct io_uring_sqe *sqe, __u32 cmd_op)
 	addr[1] = 0;
 }
 
+static inline unsigned short ublk_batch_io_buf_idx(
+		const struct ublk_thread *t, const struct ublk_queue *q,
+		unsigned tag);
+
 static inline unsigned short ublk_io_buf_idx(const struct ublk_thread *t,
 					     const struct ublk_queue *q,
 					     unsigned tag)
 {
+	if (ublk_queue_batch_io(q))
+		return ublk_batch_io_buf_idx(t, q, tag);
 	return q->ios[tag].buf_index;
 }
 
@@ -511,6 +517,22 @@ static inline int ublk_queue_no_buf(const struct ublk_queue *q)
 	return ublk_queue_use_zc(q) || ublk_queue_use_auto_zc(q);
 }
 
+/*
+ * Each IO's buffer index has to be calculated by this helper for
+ * UBLKS_T_BATCH_IO
+ */
+static inline unsigned short ublk_batch_io_buf_idx(
+		const struct ublk_thread *t, const struct ublk_queue *q,
+		unsigned tag)
+{
+	return tag;
+}
+
+/* Queue UBLK_U_IO_PREP_IO_CMDS for a specific queue with batch elements */
+int ublk_batch_queue_prep_io_cmds(struct ublk_thread *t, struct ublk_queue *q);
+/* Handle completion of batch I/O commands (prep/commit) */
+void ublk_batch_compl_cmd(struct ublk_thread *t,
+			  const struct io_uring_cqe *cqe);
 /* Initialize batch I/O state and calculate buffer parameters */
 void ublk_batch_prepare(struct ublk_thread *t);
 /* Allocate and register commit buffers for batch operations */
-- 
cgit v1.2.3


From dee7024ffecba291891503e425373d9f2a1d01b6 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Fri, 16 Jan 2026 22:18:53 +0800
Subject: selftests: ublk: handle UBLK_U_IO_COMMIT_IO_CMDS

Implement UBLK_U_IO_COMMIT_IO_CMDS to enable efficient batched
completion of I/O operations in the batch I/O framework.

This completes the batch I/O infrastructure by adding the commit
phase that notifies the kernel about completed I/O operations:

Key features:
- Batch multiple I/O completions into single UBLK_U_IO_COMMIT_IO_CMDS
- Dynamic commit buffer allocation and management per thread
- Automatic commit buffer preparation before processing events
- Commit buffer submission after processing completed I/Os
- Integration with existing completion workflows

Implementation details:
- ublk_batch_prep_commit() allocates and initializes commit buffers
- ublk_batch_complete_io() adds completed I/Os to current batch
- ublk_batch_commit_io_cmds() submits batched completions to kernel
- Modified ublk_process_io() to handle batch commit lifecycle
- Enhanced ublk_complete_io() to route to batch or legacy completion

The commit buffer stores completion information (tag, result, buffer
details) for multiple I/Os, then submits them all at once, significantly
reducing syscall overhead compared to individual I/O completions.

Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 tools/testing/selftests/ublk/batch.c | 74 ++++++++++++++++++++++++++++++++++--
 tools/testing/selftests/ublk/kublk.c |  8 +++-
 tools/testing/selftests/ublk/kublk.h | 69 +++++++++++++++++++++------------
 3 files changed, 122 insertions(+), 29 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/ublk/batch.c b/tools/testing/selftests/ublk/batch.c
index 079cae77add1..9c4db7335d44 100644
--- a/tools/testing/selftests/ublk/batch.c
+++ b/tools/testing/selftests/ublk/batch.c
@@ -174,7 +174,7 @@ static void ublk_init_batch_cmd(struct ublk_thread *t, __u16 q_id,
 	cmd->elem_bytes = elem_bytes;
 	cmd->nr_elem	= nr_elem;
 
-	user_data = build_user_data(buf_idx, _IOC_NR(op), 0, q_id, 0);
+	user_data = build_user_data(buf_idx, _IOC_NR(op), nr_elem, q_id, 0);
 	io_uring_sqe_set_data64(sqe, user_data);
 
 	t->cmd_inflight += 1;
@@ -244,9 +244,11 @@ static void ublk_batch_compl_commit_cmd(struct ublk_thread *t,
 
 	if (op == _IOC_NR(UBLK_U_IO_PREP_IO_CMDS))
 		ublk_assert(cqe->res == 0);
-	else if (op == _IOC_NR(UBLK_U_IO_COMMIT_IO_CMDS))
-		;//assert(cqe->res == t->commit_buf_size);
-	else
+	else if (op == _IOC_NR(UBLK_U_IO_COMMIT_IO_CMDS)) {
+		int nr_elem = user_data_to_tgt_data(cqe->user_data);
+
+		ublk_assert(cqe->res == t->commit_buf_elem_size * nr_elem);
+	} else
 		ublk_assert(0);
 
 	ublk_free_commit_buf(t, buf_idx);
@@ -264,3 +266,67 @@ void ublk_batch_compl_cmd(struct ublk_thread *t,
 		return;
 	}
 }
+
+void ublk_batch_commit_io_cmds(struct ublk_thread *t)
+{
+	struct io_uring_sqe *sqe;
+	unsigned short buf_idx;
+	unsigned short nr_elem = t->commit.done;
+
+	/* nothing to commit */
+	if (!nr_elem) {
+		ublk_free_commit_buf(t, t->commit.buf_idx);
+		return;
+	}
+
+	ublk_io_alloc_sqes(t, &sqe, 1);
+	buf_idx = t->commit.buf_idx;
+	sqe->addr = (__u64)t->commit.elem;
+	sqe->len = nr_elem * t->commit_buf_elem_size;
+
+	/* commit isn't per-queue command */
+	ublk_init_batch_cmd(t, t->commit.q_id, sqe, UBLK_U_IO_COMMIT_IO_CMDS,
+			t->commit_buf_elem_size, nr_elem, buf_idx);
+	ublk_setup_commit_sqe(t, sqe, buf_idx);
+}
+
+static void ublk_batch_init_commit(struct ublk_thread *t,
+				   unsigned short buf_idx)
+{
+	/* so far only support 1:1 queue/thread mapping */
+	t->commit.q_id = t->idx;
+	t->commit.buf_idx = buf_idx;
+	t->commit.elem = ublk_get_commit_buf(t, buf_idx);
+	t->commit.done = 0;
+	t->commit.count = t->commit_buf_size /
+		t->commit_buf_elem_size;
+}
+
+void ublk_batch_prep_commit(struct ublk_thread *t)
+{
+	unsigned short buf_idx = ublk_alloc_commit_buf(t);
+
+	ublk_assert(buf_idx != UBLKS_T_COMMIT_BUF_INV_IDX);
+	ublk_batch_init_commit(t, buf_idx);
+}
+
+void ublk_batch_complete_io(struct ublk_thread *t, struct ublk_queue *q,
+			    unsigned tag, int res)
+{
+	struct batch_commit_buf *cb = &t->commit;
+	struct ublk_batch_elem *elem = (struct ublk_batch_elem *)(cb->elem +
+			cb->done * t->commit_buf_elem_size);
+	struct ublk_io *io = &q->ios[tag];
+
+	ublk_assert(q->q_id == t->commit.q_id);
+
+	elem->tag = tag;
+	elem->buf_index = ublk_batch_io_buf_idx(t, q, tag);
+	elem->result = res;
+
+	if (!ublk_queue_no_buf(q))
+		elem->buf_addr	= (__u64) (uintptr_t) io->buf_addr;
+
+	cb->done += 1;
+	ublk_assert(cb->done <= cb->count);
+}
diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c
index dba912a44eb3..bf217d30c15f 100644
--- a/tools/testing/selftests/ublk/kublk.c
+++ b/tools/testing/selftests/ublk/kublk.c
@@ -931,7 +931,13 @@ static int ublk_process_io(struct ublk_thread *t)
 		return -ENODEV;
 
 	ret = io_uring_submit_and_wait(&t->ring, 1);
-	reapped = ublk_reap_events_uring(t);
+	if (ublk_thread_batch_io(t)) {
+		ublk_batch_prep_commit(t);
+		reapped = ublk_reap_events_uring(t);
+		ublk_batch_commit_io_cmds(t);
+	} else {
+		reapped = ublk_reap_events_uring(t);
+	}
 
 	ublk_dbg(UBLK_DBG_THREAD, "submit result %d, reapped %d stop %d idle %d\n",
 			ret, reapped, (t->state & UBLKS_T_STOPPING),
diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h
index 08320d44c7c2..5b05f6d7d808 100644
--- a/tools/testing/selftests/ublk/kublk.h
+++ b/tools/testing/selftests/ublk/kublk.h
@@ -190,6 +190,14 @@ struct ublk_batch_elem {
 	__u64 buf_addr;
 };
 
+struct batch_commit_buf {
+	unsigned short q_id;
+	unsigned short buf_idx;
+	void *elem;
+	unsigned short done;
+	unsigned short count;
+};
+
 struct ublk_thread {
 	struct ublk_dev *dev;
 	unsigned idx;
@@ -215,6 +223,7 @@ struct ublk_thread {
 	void *commit_buf;
 #define UBLKS_T_COMMIT_BUF_INV_IDX  ((unsigned short)-1)
 	struct allocator commit_buf_alloc;
+	struct batch_commit_buf commit;
 
 	struct io_uring ring;
 };
@@ -458,30 +467,6 @@ static inline struct ublk_io *ublk_get_io(struct ublk_queue *q, unsigned tag)
 	return &q->ios[tag];
 }
 
-static inline int ublk_complete_io(struct ublk_thread *t, struct ublk_queue *q,
-				   unsigned tag, int res)
-{
-	struct ublk_io *io = &q->ios[tag];
-
-	ublk_mark_io_done(io, res);
-
-	return ublk_queue_io_cmd(t, io);
-}
-
-static inline void ublk_queued_tgt_io(struct ublk_thread *t, struct ublk_queue *q,
-				      unsigned tag, int queued)
-{
-	if (queued < 0)
-		ublk_complete_io(t, q, tag, queued);
-	else {
-		struct ublk_io *io = ublk_get_io(q, tag);
-
-		t->io_inflight += queued;
-		io->tgt_ios = queued;
-		io->result = 0;
-	}
-}
-
 static inline int ublk_completed_tgt_io(struct ublk_thread *t,
 					struct ublk_queue *q, unsigned tag)
 {
@@ -540,6 +525,42 @@ int ublk_batch_alloc_buf(struct ublk_thread *t);
 /* Free commit buffers and cleanup batch allocator */
 void ublk_batch_free_buf(struct ublk_thread *t);
 
+/* Prepare a new commit buffer for batching completed I/O operations */
+void ublk_batch_prep_commit(struct ublk_thread *t);
+/* Submit UBLK_U_IO_COMMIT_IO_CMDS with batched completed I/O operations */
+void ublk_batch_commit_io_cmds(struct ublk_thread *t);
+/* Add a completed I/O operation to the current batch commit buffer */
+void ublk_batch_complete_io(struct ublk_thread *t, struct ublk_queue *q,
+			    unsigned tag, int res);
+
+static inline int ublk_complete_io(struct ublk_thread *t, struct ublk_queue *q,
+				   unsigned tag, int res)
+{
+	if (ublk_queue_batch_io(q)) {
+		ublk_batch_complete_io(t, q, tag, res);
+		return 0;
+	} else {
+		struct ublk_io *io = &q->ios[tag];
+
+		ublk_mark_io_done(io, res);
+		return ublk_queue_io_cmd(t, io);
+	}
+}
+
+static inline void ublk_queued_tgt_io(struct ublk_thread *t, struct ublk_queue *q,
+				      unsigned tag, int queued)
+{
+	if (queued < 0)
+		ublk_complete_io(t, q, tag, queued);
+	else {
+		struct ublk_io *io = ublk_get_io(q, tag);
+
+		t->io_inflight += queued;
+		io->tgt_ios = queued;
+		io->result = 0;
+	}
+}
+
 extern const struct ublk_tgt_ops null_tgt_ops;
 extern const struct ublk_tgt_ops loop_tgt_ops;
 extern const struct ublk_tgt_ops stripe_tgt_ops;
-- 
cgit v1.2.3


From cb5a6b308700c65c29baccbb6b9b07f306633ad5 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Fri, 16 Jan 2026 22:18:54 +0800
Subject: selftests: ublk: handle UBLK_U_IO_FETCH_IO_CMDS

Add support for UBLK_U_IO_FETCH_IO_CMDS to enable efficient batch
fetching of I/O commands using multishot io_uring operations.

Key improvements:
- Implement multishot UBLK_U_IO_FETCH_IO_CMDS for continuous command fetching
- Add fetch buffer management with page-aligned, mlocked buffers
- Process fetched I/O command tags from kernel-provided buffers
- Integrate fetch operations with existing batch I/O infrastructure
- Significantly reduce uring_cmd issuing overhead through batching

The implementation uses two fetch buffers per thread with automatic
requeuing to maintain continuous I/O command flow. Each fetch operation
retrieves multiple command tags in a single syscall, dramatically
improving performance compared to individual command fetching.

Technical details:
- Fetch buffers are page-aligned and mlocked for optimal performance
- Uses IORING_URING_CMD_MULTISHOT for continuous operation
- Automatic buffer management and requeuing on completion
- Enhanced CQE handling for fetch command completions

Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 tools/testing/selftests/ublk/batch.c | 136 ++++++++++++++++++++++++++++++++++-
 tools/testing/selftests/ublk/kublk.c |  14 +++-
 tools/testing/selftests/ublk/kublk.h |  13 ++++
 3 files changed, 159 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/ublk/batch.c b/tools/testing/selftests/ublk/batch.c
index 9c4db7335d44..5f9587210b12 100644
--- a/tools/testing/selftests/ublk/batch.c
+++ b/tools/testing/selftests/ublk/batch.c
@@ -140,15 +140,63 @@ void ublk_batch_prepare(struct ublk_thread *t)
 			t->nr_bufs);
 }
 
+static void free_batch_fetch_buf(struct ublk_thread *t)
+{
+	int i;
+
+	for (i = 0; i < UBLKS_T_NR_FETCH_BUF; i++) {
+		io_uring_free_buf_ring(&t->ring, t->fetch[i].br, 1, i);
+		munlock(t->fetch[i].fetch_buf, t->fetch[i].fetch_buf_size);
+		free(t->fetch[i].fetch_buf);
+	}
+}
+
+static int alloc_batch_fetch_buf(struct ublk_thread *t)
+{
+	/* page aligned fetch buffer, and it is mlocked for speedup delivery */
+	unsigned pg_sz = getpagesize();
+	unsigned buf_size = round_up(t->dev->dev_info.queue_depth * 2, pg_sz);
+	int ret;
+	int i = 0;
+
+	for (i = 0; i < UBLKS_T_NR_FETCH_BUF; i++) {
+		t->fetch[i].fetch_buf_size = buf_size;
+
+		if (posix_memalign((void **)&t->fetch[i].fetch_buf, pg_sz,
+					t->fetch[i].fetch_buf_size))
+			return -ENOMEM;
+
+		/* lock fetch buffer page for fast fetching */
+		if (mlock(t->fetch[i].fetch_buf, t->fetch[i].fetch_buf_size))
+			ublk_err("%s: can't lock fetch buffer %s\n", __func__,
+				strerror(errno));
+		t->fetch[i].br = io_uring_setup_buf_ring(&t->ring, 1,
+			i, IOU_PBUF_RING_INC, &ret);
+		if (!t->fetch[i].br) {
+			ublk_err("Buffer ring register failed %d\n", ret);
+			return ret;
+		}
+	}
+
+	return 0;
+}
+
 int ublk_batch_alloc_buf(struct ublk_thread *t)
 {
+	int ret;
+
 	ublk_assert(t->nr_commit_buf < 16);
-	return alloc_batch_commit_buf(t);
+
+	ret = alloc_batch_commit_buf(t);
+	if (ret)
+		return ret;
+	return alloc_batch_fetch_buf(t);
 }
 
 void ublk_batch_free_buf(struct ublk_thread *t)
 {
 	free_batch_commit_buf(t);
+	free_batch_fetch_buf(t);
 }
 
 static void ublk_init_batch_cmd(struct ublk_thread *t, __u16 q_id,
@@ -199,6 +247,76 @@ static void ublk_setup_commit_sqe(struct ublk_thread *t,
 	cmd->flags |= t->cmd_flags;
 }
 
+static void ublk_batch_queue_fetch(struct ublk_thread *t,
+				   struct ublk_queue *q,
+				   unsigned short buf_idx)
+{
+	unsigned short nr_elem = t->fetch[buf_idx].fetch_buf_size / 2;
+	struct io_uring_sqe *sqe;
+
+	io_uring_buf_ring_add(t->fetch[buf_idx].br, t->fetch[buf_idx].fetch_buf,
+			t->fetch[buf_idx].fetch_buf_size,
+			0, 0, 0);
+	io_uring_buf_ring_advance(t->fetch[buf_idx].br, 1);
+
+	ublk_io_alloc_sqes(t, &sqe, 1);
+
+	ublk_init_batch_cmd(t, q->q_id, sqe, UBLK_U_IO_FETCH_IO_CMDS, 2, nr_elem,
+			buf_idx);
+
+	sqe->rw_flags= IORING_URING_CMD_MULTISHOT;
+	sqe->buf_group = buf_idx;
+	sqe->flags |= IOSQE_BUFFER_SELECT;
+
+	t->fetch[buf_idx].fetch_buf_off = 0;
+}
+
+void ublk_batch_start_fetch(struct ublk_thread *t,
+			    struct ublk_queue *q)
+{
+	int i;
+
+	for (i = 0; i < UBLKS_T_NR_FETCH_BUF; i++)
+		ublk_batch_queue_fetch(t, q, i);
+}
+
+static unsigned short ublk_compl_batch_fetch(struct ublk_thread *t,
+				   struct ublk_queue *q,
+				   const struct io_uring_cqe *cqe)
+{
+	unsigned short buf_idx = user_data_to_tag(cqe->user_data);
+	unsigned start = t->fetch[buf_idx].fetch_buf_off;
+	unsigned end = start + cqe->res;
+	void *buf = t->fetch[buf_idx].fetch_buf;
+	int i;
+
+	if (cqe->res < 0)
+		return buf_idx;
+
+       if ((end - start) / 2 > q->q_depth) {
+               ublk_err("%s: fetch duplicated ios offset %u count %u\n", __func__, start, cqe->res);
+
+               for (i = start; i < end; i += 2) {
+                       unsigned short tag = *(unsigned short *)(buf + i);
+
+                       ublk_err("%u ", tag);
+               }
+               ublk_err("\n");
+       }
+
+	for (i = start; i < end; i += 2) {
+		unsigned short tag = *(unsigned short *)(buf + i);
+
+		if (tag >= q->q_depth)
+			ublk_err("%s: bad tag %u\n", __func__, tag);
+
+		if (q->tgt_ops->queue_io)
+			q->tgt_ops->queue_io(t, q, tag);
+	}
+	t->fetch[buf_idx].fetch_buf_off = end;
+	return buf_idx;
+}
+
 int ublk_batch_queue_prep_io_cmds(struct ublk_thread *t, struct ublk_queue *q)
 {
 	unsigned short nr_elem = q->q_depth;
@@ -258,6 +376,9 @@ void ublk_batch_compl_cmd(struct ublk_thread *t,
 			  const struct io_uring_cqe *cqe)
 {
 	unsigned op = user_data_to_op(cqe->user_data);
+	struct ublk_queue *q;
+	unsigned buf_idx;
+	unsigned q_id;
 
 	if (op == _IOC_NR(UBLK_U_IO_PREP_IO_CMDS) ||
 			op == _IOC_NR(UBLK_U_IO_COMMIT_IO_CMDS)) {
@@ -265,6 +386,19 @@ void ublk_batch_compl_cmd(struct ublk_thread *t,
 		ublk_batch_compl_commit_cmd(t, cqe, op);
 		return;
 	}
+
+	/* FETCH command is per queue */
+	q_id = user_data_to_q_id(cqe->user_data);
+	q = &t->dev->q[q_id];
+	buf_idx = ublk_compl_batch_fetch(t, q, cqe);
+
+	if (cqe->res < 0 && cqe->res != -ENOBUFS) {
+		t->cmd_inflight--;
+		t->state |= UBLKS_T_STOPPING;
+	} else if (!(cqe->flags & IORING_CQE_F_MORE) || cqe->res == -ENOBUFS) {
+		t->cmd_inflight--;
+		ublk_batch_queue_fetch(t, q, buf_idx);
+	}
 }
 
 void ublk_batch_commit_io_cmds(struct ublk_thread *t)
diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c
index bf217d30c15f..c77205bac7a9 100644
--- a/tools/testing/selftests/ublk/kublk.c
+++ b/tools/testing/selftests/ublk/kublk.c
@@ -519,6 +519,10 @@ static int ublk_thread_init(struct ublk_thread *t, unsigned long long extra_flag
 	int ring_depth = dev->tgt.sq_depth, cq_depth = dev->tgt.cq_depth;
 	int ret;
 
+	/* FETCH_IO_CMDS is multishot, so increase cq depth for BATCH_IO */
+	if (ublk_dev_batch_io(dev))
+		cq_depth += dev->dev_info.queue_depth;
+
 	ret = ublk_setup_ring(&t->ring, ring_depth, cq_depth,
 			IORING_SETUP_COOP_TASKRUN |
 			IORING_SETUP_SINGLE_ISSUER |
@@ -878,7 +882,7 @@ static void ublk_handle_cqe(struct ublk_thread *t,
 	unsigned q_id = user_data_to_q_id(cqe->user_data);
 	unsigned cmd_op = user_data_to_op(cqe->user_data);
 
-	if (cqe->res < 0 && cqe->res != -ENODEV)
+	if (cqe->res < 0 && cqe->res != -ENODEV && cqe->res != -ENOBUFS)
 		ublk_err("%s: res %d userdata %llx thread state %x\n", __func__,
 				cqe->res, cqe->user_data, t->state);
 
@@ -1001,9 +1005,13 @@ static __attribute__((noinline)) int __ublk_io_handler_fn(struct ublk_thread_inf
 	if (!ublk_thread_batch_io(&t)) {
 		/* submit all io commands to ublk driver */
 		ublk_submit_fetch_commands(&t);
-	} else if (!t.idx) {
+	} else {
+		struct ublk_queue *q = &t.dev->q[t.idx];
+
 		/* prepare all io commands in the 1st thread context */
-		ublk_batch_setup_queues(&t);
+		if (!t.idx)
+			ublk_batch_setup_queues(&t);
+		ublk_batch_start_fetch(&t, q);
 	}
 
 	do {
diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h
index 5b05f6d7d808..950e99c02e8b 100644
--- a/tools/testing/selftests/ublk/kublk.h
+++ b/tools/testing/selftests/ublk/kublk.h
@@ -198,6 +198,13 @@ struct batch_commit_buf {
 	unsigned short count;
 };
 
+struct batch_fetch_buf {
+	struct io_uring_buf_ring *br;
+	void *fetch_buf;
+	unsigned int fetch_buf_size;
+	unsigned int fetch_buf_off;
+};
+
 struct ublk_thread {
 	struct ublk_dev *dev;
 	unsigned idx;
@@ -224,6 +231,9 @@ struct ublk_thread {
 #define UBLKS_T_COMMIT_BUF_INV_IDX  ((unsigned short)-1)
 	struct allocator commit_buf_alloc;
 	struct batch_commit_buf commit;
+	/* FETCH_IO_CMDS buffer */
+#define UBLKS_T_NR_FETCH_BUF 	2
+	struct batch_fetch_buf fetch[UBLKS_T_NR_FETCH_BUF];
 
 	struct io_uring ring;
 };
@@ -515,6 +525,9 @@ static inline unsigned short ublk_batch_io_buf_idx(
 
 /* Queue UBLK_U_IO_PREP_IO_CMDS for a specific queue with batch elements */
 int ublk_batch_queue_prep_io_cmds(struct ublk_thread *t, struct ublk_queue *q);
+/* Start fetching I/O commands using multishot UBLK_U_IO_FETCH_IO_CMDS */
+void ublk_batch_start_fetch(struct ublk_thread *t,
+			    struct ublk_queue *q);
 /* Handle completion of batch I/O commands (prep/commit) */
 void ublk_batch_compl_cmd(struct ublk_thread *t,
 			  const struct io_uring_cqe *cqe);
-- 
cgit v1.2.3


From 4968fb7cc60676040258c8867f22931c8735126f Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Fri, 16 Jan 2026 22:18:55 +0800
Subject: selftests: ublk: increase timeout to 150 seconds

More tests need to be covered in existing generic tests, and default
45sec isn't enough, and timeout is often triggered, increase timeout
by adding setting file.

Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 tools/testing/selftests/ublk/Makefile | 2 ++
 tools/testing/selftests/ublk/settings | 1 +
 2 files changed, 3 insertions(+)
 create mode 100644 tools/testing/selftests/ublk/settings

(limited to 'tools')

diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile
index 3a2498089b15..f2da8b403537 100644
--- a/tools/testing/selftests/ublk/Makefile
+++ b/tools/testing/selftests/ublk/Makefile
@@ -52,6 +52,8 @@ TEST_PROGS += test_stress_05.sh
 TEST_PROGS += test_stress_06.sh
 TEST_PROGS += test_stress_07.sh
 
+TEST_FILES := settings
+
 TEST_GEN_PROGS_EXTENDED = kublk metadata_size
 STANDALONE_UTILS := metadata_size.c
 
diff --git a/tools/testing/selftests/ublk/settings b/tools/testing/selftests/ublk/settings
new file mode 100644
index 000000000000..682a40f1c8e6
--- /dev/null
+++ b/tools/testing/selftests/ublk/settings
@@ -0,0 +1 @@
+timeout=150
-- 
cgit v1.2.3


From 20aeab0b08a175d9ceb4ad327f55ba5c29a79888 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Fri, 16 Jan 2026 22:18:56 +0800
Subject: selftests: ublk: add --batch/-b for enabling F_BATCH_IO

Add --batch/-b for enabling F_BATCH_IO.

Add batch_01 for covering its basic function.

Add stress_08 and stress_09 for covering stress test.

Add recovery test for F_BATCH_IO in generic_04 and generic_05.

Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 tools/testing/selftests/ublk/Makefile           |  4 +++
 tools/testing/selftests/ublk/kublk.c            | 15 +++++++--
 tools/testing/selftests/ublk/test_batch_01.sh   | 32 ++++++++++++++++++
 tools/testing/selftests/ublk/test_generic_04.sh |  5 +++
 tools/testing/selftests/ublk/test_generic_05.sh |  5 +++
 tools/testing/selftests/ublk/test_stress_08.sh  | 45 +++++++++++++++++++++++++
 tools/testing/selftests/ublk/test_stress_09.sh  | 44 ++++++++++++++++++++++++
 7 files changed, 148 insertions(+), 2 deletions(-)
 create mode 100755 tools/testing/selftests/ublk/test_batch_01.sh
 create mode 100755 tools/testing/selftests/ublk/test_stress_08.sh
 create mode 100755 tools/testing/selftests/ublk/test_stress_09.sh

(limited to 'tools')

diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile
index f2da8b403537..520e18e224f2 100644
--- a/tools/testing/selftests/ublk/Makefile
+++ b/tools/testing/selftests/ublk/Makefile
@@ -25,6 +25,8 @@ TEST_PROGS += test_generic_14.sh
 TEST_PROGS += test_generic_15.sh
 TEST_PROGS += test_generic_16.sh
 
+TEST_PROGS += test_batch_01.sh
+
 TEST_PROGS += test_null_01.sh
 TEST_PROGS += test_null_02.sh
 TEST_PROGS += test_null_03.sh
@@ -51,6 +53,8 @@ TEST_PROGS += test_stress_04.sh
 TEST_PROGS += test_stress_05.sh
 TEST_PROGS += test_stress_06.sh
 TEST_PROGS += test_stress_07.sh
+TEST_PROGS += test_stress_08.sh
+TEST_PROGS += test_stress_09.sh
 
 TEST_FILES := settings
 
diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c
index c77205bac7a9..5d84000872a0 100644
--- a/tools/testing/selftests/ublk/kublk.c
+++ b/tools/testing/selftests/ublk/kublk.c
@@ -1593,7 +1593,8 @@ static int cmd_dev_get_features(void)
 		FEAT_NAME(UBLK_F_PER_IO_DAEMON),
 		FEAT_NAME(UBLK_F_BUF_REG_OFF_DAEMON),
 		FEAT_NAME(UBLK_F_INTEGRITY),
-		FEAT_NAME(UBLK_F_SAFE_STOP_DEV)
+		FEAT_NAME(UBLK_F_SAFE_STOP_DEV),
+		FEAT_NAME(UBLK_F_BATCH_IO),
 	};
 	struct ublk_dev *dev;
 	__u64 features = 0;
@@ -1691,6 +1692,7 @@ static void __cmd_create_help(char *exe, bool recovery)
 	printf("\t[--nthreads threads] [--per_io_tasks]\n");
 	printf("\t[--integrity_capable] [--integrity_reftag] [--metadata_size SIZE] "
 		 "[--pi_offset OFFSET] [--csum_type ip|t10dif|nvme] [--tag_size SIZE]\n");
+	printf("\t[--batch|-b]\n");
 	printf("\t[target options] [backfile1] [backfile2] ...\n");
 	printf("\tdefault: nr_queues=2(max 32), depth=128(max 1024), dev_id=-1(auto allocation)\n");
 	printf("\tdefault: nthreads=nr_queues");
@@ -1763,6 +1765,7 @@ int main(int argc, char *argv[])
 		{ "csum_type",		1,	NULL,  0 },
 		{ "tag_size",		1,	NULL,  0 },
 		{ "safe",		0,	NULL,  0 },
+		{ "batch",              0,      NULL, 'b'},
 		{ 0, 0, 0, 0 }
 	};
 	const struct ublk_tgt_ops *ops = NULL;
@@ -1785,12 +1788,15 @@ int main(int argc, char *argv[])
 
 	opterr = 0;
 	optind = 2;
-	while ((opt = getopt_long(argc, argv, "t:n:d:q:r:e:i:s:gazu",
+	while ((opt = getopt_long(argc, argv, "t:n:d:q:r:e:i:s:gazub",
 				  longopts, &option_idx)) != -1) {
 		switch (opt) {
 		case 'a':
 			ctx.all = 1;
 			break;
+		case 'b':
+			ctx.flags |= UBLK_F_BATCH_IO;
+			break;
 		case 'n':
 			ctx.dev_id = strtol(optarg, NULL, 10);
 			break;
@@ -1895,6 +1901,11 @@ int main(int argc, char *argv[])
 		}
 	}
 
+	if (ctx.per_io_tasks && (ctx.flags & UBLK_F_BATCH_IO)) {
+		ublk_err("per_io_task and F_BATCH_IO conflict\n");
+		return -EINVAL;
+	}
+
 	/* auto_zc_fallback depends on F_AUTO_BUF_REG & F_SUPPORT_ZERO_COPY */
 	if (ctx.auto_zc_fallback &&
 	    !((ctx.flags & UBLK_F_AUTO_BUF_REG) &&
diff --git a/tools/testing/selftests/ublk/test_batch_01.sh b/tools/testing/selftests/ublk/test_batch_01.sh
new file mode 100755
index 000000000000..9fa9fff5c62f
--- /dev/null
+++ b/tools/testing/selftests/ublk/test_batch_01.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
+
+TID="batch_01"
+ERR_CODE=0
+
+if ! _have_feature "BATCH_IO"; then
+	exit "$UBLK_SKIP_CODE"
+fi
+
+_prep_test "generic" "test basic function of UBLK_F_BATCH_IO"
+
+_create_backfile 0 256M
+_create_backfile 1 256M
+
+dev_id=$(_add_ublk_dev -t loop -q 2 -b "${UBLK_BACKFILES[0]}")
+_check_add_dev $TID $?
+
+if ! _mkfs_mount_test /dev/ublkb"${dev_id}"; then
+	_cleanup_test "generic"
+	_show_result $TID 255
+fi
+
+dev_id=$(_add_ublk_dev -t stripe -b --auto_zc "${UBLK_BACKFILES[0]}" "${UBLK_BACKFILES[1]}")
+_check_add_dev $TID $?
+_mkfs_mount_test /dev/ublkb"${dev_id}"
+ERR_CODE=$?
+
+_cleanup_test "generic"
+_show_result $TID $ERR_CODE
diff --git a/tools/testing/selftests/ublk/test_generic_04.sh b/tools/testing/selftests/ublk/test_generic_04.sh
index baf5b156193d..be2292822bbe 100755
--- a/tools/testing/selftests/ublk/test_generic_04.sh
+++ b/tools/testing/selftests/ublk/test_generic_04.sh
@@ -26,6 +26,11 @@ _create_backfile 0 256M
 _create_backfile 1 128M
 _create_backfile 2 128M
 
+ublk_run_recover_test -t null -q 2 -r 1 -b &
+ublk_run_recover_test -t loop -q 2 -r 1 -b "${UBLK_BACKFILES[0]}" &
+ublk_run_recover_test -t stripe -q 2 -r 1 -b "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" &
+wait
+
 ublk_run_recover_test -t null -q 2 -r 1 &
 ublk_run_recover_test -t loop -q 2 -r 1 "${UBLK_BACKFILES[0]}" &
 ublk_run_recover_test -t stripe -q 2 -r 1 "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" &
diff --git a/tools/testing/selftests/ublk/test_generic_05.sh b/tools/testing/selftests/ublk/test_generic_05.sh
index 7b5083afc02a..9b7f71c16d82 100755
--- a/tools/testing/selftests/ublk/test_generic_05.sh
+++ b/tools/testing/selftests/ublk/test_generic_05.sh
@@ -30,6 +30,11 @@ _create_backfile 0 256M
 _create_backfile 1 128M
 _create_backfile 2 128M
 
+ublk_run_recover_test -t null -q 2 -r 1 -z -b &
+ublk_run_recover_test -t loop -q 2 -r 1 -z -b "${UBLK_BACKFILES[0]}" &
+ublk_run_recover_test -t stripe -q 2 -r 1 -z -b "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" &
+wait
+
 ublk_run_recover_test -t null -q 2 -r 1 -z &
 ublk_run_recover_test -t loop -q 2 -r 1 -z "${UBLK_BACKFILES[0]}" &
 ublk_run_recover_test -t stripe -q 2 -r 1 -z "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" &
diff --git a/tools/testing/selftests/ublk/test_stress_08.sh b/tools/testing/selftests/ublk/test_stress_08.sh
new file mode 100755
index 000000000000..190db0b4f2ad
--- /dev/null
+++ b/tools/testing/selftests/ublk/test_stress_08.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
+TID="stress_06"
+ERR_CODE=0
+
+ublk_io_and_remove()
+{
+	run_io_and_remove "$@"
+	ERR_CODE=$?
+	if [ ${ERR_CODE} -ne 0 ]; then
+		echo "$TID failure: $*"
+		_show_result $TID $ERR_CODE
+	fi
+}
+
+if ! _have_program fio; then
+	exit "$UBLK_SKIP_CODE"
+fi
+
+if ! _have_feature "ZERO_COPY"; then
+	exit "$UBLK_SKIP_CODE"
+fi
+if ! _have_feature "AUTO_BUF_REG"; then
+	exit "$UBLK_SKIP_CODE"
+fi
+if ! _have_feature "BATCH_IO"; then
+	exit "$UBLK_SKIP_CODE"
+fi
+
+_prep_test "stress" "run IO and remove device(zero copy)"
+
+_create_backfile 0 256M
+_create_backfile 1 128M
+_create_backfile 2 128M
+
+ublk_io_and_remove 8G -t null -q 4 -b &
+ublk_io_and_remove 256M -t loop -q 4 --auto_zc -b "${UBLK_BACKFILES[0]}" &
+ublk_io_and_remove 256M -t stripe -q 4 --auto_zc -b "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" &
+ublk_io_and_remove 8G -t null -q 4 -z --auto_zc --auto_zc_fallback -b &
+wait
+
+_cleanup_test "stress"
+_show_result $TID $ERR_CODE
diff --git a/tools/testing/selftests/ublk/test_stress_09.sh b/tools/testing/selftests/ublk/test_stress_09.sh
new file mode 100755
index 000000000000..1b6bdb31da03
--- /dev/null
+++ b/tools/testing/selftests/ublk/test_stress_09.sh
@@ -0,0 +1,44 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
+TID="stress_07"
+ERR_CODE=0
+
+ublk_io_and_kill_daemon()
+{
+	run_io_and_kill_daemon "$@"
+	ERR_CODE=$?
+	if [ ${ERR_CODE} -ne 0 ]; then
+		echo "$TID failure: $*"
+		_show_result $TID $ERR_CODE
+	fi
+}
+
+if ! _have_program fio; then
+	exit "$UBLK_SKIP_CODE"
+fi
+if ! _have_feature "ZERO_COPY"; then
+	exit "$UBLK_SKIP_CODE"
+fi
+if ! _have_feature "AUTO_BUF_REG"; then
+	exit "$UBLK_SKIP_CODE"
+fi
+if ! _have_feature "BATCH_IO"; then
+	exit "$UBLK_SKIP_CODE"
+fi
+
+_prep_test "stress" "run IO and kill ublk server(zero copy)"
+
+_create_backfile 0 256M
+_create_backfile 1 128M
+_create_backfile 2 128M
+
+ublk_io_and_kill_daemon 8G -t null -q 4 -z -b &
+ublk_io_and_kill_daemon 256M -t loop -q 4 --auto_zc -b "${UBLK_BACKFILES[0]}" &
+ublk_io_and_kill_daemon 256M -t stripe -q 4 -b "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" &
+ublk_io_and_kill_daemon 8G -t null -q 4 -z --auto_zc --auto_zc_fallback -b &
+wait
+
+_cleanup_test "stress"
+_show_result $TID $ERR_CODE
-- 
cgit v1.2.3


From e8cd481cc665d5db8e918e84740db22bc213059e Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Fri, 16 Jan 2026 22:18:57 +0800
Subject: selftests: ublk: support arbitrary threads/queues combination

Enable flexible thread-to-queue mapping in batch I/O mode to support
arbitrary combinations of threads and queues, improving resource
utilization and scalability.

Key improvements:
- Support N:M thread-to-queue mapping (previously limited to 1:1)
- Dynamic buffer allocation based on actual queue assignment per thread
- Thread-safe queue preparation with spinlock protection
- Intelligent buffer index calculation for multi-queue scenarios
- Enhanced validation for thread/queue combination constraints

Implementation details:
- Add q_thread_map matrix to track queue-to-thread assignments
- Dynamic allocation of commit and fetch buffers per thread
- Round-robin queue assignment algorithm for load balancing
- Per-queue spinlock to prevent race conditions during prep
- Updated buffer index calculation using queue position within thread

This enables efficient configurations like:
- Any other N:M combinations for optimal resource matching

Testing:
- Added test_batch_02.sh: 4 threads vs 1 queue
- Added test_batch_03.sh: 1 thread vs 4 queues
- Validates correctness across different mapping scenarios

Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 tools/testing/selftests/ublk/Makefile         |   2 +
 tools/testing/selftests/ublk/batch.c          | 199 ++++++++++++++++++++++----
 tools/testing/selftests/ublk/kublk.c          |  49 +++++--
 tools/testing/selftests/ublk/kublk.h          |  40 ++++--
 tools/testing/selftests/ublk/test_batch_02.sh |  30 ++++
 tools/testing/selftests/ublk/test_batch_03.sh |  30 ++++
 6 files changed, 302 insertions(+), 48 deletions(-)
 create mode 100755 tools/testing/selftests/ublk/test_batch_02.sh
 create mode 100755 tools/testing/selftests/ublk/test_batch_03.sh

(limited to 'tools')

diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile
index 520e18e224f2..e39a6f871fcc 100644
--- a/tools/testing/selftests/ublk/Makefile
+++ b/tools/testing/selftests/ublk/Makefile
@@ -26,6 +26,8 @@ TEST_PROGS += test_generic_15.sh
 TEST_PROGS += test_generic_16.sh
 
 TEST_PROGS += test_batch_01.sh
+TEST_PROGS += test_batch_02.sh
+TEST_PROGS += test_batch_03.sh
 
 TEST_PROGS += test_null_01.sh
 TEST_PROGS += test_null_02.sh
diff --git a/tools/testing/selftests/ublk/batch.c b/tools/testing/selftests/ublk/batch.c
index 5f9587210b12..a54025b00917 100644
--- a/tools/testing/selftests/ublk/batch.c
+++ b/tools/testing/selftests/ublk/batch.c
@@ -76,6 +76,7 @@ static void free_batch_commit_buf(struct ublk_thread *t)
 		free(t->commit_buf);
 	}
 	allocator_deinit(&t->commit_buf_alloc);
+	free(t->commit);
 }
 
 static int alloc_batch_commit_buf(struct ublk_thread *t)
@@ -84,7 +85,13 @@ static int alloc_batch_commit_buf(struct ublk_thread *t)
 	unsigned int total = buf_size * t->nr_commit_buf;
 	unsigned int page_sz = getpagesize();
 	void *buf = NULL;
-	int ret;
+	int i, ret, j = 0;
+
+	t->commit = calloc(t->nr_queues, sizeof(*t->commit));
+	for (i = 0; i < t->dev->dev_info.nr_hw_queues; i++) {
+		if (t->q_map[i])
+			t->commit[j++].q_id = i;
+	}
 
 	allocator_init(&t->commit_buf_alloc, t->nr_commit_buf);
 
@@ -107,6 +114,17 @@ fail:
 	return ret;
 }
 
+static unsigned int ublk_thread_nr_queues(const struct ublk_thread *t)
+{
+	int i;
+	int ret = 0;
+
+	for (i = 0; i < t->dev->dev_info.nr_hw_queues; i++)
+		ret += !!t->q_map[i];
+
+	return ret;
+}
+
 void ublk_batch_prepare(struct ublk_thread *t)
 {
 	/*
@@ -119,10 +137,13 @@ void ublk_batch_prepare(struct ublk_thread *t)
 	 */
 	struct ublk_queue *q = &t->dev->q[0];
 
+	/* cache nr_queues because we don't support dynamic load-balance yet */
+	t->nr_queues = ublk_thread_nr_queues(t);
+
 	t->commit_buf_elem_size = ublk_commit_elem_buf_size(t->dev);
 	t->commit_buf_size = ublk_commit_buf_size(t);
 	t->commit_buf_start = t->nr_bufs;
-	t->nr_commit_buf = 2;
+	t->nr_commit_buf = 2 * t->nr_queues;
 	t->nr_bufs += t->nr_commit_buf;
 
 	t->cmd_flags = 0;
@@ -144,11 +165,12 @@ static void free_batch_fetch_buf(struct ublk_thread *t)
 {
 	int i;
 
-	for (i = 0; i < UBLKS_T_NR_FETCH_BUF; i++) {
+	for (i = 0; i < t->nr_fetch_bufs; i++) {
 		io_uring_free_buf_ring(&t->ring, t->fetch[i].br, 1, i);
 		munlock(t->fetch[i].fetch_buf, t->fetch[i].fetch_buf_size);
 		free(t->fetch[i].fetch_buf);
 	}
+	free(t->fetch);
 }
 
 static int alloc_batch_fetch_buf(struct ublk_thread *t)
@@ -159,7 +181,12 @@ static int alloc_batch_fetch_buf(struct ublk_thread *t)
 	int ret;
 	int i = 0;
 
-	for (i = 0; i < UBLKS_T_NR_FETCH_BUF; i++) {
+	/* double fetch buffer for each queue */
+	t->nr_fetch_bufs = t->nr_queues * 2;
+	t->fetch = calloc(t->nr_fetch_bufs, sizeof(*t->fetch));
+
+	/* allocate one buffer for each queue */
+	for (i = 0; i < t->nr_fetch_bufs; i++) {
 		t->fetch[i].fetch_buf_size = buf_size;
 
 		if (posix_memalign((void **)&t->fetch[i].fetch_buf, pg_sz,
@@ -185,7 +212,7 @@ int ublk_batch_alloc_buf(struct ublk_thread *t)
 {
 	int ret;
 
-	ublk_assert(t->nr_commit_buf < 16);
+	ublk_assert(t->nr_commit_buf < 2 * UBLK_MAX_QUEUES);
 
 	ret = alloc_batch_commit_buf(t);
 	if (ret)
@@ -271,13 +298,20 @@ static void ublk_batch_queue_fetch(struct ublk_thread *t,
 	t->fetch[buf_idx].fetch_buf_off = 0;
 }
 
-void ublk_batch_start_fetch(struct ublk_thread *t,
-			    struct ublk_queue *q)
+void ublk_batch_start_fetch(struct ublk_thread *t)
 {
 	int i;
+	int j = 0;
+
+	for (i = 0; i < t->dev->dev_info.nr_hw_queues; i++) {
+		if (t->q_map[i]) {
+			struct ublk_queue *q = &t->dev->q[i];
 
-	for (i = 0; i < UBLKS_T_NR_FETCH_BUF; i++)
-		ublk_batch_queue_fetch(t, q, i);
+			/* submit two fetch commands for each queue */
+			ublk_batch_queue_fetch(t, q, j++);
+			ublk_batch_queue_fetch(t, q, j++);
+		}
+	}
 }
 
 static unsigned short ublk_compl_batch_fetch(struct ublk_thread *t,
@@ -317,7 +351,7 @@ static unsigned short ublk_compl_batch_fetch(struct ublk_thread *t,
 	return buf_idx;
 }
 
-int ublk_batch_queue_prep_io_cmds(struct ublk_thread *t, struct ublk_queue *q)
+static int __ublk_batch_queue_prep_io_cmds(struct ublk_thread *t, struct ublk_queue *q)
 {
 	unsigned short nr_elem = q->q_depth;
 	unsigned short buf_idx = ublk_alloc_commit_buf(t);
@@ -354,6 +388,22 @@ int ublk_batch_queue_prep_io_cmds(struct ublk_thread *t, struct ublk_queue *q)
 	return 0;
 }
 
+int ublk_batch_queue_prep_io_cmds(struct ublk_thread *t, struct ublk_queue *q)
+{
+	int ret = 0;
+
+	pthread_spin_lock(&q->lock);
+	if (q->flags & UBLKS_Q_PREPARED)
+		goto unlock;
+	ret = __ublk_batch_queue_prep_io_cmds(t, q);
+	if (!ret)
+		q->flags |= UBLKS_Q_PREPARED;
+unlock:
+	pthread_spin_unlock(&q->lock);
+
+	return ret;
+}
+
 static void ublk_batch_compl_commit_cmd(struct ublk_thread *t,
 					const struct io_uring_cqe *cqe,
 					unsigned op)
@@ -401,59 +451,89 @@ void ublk_batch_compl_cmd(struct ublk_thread *t,
 	}
 }
 
-void ublk_batch_commit_io_cmds(struct ublk_thread *t)
+static void __ublk_batch_commit_io_cmds(struct ublk_thread *t,
+					struct batch_commit_buf *cb)
 {
 	struct io_uring_sqe *sqe;
 	unsigned short buf_idx;
-	unsigned short nr_elem = t->commit.done;
+	unsigned short nr_elem = cb->done;
 
 	/* nothing to commit */
 	if (!nr_elem) {
-		ublk_free_commit_buf(t, t->commit.buf_idx);
+		ublk_free_commit_buf(t, cb->buf_idx);
 		return;
 	}
 
 	ublk_io_alloc_sqes(t, &sqe, 1);
-	buf_idx = t->commit.buf_idx;
-	sqe->addr = (__u64)t->commit.elem;
+	buf_idx = cb->buf_idx;
+	sqe->addr = (__u64)cb->elem;
 	sqe->len = nr_elem * t->commit_buf_elem_size;
 
 	/* commit isn't per-queue command */
-	ublk_init_batch_cmd(t, t->commit.q_id, sqe, UBLK_U_IO_COMMIT_IO_CMDS,
+	ublk_init_batch_cmd(t, cb->q_id, sqe, UBLK_U_IO_COMMIT_IO_CMDS,
 			t->commit_buf_elem_size, nr_elem, buf_idx);
 	ublk_setup_commit_sqe(t, sqe, buf_idx);
 }
 
-static void ublk_batch_init_commit(struct ublk_thread *t,
-				   unsigned short buf_idx)
+void ublk_batch_commit_io_cmds(struct ublk_thread *t)
+{
+	int i;
+
+	for (i = 0; i < t->nr_queues; i++) {
+		struct batch_commit_buf *cb = &t->commit[i];
+
+		if (cb->buf_idx != UBLKS_T_COMMIT_BUF_INV_IDX)
+			__ublk_batch_commit_io_cmds(t, cb);
+	}
+
+}
+
+static void __ublk_batch_init_commit(struct ublk_thread *t,
+				     struct batch_commit_buf *cb,
+				     unsigned short buf_idx)
 {
 	/* so far only support 1:1 queue/thread mapping */
-	t->commit.q_id = t->idx;
-	t->commit.buf_idx = buf_idx;
-	t->commit.elem = ublk_get_commit_buf(t, buf_idx);
-	t->commit.done = 0;
-	t->commit.count = t->commit_buf_size /
+	cb->buf_idx = buf_idx;
+	cb->elem = ublk_get_commit_buf(t, buf_idx);
+	cb->done = 0;
+	cb->count = t->commit_buf_size /
 		t->commit_buf_elem_size;
 }
 
-void ublk_batch_prep_commit(struct ublk_thread *t)
+/* COMMIT_IO_CMDS is per-queue command, so use its own commit buffer */
+static void ublk_batch_init_commit(struct ublk_thread *t,
+				   struct batch_commit_buf *cb)
 {
 	unsigned short buf_idx = ublk_alloc_commit_buf(t);
 
 	ublk_assert(buf_idx != UBLKS_T_COMMIT_BUF_INV_IDX);
-	ublk_batch_init_commit(t, buf_idx);
+	ublk_assert(!ublk_batch_commit_prepared(cb));
+
+	__ublk_batch_init_commit(t, cb, buf_idx);
+}
+
+void ublk_batch_prep_commit(struct ublk_thread *t)
+{
+	int i;
+
+	for (i = 0; i < t->nr_queues; i++)
+		t->commit[i].buf_idx = UBLKS_T_COMMIT_BUF_INV_IDX;
 }
 
 void ublk_batch_complete_io(struct ublk_thread *t, struct ublk_queue *q,
 			    unsigned tag, int res)
 {
-	struct batch_commit_buf *cb = &t->commit;
-	struct ublk_batch_elem *elem = (struct ublk_batch_elem *)(cb->elem +
-			cb->done * t->commit_buf_elem_size);
+	unsigned q_t_idx = ublk_queue_idx_in_thread(t, q);
+	struct batch_commit_buf *cb = &t->commit[q_t_idx];
+	struct ublk_batch_elem *elem;
 	struct ublk_io *io = &q->ios[tag];
 
-	ublk_assert(q->q_id == t->commit.q_id);
+	if (!ublk_batch_commit_prepared(cb))
+		ublk_batch_init_commit(t, cb);
+
+	ublk_assert(q->q_id == cb->q_id);
 
+	elem = (struct ublk_batch_elem *)(cb->elem + cb->done * t->commit_buf_elem_size);
 	elem->tag = tag;
 	elem->buf_index = ublk_batch_io_buf_idx(t, q, tag);
 	elem->result = res;
@@ -464,3 +544,64 @@ void ublk_batch_complete_io(struct ublk_thread *t, struct ublk_queue *q,
 	cb->done += 1;
 	ublk_assert(cb->done <= cb->count);
 }
+
+void ublk_batch_setup_map(unsigned char (*q_thread_map)[UBLK_MAX_QUEUES],
+			   int nthreads, int queues)
+{
+	int i, j;
+
+	/*
+	 * Setup round-robin queue-to-thread mapping for arbitrary N:M combinations.
+	 *
+	 * This algorithm distributes queues across threads (and threads across queues)
+	 * in a balanced round-robin fashion to ensure even load distribution.
+	 *
+	 * Examples:
+	 * - 2 threads, 4 queues: T0=[Q0,Q2], T1=[Q1,Q3]
+	 * - 4 threads, 2 queues: T0=[Q0], T1=[Q1], T2=[Q0], T3=[Q1]
+	 * - 3 threads, 3 queues: T0=[Q0], T1=[Q1], T2=[Q2] (1:1 mapping)
+	 *
+	 * Phase 1: Mark which queues each thread handles (boolean mapping)
+	 */
+	for (i = 0, j = 0; i < queues || j < nthreads; i++, j++) {
+		q_thread_map[j % nthreads][i % queues] = 1;
+	}
+
+	/*
+	 * Phase 2: Convert boolean mapping to sequential indices within each thread.
+	 *
+	 * Transform from: q_thread_map[thread][queue] = 1 (handles queue)
+	 * To:             q_thread_map[thread][queue] = N (queue index within thread)
+	 *
+	 * This allows each thread to know the local index of each queue it handles,
+	 * which is essential for buffer allocation and management. For example:
+	 * - Thread 0 handling queues [0,2] becomes: q_thread_map[0][0]=1, q_thread_map[0][2]=2
+	 * - Thread 1 handling queues [1,3] becomes: q_thread_map[1][1]=1, q_thread_map[1][3]=2
+	 */
+	for (j = 0; j < nthreads; j++) {
+		unsigned char seq = 1;
+
+		for (i = 0; i < queues; i++) {
+			if (q_thread_map[j][i])
+				q_thread_map[j][i] = seq++;
+		}
+	}
+
+#if 0
+	for (j = 0; j < nthreads; j++) {
+		printf("thread %0d: ", j);
+		for (i = 0; i < queues; i++) {
+			if (q_thread_map[j][i])
+				printf("%03u ", i);
+		}
+		printf("\n");
+	}
+	printf("\n");
+	for (j = 0; j < nthreads; j++) {
+		for (i = 0; i < queues; i++) {
+			printf("%03u ", q_thread_map[j][i]);
+		}
+		printf("\n");
+	}
+#endif
+}
diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c
index 5d84000872a0..2da37557e1a9 100644
--- a/tools/testing/selftests/ublk/kublk.c
+++ b/tools/testing/selftests/ublk/kublk.c
@@ -455,6 +455,7 @@ static int ublk_queue_init(struct ublk_queue *q, unsigned long long extra_flags,
 	int cmd_buf_size, io_buf_size, integrity_size;
 	unsigned long off;
 
+	pthread_spin_init(&q->lock, PTHREAD_PROCESS_PRIVATE);
 	q->tgt_ops = dev->tgt.ops;
 	q->flags = 0;
 	q->q_depth = depth;
@@ -521,7 +522,7 @@ static int ublk_thread_init(struct ublk_thread *t, unsigned long long extra_flag
 
 	/* FETCH_IO_CMDS is multishot, so increase cq depth for BATCH_IO */
 	if (ublk_dev_batch_io(dev))
-		cq_depth += dev->dev_info.queue_depth;
+		cq_depth += dev->dev_info.queue_depth * 2;
 
 	ret = ublk_setup_ring(&t->ring, ring_depth, cq_depth,
 			IORING_SETUP_COOP_TASKRUN |
@@ -957,6 +958,7 @@ struct ublk_thread_info {
 	sem_t 			*ready;
 	cpu_set_t 		*affinity;
 	unsigned long long	extra_flags;
+	unsigned char		(*q_thread_map)[UBLK_MAX_QUEUES];
 };
 
 static void ublk_thread_set_sched_affinity(const struct ublk_thread_info *info)
@@ -970,14 +972,18 @@ static void ublk_batch_setup_queues(struct ublk_thread *t)
 {
 	int i;
 
-	/* setup all queues in the 1st thread */
 	for (i = 0; i < t->dev->dev_info.nr_hw_queues; i++) {
 		struct ublk_queue *q = &t->dev->q[i];
 		int ret;
 
+		/*
+		 * Only prepare io commands in the mapped thread context,
+		 * otherwise io command buffer index may not work as expected
+		 */
+		if (t->q_map[i] == 0)
+			continue;
+
 		ret = ublk_batch_queue_prep_io_cmds(t, q);
-		ublk_assert(ret == 0);
-		ret = ublk_process_io(t);
 		ublk_assert(ret >= 0);
 	}
 }
@@ -991,6 +997,10 @@ static __attribute__((noinline)) int __ublk_io_handler_fn(struct ublk_thread_inf
 	int dev_id = info->dev->dev_info.dev_id;
 	int ret;
 
+	/* Copy per-thread queue mapping into thread-local variable */
+	if (info->q_thread_map)
+		memcpy(t.q_map, info->q_thread_map[info->idx], sizeof(t.q_map));
+
 	ret = ublk_thread_init(&t, info->extra_flags);
 	if (ret) {
 		ublk_err("ublk dev %d thread %u init failed\n",
@@ -1006,12 +1016,8 @@ static __attribute__((noinline)) int __ublk_io_handler_fn(struct ublk_thread_inf
 		/* submit all io commands to ublk driver */
 		ublk_submit_fetch_commands(&t);
 	} else {
-		struct ublk_queue *q = &t.dev->q[t.idx];
-
-		/* prepare all io commands in the 1st thread context */
-		if (!t.idx)
-			ublk_batch_setup_queues(&t);
-		ublk_batch_start_fetch(&t, q);
+		ublk_batch_setup_queues(&t);
+		ublk_batch_start_fetch(&t);
 	}
 
 	do {
@@ -1085,6 +1091,7 @@ static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev)
 	struct ublk_thread_info *tinfo;
 	unsigned long long extra_flags = 0;
 	cpu_set_t *affinity_buf;
+	unsigned char (*q_thread_map)[UBLK_MAX_QUEUES] = NULL;
 	void *thread_ret;
 	sem_t ready;
 	int ret, i;
@@ -1104,6 +1111,16 @@ static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev)
 	if (ret)
 		return ret;
 
+	if (ublk_dev_batch_io(dev)) {
+		q_thread_map = calloc(dev->nthreads, sizeof(*q_thread_map));
+		if (!q_thread_map) {
+			ret = -ENOMEM;
+			goto fail;
+		}
+		ublk_batch_setup_map(q_thread_map, dev->nthreads,
+				     dinfo->nr_hw_queues);
+	}
+
 	if (ctx->auto_zc_fallback)
 		extra_flags = UBLKS_Q_AUTO_BUF_REG_FALLBACK;
 	if (ctx->no_ublk_fixed_fd)
@@ -1127,6 +1144,7 @@ static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev)
 		tinfo[i].idx = i;
 		tinfo[i].ready = &ready;
 		tinfo[i].extra_flags = extra_flags;
+		tinfo[i].q_thread_map = q_thread_map;
 
 		/*
 		 * If threads are not tied 1:1 to queues, setting thread
@@ -1146,6 +1164,7 @@ static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev)
 	for (i = 0; i < dev->nthreads; i++)
 		sem_wait(&ready);
 	free(affinity_buf);
+	free(q_thread_map);
 
 	/* everything is fine now, start us */
 	if (ctx->recovery)
@@ -1314,7 +1333,8 @@ static int __cmd_dev_add(const struct dev_ctx *ctx)
 		goto fail;
 	}
 
-	if (nthreads != nr_queues && !ctx->per_io_tasks) {
+	if (nthreads != nr_queues && (!ctx->per_io_tasks &&
+				!(ctx->flags & UBLK_F_BATCH_IO))) {
 		ublk_err("%s: threads %u must be same as queues %u if "
 			"not using per_io_tasks\n",
 			__func__, nthreads, nr_queues);
@@ -1940,6 +1960,13 @@ int main(int argc, char *argv[])
 		return -EINVAL;
 	}
 
+	if ((ctx.flags & UBLK_F_AUTO_BUF_REG) &&
+			(ctx.flags & UBLK_F_BATCH_IO) &&
+			(ctx.nthreads > ctx.nr_hw_queues)) {
+		ublk_err("too many threads for F_AUTO_BUF_REG & F_BATCH_IO\n");
+		return -EINVAL;
+	}
+
 	i = optind;
 	while (i < argc && ctx.nr_files < MAX_BACK_FILES) {
 		ctx.files[ctx.nr_files++] = argv[i++];
diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h
index 950e99c02e8b..ca97deb5e208 100644
--- a/tools/testing/selftests/ublk/kublk.h
+++ b/tools/testing/selftests/ublk/kublk.h
@@ -173,13 +173,17 @@ struct ublk_queue {
 	const struct ublk_tgt_ops *tgt_ops;
 	struct ublksrv_io_desc *io_cmd_buf;
 
-/* borrow one bit of ublk uapi flags, which may never be used */
+/* borrow three bit of ublk uapi flags, which may never be used */
 #define UBLKS_Q_AUTO_BUF_REG_FALLBACK	(1ULL << 63)
 #define UBLKS_Q_NO_UBLK_FIXED_FD	(1ULL << 62)
+#define UBLKS_Q_PREPARED	(1ULL << 61)
 	__u64 flags;
 	int ublk_fd;	/* cached ublk char device fd */
 	__u8 metadata_size;
 	struct ublk_io ios[UBLK_QUEUE_DEPTH];
+
+	/* used for prep io commands */
+	pthread_spinlock_t lock;
 };
 
 /* align with `ublk_elem_header` */
@@ -206,8 +210,12 @@ struct batch_fetch_buf {
 };
 
 struct ublk_thread {
+	/* Thread-local copy of queue-to-thread mapping for this thread */
+	unsigned char q_map[UBLK_MAX_QUEUES];
+
 	struct ublk_dev *dev;
-	unsigned idx;
+	unsigned short idx;
+	unsigned short nr_queues;
 
 #define UBLKS_T_STOPPING	(1U << 0)
 #define UBLKS_T_IDLE	(1U << 1)
@@ -230,10 +238,10 @@ struct ublk_thread {
 	void *commit_buf;
 #define UBLKS_T_COMMIT_BUF_INV_IDX  ((unsigned short)-1)
 	struct allocator commit_buf_alloc;
-	struct batch_commit_buf commit;
+	struct batch_commit_buf *commit;
 	/* FETCH_IO_CMDS buffer */
-#define UBLKS_T_NR_FETCH_BUF 	2
-	struct batch_fetch_buf fetch[UBLKS_T_NR_FETCH_BUF];
+	unsigned short nr_fetch_bufs;
+	struct batch_fetch_buf *fetch;
 
 	struct io_uring ring;
 };
@@ -512,6 +520,21 @@ static inline int ublk_queue_no_buf(const struct ublk_queue *q)
 	return ublk_queue_use_zc(q) || ublk_queue_use_auto_zc(q);
 }
 
+static inline int ublk_batch_commit_prepared(struct batch_commit_buf *cb)
+{
+	return cb->buf_idx != UBLKS_T_COMMIT_BUF_INV_IDX;
+}
+
+static inline unsigned ublk_queue_idx_in_thread(const struct ublk_thread *t,
+						const struct ublk_queue *q)
+{
+	unsigned char idx;
+
+	idx = t->q_map[q->q_id];
+	ublk_assert(idx != 0);
+	return idx - 1;
+}
+
 /*
  * Each IO's buffer index has to be calculated by this helper for
  * UBLKS_T_BATCH_IO
@@ -520,14 +543,13 @@ static inline unsigned short ublk_batch_io_buf_idx(
 		const struct ublk_thread *t, const struct ublk_queue *q,
 		unsigned tag)
 {
-	return tag;
+	return ublk_queue_idx_in_thread(t, q) * q->q_depth + tag;
 }
 
 /* Queue UBLK_U_IO_PREP_IO_CMDS for a specific queue with batch elements */
 int ublk_batch_queue_prep_io_cmds(struct ublk_thread *t, struct ublk_queue *q);
 /* Start fetching I/O commands using multishot UBLK_U_IO_FETCH_IO_CMDS */
-void ublk_batch_start_fetch(struct ublk_thread *t,
-			    struct ublk_queue *q);
+void ublk_batch_start_fetch(struct ublk_thread *t);
 /* Handle completion of batch I/O commands (prep/commit) */
 void ublk_batch_compl_cmd(struct ublk_thread *t,
 			  const struct io_uring_cqe *cqe);
@@ -545,6 +567,8 @@ void ublk_batch_commit_io_cmds(struct ublk_thread *t);
 /* Add a completed I/O operation to the current batch commit buffer */
 void ublk_batch_complete_io(struct ublk_thread *t, struct ublk_queue *q,
 			    unsigned tag, int res);
+void ublk_batch_setup_map(unsigned char (*q_thread_map)[UBLK_MAX_QUEUES],
+			   int nthreads, int queues);
 
 static inline int ublk_complete_io(struct ublk_thread *t, struct ublk_queue *q,
 				   unsigned tag, int res)
diff --git a/tools/testing/selftests/ublk/test_batch_02.sh b/tools/testing/selftests/ublk/test_batch_02.sh
new file mode 100755
index 000000000000..b477f91359e1
--- /dev/null
+++ b/tools/testing/selftests/ublk/test_batch_02.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
+
+TID="batch_02"
+ERR_CODE=0
+
+if ! _have_feature "BATCH_IO"; then
+	exit "$UBLK_SKIP_CODE"
+fi
+
+if ! _have_program fio; then
+	exit "$UBLK_SKIP_CODE"
+fi
+
+_prep_test "generic" "test UBLK_F_BATCH_IO with 4_threads vs. 1_queues"
+
+_create_backfile 0 512M
+
+dev_id=$(_add_ublk_dev -t loop -q 1 --nthreads 4 -b "${UBLK_BACKFILES[0]}")
+_check_add_dev $TID $?
+
+# run fio over the ublk disk
+fio --name=job1 --filename=/dev/ublkb"${dev_id}" --ioengine=libaio --rw=readwrite \
+	--iodepth=32 --size=100M --numjobs=4 > /dev/null 2>&1
+ERR_CODE=$?
+
+_cleanup_test "generic"
+_show_result $TID $ERR_CODE
diff --git a/tools/testing/selftests/ublk/test_batch_03.sh b/tools/testing/selftests/ublk/test_batch_03.sh
new file mode 100755
index 000000000000..13a2b3d3a1b9
--- /dev/null
+++ b/tools/testing/selftests/ublk/test_batch_03.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
+
+TID="batch_03"
+ERR_CODE=0
+
+if ! _have_feature "BATCH_IO"; then
+	exit "$UBLK_SKIP_CODE"
+fi
+
+if ! _have_program fio; then
+	exit "$UBLK_SKIP_CODE"
+fi
+
+_prep_test "generic" "test UBLK_F_BATCH_IO with 1_threads vs. 4_queues"
+
+_create_backfile 0 512M
+
+dev_id=$(_add_ublk_dev -t loop -q 4 --nthreads 1 -b "${UBLK_BACKFILES[0]}")
+_check_add_dev $TID $?
+
+# run fio over the ublk disk
+fio --name=job1 --filename=/dev/ublkb"${dev_id}" --ioengine=libaio --rw=readwrite \
+	--iodepth=32 --size=100M --numjobs=4 > /dev/null 2>&1
+ERR_CODE=$?
+
+_cleanup_test "generic"
+_show_result $TID $ERR_CODE
-- 
cgit v1.2.3


From e4d3fc6a22f53e5bbe51e28b43cb32bc130d9f87 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Fri, 23 Jan 2026 17:15:44 +0800
Subject: selftests: ublk: fix test name

Fix the two added test name.

Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 tools/testing/selftests/ublk/test_stress_08.sh | 2 +-
 tools/testing/selftests/ublk/test_stress_09.sh | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/ublk/test_stress_08.sh b/tools/testing/selftests/ublk/test_stress_08.sh
index 190db0b4f2ad..9abb50ee3d00 100755
--- a/tools/testing/selftests/ublk/test_stress_08.sh
+++ b/tools/testing/selftests/ublk/test_stress_08.sh
@@ -2,7 +2,7 @@
 # SPDX-License-Identifier: GPL-2.0
 
 . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
-TID="stress_06"
+TID="stress_08"
 ERR_CODE=0
 
 ublk_io_and_remove()
diff --git a/tools/testing/selftests/ublk/test_stress_09.sh b/tools/testing/selftests/ublk/test_stress_09.sh
index 1b6bdb31da03..87b92b0a2410 100755
--- a/tools/testing/selftests/ublk/test_stress_09.sh
+++ b/tools/testing/selftests/ublk/test_stress_09.sh
@@ -2,7 +2,7 @@
 # SPDX-License-Identifier: GPL-2.0
 
 . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
-TID="stress_07"
+TID="stress_09"
 ERR_CODE=0
 
 ublk_io_and_kill_daemon()
-- 
cgit v1.2.3


From 65c4b312f1f13f4b45e18387f4a8bb19c1ea3ff3 Mon Sep 17 00:00:00 2001
From: Shuah Khan <skhan@linuxfoundation.org>
Date: Wed, 21 Jan 2026 09:51:54 -0700
Subject: tools: usb: usbip: remove dead-link from README

Remove dead-link to Debug Tips document on usbip project wiki
http://usbip.wiki.sourceforge.net/how-to-debug-usbip

Tried and failed find the file in archives. It would be nice to
locate the file and add this to usbip tool.

Reported-by: Ignacio Hernandez-Ros <ignacio@hernandez-ros.com>
Closes: https://lore.kernel.org/all/0101019bdf6ca137-60344502-51d2-4767-a34b-6a7cf1bfdf4a-000000@us-west-2.amazonses.com/
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
Link: https://patch.msgid.link/20260121165155.13550-1-skhan@linuxfoundation.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 tools/usb/usbip/README | 2 --
 1 file changed, 2 deletions(-)

(limited to 'tools')

diff --git a/tools/usb/usbip/README b/tools/usb/usbip/README
index 2fc021c0eae1..11971538f03e 100644
--- a/tools/usb/usbip/README
+++ b/tools/usb/usbip/README
@@ -241,8 +241,6 @@ Detach the imported device:
 
 
 [Checklist]
-    - See 'Debug Tips' on the project wiki.
-	- http://usbip.wiki.sourceforge.net/how-to-debug-usbip
     - usbip-host.ko must be bound to the target device.
 	- See /sys/kernel/debug/usb/devices and find "Driver=..." lines of the device.
     - Target USB gadget must be bound to vudc
-- 
cgit v1.2.3


From e396a74222654486d6ab45dca5d0c54c408b8b91 Mon Sep 17 00:00:00 2001
From: Zhiquan Li <zhiquan_li@163.com>
Date: Thu, 22 Jan 2026 13:35:50 +0800
Subject: KVM: selftests: Add -U_FORTIFY_SOURCE to avoid some unpredictable
 test failures

Some distributions (such as Ubuntu) configure GCC so that
_FORTIFY_SOURCE is automatically enabled at -O1 or above.  This results
in some fortified version of definitions of standard library functions
are included.  While linker resolves the symbols, the fortified versions
might override the definitions in lib/string_override.c and reference to
those PLT entries in GLIBC.  This is not a problem for the code in host,
but it is a disaster for the guest code.  E.g., if build and run
x86/nested_emulation_test on Ubuntu 24.04 will encounter a L1 #PF due to
memset() reference to __memset_chk@plt.

The option -fno-builtin-memset is not helpful here, because those
fortified versions are not built-in but some definitions which are
included by header, they are for different intentions.

In order to eliminate the unpredictable behaviors may vary depending on
the linker and platform, add the "-U_FORTIFY_SOURCE" into CFLAGS to
prevent from introducing the fortified definitions.

Signed-off-by: Zhiquan Li <zhiquan_li@163.com>
Link: https://patch.msgid.link/20260122053551.548229-1-zhiquan_li@163.com
Fixes: 6b6f71484bf4 ("KVM: selftests: Implement memcmp(), memcpy(), and memset() for guest use")
Cc: stable@vger.kernel.org
[sean: tag for stable]
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 tools/testing/selftests/kvm/Makefile.kvm | 1 +
 1 file changed, 1 insertion(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm
index ba5c2b643efa..d45bf4ccb3bf 100644
--- a/tools/testing/selftests/kvm/Makefile.kvm
+++ b/tools/testing/selftests/kvm/Makefile.kvm
@@ -251,6 +251,7 @@ LINUX_TOOL_INCLUDE = $(top_srcdir)/tools/include
 LINUX_TOOL_ARCH_INCLUDE = $(top_srcdir)/tools/arch/$(ARCH)/include
 CFLAGS += -Wall -Wstrict-prototypes -Wuninitialized -O2 -g -std=gnu99 \
 	-Wno-gnu-variable-sized-type-not-at-end -MD -MP -DCONFIG_64BIT \
+	-U_FORTIFY_SOURCE \
 	-fno-builtin-memcmp -fno-builtin-memcpy \
 	-fno-builtin-memset -fno-builtin-strnlen \
 	-fno-stack-protector -fno-PIE -fno-strict-aliasing \
-- 
cgit v1.2.3


From 5094f7d5ff2318edfe6f2a9632b31f0ddefd6ee4 Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <linux@weissschuh.net>
Date: Sun, 18 Jan 2026 00:26:21 +0100
Subject: tools/docs: sphinx-build-wrapper: generate rust docs only once
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently the rust docs are generated for each entry in SPHINXDIRS.
This is unnecessary as they will be the same for each one.

Move the generation, so it is executed only once.

Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
Reviewed-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Tested-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
Message-ID: <20260118-docs-spurious-rust-v1-1-998e14b9ed9e@weissschuh.net>
---
 tools/docs/sphinx-build-wrapper | 34 +++++++++++++++++-----------------
 1 file changed, 17 insertions(+), 17 deletions(-)

(limited to 'tools')

diff --git a/tools/docs/sphinx-build-wrapper b/tools/docs/sphinx-build-wrapper
index 9f1ae1485f84..2f65ddc85955 100755
--- a/tools/docs/sphinx-build-wrapper
+++ b/tools/docs/sphinx-build-wrapper
@@ -352,23 +352,6 @@ class SphinxBuilder:
             except (OSError, IOError) as e:
                 print(f"Warning: Failed to copy CSS: {e}", file=sys.stderr)
 
-        if self.rustdoc:
-            print("Building rust docs")
-            if "MAKE" in self.env:
-                cmd = [self.env["MAKE"]]
-            else:
-                cmd = ["make", "LLVM=1"]
-
-            cmd += [ "rustdoc"]
-            if self.verbose:
-                print(" ".join(cmd))
-
-            try:
-                subprocess.run(cmd, check=True)
-            except subprocess.CalledProcessError as e:
-                print(f"Ignored errors when building rustdoc: {e}. Is RUST enabled?",
-                      file=sys.stderr)
-
     def build_pdf_file(self, latex_cmd, from_dir, path):
         """Builds a single pdf file using latex_cmd"""
         try:
@@ -785,6 +768,23 @@ class SphinxBuilder:
         elif target == "infodocs":
             self.handle_info(output_dirs)
 
+        if self.rustdoc and target in ["htmldocs", "epubdocs"]:
+            print("Building rust docs")
+            if "MAKE" in self.env:
+                cmd = [self.env["MAKE"]]
+            else:
+                cmd = ["make", "LLVM=1"]
+
+            cmd += [ "rustdoc"]
+            if self.verbose:
+                print(" ".join(cmd))
+
+            try:
+                subprocess.run(cmd, check=True)
+            except subprocess.CalledProcessError as e:
+                print(f"Ignored errors when building rustdoc: {e}. Is RUST enabled?",
+                      file=sys.stderr)
+
 def jobs_type(value):
     """
     Handle valid values for -j. Accepts Sphinx "-jauto", plus a number
-- 
cgit v1.2.3


From 2d652135a16b413e38e6d7fed5244690d853756b Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <linux@weissschuh.net>
Date: Sun, 18 Jan 2026 00:26:22 +0100
Subject: tools/docs: sphinx-build-wrapper: make 'rustdoc' a local variable
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

All users of this variable are now in the same method.

Demote the instance variable to a local one.

Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
Reviewed-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Tested-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
Message-ID: <20260118-docs-spurious-rust-v1-2-998e14b9ed9e@weissschuh.net>
---
 tools/docs/sphinx-build-wrapper | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

(limited to 'tools')

diff --git a/tools/docs/sphinx-build-wrapper b/tools/docs/sphinx-build-wrapper
index 2f65ddc85955..76dfd5cbf178 100755
--- a/tools/docs/sphinx-build-wrapper
+++ b/tools/docs/sphinx-build-wrapper
@@ -123,12 +123,10 @@ class SphinxBuilder:
         """
         Checks if Rust is enabled
         """
-        self.rustdoc = False
-
         config = os.path.join(self.srctree, ".config")
 
         if not os.path.isfile(config):
-            return
+            return False
 
         re_rust = re.compile(r"CONFIG_RUST=(m|y)")
 
@@ -136,11 +134,13 @@ class SphinxBuilder:
             with open(config, "r", encoding="utf-8") as fp:
                 for line in fp:
                     if re_rust.match(line):
-                        self.rustdoc = True
-                        return
+                        return True
 
         except OSError as e:
             print(f"Failed to open {config}", file=sys.stderr)
+            return False
+
+        return False
 
     def get_sphinx_extra_opts(self, n_jobs):
         """
@@ -259,8 +259,6 @@ class SphinxBuilder:
 
         self.get_sphinx_extra_opts(n_jobs)
 
-        self.check_rust()
-
         #
         # If venv command line argument is specified, run Sphinx from venv
         #
@@ -680,7 +678,8 @@ class SphinxBuilder:
 
             args.extend(["-D", f"latex_elements.papersize={paper}paper"])
 
-        if self.rustdoc:
+        rustdoc = self.check_rust()
+        if rustdoc:
             args.extend(["-t", "rustdoc"])
 
         if not sphinxdirs:
@@ -768,7 +767,7 @@ class SphinxBuilder:
         elif target == "infodocs":
             self.handle_info(output_dirs)
 
-        if self.rustdoc and target in ["htmldocs", "epubdocs"]:
+        if rustdoc and target in ["htmldocs", "epubdocs"]:
             print("Building rust docs")
             if "MAKE" in self.env:
                 cmd = [self.env["MAKE"]]
-- 
cgit v1.2.3


From 6f9a96cc96ea405e0f80fede761dc415e33364c7 Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <linux@weissschuh.net>
Date: Sun, 18 Jan 2026 00:26:23 +0100
Subject: tools/docs: sphinx-build-wrapper: compute sphinxdirs_list earlier
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

An upcoming patch will require sphinxdirs_list to be available before
the call to check_rust().

Move it up in the function.

Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
Reviewed-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Tested-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
Message-ID: <20260118-docs-spurious-rust-v1-3-998e14b9ed9e@weissschuh.net>
---
 tools/docs/sphinx-build-wrapper | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

(limited to 'tools')

diff --git a/tools/docs/sphinx-build-wrapper b/tools/docs/sphinx-build-wrapper
index 76dfd5cbf178..04eb300dab4a 100755
--- a/tools/docs/sphinx-build-wrapper
+++ b/tools/docs/sphinx-build-wrapper
@@ -670,6 +670,19 @@ class SphinxBuilder:
         if kerneldoc.startswith(self.srctree):
             kerneldoc = os.path.relpath(kerneldoc, self.srctree)
 
+        if not sphinxdirs:
+            sphinxdirs = os.environ.get("SPHINXDIRS", ".")
+
+        #
+        # sphinxdirs can be a list or a whitespace-separated string
+        #
+        sphinxdirs_list = []
+        for sphinxdir in sphinxdirs:
+            if isinstance(sphinxdir, list):
+                sphinxdirs_list += sphinxdir
+            else:
+                sphinxdirs_list += sphinxdir.split()
+
         args = [ "-b", builder, "-c", docs_dir ]
 
         if builder == "latex":
@@ -682,9 +695,6 @@ class SphinxBuilder:
         if rustdoc:
             args.extend(["-t", "rustdoc"])
 
-        if not sphinxdirs:
-            sphinxdirs = os.environ.get("SPHINXDIRS", ".")
-
         #
         # The sphinx-build tool has a bug: internally, it tries to set
         # locale with locale.setlocale(locale.LC_ALL, ''). This causes a
@@ -695,16 +705,6 @@ class SphinxBuilder:
         except locale.Error:
             self.env["LC_ALL"] = "C"
 
-        #
-        # sphinxdirs can be a list or a whitespace-separated string
-        #
-        sphinxdirs_list = []
-        for sphinxdir in sphinxdirs:
-            if isinstance(sphinxdir, list):
-                sphinxdirs_list += sphinxdir
-            else:
-                sphinxdirs_list += sphinxdir.split()
-
         #
         # Step 1:  Build each directory in separate.
         #
-- 
cgit v1.2.3


From ffb569d59c253399efb2345ddfefe7929cd7e2a8 Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <linux@weissschuh.net>
Date: Sun, 18 Jan 2026 00:26:24 +0100
Subject: tools/docs: sphinx-build-wrapper: only generate rust docs when
 requested
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When the user explicitly specifies SPHINXDIRS to build a specific
subdirectory it is unexpected that the rust docs are also generated.
Especially as their generation may dominate the execution time.

Only generate the rust docs when they are part of the SPHINXDIRS
requested by the user. 'rust/rustdocs' is not considered, as it is
not a valid SPHINXDIRS anyways.

Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
Reviewed-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Tested-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
Message-ID: <20260118-docs-spurious-rust-v1-4-998e14b9ed9e@weissschuh.net>
---
 tools/docs/sphinx-build-wrapper | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/docs/sphinx-build-wrapper b/tools/docs/sphinx-build-wrapper
index 04eb300dab4a..78ff7ac202ef 100755
--- a/tools/docs/sphinx-build-wrapper
+++ b/tools/docs/sphinx-build-wrapper
@@ -119,12 +119,15 @@ class SphinxBuilder:
 
         return path
 
-    def check_rust(self):
+    def check_rust(self, sphinxdirs):
         """
         Checks if Rust is enabled
         """
         config = os.path.join(self.srctree, ".config")
 
+        if not {'.', 'rust'}.intersection(sphinxdirs):
+            return False
+
         if not os.path.isfile(config):
             return False
 
@@ -691,7 +694,7 @@ class SphinxBuilder:
 
             args.extend(["-D", f"latex_elements.papersize={paper}paper"])
 
-        rustdoc = self.check_rust()
+        rustdoc = self.check_rust(sphinxdirs_list)
         if rustdoc:
             args.extend(["-t", "rustdoc"])
 
-- 
cgit v1.2.3


From 4d7f6319faf209a9472f7bc5df98706b723b9464 Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Date: Mon, 19 Jan 2026 17:23:09 +0100
Subject: docs: kdoc: latex_fonts: Improve docstrings and comments

In preparation to document kernel-doc module, improve its
documentation.

Among the changes, it had to place the xml template inside
a code block, as otherwise doc build would break.

Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
Message-ID: <6e0eb2e245eae9b4f39cf231dee32df00b9e8b7b.1768838938.git.mchehab+huawei@kernel.org>
---
 tools/lib/python/kdoc/latex_fonts.py | 95 +++++++++++++++++++++---------------
 1 file changed, 56 insertions(+), 39 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/python/kdoc/latex_fonts.py b/tools/lib/python/kdoc/latex_fonts.py
index 29317f8006ea..1d04cbda169f 100755
--- a/tools/lib/python/kdoc/latex_fonts.py
+++ b/tools/lib/python/kdoc/latex_fonts.py
@@ -5,12 +5,13 @@
 # Ported to Python by (c) Mauro Carvalho Chehab, 2025
 
 """
-Detect problematic Noto CJK variable fonts.
+Detect problematic Noto CJK variable fonts
+==========================================
 
-For "make pdfdocs", reports of build errors of translations.pdf started
-arriving early 2024 [1, 2].  It turned out that Fedora and openSUSE
-tumbleweed have started deploying variable-font [3] format of "Noto CJK"
-fonts [4, 5].  For PDF, a LaTeX package named xeCJK is used for CJK
+For ``make pdfdocs``, reports of build errors of translations.pdf started
+arriving early 2024 [1]_ [2]_.  It turned out that Fedora and openSUSE
+tumbleweed have started deploying variable-font [3]_ format of "Noto CJK"
+fonts [4]_ [5]_.  For PDF, a LaTeX package named xeCJK is used for CJK
 (Chinese, Japanese, Korean) pages.  xeCJK requires XeLaTeX/XeTeX, which
 does not (and likely never will) understand variable fonts for historical
 reasons.
@@ -25,68 +26,77 @@ This script is invoked from the error path of "make pdfdocs" and emits
 suggestions if variable-font files of "Noto CJK" fonts are in the list of
 fonts accessible from XeTeX.
 
-References:
-[1]: https://lore.kernel.org/r/8734tqsrt7.fsf@meer.lwn.net/
-[2]: https://lore.kernel.org/r/1708585803.600323099@f111.i.mail.ru/
-[3]: https://en.wikipedia.org/wiki/Variable_font
-[4]: https://fedoraproject.org/wiki/Changes/Noto_CJK_Variable_Fonts
-[5]: https://build.opensuse.org/request/show/1157217
+.. [1] https://lore.kernel.org/r/8734tqsrt7.fsf@meer.lwn.net/
+.. [2] https://lore.kernel.org/r/1708585803.600323099@f111.i.mail.ru/
+.. [3] https://en.wikipedia.org/wiki/Variable_font
+.. [4] https://fedoraproject.org/wiki/Changes/Noto_CJK_Variable_Fonts
+.. [5] https://build.opensuse.org/request/show/1157217
 
-#===========================================================================
 Workarounds for building translations.pdf
-#===========================================================================
+-----------------------------------------
 
 * Denylist "variable font" Noto CJK fonts.
+
   - Create $HOME/deny-vf/fontconfig/fonts.conf from template below, with
     tweaks if necessary.  Remove leading "".
+
   - Path of fontconfig/fonts.conf can be overridden by setting an env
     variable FONTS_CONF_DENY_VF.
 
-    * Template:
------------------------------------------------------------------
-<?xml version="1.0"?>
-<!DOCTYPE fontconfig SYSTEM "urn:fontconfig:fonts.dtd">
-<fontconfig>
-<!--
-  Ignore variable-font glob (not to break xetex)
--->
-    <selectfont>
-        <rejectfont>
-            <!--
-                for Fedora
-            -->
-            <glob>/usr/share/fonts/google-noto-*-cjk-vf-fonts</glob>
-            <!--
-                for openSUSE tumbleweed
-            -->
-            <glob>/usr/share/fonts/truetype/Noto*CJK*-VF.otf</glob>
-        </rejectfont>
-    </selectfont>
-</fontconfig>
------------------------------------------------------------------
+    * Template::
+
+        <?xml version="1.0"?>
+        <!DOCTYPE fontconfig SYSTEM "urn:fontconfig:fonts.dtd">
+        <fontconfig>
+        <!--
+        Ignore variable-font glob (not to break xetex)
+        -->
+            <selectfont>
+                <rejectfont>
+                    <!--
+                        for Fedora
+                    -->
+                    <glob>/usr/share/fonts/google-noto-*-cjk-vf-fonts</glob>
+                    <!--
+                        for openSUSE tumbleweed
+                    -->
+                    <glob>/usr/share/fonts/truetype/Noto*CJK*-VF.otf</glob>
+                </rejectfont>
+            </selectfont>
+        </fontconfig>
 
     The denylisting is activated for "make pdfdocs".
 
 * For skipping CJK pages in PDF
+
   - Uninstall texlive-xecjk.
     Denylisting is not needed in this case.
 
 * For printing CJK pages in PDF
+
   - Need non-variable "Noto CJK" fonts.
+
     * Fedora
+
       - google-noto-sans-cjk-fonts
       - google-noto-serif-cjk-fonts
+
     * openSUSE tumbleweed
+
       - Non-variable "Noto CJK" fonts are not available as distro packages
         as of April, 2024.  Fetch a set of font files from upstream Noto
         CJK Font released at:
+
           https://github.com/notofonts/noto-cjk/tree/main/Sans#super-otc
+
         and at:
+
           https://github.com/notofonts/noto-cjk/tree/main/Serif#super-otc
-        , then uncompress and deploy them.
+
+        then uncompress and deploy them.
       - Remember to update fontconfig cache by running fc-cache.
 
-!!! Caution !!!
+.. caution::
     Uninstalling "variable font" packages can be dangerous.
     They might be depended upon by other packages important for your work.
     Denylisting should be less invasive, as it is effective only while
@@ -115,10 +125,15 @@ class LatexFontChecker:
         self.re_cjk = re.compile(r"([^:]+):\s*Noto\s+(Sans|Sans Mono|Serif) CJK")
 
     def description(self):
+        """
+        Returns module description.
+        """
         return __doc__
 
     def get_noto_cjk_vf_fonts(self):
-        """Get Noto CJK fonts"""
+        """
+        Get Noto CJK fonts.
+        """
 
         cjk_fonts = set()
         cmd = ["fc-list", ":", "file", "family", "variable"]
@@ -143,7 +158,9 @@ class LatexFontChecker:
         return sorted(cjk_fonts)
 
     def check(self):
-        """Check for problems with CJK fonts"""
+        """
+        Check for problems with CJK fonts.
+        """
 
         fonts = textwrap.indent("\n".join(self.get_noto_cjk_vf_fonts()), "    ")
         if not fonts:
-- 
cgit v1.2.3


From 8d08c7c6ffc14abe584738843c8577c691ffcf22 Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Date: Mon, 19 Jan 2026 17:23:10 +0100
Subject: docs: kdoc_files: Improve docstrings and comments

In preparation to document kernel-doc module, improve its
documentation.

Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
Message-ID: <75d58878ad6f83f24f1c0ce9e04301a000ecbaa3.1768838938.git.mchehab+huawei@kernel.org>
---
 tools/lib/python/kdoc/kdoc_files.py | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/python/kdoc/kdoc_files.py b/tools/lib/python/kdoc/kdoc_files.py
index bfe02baf1606..022487ea2cc6 100644
--- a/tools/lib/python/kdoc/kdoc_files.py
+++ b/tools/lib/python/kdoc/kdoc_files.py
@@ -5,7 +5,8 @@
 # pylint: disable=R0903,R0913,R0914,R0917
 
 """
-Parse lernel-doc tags on multiple kernel source files.
+Classes for navigating through the files that kernel-doc needs to handle
+to generate documentation.
 """
 
 import argparse
@@ -43,7 +44,7 @@ class GlobSourceFiles:
         self.srctree = srctree
 
     def _parse_dir(self, dirname):
-        """Internal function to parse files recursively"""
+        """Internal function to parse files recursively."""
 
         with os.scandir(dirname) as obj:
             for entry in obj:
@@ -65,7 +66,7 @@ class GlobSourceFiles:
     def parse_files(self, file_list, file_not_found_cb):
         """
         Define an iterator to parse all source files from file_list,
-        handling directories if any
+        handling directories if any.
         """
 
         if not file_list:
@@ -91,18 +92,18 @@ class KernelFiles():
 
     There are two type of parsers defined here:
         - self.parse_file(): parses both kernel-doc markups and
-          EXPORT_SYMBOL* macros;
-        - self.process_export_file(): parses only EXPORT_SYMBOL* macros.
+          ``EXPORT_SYMBOL*`` macros;
+        - self.process_export_file(): parses only ``EXPORT_SYMBOL*`` macros.
     """
 
     def warning(self, msg):
-        """Ancillary routine to output a warning and increment error count"""
+        """Ancillary routine to output a warning and increment error count."""
 
         self.config.log.warning(msg)
         self.errors += 1
 
     def error(self, msg):
-        """Ancillary routine to output an error and increment error count"""
+        """Ancillary routine to output an error and increment error count."""
 
         self.config.log.error(msg)
         self.errors += 1
@@ -128,7 +129,7 @@ class KernelFiles():
 
     def process_export_file(self, fname):
         """
-        Parses EXPORT_SYMBOL* macros from a single Kernel source file.
+        Parses ``EXPORT_SYMBOL*`` macros from a single Kernel source file.
         """
 
         # Prevent parsing the same file twice if results are cached
@@ -157,7 +158,7 @@ class KernelFiles():
                  wcontents_before_sections=False,
                  logger=None):
         """
-        Initialize startup variables and parse all files
+        Initialize startup variables and parse all files.
         """
 
         if not verbose:
@@ -213,7 +214,7 @@ class KernelFiles():
 
     def parse(self, file_list, export_file=None):
         """
-        Parse all files
+        Parse all files.
         """
 
         glob = GlobSourceFiles(srctree=self.config.src_tree)
@@ -242,7 +243,7 @@ class KernelFiles():
             filenames=None, export_file=None):
         """
         Interacts over the kernel-doc results and output messages,
-        returning kernel-doc markups on each interaction
+        returning kernel-doc markups on each interaction.
         """
 
         self.out_style.set_config(self.config)
-- 
cgit v1.2.3


From f40bba94a4db923a0ef0355b3055403fc9975729 Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Date: Mon, 19 Jan 2026 17:23:11 +0100
Subject: docs: kdoc_item: Improve docstrings and comments

In preparation to document kernel-doc module, improve its
documentation.

Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
Message-ID: <65a7c6bb318e7a8cbf5c115903d507568099151a.1768838938.git.mchehab+huawei@kernel.org>
---
 tools/lib/python/kdoc/kdoc_item.py | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

(limited to 'tools')

diff --git a/tools/lib/python/kdoc/kdoc_item.py b/tools/lib/python/kdoc/kdoc_item.py
index 19805301cb2c..2b8a93f79716 100644
--- a/tools/lib/python/kdoc/kdoc_item.py
+++ b/tools/lib/python/kdoc/kdoc_item.py
@@ -4,7 +4,16 @@
 # then pass into the output modules.
 #
 
+"""
+Data class to store a kernel-doc Item.
+"""
+
 class KdocItem:
+    """
+    A class that will, eventually, encapsulate all of the parsed data that we
+    then pass into the output modules.
+    """
+
     def __init__(self, name, fname, type, start_line, **other_stuff):
         self.name = name
         self.fname = fname
@@ -24,6 +33,9 @@ class KdocItem:
         self.other_stuff = other_stuff
 
     def get(self, key, default = None):
+        """
+        Get a value from optional keys.
+        """
         return self.other_stuff.get(key, default)
 
     def __getitem__(self, key):
@@ -33,10 +45,16 @@ class KdocItem:
     # Tracking of section and parameter information.
     #
     def set_sections(self, sections, start_lines):
+        """
+        Set sections and start lines.
+        """
         self.sections = sections
         self.section_start_lines = start_lines
 
     def set_params(self, names, descs, types, starts):
+        """
+        Set parameter list: names, descriptions, types and start lines.
+        """
         self.parameterlist = names
         self.parameterdescs = descs
         self.parametertypes = types
-- 
cgit v1.2.3


From 50206750e08e3087af74aa984d850df978b2554a Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Date: Mon, 19 Jan 2026 17:23:12 +0100
Subject: docs: kdoc_parser: Improve docstrings and comments

In preparation to document kernel-doc module, improve its
documentation.

Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
Message-ID: <b6aabe25b45e9751885becd544a4db82dbe11ff2.1768838938.git.mchehab+huawei@kernel.org>
---
 tools/lib/python/kdoc/kdoc_parser.py | 171 +++++++++++++++++++----------------
 1 file changed, 93 insertions(+), 78 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/python/kdoc/kdoc_parser.py b/tools/lib/python/kdoc/kdoc_parser.py
index a9a37519145d..bd88a2cd60c3 100644
--- a/tools/lib/python/kdoc/kdoc_parser.py
+++ b/tools/lib/python/kdoc/kdoc_parser.py
@@ -5,11 +5,8 @@
 # pylint: disable=C0301,C0302,R0904,R0912,R0913,R0914,R0915,R0917,R1702
 
 """
-kdoc_parser
-===========
-
-Read a C language source or header FILE and extract embedded
-documentation comments
+Classes and functions related to reading a C language source or header FILE
+and extract embedded documentation comments from it.
 """
 
 import sys
@@ -195,25 +192,28 @@ function_xforms  = [
 ]
 
 #
-# Apply a set of transforms to a block of text.
+# Ancillary functions
 #
+
 def apply_transforms(xforms, text):
+    """
+    Apply a set of transforms to a block of text.
+    """
     for search, subst in xforms:
         text = search.sub(subst, text)
     return text
 
-#
-# A little helper to get rid of excess white space
-#
 multi_space = KernRe(r'\s\s+')
 def trim_whitespace(s):
+    """
+    A little helper to get rid of excess white space.
+    """
     return multi_space.sub(' ', s.strip())
 
-#
-# Remove struct/enum members that have been marked "private".
-#
 def trim_private_members(text):
-    #
+    """
+    Remove ``struct``/``enum`` members that have been marked "private".
+    """
     # First look for a "public:" block that ends a private region, then
     # handle the "private until the end" case.
     #
@@ -226,20 +226,21 @@ def trim_private_members(text):
 
 class state:
     """
-    State machine enums
+    States used by the parser's state machine.
     """
 
     # Parser states
-    NORMAL        = 0        # normal code
-    NAME          = 1        # looking for function name
-    DECLARATION   = 2        # We have seen a declaration which might not be done
-    BODY          = 3        # the body of the comment
-    SPECIAL_SECTION = 4      # doc section ending with a blank line
-    PROTO         = 5        # scanning prototype
-    DOCBLOCK      = 6        # documentation block
-    INLINE_NAME   = 7        # gathering doc outside main block
-    INLINE_TEXT   = 8	     # reading the body of inline docs
-
+    NORMAL        = 0        #: Normal code.
+    NAME          = 1        #: Looking for function name.
+    DECLARATION   = 2        #: We have seen a declaration which might not be done.
+    BODY          = 3        #: The body of the comment.
+    SPECIAL_SECTION = 4      #: Doc section ending with a blank line.
+    PROTO         = 5        #: Scanning prototype.
+    DOCBLOCK      = 6        #: Documentation block.
+    INLINE_NAME   = 7        #: Gathering doc outside main block.
+    INLINE_TEXT   = 8	     #: Reading the body of inline docs.
+
+    #: Names for each parser state.
     name = [
         "NORMAL",
         "NAME",
@@ -253,9 +254,12 @@ class state:
     ]
 
 
-SECTION_DEFAULT = "Description"  # default section
+SECTION_DEFAULT = "Description"  #: Default section.
 
 class KernelEntry:
+    """
+    Encapsulates a Kernel documentation entry.
+    """
 
     def __init__(self, config, fname, ln):
         self.config = config
@@ -288,9 +292,11 @@ class KernelEntry:
     # Management of section contents
     #
     def add_text(self, text):
+        """Add a new text to the entry contents list."""
         self._contents.append(text)
 
     def contents(self):
+        """Returns a string with all content texts that were added."""
         return '\n'.join(self._contents) + '\n'
 
     # TODO: rename to emit_message after removal of kernel-doc.pl
@@ -309,10 +315,10 @@ class KernelEntry:
         self.warnings.append(log_msg)
         return
 
-    #
-    # Begin a new section.
-    #
     def begin_section(self, line_no, title = SECTION_DEFAULT, dump = False):
+        """
+        Begin a new section.
+        """
         if dump:
             self.dump_section(start_new = True)
         self.section = title
@@ -366,11 +372,13 @@ class KernelDoc:
     documentation comments.
     """
 
-    # Section names
-
+    #: Name of context section.
     section_context = "Context"
+
+    #: Name of return section.
     section_return = "Return"
 
+    #: String to write when a parameter is not described.
     undescribed = "-- undescribed --"
 
     def __init__(self, config, fname):
@@ -416,7 +424,7 @@ class KernelDoc:
 
     def dump_section(self, start_new=True):
         """
-        Dumps section contents to arrays/hashes intended for that purpose.
+        Dump section contents to arrays/hashes intended for that purpose.
         """
 
         if self.entry:
@@ -425,9 +433,9 @@ class KernelDoc:
     # TODO: rename it to store_declaration after removal of kernel-doc.pl
     def output_declaration(self, dtype, name, **args):
         """
-        Stores the entry into an entry array.
+        Store the entry into an entry array.
 
-        The actual output and output filters will be handled elsewhere
+        The actual output and output filters will be handled elsewhere.
         """
 
         item = KdocItem(name, self.fname, dtype,
@@ -663,10 +671,12 @@ class KernelDoc:
             self.emit_msg(ln,
                           f"No description found for return value of '{declaration_name}'")
 
-    #
-    # Split apart a structure prototype; returns (struct|union, name, members) or None
-    #
     def split_struct_proto(self, proto):
+        """
+        Split apart a structure prototype; returns (struct|union, name,
+        members) or ``None``.
+        """
+
         type_pattern = r'(struct|union)'
         qualifiers = [
             "__attribute__",
@@ -685,21 +695,26 @@ class KernelDoc:
             if r.search(proto):
                 return (r.group(1), r.group(3), r.group(2))
         return None
-    #
-    # Rewrite the members of a structure or union for easier formatting later on.
-    # Among other things, this function will turn a member like:
-    #
-    #  struct { inner_members; } foo;
-    #
-    # into:
-    #
-    #  struct foo; inner_members;
-    #
+
     def rewrite_struct_members(self, members):
+        """
+        Process ``struct``/``union`` members from the most deeply nested
+        outward.
+
+        Rewrite the members of a ``struct`` or ``union`` for easier formatting
+        later on. Among other things, this function will turn a member like::
+
+          struct { inner_members; } foo;
+
+        into::
+
+          struct foo; inner_members;
+        """
+
         #
-        # Process struct/union members from the most deeply nested outward.  The
-        # trick is in the ^{ below - it prevents a match of an outer struct/union
-        # until the inner one has been munged (removing the "{" in the process).
+        # The trick is in the ``^{`` below - it prevents a match of an outer
+        # ``struct``/``union`` until the inner one has been munged
+        # (removing the ``{`` in the process).
         #
         struct_members = KernRe(r'(struct|union)'   # 0: declaration type
                                 r'([^\{\};]+)' 	    # 1: possible name
@@ -777,11 +792,12 @@ class KernelDoc:
             tuples = struct_members.findall(members)
         return members
 
-    #
-    # Format the struct declaration into a standard form for inclusion in the
-    # resulting docs.
-    #
     def format_struct_decl(self, declaration):
+        """
+        Format the ``struct`` declaration into a standard form for inclusion
+        in the resulting docs.
+        """
+
         #
         # Insert newlines, get rid of extra spaces.
         #
@@ -815,7 +831,7 @@ class KernelDoc:
 
     def dump_struct(self, ln, proto):
         """
-        Store an entry for a struct or union
+        Store an entry for a ``struct`` or ``union``
         """
         #
         # Do the basic parse to get the pieces of the declaration.
@@ -857,7 +873,7 @@ class KernelDoc:
 
     def dump_enum(self, ln, proto):
         """
-        Stores an enum inside self.entries array.
+        Store an ``enum`` inside self.entries array.
         """
         #
         # Strip preprocessor directives.  Note that this depends on the
@@ -1004,7 +1020,7 @@ class KernelDoc:
 
     def dump_declaration(self, ln, prototype):
         """
-        Stores a data declaration inside self.entries array.
+        Store a data declaration inside self.entries array.
         """
 
         if self.entry.decl_type == "enum":
@@ -1021,7 +1037,7 @@ class KernelDoc:
 
     def dump_function(self, ln, prototype):
         """
-        Stores a function or function macro inside self.entries array.
+        Store a function or function macro inside self.entries array.
         """
 
         found = func_macro = False
@@ -1122,7 +1138,7 @@ class KernelDoc:
 
     def dump_typedef(self, ln, proto):
         """
-        Stores a typedef inside self.entries array.
+        Store a ``typedef`` inside self.entries array.
         """
         #
         # We start by looking for function typedefs.
@@ -1176,7 +1192,7 @@ class KernelDoc:
     @staticmethod
     def process_export(function_set, line):
         """
-        process EXPORT_SYMBOL* tags
+        process ``EXPORT_SYMBOL*`` tags
 
         This method doesn't use any variable from the class, so declare it
         with a staticmethod decorator.
@@ -1207,7 +1223,7 @@ class KernelDoc:
 
     def process_normal(self, ln, line):
         """
-        STATE_NORMAL: looking for the /** to begin everything.
+        STATE_NORMAL: looking for the ``/**`` to begin everything.
         """
 
         if not doc_start.match(line):
@@ -1297,10 +1313,10 @@ class KernelDoc:
         else:
             self.emit_msg(ln, f"Cannot find identifier on line:\n{line}")
 
-    #
-    # Helper function to determine if a new section is being started.
-    #
     def is_new_section(self, ln, line):
+        """
+        Helper function to determine if a new section is being started.
+        """
         if doc_sect.search(line):
             self.state = state.BODY
             #
@@ -1332,10 +1348,10 @@ class KernelDoc:
             return True
         return False
 
-    #
-    # Helper function to detect (and effect) the end of a kerneldoc comment.
-    #
     def is_comment_end(self, ln, line):
+        """
+        Helper function to detect (and effect) the end of a kerneldoc comment.
+        """
         if doc_end.search(line):
             self.dump_section()
 
@@ -1354,7 +1370,7 @@ class KernelDoc:
 
     def process_decl(self, ln, line):
         """
-        STATE_DECLARATION: We've seen the beginning of a declaration
+        STATE_DECLARATION: We've seen the beginning of a declaration.
         """
         if self.is_new_section(ln, line) or self.is_comment_end(ln, line):
             return
@@ -1383,7 +1399,7 @@ class KernelDoc:
 
     def process_special(self, ln, line):
         """
-        STATE_SPECIAL_SECTION: a section ending with a blank line
+        STATE_SPECIAL_SECTION: a section ending with a blank line.
         """
         #
         # If we have hit a blank line (only the " * " marker), then this
@@ -1473,7 +1489,7 @@ class KernelDoc:
 
     def syscall_munge(self, ln, proto):         # pylint: disable=W0613
         """
-        Handle syscall definitions
+        Handle syscall definitions.
         """
 
         is_void = False
@@ -1512,7 +1528,7 @@ class KernelDoc:
 
     def tracepoint_munge(self, ln, proto):
         """
-        Handle tracepoint definitions
+        Handle tracepoint definitions.
         """
 
         tracepointname = None
@@ -1548,7 +1564,7 @@ class KernelDoc:
         return proto
 
     def process_proto_function(self, ln, line):
-        """Ancillary routine to process a function prototype"""
+        """Ancillary routine to process a function prototype."""
 
         # strip C99-style comments to end of line
         line = KernRe(r"//.*$", re.S).sub('', line)
@@ -1593,7 +1609,9 @@ class KernelDoc:
             self.reset_state(ln)
 
     def process_proto_type(self, ln, line):
-        """Ancillary routine to process a type"""
+        """
+        Ancillary routine to process a type.
+        """
 
         # Strip C99-style comments and surrounding whitespace
         line = KernRe(r"//.*$", re.S).sub('', line).strip()
@@ -1647,7 +1665,7 @@ class KernelDoc:
             self.process_proto_type(ln, line)
 
     def process_docblock(self, ln, line):
-        """STATE_DOCBLOCK: within a DOC: block."""
+        """STATE_DOCBLOCK: within a ``DOC:`` block."""
 
         if doc_end.search(line):
             self.dump_section()
@@ -1659,7 +1677,7 @@ class KernelDoc:
 
     def parse_export(self):
         """
-        Parses EXPORT_SYMBOL* macros from a single Kernel source file.
+        Parses ``EXPORT_SYMBOL*`` macros from a single Kernel source file.
         """
 
         export_table = set()
@@ -1676,10 +1694,7 @@ class KernelDoc:
 
         return export_table
 
-    #
-    # The state/action table telling us which function to invoke in
-    # each state.
-    #
+    #: The state/action table telling us which function to invoke in each state.
     state_actions = {
         state.NORMAL:			process_normal,
         state.NAME:			process_name,
-- 
cgit v1.2.3


From 245f1ab2c9bce18a9467b4ef892570dd83b049d2 Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Date: Mon, 19 Jan 2026 17:23:13 +0100
Subject: docs: kdoc_output: Improve docstrings and comments

In preparation to document kernel-doc module, improve its
documentation.

Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
Message-ID: <ac03bf776f0929bbe822cd8269f2a31e275b8d6b.1768838938.git.mchehab+huawei@kernel.org>
---
 tools/lib/python/kdoc/kdoc_output.py | 60 +++++++++++++++++++++---------------
 1 file changed, 35 insertions(+), 25 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/python/kdoc/kdoc_output.py b/tools/lib/python/kdoc/kdoc_output.py
index d2bf94275d65..4210b91dde5f 100644
--- a/tools/lib/python/kdoc/kdoc_output.py
+++ b/tools/lib/python/kdoc/kdoc_output.py
@@ -5,14 +5,16 @@
 # pylint: disable=C0301,R0902,R0911,R0912,R0913,R0914,R0915,R0917
 
 """
-Implement output filters to print kernel-doc documentation.
+Classes to implement output filters to print kernel-doc documentation.
 
-The implementation uses a virtual base class (OutputFormat) which
+The implementation uses a virtual base class ``OutputFormat``. It
 contains dispatches to virtual methods, and some code to filter
 out output messages.
 
 The actual implementation is done on one separate class per each type
-of output. Currently, there are output classes for ReST and man/troff.
+of output, e.g. ``RestFormat`` and ``ManFormat`` classes.
+
+Currently, there are output classes for ReST and man/troff.
 """
 
 import os
@@ -54,16 +56,19 @@ class OutputFormat:
     """
 
     # output mode.
-    OUTPUT_ALL          = 0 # output all symbols and doc sections
-    OUTPUT_INCLUDE      = 1 # output only specified symbols
-    OUTPUT_EXPORTED     = 2 # output exported symbols
-    OUTPUT_INTERNAL     = 3 # output non-exported symbols
+    OUTPUT_ALL          = 0 #: Output all symbols and doc sections.
+    OUTPUT_INCLUDE      = 1 #: Output only specified symbols.
+    OUTPUT_EXPORTED     = 2 #: Output exported symbols.
+    OUTPUT_INTERNAL     = 3 #: Output non-exported symbols.
 
-    # Virtual member to be overridden at the inherited classes
+    #: Highlights to be used in ReST format.
     highlights = []
 
+    #: Blank line character.
+    blankline = ""
+
     def __init__(self):
-        """Declare internal vars and set mode to OUTPUT_ALL"""
+        """Declare internal vars and set mode to ``OUTPUT_ALL``."""
 
         self.out_mode = self.OUTPUT_ALL
         self.enable_lineno = None
@@ -128,7 +133,7 @@ class OutputFormat:
             self.config.warning(log_msg)
 
     def check_doc(self, name, args):
-        """Check if DOC should be output"""
+        """Check if DOC should be output."""
 
         if self.no_doc_sections:
             return False
@@ -177,7 +182,7 @@ class OutputFormat:
 
     def msg(self, fname, name, args):
         """
-        Handles a single entry from kernel-doc parser
+        Handles a single entry from kernel-doc parser.
         """
 
         self.data = ""
@@ -220,30 +225,31 @@ class OutputFormat:
     # Virtual methods to be overridden by inherited classes
     # At the base class, those do nothing.
     def set_symbols(self, symbols):
-        """Get a list of all symbols from kernel_doc"""
+        """Get a list of all symbols from kernel_doc."""
 
     def out_doc(self, fname, name, args):
-        """Outputs a DOC block"""
+        """Outputs a DOC block."""
 
     def out_function(self, fname, name, args):
-        """Outputs a function"""
+        """Outputs a function."""
 
     def out_enum(self, fname, name, args):
-        """Outputs an enum"""
+        """Outputs an enum."""
 
     def out_var(self, fname, name, args):
-        """Outputs a variable"""
+        """Outputs a variable."""
 
     def out_typedef(self, fname, name, args):
-        """Outputs a typedef"""
+        """Outputs a typedef."""
 
     def out_struct(self, fname, name, args):
-        """Outputs a struct"""
+        """Outputs a struct."""
 
 
 class RestFormat(OutputFormat):
-    """Consts and functions used by ReST output"""
+    """Consts and functions used by ReST output."""
 
+    #: Highlights to be used in ReST format
     highlights = [
         (type_constant, r"``\1``"),
         (type_constant2, r"``\1``"),
@@ -263,9 +269,13 @@ class RestFormat(OutputFormat):
         (type_fallback, r":c:type:`\1`"),
         (type_param_ref, r"**\1\2**")
     ]
+
     blankline = "\n"
 
+    #: Sphinx literal block regex.
     sphinx_literal = KernRe(r'^[^.].*::$', cache=False)
+
+    #: Sphinx code block regex.
     sphinx_cblock = KernRe(r'^\.\.\ +code-block::', cache=False)
 
     def __init__(self):
@@ -280,7 +290,7 @@ class RestFormat(OutputFormat):
         self.lineprefix = ""
 
     def print_lineno(self, ln):
-        """Outputs a line number"""
+        """Outputs a line number."""
 
         if self.enable_lineno and ln is not None:
             ln += 1
@@ -289,7 +299,7 @@ class RestFormat(OutputFormat):
     def output_highlight(self, args):
         """
         Outputs a C symbol that may require being converted to ReST using
-        the self.highlights variable
+        the self.highlights variable.
         """
 
         input_text = args
@@ -570,7 +580,7 @@ class RestFormat(OutputFormat):
 
 
 class ManFormat(OutputFormat):
-    """Consts and functions used by man pages output"""
+    """Consts and functions used by man pages output."""
 
     highlights = (
         (type_constant, r"\1"),
@@ -587,6 +597,7 @@ class ManFormat(OutputFormat):
     )
     blankline = ""
 
+    #: Allowed timestamp formats.
     date_formats = [
         "%a %b %d %H:%M:%S %Z %Y",
         "%a %b %d %H:%M:%S %Y",
@@ -653,7 +664,7 @@ class ManFormat(OutputFormat):
         self.symbols = symbols
 
     def out_tail(self, fname, name, args):
-        """Adds a tail for all man pages"""
+        """Adds a tail for all man pages."""
 
         # SEE ALSO section
         self.data += f'.SH "SEE ALSO"' + "\n.PP\n"
@@ -689,7 +700,7 @@ class ManFormat(OutputFormat):
     def output_highlight(self, block):
         """
         Outputs a C symbol that may require being highlighted with
-        self.highlights variable using troff syntax
+        self.highlights variable using troff syntax.
         """
 
         contents = self.highlight_block(block)
@@ -720,7 +731,6 @@ class ManFormat(OutputFormat):
             self.output_highlight(text)
 
     def out_function(self, fname, name, args):
-        """output function in man"""
 
         out_name = self.arg_name(args, name)
 
-- 
cgit v1.2.3


From b0b88915c83c2888e60a27c4914d10486f34fe3a Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Date: Mon, 19 Jan 2026 17:23:14 +0100
Subject: docs: kdoc_re: Improve docstrings and comments

In preparation to document kernel-doc module, improve its
documentation.

Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
Message-ID: <14a12a43144d52345bfd405d0401d246f0885acf.1768838938.git.mchehab+huawei@kernel.org>
---
 tools/lib/python/kdoc/kdoc_re.py | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/python/kdoc/kdoc_re.py b/tools/lib/python/kdoc/kdoc_re.py
index 2dfa1bf83d64..2816bd9f90f8 100644
--- a/tools/lib/python/kdoc/kdoc_re.py
+++ b/tools/lib/python/kdoc/kdoc_re.py
@@ -51,6 +51,9 @@ class KernRe:
         """
         return self.regex.pattern
 
+    def __repr__(self):
+        return f're.compile("{self.regex.pattern}")'
+
     def __add__(self, other):
         """
         Allows adding two regular expressions into one.
@@ -61,7 +64,7 @@ class KernRe:
 
     def match(self, string):
         """
-        Handles a re.match storing its results
+        Handles a re.match storing its results.
         """
 
         self.last_match = self.regex.match(string)
@@ -69,7 +72,7 @@ class KernRe:
 
     def search(self, string):
         """
-        Handles a re.search storing its results
+        Handles a re.search storing its results.
         """
 
         self.last_match = self.regex.search(string)
@@ -77,28 +80,28 @@ class KernRe:
 
     def findall(self, string):
         """
-        Alias to re.findall
+        Alias to re.findall.
         """
 
         return self.regex.findall(string)
 
     def split(self, string):
         """
-        Alias to re.split
+        Alias to re.split.
         """
 
         return self.regex.split(string)
 
     def sub(self, sub, string, count=0):
         """
-        Alias to re.sub
+        Alias to re.sub.
         """
 
         return self.regex.sub(sub, string, count=count)
 
     def group(self, num):
         """
-        Returns the group results of the last match
+        Returns the group results of the last match.
         """
 
         return self.last_match.group(num)
@@ -110,7 +113,7 @@ class NestedMatch:
     even harder on Python with its normal re module, as there are several
     advanced regular expressions that are missing.
 
-    This is the case of this pattern:
+    This is the case of this pattern::
 
             '\\bSTRUCT_GROUP(\\(((?:(?>[^)(]+)|(?1))*)\\))[^;]*;'
 
@@ -121,6 +124,7 @@ class NestedMatch:
     replace nested expressions.
 
     The original approach was suggested by:
+
         https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex
 
     Although I re-implemented it to make it more generic and match 3 types
-- 
cgit v1.2.3


From e68c84b9f3ba138878581a9f36a02c67d2ae20d4 Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Date: Mon, 19 Jan 2026 17:23:15 +0100
Subject: docs: kdoc: parse_data_structs: Improve docstrings and comments

In preparation to document kernel-doc module, improve its
documentation.

Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
Message-ID: <76ead85b4c13a8038180a792e270c3691d26cd25.1768838938.git.mchehab+huawei@kernel.org>
---
 tools/lib/python/kdoc/parse_data_structs.py | 62 ++++++++++++++++++-----------
 1 file changed, 39 insertions(+), 23 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/python/kdoc/parse_data_structs.py b/tools/lib/python/kdoc/parse_data_structs.py
index 25361996cd20..9941cd19032e 100755
--- a/tools/lib/python/kdoc/parse_data_structs.py
+++ b/tools/lib/python/kdoc/parse_data_structs.py
@@ -9,12 +9,12 @@ Parse a source file or header, creating ReStructured Text cross references.
 It accepts an optional file to change the default symbol reference or to
 suppress symbols from the output.
 
-It is capable of identifying defines, functions, structs, typedefs,
-enums and enum symbols and create cross-references for all of them.
+It is capable of identifying ``define``, function, ``struct``, ``typedef``,
+``enum`` and ``enum`` symbols and create cross-references for all of them.
 It is also capable of distinguish #define used for specifying a Linux
 ioctl.
 
-The optional rules file contains a set of rules like:
+The optional rules file contains a set of rules like::
 
     ignore ioctl VIDIOC_ENUM_FMT
     replace ioctl VIDIOC_DQBUF vidioc_qbuf
@@ -34,8 +34,8 @@ class ParseDataStructs:
     It is meant to allow having a more comprehensive documentation, where
     uAPI headers will create cross-reference links to the code.
 
-    It is capable of identifying defines, functions, structs, typedefs,
-    enums and enum symbols and create cross-references for all of them.
+    It is capable of identifying ``define``, function, ``struct``, ``typedef``,
+    ``enum`` and ``enum`` symbols and create cross-references for all of them.
     It is also capable of distinguish #define used for specifying a Linux
     ioctl.
 
@@ -43,13 +43,13 @@ class ParseDataStructs:
     allows parsing an exception file. Such file contains a set of rules
     using the syntax below:
 
-    1. Ignore rules:
+    1. Ignore rules::
 
         ignore <type> <symbol>`
 
     Removes the symbol from reference generation.
 
-    2. Replace rules:
+    2. Replace rules::
 
         replace <type> <old_symbol> <new_reference>
 
@@ -58,22 +58,22 @@ class ParseDataStructs:
         - A simple symbol name;
         - A full Sphinx reference.
 
-    3. Namespace rules
+    3. Namespace rules::
 
         namespace <namespace>
 
        Sets C namespace to be used during cross-reference generation. Can
        be overridden by replace rules.
 
-    On ignore and replace rules, <type> can be:
-        - ioctl: for defines that end with _IO*, e.g. ioctl definitions
-        - define: for other defines
-        - symbol: for symbols defined within enums;
-        - typedef: for typedefs;
-        - enum: for the name of a non-anonymous enum;
-        - struct: for structs.
+    On ignore and replace rules, ``<type>`` can be:
+        - ``ioctl``: for defines that end with ``_IO*``, e.g. ioctl definitions
+        - ``define``: for other defines
+        - ``symbol``: for symbols defined within enums;
+        - ``typedef``: for typedefs;
+        - ``enum``: for the name of a non-anonymous enum;
+        - ``struct``: for structs.
 
-    Examples:
+    Examples::
 
         ignore define __LINUX_MEDIA_H
         ignore ioctl VIDIOC_ENUM_FMT
@@ -83,13 +83,15 @@ class ParseDataStructs:
         namespace MC
     """
 
-    # Parser regexes with multiple ways to capture enums and structs
+    #: Parser regex with multiple ways to capture enums.
     RE_ENUMS = [
         re.compile(r"^\s*enum\s+([\w_]+)\s*\{"),
         re.compile(r"^\s*enum\s+([\w_]+)\s*$"),
         re.compile(r"^\s*typedef\s*enum\s+([\w_]+)\s*\{"),
         re.compile(r"^\s*typedef\s*enum\s+([\w_]+)\s*$"),
     ]
+
+    #: Parser regex with multiple ways to capture structs.
     RE_STRUCTS = [
         re.compile(r"^\s*struct\s+([_\w][\w\d_]+)\s*\{"),
         re.compile(r"^\s*struct\s+([_\w][\w\d_]+)$"),
@@ -97,11 +99,13 @@ class ParseDataStructs:
         re.compile(r"^\s*typedef\s*struct\s+([_\w][\w\d_]+)$"),
     ]
 
-    # FIXME: the original code was written a long time before Sphinx C
+    # NOTE: the original code was written a long time before Sphinx C
     # domain to have multiple namespaces. To avoid to much turn at the
     # existing hyperlinks, the code kept using "c:type" instead of the
     # right types. To change that, we need to change the types not only
     # here, but also at the uAPI media documentation.
+
+    #: Dictionary containing C type identifiers to be transformed.
     DEF_SYMBOL_TYPES = {
         "ioctl": {
             "prefix": "\\ ",
@@ -158,6 +162,10 @@ class ParseDataStructs:
             self.symbols[symbol_type] = {}
 
     def read_exceptions(self, fname: str):
+        """
+        Read an optional exceptions file, used to override defaults.
+        """
+
         if not fname:
             return
 
@@ -242,9 +250,9 @@ class ParseDataStructs:
     def store_type(self, ln, symbol_type: str, symbol: str,
                    ref_name: str = None, replace_underscores: bool = True):
         """
-        Stores a new symbol at self.symbols under symbol_type.
+        Store a new symbol at self.symbols under symbol_type.
 
-        By default, underscores are replaced by "-"
+        By default, underscores are replaced by ``-``.
         """
         defs = self.DEF_SYMBOL_TYPES[symbol_type]
 
@@ -276,12 +284,16 @@ class ParseDataStructs:
         self.symbols[symbol_type][symbol] = (f"{prefix}{ref_link}{suffix}", ln)
 
     def store_line(self, line):
-        """Stores a line at self.data, properly indented"""
+        """
+        Store a line at self.data, properly indented.
+        """
         line = "    " + line.expandtabs()
         self.data += line.rstrip(" ")
 
     def parse_file(self, file_in: str, exceptions: str = None):
-        """Reads a C source file and get identifiers"""
+        """
+        Read a C source file and get identifiers.
+        """
         self.data = ""
         is_enum = False
         is_comment = False
@@ -433,7 +445,7 @@ class ParseDataStructs:
 
     def gen_toc(self):
         """
-        Create a list of symbols to be part of a TOC contents table
+        Create a list of symbols to be part of a TOC contents table.
         """
         text = []
 
@@ -464,6 +476,10 @@ class ParseDataStructs:
         return "\n".join(text)
 
     def write_output(self, file_in: str, file_out: str, toc: bool):
+        """
+        Write a ReST output file.
+        """
+
         title = os.path.basename(file_in)
 
         if toc:
-- 
cgit v1.2.3


From 7ef684c9fdb336b1e102c014da2424c0240196c2 Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Date: Mon, 19 Jan 2026 17:23:16 +0100
Subject: docs: kdoc: enrich_formatter: Improve docstrings and comments

In preparation to document kernel-doc module, improve its
documentation.

Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
Message-ID: <55ec8b896fe00529d326859cd094230fb5a2cd30.1768838938.git.mchehab+huawei@kernel.org>
---
 tools/lib/python/kdoc/enrich_formatter.py | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/python/kdoc/enrich_formatter.py b/tools/lib/python/kdoc/enrich_formatter.py
index bb171567a4ca..d1be4e5e1962 100644
--- a/tools/lib/python/kdoc/enrich_formatter.py
+++ b/tools/lib/python/kdoc/enrich_formatter.py
@@ -26,12 +26,16 @@ class EnrichFormatter(argparse.HelpFormatter):
     and how they're used at the __doc__ description.
     """
     def __init__(self, *args, **kwargs):
-        """Initialize class and check if is TTY"""
+        """
+        Initialize class and check if is TTY.
+        """
         super().__init__(*args, **kwargs)
         self._tty = sys.stdout.isatty()
 
     def enrich_text(self, text):
-        """Handle ReST markups (currently, only ``foo``)"""
+        r"""
+        Handle ReST markups (currently, only \`\`text\`\` markups).
+        """
         if self._tty and text:
             # Replace ``text`` with ANSI SGR (bold)
             return re.sub(r'\`\`(.+?)\`\`',
@@ -39,12 +43,16 @@ class EnrichFormatter(argparse.HelpFormatter):
         return text
 
     def _fill_text(self, text, width, indent):
-        """Enrich descriptions with markups on it"""
+        """
+        Enrich descriptions with markups on it.
+        """
         enriched = self.enrich_text(text)
         return "\n".join(indent + line for line in enriched.splitlines())
 
     def _format_usage(self, usage, actions, groups, prefix):
-        """Enrich positional arguments at usage: line"""
+        """
+        Enrich positional arguments at usage: line.
+        """
 
         prog = self._prog
         parts = []
@@ -63,7 +71,9 @@ class EnrichFormatter(argparse.HelpFormatter):
         return usage_text
 
     def _format_action_invocation(self, action):
-        """Enrich argument names"""
+        """
+        Enrich argument names.
+        """
         if not action.option_strings:
             return self.enrich_text(f"``{action.dest.upper()}``")
 
-- 
cgit v1.2.3


From 33220c1fc10b2e53f0a79cdf6447fd7bf405a860 Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Date: Mon, 19 Jan 2026 17:23:17 +0100
Subject: docs: kdoc: python_version: Improve docstrings and comments

In preparation to document kernel-doc module, improve its
documentation.

Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
Message-ID: <2153afaeb496e1bb8d3cc318fff26c3f99d99486.1768838938.git.mchehab+huawei@kernel.org>
---
 tools/lib/python/kdoc/python_version.py | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/python/kdoc/python_version.py b/tools/lib/python/kdoc/python_version.py
index e83088013db2..4ddb7ead5f56 100644
--- a/tools/lib/python/kdoc/python_version.py
+++ b/tools/lib/python/kdoc/python_version.py
@@ -33,21 +33,31 @@ class PythonVersion:
     """
 
     def __init__(self, version):
-        """Ïnitialize self.version tuple from a version string"""
+        """
+        Ïnitialize self.version tuple from a version string.
+        """
         self.version = self.parse_version(version)
 
     @staticmethod
     def parse_version(version):
-        """Convert a major.minor.patch version into a tuple"""
+        """
+        Convert a major.minor.patch version into a tuple.
+        """
         return tuple(int(x) for x in version.split("."))
 
     @staticmethod
     def ver_str(version):
-        """Returns a version tuple as major.minor.patch"""
+        """
+        Returns a version tuple as major.minor.patch.
+        """
         return ".".join([str(x) for x in version])
 
     @staticmethod
     def cmd_print(cmd, max_len=80):
+        """
+        Outputs a command line, repecting maximum width.
+        """
+
         cmd_line = []
 
         for w in cmd:
@@ -66,7 +76,9 @@ class PythonVersion:
         return "\n  ".join(cmd_line)
 
     def __str__(self):
-        """Returns a version tuple as major.minor.patch from self.version"""
+        """
+        Return a version tuple as major.minor.patch from self.version.
+        """
         return self.ver_str(self.version)
 
     @staticmethod
-- 
cgit v1.2.3


From 66c3bf974d48f8e5c5f94148e1171b62bd80e26d Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Date: Mon, 19 Jan 2026 17:23:20 +0100
Subject: docs: python: abi_parser: do some improvements at documentation

Add documentation for two consts and ensure that all sentenses
will end with a dot.

Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
Message-ID: <c5756d7fd70697890130b41b2856c59144d01844.1768838938.git.mchehab+huawei@kernel.org>
---
 tools/lib/python/abi/abi_parser.py | 33 ++++++++++++++++++---------------
 1 file changed, 18 insertions(+), 15 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/python/abi/abi_parser.py b/tools/lib/python/abi/abi_parser.py
index 9b8db70067ef..d7bb20ef3acc 100644
--- a/tools/lib/python/abi/abi_parser.py
+++ b/tools/lib/python/abi/abi_parser.py
@@ -21,14 +21,17 @@ from abi.helpers import AbiDebug, ABI_DIR
 
 
 class AbiParser:
-    """Main class to parse ABI files"""
+    """Main class to parse ABI files."""
 
+    #: Valid tags at Documentation/ABI.
     TAGS = r"(what|where|date|kernelversion|contact|description|users)"
+
+    #: ABI elements that will auto-generate cross-references.
     XREF = r"(?:^|\s|\()(\/(?:sys|config|proc|dev|kvd)\/[^,.:;\)\s]+)(?:[,.:;\)\s]|\Z)"
 
     def __init__(self, directory, logger=None,
                  enable_lineno=False, show_warnings=True, debug=0):
-        """Stores arguments for the class and initialize class vars"""
+        """Stores arguments for the class and initialize class vars."""
 
         self.directory = directory
         self.enable_lineno = enable_lineno
@@ -65,7 +68,7 @@ class AbiParser:
         self.re_xref_node = re.compile(self.XREF)
 
     def warn(self, fdata, msg, extra=None):
-        """Displays a parse error if warning is enabled"""
+        """Displays a parse error if warning is enabled."""
 
         if not self.show_warnings:
             return
@@ -77,7 +80,7 @@ class AbiParser:
         self.log.warning(msg)
 
     def add_symbol(self, what, fname, ln=None, xref=None):
-        """Create a reference table describing where each 'what' is located"""
+        """Create a reference table describing where each 'what' is located."""
 
         if what not in self.what_symbols:
             self.what_symbols[what] = {"file": {}}
@@ -92,7 +95,7 @@ class AbiParser:
             self.what_symbols[what]["xref"] = xref
 
     def _parse_line(self, fdata, line):
-        """Parse a single line of an ABI file"""
+        """Parse a single line of an ABI file."""
 
         new_what = False
         new_tag = False
@@ -264,7 +267,7 @@ class AbiParser:
             self.warn(fdata, "Unexpected content", line)
 
     def parse_readme(self, nametag, fname):
-        """Parse ABI README file"""
+        """Parse ABI README file."""
 
         nametag["what"] = ["Introduction"]
         nametag["path"] = "README"
@@ -282,7 +285,7 @@ class AbiParser:
                 nametag["description"] += line
 
     def parse_file(self, fname, path, basename):
-        """Parse a single file"""
+        """Parse a single file."""
 
         ref = f"abi_file_{path}_{basename}"
         ref = self.re_unprintable.sub("_", ref).strip("_")
@@ -348,7 +351,7 @@ class AbiParser:
                     self.add_symbol(what=w, fname=fname, xref=fdata.key)
 
     def _parse_abi(self, root=None):
-        """Internal function to parse documentation ABI recursively"""
+        """Internal function to parse documentation ABI recursively."""
 
         if not root:
             root = self.directory
@@ -377,7 +380,7 @@ class AbiParser:
                 self.parse_file(name, path, basename)
 
     def parse_abi(self, root=None):
-        """Parse documentation ABI"""
+        """Parse documentation ABI."""
 
         self._parse_abi(root)
 
@@ -385,7 +388,7 @@ class AbiParser:
             self.log.debug(pformat(self.data))
 
     def desc_txt(self, desc):
-        """Print description as found inside ABI files"""
+        """Print description as found inside ABI files."""
 
         desc = desc.strip(" \t\n")
 
@@ -393,7 +396,7 @@ class AbiParser:
 
     def xref(self, fname):
         """
-        Converts a Documentation/ABI + basename into a ReST cross-reference
+        Converts a Documentation/ABI + basename into a ReST cross-reference.
         """
 
         xref = self.file_refs.get(fname)
@@ -403,7 +406,7 @@ class AbiParser:
             return xref
 
     def desc_rst(self, desc):
-        """Enrich ReST output by creating cross-references"""
+        """Enrich ReST output by creating cross-references."""
 
         # Remove title markups from the description
         # Having titles inside ABI files will only work if extra
@@ -459,7 +462,7 @@ class AbiParser:
 
     def doc(self, output_in_txt=False, show_symbols=True, show_file=True,
             filter_path=None):
-        """Print ABI at stdout"""
+        """Print ABI at stdout."""
 
         part = None
         for key, v in sorted(self.data.items(),
@@ -549,7 +552,7 @@ class AbiParser:
             yield (msg, file_ref[0][0], ln)
 
     def check_issues(self):
-        """Warn about duplicated ABI entries"""
+        """Warn about duplicated ABI entries."""
 
         for what, v in self.what_symbols.items():
             files = v.get("file")
@@ -575,7 +578,7 @@ class AbiParser:
             self.log.warning("%s is defined %d times: %s", what, len(f), "; ".join(f))
 
     def search_symbols(self, expr):
-        """ Searches for ABI symbols """
+        """ Searches for ABI symbols."""
 
         regex = re.compile(expr, re.I)
 
-- 
cgit v1.2.3


From ff91637dece7f4e108f7a2e76bd7e1054d24f600 Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Date: Mon, 19 Jan 2026 17:23:21 +0100
Subject: docs: python: abi_regex: do some improvements at documentation

Add documentation for two consts and ensure that all sentenses
will end with a dot.

Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
Message-ID: <5419ad89a5042c1571198c2f055866674808579b.1768838938.git.mchehab+huawei@kernel.org>
---
 tools/lib/python/abi/abi_regex.py | 26 ++++++++++++++++++++------
 1 file changed, 20 insertions(+), 6 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/python/abi/abi_regex.py b/tools/lib/python/abi/abi_regex.py
index d5553206de3c..d0c5e3ede6b5 100644
--- a/tools/lib/python/abi/abi_regex.py
+++ b/tools/lib/python/abi/abi_regex.py
@@ -16,10 +16,22 @@ from abi.abi_parser import AbiParser
 from abi.helpers import AbiDebug
 
 class AbiRegex(AbiParser):
-    """Extends AbiParser to search ABI nodes with regular expressions"""
+    """
+    Extends AbiParser to search ABI nodes with regular expressions.
 
-    # Escape only ASCII visible characters
+    There some optimizations here to allow a quick symbol search:
+    instead of trying to place all symbols altogether an doing linear
+    search which is very time consuming, create a tree with one depth,
+    grouping similar symbols altogether.
+
+    Yet, sometimes a full search will be needed, so we have a special branch
+    on such group tree where other symbols are placed.
+    """
+
+    #: Escape only ASCII visible characters.
     escape_symbols = r"([\x21-\x29\x2b-\x2d\x3a-\x40\x5c\x60\x7b-\x7e])"
+
+    #: Special group for other nodes.
     leave_others = "others"
 
     # Tuples with regular expressions to be compiled and replacement data
@@ -88,13 +100,15 @@ class AbiRegex(AbiParser):
         # Recover plus characters
         (re.compile(r"\xf7"), "+"),
     ]
+
+    #: Regex to check if the symbol name has a number on it.
     re_has_num = re.compile(r"\\d")
 
-    # Symbol name after escape_chars that are considered a devnode basename
+    #: Symbol name after escape_chars that are considered a devnode basename.
     re_symbol_name =  re.compile(r"(\w|\\[\.\-\:])+$")
 
-    # List of popular group names to be skipped to minimize regex group size
-    # Use AbiDebug.SUBGROUP_SIZE to detect those
+    #: List of popular group names to be skipped to minimize regex group size
+    #: Use AbiDebug.SUBGROUP_SIZE to detect those.
     skip_names = set(["devices", "hwmon"])
 
     def regex_append(self, what, new):
@@ -148,7 +162,7 @@ class AbiRegex(AbiParser):
     def get_regexes(self, what):
         """
         Given an ABI devnode, return a list of all regular expressions that
-        may match it, based on the sub-groups created by regex_append()
+        may match it, based on the sub-groups created by regex_append().
         """
 
         re_list = []
-- 
cgit v1.2.3


From a50c62d375a824046a7baa9cb03e5a7e8bf7c6c4 Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Date: Mon, 19 Jan 2026 17:23:22 +0100
Subject: docs: kabi: system_symbols: end docstring phrases with a dot

Some docstring classes are not ending with a dot. Fix to make it
more uniform.

Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
Message-ID: <efd0e150d8e12d8ea2665f54a96b1997f32897b7.1768838938.git.mchehab+huawei@kernel.org>
---
 tools/lib/python/abi/system_symbols.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/python/abi/system_symbols.py b/tools/lib/python/abi/system_symbols.py
index 4a2554da217b..7bbefd274ea2 100644
--- a/tools/lib/python/abi/system_symbols.py
+++ b/tools/lib/python/abi/system_symbols.py
@@ -18,11 +18,11 @@ from random import shuffle
 from abi.helpers import AbiDebug
 
 class SystemSymbols:
-    """Stores arguments for the class and initialize class vars"""
+    """Stores arguments for the class and initialize class vars."""
 
     def graph_add_file(self, path, link=None):
         """
-        add a file path to the sysfs graph stored at self.root
+        add a file path to the sysfs graph stored at self.root.
         """
 
         if path in self.files:
@@ -43,7 +43,7 @@ class SystemSymbols:
         self.files.add(path)
 
     def print_graph(self, root_prefix="", root=None, level=0):
-        """Prints a reference tree graph using UTF-8 characters"""
+        """Prints a reference tree graph using UTF-8 characters."""
 
         if not root:
             root = self.root
@@ -173,7 +173,7 @@ class SystemSymbols:
         self._walk(sysfs)
 
     def check_file(self, refs, found):
-        """Check missing ABI symbols for a given sysfs file"""
+        """Check missing ABI symbols for a given sysfs file."""
 
         res_list = []
 
@@ -214,7 +214,7 @@ class SystemSymbols:
         return res_list
 
     def _ref_interactor(self, root):
-        """Recursive function to interact over the sysfs tree"""
+        """Recursive function to interact over the sysfs tree."""
 
         for k, v in root.items():
             if isinstance(v, dict):
@@ -232,7 +232,7 @@ class SystemSymbols:
 
 
     def get_fileref(self, all_refs, chunk_size):
-        """Interactor to group refs into chunks"""
+        """Interactor to group refs into chunks."""
 
         n = 0
         refs = []
@@ -250,7 +250,7 @@ class SystemSymbols:
 
     def check_undefined_symbols(self, max_workers=None, chunk_size=50,
                                 found=None, dry_run=None):
-        """Seach ABI for sysfs symbols missing documentation"""
+        """Seach ABI for sysfs symbols missing documentation."""
 
         self.abi.parse_abi()
 
-- 
cgit v1.2.3


From 5c9ece0b02b219e8502f66b8d9636d511280126d Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Date: Mon, 19 Jan 2026 17:23:23 +0100
Subject: docs: kabi: helpers: add helper for debug bits 7 and 8

The kabi logic supports 8 debug bits, but only 6 are currently
documented. Document the remaining ones.

Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
Message-ID: <60e99b9060396eac8621954d6b8a73af45df90fb.1768838938.git.mchehab+huawei@kernel.org>
---
 tools/lib/python/abi/helpers.py | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'tools')

diff --git a/tools/lib/python/abi/helpers.py b/tools/lib/python/abi/helpers.py
index 639b23e4ca33..b8c8dfb1272e 100644
--- a/tools/lib/python/abi/helpers.py
+++ b/tools/lib/python/abi/helpers.py
@@ -35,4 +35,6 @@ DEBUG_HELP = """
 16 - enable debug for what to regex conversion
 32 - enable debug for symbol regex subgroups
 64 - enable debug for sysfs graph tree variable
+128 - enable debug of search groups
+256 - enable displaying refrence tree graphs for undefined symbols.
 """
-- 
cgit v1.2.3


From b713adadf8be2d75dd6cfb626aec143d7461b100 Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Date: Mon, 19 Jan 2026 17:23:24 +0100
Subject: docs: kabi: helpers: add documentation for each "enum" value

Ensure that kABI module documentation will describe each
debug bit.

Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
Message-ID: <3b118b157e52d757bf82fd74f03b0f4bd9e8b8f1.1768838938.git.mchehab+huawei@kernel.org>
---
 tools/lib/python/abi/helpers.py | 40 ++++++++++++++++++++--------------------
 1 file changed, 20 insertions(+), 20 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/python/abi/helpers.py b/tools/lib/python/abi/helpers.py
index b8c8dfb1272e..2a378d780d3c 100644
--- a/tools/lib/python/abi/helpers.py
+++ b/tools/lib/python/abi/helpers.py
@@ -13,28 +13,28 @@ ABI_DIR = "Documentation/ABI/"
 class AbiDebug:
     """Debug levels"""
 
-    WHAT_PARSING = 1
-    WHAT_OPEN = 2
-    DUMP_ABI_STRUCTS = 4
-    UNDEFINED = 8
-    REGEX = 16
-    SUBGROUP_MAP = 32
-    SUBGROUP_DICT = 64
-    SUBGROUP_SIZE = 128
-    GRAPH = 256
-
+    WHAT_PARSING = 1        #: Enable debug parsing logic.
+    WHAT_OPEN = 2           #: Enable debug messages on file open.
+    DUMP_ABI_STRUCTS = 4    #: Enable debug for ABI parse data.
+    UNDEFINED = 8           #: Enable extra undefined symbol data.
+    REGEX = 16              #: Enable debug for what to regex conversion.
+    SUBGROUP_MAP = 32       #: Enable debug for symbol regex subgroups
+    SUBGROUP_DICT = 64      #: Enable debug for sysfs graph tree variable.
+    SUBGROUP_SIZE = 128     #: Enable debug of search groups.
+    GRAPH = 256             #: Display ref tree graph for undefined symbols.
 
+#: Helper messages for each debug variable
 DEBUG_HELP = """
-1  - enable debug parsing logic
-2  - enable debug messages on file open
-4  - enable debug for ABI parse data
-8  - enable extra debug information to identify troubles
-     with ABI symbols found at the local machine that
-     weren't found on ABI documentation (used only for
-     undefined subcommand)
-16 - enable debug for what to regex conversion
-32 - enable debug for symbol regex subgroups
-64 - enable debug for sysfs graph tree variable
+1   - enable debug parsing logic
+2   - enable debug messages on file open
+4   - enable debug for ABI parse data
+8   - enable extra debug information to identify troubles
+      with ABI symbols found at the local machine that
+      weren't found on ABI documentation (used only for
+      undefined subcommand)
+16  - enable debug for what to regex conversion
+32  - enable debug for symbol regex subgroups
+64  - enable debug for sysfs graph tree variable
 128 - enable debug of search groups
 256 - enable displaying refrence tree graphs for undefined symbols.
 """
-- 
cgit v1.2.3


From 8b85f614f3b68a8a58762c8f8defbcebf6f0282a Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Date: Mon, 19 Jan 2026 17:23:26 +0100
Subject: docs: jobserver: do some documentation improvements

Make Sphinx handle better jobserver class documentation

Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
Message-ID: <18a9c1406bdead680e3ee5768c97ae8b2138e8ea.1768838938.git.mchehab+huawei@kernel.org>
---
 tools/lib/python/jobserver.py | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/python/jobserver.py b/tools/lib/python/jobserver.py
index 616411087725..8da1973e5c87 100755
--- a/tools/lib/python/jobserver.py
+++ b/tools/lib/python/jobserver.py
@@ -11,20 +11,23 @@ Interacts with the POSIX jobserver during the Kernel build time.
 A "normal" jobserver task, like the one initiated by a make subrocess would do:
 
     - open read/write file descriptors to communicate with the job server;
-    - ask for one slot by calling:
+    - ask for one slot by calling::
+
         claim = os.read(reader, 1)
-    - when the job finshes, call:
+
+    - when the job finshes, call::
+
         os.write(writer, b"+")  # os.write(writer, claim)
 
 Here, the goal is different: This script aims to get the remaining number
 of slots available, using all of them to run a command which handle tasks in
 parallel. To to that, it has a loop that ends only after there are no
 slots left. It then increments the number by one, in order to allow a
-call equivalent to make -j$((claim+1)), e.g. having a parent make creating
+call equivalent to ``make -j$((claim+1))``, e.g. having a parent make creating
 $claim child to do the actual work.
 
 The end goal here is to keep the total number of build tasks under the
-limit established by the initial make -j$n_proc call.
+limit established by the initial ``make -j$n_proc`` call.
 
 See:
     https://www.gnu.org/software/make/manual/html_node/POSIX-Jobserver.html#POSIX-Jobserver
@@ -40,13 +43,14 @@ class JobserverExec:
     Claim all slots from make using POSIX Jobserver.
 
     The main methods here are:
+
     - open(): reserves all slots;
     - close(): method returns all used slots back to make;
-    - run(): executes a command setting PARALLELISM=<available slots jobs + 1>
+    - run(): executes a command setting PARALLELISM=<available slots jobs + 1>.
     """
 
     def __init__(self):
-        """Initialize internal vars"""
+        """Initialize internal vars."""
         self.claim = 0
         self.jobs = b""
         self.reader = None
@@ -54,7 +58,7 @@ class JobserverExec:
         self.is_open = False
 
     def open(self):
-        """Reserve all available slots to be claimed later on"""
+        """Reserve all available slots to be claimed later on."""
 
         if self.is_open:
             return
@@ -118,7 +122,7 @@ class JobserverExec:
         self.is_open = True
 
     def close(self):
-        """Return all reserved slots to Jobserver"""
+        """Return all reserved slots to Jobserver."""
 
         if not self.is_open:
             return
-- 
cgit v1.2.3


From ef6aa110d8888a14dfb2e843794097263c45a06b Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Date: Mon, 19 Jan 2026 17:23:28 +0100
Subject: docs: parse_features: make documentation more consistent

Do some changes to:
- add missing documentation strings to vars;
- add a missing docstring;
- ensure that phases will end with a period.

Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
Message-ID: <3722f10361638561a5ced18cf4f409930c88270b.1768838938.git.mchehab+huawei@kernel.org>
---
 tools/lib/python/feat/parse_features.py | 27 ++++++++++++++++++++-------
 1 file changed, 20 insertions(+), 7 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/python/feat/parse_features.py b/tools/lib/python/feat/parse_features.py
index b88c04d3e2fe..41a51d9d6f62 100755
--- a/tools/lib/python/feat/parse_features.py
+++ b/tools/lib/python/feat/parse_features.py
@@ -21,14 +21,25 @@ class ParseFeature:
     from it.
     """
 
+    #: feature header string.
     h_name = "Feature"
+
+    #: Kernel config header string.
     h_kconfig = "Kconfig"
+
+    #: description header string.
     h_description = "Description"
+
+    #: subsystem header string.
     h_subsys = "Subsystem"
+
+    #: status header string.
     h_status = "Status"
+
+    #: architecture header string.
     h_arch = "Architecture"
 
-    # Sort order for status. Others will be mapped at the end.
+    #: Sort order for status. Others will be mapped at the end.
     status_map = {
         "ok":   0,
         "TODO": 1,
@@ -40,7 +51,7 @@ class ParseFeature:
 
     def __init__(self, prefix, debug=0, enable_fname=False):
         """
-        Sets internal variables
+        Sets internal variables.
         """
 
         self.prefix = prefix
@@ -63,11 +74,13 @@ class ParseFeature:
         self.msg = ""
 
     def emit(self, msg="", end="\n"):
+        """Helper function to append a new message for feature output."""
+
         self.msg += msg + end
 
     def parse_error(self, fname, ln, msg, data=None):
         """
-        Displays an error message, printing file name and line
+        Displays an error message, printing file name and line.
         """
 
         if ln:
@@ -82,7 +95,7 @@ class ParseFeature:
             print("", file=sys.stderr)
 
     def parse_feat_file(self, fname):
-        """Parses a single arch-support.txt feature file"""
+        """Parses a single arch-support.txt feature file."""
 
         if os.path.isdir(fname):
             return
@@ -204,7 +217,7 @@ class ParseFeature:
         self.max_size_arch_with_header = self.max_size_arch + len(self.h_arch)
 
     def parse(self):
-        """Parses all arch-support.txt feature files inside self.prefix"""
+        """Parses all arch-support.txt feature files inside self.prefix."""
 
         path = os.path.expanduser(self.prefix)
 
@@ -281,7 +294,7 @@ class ParseFeature:
 
     def output_feature(self, feat):
         """
-        Output a feature on all architectures
+        Output a feature on all architectures.
         """
 
         title = f"Feature {feat}"
@@ -331,7 +344,7 @@ class ParseFeature:
 
     def matrix_lines(self, desc_size, max_size_status, header):
         """
-        Helper function to split element tables at the output matrix
+        Helper function to split element tables at the output matrix.
         """
 
         if header:
-- 
cgit v1.2.3


From 40146bf7555e9c3480b1225dfe7a5306e1b58b13 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Wed, 21 Jan 2026 17:11:36 +0100
Subject: selftests: net: tests for add double tunneling GRO/GSO

Create a simple, netns-based topology with double, nested UDP tunnels and
perform TSO transfers on top.

Explicitly enable GSO and/or GRO and check the skb layout consistency with
different configuration allowing (or not) GSO frames to be delivered on
the other end.

The trickest part is account in a robust way the aggregated/unaggregated
packets with double encapsulation: use a classic bpf filter for it.

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Reviewed-by: Petr Machata <petrm@nvidia.com>
Link: https://patch.msgid.link/61f2c98ba0f73057c2d6f6cb62eb807abd90bf6b.1769011015.git.pabeni@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/Makefile            |   1 +
 tools/testing/selftests/net/config              |   1 +
 tools/testing/selftests/net/double_udp_encap.sh | 393 ++++++++++++++++++++++++
 3 files changed, 395 insertions(+)
 create mode 100755 tools/testing/selftests/net/double_udp_encap.sh

(limited to 'tools')

diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile
index ce9699092f50..33f56fcbde09 100644
--- a/tools/testing/selftests/net/Makefile
+++ b/tools/testing/selftests/net/Makefile
@@ -22,6 +22,7 @@ TEST_PROGS := \
 	cmsg_so_mark.sh \
 	cmsg_so_priority.sh \
 	cmsg_time.sh \
+	double_udp_encap.sh \
 	drop_monitor_tests.sh \
 	fcnal-ipv4.sh \
 	fcnal-ipv6.sh \
diff --git a/tools/testing/selftests/net/config b/tools/testing/selftests/net/config
index b84362b9b508..cd49b7dfe216 100644
--- a/tools/testing/selftests/net/config
+++ b/tools/testing/selftests/net/config
@@ -77,6 +77,7 @@ CONFIG_NET_DROP_MONITOR=m
 CONFIG_NETFILTER=y
 CONFIG_NETFILTER_ADVANCED=y
 CONFIG_NETFILTER_XTABLES_LEGACY=y
+CONFIG_NETFILTER_XT_MATCH_BPF=m
 CONFIG_NETFILTER_XT_MATCH_LENGTH=m
 CONFIG_NETFILTER_XT_MATCH_POLICY=m
 CONFIG_NETFILTER_XT_NAT=m
diff --git a/tools/testing/selftests/net/double_udp_encap.sh b/tools/testing/selftests/net/double_udp_encap.sh
new file mode 100755
index 000000000000..9aaf97cdf141
--- /dev/null
+++ b/tools/testing/selftests/net/double_udp_encap.sh
@@ -0,0 +1,393 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+source lib.sh
+
+# shellcheck disable=SC2155 # prefer RO variable over return value from cmd
+readonly CLI="$(dirname "$(readlink -f "$0")")/../../../net/ynl/pyynl/cli.py"
+
+readonly SRC=1
+readonly DST=2
+
+readonly NET_V4=192.168.1.
+readonly NET_V6=2001:db8::
+readonly OL1_NET_V4=172.16.1.
+readonly OL1_NET_V6=2001:db8:1::
+readonly OL2_NET_V4=172.16.2.
+readonly OL2_NET_V6=2001:db8:2::
+
+trap cleanup_all_ns EXIT
+
+# shellcheck disable=SC2329 # can't figure out usage trough a variable
+is_ipv6() {
+	if [[ $1 =~ .*:.* ]]; then
+		return 0
+	fi
+	return 1
+}
+
+# shellcheck disable=SC2329 # can't figure out usage trough a variable
+create_gnv_endpoint() {
+	local -r netns=$1
+	local -r bm_rem_addr=$2
+	local -r gnv_dev=$3
+	local -r gnv_id=$4
+	local opts=$5
+	local gnv_json
+	local rem
+
+	if is_ipv6 "$bm_rem_addr"; then
+		rem=remote6
+	else
+		rem=remote
+	fi
+
+	# add ynl opt separator, if needed
+	[ -n "$opts" ] && opts=", $opts"
+
+	gnv_json="{ \"id\": $gnv_id, \"$rem\": \"$bm_rem_addr\"$opts }"
+	ip netns exec "$netns" "$CLI" --family rt-link --create --excl \
+	   --do newlink  --json "{\"ifname\": \"$gnv_dev\",
+	       \"linkinfo\": {\"kind\":\"geneve\",
+	       \"data\": $gnv_json } }" > /dev/null
+	ip -n "$netns" link set dev "$gnv_dev" up
+}
+
+# shellcheck disable=SC2329 # can't figure out usage trough a variable
+create_vxlan_endpoint() {
+	local -r netns=$1
+	local -r bm_rem_addr=$2
+	local -r vxlan_dev=$3
+	local -r vxlan_id=$4
+	local -r opts_str=$5
+	local oldifs
+	local -a opts
+	local opt
+
+	# convert the arguments from yaml format
+	oldifs=$IFS
+	IFS=','
+	for opt in $opts_str; do
+		local pattern='"port":'
+
+		[ -n "$opt" ] || continue
+
+		opts+=("${opt/$pattern*/dstport}" "${opt/$pattern/}")
+	done
+	IFS=$oldifs
+	[ ${#opts[@]} -gt 0 ] || opts+=("dstport" "4789")
+
+	ip -n "$netns" link add "$vxlan_dev" type vxlan id "$vxlan_id" \
+	   remote "$bm_rem_addr" "${opts[@]}"
+	ip -n "$netns" link set dev "$vxlan_dev" up
+}
+
+create_ns() {
+	local nested_opt='"port":6082'
+	local create_endpoint
+	local options="$1"
+	local feature
+	local dev
+	local id
+	local ns
+
+	RET=0
+
+	#  +-------------+    +-------------+
+	#  | NS_SRC      |    | NS_NST_DST  |
+	#  |             |    |             |
+	#  |   gnv_nst1  |    |  gnv_nst2   |
+	#  |   +         |    |         +   |
+	#  |   |         |    |         |   |
+	#  |   +         |    |         +   |
+	#  |  gnv1       |    |        gnv2 |
+	#  |   +         |    |         +   |
+	#  |   |         |    |         |   |
+	#  |   + veth1 +--------+ veth2 +   |
+	#  |             |    |             |
+	#  +-------------+    +-------------+
+
+	setup_ns NS_SRC NS_DST
+
+	# concatenate caller provided options and default one
+	[ -n "$2" ] && nested_opt="$nested_opt,$2"
+
+	ip link add name "veth$SRC" netns "$NS_SRC" type veth \
+	   peer name "veth$DST" netns "$NS_DST"
+	case "$ENCAP" in
+	vxlan)
+		create_endpoint=create_vxlan_endpoint
+		dev=vx
+		;;
+	geneve)
+		create_endpoint=create_gnv_endpoint
+		dev=gnv
+		;;
+	esac
+
+	id=1
+	for ns in "${NS_LIST[@]}"; do
+		ip -n "$ns" link set dev "veth$id" up
+
+		# ensure the sender can do large write just after 3whs
+		ip netns exec "$ns" \
+		   sysctl -qw net.ipv4.tcp_wmem="4096 4194304 4194304"
+
+		# note that 3 - $SRC == $DST and 3 - $DST == $SRC
+		if [ $FAMILY = "4" ]; then
+			ip -n "$ns" addr add dev "veth$id" "$NET_V4$id/24"
+			$create_endpoint "$ns" "$NET_V4$((3 - id))" \
+			   "$dev$id" 4 "$options"
+			ip -n "$ns" addr add dev "$dev$id" "$OL1_NET_V4$id/24"
+
+			# nested tunnel devices
+			# pmtu can't be propagated to upper layer devices;
+			# need manual adjust
+			$create_endpoint "$ns" "$OL1_NET_V4$((3 - id))" \
+			   "$dev"_nst"$id" 40 "$nested_opt"
+			ip -n "$ns" addr add dev "$dev"_nst"$id" \
+			   "$OL2_NET_V4$id/24"
+			ip -n "$ns" link set dev "$dev"_nst"$id" mtu 1392
+		else
+			ip -n "$ns" addr add dev "veth$id" "$NET_V6$id/64" \
+			   nodad
+			$create_endpoint "$ns" "$NET_V6$((3 - id))" \
+			   "$dev"6"$id" 6 "$options"
+			ip -n "$ns" addr add dev "$dev"6"$id" \
+			   "$OL1_NET_V6$id/64" nodad
+
+			$create_endpoint "$ns" "$OL1_NET_V6$((3 - id))" \
+			   "$dev"6_nst"$id" 60 "$nested_opt"
+			ip -n "$ns" addr add dev "$dev"6_nst"$id" \
+			   "$OL2_NET_V6$id/64" nodad
+			ip -n "$ns" link set dev "$dev"6_nst"$id" mtu 1352
+		fi
+		id=$((id+1))
+	done
+
+	# enable GRO heuristic on the veth peer and ensure UDP L4 over tunnel is
+	# actually segmented
+	for feature in tso tx-udp_tnl-segmentation; do
+		ip netns exec "$NS_SRC" ethtool -K "veth$SRC" \
+		   "$feature" off 2>/dev/null
+	done
+}
+
+create_ns_gso() {
+	local dev
+
+	create_ns "$@"
+	if [ "$ENCAP" = "geneve" ]; then
+		dev=gnv
+	else
+		dev=vx
+	fi
+	[ "$FAMILY" = "6" ] && dev="$dev"6
+	ip netns exec "$NS_SRC" ethtool -K "$dev$SRC" \
+	   tx-gso-partial on \
+	   tx-udp_tnl-segmentation on \
+	   tx-udp_tnl-csum-segmentation on
+}
+
+create_ns_gso_gro() {
+	create_ns_gso "$@"
+	ip netns exec "$NS_DST" ethtool -K "veth$DST" gro on
+	ip netns exec "$NS_SRC" ethtool -K "veth$SRC" tx off >/dev/null 2>&1
+}
+
+run_test() {
+	local -r dst=$NET$DST
+	local -r msg=$1
+	local -r total_size=$2
+	local -r encappkts=$3
+	local inner_proto_offset=0
+	local inner_maclen=14
+	local rx_family="-4"
+	local ipt=iptables
+	local bpf_filter
+	local -a rx_args
+	local wire_pkts
+	local rcvpkts
+	local encl=8
+	local dport
+	local pkts
+	local snd
+
+	if [ $FAMILY = "6" ]; then
+		ipt=ip6tables
+	else
+		# rx program does not support '-6' and implies ipv6 usage by
+		# default
+		rx_args=("$rx_family")
+	fi
+
+	# The received can only check fixed size packet
+	pkts=$((total_size / GSO_SIZE))
+	if [ -n "$4" ]; then
+		wire_pkts=$4
+	elif [ $((total_size % GSO_SIZE)) -eq 0 ]; then
+		wire_pkts=1
+		rx_args+=("-l" "$GSO_SIZE")
+	else
+		wire_pkts=2
+		pkts=$((pkts + 1))
+	fi
+
+	if [ "$ENCAP" = "geneve" ]; then
+		dport=6081
+	else
+		dport=4789
+	fi
+
+	# Either:
+	# - IPv4, nested tunnel carries UDP over IPv4, with dport 6082,
+	#   innermost is TCP over IPv4 on port 8000
+	# - IPv6, nested tunnel carries UDP over IPv6, with dport 6082,
+	#   innermost is TCP over IPv6 on port 8000
+	# The nested tunnel port is 6082 and the nested encap len is 8
+	# regardless of the encap type (no geneve opts).
+	# In inherit protocol mode there is no nested mac hdr and the nested
+	# l3 protocol type field belongs to the geneve hdr.
+	[ "$USE_HINT" = true ] && encl=16
+	[ "$INHERIT" = true ] && inner_maclen=0
+	[ "$INHERIT" = true ] && inner_proto_offset=-4
+	local inner=$((inner_maclen+encl))
+	local proto=$((inner_maclen+encl+inner_proto_offset))
+	bpf_filter=$(nfbpf_compile "(ip &&
+		ip[$((40+encl))] == 0x08 && ip[$((41+encl))] == 0x00 &&
+		ip[$((51+encl))] == 0x11 &&
+		ip[$((64+encl))] == 0x17 && ip[$((65+encl))] == 0xc2 &&
+		ip[$((76+proto))] == 0x08 && ip[$((77+proto))] == 0x00 &&
+		ip[$((87+inner))] == 0x6 &&
+		ip[$((100+inner))] == 0x1f && ip[$((101+inner))] == 0x40) ||
+		(ip6 &&
+		ip6[$((60+encl))] == 0x86 && ip6[$((61+encl))] == 0xdd &&
+		ip6[$((68+encl))] == 0x11 &&
+		ip6[$((104+encl))] == 0x17 && ip6[$((105+encl))] == 0xc2 &&
+		ip6[$((116+proto))] == 0x86 && ip6[$((117+proto))] == 0xdd &&
+		ip6[$((124+inner))] == 0x6 &&
+		ip6[$((160+inner))] == 0x1f && ip6[$((161+inner))] == 0x40)")
+
+	# ignore shorts packet, to avoid arp/mld induced noise
+	ip netns exec "$NS_SRC" "$ipt" -A OUTPUT -p udp --dport "$dport" \
+	   -m length --length 600:65535 -m bpf --bytecode "$bpf_filter"
+	ip netns exec "$NS_DST" "$ipt" -A INPUT -p udp --dport "$dport" \
+	   -m length --length 600:65535 -m bpf --bytecode "$bpf_filter"
+	ip netns exec "$NS_DST" ./udpgso_bench_rx -C 2000 -t -R 100 \
+	   -n "$pkts" "${rx_args[@]}" &
+	local pid=$!
+	wait_local_port_listen "$NS_DST" 8000 tcp
+	ip netns exec "$NS_SRC" ./udpgso_bench_tx -"$FAMILY" -t -M 1 \
+	   -s "$total_size" -D "$dst"
+	local ret=$?
+	check_err "$ret" "client failure exit code $ret"
+	wait "$pid"
+	ret=$?
+	check_err "$ret" "sever failure exit code $ret"
+
+	snd=$(ip netns exec "$NS_SRC" "$ipt"-save -c |
+	    grep "dport $dport" | sed -e 's/\[//' -e 's/:.*//')
+
+	[ "$snd" = "$wire_pkts" ]
+	# shellcheck disable=SC2319 # known false positive
+	check_err $? "send $snd packets on the lowest link, expected $wire_pkts"
+
+	rcvpkts=$(ip netns exec "$NS_DST" "$ipt"-save -c | \
+	   grep "dport $dport" | sed -e 's/\[//' -e 's/:.*//')
+
+	[ "$rcvpkts" = "$encappkts" ]
+	check_err $? "received $rcvpkts $ENCAP packets, expected $encappkts"
+	log_test "$msg"
+}
+
+run_tests() {
+	for FAMILY in 4 6; do
+		NET=$OL2_NET_V4
+		GSO_SIZE=1340 # 1392 - 20 - 32
+
+		if [ $FAMILY = 6 ]; then
+			NET=$OL2_NET_V6
+			GSO_SIZE=1280 # 1352 - 40 - 32
+		fi
+
+		echo "IPv$FAMILY"
+
+		unset USE_HINT
+		unset INHERIT
+
+		# "geneve" must be last encap in list, so that later
+		# test cases will run on it
+		for ENCAP in "vxlan" "geneve"; do
+			create_ns
+			run_test "No GSO - $ENCAP" $((GSO_SIZE * 4)) 4 4
+			cleanup_all_ns
+
+			create_ns_gso
+			run_test "GSO without GRO - $ENCAP" $((GSO_SIZE * 4)) \
+			   4 1
+			cleanup_all_ns
+
+			# IPv4 only test
+			[ $FAMILY = "4" ] || continue
+			create_ns_gso
+			ip netns exec "$NS_SRC" \
+			   sysctl -qw net.ipv4.ip_no_pmtu_disc=1
+			run_test "GSO disable due to no fixedid - $ENCAP" \
+			   $((GSO_SIZE * 4)) 4 4
+			cleanup_all_ns
+		done
+
+		# GRO tests imply/require geneve encap, the only one providing
+		# GRO hints
+		create_ns_gso_gro
+		run_test "double tunnel GRO, no hints" $((GSO_SIZE * 4)) 4
+		cleanup_all_ns
+
+		# hint option is expected for all the following tests in the RX
+		# path
+		USE_HINT=true
+		create_ns_gso_gro \
+		   '"gro-hint":1,"udp-zero-csum6-tx":1,"udp-zero-csum6-rx":1' \
+		   '"udp-zero-csum6-tx":1,"udp-zero-csum6-rx":1'
+		run_test "double tunnel GRO" $((GSO_SIZE * 4)) 1
+		cleanup_all_ns
+
+		create_ns_gso_gro '"gro-hint":1,"udp-csum":1' '"udp-csum":1'
+		run_test "double tunnel GRO - csum complete" $((GSO_SIZE * 4))\
+		   1
+		cleanup_all_ns
+
+		create_ns_gso_gro '"gro-hint":1' \
+		   '"udp-csum":0,"udp-zero-csum6-tx":1,"udp-zero-csum6-rx":1'
+		run_test "double tunnel GRO - no nested csum" \
+		   $((GSO_SIZE * 4)) 1
+		cleanup_all_ns
+
+		create_ns_gso_gro \
+		   '"gro-hint":1,"udp-zero-csum6-tx":1,"udp-zero-csum6-rx":1' \
+		   '"udp-csum":1'
+		run_test "double tunnel GRO - nested csum, outer 0-csum, skip"\
+		   $((GSO_SIZE * 4)) 4
+		cleanup_all_ns
+
+		INHERIT=true
+		create_ns_gso_gro '"gro-hint":1,"udp-csum":1' \
+		   '"udp-csum":1,"inner-proto-inherit":1'
+		run_test "double tunnel GRO - nested inherit proto" \
+		   $((GSO_SIZE * 4)) 1
+		cleanup_all_ns
+		unset INHERIT
+
+		create_ns_gso_gro '"gro-hint":1'
+		run_test "double tunnel GRO - short last pkt" \
+		   $((GSO_SIZE * 4 + GSO_SIZE / 2)) 2
+		cleanup_all_ns
+	done
+}
+
+require_command nfbpf_compile
+require_command jq
+
+# tcp retransmisions will break the accounting
+xfail_on_slow run_tests
+exit "$EXIT_STATUS"
-- 
cgit v1.2.3


From e073c118db0217e1dab14eb0088e7c6a8bf9ef63 Mon Sep 17 00:00:00 2001
From: Xu Du <xudu@redhat.com>
Date: Wed, 21 Jan 2026 18:04:55 +0800
Subject: selftest: tun: Format tun.c existing code

In preparation for adding new tests for GSO over UDP tunnels,
apply consistently the kernel style to the existing code.

Signed-off-by: Xu Du <xudu@redhat.com>
Link: https://patch.msgid.link/d797de1e5a3d215dd78cb46775772ef682bab60e.1768979440.git.xudu@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/tun.c | 25 +++++++++++++++----------
 1 file changed, 15 insertions(+), 10 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/tun.c b/tools/testing/selftests/net/tun.c
index 0efc67b0357a..128b0a5327d4 100644
--- a/tools/testing/selftests/net/tun.c
+++ b/tools/testing/selftests/net/tun.c
@@ -25,7 +25,7 @@ static int tun_attach(int fd, char *dev)
 	strcpy(ifr.ifr_name, dev);
 	ifr.ifr_flags = IFF_ATTACH_QUEUE;
 
-	return ioctl(fd, TUNSETQUEUE, (void *) &ifr);
+	return ioctl(fd, TUNSETQUEUE, (void *)&ifr);
 }
 
 static int tun_detach(int fd, char *dev)
@@ -36,7 +36,7 @@ static int tun_detach(int fd, char *dev)
 	strcpy(ifr.ifr_name, dev);
 	ifr.ifr_flags = IFF_DETACH_QUEUE;
 
-	return ioctl(fd, TUNSETQUEUE, (void *) &ifr);
+	return ioctl(fd, TUNSETQUEUE, (void *)&ifr);
 }
 
 static int tun_alloc(char *dev)
@@ -54,7 +54,7 @@ static int tun_alloc(char *dev)
 	strcpy(ifr.ifr_name, dev);
 	ifr.ifr_flags = IFF_TAP | IFF_NAPI | IFF_MULTI_QUEUE;
 
-	err = ioctl(fd, TUNSETIFF, (void *) &ifr);
+	err = ioctl(fd, TUNSETIFF, (void *)&ifr);
 	if (err < 0) {
 		fprintf(stderr, "can't TUNSETIFF: %s\n", strerror(errno));
 		close(fd);
@@ -67,9 +67,9 @@ static int tun_alloc(char *dev)
 static int tun_delete(char *dev)
 {
 	struct {
-		struct nlmsghdr  nh;
+		struct nlmsghdr nh;
 		struct ifinfomsg ifm;
-		unsigned char    data[64];
+		unsigned char data[64];
 	} req;
 	struct rtattr *rta;
 	int ret, rtnl;
@@ -127,31 +127,36 @@ FIXTURE_TEARDOWN(tun)
 		close(self->fd2);
 }
 
-TEST_F(tun, delete_detach_close) {
+TEST_F(tun, delete_detach_close)
+{
 	EXPECT_EQ(tun_delete(self->ifname), 0);
 	EXPECT_EQ(tun_detach(self->fd, self->ifname), -1);
 	EXPECT_EQ(errno, 22);
 }
 
-TEST_F(tun, detach_delete_close) {
+TEST_F(tun, detach_delete_close)
+{
 	EXPECT_EQ(tun_detach(self->fd, self->ifname), 0);
 	EXPECT_EQ(tun_delete(self->ifname), 0);
 }
 
-TEST_F(tun, detach_close_delete) {
+TEST_F(tun, detach_close_delete)
+{
 	EXPECT_EQ(tun_detach(self->fd, self->ifname), 0);
 	close(self->fd);
 	self->fd = -1;
 	EXPECT_EQ(tun_delete(self->ifname), 0);
 }
 
-TEST_F(tun, reattach_delete_close) {
+TEST_F(tun, reattach_delete_close)
+{
 	EXPECT_EQ(tun_detach(self->fd, self->ifname), 0);
 	EXPECT_EQ(tun_attach(self->fd, self->ifname), 0);
 	EXPECT_EQ(tun_delete(self->ifname), 0);
 }
 
-TEST_F(tun, reattach_close_delete) {
+TEST_F(tun, reattach_close_delete)
+{
 	EXPECT_EQ(tun_detach(self->fd, self->ifname), 0);
 	EXPECT_EQ(tun_attach(self->fd, self->ifname), 0);
 	close(self->fd);
-- 
cgit v1.2.3


From a942fcd72e9736a0c49b60160f29dc5bb01d6f89 Mon Sep 17 00:00:00 2001
From: Xu Du <xudu@redhat.com>
Date: Wed, 21 Jan 2026 18:04:56 +0800
Subject: selftest: tun: Introduce tuntap_helpers.h header for TUN/TAP testing

Introduce rtnetlink manipulation and packet construction helpers that
will simplify the later creation of more related test cases. This avoids
duplicating logic across different test cases.

This new header will contain:
 - YNL-based netlink management utilities.
 - Helpers for ip link, ip address, ip neighbor and ip route operations.
 - Packet construction and manipulation helpers.

Signed-off-by: Xu Du <xudu@redhat.com>
Link: https://patch.msgid.link/91f905715c69c75f7bf72d43388921fde6c34989.1768979440.git.xudu@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/tuntap_helpers.h | 390 +++++++++++++++++++++++++++
 1 file changed, 390 insertions(+)
 create mode 100644 tools/testing/selftests/net/tuntap_helpers.h

(limited to 'tools')

diff --git a/tools/testing/selftests/net/tuntap_helpers.h b/tools/testing/selftests/net/tuntap_helpers.h
new file mode 100644
index 000000000000..d6c0437136ec
--- /dev/null
+++ b/tools/testing/selftests/net/tuntap_helpers.h
@@ -0,0 +1,390 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#ifndef _TUNTAP_HELPERS_H
+#define _TUNTAP_HELPERS_H
+
+#include <errno.h>
+#include <linux/if_packet.h>
+#include <linux/ipv6.h>
+#include <linux/virtio_net.h>
+#include <netinet/in.h>
+#include <netinet/if_ether.h>
+#include <netinet/udp.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <ynl.h>
+
+#include "rt-route-user.h"
+#include "rt-addr-user.h"
+#include "rt-neigh-user.h"
+#include "rt-link-user.h"
+
+#define GENEVE_HLEN 8
+#define PKT_DATA 0xCB
+#define TUNTAP_DEFAULT_TTL 8
+#define TUNTAP_DEFAULT_IPID 1337
+
+unsigned int if_nametoindex(const char *ifname);
+
+static inline int ip_addr_len(int family)
+{
+	return (family == AF_INET) ? sizeof(struct in_addr) :
+				     sizeof(struct in6_addr);
+}
+
+static inline void fill_ifaddr_msg(struct ifaddrmsg *ifam, int family,
+				   int prefix, int flags, const char *dev)
+{
+	ifam->ifa_family = family;
+	ifam->ifa_prefixlen = prefix;
+	ifam->ifa_index = if_nametoindex(dev);
+	ifam->ifa_flags = flags;
+	ifam->ifa_scope = RT_SCOPE_UNIVERSE;
+}
+
+static inline int ip_addr_add(const char *dev, int family, void *addr,
+			      uint8_t prefix)
+{
+	int nl_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;
+	int ifa_flags = IFA_F_PERMANENT | IFA_F_NODAD;
+	int ret = -1, ipalen = ip_addr_len(family);
+	struct rt_addr_newaddr_req *req;
+	struct ynl_sock *ys;
+
+	ys = ynl_sock_create(&ynl_rt_addr_family, NULL);
+	if (!ys)
+		return -1;
+
+	req = rt_addr_newaddr_req_alloc();
+	if (!req)
+		goto err_req_alloc;
+
+	fill_ifaddr_msg(&req->_hdr, family, prefix, ifa_flags, dev);
+	rt_addr_newaddr_req_set_nlflags(req, nl_flags);
+	rt_addr_newaddr_req_set_local(req, addr, ipalen);
+
+	ret = rt_addr_newaddr(ys, req);
+	rt_addr_newaddr_req_free(req);
+err_req_alloc:
+	ynl_sock_destroy(ys);
+	return ret;
+}
+
+static inline void fill_neigh_req_header(struct ndmsg *ndm, int family,
+					 int state, const char *dev)
+{
+	ndm->ndm_family = family;
+	ndm->ndm_ifindex = if_nametoindex(dev);
+	ndm->ndm_state = state;
+	ndm->ndm_flags = 0;
+	ndm->ndm_type = RTN_UNICAST;
+}
+
+static inline int ip_neigh_add(const char *dev, int family, void *addr,
+			       unsigned char *lladdr)
+{
+	int nl_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;
+	int ret = -1, ipalen = ip_addr_len(family);
+	struct rt_neigh_newneigh_req *req;
+	struct ynl_sock *ys;
+
+	ys = ynl_sock_create(&ynl_rt_neigh_family, NULL);
+	if (!ys)
+		return -1;
+
+	req = rt_neigh_newneigh_req_alloc();
+	if (!req)
+		goto err_req_alloc;
+
+	fill_neigh_req_header(&req->_hdr, family, NUD_PERMANENT, dev);
+	rt_neigh_newneigh_req_set_nlflags(req, nl_flags);
+	rt_neigh_newneigh_req_set_dst(req, addr, ipalen);
+	rt_neigh_newneigh_req_set_lladdr(req, lladdr, ETH_ALEN);
+	rt_neigh_newneigh_req_set_ifindex(req, if_nametoindex(dev));
+
+	ret = rt_neigh_newneigh(ys, req);
+	rt_neigh_newneigh_req_free(req);
+err_req_alloc:
+	ynl_sock_destroy(ys);
+	return ret;
+}
+
+static inline void fill_route_req_header(struct rtmsg *rtm, int family,
+					 int table)
+{
+	rtm->rtm_family = family;
+	rtm->rtm_table = table;
+}
+
+static inline int
+ip_route_get(const char *dev, int family, int table, void *dst,
+	     void (*parse_rsp)(struct rt_route_getroute_rsp *rsp, void *out),
+	     void *out)
+{
+	int ret = -1, ipalen = ip_addr_len(family);
+	struct rt_route_getroute_req *req;
+	struct rt_route_getroute_rsp *rsp;
+	struct ynl_sock *ys;
+
+	ys = ynl_sock_create(&ynl_rt_route_family, NULL);
+	if (!ys)
+		return -1;
+
+	req = rt_route_getroute_req_alloc();
+	if (!req)
+		goto err_req_alloc;
+
+	fill_route_req_header(&req->_hdr, family, table);
+	rt_route_getroute_req_set_nlflags(req, NLM_F_REQUEST);
+	rt_route_getroute_req_set_dst(req, dst, ipalen);
+	rt_route_getroute_req_set_oif(req, if_nametoindex(dev));
+
+	rsp = rt_route_getroute(ys, req);
+	if (!rsp)
+		goto err_rsp_get;
+
+	ret = 0;
+	if (parse_rsp)
+		parse_rsp(rsp, out);
+
+	rt_route_getroute_rsp_free(rsp);
+err_rsp_get:
+	rt_route_getroute_req_free(req);
+err_req_alloc:
+	ynl_sock_destroy(ys);
+	return ret;
+}
+
+static inline int
+ip_link_add(const char *dev, char *link_type,
+	    int (*fill_link_attr)(struct rt_link_newlink_req *req, void *data),
+	    void *data)
+{
+	int nl_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;
+	struct rt_link_newlink_req *req;
+	struct ynl_sock *ys;
+	int ret = -1;
+
+	ys = ynl_sock_create(&ynl_rt_link_family, NULL);
+	if (!ys)
+		return -1;
+
+	req = rt_link_newlink_req_alloc();
+	if (!req)
+		goto err_req_alloc;
+
+	req->_hdr.ifi_flags = IFF_UP;
+	rt_link_newlink_req_set_nlflags(req, nl_flags);
+	rt_link_newlink_req_set_ifname(req, dev);
+	rt_link_newlink_req_set_linkinfo_kind(req, link_type);
+
+	if (fill_link_attr && fill_link_attr(req, data) < 0)
+		goto err_attr_fill;
+
+	ret = rt_link_newlink(ys, req);
+err_attr_fill:
+	rt_link_newlink_req_free(req);
+err_req_alloc:
+	ynl_sock_destroy(ys);
+	return ret;
+}
+
+static inline int ip_link_del(const char *dev)
+{
+	struct rt_link_dellink_req *req;
+	struct ynl_sock *ys;
+	int ret = -1;
+
+	ys = ynl_sock_create(&ynl_rt_link_family, NULL);
+	if (!ys)
+		return -1;
+
+	req = rt_link_dellink_req_alloc();
+	if (!req)
+		goto err_req_alloc;
+
+	rt_link_dellink_req_set_nlflags(req, NLM_F_REQUEST);
+	rt_link_dellink_req_set_ifname(req, dev);
+
+	ret = rt_link_dellink(ys, req);
+	rt_link_dellink_req_free(req);
+err_req_alloc:
+	ynl_sock_destroy(ys);
+	return ret;
+}
+
+static inline size_t build_eth(uint8_t *buf, uint16_t proto, unsigned char *src,
+			       unsigned char *dest)
+{
+	struct ethhdr *eth = (struct ethhdr *)buf;
+
+	eth->h_proto = htons(proto);
+	memcpy(eth->h_source, src, ETH_ALEN);
+	memcpy(eth->h_dest, dest, ETH_ALEN);
+
+	return ETH_HLEN;
+}
+
+static inline uint32_t add_csum(const uint8_t *buf, int len)
+{
+	uint16_t *sbuf = (uint16_t *)buf;
+	uint32_t sum = 0;
+
+	while (len > 1) {
+		sum += *sbuf++;
+		len -= 2;
+	}
+
+	if (len)
+		sum += *(uint8_t *)sbuf;
+
+	return sum;
+}
+
+static inline uint16_t finish_ip_csum(uint32_t sum)
+{
+	while (sum >> 16)
+		sum = (sum & 0xffff) + (sum >> 16);
+	return ~((uint16_t)sum);
+}
+
+static inline uint16_t build_ip_csum(const uint8_t *buf, int len, uint32_t sum)
+{
+	sum += add_csum(buf, len);
+	return finish_ip_csum(sum);
+}
+
+static inline int build_ipv4_header(uint8_t *buf, uint8_t proto,
+				    int payload_len, struct in_addr *src,
+				    struct in_addr *dst)
+{
+	struct iphdr *iph = (struct iphdr *)buf;
+
+	iph->ihl = 5;
+	iph->version = 4;
+	iph->ttl = TUNTAP_DEFAULT_TTL;
+	iph->tot_len = htons(sizeof(*iph) + payload_len);
+	iph->id = htons(TUNTAP_DEFAULT_IPID);
+	iph->protocol = proto;
+	iph->saddr = src->s_addr;
+	iph->daddr = dst->s_addr;
+	iph->check = build_ip_csum(buf, iph->ihl << 2, 0);
+
+	return iph->ihl << 2;
+}
+
+static inline void ipv6_set_dsfield(struct ipv6hdr *ip6h, uint8_t dsfield)
+{
+	uint16_t val, *ptr = (uint16_t *)ip6h;
+
+	val = ntohs(*ptr);
+	val &= 0xF00F;
+	val |= ((uint16_t)dsfield) << 4;
+	*ptr = htons(val);
+}
+
+static inline int build_ipv6_header(uint8_t *buf, uint8_t proto,
+				    uint8_t dsfield, int payload_len,
+				    struct in6_addr *src, struct in6_addr *dst)
+{
+	struct ipv6hdr *ip6h = (struct ipv6hdr *)buf;
+
+	ip6h->version = 6;
+	ip6h->payload_len = htons(payload_len);
+	ip6h->nexthdr = proto;
+	ip6h->hop_limit = TUNTAP_DEFAULT_TTL;
+	ipv6_set_dsfield(ip6h, dsfield);
+	memcpy(&ip6h->saddr, src, sizeof(ip6h->saddr));
+	memcpy(&ip6h->daddr, dst, sizeof(ip6h->daddr));
+
+	return sizeof(struct ipv6hdr);
+}
+
+static inline int build_geneve_header(uint8_t *buf, uint32_t vni)
+{
+	uint16_t protocol = htons(ETH_P_TEB);
+	uint32_t geneve_vni = htonl((vni << 8) & 0xffffff00);
+
+	memcpy(buf + 2, &protocol, 2);
+	memcpy(buf + 4, &geneve_vni, 4);
+	return GENEVE_HLEN;
+}
+
+static inline int build_udp_header(uint8_t *buf, uint16_t sport, uint16_t dport,
+				   int payload_len)
+{
+	struct udphdr *udph = (struct udphdr *)buf;
+
+	udph->source = htons(sport);
+	udph->dest = htons(dport);
+	udph->len = htons(sizeof(*udph) + payload_len);
+	return sizeof(*udph);
+}
+
+static inline void build_udp_packet_csum(uint8_t *buf, int family,
+					 bool csum_off)
+{
+	struct udphdr *udph = (struct udphdr *)buf;
+	size_t ipalen = ip_addr_len(family);
+	uint32_t sum;
+
+	/* No extension IPv4 and IPv6 headers addresses are the last fields */
+	sum = add_csum(buf - 2 * ipalen, 2 * ipalen);
+	sum += htons(IPPROTO_UDP) + udph->len;
+
+	if (!csum_off)
+		sum += add_csum(buf, udph->len);
+
+	udph->check = finish_ip_csum(sum);
+}
+
+static inline int build_udp_packet(uint8_t *buf, uint16_t sport, uint16_t dport,
+				   int payload_len, int family, bool csum_off)
+{
+	struct udphdr *udph = (struct udphdr *)buf;
+
+	build_udp_header(buf, sport, dport, payload_len);
+	memset(buf + sizeof(*udph), PKT_DATA, payload_len);
+	build_udp_packet_csum(buf, family, csum_off);
+
+	return sizeof(*udph) + payload_len;
+}
+
+static inline int build_virtio_net_hdr_v1_hash_tunnel(uint8_t *buf, bool is_tap,
+						      int hdr_len, int gso_size,
+						      int outer_family,
+						      int inner_family)
+{
+	struct virtio_net_hdr_v1_hash_tunnel *vh_tunnel = (void *)buf;
+	struct virtio_net_hdr_v1 *vh = &vh_tunnel->hash_hdr.hdr;
+	int outer_iphlen, inner_iphlen, eth_hlen, gso_type;
+
+	eth_hlen = is_tap ? ETH_HLEN : 0;
+	outer_iphlen = (outer_family == AF_INET) ? sizeof(struct iphdr) :
+						   sizeof(struct ipv6hdr);
+	inner_iphlen = (inner_family == AF_INET) ? sizeof(struct iphdr) :
+						   sizeof(struct ipv6hdr);
+
+	vh_tunnel->outer_th_offset = eth_hlen + outer_iphlen;
+	vh_tunnel->inner_nh_offset = vh_tunnel->outer_th_offset + ETH_HLEN +
+				     GENEVE_HLEN + sizeof(struct udphdr);
+
+	vh->csum_start = vh_tunnel->inner_nh_offset + inner_iphlen;
+	vh->csum_offset = __builtin_offsetof(struct udphdr, check);
+	vh->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
+	vh->hdr_len = hdr_len;
+	vh->gso_size = gso_size;
+
+	if (gso_size) {
+		gso_type = outer_family == AF_INET ?
+				   VIRTIO_NET_HDR_GSO_UDP_TUNNEL_IPV4 :
+				   VIRTIO_NET_HDR_GSO_UDP_TUNNEL_IPV6;
+		vh->gso_type = VIRTIO_NET_HDR_GSO_UDP_L4 | gso_type;
+	}
+
+	return sizeof(struct virtio_net_hdr_v1_hash_tunnel);
+}
+
+#endif /* _TUNTAP_HELPERS_H */
-- 
cgit v1.2.3


From 82cfdcfa201057861ec8a60ca9354d2c4c67bd68 Mon Sep 17 00:00:00 2001
From: Xu Du <xudu@redhat.com>
Date: Wed, 21 Jan 2026 18:04:57 +0800
Subject: selftest: tun: Refactor tun_delete to use tuntap_helpers

The previous patch introduced common tuntap helpers to simplify
tun test code. This patch refactors the tun_delete function to
use these new helpers.

Signed-off-by: Xu Du <xudu@redhat.com>
Link: https://patch.msgid.link/ecc7c0c2d75d87cb814e97579e731650339703ab.1768979440.git.xudu@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/Makefile | 16 ++++++++++++---
 tools/testing/selftests/net/tun.c    | 39 ++----------------------------------
 2 files changed, 15 insertions(+), 40 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile
index 33f56fcbde09..afdea6d95bde 100644
--- a/tools/testing/selftests/net/Makefile
+++ b/tools/testing/selftests/net/Makefile
@@ -183,7 +183,6 @@ TEST_GEN_PROGS := \
 	tap \
 	tcp_port_share \
 	tls \
-	tun \
 # end of TEST_GEN_PROGS
 
 TEST_FILES := \
@@ -195,7 +194,11 @@ TEST_FILES := \
 
 # YNL files, must be before "include ..lib.mk"
 YNL_GEN_FILES := busy_poller
-YNL_GEN_PROGS := netlink-dumps
+YNL_GEN_PROGS := \
+	netlink-dumps \
+	tun \
+# end of YNL_GEN_PROGS
+
 TEST_GEN_FILES += $(YNL_GEN_FILES)
 TEST_GEN_PROGS += $(YNL_GEN_PROGS)
 
@@ -206,7 +209,14 @@ TEST_INCLUDES := forwarding/lib.sh
 include ../lib.mk
 
 # YNL build
-YNL_GENS := netdev
+YNL_GENS := \
+	netdev \
+	rt-addr \
+	rt-link \
+	rt-neigh \
+	rt-route \
+# end of YNL_GENS
+
 include ynl.mk
 
 $(OUTPUT)/epoll_busy_poll: LDLIBS += -lcap
diff --git a/tools/testing/selftests/net/tun.c b/tools/testing/selftests/net/tun.c
index 128b0a5327d4..d9030bdd2e06 100644
--- a/tools/testing/selftests/net/tun.c
+++ b/tools/testing/selftests/net/tun.c
@@ -8,14 +8,12 @@
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
-#include <linux/if.h>
 #include <linux/if_tun.h>
-#include <linux/netlink.h>
-#include <linux/rtnetlink.h>
 #include <sys/ioctl.h>
 #include <sys/socket.h>
 
 #include "kselftest_harness.h"
+#include "tuntap_helpers.h"
 
 static int tun_attach(int fd, char *dev)
 {
@@ -66,40 +64,7 @@ static int tun_alloc(char *dev)
 
 static int tun_delete(char *dev)
 {
-	struct {
-		struct nlmsghdr nh;
-		struct ifinfomsg ifm;
-		unsigned char data[64];
-	} req;
-	struct rtattr *rta;
-	int ret, rtnl;
-
-	rtnl = socket(AF_NETLINK, SOCK_DGRAM, NETLINK_ROUTE);
-	if (rtnl < 0) {
-		fprintf(stderr, "can't open rtnl: %s\n", strerror(errno));
-		return 1;
-	}
-
-	memset(&req, 0, sizeof(req));
-	req.nh.nlmsg_len = NLMSG_ALIGN(NLMSG_LENGTH(sizeof(req.ifm)));
-	req.nh.nlmsg_flags = NLM_F_REQUEST;
-	req.nh.nlmsg_type = RTM_DELLINK;
-
-	req.ifm.ifi_family = AF_UNSPEC;
-
-	rta = (struct rtattr *)(((char *)&req) + NLMSG_ALIGN(req.nh.nlmsg_len));
-	rta->rta_type = IFLA_IFNAME;
-	rta->rta_len = RTA_LENGTH(IFNAMSIZ);
-	req.nh.nlmsg_len += rta->rta_len;
-	memcpy(RTA_DATA(rta), dev, IFNAMSIZ);
-
-	ret = send(rtnl, &req, req.nh.nlmsg_len, 0);
-	if (ret < 0)
-		fprintf(stderr, "can't send: %s\n", strerror(errno));
-	ret = (unsigned int)ret != req.nh.nlmsg_len;
-
-	close(rtnl);
-	return ret;
+	return ip_link_del(dev);
 }
 
 FIXTURE(tun)
-- 
cgit v1.2.3


From 24e59f26eef2c806a8dbb94859aaf7af28197148 Mon Sep 17 00:00:00 2001
From: Xu Du <xudu@redhat.com>
Date: Wed, 21 Jan 2026 18:04:58 +0800
Subject: selftest: tun: Add helpers for GSO over UDP tunnel

In preparation for testing GSO over UDP tunnels, enhance the test
infrastructure to support a more complex data path involving a TUN
device and a GENEVE udp tunnel.

This patch introduces a dedicated setup/teardown topology that creates
both a GENEVE tunnel interface and a TUN interface. The TUN device acts
as the VTEP (Virtual Tunnel Endpoint), allowing it to send and receive
virtio-net packets. This setup effectively tests the kernel's data path
for encapsulated traffic.

Note that after adding a new address to the UDP tunnel, we need to wait
a bit until the associated route is available.

Additionally, a new data structure is defined to manage test parameters.
This structure is designed to be extensible, allowing different test
data and configurations to be easily added in subsequent patches.

Signed-off-by: Xu Du <xudu@redhat.com>
Link: https://patch.msgid.link/b5787b8c269f43ce11e1756f1691cc7fd9a1e901.1768979440.git.xudu@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/tun.c | 425 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 425 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/tun.c b/tools/testing/selftests/net/tun.c
index d9030bdd2e06..ec089355312b 100644
--- a/tools/testing/selftests/net/tun.c
+++ b/tools/testing/selftests/net/tun.c
@@ -15,6 +15,80 @@
 #include "kselftest_harness.h"
 #include "tuntap_helpers.h"
 
+static const char param_dev_geneve_name[] = "geneve1";
+static unsigned char param_hwaddr_outer_dst[] = { 0x00, 0xfe, 0x98,
+						  0x14, 0x22, 0x42 };
+static unsigned char param_hwaddr_outer_src[] = { 0x00, 0xfe, 0x98,
+						  0x94, 0xd2, 0x43 };
+static unsigned char param_hwaddr_inner_dst[] = { 0x00, 0xfe, 0x98,
+						  0x94, 0x22, 0xcc };
+static unsigned char param_hwaddr_inner_src[] = { 0x00, 0xfe, 0x98,
+						  0x94, 0xd2, 0xdd };
+
+static struct in_addr param_ipaddr4_outer_dst = {
+	__constant_htonl(0xac100001),
+};
+
+static struct in_addr param_ipaddr4_outer_src = {
+	__constant_htonl(0xac100002),
+};
+
+static struct in_addr param_ipaddr4_inner_dst = {
+	__constant_htonl(0xac100101),
+};
+
+static struct in_addr param_ipaddr4_inner_src = {
+	__constant_htonl(0xac100102),
+};
+
+static struct in6_addr param_ipaddr6_outer_dst = {
+	{ { 0x20, 0x02, 0x0d, 0xb8, 0x01, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 } },
+};
+
+static struct in6_addr param_ipaddr6_outer_src = {
+	{ { 0x20, 0x02, 0x0d, 0xb8, 0x01, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2 } },
+};
+
+static struct in6_addr param_ipaddr6_inner_dst = {
+	{ { 0x20, 0x02, 0x0d, 0xb8, 0x02, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 } },
+};
+
+static struct in6_addr param_ipaddr6_inner_src = {
+	{ { 0x20, 0x02, 0x0d, 0xb8, 0x02, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2 } },
+};
+
+#define VN_ID 1
+#define VN_PORT 4789
+#define UDP_SRC_PORT 22
+#define UDP_DST_PORT 48878
+#define IPPREFIX_LEN 24
+#define IP6PREFIX_LEN 64
+#define TIMEOUT_SEC 10
+#define TIMEOUT_USEC 100000
+#define MAX_RETRIES 20
+
+#define UDP_TUNNEL_GENEVE_4IN4 0x01
+#define UDP_TUNNEL_GENEVE_6IN4 0x02
+#define UDP_TUNNEL_GENEVE_4IN6 0x04
+#define UDP_TUNNEL_GENEVE_6IN6 0x08
+
+#define UDP_TUNNEL_OUTER_IPV4 (UDP_TUNNEL_GENEVE_4IN4 | UDP_TUNNEL_GENEVE_6IN4)
+#define UDP_TUNNEL_INNER_IPV4 (UDP_TUNNEL_GENEVE_4IN4 | UDP_TUNNEL_GENEVE_4IN6)
+
+#define TUN_VNET_TNL_SIZE sizeof(struct virtio_net_hdr_v1_hash_tunnel)
+
+struct geneve_setup_config {
+	int family;
+	union {
+		struct in_addr r4;
+		struct in6_addr r6;
+	} remote;
+	__be32 vnid;
+	__be16 vnport;
+	unsigned char hwaddr[6];
+	uint8_t csum;
+};
+
 static int tun_attach(int fd, char *dev)
 {
 	struct ifreq ifr;
@@ -67,6 +141,202 @@ static int tun_delete(char *dev)
 	return ip_link_del(dev);
 }
 
+static int tun_open(char *dev, const int flags, const int hdrlen,
+		    const int features, const unsigned char *mac_addr)
+{
+	struct ifreq ifr = { 0 };
+	int fd, sk = -1;
+
+	fd = open("/dev/net/tun", O_RDWR);
+	if (fd < 0) {
+		perror("open");
+		return -1;
+	}
+
+	ifr.ifr_flags = flags;
+	if (ioctl(fd, TUNSETIFF, (void *)&ifr) < 0) {
+		perror("ioctl(TUNSETIFF)");
+		goto err;
+	}
+	strcpy(dev, ifr.ifr_name);
+
+	if (hdrlen > 0) {
+		if (ioctl(fd, TUNSETVNETHDRSZ, &hdrlen) < 0) {
+			perror("ioctl(TUNSETVNETHDRSZ)");
+			goto err;
+		}
+	}
+
+	if (features) {
+		if (ioctl(fd, TUNSETOFFLOAD, features) < 0) {
+			perror("ioctl(TUNSETOFFLOAD)");
+			goto err;
+		}
+	}
+
+	sk = socket(PF_INET, SOCK_DGRAM, 0);
+	if (sk < 0) {
+		perror("socket");
+		goto err;
+	}
+
+	if (ioctl(sk, SIOCGIFFLAGS, &ifr) < 0) {
+		perror("ioctl(SIOCGIFFLAGS)");
+		goto err;
+	}
+
+	ifr.ifr_flags |= (IFF_UP | IFF_RUNNING);
+	if (ioctl(sk, SIOCSIFFLAGS, &ifr) < 0) {
+		perror("ioctl(SIOCSIFFLAGS)");
+		goto err;
+	}
+
+	if (mac_addr && flags & IFF_TAP) {
+		ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
+		memcpy(ifr.ifr_hwaddr.sa_data, mac_addr, ETH_ALEN);
+
+		if (ioctl(sk, SIOCSIFHWADDR, &ifr) < 0) {
+			perror("ioctl(SIOCSIFHWADDR)");
+			goto err;
+		}
+	}
+
+out:
+	if (sk >= 0)
+		close(sk);
+	return fd;
+
+err:
+	close(fd);
+	fd = -1;
+	goto out;
+}
+
+static size_t sockaddr_len(int family)
+{
+	return (family == AF_INET) ? sizeof(struct sockaddr_in) :
+				     sizeof(struct sockaddr_in6);
+}
+
+static int geneve_fill_newlink(struct rt_link_newlink_req *req, void *data)
+{
+	struct geneve_setup_config *cfg = data;
+
+#define SET_GENEVE_REMOTE rt_link_newlink_req_set_linkinfo_data_geneve_remote
+#define SET_GENEVE_REMOTE6 rt_link_newlink_req_set_linkinfo_data_geneve_remote6
+
+	rt_link_newlink_req_set_address(req, cfg->hwaddr, ETH_ALEN);
+	rt_link_newlink_req_set_linkinfo_data_geneve_id(req, cfg->vnid);
+	rt_link_newlink_req_set_linkinfo_data_geneve_port(req, cfg->vnport);
+	rt_link_newlink_req_set_linkinfo_data_geneve_udp_csum(req, cfg->csum);
+
+	if (cfg->family == AF_INET)
+		SET_GENEVE_REMOTE(req, cfg->remote.r4.s_addr);
+	else
+		SET_GENEVE_REMOTE6(req, &cfg->remote.r6,
+				   sizeof(cfg->remote.r6));
+
+	return 0;
+}
+
+static int geneve_create(const char *dev, int family, void *remote,
+			 void *hwaddr)
+{
+	struct geneve_setup_config geneve;
+
+	memset(&geneve, 0, sizeof(geneve));
+	geneve.vnid = VN_ID;
+	geneve.vnport = htons(VN_PORT);
+	geneve.csum = 1;
+	geneve.family = family;
+	if (family == AF_INET)
+		memcpy(&geneve.remote.r4, remote, sizeof(struct in_addr));
+	else
+		memcpy(&geneve.remote.r6, remote, sizeof(struct in6_addr));
+	memcpy(geneve.hwaddr, hwaddr, ETH_ALEN);
+
+	return ip_link_add(dev, "geneve", geneve_fill_newlink, (void *)&geneve);
+}
+
+static int set_pmtu_discover(int fd, bool is_ipv4)
+{
+	int level, name, val;
+
+	if (is_ipv4) {
+		level = SOL_IP;
+		name = IP_MTU_DISCOVER;
+		val = IP_PMTUDISC_DO;
+	} else {
+		level = SOL_IPV6;
+		name = IPV6_MTU_DISCOVER;
+		val = IPV6_PMTUDISC_DO;
+	}
+
+	return setsockopt(fd, level, name, &val, sizeof(val));
+}
+
+static int udp_socket_open(struct sockaddr_storage *ssa, bool do_frag,
+			   bool do_connect, struct sockaddr_storage *dsa)
+{
+	struct timeval to = { .tv_sec = TIMEOUT_SEC };
+	int fd, family = ssa->ss_family;
+	int salen = sockaddr_len(family);
+
+	fd = socket(family, SOCK_DGRAM, 0);
+	if (fd < 0)
+		return -1;
+
+	if (bind(fd, (struct sockaddr *)ssa, salen) < 0) {
+		perror("bind");
+		goto err;
+	}
+
+	if (do_connect && connect(fd, (struct sockaddr *)dsa, salen) < 0) {
+		perror("connect");
+		goto err;
+	}
+
+	if (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &to, sizeof(to)) < 0) {
+		perror("setsockopt(SO_RCVTIMEO)");
+		goto err;
+	}
+
+	if (!do_frag && set_pmtu_discover(fd, family == AF_INET) < 0) {
+		perror("set_pmtu_discover");
+		goto err;
+	}
+	return fd;
+
+err:
+	close(fd);
+	return -1;
+}
+
+static void parse_route_rsp(struct rt_route_getroute_rsp *rsp, void *rtm_type)
+{
+	*(uint8_t *)rtm_type = rsp->_hdr.rtm_type;
+}
+
+static int ip_route_check(const char *intf, int family, void *addr)
+{
+	uint8_t rtm_type, table = RT_TABLE_LOCAL;
+	int retries = MAX_RETRIES;
+
+	while (retries-- > 0) {
+		if (ip_route_get(intf, family, table, addr, parse_route_rsp,
+				 &rtm_type) == 0 &&
+		    rtm_type == RTN_LOCAL)
+			break;
+
+		usleep(TIMEOUT_USEC);
+	}
+
+	if (retries < 0)
+		return -1;
+
+	return 0;
+}
+
 FIXTURE(tun)
 {
 	char ifname[IFNAMSIZ];
@@ -129,4 +399,159 @@ TEST_F(tun, reattach_close_delete)
 	EXPECT_EQ(tun_delete(self->ifname), 0);
 }
 
+FIXTURE(tun_vnet_udptnl)
+{
+	char ifname[IFNAMSIZ];
+	int fd, sock;
+};
+
+FIXTURE_VARIANT(tun_vnet_udptnl)
+{
+	int tunnel_type;
+	bool is_tap;
+};
+
+/* clang-format off */
+#define TUN_VNET_UDPTNL_VARIANT_ADD(type, desc)                              \
+	FIXTURE_VARIANT_ADD(tun_vnet_udptnl, desc##udptnl) {                 \
+		.tunnel_type = type,                                         \
+		.is_tap = true,                                              \
+	}
+/* clang-format on */
+
+TUN_VNET_UDPTNL_VARIANT_ADD(UDP_TUNNEL_GENEVE_4IN4, 4in4);
+TUN_VNET_UDPTNL_VARIANT_ADD(UDP_TUNNEL_GENEVE_6IN4, 6in4);
+TUN_VNET_UDPTNL_VARIANT_ADD(UDP_TUNNEL_GENEVE_4IN6, 4in6);
+TUN_VNET_UDPTNL_VARIANT_ADD(UDP_TUNNEL_GENEVE_6IN6, 6in6);
+
+static void assign_ifaddr_vars(int family, int is_outer, void **srcip,
+			       void **dstip, void **srcmac, void **dstmac)
+{
+	if (is_outer) {
+		if (family == AF_INET) {
+			*srcip = (void *)&param_ipaddr4_outer_src;
+			*dstip = (void *)&param_ipaddr4_outer_dst;
+		} else {
+			*srcip = (void *)&param_ipaddr6_outer_src;
+			*dstip = (void *)&param_ipaddr6_outer_dst;
+		}
+		*srcmac = param_hwaddr_outer_src;
+		*dstmac = param_hwaddr_outer_dst;
+	} else {
+		if (family == AF_INET) {
+			*srcip = (void *)&param_ipaddr4_inner_src;
+			*dstip = (void *)&param_ipaddr4_inner_dst;
+		} else {
+			*srcip = (void *)&param_ipaddr6_inner_src;
+			*dstip = (void *)&param_ipaddr6_inner_dst;
+		}
+		*srcmac = param_hwaddr_inner_src;
+		*dstmac = param_hwaddr_inner_dst;
+	}
+}
+
+static void assign_sockaddr_vars(int family, int is_outer,
+				 struct sockaddr_storage *src,
+				 struct sockaddr_storage *dst)
+{
+	src->ss_family = family;
+	dst->ss_family = family;
+
+	if (family == AF_INET) {
+		struct sockaddr_in *s4 = (struct sockaddr_in *)src;
+		struct sockaddr_in *d4 = (struct sockaddr_in *)dst;
+
+		s4->sin_addr = is_outer ? param_ipaddr4_outer_src :
+					  param_ipaddr4_inner_src;
+		d4->sin_addr = is_outer ? param_ipaddr4_outer_dst :
+					  param_ipaddr4_inner_dst;
+		if (!is_outer) {
+			s4->sin_port = htons(UDP_SRC_PORT);
+			d4->sin_port = htons(UDP_DST_PORT);
+		}
+	} else {
+		struct sockaddr_in6 *s6 = (struct sockaddr_in6 *)src;
+		struct sockaddr_in6 *d6 = (struct sockaddr_in6 *)dst;
+
+		s6->sin6_addr = is_outer ? param_ipaddr6_outer_src :
+					   param_ipaddr6_inner_src;
+		d6->sin6_addr = is_outer ? param_ipaddr6_outer_dst :
+					   param_ipaddr6_inner_dst;
+		if (!is_outer) {
+			s6->sin6_port = htons(UDP_SRC_PORT);
+			d6->sin6_port = htons(UDP_DST_PORT);
+		}
+	}
+}
+
+FIXTURE_SETUP(tun_vnet_udptnl)
+{
+	int ret, family, prefix, flags, features;
+	int tunnel_type = variant->tunnel_type;
+	struct sockaddr_storage ssa, dsa;
+	void *sip, *dip, *smac, *dmac;
+
+	flags = (variant->is_tap ? IFF_TAP : IFF_TUN) | IFF_VNET_HDR |
+		IFF_MULTI_QUEUE | IFF_NO_PI;
+	features = TUN_F_CSUM | TUN_F_UDP_TUNNEL_GSO |
+		   TUN_F_UDP_TUNNEL_GSO_CSUM | TUN_F_USO4 | TUN_F_USO6;
+	self->fd = tun_open(self->ifname, flags, TUN_VNET_TNL_SIZE, features,
+			    param_hwaddr_outer_src);
+	ASSERT_GE(self->fd, 0);
+
+	family = (tunnel_type & UDP_TUNNEL_OUTER_IPV4) ? AF_INET : AF_INET6;
+	prefix = (family == AF_INET) ? IPPREFIX_LEN : IP6PREFIX_LEN;
+	assign_ifaddr_vars(family, 1, &sip, &dip, &smac, &dmac);
+
+	ret = ip_addr_add(self->ifname, family, sip, prefix);
+	ASSERT_EQ(ret, 0);
+	ret = ip_neigh_add(self->ifname, family, dip, dmac);
+	ASSERT_EQ(ret, 0);
+	ret = ip_route_check(self->ifname, family, sip);
+	ASSERT_EQ(ret, 0);
+
+	ret = geneve_create(param_dev_geneve_name, family, dip,
+			    param_hwaddr_inner_src);
+	ASSERT_EQ(ret, 0);
+
+	family = (tunnel_type & UDP_TUNNEL_INNER_IPV4) ? AF_INET : AF_INET6;
+	prefix = (family == AF_INET) ? IPPREFIX_LEN : IP6PREFIX_LEN;
+	assign_ifaddr_vars(family, 0, &sip, &dip, &smac, &dmac);
+
+	ret = ip_addr_add(param_dev_geneve_name, family, sip, prefix);
+	ASSERT_EQ(ret, 0);
+	ret = ip_neigh_add(param_dev_geneve_name, family, dip, dmac);
+	ASSERT_EQ(ret, 0);
+	ret = ip_route_check(param_dev_geneve_name, family, sip);
+	ASSERT_EQ(ret, 0);
+
+	assign_sockaddr_vars(family, 0, &ssa, &dsa);
+	self->sock = udp_socket_open(&ssa, false, true, &dsa);
+	ASSERT_GE(self->sock, 0);
+}
+
+FIXTURE_TEARDOWN(tun_vnet_udptnl)
+{
+	int ret;
+
+	if (self->sock != -1)
+		close(self->sock);
+
+	ret = ip_link_del(param_dev_geneve_name);
+	EXPECT_EQ(ret, 0);
+
+	ret = tun_delete(self->ifname);
+	EXPECT_EQ(ret, 0);
+}
+
+TEST_F(tun_vnet_udptnl, basic)
+{
+	int ret;
+	char cmd[256] = { 0 };
+
+	sprintf(cmd, "ip addr show %s > /dev/null 2>&1", param_dev_geneve_name);
+	ret = system(cmd);
+	ASSERT_EQ(ret, 0);
+}
+
 TEST_HARNESS_MAIN
-- 
cgit v1.2.3


From 400e658aa096cda99b37ce806ed63cfe894c9566 Mon Sep 17 00:00:00 2001
From: Xu Du <xudu@redhat.com>
Date: Wed, 21 Jan 2026 18:04:59 +0800
Subject: selftest: tun: Add test for sending gso packet into tun

The test constructs a raw packet, prepends a virtio_net_hdr,
and writes the result to the TUN device. This mimics the behavior
of a vm forwarding a guest's packet to the host networking stack.

Signed-off-by: Xu Du <xudu@redhat.com>
Link: https://patch.msgid.link/a988dbc9ca109e4f1f0b33858c5035bce8ebede3.1768979440.git.xudu@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/tun.c | 144 +++++++++++++++++++++++++++++++++++---
 1 file changed, 135 insertions(+), 9 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/tun.c b/tools/testing/selftests/net/tun.c
index ec089355312b..f0ddb5d37683 100644
--- a/tools/testing/selftests/net/tun.c
+++ b/tools/testing/selftests/net/tun.c
@@ -75,7 +75,34 @@ static struct in6_addr param_ipaddr6_inner_src = {
 #define UDP_TUNNEL_OUTER_IPV4 (UDP_TUNNEL_GENEVE_4IN4 | UDP_TUNNEL_GENEVE_6IN4)
 #define UDP_TUNNEL_INNER_IPV4 (UDP_TUNNEL_GENEVE_4IN4 | UDP_TUNNEL_GENEVE_4IN6)
 
+#define UDP_TUNNEL_GENEVE_4IN4_HDRLEN                        \
+	(ETH_HLEN + 2 * sizeof(struct iphdr) + GENEVE_HLEN + \
+	 2 * sizeof(struct udphdr))
+#define UDP_TUNNEL_GENEVE_6IN6_HDRLEN                          \
+	(ETH_HLEN + 2 * sizeof(struct ipv6hdr) + GENEVE_HLEN + \
+	 2 * sizeof(struct udphdr))
+#define UDP_TUNNEL_GENEVE_4IN6_HDRLEN                               \
+	(ETH_HLEN + sizeof(struct iphdr) + sizeof(struct ipv6hdr) + \
+	 GENEVE_HLEN + 2 * sizeof(struct udphdr))
+#define UDP_TUNNEL_GENEVE_6IN4_HDRLEN                               \
+	(ETH_HLEN + sizeof(struct ipv6hdr) + sizeof(struct iphdr) + \
+	 GENEVE_HLEN + 2 * sizeof(struct udphdr))
+
+#define UDP_TUNNEL_HDRLEN(type)                                             \
+	((type) == UDP_TUNNEL_GENEVE_4IN4 ? UDP_TUNNEL_GENEVE_4IN4_HDRLEN : \
+	 (type) == UDP_TUNNEL_GENEVE_6IN6 ? UDP_TUNNEL_GENEVE_6IN6_HDRLEN : \
+	 (type) == UDP_TUNNEL_GENEVE_4IN6 ? UDP_TUNNEL_GENEVE_4IN6_HDRLEN : \
+	 (type) == UDP_TUNNEL_GENEVE_6IN4 ? UDP_TUNNEL_GENEVE_6IN4_HDRLEN : \
+					    0)
+
+#define UDP_TUNNEL_MSS(type) (ETH_DATA_LEN - UDP_TUNNEL_HDRLEN(type))
+#define UDP_TUNNEL_MAX(type, is_tap) \
+	(ETH_MAX_MTU - UDP_TUNNEL_HDRLEN(type) - ((is_tap) ? ETH_HLEN : 0))
+
 #define TUN_VNET_TNL_SIZE sizeof(struct virtio_net_hdr_v1_hash_tunnel)
+#define MAX_VNET_TUNNEL_PACKET_SZ                                       \
+	(TUN_VNET_TNL_SIZE + ETH_HLEN + UDP_TUNNEL_GENEVE_6IN6_HDRLEN + \
+	 ETH_MAX_MTU)
 
 struct geneve_setup_config {
 	int family;
@@ -408,15 +435,23 @@ FIXTURE(tun_vnet_udptnl)
 FIXTURE_VARIANT(tun_vnet_udptnl)
 {
 	int tunnel_type;
-	bool is_tap;
+	int gso_size;
+	int data_size;
+	int r_num_mss;
+	bool is_tap, no_gso;
 };
 
 /* clang-format off */
 #define TUN_VNET_UDPTNL_VARIANT_ADD(type, desc)                              \
-	FIXTURE_VARIANT_ADD(tun_vnet_udptnl, desc##udptnl) {                 \
+	FIXTURE_VARIANT_ADD(tun_vnet_udptnl, desc##_1mss) {                  \
+		/* send a single MSS: fall back to no GSO */                 \
 		.tunnel_type = type,                                         \
+		.gso_size = UDP_TUNNEL_MSS(type),                            \
+		.data_size = UDP_TUNNEL_MSS(type),                           \
+		.r_num_mss = 1,                                              \
 		.is_tap = true,                                              \
-	}
+		.no_gso = true,                                              \
+	};
 /* clang-format on */
 
 TUN_VNET_UDPTNL_VARIANT_ADD(UDP_TUNNEL_GENEVE_4IN4, 4in4);
@@ -544,14 +579,105 @@ FIXTURE_TEARDOWN(tun_vnet_udptnl)
 	EXPECT_EQ(ret, 0);
 }
 
-TEST_F(tun_vnet_udptnl, basic)
+static int build_gso_packet_into_tun(const FIXTURE_VARIANT(tun_vnet_udptnl) *
+					     variant,
+				     uint8_t *buf)
 {
-	int ret;
-	char cmd[256] = { 0 };
+	int pktlen, hlen, proto, inner_family, outer_family;
+	int tunnel_type = variant->tunnel_type;
+	int payload_len = variant->data_size;
+	int gso_size = variant->gso_size;
+	uint8_t *outer_udph, *cur = buf;
+	void *sip, *dip, *smac, *dmac;
+	bool is_tap = variant->is_tap;
 
-	sprintf(cmd, "ip addr show %s > /dev/null 2>&1", param_dev_geneve_name);
-	ret = system(cmd);
-	ASSERT_EQ(ret, 0);
+	hlen = (is_tap ? ETH_HLEN : 0) + UDP_TUNNEL_HDRLEN(tunnel_type);
+	inner_family = (tunnel_type & UDP_TUNNEL_INNER_IPV4) ? AF_INET :
+							       AF_INET6;
+	outer_family = (tunnel_type & UDP_TUNNEL_OUTER_IPV4) ? AF_INET :
+							       AF_INET6;
+
+	cur += build_virtio_net_hdr_v1_hash_tunnel(cur, is_tap, hlen, gso_size,
+						   outer_family, inner_family);
+
+	pktlen = hlen + payload_len;
+	assign_ifaddr_vars(outer_family, 1, &sip, &dip, &smac, &dmac);
+
+	if (is_tap) {
+		proto = outer_family == AF_INET ? ETH_P_IP : ETH_P_IPV6;
+		pktlen -= ETH_HLEN;
+		cur += build_eth(cur, proto, dmac, smac);
+	}
+
+	if (outer_family == AF_INET) {
+		pktlen = pktlen - sizeof(struct iphdr);
+		cur += build_ipv4_header(cur, IPPROTO_UDP, pktlen, dip, sip);
+	} else {
+		pktlen = pktlen - sizeof(struct ipv6hdr);
+		cur += build_ipv6_header(cur, IPPROTO_UDP, 0, pktlen, dip, sip);
+	}
+
+	outer_udph = cur;
+	assign_ifaddr_vars(inner_family, 0, &sip, &dip, &smac, &dmac);
+
+	pktlen -= sizeof(struct udphdr);
+	proto = inner_family == AF_INET ? ETH_P_IP : ETH_P_IPV6;
+	cur += build_udp_header(cur, UDP_SRC_PORT, VN_PORT, pktlen);
+	cur += build_geneve_header(cur, VN_ID);
+	cur += build_eth(cur, proto, dmac, smac);
+
+	pktlen = sizeof(struct udphdr) + payload_len;
+	if (inner_family == AF_INET)
+		cur += build_ipv4_header(cur, IPPROTO_UDP, pktlen, dip, sip);
+	else
+		cur += build_ipv6_header(cur, IPPROTO_UDP, 0, pktlen, dip, sip);
+
+	cur += build_udp_packet(cur, UDP_DST_PORT, UDP_SRC_PORT, payload_len,
+				inner_family, false);
+
+	build_udp_packet_csum(outer_udph, outer_family, false);
+
+	return cur - buf;
+}
+
+static int
+receive_gso_packet_from_tunnel(FIXTURE_DATA(tun_vnet_udptnl) * self,
+			       const FIXTURE_VARIANT(tun_vnet_udptnl) * variant,
+			       int *r_num_mss)
+{
+	uint8_t packet_buf[MAX_VNET_TUNNEL_PACKET_SZ];
+	int len, total_len = 0, socket = self->sock;
+	int payload_len = variant->data_size;
+
+	while (total_len < payload_len) {
+		len = recv(socket, packet_buf, sizeof(packet_buf), 0);
+		if (len <= 0) {
+			if (len < 0 && errno != EAGAIN && errno != EWOULDBLOCK)
+				perror("recv");
+			break;
+		}
+
+		(*r_num_mss)++;
+		total_len += len;
+	}
+
+	return total_len;
+}
+
+TEST_F(tun_vnet_udptnl, send_gso_packet)
+{
+	uint8_t pkt[MAX_VNET_TUNNEL_PACKET_SZ];
+	int r_num_mss = 0;
+	int ret, off;
+
+	memset(pkt, 0, sizeof(pkt));
+	off = build_gso_packet_into_tun(variant, pkt);
+	ret = write(self->fd, pkt, off);
+	ASSERT_EQ(ret, off);
+
+	ret = receive_gso_packet_from_tunnel(self, variant, &r_num_mss);
+	ASSERT_EQ(ret, variant->data_size);
+	ASSERT_EQ(r_num_mss, variant->r_num_mss);
 }
 
 TEST_HARNESS_MAIN
-- 
cgit v1.2.3


From 6bdd7ae6059ec3c19f4717387706f92cd8b715a6 Mon Sep 17 00:00:00 2001
From: Xu Du <xudu@redhat.com>
Date: Wed, 21 Jan 2026 18:05:00 +0800
Subject: selftest: tun: Add test for receiving gso packet from tun

The test validate that GSO information are correctly exposed
when reading packets from a TUN device.

Signed-off-by: Xu Du <xudu@redhat.com>
Link: https://patch.msgid.link/fe75ac66466380490eba858eef50596a1bfbd071.1768979440.git.xudu@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/tun.c | 194 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 194 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/tun.c b/tools/testing/selftests/net/tun.c
index f0ddb5d37683..628ae2caf6f4 100644
--- a/tools/testing/selftests/net/tun.c
+++ b/tools/testing/selftests/net/tun.c
@@ -364,6 +364,116 @@ static int ip_route_check(const char *intf, int family, void *addr)
 	return 0;
 }
 
+static int send_gso_udp_msg(int socket, struct sockaddr_storage *addr,
+			    uint8_t *send_buf, int send_len, int gso_size)
+{
+	char control[CMSG_SPACE(sizeof(uint16_t))] = { 0 };
+	int alen = sockaddr_len(addr->ss_family);
+	struct msghdr msg = { 0 };
+	struct iovec iov = { 0 };
+	int ret;
+
+	iov.iov_base = send_buf;
+	iov.iov_len = send_len;
+
+	msg.msg_iov = &iov;
+	msg.msg_iovlen = 1;
+	msg.msg_name = addr;
+	msg.msg_namelen = alen;
+
+	if (gso_size > 0) {
+		struct cmsghdr *cmsg;
+
+		msg.msg_control = control;
+		msg.msg_controllen = sizeof(control);
+
+		cmsg = CMSG_FIRSTHDR(&msg);
+		cmsg->cmsg_level = SOL_UDP;
+		cmsg->cmsg_type = UDP_SEGMENT;
+		cmsg->cmsg_len = CMSG_LEN(sizeof(uint16_t));
+		*(uint16_t *)CMSG_DATA(cmsg) = gso_size;
+	}
+
+	ret = sendmsg(socket, &msg, 0);
+	if (ret < 0)
+		perror("sendmsg");
+
+	return ret;
+}
+
+static int validate_hdrlen(uint8_t **cur, int *len, int x)
+{
+	if (*len < x)
+		return -1;
+	*cur += x;
+	*len -= x;
+	return 0;
+}
+
+static int parse_udp_tunnel_vnet_packet(uint8_t *buf, int len, int tunnel_type,
+					bool is_tap)
+{
+	struct ipv6hdr *iph6;
+	struct udphdr *udph;
+	struct iphdr *iph4;
+	uint8_t *cur = buf;
+
+	if (validate_hdrlen(&cur, &len, TUN_VNET_TNL_SIZE))
+		return -1;
+
+	if (is_tap) {
+		if (validate_hdrlen(&cur, &len, ETH_HLEN))
+			return -1;
+	}
+
+	if (tunnel_type & UDP_TUNNEL_OUTER_IPV4) {
+		iph4 = (struct iphdr *)cur;
+		if (validate_hdrlen(&cur, &len, sizeof(struct iphdr)))
+			return -1;
+		if (iph4->version != 4 || iph4->protocol != IPPROTO_UDP)
+			return -1;
+	} else {
+		iph6 = (struct ipv6hdr *)cur;
+		if (validate_hdrlen(&cur, &len, sizeof(struct ipv6hdr)))
+			return -1;
+		if (iph6->version != 6 || iph6->nexthdr != IPPROTO_UDP)
+			return -1;
+	}
+
+	udph = (struct udphdr *)cur;
+	if (validate_hdrlen(&cur, &len, sizeof(struct udphdr)))
+		return -1;
+	if (ntohs(udph->dest) != VN_PORT)
+		return -1;
+
+	if (validate_hdrlen(&cur, &len, GENEVE_HLEN))
+		return -1;
+	if (validate_hdrlen(&cur, &len, ETH_HLEN))
+		return -1;
+
+	if (tunnel_type & UDP_TUNNEL_INNER_IPV4) {
+		iph4 = (struct iphdr *)cur;
+		if (validate_hdrlen(&cur, &len, sizeof(struct iphdr)))
+			return -1;
+		if (iph4->version != 4 || iph4->protocol != IPPROTO_UDP)
+			return -1;
+	} else {
+		iph6 = (struct ipv6hdr *)cur;
+		if (validate_hdrlen(&cur, &len, sizeof(struct ipv6hdr)))
+			return -1;
+		if (iph6->version != 6 || iph6->nexthdr != IPPROTO_UDP)
+			return -1;
+	}
+
+	udph = (struct udphdr *)cur;
+	if (validate_hdrlen(&cur, &len, sizeof(struct udphdr)))
+		return -1;
+	if (ntohs(udph->dest) != UDP_DST_PORT)
+		return -1;
+
+	return len;
+}
+
 FIXTURE(tun)
 {
 	char ifname[IFNAMSIZ];
@@ -664,6 +774,68 @@ receive_gso_packet_from_tunnel(FIXTURE_DATA(tun_vnet_udptnl) * self,
 	return total_len;
 }
 
+static int send_gso_packet_into_tunnel(FIXTURE_DATA(tun_vnet_udptnl) * self,
+				       const FIXTURE_VARIANT(tun_vnet_udptnl) *
+					       variant)
+{
+	int family = (variant->tunnel_type & UDP_TUNNEL_INNER_IPV4) ? AF_INET :
+								      AF_INET6;
+	uint8_t buf[MAX_VNET_TUNNEL_PACKET_SZ] = { 0 };
+	int payload_len = variant->data_size;
+	int gso_size = variant->gso_size;
+	struct sockaddr_storage ssa, dsa;
+
+	assign_sockaddr_vars(family, 0, &ssa, &dsa);
+	return send_gso_udp_msg(self->sock, &dsa, buf, payload_len, gso_size);
+}
+
+static int
+receive_gso_packet_from_tun(FIXTURE_DATA(tun_vnet_udptnl) * self,
+			    const FIXTURE_VARIANT(tun_vnet_udptnl) * variant,
+			    struct virtio_net_hdr_v1_hash_tunnel *vnet_hdr)
+{
+	struct timeval timeout = { .tv_sec = TIMEOUT_SEC };
+	uint8_t buf[MAX_VNET_TUNNEL_PACKET_SZ];
+	int tunnel_type = variant->tunnel_type;
+	int payload_len = variant->data_size;
+	bool is_tap = variant->is_tap;
+	int ret, len, total_len = 0;
+	int tun_fd = self->fd;
+	fd_set fdset;
+
+	while (total_len < payload_len) {
+		FD_ZERO(&fdset);
+		FD_SET(tun_fd, &fdset);
+
+		ret = select(tun_fd + 1, &fdset, NULL, NULL, &timeout);
+		if (ret <= 0) {
+			perror("select");
+			break;
+		}
+		if (!FD_ISSET(tun_fd, &fdset))
+			continue;
+
+		len = read(tun_fd, buf, sizeof(buf));
+		if (len <= 0) {
+			if (len < 0 && errno != EAGAIN && errno != EWOULDBLOCK)
+				perror("read");
+			break;
+		}
+
+		len = parse_udp_tunnel_vnet_packet(buf, len, tunnel_type,
+						   is_tap);
+		if (len < 0)
+			continue;
+
+		if (total_len == 0)
+			memcpy(vnet_hdr, buf, TUN_VNET_TNL_SIZE);
+
+		total_len += len;
+	}
+
+	return total_len;
+}
+
 TEST_F(tun_vnet_udptnl, send_gso_packet)
 {
 	uint8_t pkt[MAX_VNET_TUNNEL_PACKET_SZ];
@@ -680,4 +852,26 @@ TEST_F(tun_vnet_udptnl, send_gso_packet)
 	ASSERT_EQ(r_num_mss, variant->r_num_mss);
 }
 
+TEST_F(tun_vnet_udptnl, recv_gso_packet)
+{
+	struct virtio_net_hdr_v1_hash_tunnel vnet_hdr = { 0 };
+	struct virtio_net_hdr_v1 *vh = &vnet_hdr.hash_hdr.hdr;
+	int ret, gso_type = VIRTIO_NET_HDR_GSO_UDP_L4;
+
+	ret = send_gso_packet_into_tunnel(self, variant);
+	ASSERT_EQ(ret, variant->data_size);
+
+	memset(&vnet_hdr, 0, sizeof(vnet_hdr));
+	ret = receive_gso_packet_from_tun(self, variant, &vnet_hdr);
+	ASSERT_EQ(ret, variant->data_size);
+
+	if (!variant->no_gso) {
+		ASSERT_EQ(vh->gso_size, variant->gso_size);
+		gso_type |= (variant->tunnel_type & UDP_TUNNEL_OUTER_IPV4) ?
+				    (VIRTIO_NET_HDR_GSO_UDP_TUNNEL_IPV4) :
+				    (VIRTIO_NET_HDR_GSO_UDP_TUNNEL_IPV6);
+		ASSERT_EQ(vh->gso_type, gso_type);
+	}
+}
+
 TEST_HARNESS_MAIN
-- 
cgit v1.2.3


From 87db2fdef5a7c2ea8a404afd402800d48940d6b2 Mon Sep 17 00:00:00 2001
From: Xu Du <xudu@redhat.com>
Date: Wed, 21 Jan 2026 18:05:01 +0800
Subject: selftest: tun: Add test data for success and failure paths

To improve the robustness and coverage of the TUN selftests, this
patch expands the set of test data.

Signed-off-by: Xu Du <xudu@redhat.com>
Link: https://patch.msgid.link/5054f3ad9f3dbfe33b827183fccc5efeb8fd0da7.1768979440.git.xudu@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/tun.c | 115 +++++++++++++++++++++++++++++++++++++-
 1 file changed, 113 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/tun.c b/tools/testing/selftests/net/tun.c
index 628ae2caf6f4..8a5cd5cb5472 100644
--- a/tools/testing/selftests/net/tun.c
+++ b/tools/testing/selftests/net/tun.c
@@ -57,6 +57,10 @@ static struct in6_addr param_ipaddr6_inner_src = {
 	{ { 0x20, 0x02, 0x0d, 0xb8, 0x02, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2 } },
 };
 
+#ifndef BIT
+#define BIT(nr) (1UL << (nr))
+#endif
+
 #define VN_ID 1
 #define VN_PORT 4789
 #define UDP_SRC_PORT 22
@@ -72,6 +76,8 @@ static struct in6_addr param_ipaddr6_inner_src = {
 #define UDP_TUNNEL_GENEVE_4IN6 0x04
 #define UDP_TUNNEL_GENEVE_6IN6 0x08
 
+#define UDP_TUNNEL_MAX_SEGMENTS BIT(7)
+
 #define UDP_TUNNEL_OUTER_IPV4 (UDP_TUNNEL_GENEVE_4IN4 | UDP_TUNNEL_GENEVE_6IN4)
 #define UDP_TUNNEL_INNER_IPV4 (UDP_TUNNEL_GENEVE_4IN4 | UDP_TUNNEL_GENEVE_4IN6)
 
@@ -553,6 +559,39 @@ FIXTURE_VARIANT(tun_vnet_udptnl)
 
 /* clang-format off */
 #define TUN_VNET_UDPTNL_VARIANT_ADD(type, desc)                              \
+	FIXTURE_VARIANT_ADD(tun_vnet_udptnl, desc##_nogsosz_1byte) {         \
+		/* no GSO: send a single byte */                             \
+		.tunnel_type = type,                                         \
+		.data_size = 1,                                              \
+		.r_num_mss = 1,                                              \
+		.is_tap = true,                                              \
+		.no_gso = true,                                              \
+	};                                                                   \
+	FIXTURE_VARIANT_ADD(tun_vnet_udptnl, desc##_nogsosz_1mss) {          \
+		/* no GSO: send a single MSS, fall back to no GSO */         \
+		.tunnel_type = type,                                         \
+		.data_size = UDP_TUNNEL_MSS(type),                           \
+		.r_num_mss = 1,                                              \
+		.is_tap = true,                                              \
+		.no_gso = true,                                              \
+	};                                                                   \
+	FIXTURE_VARIANT_ADD(tun_vnet_udptnl, desc##_nogsosz_gtmss) {         \
+		/* no GSO: send a single MSS + 1B: fail */                   \
+		.tunnel_type = type,                                         \
+		.data_size = UDP_TUNNEL_MSS(type) + 1,                       \
+		.r_num_mss = 1,                                              \
+		.is_tap = true,                                              \
+		.no_gso = true,                                              \
+	};                                                                   \
+	FIXTURE_VARIANT_ADD(tun_vnet_udptnl, desc##_1byte) {                 \
+		/* GSO: send 1 byte, gso 1 byte, fall back to no GSO */      \
+		.tunnel_type = type,                                         \
+		.gso_size = 1,                                               \
+		.data_size = 1,                                              \
+		.r_num_mss = 1,                                              \
+		.is_tap = true,                                              \
+		.no_gso = true,                                              \
+	};                                                                   \
 	FIXTURE_VARIANT_ADD(tun_vnet_udptnl, desc##_1mss) {                  \
 		/* send a single MSS: fall back to no GSO */                 \
 		.tunnel_type = type,                                         \
@@ -561,8 +600,65 @@ FIXTURE_VARIANT(tun_vnet_udptnl)
 		.r_num_mss = 1,                                              \
 		.is_tap = true,                                              \
 		.no_gso = true,                                              \
-	};
-/* clang-format on */
+	};                                                                   \
+	FIXTURE_VARIANT_ADD(tun_vnet_udptnl, desc##_ltgso) {                 \
+		/* data <= MSS < gso: will fall back to no GSO */            \
+		.tunnel_type = type,                                         \
+		.gso_size = UDP_TUNNEL_MSS(type) + 1,                        \
+		.data_size = UDP_TUNNEL_MSS(type),                           \
+		.r_num_mss = 1,                                              \
+		.is_tap = true,                                              \
+		.no_gso = true,                                              \
+	};                                                                   \
+	FIXTURE_VARIANT_ADD(tun_vnet_udptnl, desc##_gtgso) {                 \
+		/* GSO: a single MSS + 1B */                                 \
+		.tunnel_type = type,                                         \
+		.gso_size = UDP_TUNNEL_MSS(type),                            \
+		.data_size = UDP_TUNNEL_MSS(type) + 1,                       \
+		.r_num_mss = 2,                                              \
+		.is_tap = true,                                              \
+	};                                                                   \
+	FIXTURE_VARIANT_ADD(tun_vnet_udptnl, desc##_2mss) {                  \
+		/* no GSO: send exactly 2 MSS */                             \
+		.tunnel_type = type,                                         \
+		.gso_size = UDP_TUNNEL_MSS(type),                            \
+		.data_size = UDP_TUNNEL_MSS(type) * 2,                       \
+		.r_num_mss = 2,                                              \
+		.is_tap = true,                                              \
+	};                                                                   \
+	FIXTURE_VARIANT_ADD(tun_vnet_udptnl, desc##_maxbytes) {              \
+		/* GSO: send max bytes */                                    \
+		.tunnel_type = type,                                         \
+		.gso_size = UDP_TUNNEL_MSS(type),                            \
+		.data_size = UDP_TUNNEL_MAX(type, true),                     \
+		.r_num_mss = UDP_TUNNEL_MAX(type, true) /                    \
+			     UDP_TUNNEL_MSS(type) + 1,                       \
+		.is_tap = true,                                              \
+	};                                                                   \
+	FIXTURE_VARIANT_ADD(tun_vnet_udptnl, desc##_over_maxbytes) {         \
+		/* GSO: send oversize max bytes: fail */                     \
+		.tunnel_type = type,                                         \
+		.gso_size = UDP_TUNNEL_MSS(type),                            \
+		.data_size = ETH_MAX_MTU,                                    \
+		.r_num_mss = ETH_MAX_MTU / UDP_TUNNEL_MSS(type) + 1,         \
+		.is_tap = true,                                              \
+	};                                                                   \
+	FIXTURE_VARIANT_ADD(tun_vnet_udptnl, desc##_maxsegs) {               \
+		/* GSO: send max number of min sized segments */             \
+		.tunnel_type = type,                                         \
+		.gso_size = 1,                                               \
+		.data_size = UDP_TUNNEL_MAX_SEGMENTS,                        \
+		.r_num_mss = UDP_TUNNEL_MAX_SEGMENTS,                        \
+		.is_tap = true,                                              \
+	};                                                                   \
+	FIXTURE_VARIANT_ADD(tun_vnet_udptnl, desc##_5byte) {                 \
+		/* GSO: send 5 bytes, gso 2 bytes */                         \
+		.tunnel_type = type,                                         \
+		.gso_size = 2,                                               \
+		.data_size = 5,                                              \
+		.r_num_mss = 3,                                              \
+		.is_tap = true,                                              \
+	} /* clang-format on */
 
 TUN_VNET_UDPTNL_VARIANT_ADD(UDP_TUNNEL_GENEVE_4IN4, 4in4);
 TUN_VNET_UDPTNL_VARIANT_ADD(UDP_TUNNEL_GENEVE_6IN4, 6in4);
@@ -874,4 +970,19 @@ TEST_F(tun_vnet_udptnl, recv_gso_packet)
 	}
 }
 
+XFAIL_ADD(tun_vnet_udptnl, 4in4_nogsosz_gtmss, recv_gso_packet);
+XFAIL_ADD(tun_vnet_udptnl, 6in4_nogsosz_gtmss, recv_gso_packet);
+XFAIL_ADD(tun_vnet_udptnl, 4in6_nogsosz_gtmss, recv_gso_packet);
+XFAIL_ADD(tun_vnet_udptnl, 6in6_nogsosz_gtmss, recv_gso_packet);
+
+XFAIL_ADD(tun_vnet_udptnl, 4in4_over_maxbytes, send_gso_packet);
+XFAIL_ADD(tun_vnet_udptnl, 6in4_over_maxbytes, send_gso_packet);
+XFAIL_ADD(tun_vnet_udptnl, 4in6_over_maxbytes, send_gso_packet);
+XFAIL_ADD(tun_vnet_udptnl, 6in6_over_maxbytes, send_gso_packet);
+
+XFAIL_ADD(tun_vnet_udptnl, 4in4_over_maxbytes, recv_gso_packet);
+XFAIL_ADD(tun_vnet_udptnl, 6in4_over_maxbytes, recv_gso_packet);
+XFAIL_ADD(tun_vnet_udptnl, 4in6_over_maxbytes, recv_gso_packet);
+XFAIL_ADD(tun_vnet_udptnl, 6in6_over_maxbytes, recv_gso_packet);
+
 TEST_HARNESS_MAIN
-- 
cgit v1.2.3


From d84f24c898864facc13412a58b78964f6a769d76 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Thu, 22 Jan 2026 13:35:05 -0800
Subject: perf header: Fix memory leaks in process_cpu_domain_info()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

do_read_string() returns a string in allocated memory, for some reason
there was unused memory allocations and unnecessary strdups.

Remove these and make the "perf annotate basic tests" leak sanitizer
clean.

Fixes: d40c68a49f69c9bd ("perf header: Support CPU DOMAIN relation info")
Reviewed-by: James Clark <james.clark@linaro.org>
Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Aditya Bodkhe <aditya.b1@linux.ibm.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Alexandre Ghiti <alex@ghiti.fr>
Cc: Athira Rajeev <atrajeev@linux.ibm.com>
Cc: Bill Wendling <morbo@google.com>
Cc: Dr. David Alan Gilbert <linux@treblig.org>
Cc: Guo Ren <guoren@kernel.org>
Cc: Howard Chu <howardchu95@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Julia Lawall <Julia.Lawall@inria.fr>
Cc: Justin Stitt <justinstitt@google.com>
Cc: Krzysztof Łopatowski <krzysztof.m.lopatowski@gmail.com>
Cc: Leo Yan <leo.yan@linux.dev>
Cc: linux-arm-kernel@lists.infradead.org
Cc: linux-csky@vger.kernel.org
Cc: linux-riscv@lists.infradead.org
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Nathan Chancellor <nathan@kernel.org>
Cc: Nick Desaulniers <nick.desaulniers+lkml@gmail.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <pjw@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sergei Trofimovich <slyich@gmail.com>
Cc: Shimin Guo <shimin.guo@skydio.com>
Cc: Suchit Karunakaran <suchitkarunakaran@gmail.com>
Cc: Swapnil Sapkal <swapnil.sapkal@amd.com>
Cc: Thomas Falcon <thomas.falcon@intel.com>
Cc: Tianyou Li <tianyou.li@intel.com>
Cc: Will Deacon <will@kernel.org>
Cc: Zecheng Li <zecheng@google.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/header.c | 17 ++++-------------
 1 file changed, 4 insertions(+), 13 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index 9a15dd4b7640..eefd1cd73b6a 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -3634,6 +3634,7 @@ static int process_cpu_domain_info(struct feat_fd *ff, void *data __maybe_unused
 			if (!d_info)
 				return -1;
 
+			assert(cd_map[cpu]->domains[domain] == NULL);
 			cd_map[cpu]->domains[domain] = d_info;
 			d_info->domain = domain;
 
@@ -3642,30 +3643,20 @@ static int process_cpu_domain_info(struct feat_fd *ff, void *data __maybe_unused
 				if (!dname)
 					return -1;
 
-				d_info->dname = zalloc(strlen(dname) + 1);
-				if (!d_info->dname)
-					return -1;
-
-				d_info->dname = strdup(dname);
+				d_info->dname = dname;
 			}
 
 			cpumask = do_read_string(ff);
 			if (!cpumask)
 				return -1;
 
-			d_info->cpumask = zalloc(strlen(cpumask) + 1);
-			if (!d_info->cpumask)
-				return -1;
-			d_info->cpumask = strdup(cpumask);
+			d_info->cpumask = cpumask;
 
 			cpulist = do_read_string(ff);
 			if (!cpulist)
 				return -1;
 
-			d_info->cpulist = zalloc(strlen(cpulist) + 1);
-			if (!d_info->cpulist)
-				return -1;
-			d_info->cpulist = strdup(cpulist);
+			d_info->cpulist = cpulist;
 		}
 	}
 
-- 
cgit v1.2.3


From 00419892bac28bf148450d762bbff990a6bd5494 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Thu, 22 Jan 2026 13:35:06 -0800
Subject: perf annotate: Fix args leak of map_symbol
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

map_symbol__exit() needs calling on an annotate_args.ms, however, rather
than introduce proper reference count handling to symbol__annotate()
just switch to passing the map_symbol pointer parameter around, making
the puts the caller's responsibility.

Fix a number of cases to ensure the map in a map_symbol has a
reference count increment and add the then necessary map_symbol_exits.

Fixes: 56e144fe98260a0f ("perf mem_info: Add and use map_symbol__exit and addr_map_symbol__exit")
Reviewed-by: James Clark <james.clark@linaro.org>
Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Aditya Bodkhe <aditya.b1@linux.ibm.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Alexandre Ghiti <alex@ghiti.fr>
Cc: Athira Rajeev <atrajeev@linux.ibm.com>
Cc: Bill Wendling <morbo@google.com>
Cc: Dr. David Alan Gilbert <linux@treblig.org>
Cc: Guo Ren <guoren@kernel.org>
Cc: Howard Chu <howardchu95@gmail.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Julia Lawall <Julia.Lawall@inria.fr>
Cc: Justin Stitt <justinstitt@google.com>
Cc: Krzysztof Łopatowski <krzysztof.m.lopatowski@gmail.com>
Cc: Leo Yan <leo.yan@linux.dev>
Cc: linux-arm-kernel@lists.infradead.org
Cc: linux-csky@vger.kernel.org
Cc: linux-riscv@lists.infradead.org
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Nathan Chancellor <nathan@kernel.org>
Cc: Nick Desaulniers <nick.desaulniers+lkml@gmail.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <pjw@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sergei Trofimovich <slyich@gmail.com>
Cc: Shimin Guo <shimin.guo@skydio.com>
Cc: Suchit Karunakaran <suchitkarunakaran@gmail.com>
Cc: Thomas Falcon <thomas.falcon@intel.com>
Cc: Tianyou Li <tianyou.li@intel.com>
Cc: Will Deacon <will@kernel.org>
Cc: Zecheng Li <zecheng@google.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/arch/loongarch/annotate/instructions.c | 14 +++++----
 tools/perf/arch/s390/annotate/instructions.c      | 11 ++++---
 tools/perf/util/annotate.c                        |  2 +-
 tools/perf/util/capstone.c                        | 14 ++++-----
 tools/perf/util/disasm.c                          | 36 +++++++++++++----------
 tools/perf/util/disasm.h                          |  2 +-
 tools/perf/util/llvm.c                            |  6 ++--
 7 files changed, 47 insertions(+), 38 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/arch/loongarch/annotate/instructions.c b/tools/perf/arch/loongarch/annotate/instructions.c
index 70262d5f1444..1c3abb43c8d7 100644
--- a/tools/perf/arch/loongarch/annotate/instructions.c
+++ b/tools/perf/arch/loongarch/annotate/instructions.c
@@ -10,9 +10,7 @@ static int loongarch_call__parse(struct arch *arch, struct ins_operands *ops, st
 {
 	char *c, *endptr, *tok, *name;
 	struct map *map = ms->map;
-	struct addr_map_symbol target = {
-		.ms = { .map = map, },
-	};
+	struct addr_map_symbol target;
 
 	c = strchr(ops->raw, '#');
 	if (c++ == NULL)
@@ -38,12 +36,16 @@ static int loongarch_call__parse(struct arch *arch, struct ins_operands *ops, st
 	if (ops->target.name == NULL)
 		return -1;
 
-	target.addr = map__objdump_2mem(map, ops->target.addr);
+	target = (struct addr_map_symbol) {
+		.ms = { .map = map__get(map), },
+		.addr = map__objdump_2mem(map, ops->target.addr),
+	};
 
 	if (maps__find_ams(ms->maps, &target) == 0 &&
 	    map__rip_2objdump(target.ms.map, map__map_ip(target.ms.map, target.addr)) == ops->target.addr)
 		ops->target.sym = target.ms.sym;
 
+	addr_map_symbol__exit(&target);
 	return 0;
 }
 
@@ -58,7 +60,7 @@ static int loongarch_jump__parse(struct arch *arch, struct ins_operands *ops, st
 	struct map *map = ms->map;
 	struct symbol *sym = ms->sym;
 	struct addr_map_symbol target = {
-		.ms = { .map = map, },
+		.ms = { .map = map__get(map), },
 	};
 	const char *c = strchr(ops->raw, '#');
 	u64 start, end;
@@ -90,7 +92,7 @@ static int loongarch_jump__parse(struct arch *arch, struct ins_operands *ops, st
 	} else {
 		ops->target.offset_avail = false;
 	}
-
+	addr_map_symbol__exit(&target);
 	return 0;
 }
 
diff --git a/tools/perf/arch/s390/annotate/instructions.c b/tools/perf/arch/s390/annotate/instructions.c
index c61193f1e096..626e6d2cbc81 100644
--- a/tools/perf/arch/s390/annotate/instructions.c
+++ b/tools/perf/arch/s390/annotate/instructions.c
@@ -6,9 +6,7 @@ static int s390_call__parse(struct arch *arch, struct ins_operands *ops,
 {
 	char *endptr, *tok, *name;
 	struct map *map = ms->map;
-	struct addr_map_symbol target = {
-		.ms = { .map = map, },
-	};
+	struct addr_map_symbol target;
 
 	tok = strchr(ops->raw, ',');
 	if (!tok)
@@ -36,12 +34,17 @@ static int s390_call__parse(struct arch *arch, struct ins_operands *ops,
 
 	if (ops->target.name == NULL)
 		return -1;
-	target.addr = map__objdump_2mem(map, ops->target.addr);
+
+	target = (struct addr_map_symbol) {
+		.ms = { .map = map__get(map), },
+		.addr = map__objdump_2mem(map, ops->target.addr),
+	};
 
 	if (maps__find_ams(ms->maps, &target) == 0 &&
 	    map__rip_2objdump(target.ms.map, map__map_ip(target.ms.map, target.addr)) == ops->target.addr)
 		ops->target.sym = target.ms.sym;
 
+	addr_map_symbol__exit(&target);
 	return 0;
 }
 
diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c
index cc7764455faf..791d60f97c23 100644
--- a/tools/perf/util/annotate.c
+++ b/tools/perf/util/annotate.c
@@ -1031,7 +1031,7 @@ int symbol__annotate(struct map_symbol *ms, struct evsel *evsel,
 		return 0;
 
 	args.arch = arch;
-	args.ms = *ms;
+	args.ms = ms;
 
 	if (notes->src == NULL) {
 		notes->src = annotated_source__new();
diff --git a/tools/perf/util/capstone.c b/tools/perf/util/capstone.c
index be5fd44b1f9d..2c7feab61b7b 100644
--- a/tools/perf/util/capstone.c
+++ b/tools/perf/util/capstone.c
@@ -143,7 +143,7 @@ static void print_capstone_detail(cs_insn *insn, char *buf, size_t len,
 				  struct annotate_args *args, u64 addr)
 {
 	int i;
-	struct map *map = args->ms.map;
+	struct map *map = args->ms->map;
 	struct symbol *sym;
 
 	/* TODO: support more architectures */
@@ -222,7 +222,7 @@ int symbol__disassemble_capstone(const char *filename __maybe_unused,
 {
 #ifdef HAVE_LIBCAPSTONE_SUPPORT
 	struct annotation *notes = symbol__annotation(sym);
-	struct map *map = args->ms.map;
+	struct map *map = args->ms->map;
 	struct dso *dso = map__dso(map);
 	u64 start = map__rip_2objdump(map, sym->start);
 	u64 offset;
@@ -256,7 +256,7 @@ int symbol__disassemble_capstone(const char *filename __maybe_unused,
 	args->line = disasm_buf;
 	args->line_nr = 0;
 	args->fileloc = NULL;
-	args->ms.sym = sym;
+	args->ms->sym = sym;
 
 	dl = disasm_line__new(args);
 	if (dl == NULL)
@@ -268,7 +268,7 @@ int symbol__disassemble_capstone(const char *filename __maybe_unused,
 	    !strcmp(args->options->disassembler_style, "att"))
 		disassembler_style = true;
 
-	if (capstone_init(maps__machine(args->ms.maps), &handle, is_64bit, disassembler_style) < 0)
+	if (capstone_init(maps__machine(args->ms->maps), &handle, is_64bit, disassembler_style) < 0)
 		goto err;
 
 	needs_cs_close = true;
@@ -345,7 +345,7 @@ int symbol__disassemble_capstone_powerpc(const char *filename __maybe_unused,
 {
 #ifdef HAVE_LIBCAPSTONE_SUPPORT
 	struct annotation *notes = symbol__annotation(sym);
-	struct map *map = args->ms.map;
+	struct map *map = args->ms->map;
 	struct dso *dso = map__dso(map);
 	struct nscookie nsc;
 	u64 start = map__rip_2objdump(map, sym->start);
@@ -382,7 +382,7 @@ int symbol__disassemble_capstone_powerpc(const char *filename __maybe_unused,
 	    !strcmp(args->options->disassembler_style, "att"))
 		disassembler_style = true;
 
-	if (capstone_init(maps__machine(args->ms.maps), &handle, is_64bit, disassembler_style) < 0)
+	if (capstone_init(maps__machine(args->ms->maps), &handle, is_64bit, disassembler_style) < 0)
 		goto err;
 
 	needs_cs_close = true;
@@ -408,7 +408,7 @@ int symbol__disassemble_capstone_powerpc(const char *filename __maybe_unused,
 	args->line = disasm_buf;
 	args->line_nr = 0;
 	args->fileloc = NULL;
-	args->ms.sym = sym;
+	args->ms->sym = sym;
 
 	dl = disasm_line__new(args);
 	if (dl == NULL)
diff --git a/tools/perf/util/disasm.c b/tools/perf/util/disasm.c
index 50b9433f3f8e..924429142631 100644
--- a/tools/perf/util/disasm.c
+++ b/tools/perf/util/disasm.c
@@ -269,9 +269,7 @@ static int call__parse(struct arch *arch, struct ins_operands *ops, struct map_s
 {
 	char *endptr, *tok, *name;
 	struct map *map = ms->map;
-	struct addr_map_symbol target = {
-		.ms = { .map = map, },
-	};
+	struct addr_map_symbol target;
 
 	ops->target.addr = strtoull(ops->raw, &endptr, 16);
 
@@ -296,12 +294,16 @@ static int call__parse(struct arch *arch, struct ins_operands *ops, struct map_s
 	if (ops->target.name == NULL)
 		return -1;
 find_target:
-	target.addr = map__objdump_2mem(map, ops->target.addr);
+	target = (struct addr_map_symbol) {
+		.ms = { .map = map__get(map), },
+		.addr = map__objdump_2mem(map, ops->target.addr),
+	};
 
 	if (maps__find_ams(ms->maps, &target) == 0 &&
 	    map__rip_2objdump(target.ms.map, map__map_ip(target.ms.map, target.addr)) == ops->target.addr)
 		ops->target.sym = target.ms.sym;
 
+	addr_map_symbol__exit(&target);
 	return 0;
 
 indirect_call:
@@ -366,7 +368,7 @@ static int jump__parse(struct arch *arch, struct ins_operands *ops, struct map_s
 	struct map *map = ms->map;
 	struct symbol *sym = ms->sym;
 	struct addr_map_symbol target = {
-		.ms = { .map = map, },
+		.ms = { .map = map__get(map), },
 	};
 	const char *c = strchr(ops->raw, ',');
 	u64 start, end;
@@ -440,7 +442,7 @@ static int jump__parse(struct arch *arch, struct ins_operands *ops, struct map_s
 	} else {
 		ops->target.offset_avail = false;
 	}
-
+	addr_map_symbol__exit(&target);
 	return 0;
 }
 
@@ -1046,7 +1048,7 @@ static size_t disasm_line_size(int nr)
 struct disasm_line *disasm_line__new(struct annotate_args *args)
 {
 	struct disasm_line *dl = NULL;
-	struct annotation *notes = symbol__annotation(args->ms.sym);
+	struct annotation *notes = symbol__annotation(args->ms->sym);
 	int nr = notes->src->nr_events;
 
 	dl = zalloc(disasm_line_size(nr));
@@ -1064,7 +1066,7 @@ struct disasm_line *disasm_line__new(struct annotate_args *args)
 		} else if (disasm_line__parse(dl->al.line, &dl->ins.name, &dl->ops.raw) < 0)
 			goto out_free_line;
 
-		disasm_line__init_ins(dl, args->arch, &args->ms);
+		disasm_line__init_ins(dl, args->arch, args->ms);
 	}
 
 	return dl;
@@ -1119,7 +1121,7 @@ static int symbol__parse_objdump_line(struct symbol *sym,
 				      struct annotate_args *args,
 				      char *parsed_line, int *line_nr, char **fileloc)
 {
-	struct map *map = args->ms.map;
+	struct map *map = args->ms->map;
 	struct annotation *notes = symbol__annotation(sym);
 	struct disasm_line *dl;
 	char *tmp;
@@ -1151,7 +1153,7 @@ static int symbol__parse_objdump_line(struct symbol *sym,
 	args->line    = parsed_line;
 	args->line_nr = *line_nr;
 	args->fileloc = *fileloc;
-	args->ms.sym  = sym;
+	args->ms->sym  = sym;
 
 	dl = disasm_line__new(args);
 	(*line_nr)++;
@@ -1169,12 +1171,14 @@ static int symbol__parse_objdump_line(struct symbol *sym,
 	if (dl->ins.ops && ins__is_call(&dl->ins) && !dl->ops.target.sym) {
 		struct addr_map_symbol target = {
 			.addr = dl->ops.target.addr,
-			.ms = { .map = map, },
+			.ms = { .map = map__get(map), },
 		};
 
-		if (!maps__find_ams(args->ms.maps, &target) &&
+		if (!maps__find_ams(args->ms->maps, &target) &&
 		    target.ms.sym->start == target.al_addr)
 			dl->ops.target.sym = target.ms.sym;
+
+		addr_map_symbol__exit(&target);
 	}
 
 	annotation_line__add(&dl->al, &notes->src->source);
@@ -1338,7 +1342,7 @@ static int symbol__disassemble_raw(char *filename, struct symbol *sym,
 					struct annotate_args *args)
 {
 	struct annotation *notes = symbol__annotation(sym);
-	struct map *map = args->ms.map;
+	struct map *map = args->ms->map;
 	struct dso *dso = map__dso(map);
 	u64 start = map__rip_2objdump(map, sym->start);
 	u64 end = map__rip_2objdump(map, sym->end);
@@ -1375,7 +1379,7 @@ static int symbol__disassemble_raw(char *filename, struct symbol *sym,
 	args->line = disasm_buf;
 	args->line_nr = 0;
 	args->fileloc = NULL;
-	args->ms.sym = sym;
+	args->ms->sym = sym;
 
 	dl = disasm_line__new(args);
 	if (dl == NULL)
@@ -1501,7 +1505,7 @@ static int symbol__disassemble_objdump(const char *filename, struct symbol *sym,
 				       struct annotate_args *args)
 {
 	struct annotation_options *opts = &annotate_opts;
-	struct map *map = args->ms.map;
+	struct map *map = args->ms->map;
 	struct dso *dso = map__dso(map);
 	char *command;
 	FILE *file;
@@ -1644,7 +1648,7 @@ out_free_command:
 int symbol__disassemble(struct symbol *sym, struct annotate_args *args)
 {
 	struct annotation_options *options = args->options;
-	struct map *map = args->ms.map;
+	struct map *map = args->ms->map;
 	struct dso *dso = map__dso(map);
 	char symfs_filename[PATH_MAX];
 	bool delete_extract = false;
diff --git a/tools/perf/util/disasm.h b/tools/perf/util/disasm.h
index d2cb555e4a3b..a3ea9d676281 100644
--- a/tools/perf/util/disasm.h
+++ b/tools/perf/util/disasm.h
@@ -97,7 +97,7 @@ struct ins_ops {
 
 struct annotate_args {
 	struct arch		  *arch;
-	struct map_symbol	  ms;
+	struct map_symbol	  *ms;
 	struct annotation_options *options;
 	s64			  offset;
 	char			  *line;
diff --git a/tools/perf/util/llvm.c b/tools/perf/util/llvm.c
index 2ebf1f5f65bf..4ada9a10bd93 100644
--- a/tools/perf/util/llvm.c
+++ b/tools/perf/util/llvm.c
@@ -118,7 +118,7 @@ int symbol__disassemble_llvm(const char *filename, struct symbol *sym,
 {
 #ifdef HAVE_LIBLLVM_SUPPORT
 	struct annotation *notes = symbol__annotation(sym);
-	struct map *map = args->ms.map;
+	struct map *map = args->ms->map;
 	struct dso *dso = map__dso(map);
 	u64 start = map__rip_2objdump(map, sym->start);
 	/* Malloc-ed buffer containing instructions read from disk. */
@@ -184,7 +184,7 @@ int symbol__disassemble_llvm(const char *filename, struct symbol *sym,
 	args->line = disasm_buf;
 	args->line_nr = 0;
 	args->fileloc = NULL;
-	args->ms.sym = sym;
+	args->ms->sym = sym;
 
 	dl = disasm_line__new(args);
 	if (dl == NULL)
@@ -242,7 +242,7 @@ int symbol__disassemble_llvm(const char *filename, struct symbol *sym,
 					 &line_storage_len);
 		args->line_nr = 0;
 		args->fileloc = NULL;
-		args->ms.sym = sym;
+		args->ms->sym = sym;
 
 		llvm_addr2line(filename, pc, &args->fileloc,
 			       (unsigned int *)&args->line_nr, false, NULL);
-- 
cgit v1.2.3


From 6fdd2676db55b503c52dd3f1359b5c57f774ab75 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Thu, 22 Jan 2026 13:35:07 -0800
Subject: perf maps: Fix reference count leak in maps__find_ams()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

ams and so ams->ms.map is an in argument, however, it is also
overwritten. As a map is reference counted, ensure a map__put() is done
before overwriting it.

Fixes: 42fd623b58dbcc48 ("perf maps: Get map before returning in maps__find")
Reviewed-by: James Clark <james.clark@linaro.org>
Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Aditya Bodkhe <aditya.b1@linux.ibm.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Alexandre Ghiti <alex@ghiti.fr>
Cc: Athira Rajeev <atrajeev@linux.ibm.com>
Cc: Bill Wendling <morbo@google.com>
Cc: Dr. David Alan Gilbert <linux@treblig.org>
Cc: Guo Ren <guoren@kernel.org>
Cc: Howard Chu <howardchu95@gmail.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Julia Lawall <Julia.Lawall@inria.fr>
Cc: Justin Stitt <justinstitt@google.com>
Cc: Krzysztof Łopatowski <krzysztof.m.lopatowski@gmail.com>
Cc: Leo Yan <leo.yan@linux.dev>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Nathan Chancellor <nathan@kernel.org>
Cc: Nick Desaulniers <nick.desaulniers+lkml@gmail.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <pjw@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sergei Trofimovich <slyich@gmail.com>
Cc: Shimin Guo <shimin.guo@skydio.com>
Cc: Suchit Karunakaran <suchitkarunakaran@gmail.com>
Cc: Thomas Falcon <thomas.falcon@intel.com>
Cc: Tianyou Li <tianyou.li@intel.com>
Cc: Will Deacon <will@kernel.org>
Cc: Zecheng Li <zecheng@google.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/maps.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'tools')

diff --git a/tools/perf/util/maps.c b/tools/perf/util/maps.c
index 8ccc46d515b6..4092211cff62 100644
--- a/tools/perf/util/maps.c
+++ b/tools/perf/util/maps.c
@@ -708,6 +708,7 @@ int maps__find_ams(struct maps *maps, struct addr_map_symbol *ams)
 	if (ams->addr < map__start(ams->ms.map) || ams->addr >= map__end(ams->ms.map)) {
 		if (maps == NULL)
 			return -1;
+		map__put(ams->ms.map);
 		ams->ms.map = maps__find(maps, ams->addr);
 		if (ams->ms.map == NULL)
 			return -1;
-- 
cgit v1.2.3


From 57d26593a92fdeaca5adcbbb5362fa13d5dd7540 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Thu, 22 Jan 2026 13:35:08 -0800
Subject: perf disasm: Constify use of 'struct arch'
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The 'struct arch' holds variables that are read but not written, except
during some initialization.

Change most uses to be for a "const struct arch *" version to capture
this immutability.

Reviewed-by: James Clark <james.clark@linaro.org>
Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Aditya Bodkhe <aditya.b1@linux.ibm.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Alexandre Ghiti <alex@ghiti.fr>
Cc: Athira Rajeev <atrajeev@linux.ibm.com>
Cc: Bill Wendling <morbo@google.com>
Cc: Dr. David Alan Gilbert <linux@treblig.org>
Cc: Guo Ren <guoren@kernel.org>
Cc: Howard Chu <howardchu95@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Julia Lawall <Julia.Lawall@inria.fr>
Cc: Justin Stitt <justinstitt@google.com>
Cc: Krzysztof Łopatowski <krzysztof.m.lopatowski@gmail.com>
Cc: Leo Yan <leo.yan@linux.dev>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Nathan Chancellor <nathan@kernel.org>
Cc: Nick Desaulniers <nick.desaulniers+lkml@gmail.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <pjw@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sergei Trofimovich <slyich@gmail.com>
Cc: Shimin Guo <shimin.guo@skydio.com>
Cc: Suchit Karunakaran <suchitkarunakaran@gmail.com>
Cc: Thomas Falcon <thomas.falcon@intel.com>
Cc: Tianyou Li <tianyou.li@intel.com>
Cc: Will Deacon <will@kernel.org>
Cc: Zecheng Li <zecheng@google.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/arch/arm64/annotate/instructions.c     |  2 +-
 tools/perf/arch/loongarch/annotate/instructions.c | 12 ++++---
 tools/perf/arch/s390/annotate/instructions.c      |  7 ++--
 tools/perf/arch/x86/annotate/instructions.c       |  4 +--
 tools/perf/ui/browsers/annotate.c                 |  2 +-
 tools/perf/util/annotate-data.c                   |  2 +-
 tools/perf/util/annotate-data.h                   |  2 +-
 tools/perf/util/annotate.c                        | 28 +++++++--------
 tools/perf/util/annotate.h                        | 10 +++---
 tools/perf/util/disasm.c                          | 42 +++++++++++++----------
 tools/perf/util/disasm.h                          | 14 ++++----
 11 files changed, 67 insertions(+), 58 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/arch/arm64/annotate/instructions.c b/tools/perf/arch/arm64/annotate/instructions.c
index 16cb62d40bd9..5099fa36180d 100644
--- a/tools/perf/arch/arm64/annotate/instructions.c
+++ b/tools/perf/arch/arm64/annotate/instructions.c
@@ -10,7 +10,7 @@ struct arm64_annotate {
 		jump_insn;
 };
 
-static int arm64_mov__parse(struct arch *arch __maybe_unused,
+static int arm64_mov__parse(const struct arch *arch __maybe_unused,
 			    struct ins_operands *ops,
 			    struct map_symbol *ms __maybe_unused,
 			    struct disasm_line *dl __maybe_unused)
diff --git a/tools/perf/arch/loongarch/annotate/instructions.c b/tools/perf/arch/loongarch/annotate/instructions.c
index 1c3abb43c8d7..5ebfe629ea68 100644
--- a/tools/perf/arch/loongarch/annotate/instructions.c
+++ b/tools/perf/arch/loongarch/annotate/instructions.c
@@ -5,8 +5,10 @@
  * Copyright (C) 2020-2023 Loongson Technology Corporation Limited
  */
 
-static int loongarch_call__parse(struct arch *arch, struct ins_operands *ops, struct map_symbol *ms,
-		struct disasm_line *dl __maybe_unused)
+static int loongarch_call__parse(const struct arch *arch, struct ins_operands *ops,
+				 struct map_symbol *ms,
+				 struct disasm_line *dl __maybe_unused)
+
 {
 	char *c, *endptr, *tok, *name;
 	struct map *map = ms->map;
@@ -54,8 +56,10 @@ static struct ins_ops loongarch_call_ops = {
 	.scnprintf = call__scnprintf,
 };
 
-static int loongarch_jump__parse(struct arch *arch, struct ins_operands *ops, struct map_symbol *ms,
-		struct disasm_line *dl __maybe_unused)
+static int loongarch_jump__parse(const struct arch *arch, struct ins_operands *ops,
+				 struct map_symbol *ms,
+				 struct disasm_line *dl __maybe_unused)
+
 {
 	struct map *map = ms->map;
 	struct symbol *sym = ms->sym;
diff --git a/tools/perf/arch/s390/annotate/instructions.c b/tools/perf/arch/s390/annotate/instructions.c
index 626e6d2cbc81..37c1b62641d8 100644
--- a/tools/perf/arch/s390/annotate/instructions.c
+++ b/tools/perf/arch/s390/annotate/instructions.c
@@ -1,8 +1,9 @@
 // SPDX-License-Identifier: GPL-2.0
 #include <linux/compiler.h>
 
-static int s390_call__parse(struct arch *arch, struct ins_operands *ops,
-			    struct map_symbol *ms, struct disasm_line *dl __maybe_unused)
+static int s390_call__parse(const struct arch *arch, struct ins_operands *ops,
+			    struct map_symbol *ms,
+			    struct disasm_line *dl __maybe_unused)
 {
 	char *endptr, *tok, *name;
 	struct map *map = ms->map;
@@ -53,7 +54,7 @@ static struct ins_ops s390_call_ops = {
 	.scnprintf = call__scnprintf,
 };
 
-static int s390_mov__parse(struct arch *arch __maybe_unused,
+static int s390_mov__parse(const struct arch *arch __maybe_unused,
 			   struct ins_operands *ops,
 			   struct map_symbol *ms __maybe_unused,
 			   struct disasm_line *dl __maybe_unused)
diff --git a/tools/perf/arch/x86/annotate/instructions.c b/tools/perf/arch/x86/annotate/instructions.c
index 803f9351a3fb..24b388bacdae 100644
--- a/tools/perf/arch/x86/annotate/instructions.c
+++ b/tools/perf/arch/x86/annotate/instructions.c
@@ -119,7 +119,7 @@ static struct ins x86__instructions[] = {
 	{ .name = "xorps",	.ops = &mov_ops, },
 };
 
-static bool amd__ins_is_fused(struct arch *arch, const char *ins1,
+static bool amd__ins_is_fused(const struct arch *arch, const char *ins1,
 			      const char *ins2)
 {
 	if (strstr(ins2, "jmp"))
@@ -142,7 +142,7 @@ static bool amd__ins_is_fused(struct arch *arch, const char *ins1,
 	return false;
 }
 
-static bool intel__ins_is_fused(struct arch *arch, const char *ins1,
+static bool intel__ins_is_fused(const struct arch *arch, const char *ins1,
 				const char *ins2)
 {
 	if (arch->family != 6 || arch->model < 0x1e || strstr(ins2, "jmp"))
diff --git a/tools/perf/ui/browsers/annotate.c b/tools/perf/ui/browsers/annotate.c
index 36aca8d6d003..3df61cd46652 100644
--- a/tools/perf/ui/browsers/annotate.c
+++ b/tools/perf/ui/browsers/annotate.c
@@ -30,7 +30,7 @@ struct annotate_browser {
 	struct rb_root		    entries;
 	struct rb_node		   *curr_hot;
 	struct annotation_line	   *selection;
-	struct arch		   *arch;
+	const struct arch	   *arch;
 	/*
 	 * perf top can delete hist_entry anytime.  Callers should make sure
 	 * its lifetime.
diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c
index 07cf9c334be0..edfcd6e9df9c 100644
--- a/tools/perf/util/annotate-data.c
+++ b/tools/perf/util/annotate-data.c
@@ -160,7 +160,7 @@ bool has_reg_type(struct type_state *state, int reg)
 	return (unsigned)reg < ARRAY_SIZE(state->regs);
 }
 
-static void init_type_state(struct type_state *state, struct arch *arch)
+static void init_type_state(struct type_state *state, const struct arch *arch)
 {
 	memset(state, 0, sizeof(*state));
 	INIT_LIST_HEAD(&state->stack_vars);
diff --git a/tools/perf/util/annotate-data.h b/tools/perf/util/annotate-data.h
index 869307c7f130..9b222869e42d 100644
--- a/tools/perf/util/annotate-data.h
+++ b/tools/perf/util/annotate-data.h
@@ -117,7 +117,7 @@ extern struct annotated_data_type canary_type;
  */
 struct data_loc_info {
 	/* These are input field, should be filled by caller */
-	struct arch *arch;
+	const struct arch *arch;
 	struct thread *thread;
 	struct map_symbol *ms;
 	u64 ip;
diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c
index 791d60f97c23..132af2556aec 100644
--- a/tools/perf/util/annotate.c
+++ b/tools/perf/util/annotate.c
@@ -761,7 +761,7 @@ static int disasm_line__print(struct disasm_line *dl, u64 start, int addr_fmt_wi
 }
 
 static struct annotated_data_type *
-__hist_entry__get_data_type(struct hist_entry *he, struct arch *arch,
+__hist_entry__get_data_type(struct hist_entry *he, const struct arch *arch,
 			    struct debuginfo *dbg, struct disasm_line *dl,
 			    int *type_offset);
 
@@ -980,11 +980,11 @@ void symbol__calc_percent(struct symbol *sym, struct evsel *evsel)
 	annotation__calc_percent(notes, evsel, symbol__size(sym));
 }
 
-int evsel__get_arch(struct evsel *evsel, struct arch **parch)
+int evsel__get_arch(struct evsel *evsel, const struct arch **parch)
 {
 	struct perf_env *env = evsel__env(evsel);
 	const char *arch_name = perf_env__arch(env);
-	struct arch *arch;
+	const struct arch *arch;
 	int err;
 
 	if (!arch_name) {
@@ -999,7 +999,7 @@ int evsel__get_arch(struct evsel *evsel, struct arch **parch)
 	}
 
 	if (arch->init) {
-		err = arch->init(arch, env ? env->cpuid : NULL);
+		err = arch->init((struct arch *)arch, env ? env->cpuid : NULL);
 		if (err) {
 			pr_err("%s: failed to initialize %s arch priv area\n",
 			       __func__, arch->name);
@@ -1010,14 +1010,14 @@ int evsel__get_arch(struct evsel *evsel, struct arch **parch)
 }
 
 int symbol__annotate(struct map_symbol *ms, struct evsel *evsel,
-		     struct arch **parch)
+		     const struct arch **parch)
 {
 	struct symbol *sym = ms->sym;
 	struct annotation *notes = symbol__annotation(sym);
 	struct annotate_args args = {
 		.options	= &annotate_opts,
 	};
-	struct arch *arch = NULL;
+	const struct arch *arch = NULL;
 	int err, nr;
 
 	err = evsel__get_arch(evsel, &arch);
@@ -2204,7 +2204,7 @@ print_addr:
 }
 
 int symbol__annotate2(struct map_symbol *ms, struct evsel *evsel,
-		      struct arch **parch)
+		      const struct arch **parch)
 {
 	struct symbol *sym = ms->sym;
 	struct annotation *notes = symbol__annotation(sym);
@@ -2457,7 +2457,7 @@ int annotate_check_args(void)
  * to revisit the format when it handles different architecture.
  * Fills @reg and @offset when return 0.
  */
-static int extract_reg_offset(struct arch *arch, const char *str,
+static int extract_reg_offset(const struct arch *arch, const char *str,
 			      struct annotated_op_loc *op_loc)
 {
 	char *p;
@@ -2538,7 +2538,7 @@ static int extract_reg_offset(struct arch *arch, const char *str,
  *                              # dst_reg1 = rbx, dst_reg2 = rcx, dst_mem = 1
  *                              # dst_multi_regs = 1, dst_offset = 8
  */
-int annotate_get_insn_location(struct arch *arch, struct disasm_line *dl,
+int annotate_get_insn_location(const struct arch *arch, struct disasm_line *dl,
 			       struct annotated_insn_loc *loc)
 {
 	struct ins_operands *ops;
@@ -2673,7 +2673,7 @@ static struct annotated_item_stat *annotate_data_stat(struct list_head *head,
 	return istat;
 }
 
-static bool is_stack_operation(struct arch *arch, struct disasm_line *dl)
+static bool is_stack_operation(const struct arch *arch, struct disasm_line *dl)
 {
 	if (arch__is(arch, "x86")) {
 		if (!strncmp(dl->ins.name, "push", 4) ||
@@ -2686,7 +2686,7 @@ static bool is_stack_operation(struct arch *arch, struct disasm_line *dl)
 	return false;
 }
 
-static bool is_stack_canary(struct arch *arch, struct annotated_op_loc *loc)
+static bool is_stack_canary(const struct arch *arch, struct annotated_op_loc *loc)
 {
 	/* On x86_64, %gs:40 is used for stack canary */
 	if (arch__is(arch, "x86")) {
@@ -2702,7 +2702,7 @@ static bool is_stack_canary(struct arch *arch, struct annotated_op_loc *loc)
  * Returns true if the instruction has a memory operand without
  * performing a load/store
  */
-static bool is_address_gen_insn(struct arch *arch, struct disasm_line *dl)
+static bool is_address_gen_insn(const struct arch *arch, struct disasm_line *dl)
 {
 	if (arch__is(arch, "x86")) {
 		if (!strncmp(dl->ins.name, "lea", 3))
@@ -2791,7 +2791,7 @@ void debuginfo_cache__delete(void)
 }
 
 static struct annotated_data_type *
-__hist_entry__get_data_type(struct hist_entry *he, struct arch *arch,
+__hist_entry__get_data_type(struct hist_entry *he, const struct arch *arch,
 			    struct debuginfo *dbg, struct disasm_line *dl,
 			    int *type_offset)
 {
@@ -2895,7 +2895,7 @@ struct annotated_data_type *hist_entry__get_data_type(struct hist_entry *he)
 {
 	struct map_symbol *ms = &he->ms;
 	struct evsel *evsel = hists_to_evsel(he->hists);
-	struct arch *arch;
+	const struct arch *arch;
 	struct disasm_line *dl;
 	struct annotated_data_type *mem_type;
 	struct annotated_item_stat *istat;
diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h
index d4990bff29a7..58eaf4b2fa65 100644
--- a/tools/perf/util/annotate.h
+++ b/tools/perf/util/annotate.h
@@ -202,7 +202,7 @@ struct annotation_write_ops {
 struct annotation_print_data {
 	struct hist_entry *he;
 	struct evsel *evsel;
-	struct arch *arch;
+	const struct arch *arch;
 	struct debuginfo *dbg;
 	/* save data type info keyed by al->offset */
 	struct hashmap *type_hash;
@@ -441,10 +441,10 @@ void symbol__annotate_zero_histograms(struct symbol *sym);
 
 int symbol__annotate(struct map_symbol *ms,
 		     struct evsel *evsel,
-		     struct arch **parch);
+		     const struct arch **parch);
 int symbol__annotate2(struct map_symbol *ms,
 		      struct evsel *evsel,
-		      struct arch **parch);
+		      const struct arch **parch);
 
 enum symbol_disassemble_errno {
 	SYMBOL_ANNOTATE_ERRNO__SUCCESS		= 0,
@@ -546,7 +546,7 @@ struct annotated_insn_loc {
 	     i++, op_loc++)
 
 /* Get detailed location info in the instruction */
-int annotate_get_insn_location(struct arch *arch, struct disasm_line *dl,
+int annotate_get_insn_location(const struct arch *arch, struct disasm_line *dl,
 			       struct annotated_insn_loc *loc);
 
 /* Returns a data type from the sample instruction (if any) */
@@ -586,5 +586,5 @@ int annotation_br_cntr_entry(char **str, int br_cntr_nr, u64 *br_cntr,
 			     int num_aggr, struct evsel *evsel);
 int annotation_br_cntr_abbr_list(char **str, struct evsel *evsel, bool header);
 
-int evsel__get_arch(struct evsel *evsel, struct arch **parch);
+int evsel__get_arch(struct evsel *evsel, const struct arch **parch);
 #endif	/* __PERF_ANNOTATE_H */
diff --git a/tools/perf/util/disasm.c b/tools/perf/util/disasm.c
index 924429142631..d92c0424e8fc 100644
--- a/tools/perf/util/disasm.c
+++ b/tools/perf/util/disasm.c
@@ -213,7 +213,7 @@ static void arch__sort(void)
 	qsort(architectures, nmemb, sizeof(struct arch), arch__cmp);
 }
 
-struct arch *arch__find(const char *name)
+const struct arch *arch__find(const char *name)
 {
 	const int nmemb = ARRAY_SIZE(architectures);
 	static bool sorted;
@@ -226,7 +226,7 @@ struct arch *arch__find(const char *name)
 	return bsearch(name, architectures, nmemb, sizeof(struct arch), arch__key_cmp);
 }
 
-bool arch__is(struct arch *arch, const char *name)
+bool arch__is(const struct arch *arch, const char *name)
 {
 	return !strcmp(arch->name, name);
 }
@@ -256,7 +256,7 @@ static int ins__scnprintf(struct ins *ins, char *bf, size_t size,
 	return ins__raw_scnprintf(ins, bf, size, ops, max_ins_name);
 }
 
-bool ins__is_fused(struct arch *arch, const char *ins1, const char *ins2)
+bool ins__is_fused(const struct arch *arch, const char *ins1, const char *ins2)
 {
 	if (!arch || !arch->ins_is_fused)
 		return false;
@@ -264,7 +264,7 @@ bool ins__is_fused(struct arch *arch, const char *ins1, const char *ins2)
 	return arch->ins_is_fused(arch, ins1, ins2);
 }
 
-static int call__parse(struct arch *arch, struct ins_operands *ops, struct map_symbol *ms,
+static int call__parse(const struct arch *arch, struct ins_operands *ops, struct map_symbol *ms,
 		struct disasm_line *dl __maybe_unused)
 {
 	char *endptr, *tok, *name;
@@ -362,7 +362,7 @@ static inline const char *validate_comma(const char *c, struct ins_operands *ops
 	return c;
 }
 
-static int jump__parse(struct arch *arch, struct ins_operands *ops, struct map_symbol *ms,
+static int jump__parse(const struct arch *arch, struct ins_operands *ops, struct map_symbol *ms,
 		struct disasm_line *dl __maybe_unused)
 {
 	struct map *map = ms->map;
@@ -525,7 +525,7 @@ static int comment__symbol(char *raw, char *comment, u64 *addrp, char **namep)
 	return 0;
 }
 
-static int lock__parse(struct arch *arch, struct ins_operands *ops, struct map_symbol *ms,
+static int lock__parse(const struct arch *arch, struct ins_operands *ops, struct map_symbol *ms,
 		struct disasm_line *dl __maybe_unused)
 {
 	ops->locked.ops = zalloc(sizeof(*ops->locked.ops));
@@ -592,7 +592,7 @@ static struct ins_ops lock_ops = {
  * But it doesn't care segment selectors like %gs:0x5678(%rcx), so just check
  * the input string after 'memory_ref_char' if exists.
  */
-static bool check_multi_regs(struct arch *arch, const char *op)
+static bool check_multi_regs(const struct arch *arch, const char *op)
 {
 	int count = 0;
 
@@ -613,8 +613,9 @@ static bool check_multi_regs(struct arch *arch, const char *op)
 	return count > 1;
 }
 
-static int mov__parse(struct arch *arch, struct ins_operands *ops, struct map_symbol *ms __maybe_unused,
-		struct disasm_line *dl __maybe_unused)
+static int mov__parse(const struct arch *arch, struct ins_operands *ops,
+		      struct map_symbol *ms __maybe_unused,
+		      struct disasm_line *dl __maybe_unused)
 {
 	char *s = strchr(ops->raw, ','), *target, *comment, prev;
 
@@ -719,7 +720,7 @@ static int arithmetic__scnprintf(struct ins *ins, char *bf, size_t size,
  * - Add to Zero Extended XO-form ( Ex: addze, addzeo )
  * - Subtract From Zero Extended XO-form ( Ex: subfze )
  */
-static int arithmetic__parse(struct arch *arch __maybe_unused, struct ins_operands *ops,
+static int arithmetic__parse(const struct arch *arch __maybe_unused, struct ins_operands *ops,
 		struct map_symbol *ms __maybe_unused, struct disasm_line *dl)
 {
 	int opcode = PPC_OP(dl->raw.raw_insn);
@@ -756,7 +757,7 @@ static int load_store__scnprintf(struct ins *ins, char *bf, size_t size,
  * used by powerpc and since binary instruction code is used to
  * extract opcode, regs and offset, no other parsing is needed here
  */
-static int load_store__parse(struct arch *arch __maybe_unused, struct ins_operands *ops,
+static int load_store__parse(const struct arch *arch __maybe_unused, struct ins_operands *ops,
 		struct map_symbol *ms __maybe_unused, struct disasm_line *dl __maybe_unused)
 {
 	ops->source.mem_ref = true;
@@ -776,8 +777,9 @@ static struct ins_ops load_store_ops = {
 	.scnprintf = load_store__scnprintf,
 };
 
-static int dec__parse(struct arch *arch __maybe_unused, struct ins_operands *ops, struct map_symbol *ms __maybe_unused,
-		struct disasm_line *dl __maybe_unused)
+static int dec__parse(const struct arch *arch __maybe_unused, struct ins_operands *ops,
+		      struct map_symbol *ms __maybe_unused,
+		      struct disasm_line *dl __maybe_unused)
 {
 	char *target, *comment, *s, prev;
 
@@ -867,7 +869,8 @@ static void ins__sort(struct arch *arch)
 	qsort(arch->instructions, nmemb, sizeof(struct ins), ins__cmp);
 }
 
-static struct ins_ops *__ins__find(struct arch *arch, const char *name, struct disasm_line *dl)
+static struct ins_ops *__ins__find(const struct arch *arch, const char *name,
+				     struct disasm_line *dl)
 {
 	struct ins *ins;
 	const int nmemb = arch->nr_instructions;
@@ -885,8 +888,8 @@ static struct ins_ops *__ins__find(struct arch *arch, const char *name, struct d
 	}
 
 	if (!arch->sorted_instructions) {
-		ins__sort(arch);
-		arch->sorted_instructions = true;
+		ins__sort((struct arch *)arch);
+		((struct arch *)arch)->sorted_instructions = true;
 	}
 
 	ins = bsearch(name, arch->instructions, nmemb, sizeof(struct ins), ins__key_cmp);
@@ -913,17 +916,18 @@ static struct ins_ops *__ins__find(struct arch *arch, const char *name, struct d
 	return ins ? ins->ops : NULL;
 }
 
-struct ins_ops *ins__find(struct arch *arch, const char *name, struct disasm_line *dl)
+struct ins_ops *ins__find(const struct arch *arch, const char *name, struct disasm_line *dl)
 {
 	struct ins_ops *ops = __ins__find(arch, name, dl);
 
 	if (!ops && arch->associate_instruction_ops)
-		ops = arch->associate_instruction_ops(arch, name);
+		ops = arch->associate_instruction_ops((struct arch *)arch, name);
 
 	return ops;
 }
 
-static void disasm_line__init_ins(struct disasm_line *dl, struct arch *arch, struct map_symbol *ms)
+static void disasm_line__init_ins(struct disasm_line *dl, const struct arch *arch,
+				    struct map_symbol *ms)
 {
 	dl->ins.ops = ins__find(arch, dl->ins.name, dl);
 
diff --git a/tools/perf/util/disasm.h b/tools/perf/util/disasm.h
index a3ea9d676281..273a9c906514 100644
--- a/tools/perf/util/disasm.h
+++ b/tools/perf/util/disasm.h
@@ -30,7 +30,7 @@ struct arch {
 	unsigned int	model;
 	unsigned int	family;
 	int		(*init)(struct arch *arch, char *cpuid);
-	bool		(*ins_is_fused)(struct arch *arch, const char *ins1,
+	bool		(*ins_is_fused)(const struct arch *arch, const char *ins1,
 					const char *ins2);
 	struct		{
 		char comment_char;
@@ -89,14 +89,14 @@ struct ins_operands {
 
 struct ins_ops {
 	void (*free)(struct ins_operands *ops);
-	int (*parse)(struct arch *arch, struct ins_operands *ops, struct map_symbol *ms,
+	int (*parse)(const struct arch *arch, struct ins_operands *ops, struct map_symbol *ms,
 			struct disasm_line *dl);
 	int (*scnprintf)(struct ins *ins, char *bf, size_t size,
 			 struct ins_operands *ops, int max_ins_name);
 };
 
 struct annotate_args {
-	struct arch		  *arch;
+	const struct arch	  *arch;
 	struct map_symbol	  *ms;
 	struct annotation_options *options;
 	s64			  offset;
@@ -105,14 +105,14 @@ struct annotate_args {
 	char			  *fileloc;
 };
 
-struct arch *arch__find(const char *name);
-bool arch__is(struct arch *arch, const char *name);
+const struct arch *arch__find(const char *name);
+bool arch__is(const struct arch *arch, const char *name);
 
-struct ins_ops *ins__find(struct arch *arch, const char *name, struct disasm_line *dl);
+struct ins_ops *ins__find(const struct arch *arch, const char *name, struct disasm_line *dl);
 
 bool ins__is_call(const struct ins *ins);
 bool ins__is_jump(const struct ins *ins);
-bool ins__is_fused(struct arch *arch, const char *ins1, const char *ins2);
+bool ins__is_fused(const struct arch *arch, const char *ins1, const char *ins2);
 bool ins__is_ret(const struct ins *ins);
 bool ins__is_lock(const struct ins *ins);
 
-- 
cgit v1.2.3


From 1e3b91d6c53e2b5e01424511d009b6405d8a4152 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Thu, 22 Jan 2026 13:35:09 -0800
Subject: perf disasm: Constify use of 'struct ins_op'
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The 'struct ins_op' holds variables to function pointers that are read
but not written. Change uses to be for a "const struct ins_op *"
version to capture this immutability.

Reviewed-by: James Clark <james.clark@linaro.org>
Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Aditya Bodkhe <aditya.b1@linux.ibm.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Alexandre Ghiti <alex@ghiti.fr>
Cc: Athira Rajeev <atrajeev@linux.ibm.com>
Cc: Bill Wendling <morbo@google.com>
Cc: Dr. David Alan Gilbert <linux@treblig.org>
Cc: Guo Ren <guoren@kernel.org>
Cc: Howard Chu <howardchu95@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Julia Lawall <Julia.Lawall@inria.fr>
Cc: Justin Stitt <justinstitt@google.com>
Cc: Krzysztof Łopatowski <krzysztof.m.lopatowski@gmail.com>
Cc: Leo Yan <leo.yan@linux.dev>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Nathan Chancellor <nathan@kernel.org>
Cc: Nick Desaulniers <nick.desaulniers+lkml@gmail.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <pjw@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sergei Trofimovich <slyich@gmail.com>
Cc: Shimin Guo <shimin.guo@skydio.com>
Cc: Suchit Karunakaran <suchitkarunakaran@gmail.com>
Cc: Thomas Falcon <thomas.falcon@intel.com>
Cc: Tianyou Li <tianyou.li@intel.com>
Cc: Will Deacon <will@kernel.org>
Cc: Zecheng Li <zecheng@google.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/arch/arm/annotate/instructions.c       |  4 +-
 tools/perf/arch/arm64/annotate/instructions.c     |  6 +--
 tools/perf/arch/csky/annotate/instructions.c      |  6 +--
 tools/perf/arch/loongarch/annotate/instructions.c |  8 ++--
 tools/perf/arch/mips/annotate/instructions.c      |  4 +-
 tools/perf/arch/powerpc/annotate/instructions.c   |  6 +--
 tools/perf/arch/riscv64/annotate/instructions.c   |  4 +-
 tools/perf/arch/s390/annotate/instructions.c      |  8 ++--
 tools/perf/arch/sparc/annotate/instructions.c     |  4 +-
 tools/perf/util/disasm.c                          | 46 +++++++++++------------
 tools/perf/util/disasm.h                          |  6 +--
 11 files changed, 51 insertions(+), 51 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/arch/arm/annotate/instructions.c b/tools/perf/arch/arm/annotate/instructions.c
index 5e667b0f5512..b997d127fedd 100644
--- a/tools/perf/arch/arm/annotate/instructions.c
+++ b/tools/perf/arch/arm/annotate/instructions.c
@@ -11,10 +11,10 @@ struct arm_annotate {
 		jump_insn;
 };
 
-static struct ins_ops *arm__associate_instruction_ops(struct arch *arch, const char *name)
+static const struct ins_ops *arm__associate_instruction_ops(struct arch *arch, const char *name)
 {
 	struct arm_annotate *arm = arch->priv;
-	struct ins_ops *ops;
+	const struct ins_ops *ops;
 	regmatch_t match[2];
 
 	if (!regexec(&arm->call_insn, name, 2, match, 0))
diff --git a/tools/perf/arch/arm64/annotate/instructions.c b/tools/perf/arch/arm64/annotate/instructions.c
index 5099fa36180d..363af2f55122 100644
--- a/tools/perf/arch/arm64/annotate/instructions.c
+++ b/tools/perf/arch/arm64/annotate/instructions.c
@@ -63,15 +63,15 @@ out_free_source:
 static int mov__scnprintf(struct ins *ins, char *bf, size_t size,
 			  struct ins_operands *ops, int max_ins_name);
 
-static struct ins_ops arm64_mov_ops = {
+static const struct ins_ops arm64_mov_ops = {
 	.parse	   = arm64_mov__parse,
 	.scnprintf = mov__scnprintf,
 };
 
-static struct ins_ops *arm64__associate_instruction_ops(struct arch *arch, const char *name)
+static const struct ins_ops *arm64__associate_instruction_ops(struct arch *arch, const char *name)
 {
 	struct arm64_annotate *arm = arch->priv;
-	struct ins_ops *ops;
+	const struct ins_ops *ops;
 	regmatch_t match[2];
 
 	if (!regexec(&arm->jump_insn, name, 2, match, 0))
diff --git a/tools/perf/arch/csky/annotate/instructions.c b/tools/perf/arch/csky/annotate/instructions.c
index 14270311d215..4a55c84a320a 100644
--- a/tools/perf/arch/csky/annotate/instructions.c
+++ b/tools/perf/arch/csky/annotate/instructions.c
@@ -3,10 +3,10 @@
 
 #include <linux/compiler.h>
 
-static struct ins_ops *csky__associate_ins_ops(struct arch *arch,
-					       const char *name)
+static const struct ins_ops *csky__associate_ins_ops(struct arch *arch,
+						     const char *name)
 {
-	struct ins_ops *ops = NULL;
+	const struct ins_ops *ops = NULL;
 
 	/* catch all kind of jumps */
 	if (!strcmp(name, "bt") ||
diff --git a/tools/perf/arch/loongarch/annotate/instructions.c b/tools/perf/arch/loongarch/annotate/instructions.c
index 5ebfe629ea68..5010d5d58375 100644
--- a/tools/perf/arch/loongarch/annotate/instructions.c
+++ b/tools/perf/arch/loongarch/annotate/instructions.c
@@ -51,7 +51,7 @@ static int loongarch_call__parse(const struct arch *arch, struct ins_operands *o
 	return 0;
 }
 
-static struct ins_ops loongarch_call_ops = {
+static const struct ins_ops loongarch_call_ops = {
 	.parse	   = loongarch_call__parse,
 	.scnprintf = call__scnprintf,
 };
@@ -100,15 +100,15 @@ static int loongarch_jump__parse(const struct arch *arch, struct ins_operands *o
 	return 0;
 }
 
-static struct ins_ops loongarch_jump_ops = {
+static const struct ins_ops loongarch_jump_ops = {
 	.parse	   = loongarch_jump__parse,
 	.scnprintf = jump__scnprintf,
 };
 
 static
-struct ins_ops *loongarch__associate_ins_ops(struct arch *arch, const char *name)
+const struct ins_ops *loongarch__associate_ins_ops(struct arch *arch, const char *name)
 {
-	struct ins_ops *ops = NULL;
+	const struct ins_ops *ops = NULL;
 
 	if (!strcmp(name, "bl"))
 		ops = &loongarch_call_ops;
diff --git a/tools/perf/arch/mips/annotate/instructions.c b/tools/perf/arch/mips/annotate/instructions.c
index b50b46c613d6..0fbe0a7df95a 100644
--- a/tools/perf/arch/mips/annotate/instructions.c
+++ b/tools/perf/arch/mips/annotate/instructions.c
@@ -1,9 +1,9 @@
 // SPDX-License-Identifier: GPL-2.0
 
 static
-struct ins_ops *mips__associate_ins_ops(struct arch *arch, const char *name)
+const struct ins_ops *mips__associate_ins_ops(struct arch *arch, const char *name)
 {
-	struct ins_ops *ops = NULL;
+	const struct ins_ops *ops = NULL;
 
 	if (!strncmp(name, "bal", 3) ||
 	    !strncmp(name, "bgezal", 6) ||
diff --git a/tools/perf/arch/powerpc/annotate/instructions.c b/tools/perf/arch/powerpc/annotate/instructions.c
index ca567cfdcbdb..d1be55425e35 100644
--- a/tools/perf/arch/powerpc/annotate/instructions.c
+++ b/tools/perf/arch/powerpc/annotate/instructions.c
@@ -1,10 +1,10 @@
 // SPDX-License-Identifier: GPL-2.0
 #include <linux/compiler.h>
 
-static struct ins_ops *powerpc__associate_instruction_ops(struct arch *arch, const char *name)
+static const struct ins_ops *powerpc__associate_instruction_ops(struct arch *arch, const char *name)
 {
 	int i;
-	struct ins_ops *ops;
+	const struct ins_ops *ops;
 
 	/*
 	 * - Interested only if instruction starts with 'b'.
@@ -189,7 +189,7 @@ static int cmp_offset(const void *a, const void *b)
 	return (val1->value - val2->value);
 }
 
-static struct ins_ops *check_ppc_insn(struct disasm_line *dl)
+static const struct ins_ops *check_ppc_insn(struct disasm_line *dl)
 {
 	int raw_insn = dl->raw.raw_insn;
 	int opcode = PPC_OP(raw_insn);
diff --git a/tools/perf/arch/riscv64/annotate/instructions.c b/tools/perf/arch/riscv64/annotate/instructions.c
index 55cf911633f8..a34798864fab 100644
--- a/tools/perf/arch/riscv64/annotate/instructions.c
+++ b/tools/perf/arch/riscv64/annotate/instructions.c
@@ -1,9 +1,9 @@
 // SPDX-License-Identifier: GPL-2.0
 
 static
-struct ins_ops *riscv64__associate_ins_ops(struct arch *arch, const char *name)
+const struct ins_ops *riscv64__associate_ins_ops(struct arch *arch, const char *name)
 {
-	struct ins_ops *ops = NULL;
+	const struct ins_ops *ops = NULL;
 
 	if (!strncmp(name, "jal", 3) ||
 	    !strncmp(name, "jr", 2) ||
diff --git a/tools/perf/arch/s390/annotate/instructions.c b/tools/perf/arch/s390/annotate/instructions.c
index 37c1b62641d8..1b22e6276e7d 100644
--- a/tools/perf/arch/s390/annotate/instructions.c
+++ b/tools/perf/arch/s390/annotate/instructions.c
@@ -49,7 +49,7 @@ static int s390_call__parse(const struct arch *arch, struct ins_operands *ops,
 	return 0;
 }
 
-static struct ins_ops s390_call_ops = {
+static const struct ins_ops s390_call_ops = {
 	.parse	   = s390_call__parse,
 	.scnprintf = call__scnprintf,
 };
@@ -103,14 +103,14 @@ out_free_source:
 }
 
 
-static struct ins_ops s390_mov_ops = {
+static const struct ins_ops s390_mov_ops = {
 	.parse	   = s390_mov__parse,
 	.scnprintf = mov__scnprintf,
 };
 
-static struct ins_ops *s390__associate_ins_ops(struct arch *arch, const char *name)
+static const struct ins_ops *s390__associate_ins_ops(struct arch *arch, const char *name)
 {
-	struct ins_ops *ops = NULL;
+	const struct ins_ops *ops = NULL;
 
 	/* catch all kind of jumps */
 	if (strchr(name, 'j') ||
diff --git a/tools/perf/arch/sparc/annotate/instructions.c b/tools/perf/arch/sparc/annotate/instructions.c
index 68c31580ccfc..a08d8734c883 100644
--- a/tools/perf/arch/sparc/annotate/instructions.c
+++ b/tools/perf/arch/sparc/annotate/instructions.c
@@ -117,9 +117,9 @@ static int is_branch_float_cond(const char *cond)
 	return 0;
 }
 
-static struct ins_ops *sparc__associate_instruction_ops(struct arch *arch, const char *name)
+static const struct ins_ops *sparc__associate_instruction_ops(struct arch *arch, const char *name)
 {
-	struct ins_ops *ops = NULL;
+	const struct ins_ops *ops = NULL;
 
 	if (!strcmp(name, "call") ||
 	    !strcmp(name, "jmp") ||
diff --git a/tools/perf/util/disasm.c b/tools/perf/util/disasm.c
index d92c0424e8fc..9bc9b1de98db 100644
--- a/tools/perf/util/disasm.c
+++ b/tools/perf/util/disasm.c
@@ -33,15 +33,15 @@
 static regex_t	 file_lineno;
 
 /* These can be referred from the arch-dependent code */
-static struct ins_ops call_ops;
-static struct ins_ops dec_ops;
-static struct ins_ops jump_ops;
-static struct ins_ops mov_ops;
-static struct ins_ops nop_ops;
-static struct ins_ops lock_ops;
-static struct ins_ops ret_ops;
-static struct ins_ops load_store_ops;
-static struct ins_ops arithmetic_ops;
+static const struct ins_ops call_ops;
+static const struct ins_ops dec_ops;
+static const struct ins_ops jump_ops;
+static const struct ins_ops mov_ops;
+static const struct ins_ops nop_ops;
+static const struct ins_ops lock_ops;
+static const struct ins_ops ret_ops;
+static const struct ins_ops load_store_ops;
+static const struct ins_ops arithmetic_ops;
 
 static int jump__scnprintf(struct ins *ins, char *bf, size_t size,
 			   struct ins_operands *ops, int max_ins_name);
@@ -85,7 +85,7 @@ grow_from_non_allocated_table:
 	goto out_update_instructions;
 }
 
-static int arch__associate_ins_ops(struct arch* arch, const char *name, struct ins_ops *ops)
+static int arch__associate_ins_ops(struct arch *arch, const char *name, const struct ins_ops *ops)
 {
 	struct ins *ins;
 
@@ -334,7 +334,7 @@ static int call__scnprintf(struct ins *ins, char *bf, size_t size,
 	return scnprintf(bf, size, "%-*s *%" PRIx64, max_ins_name, ins->name, ops->target.addr);
 }
 
-static struct ins_ops call_ops = {
+static const struct ins_ops call_ops = {
 	.parse	   = call__parse,
 	.scnprintf = call__scnprintf,
 };
@@ -487,7 +487,7 @@ static void jump__delete(struct ins_operands *ops __maybe_unused)
 	 */
 }
 
-static struct ins_ops jump_ops = {
+static const struct ins_ops jump_ops = {
 	.free	   = jump__delete,
 	.parse	   = jump__parse,
 	.scnprintf = jump__scnprintf,
@@ -579,7 +579,7 @@ static void lock__delete(struct ins_operands *ops)
 	zfree(&ops->target.name);
 }
 
-static struct ins_ops lock_ops = {
+static const struct ins_ops lock_ops = {
 	.free	   = lock__delete,
 	.parse	   = lock__parse,
 	.scnprintf = lock__scnprintf,
@@ -688,7 +688,7 @@ static int mov__scnprintf(struct ins *ins, char *bf, size_t size,
 			 ops->target.name ?: ops->target.raw);
 }
 
-static struct ins_ops mov_ops = {
+static const struct ins_ops mov_ops = {
 	.parse	   = mov__parse,
 	.scnprintf = mov__scnprintf,
 };
@@ -738,7 +738,7 @@ static int arithmetic__parse(const struct arch *arch __maybe_unused, struct ins_
 	return 0;
 }
 
-static struct ins_ops arithmetic_ops = {
+static const struct ins_ops arithmetic_ops = {
 	.parse     = arithmetic__parse,
 	.scnprintf = arithmetic__scnprintf,
 };
@@ -772,7 +772,7 @@ static int load_store__parse(const struct arch *arch __maybe_unused, struct ins_
 	return 0;
 }
 
-static struct ins_ops load_store_ops = {
+static const struct ins_ops load_store_ops = {
 	.parse     = load_store__parse,
 	.scnprintf = load_store__scnprintf,
 };
@@ -813,7 +813,7 @@ static int dec__scnprintf(struct ins *ins, char *bf, size_t size,
 			 ops->target.name ?: ops->target.raw);
 }
 
-static struct ins_ops dec_ops = {
+static const struct ins_ops dec_ops = {
 	.parse	   = dec__parse,
 	.scnprintf = dec__scnprintf,
 };
@@ -824,11 +824,11 @@ static int nop__scnprintf(struct ins *ins __maybe_unused, char *bf, size_t size,
 	return scnprintf(bf, size, "%-*s", max_ins_name, "nop");
 }
 
-static struct ins_ops nop_ops = {
+static const struct ins_ops nop_ops = {
 	.scnprintf = nop__scnprintf,
 };
 
-static struct ins_ops ret_ops = {
+static const struct ins_ops ret_ops = {
 	.scnprintf = ins__raw_scnprintf,
 };
 
@@ -869,7 +869,7 @@ static void ins__sort(struct arch *arch)
 	qsort(arch->instructions, nmemb, sizeof(struct ins), ins__cmp);
 }
 
-static struct ins_ops *__ins__find(const struct arch *arch, const char *name,
+static const struct ins_ops *__ins__find(const struct arch *arch, const char *name,
 				     struct disasm_line *dl)
 {
 	struct ins *ins;
@@ -880,7 +880,7 @@ static struct ins_ops *__ins__find(const struct arch *arch, const char *name,
 		 * For powerpc, identify the instruction ops
 		 * from the opcode using raw_insn.
 		 */
-		struct ins_ops *ops;
+		const struct ins_ops *ops;
 
 		ops = check_ppc_insn(dl);
 		if (ops)
@@ -916,9 +916,9 @@ static struct ins_ops *__ins__find(const struct arch *arch, const char *name,
 	return ins ? ins->ops : NULL;
 }
 
-struct ins_ops *ins__find(const struct arch *arch, const char *name, struct disasm_line *dl)
+const struct ins_ops *ins__find(const struct arch *arch, const char *name, struct disasm_line *dl)
 {
-	struct ins_ops *ops = __ins__find(arch, name, dl);
+	const struct ins_ops *ops = __ins__find(arch, name, dl);
 
 	if (!ops && arch->associate_instruction_ops)
 		ops = arch->associate_instruction_ops((struct arch *)arch, name);
diff --git a/tools/perf/util/disasm.h b/tools/perf/util/disasm.h
index 273a9c906514..dc5233f2a773 100644
--- a/tools/perf/util/disasm.h
+++ b/tools/perf/util/disasm.h
@@ -22,7 +22,7 @@ struct arch {
 	struct ins	*instructions;
 	size_t		nr_instructions;
 	size_t		nr_instructions_allocated;
-	struct ins_ops  *(*associate_instruction_ops)(struct arch *arch, const char *name);
+	const struct ins_ops  *(*associate_instruction_ops)(struct arch *arch, const char *name);
 	bool		sorted_instructions;
 	bool		initialized;
 	const char	*insn_suffix;
@@ -52,7 +52,7 @@ struct arch {
 
 struct ins {
 	const char     *name;
-	struct ins_ops *ops;
+	const struct ins_ops *ops;
 };
 
 struct ins_operands {
@@ -108,7 +108,7 @@ struct annotate_args {
 const struct arch *arch__find(const char *name);
 bool arch__is(const struct arch *arch, const char *name);
 
-struct ins_ops *ins__find(const struct arch *arch, const char *name, struct disasm_line *dl);
+const struct ins_ops *ins__find(const struct arch *arch, const char *name, struct disasm_line *dl);
 
 bool ins__is_call(const struct ins *ins);
 bool ins__is_jump(const struct ins *ins);
-- 
cgit v1.2.3


From 2a1ca20d0b586d582e56fcb615b27045834d415a Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Thu, 22 Jan 2026 13:35:10 -0800
Subject: perf disasm: Constify use of 'struct ins'
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The 'struct ins' holds variables that are read but not written, except
during some initialization.

Change most uses to be for a "const struct ins *" version to capture
this immutability.

So the x86__instructions can be const pre-sort it and make the sorted
variable true.

Reviewed-by: James Clark <james.clark@linaro.org>
Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Aditya Bodkhe <aditya.b1@linux.ibm.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Alexandre Ghiti <alex@ghiti.fr>
Cc: Athira Rajeev <atrajeev@linux.ibm.com>
Cc: Bill Wendling <morbo@google.com>
Cc: Dr. David Alan Gilbert <linux@treblig.org>
Cc: Guo Ren <guoren@kernel.org>
Cc: Howard Chu <howardchu95@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Julia Lawall <Julia.Lawall@inria.fr>
Cc: Justin Stitt <justinstitt@google.com>
Cc: Krzysztof Łopatowski <krzysztof.m.lopatowski@gmail.com>
Cc: Leo Yan <leo.yan@linux.dev>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Nathan Chancellor <nathan@kernel.org>
Cc: Nick Desaulniers <nick.desaulniers+lkml@gmail.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <pjw@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sergei Trofimovich <slyich@gmail.com>
Cc: Shimin Guo <shimin.guo@skydio.com>
Cc: Suchit Karunakaran <suchitkarunakaran@gmail.com>
Cc: Thomas Falcon <thomas.falcon@intel.com>
Cc: Tianyou Li <tianyou.li@intel.com>
Cc: Will Deacon <will@kernel.org>
Cc: Zecheng Li <zecheng@google.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/arch/arm64/annotate/instructions.c |  2 +-
 tools/perf/arch/x86/annotate/instructions.c   | 26 +++++++++++++++++-----
 tools/perf/util/disasm.c                      | 32 ++++++++++++++-------------
 tools/perf/util/disasm.h                      |  4 ++--
 4 files changed, 40 insertions(+), 24 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/arch/arm64/annotate/instructions.c b/tools/perf/arch/arm64/annotate/instructions.c
index 363af2f55122..44db33854dba 100644
--- a/tools/perf/arch/arm64/annotate/instructions.c
+++ b/tools/perf/arch/arm64/annotate/instructions.c
@@ -60,7 +60,7 @@ out_free_source:
 	return -1;
 }
 
-static int mov__scnprintf(struct ins *ins, char *bf, size_t size,
+static int mov__scnprintf(const struct ins *ins, char *bf, size_t size,
 			  struct ins_operands *ops, int max_ins_name);
 
 static const struct ins_ops arm64_mov_ops = {
diff --git a/tools/perf/arch/x86/annotate/instructions.c b/tools/perf/arch/x86/annotate/instructions.c
index 24b388bacdae..ffca3029388b 100644
--- a/tools/perf/arch/x86/annotate/instructions.c
+++ b/tools/perf/arch/x86/annotate/instructions.c
@@ -7,7 +7,7 @@
  * So this table should not have entries with the suffix unless it's
  * a complete different instruction than ones without the suffix.
  */
-static struct ins x86__instructions[] = {
+static const struct ins x86__instructions[] = {
 	{ .name = "adc",	.ops = &mov_ops,  },
 	{ .name = "add",	.ops = &mov_ops,  },
 	{ .name = "addsd",	.ops = &mov_ops,  },
@@ -19,9 +19,9 @@ static struct ins x86__instructions[] = {
 	{ .name = "btr",	.ops = &mov_ops,  },
 	{ .name = "bts",	.ops = &mov_ops,  },
 	{ .name = "call",	.ops = &call_ops, },
+	{ .name = "cmovae",	.ops = &mov_ops,  },
 	{ .name = "cmovbe",	.ops = &mov_ops,  },
 	{ .name = "cmove",	.ops = &mov_ops,  },
-	{ .name = "cmovae",	.ops = &mov_ops,  },
 	{ .name = "cmp",	.ops = &mov_ops,  },
 	{ .name = "cmpxch",	.ops = &mov_ops,  },
 	{ .name = "cmpxchg",	.ops = &mov_ops,  },
@@ -73,23 +73,23 @@ static struct ins x86__instructions[] = {
 	{ .name = "movaps",	.ops = &mov_ops,  },
 	{ .name = "movdqa",	.ops = &mov_ops,  },
 	{ .name = "movdqu",	.ops = &mov_ops,  },
+	{ .name = "movsb",	.ops = &mov_ops,  },
 	{ .name = "movsd",	.ops = &mov_ops,  },
+	{ .name = "movsl",	.ops = &mov_ops,  },
 	{ .name = "movss",	.ops = &mov_ops,  },
-	{ .name = "movsb",	.ops = &mov_ops,  },
 	{ .name = "movsw",	.ops = &mov_ops,  },
-	{ .name = "movsl",	.ops = &mov_ops,  },
 	{ .name = "movupd",	.ops = &mov_ops,  },
 	{ .name = "movups",	.ops = &mov_ops,  },
 	{ .name = "movzb",	.ops = &mov_ops,  },
-	{ .name = "movzw",	.ops = &mov_ops,  },
 	{ .name = "movzl",	.ops = &mov_ops,  },
+	{ .name = "movzw",	.ops = &mov_ops,  },
 	{ .name = "mulsd",	.ops = &mov_ops,  },
 	{ .name = "mulss",	.ops = &mov_ops,  },
 	{ .name = "nop",	.ops = &nop_ops,  },
 	{ .name = "or",		.ops = &mov_ops,  },
 	{ .name = "orps",	.ops = &mov_ops,  },
-	{ .name = "pand",	.ops = &mov_ops,  },
 	{ .name = "paddq",	.ops = &mov_ops,  },
+	{ .name = "pand",	.ops = &mov_ops,  },
 	{ .name = "pcmpeqb",	.ops = &mov_ops,  },
 	{ .name = "por",	.ops = &mov_ops,  },
 	{ .name = "rcl",	.ops = &mov_ops,  },
@@ -202,6 +202,20 @@ static int x86__annotate_init(struct arch *arch, char *cpuid)
 		if (x86__cpuid_parse(arch, cpuid))
 			err = SYMBOL_ANNOTATE_ERRNO__ARCH_INIT_CPUID_PARSING;
 	}
+
+#ifndef NDEBUG
+	{
+		static bool sorted_check;
+
+		if (!sorted_check) {
+			for (size_t i = 0; i < arch->nr_instructions - 1; i++) {
+				assert(strcmp(arch->instructions[i].name,
+					      arch->instructions[i + 1].name) <= 0);
+			}
+			sorted_check = true;
+		}
+	}
+#endif
 	arch->e_machine = EM_X86_64;
 	arch->e_flags = 0;
 	arch->initialized = true;
diff --git a/tools/perf/util/disasm.c b/tools/perf/util/disasm.c
index 9bc9b1de98db..2793697ce75c 100644
--- a/tools/perf/util/disasm.c
+++ b/tools/perf/util/disasm.c
@@ -43,9 +43,9 @@ static const struct ins_ops ret_ops;
 static const struct ins_ops load_store_ops;
 static const struct ins_ops arithmetic_ops;
 
-static int jump__scnprintf(struct ins *ins, char *bf, size_t size,
+static int jump__scnprintf(const struct ins *ins, char *bf, size_t size,
 			   struct ins_operands *ops, int max_ins_name);
-static int call__scnprintf(struct ins *ins, char *bf, size_t size,
+static int call__scnprintf(const struct ins *ins, char *bf, size_t size,
 			   struct ins_operands *ops, int max_ins_name);
 
 static void ins__sort(struct arch *arch);
@@ -66,7 +66,8 @@ static int arch__grow_instructions(struct arch *arch)
 		goto grow_from_non_allocated_table;
 
 	new_nr_allocated = arch->nr_instructions_allocated + 128;
-	new_instructions = realloc(arch->instructions, new_nr_allocated * sizeof(struct ins));
+	new_instructions = realloc((void *)arch->instructions,
+				   new_nr_allocated * sizeof(struct ins));
 	if (new_instructions == NULL)
 		return -1;
 
@@ -93,7 +94,7 @@ static int arch__associate_ins_ops(struct arch *arch, const char *name, const st
 	    arch__grow_instructions(arch))
 		return -1;
 
-	ins = &arch->instructions[arch->nr_instructions];
+	ins = (struct ins *)&arch->instructions[arch->nr_instructions];
 	ins->name = strdup(name);
 	if (!ins->name)
 		return -1;
@@ -146,6 +147,7 @@ static struct arch architectures[] = {
 		.init = x86__annotate_init,
 		.instructions = x86__instructions,
 		.nr_instructions = ARRAY_SIZE(x86__instructions),
+		.sorted_instructions = true,
 		.insn_suffix = "bwlq",
 		.objdump =  {
 			.comment_char = '#',
@@ -241,13 +243,13 @@ static void ins_ops__delete(struct ins_operands *ops)
 	zfree(&ops->target.name);
 }
 
-static int ins__raw_scnprintf(struct ins *ins, char *bf, size_t size,
+static int ins__raw_scnprintf(const struct ins *ins, char *bf, size_t size,
 			      struct ins_operands *ops, int max_ins_name)
 {
 	return scnprintf(bf, size, "%-*s %s", max_ins_name, ins->name, ops->raw);
 }
 
-static int ins__scnprintf(struct ins *ins, char *bf, size_t size,
+static int ins__scnprintf(const struct ins *ins, char *bf, size_t size,
 			  struct ins_operands *ops, int max_ins_name)
 {
 	if (ins->ops->scnprintf)
@@ -319,7 +321,7 @@ indirect_call:
 	goto find_target;
 }
 
-static int call__scnprintf(struct ins *ins, char *bf, size_t size,
+static int call__scnprintf(const struct ins *ins, char *bf, size_t size,
 			   struct ins_operands *ops, int max_ins_name)
 {
 	if (ops->target.sym)
@@ -446,7 +448,7 @@ static int jump__parse(const struct arch *arch, struct ins_operands *ops, struct
 	return 0;
 }
 
-static int jump__scnprintf(struct ins *ins, char *bf, size_t size,
+static int jump__scnprintf(const struct ins *ins, char *bf, size_t size,
 			   struct ins_operands *ops, int max_ins_name)
 {
 	const char *c;
@@ -551,7 +553,7 @@ out_free_ops:
 	return 0;
 }
 
-static int lock__scnprintf(struct ins *ins, char *bf, size_t size,
+static int lock__scnprintf(const struct ins *ins, char *bf, size_t size,
 			   struct ins_operands *ops, int max_ins_name)
 {
 	int printed;
@@ -680,7 +682,7 @@ out_free_source:
 	return -1;
 }
 
-static int mov__scnprintf(struct ins *ins, char *bf, size_t size,
+static int mov__scnprintf(const struct ins *ins, char *bf, size_t size,
 			   struct ins_operands *ops, int max_ins_name)
 {
 	return scnprintf(bf, size, "%-*s %s,%s", max_ins_name, ins->name,
@@ -699,7 +701,7 @@ static const struct ins_ops mov_ops = {
 #define	ADD_ZERO_EXT_XO_FORM	202
 #define	SUB_ZERO_EXT_XO_FORM	200
 
-static int arithmetic__scnprintf(struct ins *ins, char *bf, size_t size,
+static int arithmetic__scnprintf(const struct ins *ins, char *bf, size_t size,
 		struct ins_operands *ops, int max_ins_name)
 {
 	return scnprintf(bf, size, "%-*s %s", max_ins_name, ins->name,
@@ -743,7 +745,7 @@ static const struct ins_ops arithmetic_ops = {
 	.scnprintf = arithmetic__scnprintf,
 };
 
-static int load_store__scnprintf(struct ins *ins, char *bf, size_t size,
+static int load_store__scnprintf(const struct ins *ins, char *bf, size_t size,
 		struct ins_operands *ops, int max_ins_name)
 {
 	return scnprintf(bf, size, "%-*s %s", max_ins_name, ins->name,
@@ -806,7 +808,7 @@ static int dec__parse(const struct arch *arch __maybe_unused, struct ins_operand
 	return 0;
 }
 
-static int dec__scnprintf(struct ins *ins, char *bf, size_t size,
+static int dec__scnprintf(const struct ins *ins, char *bf, size_t size,
 			   struct ins_operands *ops, int max_ins_name)
 {
 	return scnprintf(bf, size, "%-*s %s", max_ins_name, ins->name,
@@ -818,7 +820,7 @@ static const struct ins_ops dec_ops = {
 	.scnprintf = dec__scnprintf,
 };
 
-static int nop__scnprintf(struct ins *ins __maybe_unused, char *bf, size_t size,
+static int nop__scnprintf(const struct ins *ins __maybe_unused, char *bf, size_t size,
 			  struct ins_operands *ops __maybe_unused, int max_ins_name)
 {
 	return scnprintf(bf, size, "%-*s", max_ins_name, "nop");
@@ -866,7 +868,7 @@ static void ins__sort(struct arch *arch)
 {
 	const int nmemb = arch->nr_instructions;
 
-	qsort(arch->instructions, nmemb, sizeof(struct ins), ins__cmp);
+	qsort((void *)arch->instructions, nmemb, sizeof(struct ins), ins__cmp);
 }
 
 static const struct ins_ops *__ins__find(const struct arch *arch, const char *name,
diff --git a/tools/perf/util/disasm.h b/tools/perf/util/disasm.h
index dc5233f2a773..4f5c9a985786 100644
--- a/tools/perf/util/disasm.h
+++ b/tools/perf/util/disasm.h
@@ -19,7 +19,7 @@ struct disasm_line;
 
 struct arch {
 	const char	*name;
-	struct ins	*instructions;
+	const struct ins	*instructions;
 	size_t		nr_instructions;
 	size_t		nr_instructions_allocated;
 	const struct ins_ops  *(*associate_instruction_ops)(struct arch *arch, const char *name);
@@ -91,7 +91,7 @@ struct ins_ops {
 	void (*free)(struct ins_operands *ops);
 	int (*parse)(const struct arch *arch, struct ins_operands *ops, struct map_symbol *ms,
 			struct disasm_line *dl);
-	int (*scnprintf)(struct ins *ins, char *bf, size_t size,
+	int (*scnprintf)(const struct ins *ins, char *bf, size_t size,
 			 struct ins_operands *ops, int max_ins_name);
 };
 
-- 
cgit v1.2.3


From 9273085273103e5994952dc2725f1f0109af97d1 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Thu, 22 Jan 2026 13:35:11 -0800
Subject: perf disasm: Rework the string arch__is to use the ELF machine
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add new arch__is_x86 and arch__is_powerpc functions that avoid string
comparisons and use the ELF machine.

Remove arch__is() that is no longer used.

Reviewed-by: James Clark <james.clark@linaro.org>
Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Aditya Bodkhe <aditya.b1@linux.ibm.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Alexandre Ghiti <alex@ghiti.fr>
Cc: Athira Rajeev <atrajeev@linux.ibm.com>
Cc: Bill Wendling <morbo@google.com>
Cc: Dr. David Alan Gilbert <linux@treblig.org>
Cc: Guo Ren <guoren@kernel.org>
Cc: Howard Chu <howardchu95@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Julia Lawall <Julia.Lawall@inria.fr>
Cc: Justin Stitt <justinstitt@google.com>
Cc: Krzysztof Łopatowski <krzysztof.m.lopatowski@gmail.com>
Cc: Leo Yan <leo.yan@linux.dev>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Nathan Chancellor <nathan@kernel.org>
Cc: Nick Desaulniers <nick.desaulniers+lkml@gmail.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <pjw@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sergei Trofimovich <slyich@gmail.com>
Cc: Shimin Guo <shimin.guo@skydio.com>
Cc: Suchit Karunakaran <suchitkarunakaran@gmail.com>
Cc: Thomas Falcon <thomas.falcon@intel.com>
Cc: Tianyou Li <tianyou.li@intel.com>
Cc: Will Deacon <will@kernel.org>
Cc: Zecheng Li <zecheng@google.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/annotate-data.c | 10 +++++-----
 tools/perf/util/annotate.c      | 16 ++++++++--------
 tools/perf/util/capstone.c      |  2 +-
 tools/perf/util/disasm.c        | 15 ++++++++++-----
 tools/perf/util/disasm.h        |  3 ++-
 tools/perf/util/llvm.c          |  2 +-
 6 files changed, 27 insertions(+), 21 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c
index edfcd6e9df9c..44fbd41e3845 100644
--- a/tools/perf/util/annotate-data.c
+++ b/tools/perf/util/annotate-data.c
@@ -165,7 +165,7 @@ static void init_type_state(struct type_state *state, const struct arch *arch)
 	memset(state, 0, sizeof(*state));
 	INIT_LIST_HEAD(&state->stack_vars);
 
-	if (arch__is(arch, "x86")) {
+	if (arch__is_x86(arch)) {
 		state->regs[0].caller_saved = true;
 		state->regs[1].caller_saved = true;
 		state->regs[2].caller_saved = true;
@@ -526,7 +526,7 @@ static enum type_match_result check_variable(struct data_loc_info *dloc,
 		needs_pointer = false;
 	else if (reg == dloc->fbreg || is_fbreg)
 		needs_pointer = false;
-	else if (arch__is(dloc->arch, "x86") && reg == X86_REG_SP)
+	else if (arch__is_x86(dloc->arch) && reg == X86_REG_SP)
 		needs_pointer = false;
 
 	/* Get the type of the variable */
@@ -1071,7 +1071,7 @@ static void delete_var_types(struct die_var_type *var_types)
 /* should match to is_stack_canary() in util/annotate.c */
 static void setup_stack_canary(struct data_loc_info *dloc)
 {
-	if (arch__is(dloc->arch, "x86")) {
+	if (arch__is_x86(dloc->arch)) {
 		dloc->op->segment = INSN_SEG_X86_GS;
 		dloc->op->imm = true;
 		dloc->op->offset = 40;
@@ -1311,7 +1311,7 @@ check_kernel:
 
 		/* Direct this-cpu access like "%gs:0x34740" */
 		if (dloc->op->segment == INSN_SEG_X86_GS && dloc->op->imm &&
-		    arch__is(dloc->arch, "x86")) {
+		    arch__is_x86(dloc->arch)) {
 			pr_debug_dtp("this-cpu var");
 
 			addr = dloc->op->offset;
@@ -1397,7 +1397,7 @@ out:
 
 static int arch_supports_insn_tracking(struct data_loc_info *dloc)
 {
-	if ((arch__is(dloc->arch, "x86")) || (arch__is(dloc->arch, "powerpc")))
+	if ((arch__is_x86(dloc->arch)) || (arch__is_powerpc(dloc->arch)))
 		return 1;
 	return 0;
 }
diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c
index 132af2556aec..79702072568b 100644
--- a/tools/perf/util/annotate.c
+++ b/tools/perf/util/annotate.c
@@ -2474,7 +2474,7 @@ static int extract_reg_offset(const struct arch *arch, const char *str,
 	 * %gs:0x18(%rbx).  In that case it should skip the part.
 	 */
 	if (*str == arch->objdump.register_char) {
-		if (arch__is(arch, "x86")) {
+		if (arch__is_x86(arch)) {
 			/* FIXME: Handle other segment registers */
 			if (!strncmp(str, "%gs:", 4))
 				op_loc->segment = INSN_SEG_X86_GS;
@@ -2571,7 +2571,7 @@ int annotate_get_insn_location(const struct arch *arch, struct disasm_line *dl,
 		op_loc->reg2 = -1;
 
 		if (insn_str == NULL) {
-			if (!arch__is(arch, "powerpc"))
+			if (!arch__is_powerpc(arch))
 				continue;
 		}
 
@@ -2580,7 +2580,7 @@ int annotate_get_insn_location(const struct arch *arch, struct disasm_line *dl,
 		 * required fields for op_loc, ie reg1, reg2, offset from the
 		 * raw instruction.
 		 */
-		if (arch__is(arch, "powerpc")) {
+		if (arch__is_powerpc(arch)) {
 			op_loc->mem_ref = mem_ref;
 			op_loc->multi_regs = multi_regs;
 			get_powerpc_regs(dl->raw.raw_insn, !i, op_loc);
@@ -2591,7 +2591,7 @@ int annotate_get_insn_location(const struct arch *arch, struct disasm_line *dl,
 		} else {
 			char *s, *p = NULL;
 
-			if (arch__is(arch, "x86")) {
+			if (arch__is_x86(arch)) {
 				/* FIXME: Handle other segment registers */
 				if (!strncmp(insn_str, "%gs:", 4)) {
 					op_loc->segment = INSN_SEG_X86_GS;
@@ -2675,7 +2675,7 @@ static struct annotated_item_stat *annotate_data_stat(struct list_head *head,
 
 static bool is_stack_operation(const struct arch *arch, struct disasm_line *dl)
 {
-	if (arch__is(arch, "x86")) {
+	if (arch__is_x86(arch)) {
 		if (!strncmp(dl->ins.name, "push", 4) ||
 		    !strncmp(dl->ins.name, "pop", 3) ||
 		    !strncmp(dl->ins.name, "call", 4) ||
@@ -2689,7 +2689,7 @@ static bool is_stack_operation(const struct arch *arch, struct disasm_line *dl)
 static bool is_stack_canary(const struct arch *arch, struct annotated_op_loc *loc)
 {
 	/* On x86_64, %gs:40 is used for stack canary */
-	if (arch__is(arch, "x86")) {
+	if (arch__is_x86(arch)) {
 		if (loc->segment == INSN_SEG_X86_GS && loc->imm &&
 		    loc->offset == 40)
 			return true;
@@ -2704,7 +2704,7 @@ static bool is_stack_canary(const struct arch *arch, struct annotated_op_loc *lo
  */
 static bool is_address_gen_insn(const struct arch *arch, struct disasm_line *dl)
 {
-	if (arch__is(arch, "x86")) {
+	if (arch__is_x86(arch)) {
 		if (!strncmp(dl->ins.name, "lea", 3))
 			return true;
 	}
@@ -2847,7 +2847,7 @@ __hist_entry__get_data_type(struct hist_entry *he, const struct arch *arch,
 		}
 
 		/* This CPU access in kernel - pretend PC-relative addressing */
-		if (dso__kernel(map__dso(ms->map)) && arch__is(arch, "x86") &&
+		if (dso__kernel(map__dso(ms->map)) && arch__is_x86(arch) &&
 		    op_loc->segment == INSN_SEG_X86_GS && op_loc->imm) {
 			dloc.var_addr = op_loc->offset;
 			op_loc->reg1 = DWARF_REG_PC;
diff --git a/tools/perf/util/capstone.c b/tools/perf/util/capstone.c
index 2c7feab61b7b..ce06cfd253ef 100644
--- a/tools/perf/util/capstone.c
+++ b/tools/perf/util/capstone.c
@@ -147,7 +147,7 @@ static void print_capstone_detail(cs_insn *insn, char *buf, size_t len,
 	struct symbol *sym;
 
 	/* TODO: support more architectures */
-	if (!arch__is(args->arch, "x86"))
+	if (!arch__is_x86(args->arch))
 		return;
 
 	if (insn->detail == NULL)
diff --git a/tools/perf/util/disasm.c b/tools/perf/util/disasm.c
index 2793697ce75c..b7523256c4ad 100644
--- a/tools/perf/util/disasm.c
+++ b/tools/perf/util/disasm.c
@@ -228,9 +228,14 @@ const struct arch *arch__find(const char *name)
 	return bsearch(name, architectures, nmemb, sizeof(struct arch), arch__key_cmp);
 }
 
-bool arch__is(const struct arch *arch, const char *name)
+bool arch__is_x86(const struct arch *arch)
 {
-	return !strcmp(arch->name, name);
+	return arch->e_machine == EM_386 || arch->e_machine == EM_X86_64;
+}
+
+bool arch__is_powerpc(const struct arch *arch)
+{
+	return arch->e_machine == EM_PPC || arch->e_machine == EM_PPC64;
 }
 
 static void ins_ops__delete(struct ins_operands *ops)
@@ -877,7 +882,7 @@ static const struct ins_ops *__ins__find(const struct arch *arch, const char *na
 	struct ins *ins;
 	const int nmemb = arch->nr_instructions;
 
-	if (arch__is(arch, "powerpc")) {
+	if (arch__is_powerpc(arch)) {
 		/*
 		 * For powerpc, identify the instruction ops
 		 * from the opcode using raw_insn.
@@ -1066,7 +1071,7 @@ struct disasm_line *disasm_line__new(struct annotate_args *args)
 		goto out_delete;
 
 	if (args->offset != -1) {
-		if (arch__is(args->arch, "powerpc")) {
+		if (arch__is_powerpc(args->arch)) {
 			if (disasm_line__parse_powerpc(dl, args) < 0)
 				goto out_free_line;
 		} else if (disasm_line__parse(dl->al.line, &dl->ins.name, &dl->ops.raw) < 0)
@@ -1700,7 +1705,7 @@ int symbol__disassemble(struct symbol *sym, struct annotate_args *args)
 	 * and typeoff, disassemble to mnemonic notation is not required in
 	 * case of powerpc.
 	 */
-	if (arch__is(args->arch, "powerpc")) {
+	if (arch__is_powerpc(args->arch)) {
 		extern const char *sort_order;
 
 		if (sort_order && !strstr(sort_order, "sym")) {
diff --git a/tools/perf/util/disasm.h b/tools/perf/util/disasm.h
index 4f5c9a985786..db7f1ee3d8e7 100644
--- a/tools/perf/util/disasm.h
+++ b/tools/perf/util/disasm.h
@@ -106,7 +106,8 @@ struct annotate_args {
 };
 
 const struct arch *arch__find(const char *name);
-bool arch__is(const struct arch *arch, const char *name);
+bool arch__is_x86(const struct arch *arch);
+bool arch__is_powerpc(const struct arch *arch);
 
 const struct ins_ops *ins__find(const struct arch *arch, const char *name, struct disasm_line *dl);
 
diff --git a/tools/perf/util/llvm.c b/tools/perf/util/llvm.c
index 4ada9a10bd93..0d126d233c01 100644
--- a/tools/perf/util/llvm.c
+++ b/tools/perf/util/llvm.c
@@ -146,7 +146,7 @@ int symbol__disassemble_llvm(const char *filename, struct symbol *sym,
 		return errno;
 
 	init_llvm();
-	if (arch__is(args->arch, "x86")) {
+	if (arch__is_x86(args->arch)) {
 		const char *triplet = is_64bit ? "x86_64-pc-linux" : "i686-pc-linux";
 
 		disasm = LLVMCreateDisasm(triplet, &storage, /*tag_type=*/0,
-- 
cgit v1.2.3


From 07b972ff09f45cfb7acd20cd9b3769c6975bc434 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Thu, 22 Jan 2026 13:35:12 -0800
Subject: perf disasm: Don't include C files from the arch directory
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Move the arch instructions.c files into appropriately named files in
annotate-arch in the util directory.

Don't #include to compile the code, switch to building the files and fix
up the #includes accordingly.

Move powerpc specific disasm code out of disasm.c and into
annotate-powerpc.c.

Declarations and static removed as appropriate for the code to compile
as separate compilation units.

The e_machine and e_flags set up is moved to the disasm.c architectures
array so that later patches can sort by them.

Reviewed-by: James Clark <james.clark@linaro.org>
Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Aditya Bodkhe <aditya.b1@linux.ibm.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Alexandre Ghiti <alex@ghiti.fr>
Cc: Athira Rajeev <atrajeev@linux.ibm.com>
Cc: Bill Wendling <morbo@google.com>
Cc: Dr. David Alan Gilbert <linux@treblig.org>
Cc: Guo Ren <guoren@kernel.org>
Cc: Howard Chu <howardchu95@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Julia Lawall <Julia.Lawall@inria.fr>
Cc: Justin Stitt <justinstitt@google.com>
Cc: Krzysztof Łopatowski <krzysztof.m.lopatowski@gmail.com>
Cc: Leo Yan <leo.yan@linux.dev>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Nathan Chancellor <nathan@kernel.org>
Cc: Nick Desaulniers <nick.desaulniers+lkml@gmail.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <pjw@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sergei Trofimovich <slyich@gmail.com>
Cc: Shimin Guo <shimin.guo@skydio.com>
Cc: Suchit Karunakaran <suchitkarunakaran@gmail.com>
Cc: Thomas Falcon <thomas.falcon@intel.com>
Cc: Tianyou Li <tianyou.li@intel.com>
Cc: Will Deacon <will@kernel.org>
Cc: Zecheng Li <zecheng@google.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/arch/arc/annotate/instructions.c        |  11 -
 tools/perf/arch/arm/annotate/instructions.c        |  66 --
 tools/perf/arch/arm64/annotate/instructions.c      | 126 ----
 tools/perf/arch/csky/annotate/instructions.c       |  53 --
 tools/perf/arch/loongarch/annotate/instructions.c  | 145 ----
 tools/perf/arch/mips/annotate/instructions.c       |  48 --
 tools/perf/arch/powerpc/annotate/instructions.c    | 317 --------
 tools/perf/arch/riscv64/annotate/instructions.c    |  36 -
 tools/perf/arch/s390/annotate/instructions.c       | 178 -----
 tools/perf/arch/sparc/annotate/instructions.c      | 171 -----
 tools/perf/arch/x86/annotate/instructions.c        | 797 --------------------
 tools/perf/util/Build                              |   1 +
 tools/perf/util/annotate-arch/Build                |  11 +
 tools/perf/util/annotate-arch/annotate-arc.c       |  10 +
 tools/perf/util/annotate-arch/annotate-arm.c       |  65 ++
 tools/perf/util/annotate-arch/annotate-arm64.c     | 124 ++++
 tools/perf/util/annotate-arch/annotate-csky.c      |  48 ++
 tools/perf/util/annotate-arch/annotate-loongarch.c | 148 ++++
 tools/perf/util/annotate-arch/annotate-mips.c      |  48 ++
 tools/perf/util/annotate-arch/annotate-powerpc.c   | 406 ++++++++++
 tools/perf/util/annotate-arch/annotate-riscv64.c   |  36 +
 tools/perf/util/annotate-arch/annotate-s390.c      | 185 +++++
 tools/perf/util/annotate-arch/annotate-sparc.c     | 172 +++++
 tools/perf/util/annotate-arch/annotate-x86.c       | 820 +++++++++++++++++++++
 tools/perf/util/disasm.c                           | 199 ++---
 tools/perf/util/disasm.h                           |  38 +
 26 files changed, 2155 insertions(+), 2104 deletions(-)
 delete mode 100644 tools/perf/arch/arc/annotate/instructions.c
 delete mode 100644 tools/perf/arch/arm/annotate/instructions.c
 delete mode 100644 tools/perf/arch/arm64/annotate/instructions.c
 delete mode 100644 tools/perf/arch/csky/annotate/instructions.c
 delete mode 100644 tools/perf/arch/loongarch/annotate/instructions.c
 delete mode 100644 tools/perf/arch/mips/annotate/instructions.c
 delete mode 100644 tools/perf/arch/powerpc/annotate/instructions.c
 delete mode 100644 tools/perf/arch/riscv64/annotate/instructions.c
 delete mode 100644 tools/perf/arch/s390/annotate/instructions.c
 delete mode 100644 tools/perf/arch/sparc/annotate/instructions.c
 delete mode 100644 tools/perf/arch/x86/annotate/instructions.c
 create mode 100644 tools/perf/util/annotate-arch/Build
 create mode 100644 tools/perf/util/annotate-arch/annotate-arc.c
 create mode 100644 tools/perf/util/annotate-arch/annotate-arm.c
 create mode 100644 tools/perf/util/annotate-arch/annotate-arm64.c
 create mode 100644 tools/perf/util/annotate-arch/annotate-csky.c
 create mode 100644 tools/perf/util/annotate-arch/annotate-loongarch.c
 create mode 100644 tools/perf/util/annotate-arch/annotate-mips.c
 create mode 100644 tools/perf/util/annotate-arch/annotate-powerpc.c
 create mode 100644 tools/perf/util/annotate-arch/annotate-riscv64.c
 create mode 100644 tools/perf/util/annotate-arch/annotate-s390.c
 create mode 100644 tools/perf/util/annotate-arch/annotate-sparc.c
 create mode 100644 tools/perf/util/annotate-arch/annotate-x86.c

(limited to 'tools')

diff --git a/tools/perf/arch/arc/annotate/instructions.c b/tools/perf/arch/arc/annotate/instructions.c
deleted file mode 100644
index e5619770a1af..000000000000
--- a/tools/perf/arch/arc/annotate/instructions.c
+++ /dev/null
@@ -1,11 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <linux/compiler.h>
-
-static int arc__annotate_init(struct arch *arch, char *cpuid __maybe_unused)
-{
-	arch->initialized = true;
-	arch->objdump.comment_char = ';';
-	arch->e_machine = EM_ARC;
-	arch->e_flags = 0;
-	return 0;
-}
diff --git a/tools/perf/arch/arm/annotate/instructions.c b/tools/perf/arch/arm/annotate/instructions.c
deleted file mode 100644
index b997d127fedd..000000000000
--- a/tools/perf/arch/arm/annotate/instructions.c
+++ /dev/null
@@ -1,66 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <linux/compiler.h>
-#include <linux/zalloc.h>
-#include <errno.h>
-#include <sys/types.h>
-#include <regex.h>
-#include <stdlib.h>
-
-struct arm_annotate {
-	regex_t call_insn,
-		jump_insn;
-};
-
-static const struct ins_ops *arm__associate_instruction_ops(struct arch *arch, const char *name)
-{
-	struct arm_annotate *arm = arch->priv;
-	const struct ins_ops *ops;
-	regmatch_t match[2];
-
-	if (!regexec(&arm->call_insn, name, 2, match, 0))
-		ops = &call_ops;
-	else if (!regexec(&arm->jump_insn, name, 2, match, 0))
-		ops = &jump_ops;
-	else
-		return NULL;
-
-	arch__associate_ins_ops(arch, name, ops);
-	return ops;
-}
-
-static int arm__annotate_init(struct arch *arch, char *cpuid __maybe_unused)
-{
-	struct arm_annotate *arm;
-	int err;
-
-	if (arch->initialized)
-		return 0;
-
-	arm = zalloc(sizeof(*arm));
-	if (!arm)
-		return ENOMEM;
-
-#define ARM_CONDS "(cc|cs|eq|ge|gt|hi|le|ls|lt|mi|ne|pl|vc|vs)"
-	err = regcomp(&arm->call_insn, "^blx?" ARM_CONDS "?$", REG_EXTENDED);
-	if (err)
-		goto out_free_arm;
-	err = regcomp(&arm->jump_insn, "^bx?" ARM_CONDS "?$", REG_EXTENDED);
-	if (err)
-		goto out_free_call;
-#undef ARM_CONDS
-
-	arch->initialized = true;
-	arch->priv	  = arm;
-	arch->associate_instruction_ops   = arm__associate_instruction_ops;
-	arch->objdump.comment_char	  = ';';
-	arch->objdump.skip_functions_char = '+';
-	arch->e_machine = EM_ARM;
-	arch->e_flags = 0;
-	return 0;
-
-out_free_call:
-	regfree(&arm->call_insn);
-out_free_arm:
-	free(arm);
-	return SYMBOL_ANNOTATE_ERRNO__ARCH_INIT_REGEXP;
-}
diff --git a/tools/perf/arch/arm64/annotate/instructions.c b/tools/perf/arch/arm64/annotate/instructions.c
deleted file mode 100644
index 44db33854dba..000000000000
--- a/tools/perf/arch/arm64/annotate/instructions.c
+++ /dev/null
@@ -1,126 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <linux/compiler.h>
-#include <errno.h>
-#include <sys/types.h>
-#include <regex.h>
-#include <stdlib.h>
-
-struct arm64_annotate {
-	regex_t call_insn,
-		jump_insn;
-};
-
-static int arm64_mov__parse(const struct arch *arch __maybe_unused,
-			    struct ins_operands *ops,
-			    struct map_symbol *ms __maybe_unused,
-			    struct disasm_line *dl __maybe_unused)
-{
-	char *s = strchr(ops->raw, ','), *target, *endptr;
-
-	if (s == NULL)
-		return -1;
-
-	*s = '\0';
-	ops->source.raw = strdup(ops->raw);
-	*s = ',';
-
-	if (ops->source.raw == NULL)
-		return -1;
-
-	target = ++s;
-	ops->target.raw = strdup(target);
-	if (ops->target.raw == NULL)
-		goto out_free_source;
-
-	ops->target.addr = strtoull(target, &endptr, 16);
-	if (endptr == target)
-		goto out_free_target;
-
-	s = strchr(endptr, '<');
-	if (s == NULL)
-		goto out_free_target;
-	endptr = strchr(s + 1, '>');
-	if (endptr == NULL)
-		goto out_free_target;
-
-	*endptr = '\0';
-	*s = ' ';
-	ops->target.name = strdup(s);
-	*s = '<';
-	*endptr = '>';
-	if (ops->target.name == NULL)
-		goto out_free_target;
-
-	return 0;
-
-out_free_target:
-	zfree(&ops->target.raw);
-out_free_source:
-	zfree(&ops->source.raw);
-	return -1;
-}
-
-static int mov__scnprintf(const struct ins *ins, char *bf, size_t size,
-			  struct ins_operands *ops, int max_ins_name);
-
-static const struct ins_ops arm64_mov_ops = {
-	.parse	   = arm64_mov__parse,
-	.scnprintf = mov__scnprintf,
-};
-
-static const struct ins_ops *arm64__associate_instruction_ops(struct arch *arch, const char *name)
-{
-	struct arm64_annotate *arm = arch->priv;
-	const struct ins_ops *ops;
-	regmatch_t match[2];
-
-	if (!regexec(&arm->jump_insn, name, 2, match, 0))
-		ops = &jump_ops;
-	else if (!regexec(&arm->call_insn, name, 2, match, 0))
-		ops = &call_ops;
-	else if (!strcmp(name, "ret"))
-		ops = &ret_ops;
-	else
-		ops = &arm64_mov_ops;
-
-	arch__associate_ins_ops(arch, name, ops);
-	return ops;
-}
-
-static int arm64__annotate_init(struct arch *arch, char *cpuid __maybe_unused)
-{
-	struct arm64_annotate *arm;
-	int err;
-
-	if (arch->initialized)
-		return 0;
-
-	arm = zalloc(sizeof(*arm));
-	if (!arm)
-		return ENOMEM;
-
-	/* bl, blr */
-	err = regcomp(&arm->call_insn, "^blr?$", REG_EXTENDED);
-	if (err)
-		goto out_free_arm;
-	/* b, b.cond, br, cbz/cbnz, tbz/tbnz */
-	err = regcomp(&arm->jump_insn, "^[ct]?br?\\.?(cc|cs|eq|ge|gt|hi|hs|le|lo|ls|lt|mi|ne|pl|vc|vs)?n?z?$",
-		      REG_EXTENDED);
-	if (err)
-		goto out_free_call;
-
-	arch->initialized = true;
-	arch->priv	  = arm;
-	arch->associate_instruction_ops   = arm64__associate_instruction_ops;
-	arch->objdump.comment_char	  = '/';
-	arch->objdump.skip_functions_char = '+';
-	arch->e_machine = EM_AARCH64;
-	arch->e_flags = 0;
-	return 0;
-
-out_free_call:
-	regfree(&arm->call_insn);
-out_free_arm:
-	free(arm);
-	return SYMBOL_ANNOTATE_ERRNO__ARCH_INIT_REGEXP;
-}
diff --git a/tools/perf/arch/csky/annotate/instructions.c b/tools/perf/arch/csky/annotate/instructions.c
deleted file mode 100644
index 4a55c84a320a..000000000000
--- a/tools/perf/arch/csky/annotate/instructions.c
+++ /dev/null
@@ -1,53 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-// Copyright (C) 2019 Hangzhou C-SKY Microsystems co.,ltd.
-
-#include <linux/compiler.h>
-
-static const struct ins_ops *csky__associate_ins_ops(struct arch *arch,
-						     const char *name)
-{
-	const struct ins_ops *ops = NULL;
-
-	/* catch all kind of jumps */
-	if (!strcmp(name, "bt") ||
-	    !strcmp(name, "bf") ||
-	    !strcmp(name, "bez") ||
-	    !strcmp(name, "bnez") ||
-	    !strcmp(name, "bnezad") ||
-	    !strcmp(name, "bhsz") ||
-	    !strcmp(name, "bhz") ||
-	    !strcmp(name, "blsz") ||
-	    !strcmp(name, "blz") ||
-	    !strcmp(name, "br") ||
-	    !strcmp(name, "jmpi") ||
-	    !strcmp(name, "jmp"))
-		ops = &jump_ops;
-
-	/* catch function call */
-	if (!strcmp(name, "bsr") ||
-	    !strcmp(name, "jsri") ||
-	    !strcmp(name, "jsr"))
-		ops = &call_ops;
-
-	/* catch function return */
-	if (!strcmp(name, "rts"))
-		ops = &ret_ops;
-
-	if (ops)
-		arch__associate_ins_ops(arch, name, ops);
-	return ops;
-}
-
-static int csky__annotate_init(struct arch *arch, char *cpuid __maybe_unused)
-{
-	arch->initialized = true;
-	arch->objdump.comment_char = '/';
-	arch->associate_instruction_ops = csky__associate_ins_ops;
-	arch->e_machine = EM_CSKY;
-#if defined(__CSKYABIV2__)
-	arch->e_flags = EF_CSKY_ABIV2;
-#else
-	arch->e_flags = EF_CSKY_ABIV1;
-#endif
-	return 0;
-}
diff --git a/tools/perf/arch/loongarch/annotate/instructions.c b/tools/perf/arch/loongarch/annotate/instructions.c
deleted file mode 100644
index 5010d5d58375..000000000000
--- a/tools/perf/arch/loongarch/annotate/instructions.c
+++ /dev/null
@@ -1,145 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Perf annotate functions.
- *
- * Copyright (C) 2020-2023 Loongson Technology Corporation Limited
- */
-
-static int loongarch_call__parse(const struct arch *arch, struct ins_operands *ops,
-				 struct map_symbol *ms,
-				 struct disasm_line *dl __maybe_unused)
-
-{
-	char *c, *endptr, *tok, *name;
-	struct map *map = ms->map;
-	struct addr_map_symbol target;
-
-	c = strchr(ops->raw, '#');
-	if (c++ == NULL)
-		return -1;
-
-	ops->target.addr = strtoull(c, &endptr, 16);
-
-	name = strchr(endptr, '<');
-	name++;
-
-	if (arch->objdump.skip_functions_char &&
-	    strchr(name, arch->objdump.skip_functions_char))
-		return -1;
-
-	tok = strchr(name, '>');
-	if (tok == NULL)
-		return -1;
-
-	*tok = '\0';
-	ops->target.name = strdup(name);
-	*tok = '>';
-
-	if (ops->target.name == NULL)
-		return -1;
-
-	target = (struct addr_map_symbol) {
-		.ms = { .map = map__get(map), },
-		.addr = map__objdump_2mem(map, ops->target.addr),
-	};
-
-	if (maps__find_ams(ms->maps, &target) == 0 &&
-	    map__rip_2objdump(target.ms.map, map__map_ip(target.ms.map, target.addr)) == ops->target.addr)
-		ops->target.sym = target.ms.sym;
-
-	addr_map_symbol__exit(&target);
-	return 0;
-}
-
-static const struct ins_ops loongarch_call_ops = {
-	.parse	   = loongarch_call__parse,
-	.scnprintf = call__scnprintf,
-};
-
-static int loongarch_jump__parse(const struct arch *arch, struct ins_operands *ops,
-				 struct map_symbol *ms,
-				 struct disasm_line *dl __maybe_unused)
-
-{
-	struct map *map = ms->map;
-	struct symbol *sym = ms->sym;
-	struct addr_map_symbol target = {
-		.ms = { .map = map__get(map), },
-	};
-	const char *c = strchr(ops->raw, '#');
-	u64 start, end;
-
-	ops->jump.raw_comment = strchr(ops->raw, arch->objdump.comment_char);
-	ops->jump.raw_func_start = strchr(ops->raw, '<');
-
-	if (ops->jump.raw_func_start && c > ops->jump.raw_func_start)
-		c = NULL;
-
-	if (c++ != NULL)
-		ops->target.addr = strtoull(c, NULL, 16);
-	else
-		ops->target.addr = strtoull(ops->raw, NULL, 16);
-
-	target.addr = map__objdump_2mem(map, ops->target.addr);
-	start = map__unmap_ip(map, sym->start);
-	end = map__unmap_ip(map, sym->end);
-
-	ops->target.outside = target.addr < start || target.addr > end;
-
-	if (maps__find_ams(ms->maps, &target) == 0 &&
-	    map__rip_2objdump(target.ms.map, map__map_ip(target.ms.map, target.addr)) == ops->target.addr)
-		ops->target.sym = target.ms.sym;
-
-	if (!ops->target.outside) {
-		ops->target.offset = target.addr - start;
-		ops->target.offset_avail = true;
-	} else {
-		ops->target.offset_avail = false;
-	}
-	addr_map_symbol__exit(&target);
-	return 0;
-}
-
-static const struct ins_ops loongarch_jump_ops = {
-	.parse	   = loongarch_jump__parse,
-	.scnprintf = jump__scnprintf,
-};
-
-static
-const struct ins_ops *loongarch__associate_ins_ops(struct arch *arch, const char *name)
-{
-	const struct ins_ops *ops = NULL;
-
-	if (!strcmp(name, "bl"))
-		ops = &loongarch_call_ops;
-	else if (!strcmp(name, "jirl"))
-		ops = &ret_ops;
-	else if (!strcmp(name, "b") ||
-		 !strncmp(name, "beq", 3) ||
-		 !strncmp(name, "bne", 3) ||
-		 !strncmp(name, "blt", 3) ||
-		 !strncmp(name, "bge", 3) ||
-		 !strncmp(name, "bltu", 4) ||
-		 !strncmp(name, "bgeu", 4))
-		ops = &loongarch_jump_ops;
-	else
-		return NULL;
-
-	arch__associate_ins_ops(arch, name, ops);
-
-	return ops;
-}
-
-static
-int loongarch__annotate_init(struct arch *arch, char *cpuid __maybe_unused)
-{
-	if (!arch->initialized) {
-		arch->associate_instruction_ops = loongarch__associate_ins_ops;
-		arch->initialized = true;
-		arch->objdump.comment_char = '#';
-		arch->e_machine = EM_LOONGARCH;
-		arch->e_flags = 0;
-	}
-
-	return 0;
-}
diff --git a/tools/perf/arch/mips/annotate/instructions.c b/tools/perf/arch/mips/annotate/instructions.c
deleted file mode 100644
index 0fbe0a7df95a..000000000000
--- a/tools/perf/arch/mips/annotate/instructions.c
+++ /dev/null
@@ -1,48 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-static
-const struct ins_ops *mips__associate_ins_ops(struct arch *arch, const char *name)
-{
-	const struct ins_ops *ops = NULL;
-
-	if (!strncmp(name, "bal", 3) ||
-	    !strncmp(name, "bgezal", 6) ||
-	    !strncmp(name, "bltzal", 6) ||
-	    !strncmp(name, "bgtzal", 6) ||
-	    !strncmp(name, "blezal", 6) ||
-	    !strncmp(name, "beqzal", 6) ||
-	    !strncmp(name, "bnezal", 6) ||
-	    !strncmp(name, "bgtzl", 5) ||
-	    !strncmp(name, "bltzl", 5) ||
-	    !strncmp(name, "bgezl", 5) ||
-	    !strncmp(name, "blezl", 5) ||
-	    !strncmp(name, "jialc", 5) ||
-	    !strncmp(name, "beql", 4) ||
-	    !strncmp(name, "bnel", 4) ||
-	    !strncmp(name, "jal", 3))
-		ops = &call_ops;
-	else if (!strncmp(name, "jr", 2))
-		ops = &ret_ops;
-	else if (name[0] == 'j' || name[0] == 'b')
-		ops = &jump_ops;
-	else
-		return NULL;
-
-	arch__associate_ins_ops(arch, name, ops);
-
-	return ops;
-}
-
-static
-int mips__annotate_init(struct arch *arch, char *cpuid __maybe_unused)
-{
-	if (!arch->initialized) {
-		arch->associate_instruction_ops = mips__associate_ins_ops;
-		arch->initialized = true;
-		arch->objdump.comment_char = '#';
-		arch->e_machine = EM_MIPS;
-		arch->e_flags = 0;
-	}
-
-	return 0;
-}
diff --git a/tools/perf/arch/powerpc/annotate/instructions.c b/tools/perf/arch/powerpc/annotate/instructions.c
deleted file mode 100644
index d1be55425e35..000000000000
--- a/tools/perf/arch/powerpc/annotate/instructions.c
+++ /dev/null
@@ -1,317 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <linux/compiler.h>
-
-static const struct ins_ops *powerpc__associate_instruction_ops(struct arch *arch, const char *name)
-{
-	int i;
-	const struct ins_ops *ops;
-
-	/*
-	 * - Interested only if instruction starts with 'b'.
-	 * - Few start with 'b', but aren't branch instructions.
-	 */
-	if (name[0] != 'b'             ||
-	    !strncmp(name, "bcd", 3)   ||
-	    !strncmp(name, "brinc", 5) ||
-	    !strncmp(name, "bper", 4))
-		return NULL;
-
-	ops = &jump_ops;
-
-	i = strlen(name) - 1;
-	if (i < 0)
-		return NULL;
-
-	/* ignore optional hints at the end of the instructions */
-	if (name[i] == '+' || name[i] == '-')
-		i--;
-
-	if (name[i] == 'l' || (name[i] == 'a' && name[i-1] == 'l')) {
-		/*
-		 * if the instruction ends up with 'l' or 'la', then
-		 * those are considered 'calls' since they update LR.
-		 * ... except for 'bnl' which is branch if not less than
-		 * and the absolute form of the same.
-		 */
-		if (strcmp(name, "bnl") && strcmp(name, "bnl+") &&
-		    strcmp(name, "bnl-") && strcmp(name, "bnla") &&
-		    strcmp(name, "bnla+") && strcmp(name, "bnla-"))
-			ops = &call_ops;
-	}
-	if (name[i] == 'r' && name[i-1] == 'l')
-		/*
-		 * instructions ending with 'lr' are considered to be
-		 * return instructions
-		 */
-		ops = &ret_ops;
-
-	arch__associate_ins_ops(arch, name, ops);
-	return ops;
-}
-
-#define PPC_OP(op)	(((op) >> 26) & 0x3F)
-#define PPC_21_30(R)	(((R) >> 1) & 0x3ff)
-#define PPC_22_30(R)	(((R) >> 1) & 0x1ff)
-
-struct insn_offset {
-	const char	*name;
-	int		value;
-};
-
-/*
- * There are memory instructions with opcode 31 which are
- * of X Form, Example:
- * ldx RT,RA,RB
- * ______________________________________
- * | 31 |  RT  |  RA |  RB |   21     |/|
- * --------------------------------------
- * 0    6     11    16    21         30 31
- *
- * But all instructions with opcode 31 are not memory.
- * Example: add RT,RA,RB
- *
- * Use bits 21 to 30 to check memory insns with 31 as opcode.
- * In ins_array below, for ldx instruction:
- * name => OP_31_XOP_LDX
- * value => 21
- */
-
-static struct insn_offset ins_array[] = {
-	{ .name = "OP_31_XOP_LXSIWZX",  .value = 12, },
-	{ .name = "OP_31_XOP_LWARX",	.value = 20, },
-	{ .name = "OP_31_XOP_LDX",	.value = 21, },
-	{ .name = "OP_31_XOP_LWZX",	.value = 23, },
-	{ .name = "OP_31_XOP_LDUX",	.value = 53, },
-	{ .name = "OP_31_XOP_LWZUX",	.value = 55, },
-	{ .name = "OP_31_XOP_LXSIWAX",  .value = 76, },
-	{ .name = "OP_31_XOP_LDARX",    .value = 84, },
-	{ .name = "OP_31_XOP_LBZX",	.value = 87, },
-	{ .name = "OP_31_XOP_LVX",      .value = 103, },
-	{ .name = "OP_31_XOP_LBZUX",    .value = 119, },
-	{ .name = "OP_31_XOP_STXSIWX",  .value = 140, },
-	{ .name = "OP_31_XOP_STDX",	.value = 149, },
-	{ .name = "OP_31_XOP_STWX",	.value = 151, },
-	{ .name = "OP_31_XOP_STDUX",	.value = 181, },
-	{ .name = "OP_31_XOP_STWUX",	.value = 183, },
-	{ .name = "OP_31_XOP_STBX",	.value = 215, },
-	{ .name = "OP_31_XOP_STVX",     .value = 231, },
-	{ .name = "OP_31_XOP_STBUX",	.value = 247, },
-	{ .name = "OP_31_XOP_LHZX",	.value = 279, },
-	{ .name = "OP_31_XOP_LHZUX",	.value = 311, },
-	{ .name = "OP_31_XOP_LXVDSX",   .value = 332, },
-	{ .name = "OP_31_XOP_LWAX",	.value = 341, },
-	{ .name = "OP_31_XOP_LHAX",	.value = 343, },
-	{ .name = "OP_31_XOP_LWAUX",	.value = 373, },
-	{ .name = "OP_31_XOP_LHAUX",	.value = 375, },
-	{ .name = "OP_31_XOP_STHX",	.value = 407, },
-	{ .name = "OP_31_XOP_STHUX",	.value = 439, },
-	{ .name = "OP_31_XOP_LXSSPX",   .value = 524, },
-	{ .name = "OP_31_XOP_LDBRX",	.value = 532, },
-	{ .name = "OP_31_XOP_LSWX",	.value = 533, },
-	{ .name = "OP_31_XOP_LWBRX",	.value = 534, },
-	{ .name = "OP_31_XOP_LFSUX",    .value = 567, },
-	{ .name = "OP_31_XOP_LXSDX",    .value = 588, },
-	{ .name = "OP_31_XOP_LSWI",	.value = 597, },
-	{ .name = "OP_31_XOP_LFDX",     .value = 599, },
-	{ .name = "OP_31_XOP_LFDUX",    .value = 631, },
-	{ .name = "OP_31_XOP_STXSSPX",  .value = 652, },
-	{ .name = "OP_31_XOP_STDBRX",	.value = 660, },
-	{ .name = "OP_31_XOP_STXWX",	.value = 661, },
-	{ .name = "OP_31_XOP_STWBRX",	.value = 662, },
-	{ .name = "OP_31_XOP_STFSX",	.value = 663, },
-	{ .name = "OP_31_XOP_STFSUX",	.value = 695, },
-	{ .name = "OP_31_XOP_STXSDX",   .value = 716, },
-	{ .name = "OP_31_XOP_STSWI",	.value = 725, },
-	{ .name = "OP_31_XOP_STFDX",	.value = 727, },
-	{ .name = "OP_31_XOP_STFDUX",	.value = 759, },
-	{ .name = "OP_31_XOP_LXVW4X",   .value = 780, },
-	{ .name = "OP_31_XOP_LHBRX",	.value = 790, },
-	{ .name = "OP_31_XOP_LXVD2X",   .value = 844, },
-	{ .name = "OP_31_XOP_LFIWAX",	.value = 855, },
-	{ .name = "OP_31_XOP_LFIWZX",	.value = 887, },
-	{ .name = "OP_31_XOP_STXVW4X",  .value = 908, },
-	{ .name = "OP_31_XOP_STHBRX",	.value = 918, },
-	{ .name = "OP_31_XOP_STXVD2X",  .value = 972, },
-	{ .name = "OP_31_XOP_STFIWX",	.value = 983, },
-};
-
-/*
- * Arithmetic instructions which are having opcode as 31.
- * These instructions are tracked to save the register state
- * changes. Example:
- *
- * lwz	r10,264(r3)
- * add	r31, r3, r3
- * lwz	r9, 0(r31)
- *
- * Here instruction tracking needs to identify the "add"
- * instruction and save data type of r3 to r31. If a sample
- * is hit at next "lwz r9, 0(r31)", by this instruction tracking,
- * data type of r31 can be resolved.
- */
-static struct insn_offset arithmetic_ins_op_31[] = {
-	{ .name = "SUB_CARRY_XO_FORM",  .value = 8, },
-	{ .name = "MUL_HDW_XO_FORM1",   .value = 9, },
-	{ .name = "ADD_CARRY_XO_FORM",  .value = 10, },
-	{ .name = "MUL_HW_XO_FORM1",    .value = 11, },
-	{ .name = "SUB_XO_FORM",        .value = 40, },
-	{ .name = "MUL_HDW_XO_FORM",    .value = 73, },
-	{ .name = "MUL_HW_XO_FORM",     .value = 75, },
-	{ .name = "SUB_EXT_XO_FORM",    .value = 136, },
-	{ .name = "ADD_EXT_XO_FORM",    .value = 138, },
-	{ .name = "SUB_ZERO_EXT_XO_FORM",       .value = 200, },
-	{ .name = "ADD_ZERO_EXT_XO_FORM",       .value = 202, },
-	{ .name = "SUB_EXT_XO_FORM2",   .value = 232, },
-	{ .name = "MUL_DW_XO_FORM",     .value = 233, },
-	{ .name = "ADD_EXT_XO_FORM2",   .value = 234, },
-	{ .name = "MUL_W_XO_FORM",      .value = 235, },
-	{ .name = "ADD_XO_FORM",	.value = 266, },
-	{ .name = "DIV_DW_XO_FORM1",    .value = 457, },
-	{ .name = "DIV_W_XO_FORM1",     .value = 459, },
-	{ .name = "DIV_DW_XO_FORM",	.value = 489, },
-	{ .name = "DIV_W_XO_FORM",	.value = 491, },
-};
-
-static struct insn_offset arithmetic_two_ops[] = {
-	{ .name = "mulli",      .value = 7, },
-	{ .name = "subfic",     .value = 8, },
-	{ .name = "addic",      .value = 12, },
-	{ .name = "addic.",     .value = 13, },
-	{ .name = "addi",       .value = 14, },
-	{ .name = "addis",      .value = 15, },
-};
-
-static int cmp_offset(const void *a, const void *b)
-{
-	const struct insn_offset *val1 = a;
-	const struct insn_offset *val2 = b;
-
-	return (val1->value - val2->value);
-}
-
-static const struct ins_ops *check_ppc_insn(struct disasm_line *dl)
-{
-	int raw_insn = dl->raw.raw_insn;
-	int opcode = PPC_OP(raw_insn);
-	int mem_insn_31 = PPC_21_30(raw_insn);
-	struct insn_offset *ret;
-	struct insn_offset mem_insns_31_opcode = {
-		"OP_31_INSN",
-		mem_insn_31
-	};
-	char name_insn[32];
-
-	/*
-	 * Instructions with opcode 32 to 63 are memory
-	 * instructions in powerpc
-	 */
-	if ((opcode & 0x20)) {
-		/*
-		 * Set name in case of raw instruction to
-		 * opcode to be used in insn-stat
-		 */
-		if (!strlen(dl->ins.name)) {
-			sprintf(name_insn, "%d", opcode);
-			dl->ins.name = strdup(name_insn);
-		}
-		return &load_store_ops;
-	} else if (opcode == 31) {
-		/* Check for memory instructions with opcode 31 */
-		ret = bsearch(&mem_insns_31_opcode, ins_array, ARRAY_SIZE(ins_array), sizeof(ins_array[0]), cmp_offset);
-		if (ret) {
-			if (!strlen(dl->ins.name))
-				dl->ins.name = strdup(ret->name);
-			return &load_store_ops;
-		} else {
-			mem_insns_31_opcode.value = PPC_22_30(raw_insn);
-			ret = bsearch(&mem_insns_31_opcode, arithmetic_ins_op_31, ARRAY_SIZE(arithmetic_ins_op_31),
-					sizeof(arithmetic_ins_op_31[0]), cmp_offset);
-			if (ret != NULL)
-				return &arithmetic_ops;
-			/* Bits 21 to 30 has value 444 for "mr" insn ie, OR X form */
-			if (PPC_21_30(raw_insn) == 444)
-				return &arithmetic_ops;
-		}
-	} else {
-		mem_insns_31_opcode.value = opcode;
-		ret = bsearch(&mem_insns_31_opcode, arithmetic_two_ops, ARRAY_SIZE(arithmetic_two_ops),
-				sizeof(arithmetic_two_ops[0]), cmp_offset);
-		if (ret != NULL)
-			return &arithmetic_ops;
-	}
-
-	return NULL;
-}
-
-/*
- * Instruction tracking function to track register state moves.
- * Example sequence:
- *    ld      r10,264(r3)
- *    mr      r31,r3
- *    <<after some sequence>
- *    ld      r9,312(r31)
- *
- * Previous instruction sequence shows that register state of r3
- * is moved to r31. update_insn_state_powerpc tracks these state
- * changes
- */
-#ifdef HAVE_LIBDW_SUPPORT
-static void update_insn_state_powerpc(struct type_state *state,
-		struct data_loc_info *dloc, Dwarf_Die * cu_die __maybe_unused,
-		struct disasm_line *dl)
-{
-	struct annotated_insn_loc loc;
-	struct annotated_op_loc *src = &loc.ops[INSN_OP_SOURCE];
-	struct annotated_op_loc *dst = &loc.ops[INSN_OP_TARGET];
-	struct type_state_reg *tsr;
-	u32 insn_offset = dl->al.offset;
-
-	if (annotate_get_insn_location(dloc->arch, dl, &loc) < 0)
-		return;
-
-	/*
-	 * Value 444 for bits 21:30 is for "mr"
-	 * instruction. "mr" is extended OR. So set the
-	 * source and destination reg correctly
-	 */
-	if (PPC_21_30(dl->raw.raw_insn) == 444) {
-		int src_reg = src->reg1;
-
-		src->reg1 = dst->reg1;
-		dst->reg1 = src_reg;
-	}
-
-	if (!has_reg_type(state, dst->reg1))
-		return;
-
-	tsr = &state->regs[dst->reg1];
-
-	if (!has_reg_type(state, src->reg1) ||
-			!state->regs[src->reg1].ok) {
-		tsr->ok = false;
-		return;
-	}
-
-	tsr->type = state->regs[src->reg1].type;
-	tsr->kind = state->regs[src->reg1].kind;
-	tsr->ok = true;
-
-	pr_debug_dtp("mov [%x] reg%d -> reg%d",
-			insn_offset, src->reg1, dst->reg1);
-	pr_debug_type_name(&tsr->type, tsr->kind);
-}
-#endif /* HAVE_LIBDW_SUPPORT */
-
-static int powerpc__annotate_init(struct arch *arch, char *cpuid __maybe_unused)
-{
-	if (!arch->initialized) {
-		arch->initialized = true;
-		arch->associate_instruction_ops = powerpc__associate_instruction_ops;
-		arch->objdump.comment_char      = '#';
-		annotate_opts.show_asm_raw = true;
-		arch->e_machine = EM_PPC;
-		arch->e_flags = 0;
-	}
-
-	return 0;
-}
diff --git a/tools/perf/arch/riscv64/annotate/instructions.c b/tools/perf/arch/riscv64/annotate/instructions.c
deleted file mode 100644
index a34798864fab..000000000000
--- a/tools/perf/arch/riscv64/annotate/instructions.c
+++ /dev/null
@@ -1,36 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-static
-const struct ins_ops *riscv64__associate_ins_ops(struct arch *arch, const char *name)
-{
-	const struct ins_ops *ops = NULL;
-
-	if (!strncmp(name, "jal", 3) ||
-	    !strncmp(name, "jr", 2) ||
-	    !strncmp(name, "call", 4))
-		ops = &call_ops;
-	else if (!strncmp(name, "ret", 3))
-		ops = &ret_ops;
-	else if (name[0] == 'j' || name[0] == 'b')
-		ops = &jump_ops;
-	else
-		return NULL;
-
-	arch__associate_ins_ops(arch, name, ops);
-
-	return ops;
-}
-
-static
-int riscv64__annotate_init(struct arch *arch, char *cpuid __maybe_unused)
-{
-	if (!arch->initialized) {
-		arch->associate_instruction_ops = riscv64__associate_ins_ops;
-		arch->initialized = true;
-		arch->objdump.comment_char = '#';
-		arch->e_machine = EM_RISCV;
-		arch->e_flags = 0;
-	}
-
-	return 0;
-}
diff --git a/tools/perf/arch/s390/annotate/instructions.c b/tools/perf/arch/s390/annotate/instructions.c
deleted file mode 100644
index 1b22e6276e7d..000000000000
--- a/tools/perf/arch/s390/annotate/instructions.c
+++ /dev/null
@@ -1,178 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <linux/compiler.h>
-
-static int s390_call__parse(const struct arch *arch, struct ins_operands *ops,
-			    struct map_symbol *ms,
-			    struct disasm_line *dl __maybe_unused)
-{
-	char *endptr, *tok, *name;
-	struct map *map = ms->map;
-	struct addr_map_symbol target;
-
-	tok = strchr(ops->raw, ',');
-	if (!tok)
-		return -1;
-
-	ops->target.addr = strtoull(tok + 1, &endptr, 16);
-
-	name = strchr(endptr, '<');
-	if (name == NULL)
-		return -1;
-
-	name++;
-
-	if (arch->objdump.skip_functions_char &&
-	    strchr(name, arch->objdump.skip_functions_char))
-		return -1;
-
-	tok = strchr(name, '>');
-	if (tok == NULL)
-		return -1;
-
-	*tok = '\0';
-	ops->target.name = strdup(name);
-	*tok = '>';
-
-	if (ops->target.name == NULL)
-		return -1;
-
-	target = (struct addr_map_symbol) {
-		.ms = { .map = map__get(map), },
-		.addr = map__objdump_2mem(map, ops->target.addr),
-	};
-
-	if (maps__find_ams(ms->maps, &target) == 0 &&
-	    map__rip_2objdump(target.ms.map, map__map_ip(target.ms.map, target.addr)) == ops->target.addr)
-		ops->target.sym = target.ms.sym;
-
-	addr_map_symbol__exit(&target);
-	return 0;
-}
-
-static const struct ins_ops s390_call_ops = {
-	.parse	   = s390_call__parse,
-	.scnprintf = call__scnprintf,
-};
-
-static int s390_mov__parse(const struct arch *arch __maybe_unused,
-			   struct ins_operands *ops,
-			   struct map_symbol *ms __maybe_unused,
-			   struct disasm_line *dl __maybe_unused)
-{
-	char *s = strchr(ops->raw, ','), *target, *endptr;
-
-	if (s == NULL)
-		return -1;
-
-	*s = '\0';
-	ops->source.raw = strdup(ops->raw);
-	*s = ',';
-
-	if (ops->source.raw == NULL)
-		return -1;
-
-	target = ++s;
-	ops->target.raw = strdup(target);
-	if (ops->target.raw == NULL)
-		goto out_free_source;
-
-	ops->target.addr = strtoull(target, &endptr, 16);
-	if (endptr == target)
-		goto out_free_target;
-
-	s = strchr(endptr, '<');
-	if (s == NULL)
-		goto out_free_target;
-	endptr = strchr(s + 1, '>');
-	if (endptr == NULL)
-		goto out_free_target;
-
-	*endptr = '\0';
-	ops->target.name = strdup(s + 1);
-	*endptr = '>';
-	if (ops->target.name == NULL)
-		goto out_free_target;
-
-	return 0;
-
-out_free_target:
-	zfree(&ops->target.raw);
-out_free_source:
-	zfree(&ops->source.raw);
-	return -1;
-}
-
-
-static const struct ins_ops s390_mov_ops = {
-	.parse	   = s390_mov__parse,
-	.scnprintf = mov__scnprintf,
-};
-
-static const struct ins_ops *s390__associate_ins_ops(struct arch *arch, const char *name)
-{
-	const struct ins_ops *ops = NULL;
-
-	/* catch all kind of jumps */
-	if (strchr(name, 'j') ||
-	    !strncmp(name, "bct", 3) ||
-	    !strncmp(name, "br", 2))
-		ops = &jump_ops;
-	/* override call/returns */
-	if (!strcmp(name, "bras") ||
-	    !strcmp(name, "brasl") ||
-	    !strcmp(name, "basr"))
-		ops = &s390_call_ops;
-	if (!strcmp(name, "br"))
-		ops = &ret_ops;
-	/* override load/store relative to PC */
-	if (!strcmp(name, "lrl") ||
-	    !strcmp(name, "lgrl") ||
-	    !strcmp(name, "lgfrl") ||
-	    !strcmp(name, "llgfrl") ||
-	    !strcmp(name, "strl") ||
-	    !strcmp(name, "stgrl"))
-		ops = &s390_mov_ops;
-
-	if (ops)
-		arch__associate_ins_ops(arch, name, ops);
-	return ops;
-}
-
-static int s390__cpuid_parse(struct arch *arch, char *cpuid)
-{
-	unsigned int family;
-	char model[16], model_c[16], cpumf_v[16], cpumf_a[16];
-	int ret;
-
-	/*
-	 * cpuid string format:
-	 * "IBM,family,model-capacity,model[,cpum_cf-version,cpum_cf-authorization]"
-	 */
-	ret = sscanf(cpuid, "%*[^,],%u,%[^,],%[^,],%[^,],%s", &family, model_c,
-		     model, cpumf_v, cpumf_a);
-	if (ret >= 2) {
-		arch->family = family;
-		arch->model = 0;
-		return 0;
-	}
-
-	return -1;
-}
-
-static int s390__annotate_init(struct arch *arch, char *cpuid __maybe_unused)
-{
-	int err = 0;
-
-	if (!arch->initialized) {
-		arch->initialized = true;
-		arch->associate_instruction_ops = s390__associate_ins_ops;
-		if (cpuid) {
-			if (s390__cpuid_parse(arch, cpuid))
-				err = SYMBOL_ANNOTATE_ERRNO__ARCH_INIT_CPUID_PARSING;
-		}
-		arch->e_machine = EM_S390;
-		arch->e_flags = 0;
-	}
-
-	return err;
-}
diff --git a/tools/perf/arch/sparc/annotate/instructions.c b/tools/perf/arch/sparc/annotate/instructions.c
deleted file mode 100644
index a08d8734c883..000000000000
--- a/tools/perf/arch/sparc/annotate/instructions.c
+++ /dev/null
@@ -1,171 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-static int is_branch_cond(const char *cond)
-{
-	if (cond[0] == '\0')
-		return 1;
-
-	if (cond[0] == 'a' && cond[1] == '\0')
-		return 1;
-
-	if (cond[0] == 'c' &&
-	    (cond[1] == 'c' || cond[1] == 's') &&
-	    cond[2] == '\0')
-		return 1;
-
-	if (cond[0] == 'e' &&
-	    (cond[1] == '\0' ||
-	     (cond[1] == 'q' && cond[2] == '\0')))
-		return 1;
-
-	if (cond[0] == 'g' &&
-	    (cond[1] == '\0' ||
-	     (cond[1] == 't' && cond[2] == '\0') ||
-	     (cond[1] == 'e' && cond[2] == '\0') ||
-	     (cond[1] == 'e' && cond[2] == 'u' && cond[3] == '\0')))
-		return 1;
-
-	if (cond[0] == 'l' &&
-	    (cond[1] == '\0' ||
-	     (cond[1] == 't' && cond[2] == '\0') ||
-	     (cond[1] == 'u' && cond[2] == '\0') ||
-	     (cond[1] == 'e' && cond[2] == '\0') ||
-	     (cond[1] == 'e' && cond[2] == 'u' && cond[3] == '\0')))
-		return 1;
-
-	if (cond[0] == 'n' &&
-	    (cond[1] == '\0' ||
-	     (cond[1] == 'e' && cond[2] == '\0') ||
-	     (cond[1] == 'z' && cond[2] == '\0') ||
-	     (cond[1] == 'e' && cond[2] == 'g' && cond[3] == '\0')))
-		return 1;
-
-	if (cond[0] == 'b' &&
-	    cond[1] == 'p' &&
-	    cond[2] == 'o' &&
-	    cond[3] == 's' &&
-	    cond[4] == '\0')
-		return 1;
-
-	if (cond[0] == 'v' &&
-	    (cond[1] == 'c' || cond[1] == 's') &&
-	    cond[2] == '\0')
-		return 1;
-
-	if (cond[0] == 'b' &&
-	    cond[1] == 'z' &&
-	    cond[2] == '\0')
-		return 1;
-
-	return 0;
-}
-
-static int is_branch_reg_cond(const char *cond)
-{
-	if ((cond[0] == 'n' || cond[0] == 'l') &&
-	    cond[1] == 'z' &&
-	    cond[2] == '\0')
-		return 1;
-
-	if (cond[0] == 'z' &&
-	    cond[1] == '\0')
-		return 1;
-
-	if ((cond[0] == 'g' || cond[0] == 'l') &&
-	    cond[1] == 'e' &&
-	    cond[2] == 'z' &&
-	    cond[3] == '\0')
-		return 1;
-
-	if (cond[0] == 'g' &&
-	    cond[1] == 'z' &&
-	    cond[2] == '\0')
-		return 1;
-
-	return 0;
-}
-
-static int is_branch_float_cond(const char *cond)
-{
-	if (cond[0] == '\0')
-		return 1;
-
-	if ((cond[0] == 'a' || cond[0] == 'e' ||
-	     cond[0] == 'z' || cond[0] == 'g' ||
-	     cond[0] == 'l' || cond[0] == 'n' ||
-	     cond[0] == 'o' || cond[0] == 'u') &&
-	    cond[1] == '\0')
-		return 1;
-
-	if (((cond[0] == 'g' && cond[1] == 'e') ||
-	     (cond[0] == 'l' && (cond[1] == 'e' ||
-				 cond[1] == 'g')) ||
-	     (cond[0] == 'n' && (cond[1] == 'e' ||
-				 cond[1] == 'z')) ||
-	     (cond[0] == 'u' && (cond[1] == 'e' ||
-				 cond[1] == 'g' ||
-				 cond[1] == 'l'))) &&
-	    cond[2] == '\0')
-		return 1;
-
-	if (cond[0] == 'u' &&
-	    (cond[1] == 'g' || cond[1] == 'l') &&
-	    cond[2] == 'e' &&
-	    cond[3] == '\0')
-		return 1;
-
-	return 0;
-}
-
-static const struct ins_ops *sparc__associate_instruction_ops(struct arch *arch, const char *name)
-{
-	const struct ins_ops *ops = NULL;
-
-	if (!strcmp(name, "call") ||
-	    !strcmp(name, "jmp") ||
-	    !strcmp(name, "jmpl")) {
-		ops = &call_ops;
-	} else if (!strcmp(name, "ret") ||
-		   !strcmp(name, "retl") ||
-		   !strcmp(name, "return")) {
-		ops = &ret_ops;
-	} else if (!strcmp(name, "mov")) {
-		ops = &mov_ops;
-	} else {
-		if (name[0] == 'c' &&
-		    (name[1] == 'w' || name[1] == 'x'))
-			name += 2;
-
-		if (name[0] == 'b') {
-			const char *cond = name + 1;
-
-			if (cond[0] == 'r') {
-				if (is_branch_reg_cond(cond + 1))
-					ops = &jump_ops;
-			} else if (is_branch_cond(cond)) {
-				ops = &jump_ops;
-			}
-		} else if (name[0] == 'f' && name[1] == 'b') {
-			if (is_branch_float_cond(name + 2))
-				ops = &jump_ops;
-		}
-	}
-
-	if (ops)
-		arch__associate_ins_ops(arch, name, ops);
-
-	return ops;
-}
-
-static int sparc__annotate_init(struct arch *arch, char *cpuid __maybe_unused)
-{
-	if (!arch->initialized) {
-		arch->initialized = true;
-		arch->associate_instruction_ops = sparc__associate_instruction_ops;
-		arch->objdump.comment_char = '#';
-		arch->e_machine = EM_SPARC;
-		arch->e_flags = 0;
-	}
-
-	return 0;
-}
diff --git a/tools/perf/arch/x86/annotate/instructions.c b/tools/perf/arch/x86/annotate/instructions.c
deleted file mode 100644
index ffca3029388b..000000000000
--- a/tools/perf/arch/x86/annotate/instructions.c
+++ /dev/null
@@ -1,797 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * x86 instruction nmemonic table to parse disasm lines for annotate.
- * This table is searched twice - one for exact match and another for
- * match without a size suffix (b, w, l, q) in case of AT&T syntax.
- *
- * So this table should not have entries with the suffix unless it's
- * a complete different instruction than ones without the suffix.
- */
-static const struct ins x86__instructions[] = {
-	{ .name = "adc",	.ops = &mov_ops,  },
-	{ .name = "add",	.ops = &mov_ops,  },
-	{ .name = "addsd",	.ops = &mov_ops,  },
-	{ .name = "and",	.ops = &mov_ops,  },
-	{ .name = "andpd",	.ops = &mov_ops,  },
-	{ .name = "andps",	.ops = &mov_ops,  },
-	{ .name = "bsr",	.ops = &mov_ops,  },
-	{ .name = "bt",		.ops = &mov_ops,  },
-	{ .name = "btr",	.ops = &mov_ops,  },
-	{ .name = "bts",	.ops = &mov_ops,  },
-	{ .name = "call",	.ops = &call_ops, },
-	{ .name = "cmovae",	.ops = &mov_ops,  },
-	{ .name = "cmovbe",	.ops = &mov_ops,  },
-	{ .name = "cmove",	.ops = &mov_ops,  },
-	{ .name = "cmp",	.ops = &mov_ops,  },
-	{ .name = "cmpxch",	.ops = &mov_ops,  },
-	{ .name = "cmpxchg",	.ops = &mov_ops,  },
-	{ .name = "cs",		.ops = &mov_ops,  },
-	{ .name = "dec",	.ops = &dec_ops,  },
-	{ .name = "divsd",	.ops = &mov_ops,  },
-	{ .name = "divss",	.ops = &mov_ops,  },
-	{ .name = "gs",		.ops = &mov_ops,  },
-	{ .name = "imul",	.ops = &mov_ops,  },
-	{ .name = "inc",	.ops = &dec_ops,  },
-	{ .name = "ja",		.ops = &jump_ops, },
-	{ .name = "jae",	.ops = &jump_ops, },
-	{ .name = "jb",		.ops = &jump_ops, },
-	{ .name = "jbe",	.ops = &jump_ops, },
-	{ .name = "jc",		.ops = &jump_ops, },
-	{ .name = "jcxz",	.ops = &jump_ops, },
-	{ .name = "je",		.ops = &jump_ops, },
-	{ .name = "jecxz",	.ops = &jump_ops, },
-	{ .name = "jg",		.ops = &jump_ops, },
-	{ .name = "jge",	.ops = &jump_ops, },
-	{ .name = "jl",		.ops = &jump_ops, },
-	{ .name = "jle",	.ops = &jump_ops, },
-	{ .name = "jmp",	.ops = &jump_ops, },
-	{ .name = "jna",	.ops = &jump_ops, },
-	{ .name = "jnae",	.ops = &jump_ops, },
-	{ .name = "jnb",	.ops = &jump_ops, },
-	{ .name = "jnbe",	.ops = &jump_ops, },
-	{ .name = "jnc",	.ops = &jump_ops, },
-	{ .name = "jne",	.ops = &jump_ops, },
-	{ .name = "jng",	.ops = &jump_ops, },
-	{ .name = "jnge",	.ops = &jump_ops, },
-	{ .name = "jnl",	.ops = &jump_ops, },
-	{ .name = "jnle",	.ops = &jump_ops, },
-	{ .name = "jno",	.ops = &jump_ops, },
-	{ .name = "jnp",	.ops = &jump_ops, },
-	{ .name = "jns",	.ops = &jump_ops, },
-	{ .name = "jnz",	.ops = &jump_ops, },
-	{ .name = "jo",		.ops = &jump_ops, },
-	{ .name = "jp",		.ops = &jump_ops, },
-	{ .name = "jpe",	.ops = &jump_ops, },
-	{ .name = "jpo",	.ops = &jump_ops, },
-	{ .name = "jrcxz",	.ops = &jump_ops, },
-	{ .name = "js",		.ops = &jump_ops, },
-	{ .name = "jz",		.ops = &jump_ops, },
-	{ .name = "lea",	.ops = &mov_ops,  },
-	{ .name = "lock",	.ops = &lock_ops, },
-	{ .name = "mov",	.ops = &mov_ops,  },
-	{ .name = "movapd",	.ops = &mov_ops,  },
-	{ .name = "movaps",	.ops = &mov_ops,  },
-	{ .name = "movdqa",	.ops = &mov_ops,  },
-	{ .name = "movdqu",	.ops = &mov_ops,  },
-	{ .name = "movsb",	.ops = &mov_ops,  },
-	{ .name = "movsd",	.ops = &mov_ops,  },
-	{ .name = "movsl",	.ops = &mov_ops,  },
-	{ .name = "movss",	.ops = &mov_ops,  },
-	{ .name = "movsw",	.ops = &mov_ops,  },
-	{ .name = "movupd",	.ops = &mov_ops,  },
-	{ .name = "movups",	.ops = &mov_ops,  },
-	{ .name = "movzb",	.ops = &mov_ops,  },
-	{ .name = "movzl",	.ops = &mov_ops,  },
-	{ .name = "movzw",	.ops = &mov_ops,  },
-	{ .name = "mulsd",	.ops = &mov_ops,  },
-	{ .name = "mulss",	.ops = &mov_ops,  },
-	{ .name = "nop",	.ops = &nop_ops,  },
-	{ .name = "or",		.ops = &mov_ops,  },
-	{ .name = "orps",	.ops = &mov_ops,  },
-	{ .name = "paddq",	.ops = &mov_ops,  },
-	{ .name = "pand",	.ops = &mov_ops,  },
-	{ .name = "pcmpeqb",	.ops = &mov_ops,  },
-	{ .name = "por",	.ops = &mov_ops,  },
-	{ .name = "rcl",	.ops = &mov_ops,  },
-	{ .name = "ret",	.ops = &ret_ops,  },
-	{ .name = "sbb",	.ops = &mov_ops,  },
-	{ .name = "sete",	.ops = &mov_ops,  },
-	{ .name = "sub",	.ops = &mov_ops,  },
-	{ .name = "subsd",	.ops = &mov_ops,  },
-	{ .name = "test",	.ops = &mov_ops,  },
-	{ .name = "tzcnt",	.ops = &mov_ops,  },
-	{ .name = "ucomisd",	.ops = &mov_ops,  },
-	{ .name = "ucomiss",	.ops = &mov_ops,  },
-	{ .name = "vaddsd",	.ops = &mov_ops,  },
-	{ .name = "vandpd",	.ops = &mov_ops,  },
-	{ .name = "vmovdqa",	.ops = &mov_ops,  },
-	{ .name = "vmovq",	.ops = &mov_ops,  },
-	{ .name = "vmovsd",	.ops = &mov_ops,  },
-	{ .name = "vmulsd",	.ops = &mov_ops,  },
-	{ .name = "vorpd",	.ops = &mov_ops,  },
-	{ .name = "vsubsd",	.ops = &mov_ops,  },
-	{ .name = "vucomisd",	.ops = &mov_ops,  },
-	{ .name = "xadd",	.ops = &mov_ops,  },
-	{ .name = "xbegin",	.ops = &jump_ops, },
-	{ .name = "xchg",	.ops = &mov_ops,  },
-	{ .name = "xor",	.ops = &mov_ops, },
-	{ .name = "xorpd",	.ops = &mov_ops, },
-	{ .name = "xorps",	.ops = &mov_ops, },
-};
-
-static bool amd__ins_is_fused(const struct arch *arch, const char *ins1,
-			      const char *ins2)
-{
-	if (strstr(ins2, "jmp"))
-		return false;
-
-	/* Family >= 15h supports cmp/test + branch fusion */
-	if (arch->family >= 0x15 && (strstarts(ins1, "test") ||
-	    (strstarts(ins1, "cmp") && !strstr(ins1, "xchg")))) {
-		return true;
-	}
-
-	/* Family >= 19h supports some ALU + branch fusion */
-	if (arch->family >= 0x19 && (strstarts(ins1, "add") ||
-	    strstarts(ins1, "sub") || strstarts(ins1, "and") ||
-	    strstarts(ins1, "inc") || strstarts(ins1, "dec") ||
-	    strstarts(ins1, "or") || strstarts(ins1, "xor"))) {
-		return true;
-	}
-
-	return false;
-}
-
-static bool intel__ins_is_fused(const struct arch *arch, const char *ins1,
-				const char *ins2)
-{
-	if (arch->family != 6 || arch->model < 0x1e || strstr(ins2, "jmp"))
-		return false;
-
-	if (arch->model == 0x1e) {
-		/* Nehalem */
-		if ((strstr(ins1, "cmp") && !strstr(ins1, "xchg")) ||
-		     strstr(ins1, "test")) {
-			return true;
-		}
-	} else {
-		/* Newer platform */
-		if ((strstr(ins1, "cmp") && !strstr(ins1, "xchg")) ||
-		     strstr(ins1, "test") ||
-		     strstr(ins1, "add") ||
-		     strstr(ins1, "sub") ||
-		     strstr(ins1, "and") ||
-		     strstr(ins1, "inc") ||
-		     strstr(ins1, "dec")) {
-			return true;
-		}
-	}
-
-	return false;
-}
-
-static int x86__cpuid_parse(struct arch *arch, char *cpuid)
-{
-	unsigned int family, model, stepping;
-	int ret;
-
-	/*
-	 * cpuid = "GenuineIntel,family,model,stepping"
-	 */
-	ret = sscanf(cpuid, "%*[^,],%u,%u,%u", &family, &model, &stepping);
-	if (ret == 3) {
-		arch->family = family;
-		arch->model = model;
-		arch->ins_is_fused = strstarts(cpuid, "AuthenticAMD") ?
-					amd__ins_is_fused :
-					intel__ins_is_fused;
-		return 0;
-	}
-
-	return -1;
-}
-
-static int x86__annotate_init(struct arch *arch, char *cpuid)
-{
-	int err = 0;
-
-	if (arch->initialized)
-		return 0;
-
-	if (cpuid) {
-		if (x86__cpuid_parse(arch, cpuid))
-			err = SYMBOL_ANNOTATE_ERRNO__ARCH_INIT_CPUID_PARSING;
-	}
-
-#ifndef NDEBUG
-	{
-		static bool sorted_check;
-
-		if (!sorted_check) {
-			for (size_t i = 0; i < arch->nr_instructions - 1; i++) {
-				assert(strcmp(arch->instructions[i].name,
-					      arch->instructions[i + 1].name) <= 0);
-			}
-			sorted_check = true;
-		}
-	}
-#endif
-	arch->e_machine = EM_X86_64;
-	arch->e_flags = 0;
-	arch->initialized = true;
-	return err;
-}
-
-#ifdef HAVE_LIBDW_SUPPORT
-static void update_insn_state_x86(struct type_state *state,
-				  struct data_loc_info *dloc, Dwarf_Die *cu_die,
-				  struct disasm_line *dl)
-{
-	struct annotated_insn_loc loc;
-	struct annotated_op_loc *src = &loc.ops[INSN_OP_SOURCE];
-	struct annotated_op_loc *dst = &loc.ops[INSN_OP_TARGET];
-	struct type_state_reg *tsr;
-	Dwarf_Die type_die;
-	u32 insn_offset = dl->al.offset;
-	int fbreg = dloc->fbreg;
-	int fboff = 0;
-
-	if (annotate_get_insn_location(dloc->arch, dl, &loc) < 0)
-		return;
-
-	if (ins__is_call(&dl->ins)) {
-		struct symbol *func = dl->ops.target.sym;
-
-		if (func == NULL)
-			return;
-
-		/* __fentry__ will preserve all registers */
-		if (!strcmp(func->name, "__fentry__"))
-			return;
-
-		pr_debug_dtp("call [%x] %s\n", insn_offset, func->name);
-
-		/* Otherwise invalidate caller-saved registers after call */
-		for (unsigned i = 0; i < ARRAY_SIZE(state->regs); i++) {
-			if (state->regs[i].caller_saved)
-				state->regs[i].ok = false;
-		}
-
-		/* Update register with the return type (if any) */
-		if (die_find_func_rettype(cu_die, func->name, &type_die)) {
-			tsr = &state->regs[state->ret_reg];
-			tsr->type = type_die;
-			tsr->kind = TSR_KIND_TYPE;
-			tsr->offset = 0;
-			tsr->ok = true;
-
-			pr_debug_dtp("call [%x] return -> reg%d",
-				     insn_offset, state->ret_reg);
-			pr_debug_type_name(&type_die, tsr->kind);
-		}
-		return;
-	}
-
-	if (!strncmp(dl->ins.name, "add", 3)) {
-		u64 imm_value = -1ULL;
-		int offset;
-		const char *var_name = NULL;
-		struct map_symbol *ms = dloc->ms;
-		u64 ip = ms->sym->start + dl->al.offset;
-
-		if (!has_reg_type(state, dst->reg1))
-			return;
-
-		tsr = &state->regs[dst->reg1];
-		tsr->copied_from = -1;
-
-		if (src->imm)
-			imm_value = src->offset;
-		else if (has_reg_type(state, src->reg1) &&
-			 state->regs[src->reg1].kind == TSR_KIND_CONST)
-			imm_value = state->regs[src->reg1].imm_value;
-		else if (src->reg1 == DWARF_REG_PC) {
-			u64 var_addr = annotate_calc_pcrel(dloc->ms, ip,
-							   src->offset, dl);
-
-			if (get_global_var_info(dloc, var_addr,
-						&var_name, &offset) &&
-			    !strcmp(var_name, "this_cpu_off") &&
-			    tsr->kind == TSR_KIND_CONST) {
-				tsr->kind = TSR_KIND_PERCPU_BASE;
-				tsr->offset = 0;
-				tsr->ok = true;
-				imm_value = tsr->imm_value;
-			}
-		}
-		else
-			return;
-
-		/* Ignore add to non-pointer or non-const types */
-		if (tsr->kind == TSR_KIND_POINTER ||
-		    (dwarf_tag(&tsr->type) == DW_TAG_pointer_type &&
-		     src->reg1 != DWARF_REG_PC && tsr->kind == TSR_KIND_TYPE && !dst->mem_ref)) {
-			tsr->offset += imm_value;
-			pr_debug_dtp("add [%x] offset %#"PRIx64" to reg%d",
-				     insn_offset, imm_value, dst->reg1);
-			pr_debug_type_name(&tsr->type, tsr->kind);
-		}
-
-		if (tsr->kind == TSR_KIND_CONST)
-			tsr->imm_value += imm_value;
-
-		if (tsr->kind != TSR_KIND_PERCPU_BASE)
-			return;
-
-		if (get_global_var_type(cu_die, dloc, ip, imm_value, &offset,
-					&type_die) && offset == 0) {
-			/*
-			 * This is not a pointer type, but it should be treated
-			 * as a pointer.
-			 */
-			tsr->type = type_die;
-			tsr->kind = TSR_KIND_PERCPU_POINTER;
-			tsr->offset = 0;
-			tsr->ok = true;
-
-			pr_debug_dtp("add [%x] percpu %#"PRIx64" -> reg%d",
-				     insn_offset, imm_value, dst->reg1);
-			pr_debug_type_name(&tsr->type, tsr->kind);
-		}
-		return;
-	}
-
-	if (!strncmp(dl->ins.name, "sub", 3)) {
-		u64 imm_value = -1ULL;
-
-		if (!has_reg_type(state, dst->reg1))
-			return;
-
-		tsr = &state->regs[dst->reg1];
-		tsr->copied_from = -1;
-
-		if (src->imm)
-			imm_value = src->offset;
-		else if (has_reg_type(state, src->reg1) &&
-			 state->regs[src->reg1].kind == TSR_KIND_CONST)
-			imm_value = state->regs[src->reg1].imm_value;
-
-		if (tsr->kind == TSR_KIND_POINTER ||
-		    (dwarf_tag(&tsr->type) == DW_TAG_pointer_type &&
-		     src->reg1 != DWARF_REG_PC && tsr->kind == TSR_KIND_TYPE && !dst->mem_ref)) {
-			tsr->offset -= imm_value;
-			pr_debug_dtp("sub [%x] offset %#"PRIx64" to reg%d",
-				     insn_offset, imm_value, dst->reg1);
-			pr_debug_type_name(&tsr->type, tsr->kind);
-		}
-
-		if (tsr->kind == TSR_KIND_CONST)
-			tsr->imm_value -= imm_value;
-
-		return;
-	}
-
-	if (!strncmp(dl->ins.name, "lea", 3)) {
-		int sreg = src->reg1;
-		struct type_state_reg src_tsr;
-
-		if (!has_reg_type(state, sreg) ||
-		    !has_reg_type(state, dst->reg1) ||
-		    !src->mem_ref)
-			return;
-
-		src_tsr = state->regs[sreg];
-		tsr = &state->regs[dst->reg1];
-
-		tsr->copied_from = -1;
-		tsr->ok = false;
-
-		/* Case 1: Based on stack pointer or frame pointer */
-		if (sreg == fbreg || sreg == state->stack_reg) {
-			struct type_state_stack *stack;
-			int offset = src->offset - fboff;
-
-			stack = find_stack_state(state, offset);
-			if (!stack)
-				return;
-
-			tsr->type = stack->type;
-			tsr->kind = TSR_KIND_POINTER;
-			tsr->offset = offset - stack->offset;
-			tsr->ok = true;
-
-			if (sreg == fbreg) {
-				pr_debug_dtp("lea [%x] address of -%#x(stack) -> reg%d",
-					     insn_offset, -src->offset, dst->reg1);
-			} else {
-				pr_debug_dtp("lea [%x] address of %#x(reg%d) -> reg%d",
-					     insn_offset, src->offset, sreg, dst->reg1);
-			}
-
-			pr_debug_type_name(&tsr->type, tsr->kind);
-		}
-		/* Case 2: Based on a register holding a typed pointer */
-		else if (src_tsr.ok && (src_tsr.kind == TSR_KIND_POINTER ||
-			 (dwarf_tag(&src_tsr.type) == DW_TAG_pointer_type &&
-			  src_tsr.kind == TSR_KIND_TYPE))) {
-
-			if (src_tsr.kind == TSR_KIND_TYPE &&
-			    __die_get_real_type(&state->regs[sreg].type, &type_die) == NULL)
-				return;
-
-			if (src_tsr.kind == TSR_KIND_POINTER)
-				type_die = state->regs[sreg].type;
-
-			/* Check if the target type has a member at the new offset */
-			if (die_get_member_type(&type_die,
-						src->offset + src_tsr.offset, &type_die) == NULL)
-				return;
-
-			tsr->type = src_tsr.type;
-			tsr->kind = src_tsr.kind;
-			tsr->offset = src->offset + src_tsr.offset;
-			tsr->ok = true;
-
-			pr_debug_dtp("lea [%x] address of %s%#x(reg%d) -> reg%d",
-						insn_offset, src->offset < 0 ? "-" : "",
-						abs(src->offset), sreg, dst->reg1);
-
-			pr_debug_type_name(&tsr->type, tsr->kind);
-		}
-		return;
-	}
-
-	/* Invalidate register states for other ops which may change pointers */
-	if (has_reg_type(state, dst->reg1) && !dst->mem_ref &&
-	    dwarf_tag(&state->regs[dst->reg1].type) == DW_TAG_pointer_type) {
-		if (!strncmp(dl->ins.name, "imul", 4) || !strncmp(dl->ins.name, "mul", 3) ||
-		    !strncmp(dl->ins.name, "idiv", 4) || !strncmp(dl->ins.name, "div", 3) ||
-		    !strncmp(dl->ins.name, "shl", 3)  || !strncmp(dl->ins.name, "shr", 3) ||
-		    !strncmp(dl->ins.name, "sar", 3)  || !strncmp(dl->ins.name, "and", 3) ||
-		    !strncmp(dl->ins.name, "or", 2)   || !strncmp(dl->ins.name, "neg", 3) ||
-		    !strncmp(dl->ins.name, "inc", 3)  || !strncmp(dl->ins.name, "dec", 3)) {
-			pr_debug_dtp("%s [%x] invalidate reg%d\n",
-						dl->ins.name, insn_offset, dst->reg1);
-			state->regs[dst->reg1].ok = false;
-			state->regs[dst->reg1].copied_from = -1;
-			return;
-		}
-
-		if (!strncmp(dl->ins.name, "xor", 3) && dst->reg1 == src->reg1) {
-			/* xor reg, reg clears the register */
-			pr_debug_dtp("xor [%x] clear reg%d\n",
-				     insn_offset, dst->reg1);
-
-			state->regs[dst->reg1].kind = TSR_KIND_CONST;
-			state->regs[dst->reg1].imm_value = 0;
-			state->regs[dst->reg1].ok = true;
-			state->regs[dst->reg1].copied_from = -1;
-			return;
-		}
-	}
-
-	if (strncmp(dl->ins.name, "mov", 3))
-		return;
-
-	if (dloc->fb_cfa) {
-		u64 ip = dloc->ms->sym->start + dl->al.offset;
-		u64 pc = map__rip_2objdump(dloc->ms->map, ip);
-
-		if (die_get_cfa(dloc->di->dbg, pc, &fbreg, &fboff) < 0)
-			fbreg = -1;
-	}
-
-	/* Case 1. register to register or segment:offset to register transfers */
-	if (!src->mem_ref && !dst->mem_ref) {
-		if (!has_reg_type(state, dst->reg1))
-			return;
-
-		tsr = &state->regs[dst->reg1];
-		tsr->copied_from = -1;
-
-		if (dso__kernel(map__dso(dloc->ms->map)) &&
-		    src->segment == INSN_SEG_X86_GS && src->imm) {
-			u64 ip = dloc->ms->sym->start + dl->al.offset;
-			u64 var_addr;
-			int offset;
-
-			/*
-			 * In kernel, %gs points to a per-cpu region for the
-			 * current CPU.  Access with a constant offset should
-			 * be treated as a global variable access.
-			 */
-			var_addr = src->offset;
-
-			if (var_addr == 40) {
-				tsr->kind = TSR_KIND_CANARY;
-				tsr->offset = 0;
-				tsr->ok = true;
-
-				pr_debug_dtp("mov [%x] stack canary -> reg%d\n",
-					     insn_offset, dst->reg1);
-				return;
-			}
-
-			if (!get_global_var_type(cu_die, dloc, ip, var_addr,
-						 &offset, &type_die) ||
-			    !die_get_member_type(&type_die, offset, &type_die)) {
-				tsr->ok = false;
-				return;
-			}
-
-			tsr->type = type_die;
-			tsr->kind = TSR_KIND_TYPE;
-			tsr->offset = 0;
-			tsr->ok = true;
-
-			pr_debug_dtp("mov [%x] this-cpu addr=%#"PRIx64" -> reg%d",
-				     insn_offset, var_addr, dst->reg1);
-			pr_debug_type_name(&tsr->type, tsr->kind);
-			return;
-		}
-
-		if (src->imm) {
-			tsr->kind = TSR_KIND_CONST;
-			tsr->imm_value = src->offset;
-			tsr->offset = 0;
-			tsr->ok = true;
-
-			pr_debug_dtp("mov [%x] imm=%#x -> reg%d\n",
-				     insn_offset, tsr->imm_value, dst->reg1);
-			return;
-		}
-
-		if (!has_reg_type(state, src->reg1) ||
-		    !state->regs[src->reg1].ok) {
-			tsr->ok = false;
-			return;
-		}
-
-		tsr->type = state->regs[src->reg1].type;
-		tsr->kind = state->regs[src->reg1].kind;
-		tsr->imm_value = state->regs[src->reg1].imm_value;
-		tsr->offset = state->regs[src->reg1].offset;
-		tsr->ok = true;
-
-		/* To copy back the variable type later (hopefully) */
-		if (tsr->kind == TSR_KIND_TYPE || tsr->kind == TSR_KIND_POINTER)
-			tsr->copied_from = src->reg1;
-
-		pr_debug_dtp("mov [%x] reg%d -> reg%d",
-			     insn_offset, src->reg1, dst->reg1);
-		pr_debug_type_name(&tsr->type, tsr->kind);
-	}
-	/* Case 2. memory to register transers */
-	if (src->mem_ref && !dst->mem_ref) {
-		int sreg = src->reg1;
-
-		if (!has_reg_type(state, dst->reg1))
-			return;
-
-		tsr = &state->regs[dst->reg1];
-		tsr->copied_from = -1;
-
-retry:
-		/* Check stack variables with offset */
-		if (sreg == fbreg || sreg == state->stack_reg) {
-			struct type_state_stack *stack;
-			int offset = src->offset - fboff;
-
-			stack = find_stack_state(state, offset);
-			if (stack == NULL) {
-				tsr->ok = false;
-				return;
-			} else if (!stack->compound) {
-				tsr->type = stack->type;
-				tsr->kind = stack->kind;
-				tsr->offset = stack->ptr_offset;
-				tsr->ok = true;
-			} else if (die_get_member_type(&stack->type,
-						       offset - stack->offset,
-						       &type_die)) {
-				tsr->type = type_die;
-				tsr->kind = TSR_KIND_TYPE;
-				tsr->offset = 0;
-				tsr->ok = true;
-			} else {
-				tsr->ok = false;
-				return;
-			}
-
-			if (sreg == fbreg) {
-				pr_debug_dtp("mov [%x] -%#x(stack) -> reg%d",
-					     insn_offset, -offset, dst->reg1);
-			} else {
-				pr_debug_dtp("mov [%x] %#x(reg%d) -> reg%d",
-					     insn_offset, offset, sreg, dst->reg1);
-			}
-			pr_debug_type_name(&tsr->type, tsr->kind);
-		}
-		/* And then dereference the pointer if it has one */
-		else if (has_reg_type(state, sreg) && state->regs[sreg].ok &&
-			 state->regs[sreg].kind == TSR_KIND_TYPE &&
-			 die_deref_ptr_type(&state->regs[sreg].type,
-					    src->offset + state->regs[sreg].offset, &type_die)) {
-			tsr->type = type_die;
-			tsr->kind = TSR_KIND_TYPE;
-			tsr->offset = 0;
-			tsr->ok = true;
-
-			pr_debug_dtp("mov [%x] %#x(reg%d) -> reg%d",
-				     insn_offset, src->offset, sreg, dst->reg1);
-			pr_debug_type_name(&tsr->type, tsr->kind);
-		}
-		/* Handle dereference of TSR_KIND_POINTER registers */
-		else if (has_reg_type(state, sreg) && state->regs[sreg].ok &&
-			 state->regs[sreg].kind == TSR_KIND_POINTER &&
-			 die_get_member_type(&state->regs[sreg].type,
-					     src->offset + state->regs[sreg].offset, &type_die)) {
-			tsr->type = state->regs[sreg].type;
-			tsr->kind = TSR_KIND_TYPE;
-			tsr->offset = src->offset + state->regs[sreg].offset;
-			tsr->ok = true;
-
-			pr_debug_dtp("mov [%x] addr %#x(reg%d) -> reg%d",
-				     insn_offset, src->offset, sreg, dst->reg1);
-			pr_debug_type_name(&tsr->type, tsr->kind);
-		}
-		/* Or check if it's a global variable */
-		else if (sreg == DWARF_REG_PC) {
-			struct map_symbol *ms = dloc->ms;
-			u64 ip = ms->sym->start + dl->al.offset;
-			u64 addr;
-			int offset;
-
-			addr = annotate_calc_pcrel(ms, ip, src->offset, dl);
-
-			if (!get_global_var_type(cu_die, dloc, ip, addr, &offset,
-						 &type_die) ||
-			    !die_get_member_type(&type_die, offset, &type_die)) {
-				tsr->ok = false;
-				return;
-			}
-
-			tsr->type = type_die;
-			tsr->kind = TSR_KIND_TYPE;
-			tsr->offset = 0;
-			tsr->ok = true;
-
-			pr_debug_dtp("mov [%x] global addr=%"PRIx64" -> reg%d",
-				     insn_offset, addr, dst->reg1);
-			pr_debug_type_name(&type_die, tsr->kind);
-		}
-		/* And check percpu access with base register */
-		else if (has_reg_type(state, sreg) &&
-			 state->regs[sreg].kind == TSR_KIND_PERCPU_BASE) {
-			u64 ip = dloc->ms->sym->start + dl->al.offset;
-			u64 var_addr = src->offset;
-			int offset;
-
-			if (src->multi_regs) {
-				int reg2 = (sreg == src->reg1) ? src->reg2 : src->reg1;
-
-				if (has_reg_type(state, reg2) && state->regs[reg2].ok &&
-				    state->regs[reg2].kind == TSR_KIND_CONST)
-					var_addr += state->regs[reg2].imm_value;
-			}
-
-			/*
-			 * In kernel, %gs points to a per-cpu region for the
-			 * current CPU.  Access with a constant offset should
-			 * be treated as a global variable access.
-			 */
-			if (get_global_var_type(cu_die, dloc, ip, var_addr,
-						&offset, &type_die) &&
-			    die_get_member_type(&type_die, offset, &type_die)) {
-				tsr->type = type_die;
-				tsr->kind = TSR_KIND_TYPE;
-				tsr->offset = 0;
-				tsr->ok = true;
-
-				if (src->multi_regs) {
-					pr_debug_dtp("mov [%x] percpu %#x(reg%d,reg%d) -> reg%d",
-						     insn_offset, src->offset, src->reg1,
-						     src->reg2, dst->reg1);
-				} else {
-					pr_debug_dtp("mov [%x] percpu %#x(reg%d) -> reg%d",
-						     insn_offset, src->offset, sreg, dst->reg1);
-				}
-				pr_debug_type_name(&tsr->type, tsr->kind);
-			} else {
-				tsr->ok = false;
-			}
-		}
-		/* And then dereference the calculated pointer if it has one */
-		else if (has_reg_type(state, sreg) && state->regs[sreg].ok &&
-			 state->regs[sreg].kind == TSR_KIND_PERCPU_POINTER &&
-			 die_get_member_type(&state->regs[sreg].type,
-					     src->offset, &type_die)) {
-			tsr->type = type_die;
-			tsr->kind = TSR_KIND_TYPE;
-			tsr->offset = 0;
-			tsr->ok = true;
-
-			pr_debug_dtp("mov [%x] pointer %#x(reg%d) -> reg%d",
-				     insn_offset, src->offset, sreg, dst->reg1);
-			pr_debug_type_name(&tsr->type, tsr->kind);
-		}
-		/* Or try another register if any */
-		else if (src->multi_regs && sreg == src->reg1 &&
-			 src->reg1 != src->reg2) {
-			sreg = src->reg2;
-			goto retry;
-		}
-		else {
-			int offset;
-			const char *var_name = NULL;
-
-			/* it might be per-cpu variable (in kernel) access */
-			if (src->offset < 0) {
-				if (get_global_var_info(dloc, (s64)src->offset,
-							&var_name, &offset) &&
-				    !strcmp(var_name, "__per_cpu_offset")) {
-					tsr->kind = TSR_KIND_PERCPU_BASE;
-					tsr->offset = 0;
-					tsr->ok = true;
-
-					pr_debug_dtp("mov [%x] percpu base reg%d\n",
-						     insn_offset, dst->reg1);
-					return;
-				}
-			}
-
-			tsr->ok = false;
-		}
-	}
-	/* Case 3. register to memory transfers */
-	if (!src->mem_ref && dst->mem_ref) {
-		if (!has_reg_type(state, src->reg1) ||
-		    !state->regs[src->reg1].ok)
-			return;
-
-		/* Check stack variables with offset */
-		if (dst->reg1 == fbreg || dst->reg1 == state->stack_reg) {
-			struct type_state_stack *stack;
-			int offset = dst->offset - fboff;
-
-			tsr = &state->regs[src->reg1];
-
-			stack = find_stack_state(state, offset);
-			if (stack) {
-				/*
-				 * The source register is likely to hold a type
-				 * of member if it's a compound type.  Do not
-				 * update the stack variable type since we can
-				 * get the member type later by using the
-				 * die_get_member_type().
-				 */
-				if (!stack->compound)
-					set_stack_state(stack, offset, tsr->kind,
-							&tsr->type, tsr->offset);
-			} else {
-				findnew_stack_state(state, offset, tsr->kind,
-						    &tsr->type, tsr->offset);
-			}
-
-			if (dst->reg1 == fbreg) {
-				pr_debug_dtp("mov [%x] reg%d -> -%#x(stack)",
-					     insn_offset, src->reg1, -offset);
-			} else {
-				pr_debug_dtp("mov [%x] reg%d -> %#x(reg%d)",
-					     insn_offset, src->reg1, offset, dst->reg1);
-			}
-			if (tsr->offset != 0) {
-				pr_debug_dtp(" reg%d offset %#x ->",
-					src->reg1, tsr->offset);
-			}
-
-			pr_debug_type_name(&tsr->type, tsr->kind);
-		}
-		/*
-		 * Ignore other transfers since it'd set a value in a struct
-		 * and won't change the type.
-		 */
-	}
-	/* Case 4. memory to memory transfers (not handled for now) */
-}
-#endif
diff --git a/tools/perf/util/Build b/tools/perf/util/Build
index c30ff257f8b4..b9925c6902ca 100644
--- a/tools/perf/util/Build
+++ b/tools/perf/util/Build
@@ -1,6 +1,7 @@
 include $(srctree)/tools/scripts/Makefile.include
 include $(srctree)/tools/scripts/utilities.mak
 
+perf-util-y += annotate-arch/
 perf-util-y += arm64-frame-pointer-unwind-support.o
 perf-util-y += addr2line.o
 perf-util-y += addr_location.o
diff --git a/tools/perf/util/annotate-arch/Build b/tools/perf/util/annotate-arch/Build
new file mode 100644
index 000000000000..23316743fdc5
--- /dev/null
+++ b/tools/perf/util/annotate-arch/Build
@@ -0,0 +1,11 @@
+perf-util-y += annotate-arc.o
+perf-util-y += annotate-arm.o
+perf-util-y += annotate-arm64.o
+perf-util-y += annotate-csky.o
+perf-util-y += annotate-loongarch.o
+perf-util-y += annotate-mips.o
+perf-util-y += annotate-x86.o
+perf-util-y += annotate-powerpc.o
+perf-util-y += annotate-riscv64.o
+perf-util-y += annotate-s390.o
+perf-util-y += annotate-sparc.o
diff --git a/tools/perf/util/annotate-arch/annotate-arc.c b/tools/perf/util/annotate-arch/annotate-arc.c
new file mode 100644
index 000000000000..d7ca08ca5600
--- /dev/null
+++ b/tools/perf/util/annotate-arch/annotate-arc.c
@@ -0,0 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/compiler.h>
+#include "../disasm.h"
+
+int arc__annotate_init(struct arch *arch, char *cpuid __maybe_unused)
+{
+	arch->initialized = true;
+	arch->objdump.comment_char = ';';
+	return 0;
+}
diff --git a/tools/perf/util/annotate-arch/annotate-arm.c b/tools/perf/util/annotate-arch/annotate-arm.c
new file mode 100644
index 000000000000..08c49067c3c9
--- /dev/null
+++ b/tools/perf/util/annotate-arch/annotate-arm.c
@@ -0,0 +1,65 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <stdlib.h>
+#include <linux/compiler.h>
+#include <linux/zalloc.h>
+#include <errno.h>
+#include <regex.h>
+#include "../annotate.h"
+#include "../disasm.h"
+
+struct arm_annotate {
+	regex_t call_insn,
+		jump_insn;
+};
+
+static const struct ins_ops *arm__associate_instruction_ops(struct arch *arch, const char *name)
+{
+	struct arm_annotate *arm = arch->priv;
+	const struct ins_ops *ops;
+	regmatch_t match[2];
+
+	if (!regexec(&arm->call_insn, name, 2, match, 0))
+		ops = &call_ops;
+	else if (!regexec(&arm->jump_insn, name, 2, match, 0))
+		ops = &jump_ops;
+	else
+		return NULL;
+
+	arch__associate_ins_ops(arch, name, ops);
+	return ops;
+}
+
+int arm__annotate_init(struct arch *arch, char *cpuid __maybe_unused)
+{
+	struct arm_annotate *arm;
+	int err;
+
+	if (arch->initialized)
+		return 0;
+
+	arm = zalloc(sizeof(*arm));
+	if (!arm)
+		return ENOMEM;
+
+#define ARM_CONDS "(cc|cs|eq|ge|gt|hi|le|ls|lt|mi|ne|pl|vc|vs)"
+	err = regcomp(&arm->call_insn, "^blx?" ARM_CONDS "?$", REG_EXTENDED);
+	if (err)
+		goto out_free_arm;
+	err = regcomp(&arm->jump_insn, "^bx?" ARM_CONDS "?$", REG_EXTENDED);
+	if (err)
+		goto out_free_call;
+#undef ARM_CONDS
+
+	arch->initialized = true;
+	arch->priv	  = arm;
+	arch->associate_instruction_ops   = arm__associate_instruction_ops;
+	arch->objdump.comment_char	  = ';';
+	arch->objdump.skip_functions_char = '+';
+	return 0;
+
+out_free_call:
+	regfree(&arm->call_insn);
+out_free_arm:
+	free(arm);
+	return SYMBOL_ANNOTATE_ERRNO__ARCH_INIT_REGEXP;
+}
diff --git a/tools/perf/util/annotate-arch/annotate-arm64.c b/tools/perf/util/annotate-arch/annotate-arm64.c
new file mode 100644
index 000000000000..d2ea32984b0d
--- /dev/null
+++ b/tools/perf/util/annotate-arch/annotate-arm64.c
@@ -0,0 +1,124 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/compiler.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+#include <linux/zalloc.h>
+#include <regex.h>
+#include "../annotate.h"
+#include "../disasm.h"
+
+struct arm64_annotate {
+	regex_t call_insn,
+		jump_insn;
+};
+
+static int arm64_mov__parse(const struct arch *arch __maybe_unused,
+			    struct ins_operands *ops,
+			    struct map_symbol *ms __maybe_unused,
+			    struct disasm_line *dl __maybe_unused)
+{
+	char *s = strchr(ops->raw, ','), *target, *endptr;
+
+	if (s == NULL)
+		return -1;
+
+	*s = '\0';
+	ops->source.raw = strdup(ops->raw);
+	*s = ',';
+
+	if (ops->source.raw == NULL)
+		return -1;
+
+	target = ++s;
+	ops->target.raw = strdup(target);
+	if (ops->target.raw == NULL)
+		goto out_free_source;
+
+	ops->target.addr = strtoull(target, &endptr, 16);
+	if (endptr == target)
+		goto out_free_target;
+
+	s = strchr(endptr, '<');
+	if (s == NULL)
+		goto out_free_target;
+	endptr = strchr(s + 1, '>');
+	if (endptr == NULL)
+		goto out_free_target;
+
+	*endptr = '\0';
+	*s = ' ';
+	ops->target.name = strdup(s);
+	*s = '<';
+	*endptr = '>';
+	if (ops->target.name == NULL)
+		goto out_free_target;
+
+	return 0;
+
+out_free_target:
+	zfree(&ops->target.raw);
+out_free_source:
+	zfree(&ops->source.raw);
+	return -1;
+}
+
+static const struct ins_ops arm64_mov_ops = {
+	.parse	   = arm64_mov__parse,
+	.scnprintf = mov__scnprintf,
+};
+
+static const struct ins_ops *arm64__associate_instruction_ops(struct arch *arch, const char *name)
+{
+	struct arm64_annotate *arm = arch->priv;
+	const struct ins_ops *ops;
+	regmatch_t match[2];
+
+	if (!regexec(&arm->jump_insn, name, 2, match, 0))
+		ops = &jump_ops;
+	else if (!regexec(&arm->call_insn, name, 2, match, 0))
+		ops = &call_ops;
+	else if (!strcmp(name, "ret"))
+		ops = &ret_ops;
+	else
+		ops = &arm64_mov_ops;
+
+	arch__associate_ins_ops(arch, name, ops);
+	return ops;
+}
+
+int arm64__annotate_init(struct arch *arch, char *cpuid __maybe_unused)
+{
+	struct arm64_annotate *arm;
+	int err;
+
+	if (arch->initialized)
+		return 0;
+
+	arm = zalloc(sizeof(*arm));
+	if (!arm)
+		return ENOMEM;
+
+	/* bl, blr */
+	err = regcomp(&arm->call_insn, "^blr?$", REG_EXTENDED);
+	if (err)
+		goto out_free_arm;
+	/* b, b.cond, br, cbz/cbnz, tbz/tbnz */
+	err = regcomp(&arm->jump_insn, "^[ct]?br?\\.?(cc|cs|eq|ge|gt|hi|hs|le|lo|ls|lt|mi|ne|pl|vc|vs)?n?z?$",
+		      REG_EXTENDED);
+	if (err)
+		goto out_free_call;
+
+	arch->initialized = true;
+	arch->priv	  = arm;
+	arch->associate_instruction_ops   = arm64__associate_instruction_ops;
+	arch->objdump.comment_char	  = '/';
+	arch->objdump.skip_functions_char = '+';
+	return 0;
+
+out_free_call:
+	regfree(&arm->call_insn);
+out_free_arm:
+	free(arm);
+	return SYMBOL_ANNOTATE_ERRNO__ARCH_INIT_REGEXP;
+}
diff --git a/tools/perf/util/annotate-arch/annotate-csky.c b/tools/perf/util/annotate-arch/annotate-csky.c
new file mode 100644
index 000000000000..0b0b09b068ec
--- /dev/null
+++ b/tools/perf/util/annotate-arch/annotate-csky.c
@@ -0,0 +1,48 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (C) 2019 Hangzhou C-SKY Microsystems co.,ltd.
+#include <string.h>
+#include <linux/compiler.h>
+#include "../disasm.h"
+
+static const struct ins_ops *csky__associate_ins_ops(struct arch *arch,
+						     const char *name)
+{
+	const struct ins_ops *ops = NULL;
+
+	/* catch all kind of jumps */
+	if (!strcmp(name, "bt") ||
+	    !strcmp(name, "bf") ||
+	    !strcmp(name, "bez") ||
+	    !strcmp(name, "bnez") ||
+	    !strcmp(name, "bnezad") ||
+	    !strcmp(name, "bhsz") ||
+	    !strcmp(name, "bhz") ||
+	    !strcmp(name, "blsz") ||
+	    !strcmp(name, "blz") ||
+	    !strcmp(name, "br") ||
+	    !strcmp(name, "jmpi") ||
+	    !strcmp(name, "jmp"))
+		ops = &jump_ops;
+
+	/* catch function call */
+	if (!strcmp(name, "bsr") ||
+	    !strcmp(name, "jsri") ||
+	    !strcmp(name, "jsr"))
+		ops = &call_ops;
+
+	/* catch function return */
+	if (!strcmp(name, "rts"))
+		ops = &ret_ops;
+
+	if (ops)
+		arch__associate_ins_ops(arch, name, ops);
+	return ops;
+}
+
+int csky__annotate_init(struct arch *arch, char *cpuid __maybe_unused)
+{
+	arch->initialized = true;
+	arch->objdump.comment_char = '/';
+	arch->associate_instruction_ops = csky__associate_ins_ops;
+	return 0;
+}
diff --git a/tools/perf/util/annotate-arch/annotate-loongarch.c b/tools/perf/util/annotate-arch/annotate-loongarch.c
new file mode 100644
index 000000000000..32df10f6fed5
--- /dev/null
+++ b/tools/perf/util/annotate-arch/annotate-loongarch.c
@@ -0,0 +1,148 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Perf annotate functions.
+ *
+ * Copyright (C) 2020-2023 Loongson Technology Corporation Limited
+ */
+#include <stdlib.h>
+#include <string.h>
+#include <linux/compiler.h>
+#include "../disasm.h"
+#include "../map.h"
+#include "../maps.h"
+#include "../symbol.h"
+
+static int loongarch_call__parse(const struct arch *arch, struct ins_operands *ops,
+				 struct map_symbol *ms,
+				 struct disasm_line *dl __maybe_unused)
+{
+	char *c, *endptr, *tok, *name;
+	struct map *map = ms->map;
+	struct addr_map_symbol target;
+
+	c = strchr(ops->raw, '#');
+	if (c++ == NULL)
+		return -1;
+
+	ops->target.addr = strtoull(c, &endptr, 16);
+
+	name = strchr(endptr, '<');
+	name++;
+
+	if (arch->objdump.skip_functions_char &&
+	    strchr(name, arch->objdump.skip_functions_char))
+		return -1;
+
+	tok = strchr(name, '>');
+	if (tok == NULL)
+		return -1;
+
+	*tok = '\0';
+	ops->target.name = strdup(name);
+	*tok = '>';
+
+	if (ops->target.name == NULL)
+		return -1;
+
+	target = (struct addr_map_symbol) {
+		.ms = { .map = map__get(map), },
+		.addr = map__objdump_2mem(map, ops->target.addr),
+	};
+
+	if (maps__find_ams(ms->maps, &target) == 0 &&
+	    map__rip_2objdump(target.ms.map, map__map_ip(target.ms.map, target.addr)) == ops->target.addr)
+		ops->target.sym = target.ms.sym;
+
+	addr_map_symbol__exit(&target);
+	return 0;
+}
+
+const struct ins_ops loongarch_call_ops = {
+	.parse	   = loongarch_call__parse,
+	.scnprintf = call__scnprintf,
+};
+
+static int loongarch_jump__parse(const struct arch *arch, struct ins_operands *ops,
+				 struct map_symbol *ms,
+				 struct disasm_line *dl __maybe_unused)
+
+{
+	struct map *map = ms->map;
+	struct symbol *sym = ms->sym;
+	struct addr_map_symbol target = {
+		.ms = { .map = map__get(map), },
+	};
+	const char *c = strchr(ops->raw, '#');
+	u64 start, end;
+
+	ops->jump.raw_comment = strchr(ops->raw, arch->objdump.comment_char);
+	ops->jump.raw_func_start = strchr(ops->raw, '<');
+
+	if (ops->jump.raw_func_start && c > ops->jump.raw_func_start)
+		c = NULL;
+
+	if (c++ != NULL)
+		ops->target.addr = strtoull(c, NULL, 16);
+	else
+		ops->target.addr = strtoull(ops->raw, NULL, 16);
+
+	target.addr = map__objdump_2mem(map, ops->target.addr);
+	start = map__unmap_ip(map, sym->start);
+	end = map__unmap_ip(map, sym->end);
+
+	ops->target.outside = target.addr < start || target.addr > end;
+
+	if (maps__find_ams(ms->maps, &target) == 0 &&
+	    map__rip_2objdump(target.ms.map, map__map_ip(target.ms.map, target.addr)) == ops->target.addr)
+		ops->target.sym = target.ms.sym;
+
+	if (!ops->target.outside) {
+		ops->target.offset = target.addr - start;
+		ops->target.offset_avail = true;
+	} else {
+		ops->target.offset_avail = false;
+	}
+	addr_map_symbol__exit(&target);
+	return 0;
+}
+
+const struct ins_ops loongarch_jump_ops = {
+	.parse	   = loongarch_jump__parse,
+	.scnprintf = jump__scnprintf,
+};
+
+static
+const struct ins_ops *loongarch__associate_ins_ops(struct arch *arch, const char *name)
+{
+	const struct ins_ops *ops = NULL;
+
+	if (!strcmp(name, "bl"))
+		ops = &loongarch_call_ops;
+	else if (!strcmp(name, "jirl"))
+		ops = &ret_ops;
+	else if (!strcmp(name, "b") ||
+		 !strncmp(name, "beq", 3) ||
+		 !strncmp(name, "bne", 3) ||
+		 !strncmp(name, "blt", 3) ||
+		 !strncmp(name, "bge", 3) ||
+		 !strncmp(name, "bltu", 4) ||
+		 !strncmp(name, "bgeu", 4))
+		ops = &loongarch_jump_ops;
+	else
+		return NULL;
+
+	arch__associate_ins_ops(arch, name, ops);
+
+	return ops;
+}
+
+int loongarch__annotate_init(struct arch *arch, char *cpuid __maybe_unused)
+{
+	if (!arch->initialized) {
+		arch->associate_instruction_ops = loongarch__associate_ins_ops;
+		arch->initialized = true;
+		arch->objdump.comment_char = '#';
+	}
+
+	return 0;
+}
diff --git a/tools/perf/util/annotate-arch/annotate-mips.c b/tools/perf/util/annotate-arch/annotate-mips.c
new file mode 100644
index 000000000000..f14b34ed77d3
--- /dev/null
+++ b/tools/perf/util/annotate-arch/annotate-mips.c
@@ -0,0 +1,48 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <string.h>
+#include <linux/compiler.h>
+#include "../disasm.h"
+
+static
+const struct ins_ops *mips__associate_ins_ops(struct arch *arch, const char *name)
+{
+	const struct ins_ops *ops = NULL;
+
+	if (!strncmp(name, "bal", 3) ||
+	    !strncmp(name, "bgezal", 6) ||
+	    !strncmp(name, "bltzal", 6) ||
+	    !strncmp(name, "bgtzal", 6) ||
+	    !strncmp(name, "blezal", 6) ||
+	    !strncmp(name, "beqzal", 6) ||
+	    !strncmp(name, "bnezal", 6) ||
+	    !strncmp(name, "bgtzl", 5) ||
+	    !strncmp(name, "bltzl", 5) ||
+	    !strncmp(name, "bgezl", 5) ||
+	    !strncmp(name, "blezl", 5) ||
+	    !strncmp(name, "jialc", 5) ||
+	    !strncmp(name, "beql", 4) ||
+	    !strncmp(name, "bnel", 4) ||
+	    !strncmp(name, "jal", 3))
+		ops = &call_ops;
+	else if (!strncmp(name, "jr", 2))
+		ops = &ret_ops;
+	else if (name[0] == 'j' || name[0] == 'b')
+		ops = &jump_ops;
+	else
+		return NULL;
+
+	arch__associate_ins_ops(arch, name, ops);
+
+	return ops;
+}
+
+int mips__annotate_init(struct arch *arch, char *cpuid __maybe_unused)
+{
+	if (!arch->initialized) {
+		arch->associate_instruction_ops = mips__associate_ins_ops;
+		arch->initialized = true;
+		arch->objdump.comment_char = '#';
+	}
+
+	return 0;
+}
diff --git a/tools/perf/util/annotate-arch/annotate-powerpc.c b/tools/perf/util/annotate-arch/annotate-powerpc.c
new file mode 100644
index 000000000000..593c138c8104
--- /dev/null
+++ b/tools/perf/util/annotate-arch/annotate-powerpc.c
@@ -0,0 +1,406 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <string.h>
+#include <linux/compiler.h>
+#include <linux/kernel.h>
+#include "../annotate-data.h"
+#include "../debug.h"
+#include "../disasm.h"
+
+#define PPC_OP(op)	(((op) >> 26) & 0x3F)
+#define PPC_21_30(R)	(((R) >> 1) & 0x3ff)
+#define PPC_22_30(R)	(((R) >> 1) & 0x1ff)
+
+#define MINUS_EXT_XO_FORM	234
+#define SUB_EXT_XO_FORM		232
+#define	ADD_ZERO_EXT_XO_FORM	202
+#define	SUB_ZERO_EXT_XO_FORM	200
+
+static int arithmetic__scnprintf(const struct ins *ins, char *bf, size_t size,
+		struct ins_operands *ops, int max_ins_name)
+{
+	return scnprintf(bf, size, "%-*s %s", max_ins_name, ins->name,
+			ops->raw);
+}
+
+/*
+ * Sets the fields: multi_regs and "mem_ref".
+ * "mem_ref" is set for ops->source which is later used to
+ * fill the objdump->memory_ref-char field. This ops is currently
+ * used by powerpc and since binary instruction code is used to
+ * extract opcode, regs and offset, no other parsing is needed here.
+ *
+ * Dont set multi regs for 4 cases since it has only one operand
+ * for source:
+ * - Add to Minus One Extended XO-form ( Ex: addme, addmeo )
+ * - Subtract From Minus One Extended XO-form ( Ex: subfme )
+ * - Add to Zero Extended XO-form ( Ex: addze, addzeo )
+ * - Subtract From Zero Extended XO-form ( Ex: subfze )
+ */
+static int arithmetic__parse(const struct arch *arch __maybe_unused, struct ins_operands *ops,
+		struct map_symbol *ms __maybe_unused, struct disasm_line *dl)
+{
+	int opcode = PPC_OP(dl->raw.raw_insn);
+
+	ops->source.mem_ref = false;
+	if (opcode == 31) {
+		if ((opcode != MINUS_EXT_XO_FORM) && (opcode != SUB_EXT_XO_FORM) &&
+		    (opcode != ADD_ZERO_EXT_XO_FORM) && (opcode != SUB_ZERO_EXT_XO_FORM))
+			ops->source.multi_regs = true;
+	}
+
+	ops->target.mem_ref = false;
+	ops->target.multi_regs = false;
+
+	return 0;
+}
+
+static const struct ins_ops arithmetic_ops = {
+	.parse     = arithmetic__parse,
+	.scnprintf = arithmetic__scnprintf,
+};
+
+static int load_store__scnprintf(const struct ins *ins, char *bf, size_t size,
+		struct ins_operands *ops, int max_ins_name)
+{
+	return scnprintf(bf, size, "%-*s %s", max_ins_name, ins->name,
+			ops->raw);
+}
+
+/*
+ * Sets the fields: multi_regs and "mem_ref".
+ * "mem_ref" is set for ops->source which is later used to
+ * fill the objdump->memory_ref-char field. This ops is currently
+ * used by powerpc and since binary instruction code is used to
+ * extract opcode, regs and offset, no other parsing is needed here
+ */
+static int load_store__parse(const struct arch *arch __maybe_unused, struct ins_operands *ops,
+		struct map_symbol *ms __maybe_unused, struct disasm_line *dl __maybe_unused)
+{
+	ops->source.mem_ref = true;
+	ops->source.multi_regs = false;
+	/* opcode 31 is of X form */
+	if (PPC_OP(dl->raw.raw_insn) == 31)
+		ops->source.multi_regs = true;
+
+	ops->target.mem_ref = false;
+	ops->target.multi_regs = false;
+
+	return 0;
+}
+
+static const struct ins_ops load_store_ops = {
+	.parse     = load_store__parse,
+	.scnprintf = load_store__scnprintf,
+};
+
+static const struct ins_ops *powerpc__associate_instruction_ops(struct arch *arch, const char *name)
+{
+	int i;
+	const struct ins_ops *ops;
+
+	/*
+	 * - Interested only if instruction starts with 'b'.
+	 * - Few start with 'b', but aren't branch instructions.
+	 */
+	if (name[0] != 'b'             ||
+	    !strncmp(name, "bcd", 3)   ||
+	    !strncmp(name, "brinc", 5) ||
+	    !strncmp(name, "bper", 4))
+		return NULL;
+
+	ops = &jump_ops;
+
+	i = strlen(name) - 1;
+	if (i < 0)
+		return NULL;
+
+	/* ignore optional hints at the end of the instructions */
+	if (name[i] == '+' || name[i] == '-')
+		i--;
+
+	if (name[i] == 'l' || (name[i] == 'a' && name[i-1] == 'l')) {
+		/*
+		 * if the instruction ends up with 'l' or 'la', then
+		 * those are considered 'calls' since they update LR.
+		 * ... except for 'bnl' which is branch if not less than
+		 * and the absolute form of the same.
+		 */
+		if (strcmp(name, "bnl") && strcmp(name, "bnl+") &&
+		    strcmp(name, "bnl-") && strcmp(name, "bnla") &&
+		    strcmp(name, "bnla+") && strcmp(name, "bnla-"))
+			ops = &call_ops;
+	}
+	if (name[i] == 'r' && name[i-1] == 'l')
+		/*
+		 * instructions ending with 'lr' are considered to be
+		 * return instructions
+		 */
+		ops = &ret_ops;
+
+	arch__associate_ins_ops(arch, name, ops);
+	return ops;
+}
+
+struct insn_offset {
+	const char	*name;
+	int		value;
+};
+
+/*
+ * There are memory instructions with opcode 31 which are
+ * of X Form, Example:
+ * ldx RT,RA,RB
+ * ______________________________________
+ * | 31 |  RT  |  RA |  RB |   21     |/|
+ * --------------------------------------
+ * 0    6     11    16    21         30 31
+ *
+ * But all instructions with opcode 31 are not memory.
+ * Example: add RT,RA,RB
+ *
+ * Use bits 21 to 30 to check memory insns with 31 as opcode.
+ * In ins_array below, for ldx instruction:
+ * name => OP_31_XOP_LDX
+ * value => 21
+ */
+
+static struct insn_offset ins_array[] = {
+	{ .name = "OP_31_XOP_LXSIWZX",  .value = 12, },
+	{ .name = "OP_31_XOP_LWARX",	.value = 20, },
+	{ .name = "OP_31_XOP_LDX",	.value = 21, },
+	{ .name = "OP_31_XOP_LWZX",	.value = 23, },
+	{ .name = "OP_31_XOP_LDUX",	.value = 53, },
+	{ .name = "OP_31_XOP_LWZUX",	.value = 55, },
+	{ .name = "OP_31_XOP_LXSIWAX",  .value = 76, },
+	{ .name = "OP_31_XOP_LDARX",    .value = 84, },
+	{ .name = "OP_31_XOP_LBZX",	.value = 87, },
+	{ .name = "OP_31_XOP_LVX",      .value = 103, },
+	{ .name = "OP_31_XOP_LBZUX",    .value = 119, },
+	{ .name = "OP_31_XOP_STXSIWX",  .value = 140, },
+	{ .name = "OP_31_XOP_STDX",	.value = 149, },
+	{ .name = "OP_31_XOP_STWX",	.value = 151, },
+	{ .name = "OP_31_XOP_STDUX",	.value = 181, },
+	{ .name = "OP_31_XOP_STWUX",	.value = 183, },
+	{ .name = "OP_31_XOP_STBX",	.value = 215, },
+	{ .name = "OP_31_XOP_STVX",     .value = 231, },
+	{ .name = "OP_31_XOP_STBUX",	.value = 247, },
+	{ .name = "OP_31_XOP_LHZX",	.value = 279, },
+	{ .name = "OP_31_XOP_LHZUX",	.value = 311, },
+	{ .name = "OP_31_XOP_LXVDSX",   .value = 332, },
+	{ .name = "OP_31_XOP_LWAX",	.value = 341, },
+	{ .name = "OP_31_XOP_LHAX",	.value = 343, },
+	{ .name = "OP_31_XOP_LWAUX",	.value = 373, },
+	{ .name = "OP_31_XOP_LHAUX",	.value = 375, },
+	{ .name = "OP_31_XOP_STHX",	.value = 407, },
+	{ .name = "OP_31_XOP_STHUX",	.value = 439, },
+	{ .name = "OP_31_XOP_LXSSPX",   .value = 524, },
+	{ .name = "OP_31_XOP_LDBRX",	.value = 532, },
+	{ .name = "OP_31_XOP_LSWX",	.value = 533, },
+	{ .name = "OP_31_XOP_LWBRX",	.value = 534, },
+	{ .name = "OP_31_XOP_LFSUX",    .value = 567, },
+	{ .name = "OP_31_XOP_LXSDX",    .value = 588, },
+	{ .name = "OP_31_XOP_LSWI",	.value = 597, },
+	{ .name = "OP_31_XOP_LFDX",     .value = 599, },
+	{ .name = "OP_31_XOP_LFDUX",    .value = 631, },
+	{ .name = "OP_31_XOP_STXSSPX",  .value = 652, },
+	{ .name = "OP_31_XOP_STDBRX",	.value = 660, },
+	{ .name = "OP_31_XOP_STXWX",	.value = 661, },
+	{ .name = "OP_31_XOP_STWBRX",	.value = 662, },
+	{ .name = "OP_31_XOP_STFSX",	.value = 663, },
+	{ .name = "OP_31_XOP_STFSUX",	.value = 695, },
+	{ .name = "OP_31_XOP_STXSDX",   .value = 716, },
+	{ .name = "OP_31_XOP_STSWI",	.value = 725, },
+	{ .name = "OP_31_XOP_STFDX",	.value = 727, },
+	{ .name = "OP_31_XOP_STFDUX",	.value = 759, },
+	{ .name = "OP_31_XOP_LXVW4X",   .value = 780, },
+	{ .name = "OP_31_XOP_LHBRX",	.value = 790, },
+	{ .name = "OP_31_XOP_LXVD2X",   .value = 844, },
+	{ .name = "OP_31_XOP_LFIWAX",	.value = 855, },
+	{ .name = "OP_31_XOP_LFIWZX",	.value = 887, },
+	{ .name = "OP_31_XOP_STXVW4X",  .value = 908, },
+	{ .name = "OP_31_XOP_STHBRX",	.value = 918, },
+	{ .name = "OP_31_XOP_STXVD2X",  .value = 972, },
+	{ .name = "OP_31_XOP_STFIWX",	.value = 983, },
+};
+
+/*
+ * Arithmetic instructions which are having opcode as 31.
+ * These instructions are tracked to save the register state
+ * changes. Example:
+ *
+ * lwz	r10,264(r3)
+ * add	r31, r3, r3
+ * lwz	r9, 0(r31)
+ *
+ * Here instruction tracking needs to identify the "add"
+ * instruction and save data type of r3 to r31. If a sample
+ * is hit at next "lwz r9, 0(r31)", by this instruction tracking,
+ * data type of r31 can be resolved.
+ */
+static struct insn_offset arithmetic_ins_op_31[] = {
+	{ .name = "SUB_CARRY_XO_FORM",  .value = 8, },
+	{ .name = "MUL_HDW_XO_FORM1",   .value = 9, },
+	{ .name = "ADD_CARRY_XO_FORM",  .value = 10, },
+	{ .name = "MUL_HW_XO_FORM1",    .value = 11, },
+	{ .name = "SUB_XO_FORM",        .value = 40, },
+	{ .name = "MUL_HDW_XO_FORM",    .value = 73, },
+	{ .name = "MUL_HW_XO_FORM",     .value = 75, },
+	{ .name = "SUB_EXT_XO_FORM",    .value = 136, },
+	{ .name = "ADD_EXT_XO_FORM",    .value = 138, },
+	{ .name = "SUB_ZERO_EXT_XO_FORM",       .value = 200, },
+	{ .name = "ADD_ZERO_EXT_XO_FORM",       .value = 202, },
+	{ .name = "SUB_EXT_XO_FORM2",   .value = 232, },
+	{ .name = "MUL_DW_XO_FORM",     .value = 233, },
+	{ .name = "ADD_EXT_XO_FORM2",   .value = 234, },
+	{ .name = "MUL_W_XO_FORM",      .value = 235, },
+	{ .name = "ADD_XO_FORM",	.value = 266, },
+	{ .name = "DIV_DW_XO_FORM1",    .value = 457, },
+	{ .name = "DIV_W_XO_FORM1",     .value = 459, },
+	{ .name = "DIV_DW_XO_FORM",	.value = 489, },
+	{ .name = "DIV_W_XO_FORM",	.value = 491, },
+};
+
+static struct insn_offset arithmetic_two_ops[] = {
+	{ .name = "mulli",      .value = 7, },
+	{ .name = "subfic",     .value = 8, },
+	{ .name = "addic",      .value = 12, },
+	{ .name = "addic.",     .value = 13, },
+	{ .name = "addi",       .value = 14, },
+	{ .name = "addis",      .value = 15, },
+};
+
+static int cmp_offset(const void *a, const void *b)
+{
+	const struct insn_offset *val1 = a;
+	const struct insn_offset *val2 = b;
+
+	return (val1->value - val2->value);
+}
+
+const struct ins_ops *check_ppc_insn(struct disasm_line *dl)
+{
+	int raw_insn = dl->raw.raw_insn;
+	int opcode = PPC_OP(raw_insn);
+	int mem_insn_31 = PPC_21_30(raw_insn);
+	struct insn_offset *ret;
+	struct insn_offset mem_insns_31_opcode = {
+		"OP_31_INSN",
+		mem_insn_31
+	};
+	char name_insn[32];
+
+	/*
+	 * Instructions with opcode 32 to 63 are memory
+	 * instructions in powerpc
+	 */
+	if ((opcode & 0x20)) {
+		/*
+		 * Set name in case of raw instruction to
+		 * opcode to be used in insn-stat
+		 */
+		if (!strlen(dl->ins.name)) {
+			sprintf(name_insn, "%d", opcode);
+			dl->ins.name = strdup(name_insn);
+		}
+		return &load_store_ops;
+	} else if (opcode == 31) {
+		/* Check for memory instructions with opcode 31 */
+		ret = bsearch(&mem_insns_31_opcode, ins_array, ARRAY_SIZE(ins_array), sizeof(ins_array[0]), cmp_offset);
+		if (ret) {
+			if (!strlen(dl->ins.name))
+				dl->ins.name = strdup(ret->name);
+			return &load_store_ops;
+		} else {
+			mem_insns_31_opcode.value = PPC_22_30(raw_insn);
+			ret = bsearch(&mem_insns_31_opcode, arithmetic_ins_op_31, ARRAY_SIZE(arithmetic_ins_op_31),
+					sizeof(arithmetic_ins_op_31[0]), cmp_offset);
+			if (ret != NULL)
+				return &arithmetic_ops;
+			/* Bits 21 to 30 has value 444 for "mr" insn ie, OR X form */
+			if (PPC_21_30(raw_insn) == 444)
+				return &arithmetic_ops;
+		}
+	} else {
+		mem_insns_31_opcode.value = opcode;
+		ret = bsearch(&mem_insns_31_opcode, arithmetic_two_ops, ARRAY_SIZE(arithmetic_two_ops),
+				sizeof(arithmetic_two_ops[0]), cmp_offset);
+		if (ret != NULL)
+			return &arithmetic_ops;
+	}
+
+	return NULL;
+}
+
+/*
+ * Instruction tracking function to track register state moves.
+ * Example sequence:
+ *    ld      r10,264(r3)
+ *    mr      r31,r3
+ *    <<after some sequence>
+ *    ld      r9,312(r31)
+ *
+ * Previous instruction sequence shows that register state of r3
+ * is moved to r31. update_insn_state_powerpc tracks these state
+ * changes
+ */
+#ifdef HAVE_LIBDW_SUPPORT
+static void update_insn_state_powerpc(struct type_state *state,
+		struct data_loc_info *dloc, Dwarf_Die * cu_die __maybe_unused,
+		struct disasm_line *dl)
+{
+	struct annotated_insn_loc loc;
+	struct annotated_op_loc *src = &loc.ops[INSN_OP_SOURCE];
+	struct annotated_op_loc *dst = &loc.ops[INSN_OP_TARGET];
+	struct type_state_reg *tsr;
+	u32 insn_offset = dl->al.offset;
+
+	if (annotate_get_insn_location(dloc->arch, dl, &loc) < 0)
+		return;
+
+	/*
+	 * Value 444 for bits 21:30 is for "mr"
+	 * instruction. "mr" is extended OR. So set the
+	 * source and destination reg correctly
+	 */
+	if (PPC_21_30(dl->raw.raw_insn) == 444) {
+		int src_reg = src->reg1;
+
+		src->reg1 = dst->reg1;
+		dst->reg1 = src_reg;
+	}
+
+	if (!has_reg_type(state, dst->reg1))
+		return;
+
+	tsr = &state->regs[dst->reg1];
+
+	if (!has_reg_type(state, src->reg1) ||
+			!state->regs[src->reg1].ok) {
+		tsr->ok = false;
+		return;
+	}
+
+	tsr->type = state->regs[src->reg1].type;
+	tsr->kind = state->regs[src->reg1].kind;
+	tsr->ok = true;
+
+	pr_debug_dtp("mov [%x] reg%d -> reg%d",
+			insn_offset, src->reg1, dst->reg1);
+	pr_debug_type_name(&tsr->type, tsr->kind);
+}
+#endif /* HAVE_LIBDW_SUPPORT */
+
+int powerpc__annotate_init(struct arch *arch, char *cpuid __maybe_unused)
+{
+	if (!arch->initialized) {
+		arch->initialized = true;
+		arch->associate_instruction_ops = powerpc__associate_instruction_ops;
+		arch->objdump.comment_char      = '#';
+		annotate_opts.show_asm_raw = true;
+#ifdef HAVE_LIBDW_SUPPORT
+		arch->update_insn_state = update_insn_state_powerpc;
+#endif
+	}
+
+	return 0;
+}
diff --git a/tools/perf/util/annotate-arch/annotate-riscv64.c b/tools/perf/util/annotate-arch/annotate-riscv64.c
new file mode 100644
index 000000000000..15526824037a
--- /dev/null
+++ b/tools/perf/util/annotate-arch/annotate-riscv64.c
@@ -0,0 +1,36 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <string.h>
+#include <linux/compiler.h>
+#include "../disasm.h"
+
+static
+const struct ins_ops *riscv64__associate_ins_ops(struct arch *arch, const char *name)
+{
+	const struct ins_ops *ops = NULL;
+
+	if (!strncmp(name, "jal", 3) ||
+	    !strncmp(name, "jr", 2) ||
+	    !strncmp(name, "call", 4))
+		ops = &call_ops;
+	else if (!strncmp(name, "ret", 3))
+		ops = &ret_ops;
+	else if (name[0] == 'j' || name[0] == 'b')
+		ops = &jump_ops;
+	else
+		return NULL;
+
+	arch__associate_ins_ops(arch, name, ops);
+
+	return ops;
+}
+
+int riscv64__annotate_init(struct arch *arch, char *cpuid __maybe_unused)
+{
+	if (!arch->initialized) {
+		arch->associate_instruction_ops = riscv64__associate_ins_ops;
+		arch->initialized = true;
+		arch->objdump.comment_char = '#';
+	}
+
+	return 0;
+}
diff --git a/tools/perf/util/annotate-arch/annotate-s390.c b/tools/perf/util/annotate-arch/annotate-s390.c
new file mode 100644
index 000000000000..81db102b3e15
--- /dev/null
+++ b/tools/perf/util/annotate-arch/annotate-s390.c
@@ -0,0 +1,185 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <string.h>
+#include <linux/compiler.h>
+#include "../debug.h"
+#include "../disasm.h"
+#include "../map.h"
+#include "../maps.h"
+#include "../symbol.h"
+#include "../annotate.h"
+#include "../annotate-data.h"
+
+static int s390_call__parse(const struct arch *arch, struct ins_operands *ops,
+			    struct map_symbol *ms,
+			    struct disasm_line *dl __maybe_unused)
+{
+	char *endptr, *tok, *name;
+	struct map *map = ms->map;
+	struct addr_map_symbol target;
+
+	tok = strchr(ops->raw, ',');
+	if (!tok)
+		return -1;
+
+	ops->target.addr = strtoull(tok + 1, &endptr, 16);
+
+	name = strchr(endptr, '<');
+	if (name == NULL)
+		return -1;
+
+	name++;
+
+	if (arch->objdump.skip_functions_char &&
+	    strchr(name, arch->objdump.skip_functions_char))
+		return -1;
+
+	tok = strchr(name, '>');
+	if (tok == NULL)
+		return -1;
+
+	*tok = '\0';
+	ops->target.name = strdup(name);
+	*tok = '>';
+
+	if (ops->target.name == NULL)
+		return -1;
+
+	target = (struct addr_map_symbol) {
+		.ms = { .map = map__get(map), },
+		.addr = map__objdump_2mem(map, ops->target.addr),
+	};
+
+	if (maps__find_ams(ms->maps, &target) == 0 &&
+	    map__rip_2objdump(target.ms.map, map__map_ip(target.ms.map, target.addr)) == ops->target.addr)
+		ops->target.sym = target.ms.sym;
+
+	addr_map_symbol__exit(&target);
+	return 0;
+}
+
+const struct ins_ops s390_call_ops = {
+	.parse	   = s390_call__parse,
+	.scnprintf = call__scnprintf,
+};
+
+static int s390_mov__parse(const struct arch *arch __maybe_unused,
+			   struct ins_operands *ops,
+			   struct map_symbol *ms __maybe_unused,
+			   struct disasm_line *dl __maybe_unused)
+{
+	char *s = strchr(ops->raw, ','), *target, *endptr;
+
+	if (s == NULL)
+		return -1;
+
+	*s = '\0';
+	ops->source.raw = strdup(ops->raw);
+	*s = ',';
+
+	if (ops->source.raw == NULL)
+		return -1;
+
+	target = ++s;
+	ops->target.raw = strdup(target);
+	if (ops->target.raw == NULL)
+		goto out_free_source;
+
+	ops->target.addr = strtoull(target, &endptr, 16);
+	if (endptr == target)
+		goto out_free_target;
+
+	s = strchr(endptr, '<');
+	if (s == NULL)
+		goto out_free_target;
+	endptr = strchr(s + 1, '>');
+	if (endptr == NULL)
+		goto out_free_target;
+
+	*endptr = '\0';
+	ops->target.name = strdup(s + 1);
+	*endptr = '>';
+	if (ops->target.name == NULL)
+		goto out_free_target;
+
+	return 0;
+
+out_free_target:
+	zfree(&ops->target.raw);
+out_free_source:
+	zfree(&ops->source.raw);
+	return -1;
+}
+
+
+static const struct ins_ops s390_mov_ops = {
+	.parse	   = s390_mov__parse,
+	.scnprintf = mov__scnprintf,
+};
+
+static const struct ins_ops *s390__associate_ins_ops(struct arch *arch, const char *name)
+{
+	const struct ins_ops *ops = NULL;
+
+	/* catch all kind of jumps */
+	if (strchr(name, 'j') ||
+	    !strncmp(name, "bct", 3) ||
+	    !strncmp(name, "br", 2))
+		ops = &jump_ops;
+	/* override call/returns */
+	if (!strcmp(name, "bras") ||
+	    !strcmp(name, "brasl") ||
+	    !strcmp(name, "basr"))
+		ops = &s390_call_ops;
+	if (!strcmp(name, "br"))
+		ops = &ret_ops;
+	/* override load/store relative to PC */
+	if (!strcmp(name, "lrl") ||
+	    !strcmp(name, "lgrl") ||
+	    !strcmp(name, "lgfrl") ||
+	    !strcmp(name, "llgfrl") ||
+	    !strcmp(name, "strl") ||
+	    !strcmp(name, "stgrl"))
+		ops = &s390_mov_ops;
+
+	if (ops)
+		arch__associate_ins_ops(arch, name, ops);
+	return ops;
+}
+
+static int s390__cpuid_parse(struct arch *arch, char *cpuid)
+{
+	unsigned int family;
+	char model[16], model_c[16], cpumf_v[16], cpumf_a[16];
+	int ret;
+
+	/*
+	 * cpuid string format:
+	 * "IBM,family,model-capacity,model[,cpum_cf-version,cpum_cf-authorization]"
+	 */
+	ret = sscanf(cpuid, "%*[^,],%u,%[^,],%[^,],%[^,],%s", &family, model_c,
+		     model, cpumf_v, cpumf_a);
+	if (ret >= 2) {
+		arch->family = family;
+		arch->model = 0;
+		return 0;
+	}
+
+	return -1;
+}
+
+int s390__annotate_init(struct arch *arch, char *cpuid __maybe_unused)
+{
+	int err = 0;
+
+	if (!arch->initialized) {
+		arch->initialized = true;
+		arch->associate_instruction_ops = s390__associate_ins_ops;
+		if (cpuid) {
+			if (s390__cpuid_parse(arch, cpuid))
+				err = SYMBOL_ANNOTATE_ERRNO__ARCH_INIT_CPUID_PARSING;
+		}
+		arch->objdump.comment_char = '#';
+	}
+
+	return err;
+}
diff --git a/tools/perf/util/annotate-arch/annotate-sparc.c b/tools/perf/util/annotate-arch/annotate-sparc.c
new file mode 100644
index 000000000000..66a0174376dd
--- /dev/null
+++ b/tools/perf/util/annotate-arch/annotate-sparc.c
@@ -0,0 +1,172 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <string.h>
+#include <linux/compiler.h>
+#include "../../util/disasm.h"
+
+static int is_branch_cond(const char *cond)
+{
+	if (cond[0] == '\0')
+		return 1;
+
+	if (cond[0] == 'a' && cond[1] == '\0')
+		return 1;
+
+	if (cond[0] == 'c' &&
+	    (cond[1] == 'c' || cond[1] == 's') &&
+	    cond[2] == '\0')
+		return 1;
+
+	if (cond[0] == 'e' &&
+	    (cond[1] == '\0' ||
+	     (cond[1] == 'q' && cond[2] == '\0')))
+		return 1;
+
+	if (cond[0] == 'g' &&
+	    (cond[1] == '\0' ||
+	     (cond[1] == 't' && cond[2] == '\0') ||
+	     (cond[1] == 'e' && cond[2] == '\0') ||
+	     (cond[1] == 'e' && cond[2] == 'u' && cond[3] == '\0')))
+		return 1;
+
+	if (cond[0] == 'l' &&
+	    (cond[1] == '\0' ||
+	     (cond[1] == 't' && cond[2] == '\0') ||
+	     (cond[1] == 'u' && cond[2] == '\0') ||
+	     (cond[1] == 'e' && cond[2] == '\0') ||
+	     (cond[1] == 'e' && cond[2] == 'u' && cond[3] == '\0')))
+		return 1;
+
+	if (cond[0] == 'n' &&
+	    (cond[1] == '\0' ||
+	     (cond[1] == 'e' && cond[2] == '\0') ||
+	     (cond[1] == 'z' && cond[2] == '\0') ||
+	     (cond[1] == 'e' && cond[2] == 'g' && cond[3] == '\0')))
+		return 1;
+
+	if (cond[0] == 'b' &&
+	    cond[1] == 'p' &&
+	    cond[2] == 'o' &&
+	    cond[3] == 's' &&
+	    cond[4] == '\0')
+		return 1;
+
+	if (cond[0] == 'v' &&
+	    (cond[1] == 'c' || cond[1] == 's') &&
+	    cond[2] == '\0')
+		return 1;
+
+	if (cond[0] == 'b' &&
+	    cond[1] == 'z' &&
+	    cond[2] == '\0')
+		return 1;
+
+	return 0;
+}
+
+static int is_branch_reg_cond(const char *cond)
+{
+	if ((cond[0] == 'n' || cond[0] == 'l') &&
+	    cond[1] == 'z' &&
+	    cond[2] == '\0')
+		return 1;
+
+	if (cond[0] == 'z' &&
+	    cond[1] == '\0')
+		return 1;
+
+	if ((cond[0] == 'g' || cond[0] == 'l') &&
+	    cond[1] == 'e' &&
+	    cond[2] == 'z' &&
+	    cond[3] == '\0')
+		return 1;
+
+	if (cond[0] == 'g' &&
+	    cond[1] == 'z' &&
+	    cond[2] == '\0')
+		return 1;
+
+	return 0;
+}
+
+static int is_branch_float_cond(const char *cond)
+{
+	if (cond[0] == '\0')
+		return 1;
+
+	if ((cond[0] == 'a' || cond[0] == 'e' ||
+	     cond[0] == 'z' || cond[0] == 'g' ||
+	     cond[0] == 'l' || cond[0] == 'n' ||
+	     cond[0] == 'o' || cond[0] == 'u') &&
+	    cond[1] == '\0')
+		return 1;
+
+	if (((cond[0] == 'g' && cond[1] == 'e') ||
+	     (cond[0] == 'l' && (cond[1] == 'e' ||
+				 cond[1] == 'g')) ||
+	     (cond[0] == 'n' && (cond[1] == 'e' ||
+				 cond[1] == 'z')) ||
+	     (cond[0] == 'u' && (cond[1] == 'e' ||
+				 cond[1] == 'g' ||
+				 cond[1] == 'l'))) &&
+	    cond[2] == '\0')
+		return 1;
+
+	if (cond[0] == 'u' &&
+	    (cond[1] == 'g' || cond[1] == 'l') &&
+	    cond[2] == 'e' &&
+	    cond[3] == '\0')
+		return 1;
+
+	return 0;
+}
+
+static const struct ins_ops *sparc__associate_instruction_ops(struct arch *arch, const char *name)
+{
+	const struct ins_ops *ops = NULL;
+
+	if (!strcmp(name, "call") ||
+	    !strcmp(name, "jmp") ||
+	    !strcmp(name, "jmpl")) {
+		ops = &call_ops;
+	} else if (!strcmp(name, "ret") ||
+		   !strcmp(name, "retl") ||
+		   !strcmp(name, "return")) {
+		ops = &ret_ops;
+	} else if (!strcmp(name, "mov")) {
+		ops = &mov_ops;
+	} else {
+		if (name[0] == 'c' &&
+		    (name[1] == 'w' || name[1] == 'x'))
+			name += 2;
+
+		if (name[0] == 'b') {
+			const char *cond = name + 1;
+
+			if (cond[0] == 'r') {
+				if (is_branch_reg_cond(cond + 1))
+					ops = &jump_ops;
+			} else if (is_branch_cond(cond)) {
+				ops = &jump_ops;
+			}
+		} else if (name[0] == 'f' && name[1] == 'b') {
+			if (is_branch_float_cond(name + 2))
+				ops = &jump_ops;
+		}
+	}
+
+	if (ops)
+		arch__associate_ins_ops(arch, name, ops);
+
+	return ops;
+}
+
+int sparc__annotate_init(struct arch *arch, char *cpuid __maybe_unused)
+{
+	if (!arch->initialized) {
+		arch->initialized = true;
+		arch->associate_instruction_ops = sparc__associate_instruction_ops;
+		arch->objdump.comment_char = '#';
+	}
+
+	return 0;
+}
diff --git a/tools/perf/util/annotate-arch/annotate-x86.c b/tools/perf/util/annotate-arch/annotate-x86.c
new file mode 100644
index 000000000000..0c7957fe60da
--- /dev/null
+++ b/tools/perf/util/annotate-arch/annotate-x86.c
@@ -0,0 +1,820 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <string.h>
+#include <linux/compiler.h>
+#include <assert.h>
+#include <inttypes.h>
+#include "../annotate-data.h"
+#include "../debug.h"
+#include "../disasm.h"
+#include "../dso.h"
+#include "../map.h"
+#include "../string2.h" // strstarts
+#include "../symbol.h"
+
+/*
+ * x86 instruction nmemonic table to parse disasm lines for annotate.
+ * This table is searched twice - one for exact match and another for
+ * match without a size suffix (b, w, l, q) in case of AT&T syntax.
+ *
+ * So this table should not have entries with the suffix unless it's
+ * a complete different instruction than ones without the suffix.
+ */
+static const struct ins x86__instructions[] = {
+	{ .name = "adc",	.ops = &mov_ops,  },
+	{ .name = "add",	.ops = &mov_ops,  },
+	{ .name = "addsd",	.ops = &mov_ops,  },
+	{ .name = "and",	.ops = &mov_ops,  },
+	{ .name = "andpd",	.ops = &mov_ops,  },
+	{ .name = "andps",	.ops = &mov_ops,  },
+	{ .name = "bsr",	.ops = &mov_ops,  },
+	{ .name = "bt",		.ops = &mov_ops,  },
+	{ .name = "btr",	.ops = &mov_ops,  },
+	{ .name = "bts",	.ops = &mov_ops,  },
+	{ .name = "call",	.ops = &call_ops, },
+	{ .name = "cmovae",	.ops = &mov_ops,  },
+	{ .name = "cmovbe",	.ops = &mov_ops,  },
+	{ .name = "cmove",	.ops = &mov_ops,  },
+	{ .name = "cmp",	.ops = &mov_ops,  },
+	{ .name = "cmpxch",	.ops = &mov_ops,  },
+	{ .name = "cmpxchg",	.ops = &mov_ops,  },
+	{ .name = "cs",		.ops = &mov_ops,  },
+	{ .name = "dec",	.ops = &dec_ops,  },
+	{ .name = "divsd",	.ops = &mov_ops,  },
+	{ .name = "divss",	.ops = &mov_ops,  },
+	{ .name = "gs",		.ops = &mov_ops,  },
+	{ .name = "imul",	.ops = &mov_ops,  },
+	{ .name = "inc",	.ops = &dec_ops,  },
+	{ .name = "ja",		.ops = &jump_ops, },
+	{ .name = "jae",	.ops = &jump_ops, },
+	{ .name = "jb",		.ops = &jump_ops, },
+	{ .name = "jbe",	.ops = &jump_ops, },
+	{ .name = "jc",		.ops = &jump_ops, },
+	{ .name = "jcxz",	.ops = &jump_ops, },
+	{ .name = "je",		.ops = &jump_ops, },
+	{ .name = "jecxz",	.ops = &jump_ops, },
+	{ .name = "jg",		.ops = &jump_ops, },
+	{ .name = "jge",	.ops = &jump_ops, },
+	{ .name = "jl",		.ops = &jump_ops, },
+	{ .name = "jle",	.ops = &jump_ops, },
+	{ .name = "jmp",	.ops = &jump_ops, },
+	{ .name = "jna",	.ops = &jump_ops, },
+	{ .name = "jnae",	.ops = &jump_ops, },
+	{ .name = "jnb",	.ops = &jump_ops, },
+	{ .name = "jnbe",	.ops = &jump_ops, },
+	{ .name = "jnc",	.ops = &jump_ops, },
+	{ .name = "jne",	.ops = &jump_ops, },
+	{ .name = "jng",	.ops = &jump_ops, },
+	{ .name = "jnge",	.ops = &jump_ops, },
+	{ .name = "jnl",	.ops = &jump_ops, },
+	{ .name = "jnle",	.ops = &jump_ops, },
+	{ .name = "jno",	.ops = &jump_ops, },
+	{ .name = "jnp",	.ops = &jump_ops, },
+	{ .name = "jns",	.ops = &jump_ops, },
+	{ .name = "jnz",	.ops = &jump_ops, },
+	{ .name = "jo",		.ops = &jump_ops, },
+	{ .name = "jp",		.ops = &jump_ops, },
+	{ .name = "jpe",	.ops = &jump_ops, },
+	{ .name = "jpo",	.ops = &jump_ops, },
+	{ .name = "jrcxz",	.ops = &jump_ops, },
+	{ .name = "js",		.ops = &jump_ops, },
+	{ .name = "jz",		.ops = &jump_ops, },
+	{ .name = "lea",	.ops = &mov_ops,  },
+	{ .name = "lock",	.ops = &lock_ops, },
+	{ .name = "mov",	.ops = &mov_ops,  },
+	{ .name = "movapd",	.ops = &mov_ops,  },
+	{ .name = "movaps",	.ops = &mov_ops,  },
+	{ .name = "movdqa",	.ops = &mov_ops,  },
+	{ .name = "movdqu",	.ops = &mov_ops,  },
+	{ .name = "movsb",	.ops = &mov_ops,  },
+	{ .name = "movsd",	.ops = &mov_ops,  },
+	{ .name = "movsl",	.ops = &mov_ops,  },
+	{ .name = "movss",	.ops = &mov_ops,  },
+	{ .name = "movsw",	.ops = &mov_ops,  },
+	{ .name = "movupd",	.ops = &mov_ops,  },
+	{ .name = "movups",	.ops = &mov_ops,  },
+	{ .name = "movzb",	.ops = &mov_ops,  },
+	{ .name = "movzl",	.ops = &mov_ops,  },
+	{ .name = "movzw",	.ops = &mov_ops,  },
+	{ .name = "mulsd",	.ops = &mov_ops,  },
+	{ .name = "mulss",	.ops = &mov_ops,  },
+	{ .name = "nop",	.ops = &nop_ops,  },
+	{ .name = "or",		.ops = &mov_ops,  },
+	{ .name = "orps",	.ops = &mov_ops,  },
+	{ .name = "paddq",	.ops = &mov_ops,  },
+	{ .name = "pand",	.ops = &mov_ops,  },
+	{ .name = "pcmpeqb",	.ops = &mov_ops,  },
+	{ .name = "por",	.ops = &mov_ops,  },
+	{ .name = "rcl",	.ops = &mov_ops,  },
+	{ .name = "ret",	.ops = &ret_ops,  },
+	{ .name = "sbb",	.ops = &mov_ops,  },
+	{ .name = "sete",	.ops = &mov_ops,  },
+	{ .name = "sub",	.ops = &mov_ops,  },
+	{ .name = "subsd",	.ops = &mov_ops,  },
+	{ .name = "test",	.ops = &mov_ops,  },
+	{ .name = "tzcnt",	.ops = &mov_ops,  },
+	{ .name = "ucomisd",	.ops = &mov_ops,  },
+	{ .name = "ucomiss",	.ops = &mov_ops,  },
+	{ .name = "vaddsd",	.ops = &mov_ops,  },
+	{ .name = "vandpd",	.ops = &mov_ops,  },
+	{ .name = "vmovdqa",	.ops = &mov_ops,  },
+	{ .name = "vmovq",	.ops = &mov_ops,  },
+	{ .name = "vmovsd",	.ops = &mov_ops,  },
+	{ .name = "vmulsd",	.ops = &mov_ops,  },
+	{ .name = "vorpd",	.ops = &mov_ops,  },
+	{ .name = "vsubsd",	.ops = &mov_ops,  },
+	{ .name = "vucomisd",	.ops = &mov_ops,  },
+	{ .name = "xadd",	.ops = &mov_ops,  },
+	{ .name = "xbegin",	.ops = &jump_ops, },
+	{ .name = "xchg",	.ops = &mov_ops,  },
+	{ .name = "xor",	.ops = &mov_ops, },
+	{ .name = "xorpd",	.ops = &mov_ops, },
+	{ .name = "xorps",	.ops = &mov_ops, },
+};
+
+static bool amd__ins_is_fused(const struct arch *arch, const char *ins1,
+			      const char *ins2)
+{
+	if (strstr(ins2, "jmp"))
+		return false;
+
+	/* Family >= 15h supports cmp/test + branch fusion */
+	if (arch->family >= 0x15 && (strstarts(ins1, "test") ||
+	    (strstarts(ins1, "cmp") && !strstr(ins1, "xchg")))) {
+		return true;
+	}
+
+	/* Family >= 19h supports some ALU + branch fusion */
+	if (arch->family >= 0x19 && (strstarts(ins1, "add") ||
+	    strstarts(ins1, "sub") || strstarts(ins1, "and") ||
+	    strstarts(ins1, "inc") || strstarts(ins1, "dec") ||
+	    strstarts(ins1, "or") || strstarts(ins1, "xor"))) {
+		return true;
+	}
+
+	return false;
+}
+
+static bool intel__ins_is_fused(const struct arch *arch, const char *ins1,
+				const char *ins2)
+{
+	if (arch->family != 6 || arch->model < 0x1e || strstr(ins2, "jmp"))
+		return false;
+
+	if (arch->model == 0x1e) {
+		/* Nehalem */
+		if ((strstr(ins1, "cmp") && !strstr(ins1, "xchg")) ||
+		     strstr(ins1, "test")) {
+			return true;
+		}
+	} else {
+		/* Newer platform */
+		if ((strstr(ins1, "cmp") && !strstr(ins1, "xchg")) ||
+		     strstr(ins1, "test") ||
+		     strstr(ins1, "add") ||
+		     strstr(ins1, "sub") ||
+		     strstr(ins1, "and") ||
+		     strstr(ins1, "inc") ||
+		     strstr(ins1, "dec")) {
+			return true;
+		}
+	}
+
+	return false;
+}
+
+static int x86__cpuid_parse(struct arch *arch, char *cpuid)
+{
+	unsigned int family, model, stepping;
+	int ret;
+
+	/*
+	 * cpuid = "GenuineIntel,family,model,stepping"
+	 */
+	ret = sscanf(cpuid, "%*[^,],%u,%u,%u", &family, &model, &stepping);
+	if (ret == 3) {
+		arch->family = family;
+		arch->model = model;
+		arch->ins_is_fused = strstarts(cpuid, "AuthenticAMD") ?
+					amd__ins_is_fused :
+					intel__ins_is_fused;
+		return 0;
+	}
+
+	return -1;
+}
+
+#ifdef HAVE_LIBDW_SUPPORT
+static void update_insn_state_x86(struct type_state *state,
+				  struct data_loc_info *dloc, Dwarf_Die *cu_die,
+				  struct disasm_line *dl)
+{
+	struct annotated_insn_loc loc;
+	struct annotated_op_loc *src = &loc.ops[INSN_OP_SOURCE];
+	struct annotated_op_loc *dst = &loc.ops[INSN_OP_TARGET];
+	struct type_state_reg *tsr;
+	Dwarf_Die type_die;
+	u32 insn_offset = dl->al.offset;
+	int fbreg = dloc->fbreg;
+	int fboff = 0;
+
+	if (annotate_get_insn_location(dloc->arch, dl, &loc) < 0)
+		return;
+
+	if (ins__is_call(&dl->ins)) {
+		struct symbol *func = dl->ops.target.sym;
+
+		if (func == NULL)
+			return;
+
+		/* __fentry__ will preserve all registers */
+		if (!strcmp(func->name, "__fentry__"))
+			return;
+
+		pr_debug_dtp("call [%x] %s\n", insn_offset, func->name);
+
+		/* Otherwise invalidate caller-saved registers after call */
+		for (unsigned i = 0; i < ARRAY_SIZE(state->regs); i++) {
+			if (state->regs[i].caller_saved)
+				state->regs[i].ok = false;
+		}
+
+		/* Update register with the return type (if any) */
+		if (die_find_func_rettype(cu_die, func->name, &type_die)) {
+			tsr = &state->regs[state->ret_reg];
+			tsr->type = type_die;
+			tsr->kind = TSR_KIND_TYPE;
+			tsr->offset = 0;
+			tsr->ok = true;
+
+			pr_debug_dtp("call [%x] return -> reg%d",
+				     insn_offset, state->ret_reg);
+			pr_debug_type_name(&type_die, tsr->kind);
+		}
+		return;
+	}
+
+	if (!strncmp(dl->ins.name, "add", 3)) {
+		u64 imm_value = -1ULL;
+		int offset;
+		const char *var_name = NULL;
+		struct map_symbol *ms = dloc->ms;
+		u64 ip = ms->sym->start + dl->al.offset;
+
+		if (!has_reg_type(state, dst->reg1))
+			return;
+
+		tsr = &state->regs[dst->reg1];
+		tsr->copied_from = -1;
+
+		if (src->imm)
+			imm_value = src->offset;
+		else if (has_reg_type(state, src->reg1) &&
+			 state->regs[src->reg1].kind == TSR_KIND_CONST)
+			imm_value = state->regs[src->reg1].imm_value;
+		else if (src->reg1 == DWARF_REG_PC) {
+			u64 var_addr = annotate_calc_pcrel(dloc->ms, ip,
+							   src->offset, dl);
+
+			if (get_global_var_info(dloc, var_addr,
+						&var_name, &offset) &&
+			    !strcmp(var_name, "this_cpu_off") &&
+			    tsr->kind == TSR_KIND_CONST) {
+				tsr->kind = TSR_KIND_PERCPU_BASE;
+				tsr->offset = 0;
+				tsr->ok = true;
+				imm_value = tsr->imm_value;
+			}
+		}
+		else
+			return;
+
+		/* Ignore add to non-pointer or non-const types */
+		if (tsr->kind == TSR_KIND_POINTER ||
+		    (dwarf_tag(&tsr->type) == DW_TAG_pointer_type &&
+		     src->reg1 != DWARF_REG_PC && tsr->kind == TSR_KIND_TYPE && !dst->mem_ref)) {
+			tsr->offset += imm_value;
+			pr_debug_dtp("add [%x] offset %#"PRIx64" to reg%d",
+				     insn_offset, imm_value, dst->reg1);
+			pr_debug_type_name(&tsr->type, tsr->kind);
+		}
+
+		if (tsr->kind == TSR_KIND_CONST)
+			tsr->imm_value += imm_value;
+
+		if (tsr->kind != TSR_KIND_PERCPU_BASE)
+			return;
+
+		if (get_global_var_type(cu_die, dloc, ip, imm_value, &offset,
+					&type_die) && offset == 0) {
+			/*
+			 * This is not a pointer type, but it should be treated
+			 * as a pointer.
+			 */
+			tsr->type = type_die;
+			tsr->kind = TSR_KIND_PERCPU_POINTER;
+			tsr->offset = 0;
+			tsr->ok = true;
+
+			pr_debug_dtp("add [%x] percpu %#"PRIx64" -> reg%d",
+				     insn_offset, imm_value, dst->reg1);
+			pr_debug_type_name(&tsr->type, tsr->kind);
+		}
+		return;
+	}
+
+	if (!strncmp(dl->ins.name, "sub", 3)) {
+		u64 imm_value = -1ULL;
+
+		if (!has_reg_type(state, dst->reg1))
+			return;
+
+		tsr = &state->regs[dst->reg1];
+		tsr->copied_from = -1;
+
+		if (src->imm)
+			imm_value = src->offset;
+		else if (has_reg_type(state, src->reg1) &&
+			 state->regs[src->reg1].kind == TSR_KIND_CONST)
+			imm_value = state->regs[src->reg1].imm_value;
+
+		if (tsr->kind == TSR_KIND_POINTER ||
+		    (dwarf_tag(&tsr->type) == DW_TAG_pointer_type &&
+		     src->reg1 != DWARF_REG_PC && tsr->kind == TSR_KIND_TYPE && !dst->mem_ref)) {
+			tsr->offset -= imm_value;
+			pr_debug_dtp("sub [%x] offset %#"PRIx64" to reg%d",
+				     insn_offset, imm_value, dst->reg1);
+			pr_debug_type_name(&tsr->type, tsr->kind);
+		}
+
+		if (tsr->kind == TSR_KIND_CONST)
+			tsr->imm_value -= imm_value;
+
+		return;
+	}
+
+	if (!strncmp(dl->ins.name, "lea", 3)) {
+		int sreg = src->reg1;
+		struct type_state_reg src_tsr;
+
+		if (!has_reg_type(state, sreg) ||
+		    !has_reg_type(state, dst->reg1) ||
+		    !src->mem_ref)
+			return;
+
+		src_tsr = state->regs[sreg];
+		tsr = &state->regs[dst->reg1];
+
+		tsr->copied_from = -1;
+		tsr->ok = false;
+
+		/* Case 1: Based on stack pointer or frame pointer */
+		if (sreg == fbreg || sreg == state->stack_reg) {
+			struct type_state_stack *stack;
+			int offset = src->offset - fboff;
+
+			stack = find_stack_state(state, offset);
+			if (!stack)
+				return;
+
+			tsr->type = stack->type;
+			tsr->kind = TSR_KIND_POINTER;
+			tsr->offset = offset - stack->offset;
+			tsr->ok = true;
+
+			if (sreg == fbreg) {
+				pr_debug_dtp("lea [%x] address of -%#x(stack) -> reg%d",
+					     insn_offset, -src->offset, dst->reg1);
+			} else {
+				pr_debug_dtp("lea [%x] address of %#x(reg%d) -> reg%d",
+					     insn_offset, src->offset, sreg, dst->reg1);
+			}
+
+			pr_debug_type_name(&tsr->type, tsr->kind);
+		}
+		/* Case 2: Based on a register holding a typed pointer */
+		else if (src_tsr.ok && (src_tsr.kind == TSR_KIND_POINTER ||
+			 (dwarf_tag(&src_tsr.type) == DW_TAG_pointer_type &&
+			  src_tsr.kind == TSR_KIND_TYPE))) {
+
+			if (src_tsr.kind == TSR_KIND_TYPE &&
+			    __die_get_real_type(&state->regs[sreg].type, &type_die) == NULL)
+				return;
+
+			if (src_tsr.kind == TSR_KIND_POINTER)
+				type_die = state->regs[sreg].type;
+
+			/* Check if the target type has a member at the new offset */
+			if (die_get_member_type(&type_die,
+						src->offset + src_tsr.offset, &type_die) == NULL)
+				return;
+
+			tsr->type = src_tsr.type;
+			tsr->kind = src_tsr.kind;
+			tsr->offset = src->offset + src_tsr.offset;
+			tsr->ok = true;
+
+			pr_debug_dtp("lea [%x] address of %s%#x(reg%d) -> reg%d",
+						insn_offset, src->offset < 0 ? "-" : "",
+						abs(src->offset), sreg, dst->reg1);
+
+			pr_debug_type_name(&tsr->type, tsr->kind);
+		}
+		return;
+	}
+
+	/* Invalidate register states for other ops which may change pointers */
+	if (has_reg_type(state, dst->reg1) && !dst->mem_ref &&
+	    dwarf_tag(&state->regs[dst->reg1].type) == DW_TAG_pointer_type) {
+		if (!strncmp(dl->ins.name, "imul", 4) || !strncmp(dl->ins.name, "mul", 3) ||
+		    !strncmp(dl->ins.name, "idiv", 4) || !strncmp(dl->ins.name, "div", 3) ||
+		    !strncmp(dl->ins.name, "shl", 3)  || !strncmp(dl->ins.name, "shr", 3) ||
+		    !strncmp(dl->ins.name, "sar", 3)  || !strncmp(dl->ins.name, "and", 3) ||
+		    !strncmp(dl->ins.name, "or", 2)   || !strncmp(dl->ins.name, "neg", 3) ||
+		    !strncmp(dl->ins.name, "inc", 3)  || !strncmp(dl->ins.name, "dec", 3)) {
+			pr_debug_dtp("%s [%x] invalidate reg%d\n",
+						dl->ins.name, insn_offset, dst->reg1);
+			state->regs[dst->reg1].ok = false;
+			state->regs[dst->reg1].copied_from = -1;
+			return;
+		}
+
+		if (!strncmp(dl->ins.name, "xor", 3) && dst->reg1 == src->reg1) {
+			/* xor reg, reg clears the register */
+			pr_debug_dtp("xor [%x] clear reg%d\n",
+				     insn_offset, dst->reg1);
+
+			state->regs[dst->reg1].kind = TSR_KIND_CONST;
+			state->regs[dst->reg1].imm_value = 0;
+			state->regs[dst->reg1].ok = true;
+			state->regs[dst->reg1].copied_from = -1;
+			return;
+		}
+	}
+
+	if (strncmp(dl->ins.name, "mov", 3))
+		return;
+
+	if (dloc->fb_cfa) {
+		u64 ip = dloc->ms->sym->start + dl->al.offset;
+		u64 pc = map__rip_2objdump(dloc->ms->map, ip);
+
+		if (die_get_cfa(dloc->di->dbg, pc, &fbreg, &fboff) < 0)
+			fbreg = -1;
+	}
+
+	/* Case 1. register to register or segment:offset to register transfers */
+	if (!src->mem_ref && !dst->mem_ref) {
+		if (!has_reg_type(state, dst->reg1))
+			return;
+
+		tsr = &state->regs[dst->reg1];
+		tsr->copied_from = -1;
+
+		if (dso__kernel(map__dso(dloc->ms->map)) &&
+		    src->segment == INSN_SEG_X86_GS && src->imm) {
+			u64 ip = dloc->ms->sym->start + dl->al.offset;
+			u64 var_addr;
+			int offset;
+
+			/*
+			 * In kernel, %gs points to a per-cpu region for the
+			 * current CPU.  Access with a constant offset should
+			 * be treated as a global variable access.
+			 */
+			var_addr = src->offset;
+
+			if (var_addr == 40) {
+				tsr->kind = TSR_KIND_CANARY;
+				tsr->offset = 0;
+				tsr->ok = true;
+
+				pr_debug_dtp("mov [%x] stack canary -> reg%d\n",
+					     insn_offset, dst->reg1);
+				return;
+			}
+
+			if (!get_global_var_type(cu_die, dloc, ip, var_addr,
+						 &offset, &type_die) ||
+			    !die_get_member_type(&type_die, offset, &type_die)) {
+				tsr->ok = false;
+				return;
+			}
+
+			tsr->type = type_die;
+			tsr->kind = TSR_KIND_TYPE;
+			tsr->offset = 0;
+			tsr->ok = true;
+
+			pr_debug_dtp("mov [%x] this-cpu addr=%#"PRIx64" -> reg%d",
+				     insn_offset, var_addr, dst->reg1);
+			pr_debug_type_name(&tsr->type, tsr->kind);
+			return;
+		}
+
+		if (src->imm) {
+			tsr->kind = TSR_KIND_CONST;
+			tsr->imm_value = src->offset;
+			tsr->offset = 0;
+			tsr->ok = true;
+
+			pr_debug_dtp("mov [%x] imm=%#x -> reg%d\n",
+				     insn_offset, tsr->imm_value, dst->reg1);
+			return;
+		}
+
+		if (!has_reg_type(state, src->reg1) ||
+		    !state->regs[src->reg1].ok) {
+			tsr->ok = false;
+			return;
+		}
+
+		tsr->type = state->regs[src->reg1].type;
+		tsr->kind = state->regs[src->reg1].kind;
+		tsr->imm_value = state->regs[src->reg1].imm_value;
+		tsr->offset = state->regs[src->reg1].offset;
+		tsr->ok = true;
+
+		/* To copy back the variable type later (hopefully) */
+		if (tsr->kind == TSR_KIND_TYPE || tsr->kind == TSR_KIND_POINTER)
+			tsr->copied_from = src->reg1;
+
+		pr_debug_dtp("mov [%x] reg%d -> reg%d",
+			     insn_offset, src->reg1, dst->reg1);
+		pr_debug_type_name(&tsr->type, tsr->kind);
+	}
+	/* Case 2. memory to register transers */
+	if (src->mem_ref && !dst->mem_ref) {
+		int sreg = src->reg1;
+
+		if (!has_reg_type(state, dst->reg1))
+			return;
+
+		tsr = &state->regs[dst->reg1];
+		tsr->copied_from = -1;
+
+retry:
+		/* Check stack variables with offset */
+		if (sreg == fbreg || sreg == state->stack_reg) {
+			struct type_state_stack *stack;
+			int offset = src->offset - fboff;
+
+			stack = find_stack_state(state, offset);
+			if (stack == NULL) {
+				tsr->ok = false;
+				return;
+			} else if (!stack->compound) {
+				tsr->type = stack->type;
+				tsr->kind = stack->kind;
+				tsr->offset = stack->ptr_offset;
+				tsr->ok = true;
+			} else if (die_get_member_type(&stack->type,
+						       offset - stack->offset,
+						       &type_die)) {
+				tsr->type = type_die;
+				tsr->kind = TSR_KIND_TYPE;
+				tsr->offset = 0;
+				tsr->ok = true;
+			} else {
+				tsr->ok = false;
+				return;
+			}
+
+			if (sreg == fbreg) {
+				pr_debug_dtp("mov [%x] -%#x(stack) -> reg%d",
+					     insn_offset, -offset, dst->reg1);
+			} else {
+				pr_debug_dtp("mov [%x] %#x(reg%d) -> reg%d",
+					     insn_offset, offset, sreg, dst->reg1);
+			}
+			pr_debug_type_name(&tsr->type, tsr->kind);
+		}
+		/* And then dereference the pointer if it has one */
+		else if (has_reg_type(state, sreg) && state->regs[sreg].ok &&
+			 state->regs[sreg].kind == TSR_KIND_TYPE &&
+			 die_deref_ptr_type(&state->regs[sreg].type,
+					    src->offset + state->regs[sreg].offset, &type_die)) {
+			tsr->type = type_die;
+			tsr->kind = TSR_KIND_TYPE;
+			tsr->offset = 0;
+			tsr->ok = true;
+
+			pr_debug_dtp("mov [%x] %#x(reg%d) -> reg%d",
+				     insn_offset, src->offset, sreg, dst->reg1);
+			pr_debug_type_name(&tsr->type, tsr->kind);
+		}
+		/* Handle dereference of TSR_KIND_POINTER registers */
+		else if (has_reg_type(state, sreg) && state->regs[sreg].ok &&
+			 state->regs[sreg].kind == TSR_KIND_POINTER &&
+			 die_get_member_type(&state->regs[sreg].type,
+					     src->offset + state->regs[sreg].offset, &type_die)) {
+			tsr->type = state->regs[sreg].type;
+			tsr->kind = TSR_KIND_TYPE;
+			tsr->offset = src->offset + state->regs[sreg].offset;
+			tsr->ok = true;
+
+			pr_debug_dtp("mov [%x] addr %#x(reg%d) -> reg%d",
+				     insn_offset, src->offset, sreg, dst->reg1);
+			pr_debug_type_name(&tsr->type, tsr->kind);
+		}
+		/* Or check if it's a global variable */
+		else if (sreg == DWARF_REG_PC) {
+			struct map_symbol *ms = dloc->ms;
+			u64 ip = ms->sym->start + dl->al.offset;
+			u64 addr;
+			int offset;
+
+			addr = annotate_calc_pcrel(ms, ip, src->offset, dl);
+
+			if (!get_global_var_type(cu_die, dloc, ip, addr, &offset,
+						 &type_die) ||
+			    !die_get_member_type(&type_die, offset, &type_die)) {
+				tsr->ok = false;
+				return;
+			}
+
+			tsr->type = type_die;
+			tsr->kind = TSR_KIND_TYPE;
+			tsr->offset = 0;
+			tsr->ok = true;
+
+			pr_debug_dtp("mov [%x] global addr=%"PRIx64" -> reg%d",
+				     insn_offset, addr, dst->reg1);
+			pr_debug_type_name(&type_die, tsr->kind);
+		}
+		/* And check percpu access with base register */
+		else if (has_reg_type(state, sreg) &&
+			 state->regs[sreg].kind == TSR_KIND_PERCPU_BASE) {
+			u64 ip = dloc->ms->sym->start + dl->al.offset;
+			u64 var_addr = src->offset;
+			int offset;
+
+			if (src->multi_regs) {
+				int reg2 = (sreg == src->reg1) ? src->reg2 : src->reg1;
+
+				if (has_reg_type(state, reg2) && state->regs[reg2].ok &&
+				    state->regs[reg2].kind == TSR_KIND_CONST)
+					var_addr += state->regs[reg2].imm_value;
+			}
+
+			/*
+			 * In kernel, %gs points to a per-cpu region for the
+			 * current CPU.  Access with a constant offset should
+			 * be treated as a global variable access.
+			 */
+			if (get_global_var_type(cu_die, dloc, ip, var_addr,
+						&offset, &type_die) &&
+			    die_get_member_type(&type_die, offset, &type_die)) {
+				tsr->type = type_die;
+				tsr->kind = TSR_KIND_TYPE;
+				tsr->offset = 0;
+				tsr->ok = true;
+
+				if (src->multi_regs) {
+					pr_debug_dtp("mov [%x] percpu %#x(reg%d,reg%d) -> reg%d",
+						     insn_offset, src->offset, src->reg1,
+						     src->reg2, dst->reg1);
+				} else {
+					pr_debug_dtp("mov [%x] percpu %#x(reg%d) -> reg%d",
+						     insn_offset, src->offset, sreg, dst->reg1);
+				}
+				pr_debug_type_name(&tsr->type, tsr->kind);
+			} else {
+				tsr->ok = false;
+			}
+		}
+		/* And then dereference the calculated pointer if it has one */
+		else if (has_reg_type(state, sreg) && state->regs[sreg].ok &&
+			 state->regs[sreg].kind == TSR_KIND_PERCPU_POINTER &&
+			 die_get_member_type(&state->regs[sreg].type,
+					     src->offset, &type_die)) {
+			tsr->type = type_die;
+			tsr->kind = TSR_KIND_TYPE;
+			tsr->offset = 0;
+			tsr->ok = true;
+
+			pr_debug_dtp("mov [%x] pointer %#x(reg%d) -> reg%d",
+				     insn_offset, src->offset, sreg, dst->reg1);
+			pr_debug_type_name(&tsr->type, tsr->kind);
+		}
+		/* Or try another register if any */
+		else if (src->multi_regs && sreg == src->reg1 &&
+			 src->reg1 != src->reg2) {
+			sreg = src->reg2;
+			goto retry;
+		}
+		else {
+			int offset;
+			const char *var_name = NULL;
+
+			/* it might be per-cpu variable (in kernel) access */
+			if (src->offset < 0) {
+				if (get_global_var_info(dloc, (s64)src->offset,
+							&var_name, &offset) &&
+				    !strcmp(var_name, "__per_cpu_offset")) {
+					tsr->kind = TSR_KIND_PERCPU_BASE;
+					tsr->offset = 0;
+					tsr->ok = true;
+
+					pr_debug_dtp("mov [%x] percpu base reg%d\n",
+						     insn_offset, dst->reg1);
+					return;
+				}
+			}
+
+			tsr->ok = false;
+		}
+	}
+	/* Case 3. register to memory transfers */
+	if (!src->mem_ref && dst->mem_ref) {
+		if (!has_reg_type(state, src->reg1) ||
+		    !state->regs[src->reg1].ok)
+			return;
+
+		/* Check stack variables with offset */
+		if (dst->reg1 == fbreg || dst->reg1 == state->stack_reg) {
+			struct type_state_stack *stack;
+			int offset = dst->offset - fboff;
+
+			tsr = &state->regs[src->reg1];
+
+			stack = find_stack_state(state, offset);
+			if (stack) {
+				/*
+				 * The source register is likely to hold a type
+				 * of member if it's a compound type.  Do not
+				 * update the stack variable type since we can
+				 * get the member type later by using the
+				 * die_get_member_type().
+				 */
+				if (!stack->compound)
+					set_stack_state(stack, offset, tsr->kind,
+							&tsr->type, tsr->offset);
+			} else {
+				findnew_stack_state(state, offset, tsr->kind,
+						    &tsr->type, tsr->offset);
+			}
+
+			if (dst->reg1 == fbreg) {
+				pr_debug_dtp("mov [%x] reg%d -> -%#x(stack)",
+					     insn_offset, src->reg1, -offset);
+			} else {
+				pr_debug_dtp("mov [%x] reg%d -> %#x(reg%d)",
+					     insn_offset, src->reg1, offset, dst->reg1);
+			}
+			if (tsr->offset != 0) {
+				pr_debug_dtp(" reg%d offset %#x ->",
+					src->reg1, tsr->offset);
+			}
+
+			pr_debug_type_name(&tsr->type, tsr->kind);
+		}
+		/*
+		 * Ignore other transfers since it'd set a value in a struct
+		 * and won't change the type.
+		 */
+	}
+	/* Case 4. memory to memory transfers (not handled for now) */
+}
+#endif
+
+int x86__annotate_init(struct arch *arch, char *cpuid)
+{
+	int err = 0;
+
+	if (arch->initialized)
+		return 0;
+
+	if (cpuid) {
+		if (x86__cpuid_parse(arch, cpuid))
+			err = SYMBOL_ANNOTATE_ERRNO__ARCH_INIT_CPUID_PARSING;
+	}
+
+	arch->instructions = x86__instructions;
+	arch->nr_instructions = ARRAY_SIZE(x86__instructions);
+#ifndef NDEBUG
+	{
+		static bool sorted_check;
+
+		if (!sorted_check) {
+			for (size_t i = 0; i < arch->nr_instructions - 1; i++) {
+				assert(strcmp(arch->instructions[i].name,
+					      arch->instructions[i + 1].name) <= 0);
+			}
+			sorted_check = true;
+		}
+	}
+#endif
+	arch->sorted_instructions = true;
+	arch->objdump.comment_char = '#';
+	arch->objdump.register_char = '%';
+	arch->objdump.memory_ref_char = '(';
+	arch->objdump.imm_char = '$';
+	arch->insn_suffix = "bwlq";
+	arch->e_machine = EM_X86_64;
+	arch->e_flags = 0;
+	arch->initialized = true;
+#ifdef HAVE_LIBDW_SUPPORT
+	arch->update_insn_state = update_insn_state_x86;
+#endif
+	return err;
+}
diff --git a/tools/perf/util/disasm.c b/tools/perf/util/disasm.c
index b7523256c4ad..845c2d0f39b1 100644
--- a/tools/perf/util/disasm.c
+++ b/tools/perf/util/disasm.c
@@ -33,20 +33,15 @@
 static regex_t	 file_lineno;
 
 /* These can be referred from the arch-dependent code */
-static const struct ins_ops call_ops;
-static const struct ins_ops dec_ops;
-static const struct ins_ops jump_ops;
-static const struct ins_ops mov_ops;
-static const struct ins_ops nop_ops;
-static const struct ins_ops lock_ops;
-static const struct ins_ops ret_ops;
-static const struct ins_ops load_store_ops;
-static const struct ins_ops arithmetic_ops;
-
-static int jump__scnprintf(const struct ins *ins, char *bf, size_t size,
-			   struct ins_operands *ops, int max_ins_name);
-static int call__scnprintf(const struct ins *ins, char *bf, size_t size,
-			   struct ins_operands *ops, int max_ins_name);
+const struct ins_ops call_ops;
+const struct ins_ops dec_ops;
+const struct ins_ops jump_ops;
+const struct ins_ops mov_ops;
+const struct ins_ops nop_ops;
+const struct ins_ops lock_ops;
+const struct ins_ops ret_ops;
+const struct ins_ops load_store_ops;
+const struct ins_ops arithmetic_ops;
 
 static void ins__sort(struct arch *arch);
 static int disasm_line__parse(char *line, const char **namep, char **rawp);
@@ -86,7 +81,7 @@ grow_from_non_allocated_table:
 	goto out_update_instructions;
 }
 
-static int arch__associate_ins_ops(struct arch *arch, const char *name, const struct ins_ops *ops)
+int arch__associate_ins_ops(struct arch *arch, const char *name, const struct ins_ops *ops)
 {
 	struct ins *ins;
 
@@ -106,90 +101,66 @@ static int arch__associate_ins_ops(struct arch *arch, const char *name, const st
 	return 0;
 }
 
-#include "arch/arc/annotate/instructions.c"
-#include "arch/arm/annotate/instructions.c"
-#include "arch/arm64/annotate/instructions.c"
-#include "arch/csky/annotate/instructions.c"
-#include "arch/loongarch/annotate/instructions.c"
-#include "arch/mips/annotate/instructions.c"
-#include "arch/x86/annotate/instructions.c"
-#include "arch/powerpc/annotate/instructions.c"
-#include "arch/riscv64/annotate/instructions.c"
-#include "arch/s390/annotate/instructions.c"
-#include "arch/sparc/annotate/instructions.c"
-
 static struct arch architectures[] = {
 	{
 		.name = "arc",
 		.init = arc__annotate_init,
+		.e_machine = EM_ARC,
 	},
 	{
 		.name = "arm",
 		.init = arm__annotate_init,
+		.e_machine = EM_ARM,
 	},
 	{
 		.name = "arm64",
 		.init = arm64__annotate_init,
+		.e_machine = EM_AARCH64,
 	},
 	{
 		.name = "csky",
 		.init = csky__annotate_init,
+		.e_machine = EM_CSKY,
+#if defined(__CSKYABIV2__)
+		.e_flags = EF_CSKY_ABIV2,
+#else
+		.e_flags = EF_CSKY_ABIV1,
+#endif
 	},
 	{
 		.name = "mips",
 		.init = mips__annotate_init,
-		.objdump = {
-			.comment_char = '#',
-		},
+		.e_machine = EM_MIPS,
 	},
 	{
 		.name = "x86",
 		.init = x86__annotate_init,
-		.instructions = x86__instructions,
-		.nr_instructions = ARRAY_SIZE(x86__instructions),
-		.sorted_instructions = true,
-		.insn_suffix = "bwlq",
-		.objdump =  {
-			.comment_char = '#',
-			.register_char = '%',
-			.memory_ref_char = '(',
-			.imm_char = '$',
-		},
-#ifdef HAVE_LIBDW_SUPPORT
-		.update_insn_state = update_insn_state_x86,
-#endif
+		.e_machine = EM_X86_64, // TODO: EM_386 too.
 	},
 	{
 		.name = "powerpc",
 		.init = powerpc__annotate_init,
-#ifdef HAVE_LIBDW_SUPPORT
-		.update_insn_state = update_insn_state_powerpc,
-#endif
+		.e_machine = EM_PPC, // TODO: EM_PPC64 too.
 	},
 	{
 		.name = "riscv64",
 		.init = riscv64__annotate_init,
+		.e_machine = EM_RISCV,
 	},
 	{
 		.name = "s390",
 		.init = s390__annotate_init,
-		.objdump =  {
-			.comment_char = '#',
-		},
+		.e_machine = EM_S390,
 	},
 	{
 		.name = "sparc",
 		.init = sparc__annotate_init,
-		.objdump = {
-			.comment_char = '#',
-		},
+		.e_machine = EM_SPARC,
 	},
 	{
 		.name = "loongarch",
 		.init = loongarch__annotate_init,
-		.objdump = {
-			.comment_char = '#',
-		},
+		.e_machine = EM_LOONGARCH,
 	},
 };
 
@@ -248,14 +219,14 @@ static void ins_ops__delete(struct ins_operands *ops)
 	zfree(&ops->target.name);
 }
 
-static int ins__raw_scnprintf(const struct ins *ins, char *bf, size_t size,
-			      struct ins_operands *ops, int max_ins_name)
+int ins__raw_scnprintf(const struct ins *ins, char *bf, size_t size,
+			   struct ins_operands *ops, int max_ins_name)
 {
 	return scnprintf(bf, size, "%-*s %s", max_ins_name, ins->name, ops->raw);
 }
 
-static int ins__scnprintf(const struct ins *ins, char *bf, size_t size,
-			  struct ins_operands *ops, int max_ins_name)
+int ins__scnprintf(const struct ins *ins, char *bf, size_t size,
+		   struct ins_operands *ops, int max_ins_name)
 {
 	if (ins->ops->scnprintf)
 		return ins->ops->scnprintf(ins, bf, size, ops, max_ins_name);
@@ -326,8 +297,8 @@ indirect_call:
 	goto find_target;
 }
 
-static int call__scnprintf(const struct ins *ins, char *bf, size_t size,
-			   struct ins_operands *ops, int max_ins_name)
+int call__scnprintf(const struct ins *ins, char *bf, size_t size,
+		      struct ins_operands *ops, int max_ins_name)
 {
 	if (ops->target.sym)
 		return scnprintf(bf, size, "%-*s %s", max_ins_name, ins->name, ops->target.sym->name);
@@ -341,7 +312,7 @@ static int call__scnprintf(const struct ins *ins, char *bf, size_t size,
 	return scnprintf(bf, size, "%-*s *%" PRIx64, max_ins_name, ins->name, ops->target.addr);
 }
 
-static const struct ins_ops call_ops = {
+const struct ins_ops call_ops = {
 	.parse	   = call__parse,
 	.scnprintf = call__scnprintf,
 };
@@ -453,8 +424,8 @@ static int jump__parse(const struct arch *arch, struct ins_operands *ops, struct
 	return 0;
 }
 
-static int jump__scnprintf(const struct ins *ins, char *bf, size_t size,
-			   struct ins_operands *ops, int max_ins_name)
+int jump__scnprintf(const struct ins *ins, char *bf, size_t size,
+		      struct ins_operands *ops, int max_ins_name)
 {
 	const char *c;
 
@@ -494,7 +465,7 @@ static void jump__delete(struct ins_operands *ops __maybe_unused)
 	 */
 }
 
-static const struct ins_ops jump_ops = {
+const struct ins_ops jump_ops = {
 	.free	   = jump__delete,
 	.parse	   = jump__parse,
 	.scnprintf = jump__scnprintf,
@@ -586,7 +557,7 @@ static void lock__delete(struct ins_operands *ops)
 	zfree(&ops->target.name);
 }
 
-static const struct ins_ops lock_ops = {
+const struct ins_ops lock_ops = {
 	.free	   = lock__delete,
 	.parse	   = lock__parse,
 	.scnprintf = lock__scnprintf,
@@ -687,103 +658,19 @@ out_free_source:
 	return -1;
 }
 
-static int mov__scnprintf(const struct ins *ins, char *bf, size_t size,
-			   struct ins_operands *ops, int max_ins_name)
+int mov__scnprintf(const struct ins *ins, char *bf, size_t size,
+		     struct ins_operands *ops, int max_ins_name)
 {
 	return scnprintf(bf, size, "%-*s %s,%s", max_ins_name, ins->name,
 			 ops->source.name ?: ops->source.raw,
 			 ops->target.name ?: ops->target.raw);
 }
 
-static const struct ins_ops mov_ops = {
+const struct ins_ops mov_ops = {
 	.parse	   = mov__parse,
 	.scnprintf = mov__scnprintf,
 };
 
-#define PPC_22_30(R)    (((R) >> 1) & 0x1ff)
-#define MINUS_EXT_XO_FORM	234
-#define SUB_EXT_XO_FORM		232
-#define	ADD_ZERO_EXT_XO_FORM	202
-#define	SUB_ZERO_EXT_XO_FORM	200
-
-static int arithmetic__scnprintf(const struct ins *ins, char *bf, size_t size,
-		struct ins_operands *ops, int max_ins_name)
-{
-	return scnprintf(bf, size, "%-*s %s", max_ins_name, ins->name,
-			ops->raw);
-}
-
-/*
- * Sets the fields: multi_regs and "mem_ref".
- * "mem_ref" is set for ops->source which is later used to
- * fill the objdump->memory_ref-char field. This ops is currently
- * used by powerpc and since binary instruction code is used to
- * extract opcode, regs and offset, no other parsing is needed here.
- *
- * Dont set multi regs for 4 cases since it has only one operand
- * for source:
- * - Add to Minus One Extended XO-form ( Ex: addme, addmeo )
- * - Subtract From Minus One Extended XO-form ( Ex: subfme )
- * - Add to Zero Extended XO-form ( Ex: addze, addzeo )
- * - Subtract From Zero Extended XO-form ( Ex: subfze )
- */
-static int arithmetic__parse(const struct arch *arch __maybe_unused, struct ins_operands *ops,
-		struct map_symbol *ms __maybe_unused, struct disasm_line *dl)
-{
-	int opcode = PPC_OP(dl->raw.raw_insn);
-
-	ops->source.mem_ref = false;
-	if (opcode == 31) {
-		if ((opcode != MINUS_EXT_XO_FORM) && (opcode != SUB_EXT_XO_FORM) \
-				&& (opcode != ADD_ZERO_EXT_XO_FORM) && (opcode != SUB_ZERO_EXT_XO_FORM))
-			ops->source.multi_regs = true;
-	}
-
-	ops->target.mem_ref = false;
-	ops->target.multi_regs = false;
-
-	return 0;
-}
-
-static const struct ins_ops arithmetic_ops = {
-	.parse     = arithmetic__parse,
-	.scnprintf = arithmetic__scnprintf,
-};
-
-static int load_store__scnprintf(const struct ins *ins, char *bf, size_t size,
-		struct ins_operands *ops, int max_ins_name)
-{
-	return scnprintf(bf, size, "%-*s %s", max_ins_name, ins->name,
-			ops->raw);
-}
-
-/*
- * Sets the fields: multi_regs and "mem_ref".
- * "mem_ref" is set for ops->source which is later used to
- * fill the objdump->memory_ref-char field. This ops is currently
- * used by powerpc and since binary instruction code is used to
- * extract opcode, regs and offset, no other parsing is needed here
- */
-static int load_store__parse(const struct arch *arch __maybe_unused, struct ins_operands *ops,
-		struct map_symbol *ms __maybe_unused, struct disasm_line *dl __maybe_unused)
-{
-	ops->source.mem_ref = true;
-	ops->source.multi_regs = false;
-	/* opcode 31 is of X form */
-	if (PPC_OP(dl->raw.raw_insn) == 31)
-		ops->source.multi_regs = true;
-
-	ops->target.mem_ref = false;
-	ops->target.multi_regs = false;
-
-	return 0;
-}
-
-static const struct ins_ops load_store_ops = {
-	.parse     = load_store__parse,
-	.scnprintf = load_store__scnprintf,
-};
-
 static int dec__parse(const struct arch *arch __maybe_unused, struct ins_operands *ops,
 		      struct map_symbol *ms __maybe_unused,
 		      struct disasm_line *dl __maybe_unused)
@@ -820,7 +707,7 @@ static int dec__scnprintf(const struct ins *ins, char *bf, size_t size,
 			 ops->target.name ?: ops->target.raw);
 }
 
-static const struct ins_ops dec_ops = {
+const struct ins_ops dec_ops = {
 	.parse	   = dec__parse,
 	.scnprintf = dec__scnprintf,
 };
@@ -831,11 +718,11 @@ static int nop__scnprintf(const struct ins *ins __maybe_unused, char *bf, size_t
 	return scnprintf(bf, size, "%-*s", max_ins_name, "nop");
 }
 
-static const struct ins_ops nop_ops = {
+const struct ins_ops nop_ops = {
 	.scnprintf = nop__scnprintf,
 };
 
-static const struct ins_ops ret_ops = {
+const struct ins_ops ret_ops = {
 	.scnprintf = ins__raw_scnprintf,
 };
 
diff --git a/tools/perf/util/disasm.h b/tools/perf/util/disasm.h
index db7f1ee3d8e7..83503c5075f9 100644
--- a/tools/perf/util/disasm.h
+++ b/tools/perf/util/disasm.h
@@ -109,6 +109,28 @@ const struct arch *arch__find(const char *name);
 bool arch__is_x86(const struct arch *arch);
 bool arch__is_powerpc(const struct arch *arch);
 
+extern const struct ins_ops call_ops;
+extern const struct ins_ops dec_ops;
+extern const struct ins_ops jump_ops;
+extern const struct ins_ops mov_ops;
+extern const struct ins_ops nop_ops;
+extern const struct ins_ops lock_ops;
+extern const struct ins_ops ret_ops;
+
+int arch__associate_ins_ops(struct arch *arch, const char *name, const struct ins_ops *ops);
+
+int arc__annotate_init(struct arch *arch, char *cpuid);
+int arm__annotate_init(struct arch *arch, char *cpuid);
+int arm64__annotate_init(struct arch *arch, char *cpuid);
+int csky__annotate_init(struct arch *arch, char *cpuid);
+int loongarch__annotate_init(struct arch *arch, char *cpuid);
+int mips__annotate_init(struct arch *arch, char *cpuid);
+int powerpc__annotate_init(struct arch *arch, char *cpuid);
+int riscv64__annotate_init(struct arch *arch, char *cpuid);
+int s390__annotate_init(struct arch *arch, char *cpuid);
+int sparc__annotate_init(struct arch *arch, char *cpuid);
+int x86__annotate_init(struct arch *arch, char *cpuid);
+
 const struct ins_ops *ins__find(const struct arch *arch, const char *name, struct disasm_line *dl);
 
 bool ins__is_call(const struct ins *ins);
@@ -117,12 +139,28 @@ bool ins__is_fused(const struct arch *arch, const char *ins1, const char *ins2);
 bool ins__is_ret(const struct ins *ins);
 bool ins__is_lock(const struct ins *ins);
 
+extern const struct ins_ops s390_call_ops;
+extern const struct ins_ops loongarch_call_ops;
+extern const struct ins_ops loongarch_jump_ops;
+const struct ins_ops *check_ppc_insn(struct disasm_line *dl);
+
 struct disasm_line *disasm_line__new(struct annotate_args *args);
 void disasm_line__free(struct disasm_line *dl);
 
 int disasm_line__scnprintf(struct disasm_line *dl, char *bf, size_t size,
 			   bool raw, int max_ins_name);
 
+int ins__raw_scnprintf(const struct ins *ins, char *bf, size_t size,
+			   struct ins_operands *ops, int max_ins_name);
+int ins__scnprintf(const struct ins *ins, char *bf, size_t size,
+		   struct ins_operands *ops, int max_ins_name);
+int call__scnprintf(const struct ins *ins, char *bf, size_t size,
+		    struct ins_operands *ops, int max_ins_name);
+int jump__scnprintf(const struct ins *ins, char *bf, size_t size,
+		    struct ins_operands *ops, int max_ins_name);
+int mov__scnprintf(const struct ins *ins, char *bf, size_t size,
+		   struct ins_operands *ops, int max_ins_name);
+
 int symbol__disassemble(struct symbol *sym, struct annotate_args *args);
 
 char *expand_tabs(char *line, char **storage, size_t *storage_len);
-- 
cgit v1.2.3


From 5301cc698821551b34d0b357cf7842984182b35c Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Thu, 22 Jan 2026 13:35:13 -0800
Subject: perf disasm: Refactor ins__is_call/jump to avoid exposing arch
 ins_ops
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add booleans indicating whether and ins_ops are call or jump and
return it. This avoids exposing loongarch and s390 ins_ops for the
sake of matching.

Reviewed-by: James Clark <james.clark@linaro.org>
Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Aditya Bodkhe <aditya.b1@linux.ibm.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Alexandre Ghiti <alex@ghiti.fr>
Cc: Athira Rajeev <atrajeev@linux.ibm.com>
Cc: Bill Wendling <morbo@google.com>
Cc: Dr. David Alan Gilbert <linux@treblig.org>
Cc: Guo Ren <guoren@kernel.org>
Cc: Howard Chu <howardchu95@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Julia Lawall <Julia.Lawall@inria.fr>
Cc: Justin Stitt <justinstitt@google.com>
Cc: Krzysztof Łopatowski <krzysztof.m.lopatowski@gmail.com>
Cc: Leo Yan <leo.yan@linux.dev>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Nathan Chancellor <nathan@kernel.org>
Cc: Nick Desaulniers <nick.desaulniers+lkml@gmail.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <pjw@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sergei Trofimovich <slyich@gmail.com>
Cc: Shimin Guo <shimin.guo@skydio.com>
Cc: Suchit Karunakaran <suchitkarunakaran@gmail.com>
Cc: Thomas Falcon <thomas.falcon@intel.com>
Cc: Tianyou Li <tianyou.li@intel.com>
Cc: Will Deacon <will@kernel.org>
Cc: Zecheng Li <zecheng@google.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/annotate-arch/annotate-loongarch.c | 6 ++++--
 tools/perf/util/annotate-arch/annotate-s390.c      | 3 ++-
 tools/perf/util/disasm.c                           | 6 ++++--
 tools/perf/util/disasm.h                           | 5 ++---
 4 files changed, 12 insertions(+), 8 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/annotate-arch/annotate-loongarch.c b/tools/perf/util/annotate-arch/annotate-loongarch.c
index 32df10f6fed5..79dc116ade2f 100644
--- a/tools/perf/util/annotate-arch/annotate-loongarch.c
+++ b/tools/perf/util/annotate-arch/annotate-loongarch.c
@@ -57,9 +57,10 @@ static int loongarch_call__parse(const struct arch *arch, struct ins_operands *o
 	return 0;
 }
 
-const struct ins_ops loongarch_call_ops = {
+static const struct ins_ops loongarch_call_ops = {
 	.parse	   = loongarch_call__parse,
 	.scnprintf = call__scnprintf,
+	.is_call   = true,
 };
 
 static int loongarch_jump__parse(const struct arch *arch, struct ins_operands *ops,
@@ -106,9 +107,10 @@ static int loongarch_jump__parse(const struct arch *arch, struct ins_operands *o
 	return 0;
 }
 
-const struct ins_ops loongarch_jump_ops = {
+static const struct ins_ops loongarch_jump_ops = {
 	.parse	   = loongarch_jump__parse,
 	.scnprintf = jump__scnprintf,
+	.is_jump   = true,
 };
 
 static
diff --git a/tools/perf/util/annotate-arch/annotate-s390.c b/tools/perf/util/annotate-arch/annotate-s390.c
index 81db102b3e15..7b2d27b62e6b 100644
--- a/tools/perf/util/annotate-arch/annotate-s390.c
+++ b/tools/perf/util/annotate-arch/annotate-s390.c
@@ -57,9 +57,10 @@ static int s390_call__parse(const struct arch *arch, struct ins_operands *ops,
 	return 0;
 }
 
-const struct ins_ops s390_call_ops = {
+static const struct ins_ops s390_call_ops = {
 	.parse	   = s390_call__parse,
 	.scnprintf = call__scnprintf,
+	.is_call   = true,
 };
 
 static int s390_mov__parse(const struct arch *arch __maybe_unused,
diff --git a/tools/perf/util/disasm.c b/tools/perf/util/disasm.c
index 845c2d0f39b1..8c3e9094600a 100644
--- a/tools/perf/util/disasm.c
+++ b/tools/perf/util/disasm.c
@@ -315,11 +315,12 @@ int call__scnprintf(const struct ins *ins, char *bf, size_t size,
 const struct ins_ops call_ops = {
 	.parse	   = call__parse,
 	.scnprintf = call__scnprintf,
+	.is_call   = true,
 };
 
 bool ins__is_call(const struct ins *ins)
 {
-	return ins->ops == &call_ops || ins->ops == &s390_call_ops || ins->ops == &loongarch_call_ops;
+	return ins->ops && ins->ops->is_call;
 }
 
 /*
@@ -469,11 +470,12 @@ const struct ins_ops jump_ops = {
 	.free	   = jump__delete,
 	.parse	   = jump__parse,
 	.scnprintf = jump__scnprintf,
+	.is_jump   = true,
 };
 
 bool ins__is_jump(const struct ins *ins)
 {
-	return ins->ops == &jump_ops || ins->ops == &loongarch_jump_ops;
+	return ins->ops && ins->ops->is_jump;
 }
 
 static int comment__symbol(char *raw, char *comment, u64 *addrp, char **namep)
diff --git a/tools/perf/util/disasm.h b/tools/perf/util/disasm.h
index 83503c5075f9..b6a2a30fdf27 100644
--- a/tools/perf/util/disasm.h
+++ b/tools/perf/util/disasm.h
@@ -93,6 +93,8 @@ struct ins_ops {
 			struct disasm_line *dl);
 	int (*scnprintf)(const struct ins *ins, char *bf, size_t size,
 			 struct ins_operands *ops, int max_ins_name);
+	bool is_jump;
+	bool is_call;
 };
 
 struct annotate_args {
@@ -139,9 +141,6 @@ bool ins__is_fused(const struct arch *arch, const char *ins1, const char *ins2);
 bool ins__is_ret(const struct ins *ins);
 bool ins__is_lock(const struct ins *ins);
 
-extern const struct ins_ops s390_call_ops;
-extern const struct ins_ops loongarch_call_ops;
-extern const struct ins_ops loongarch_jump_ops;
 const struct ins_ops *check_ppc_insn(struct disasm_line *dl);
 
 struct disasm_line *disasm_line__new(struct annotate_args *args);
-- 
cgit v1.2.3


From c4e3a00356fffb20c03bd9609083afb1dc4a2edf Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Thu, 22 Jan 2026 13:35:14 -0800
Subject: perf map_symbol: Switch from holding maps to holding thread
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

maps may belong to >1 thread. In contexts like symbolization
information from the thread may be useful, such as the ELF machine.

As the maps can be gained from the thread switch from holding maps in
struct map_symbol to holding the thread.

Holding the maps in addr_location is also redundant, switch this to
using thread__maps.

Reviewed-by: James Clark <james.clark@linaro.org>
Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Aditya Bodkhe <aditya.b1@linux.ibm.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Alexandre Ghiti <alex@ghiti.fr>
Cc: Athira Rajeev <atrajeev@linux.ibm.com>
Cc: Bill Wendling <morbo@google.com>
Cc: Dr. David Alan Gilbert <linux@treblig.org>
Cc: Guo Ren <guoren@kernel.org>
Cc: Howard Chu <howardchu95@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Julia Lawall <Julia.Lawall@inria.fr>
Cc: Justin Stitt <justinstitt@google.com>
Cc: Krzysztof Łopatowski <krzysztof.m.lopatowski@gmail.com>
Cc: Leo Yan <leo.yan@linux.dev>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Nathan Chancellor <nathan@kernel.org>
Cc: Nick Desaulniers <nick.desaulniers+lkml@gmail.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <pjw@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sergei Trofimovich <slyich@gmail.com>
Cc: Shimin Guo <shimin.guo@skydio.com>
Cc: Suchit Karunakaran <suchitkarunakaran@gmail.com>
Cc: Thomas Falcon <thomas.falcon@intel.com>
Cc: Tianyou Li <tianyou.li@intel.com>
Cc: Will Deacon <will@kernel.org>
Cc: Zecheng Li <zecheng@google.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/ui/browsers/annotate.c                  |  2 +-
 tools/perf/ui/browsers/hists.c                     |  3 ++-
 tools/perf/util/addr_location.c                    |  4 ----
 tools/perf/util/addr_location.h                    |  1 -
 tools/perf/util/annotate-arch/annotate-loongarch.c |  5 +++--
 tools/perf/util/annotate-arch/annotate-s390.c      |  3 ++-
 tools/perf/util/callchain.c                        | 12 +++++++-----
 tools/perf/util/capstone.c                         |  6 ++++--
 tools/perf/util/db-export.c                        |  1 -
 tools/perf/util/disasm.c                           |  7 ++++---
 tools/perf/util/event.c                            |  2 --
 tools/perf/util/hist.c                             | 18 +++++++++---------
 tools/perf/util/machine.c                          | 13 +++++++------
 tools/perf/util/map_symbol.c                       |  5 +++--
 tools/perf/util/map_symbol.h                       |  3 ++-
 tools/perf/util/sort.c                             |  2 +-
 tools/perf/util/unwind-libdw.c                     |  2 +-
 tools/perf/util/unwind-libunwind-local.c           |  2 +-
 18 files changed, 47 insertions(+), 44 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/ui/browsers/annotate.c b/tools/perf/ui/browsers/annotate.c
index 3df61cd46652..91ded9c271ee 100644
--- a/tools/perf/ui/browsers/annotate.c
+++ b/tools/perf/ui/browsers/annotate.c
@@ -601,7 +601,7 @@ static bool annotate_browser__callq(struct annotate_browser *browser,
 		return true;
 	}
 
-	target_ms.maps = ms->maps;
+	target_ms.thread = ms->thread;
 	target_ms.map = ms->map;
 	target_ms.sym = dl->ops.target.sym;
 	annotation__unlock(notes);
diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c
index 08fecbe28a52..cfa6386e6e1d 100644
--- a/tools/perf/ui/browsers/hists.c
+++ b/tools/perf/ui/browsers/hists.c
@@ -3189,7 +3189,8 @@ do_hotkey:		 // key came straight from options ui__popup_menu()
 		case 'k':
 			if (browser->selection != NULL)
 				hists_browser__zoom_map(browser,
-					      maps__machine(browser->selection->maps)->vmlinux_map);
+					maps__machine(thread__maps(browser->selection->thread)
+						     )->vmlinux_map);
 			continue;
 		case 'V':
 			verbose = (verbose + 1) % 4;
diff --git a/tools/perf/util/addr_location.c b/tools/perf/util/addr_location.c
index 81a0b79c5e10..57e8217a00f9 100644
--- a/tools/perf/util/addr_location.c
+++ b/tools/perf/util/addr_location.c
@@ -7,7 +7,6 @@
 void addr_location__init(struct addr_location *al)
 {
 	al->thread = NULL;
-	al->maps = NULL;
 	al->map = NULL;
 	al->sym = NULL;
 	al->srcline = NULL;
@@ -30,16 +29,13 @@ void addr_location__exit(struct addr_location *al)
 {
 	map__zput(al->map);
 	thread__zput(al->thread);
-	maps__zput(al->maps);
 }
 
 void addr_location__copy(struct addr_location *dst, struct addr_location *src)
 {
 	thread__put(dst->thread);
-	maps__put(dst->maps);
 	map__put(dst->map);
 	*dst = *src;
 	dst->thread = thread__get(src->thread);
-	dst->maps = maps__get(src->maps);
 	dst->map = map__get(src->map);
 }
diff --git a/tools/perf/util/addr_location.h b/tools/perf/util/addr_location.h
index 64b551025216..fdc4d3f3a68b 100644
--- a/tools/perf/util/addr_location.h
+++ b/tools/perf/util/addr_location.h
@@ -11,7 +11,6 @@ struct symbol;
 
 struct addr_location {
 	struct thread *thread;
-	struct maps   *maps;
 	struct map    *map;
 	struct symbol *sym;
 	const char    *srcline;
diff --git a/tools/perf/util/annotate-arch/annotate-loongarch.c b/tools/perf/util/annotate-arch/annotate-loongarch.c
index 79dc116ade2f..6c94cb98a104 100644
--- a/tools/perf/util/annotate-arch/annotate-loongarch.c
+++ b/tools/perf/util/annotate-arch/annotate-loongarch.c
@@ -11,6 +11,7 @@
 #include "../map.h"
 #include "../maps.h"
 #include "../symbol.h"
+#include "../thread.h"
 
 static int loongarch_call__parse(const struct arch *arch, struct ins_operands *ops,
 				 struct map_symbol *ms,
@@ -49,7 +50,7 @@ static int loongarch_call__parse(const struct arch *arch, struct ins_operands *o
 		.addr = map__objdump_2mem(map, ops->target.addr),
 	};
 
-	if (maps__find_ams(ms->maps, &target) == 0 &&
+	if (maps__find_ams(thread__maps(ms->thread), &target) == 0 &&
 	    map__rip_2objdump(target.ms.map, map__map_ip(target.ms.map, target.addr)) == ops->target.addr)
 		ops->target.sym = target.ms.sym;
 
@@ -93,7 +94,7 @@ static int loongarch_jump__parse(const struct arch *arch, struct ins_operands *o
 
 	ops->target.outside = target.addr < start || target.addr > end;
 
-	if (maps__find_ams(ms->maps, &target) == 0 &&
+	if (maps__find_ams(thread__maps(ms->thread), &target) == 0 &&
 	    map__rip_2objdump(target.ms.map, map__map_ip(target.ms.map, target.addr)) == ops->target.addr)
 		ops->target.sym = target.ms.sym;
 
diff --git a/tools/perf/util/annotate-arch/annotate-s390.c b/tools/perf/util/annotate-arch/annotate-s390.c
index 7b2d27b62e6b..47573f0310c1 100644
--- a/tools/perf/util/annotate-arch/annotate-s390.c
+++ b/tools/perf/util/annotate-arch/annotate-s390.c
@@ -6,6 +6,7 @@
 #include "../map.h"
 #include "../maps.h"
 #include "../symbol.h"
+#include "../thread.h"
 #include "../annotate.h"
 #include "../annotate-data.h"
 
@@ -49,7 +50,7 @@ static int s390_call__parse(const struct arch *arch, struct ins_operands *ops,
 		.addr = map__objdump_2mem(map, ops->target.addr),
 	};
 
-	if (maps__find_ams(ms->maps, &target) == 0 &&
+	if (maps__find_ams(thread__maps(ms->thread), &target) == 0 &&
 	    map__rip_2objdump(target.ms.map, map__map_ip(target.ms.map, target.addr)) == ops->target.addr)
 		ops->target.sym = target.ms.sym;
 
diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c
index 428e5350d7a2..515bb8b5da01 100644
--- a/tools/perf/util/callchain.c
+++ b/tools/perf/util/callchain.c
@@ -31,6 +31,7 @@
 #include "callchain.h"
 #include "branch.h"
 #include "symbol.h"
+#include "thread.h"
 #include "util.h"
 #include "../perf.h"
 
@@ -1042,7 +1043,7 @@ merge_chain_branch(struct callchain_cursor *cursor,
 
 	list_for_each_entry_safe(list, next_list, &src->val, list) {
 		struct map_symbol ms = {
-			.maps = maps__get(list->ms.maps),
+			.thread = thread__get(list->ms.thread),
 			.map = map__get(list->ms.map),
 		};
 		callchain_cursor_append(cursor, list->ip, &ms, false, NULL, 0, 0, 0, list->srcline);
@@ -1147,10 +1148,11 @@ int hist_entry__append_callchain(struct hist_entry *he, struct perf_sample *samp
 int fill_callchain_info(struct addr_location *al, struct callchain_cursor_node *node,
 			bool hide_unresolved)
 {
-	struct machine *machine = node->ms.maps ? maps__machine(node->ms.maps) : NULL;
+	struct machine *machine = NULL;
+
+	if (node->ms.thread)
+		machine = maps__machine(thread__maps(node->ms.thread));
 
-	maps__put(al->maps);
-	al->maps = maps__get(node->ms.maps);
 	map__put(al->map);
 	al->map = map__get(node->ms.map);
 	al->sym = node->ms.sym;
@@ -1163,7 +1165,7 @@ int fill_callchain_info(struct addr_location *al, struct callchain_cursor_node *
 		if (al->map == NULL)
 			goto out;
 	}
-	if (maps__equal(al->maps, machine__kernel_maps(machine))) {
+	if (maps__equal(thread__maps(al->thread), machine__kernel_maps(machine))) {
 		if (machine__is_host(machine)) {
 			al->cpumode = PERF_RECORD_MISC_KERNEL;
 			al->level = 'k';
diff --git a/tools/perf/util/capstone.c b/tools/perf/util/capstone.c
index ce06cfd253ef..9216916f848f 100644
--- a/tools/perf/util/capstone.c
+++ b/tools/perf/util/capstone.c
@@ -268,7 +268,8 @@ int symbol__disassemble_capstone(const char *filename __maybe_unused,
 	    !strcmp(args->options->disassembler_style, "att"))
 		disassembler_style = true;
 
-	if (capstone_init(maps__machine(args->ms->maps), &handle, is_64bit, disassembler_style) < 0)
+	if (capstone_init(maps__machine(thread__maps(args->ms->thread)), &handle, is_64bit,
+			  disassembler_style) < 0)
 		goto err;
 
 	needs_cs_close = true;
@@ -382,7 +383,8 @@ int symbol__disassemble_capstone_powerpc(const char *filename __maybe_unused,
 	    !strcmp(args->options->disassembler_style, "att"))
 		disassembler_style = true;
 
-	if (capstone_init(maps__machine(args->ms->maps), &handle, is_64bit, disassembler_style) < 0)
+	if (capstone_init(maps__machine(thread__maps(args->ms->thread)), &handle, is_64bit,
+			  disassembler_style) < 0)
 		goto err;
 
 	needs_cs_close = true;
diff --git a/tools/perf/util/db-export.c b/tools/perf/util/db-export.c
index 8f52e8cefcf3..ae9a9065aab7 100644
--- a/tools/perf/util/db-export.c
+++ b/tools/perf/util/db-export.c
@@ -254,7 +254,6 @@ static struct call_path *call_path_from_sample(struct db_export *dbe,
 		addr_location__init(&al);
 		al.sym = node->ms.sym;
 		al.map = map__get(node->ms.map);
-		al.maps = maps__get(thread__maps(thread));
 		al.addr = node->ip;
 		al.thread = thread__get(thread);
 
diff --git a/tools/perf/util/disasm.c b/tools/perf/util/disasm.c
index 8c3e9094600a..d81469db0aac 100644
--- a/tools/perf/util/disasm.c
+++ b/tools/perf/util/disasm.c
@@ -28,6 +28,7 @@
 #include "namespaces.h"
 #include "srcline.h"
 #include "symbol.h"
+#include "thread.h"
 #include "util.h"
 
 static regex_t	 file_lineno;
@@ -277,7 +278,7 @@ find_target:
 		.addr = map__objdump_2mem(map, ops->target.addr),
 	};
 
-	if (maps__find_ams(ms->maps, &target) == 0 &&
+	if (maps__find_ams(thread__maps(ms->thread), &target) == 0 &&
 	    map__rip_2objdump(target.ms.map, map__map_ip(target.ms.map, target.addr)) == ops->target.addr)
 		ops->target.sym = target.ms.sym;
 
@@ -411,7 +412,7 @@ static int jump__parse(const struct arch *arch, struct ins_operands *ops, struct
 	 * Actual navigation will come next, with further understanding of how
 	 * the symbol searching and disassembly should be done.
 	 */
-	if (maps__find_ams(ms->maps, &target) == 0 &&
+	if (maps__find_ams(thread__maps(ms->thread), &target) == 0 &&
 	    map__rip_2objdump(target.ms.map, map__map_ip(target.ms.map, target.addr)) == ops->target.addr)
 		ops->target.sym = target.ms.sym;
 
@@ -1074,7 +1075,7 @@ static int symbol__parse_objdump_line(struct symbol *sym,
 			.ms = { .map = map__get(map), },
 		};
 
-		if (!maps__find_ams(args->ms->maps, &target) &&
+		if (!maps__find_ams(thread__maps(args->ms->thread), &target) &&
 		    target.ms.sym->start == target.al_addr)
 			dl->ops.target.sym = target.ms.sym;
 
diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c
index 2dde1044b5a7..bc045fddf7d5 100644
--- a/tools/perf/util/event.c
+++ b/tools/perf/util/event.c
@@ -698,7 +698,6 @@ struct map *thread__find_map(struct thread *thread, u8 cpumode, u64 addr,
 	struct machine *machine = maps__machine(maps);
 	bool load_map = false;
 
-	maps__zput(al->maps);
 	map__zput(al->map);
 	thread__zput(al->thread);
 	al->thread = thread__get(thread);
@@ -736,7 +735,6 @@ struct map *thread__find_map(struct thread *thread, u8 cpumode, u64 addr,
 
 		return NULL;
 	}
-	al->maps = maps__get(maps);
 	al->map = maps__find(maps, al->addr);
 	if (al->map != NULL) {
 		/*
diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
index ef4b569f7df4..7ffaa3d9851b 100644
--- a/tools/perf/util/hist.c
+++ b/tools/perf/util/hist.c
@@ -251,7 +251,7 @@ void hists__calc_col_len(struct hists *hists, struct hist_entry *h)
 
 	if (h->cgroup) {
 		const char *cgrp_name = "unknown";
-		struct cgroup *cgrp = cgroup__find(maps__machine(h->ms.maps)->env,
+		struct cgroup *cgrp = cgroup__find(maps__machine(thread__maps(h->ms.thread))->env,
 						   h->cgroup);
 		if (cgrp != NULL)
 			cgrp_name = cgrp->name;
@@ -536,7 +536,7 @@ static int hist_entry__init(struct hist_entry *he,
 			memset(&he->stat, 0, sizeof(he->stat));
 	}
 
-	he->ms.maps = maps__get(he->ms.maps);
+	he->ms.thread = thread__get(he->ms.thread);
 	he->ms.map = map__get(he->ms.map);
 
 	if (he->branch_info) {
@@ -552,9 +552,9 @@ static int hist_entry__init(struct hist_entry *he,
 		memcpy(he->branch_info, template->branch_info,
 		       sizeof(*he->branch_info));
 
-		he->branch_info->from.ms.maps = maps__get(he->branch_info->from.ms.maps);
+		he->branch_info->from.ms.thread = thread__get(he->branch_info->from.ms.thread);
 		he->branch_info->from.ms.map = map__get(he->branch_info->from.ms.map);
-		he->branch_info->to.ms.maps = maps__get(he->branch_info->to.ms.maps);
+		he->branch_info->to.ms.thread = thread__get(he->branch_info->to.ms.thread);
 		he->branch_info->to.ms.map = map__get(he->branch_info->to.ms.map);
 	}
 
@@ -810,7 +810,7 @@ __hists__add_entry(struct hists *hists,
 		},
 		.cgroup = sample->cgroup,
 		.ms = {
-			.maps	= al->maps,
+			.thread	= al->thread,
 			.map	= al->map,
 			.sym	= al->sym,
 		},
@@ -890,7 +890,7 @@ struct hist_entry *hists__add_entry_block(struct hists *hists,
 		.block_info = block_info,
 		.hists = hists,
 		.ms = {
-			.maps = al->maps,
+			.thread = al->thread,
 			.map = al->map,
 			.sym = al->sym,
 		},
@@ -1020,8 +1020,8 @@ iter_next_branch_entry(struct hist_entry_iter *iter, struct addr_location *al)
 	if (iter->curr >= iter->total)
 		return 0;
 
-	maps__put(al->maps);
-	al->maps = maps__get(bi[i].to.ms.maps);
+	thread__put(al->thread);
+	al->thread = thread__get(bi[i].to.ms.thread);
 	map__put(al->map);
 	al->map = map__get(bi[i].to.ms.map);
 	al->sym = bi[i].to.ms.sym;
@@ -1232,7 +1232,7 @@ iter_add_next_cumulative_entry(struct hist_entry_iter *iter,
 		.comm = thread__comm(al->thread),
 		.ip = al->addr,
 		.ms = {
-			.maps = al->maps,
+			.thread = al->thread,
 			.map = al->map,
 			.sym = al->sym,
 		},
diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
index 30d606fbf040..5b0f5a48ffd4 100644
--- a/tools/perf/util/machine.c
+++ b/tools/perf/util/machine.c
@@ -2016,7 +2016,7 @@ static void ip__resolve_ams(struct thread *thread,
 	ams->addr = ip;
 	ams->al_addr = al.addr;
 	ams->al_level = al.level;
-	ams->ms.maps = maps__get(al.maps);
+	ams->ms.thread = thread__get(al.thread);
 	ams->ms.sym = al.sym;
 	ams->ms.map = map__get(al.map);
 	ams->phys_addr = 0;
@@ -2037,7 +2037,7 @@ static void ip__resolve_data(struct thread *thread,
 	ams->addr = addr;
 	ams->al_addr = al.addr;
 	ams->al_level = al.level;
-	ams->ms.maps = maps__get(al.maps);
+	ams->ms.thread = thread__get(al.thread);
 	ams->ms.sym = al.sym;
 	ams->ms.map = map__get(al.map);
 	ams->phys_addr = phys_addr;
@@ -2120,7 +2120,7 @@ static int append_inlines(struct callchain_cursor *cursor, struct map_symbol *ms
 	}
 
 	ilist_ms = (struct map_symbol) {
-		.maps = maps__get(ms->maps),
+		.thread = thread__get(ms->thread),
 		.map = map__get(map),
 	};
 	list_for_each_entry(ilist, &inline_node->val, list) {
@@ -2220,7 +2220,7 @@ static int add_callchain_ip(struct thread *thread,
 		iter_cycles = iter->cycles;
 	}
 
-	ms.maps = maps__get(al.maps);
+	ms.thread = thread__get(al.thread);
 	ms.map = map__get(al.map);
 	ms.sym = al.sym;
 
@@ -2383,7 +2383,7 @@ static void save_lbr_cursor_node(struct thread *thread,
 	map_symbol__exit(&lbr_stitch->prev_lbr_cursor[idx].ms);
 	memcpy(&lbr_stitch->prev_lbr_cursor[idx], cursor->curr,
 	       sizeof(struct callchain_cursor_node));
-	lbr_stitch->prev_lbr_cursor[idx].ms.maps = maps__get(cursor->curr->ms.maps);
+	lbr_stitch->prev_lbr_cursor[idx].ms.thread = thread__get(cursor->curr->ms.thread);
 	lbr_stitch->prev_lbr_cursor[idx].ms.map = map__get(cursor->curr->ms.map);
 
 	lbr_stitch->prev_lbr_cursor[idx].valid = true;
@@ -2596,7 +2596,8 @@ static bool has_stitched_lbr(struct thread *thread,
 		memcpy(&stitch_node->cursor, &lbr_stitch->prev_lbr_cursor[i],
 		       sizeof(struct callchain_cursor_node));
 
-		stitch_node->cursor.ms.maps = maps__get(lbr_stitch->prev_lbr_cursor[i].ms.maps);
+		stitch_node->cursor.ms.thread =
+			thread__get(lbr_stitch->prev_lbr_cursor[i].ms.thread);
 		stitch_node->cursor.ms.map = map__get(lbr_stitch->prev_lbr_cursor[i].ms.map);
 
 		if (callee)
diff --git a/tools/perf/util/map_symbol.c b/tools/perf/util/map_symbol.c
index 6ad2960bc289..11bc0a7f704c 100644
--- a/tools/perf/util/map_symbol.c
+++ b/tools/perf/util/map_symbol.c
@@ -2,10 +2,11 @@
 #include "map_symbol.h"
 #include "maps.h"
 #include "map.h"
+#include "thread.h"
 
 void map_symbol__exit(struct map_symbol *ms)
 {
-	maps__zput(ms->maps);
+	thread__zput(ms->thread);
 	map__zput(ms->map);
 }
 
@@ -16,7 +17,7 @@ void addr_map_symbol__exit(struct addr_map_symbol *ams)
 
 void map_symbol__copy(struct map_symbol *dst, struct map_symbol *src)
 {
-	dst->maps = maps__get(src->maps);
+	dst->thread = thread__get(src->thread);
 	dst->map = map__get(src->map);
 	dst->sym = src->sym;
 }
diff --git a/tools/perf/util/map_symbol.h b/tools/perf/util/map_symbol.h
index e370bb32ed47..7437e319f4a3 100644
--- a/tools/perf/util/map_symbol.h
+++ b/tools/perf/util/map_symbol.h
@@ -4,12 +4,13 @@
 
 #include <linux/types.h>
 
+struct thread;
 struct maps;
 struct map;
 struct symbol;
 
 struct map_symbol {
-	struct maps   *maps;
+	struct thread *thread;
 	struct map    *map;
 	struct symbol *sym;
 };
diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c
index f963d61ac166..01a9d73ae348 100644
--- a/tools/perf/util/sort.c
+++ b/tools/perf/util/sort.c
@@ -1016,7 +1016,7 @@ static int hist_entry__cgroup_snprintf(struct hist_entry *he,
 	const char *cgrp_name = "N/A";
 
 	if (he->cgroup) {
-		struct cgroup *cgrp = cgroup__find(maps__machine(he->ms.maps)->env,
+		struct cgroup *cgrp = cgroup__find(maps__machine(thread__maps(he->ms.thread))->env,
 						   he->cgroup);
 		if (cgrp != NULL)
 			cgrp_name = cgrp->name;
diff --git a/tools/perf/util/unwind-libdw.c b/tools/perf/util/unwind-libdw.c
index c1646ef5f971..9cb0960ef905 100644
--- a/tools/perf/util/unwind-libdw.c
+++ b/tools/perf/util/unwind-libdw.c
@@ -161,7 +161,7 @@ static int entry(u64 ip, struct unwind_info *ui)
 	}
 
 	e->ip	  = ip;
-	e->ms.maps = maps__get(al.maps);
+	e->ms.thread = thread__get(al.thread);
 	e->ms.map = map__get(al.map);
 	e->ms.sym = al.sym;
 
diff --git a/tools/perf/util/unwind-libunwind-local.c b/tools/perf/util/unwind-libunwind-local.c
index a24b45106acd..ecf0c01fe51f 100644
--- a/tools/perf/util/unwind-libunwind-local.c
+++ b/tools/perf/util/unwind-libunwind-local.c
@@ -666,7 +666,7 @@ static int entry(u64 ip, struct thread *thread,
 	e.ms.sym = thread__find_symbol(thread, PERF_RECORD_MISC_USER, ip, &al);
 	e.ip     = ip;
 	e.ms.map = al.map;
-	e.ms.maps = al.maps;
+	e.ms.thread = thread__get(al.thread);
 
 	pr_debug("unwind: %s:ip = 0x%" PRIx64 " (0x%" PRIx64 ")\n",
 		 al.sym ? al.sym->name : "''",
-- 
cgit v1.2.3


From 0e26ba5a87744ee8957cc1f341e403c0fd758398 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Thu, 22 Jan 2026 13:35:15 -0800
Subject: perf disasm: Refactor arch__find and initialization of arch structs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Switch arch__find to using an ELF machine number rather than a
string.

Rather than an array of fixed size arch structs turn the init functions
into new functions indexed by the ELF machine they correspond to.

This allows data to be stored with a struct arch with the container_of
trick, so the priv variable can be removed.

Switch to using the thread to find the arch rather than the evsel as the
evsel only has limited notions of the running thread upon which
disassembly is performed.

Factor out the e_machine and e_flags into their own struct to make them
easier to pass around.

Reviewed-by: James Clark <james.clark@linaro.org>
Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Aditya Bodkhe <aditya.b1@linux.ibm.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Alexandre Ghiti <alex@ghiti.fr>
Cc: Athira Rajeev <atrajeev@linux.ibm.com>
Cc: Bill Wendling <morbo@google.com>
Cc: Dr. David Alan Gilbert <linux@treblig.org>
Cc: Guo Ren <guoren@kernel.org>
Cc: Howard Chu <howardchu95@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Julia Lawall <Julia.Lawall@inria.fr>
Cc: Justin Stitt <justinstitt@google.com>
Cc: Krzysztof Łopatowski <krzysztof.m.lopatowski@gmail.com>
Cc: Leo Yan <leo.yan@linux.dev>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Nathan Chancellor <nathan@kernel.org>
Cc: Nick Desaulniers <nick.desaulniers+lkml@gmail.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <pjw@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sergei Trofimovich <slyich@gmail.com>
Cc: Shimin Guo <shimin.guo@skydio.com>
Cc: Suchit Karunakaran <suchitkarunakaran@gmail.com>
Cc: Thomas Falcon <thomas.falcon@intel.com>
Cc: Tianyou Li <tianyou.li@intel.com>
Cc: Will Deacon <will@kernel.org>
Cc: Zecheng Li <zecheng@google.com>
[ Include elf.h for EM_CSKY and friends and also conditionally define EM_CSKY_ABIMASK for old distros ]
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/ui/browsers/annotate.c                  |   2 +-
 tools/perf/util/annotate-arch/annotate-arc.c       |  14 +-
 tools/perf/util/annotate-arch/annotate-arm.c       |  39 ++---
 tools/perf/util/annotate-arch/annotate-arm64.c     |  39 ++---
 tools/perf/util/annotate-arch/annotate-csky.c      |  14 +-
 tools/perf/util/annotate-arch/annotate-loongarch.c |  19 ++-
 tools/perf/util/annotate-arch/annotate-mips.c      |  19 ++-
 tools/perf/util/annotate-arch/annotate-powerpc.c   |  24 ++--
 tools/perf/util/annotate-arch/annotate-riscv64.c   |  19 ++-
 tools/perf/util/annotate-arch/annotate-s390.c      |  29 ++--
 tools/perf/util/annotate-arch/annotate-sparc.c     |  19 ++-
 tools/perf/util/annotate-arch/annotate-x86.c       |  24 ++--
 tools/perf/util/annotate.c                         |  46 +++---
 tools/perf/util/annotate.h                         |   2 +-
 tools/perf/util/disasm.c                           | 157 ++++++++++-----------
 tools/perf/util/disasm.h                           |  59 ++++----
 16 files changed, 283 insertions(+), 242 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/ui/browsers/annotate.c b/tools/perf/ui/browsers/annotate.c
index 91ded9c271ee..ea17e6d29a7e 100644
--- a/tools/perf/ui/browsers/annotate.c
+++ b/tools/perf/ui/browsers/annotate.c
@@ -1198,7 +1198,7 @@ int __hist_entry__tui_annotate(struct hist_entry *he, struct map_symbol *ms,
 				ui__warning("Annotation has no source code.");
 		}
 	} else {
-		err = evsel__get_arch(evsel, &browser.arch);
+		err = thread__get_arch(ms->thread, &browser.arch);
 		if (err) {
 			annotate_browser__symbol_annotate_error(&browser, err);
 			return -1;
diff --git a/tools/perf/util/annotate-arch/annotate-arc.c b/tools/perf/util/annotate-arch/annotate-arc.c
index d7ca08ca5600..170103e383a4 100644
--- a/tools/perf/util/annotate-arch/annotate-arc.c
+++ b/tools/perf/util/annotate-arch/annotate-arc.c
@@ -1,10 +1,18 @@
 // SPDX-License-Identifier: GPL-2.0
 #include <linux/compiler.h>
+#include <linux/zalloc.h>
 #include "../disasm.h"
 
-int arc__annotate_init(struct arch *arch, char *cpuid __maybe_unused)
+const struct arch *arch__new_arc(const struct e_machine_and_e_flags *id,
+				 const char *cpuid __maybe_unused)
 {
-	arch->initialized = true;
+	struct arch *arch = zalloc(sizeof(*arch));
+
+	if (!arch)
+		return NULL;
+
+	arch->name = "arc";
+	arch->id = *id;
 	arch->objdump.comment_char = ';';
-	return 0;
+	return arch;
 }
diff --git a/tools/perf/util/annotate-arch/annotate-arm.c b/tools/perf/util/annotate-arch/annotate-arm.c
index 08c49067c3c9..afb413c80156 100644
--- a/tools/perf/util/annotate-arch/annotate-arm.c
+++ b/tools/perf/util/annotate-arch/annotate-arm.c
@@ -7,14 +7,15 @@
 #include "../annotate.h"
 #include "../disasm.h"
 
-struct arm_annotate {
-	regex_t call_insn,
-		jump_insn;
+struct arch_arm {
+	struct arch arch;
+	regex_t call_insn;
+	regex_t jump_insn;
 };
 
 static const struct ins_ops *arm__associate_instruction_ops(struct arch *arch, const char *name)
 {
-	struct arm_annotate *arm = arch->priv;
+	struct arch_arm *arm = container_of(arch, struct arch_arm, arch);
 	const struct ins_ops *ops;
 	regmatch_t match[2];
 
@@ -29,37 +30,39 @@ static const struct ins_ops *arm__associate_instruction_ops(struct arch *arch, c
 	return ops;
 }
 
-int arm__annotate_init(struct arch *arch, char *cpuid __maybe_unused)
+const struct arch *arch__new_arm(const struct e_machine_and_e_flags *id,
+				 const char *cpuid __maybe_unused)
 {
-	struct arm_annotate *arm;
 	int err;
+	struct arch_arm *arm = zalloc(sizeof(*arm));
+	struct arch *arch;
 
-	if (arch->initialized)
-		return 0;
-
-	arm = zalloc(sizeof(*arm));
 	if (!arm)
-		return ENOMEM;
+		return NULL;
+
+	arch = &arm->arch;
+	arch->name = "arm";
+	arch->id = *id;
+	arch->objdump.comment_char	  = ';';
+	arch->objdump.skip_functions_char = '+';
+	arch->associate_instruction_ops   = arm__associate_instruction_ops;
 
 #define ARM_CONDS "(cc|cs|eq|ge|gt|hi|le|ls|lt|mi|ne|pl|vc|vs)"
 	err = regcomp(&arm->call_insn, "^blx?" ARM_CONDS "?$", REG_EXTENDED);
 	if (err)
 		goto out_free_arm;
+
 	err = regcomp(&arm->jump_insn, "^bx?" ARM_CONDS "?$", REG_EXTENDED);
 	if (err)
 		goto out_free_call;
 #undef ARM_CONDS
 
-	arch->initialized = true;
-	arch->priv	  = arm;
-	arch->associate_instruction_ops   = arm__associate_instruction_ops;
-	arch->objdump.comment_char	  = ';';
-	arch->objdump.skip_functions_char = '+';
-	return 0;
+	return arch;
 
 out_free_call:
 	regfree(&arm->call_insn);
 out_free_arm:
 	free(arm);
-	return SYMBOL_ANNOTATE_ERRNO__ARCH_INIT_REGEXP;
+	errno = SYMBOL_ANNOTATE_ERRNO__ARCH_INIT_REGEXP;
+	return NULL;
 }
diff --git a/tools/perf/util/annotate-arch/annotate-arm64.c b/tools/perf/util/annotate-arch/annotate-arm64.c
index d2ea32984b0d..33080fdca125 100644
--- a/tools/perf/util/annotate-arch/annotate-arm64.c
+++ b/tools/perf/util/annotate-arch/annotate-arm64.c
@@ -8,9 +8,10 @@
 #include "../annotate.h"
 #include "../disasm.h"
 
-struct arm64_annotate {
-	regex_t call_insn,
-		jump_insn;
+struct arch_arm64 {
+	struct arch arch;
+	regex_t call_insn;
+	regex_t jump_insn;
 };
 
 static int arm64_mov__parse(const struct arch *arch __maybe_unused,
@@ -70,7 +71,7 @@ static const struct ins_ops arm64_mov_ops = {
 
 static const struct ins_ops *arm64__associate_instruction_ops(struct arch *arch, const char *name)
 {
-	struct arm64_annotate *arm = arch->priv;
+	struct arch_arm64 *arm = container_of(arch, struct arch_arm64, arch);
 	const struct ins_ops *ops;
 	regmatch_t match[2];
 
@@ -87,38 +88,40 @@ static const struct ins_ops *arm64__associate_instruction_ops(struct arch *arch,
 	return ops;
 }
 
-int arm64__annotate_init(struct arch *arch, char *cpuid __maybe_unused)
+const struct arch *arch__new_arm64(const struct e_machine_and_e_flags *id,
+				   const char *cpuid __maybe_unused)
 {
-	struct arm64_annotate *arm;
 	int err;
+	struct arch_arm64 *arm = zalloc(sizeof(*arm));
+	struct arch *arch;
 
-	if (arch->initialized)
-		return 0;
-
-	arm = zalloc(sizeof(*arm));
 	if (!arm)
-		return ENOMEM;
+		return NULL;
+
+	arch = &arm->arch;
+	arch->name = "arm64";
+	arch->id = *id;
+	arch->objdump.comment_char	  = '/';
+	arch->objdump.skip_functions_char = '+';
+	arch->associate_instruction_ops   = arm64__associate_instruction_ops;
 
 	/* bl, blr */
 	err = regcomp(&arm->call_insn, "^blr?$", REG_EXTENDED);
 	if (err)
 		goto out_free_arm;
+
 	/* b, b.cond, br, cbz/cbnz, tbz/tbnz */
 	err = regcomp(&arm->jump_insn, "^[ct]?br?\\.?(cc|cs|eq|ge|gt|hi|hs|le|lo|ls|lt|mi|ne|pl|vc|vs)?n?z?$",
 		      REG_EXTENDED);
 	if (err)
 		goto out_free_call;
 
-	arch->initialized = true;
-	arch->priv	  = arm;
-	arch->associate_instruction_ops   = arm64__associate_instruction_ops;
-	arch->objdump.comment_char	  = '/';
-	arch->objdump.skip_functions_char = '+';
-	return 0;
+	return arch;
 
 out_free_call:
 	regfree(&arm->call_insn);
 out_free_arm:
 	free(arm);
-	return SYMBOL_ANNOTATE_ERRNO__ARCH_INIT_REGEXP;
+	errno = SYMBOL_ANNOTATE_ERRNO__ARCH_INIT_REGEXP;
+	return NULL;
 }
diff --git a/tools/perf/util/annotate-arch/annotate-csky.c b/tools/perf/util/annotate-arch/annotate-csky.c
index 0b0b09b068ec..d2b18e4ea2c9 100644
--- a/tools/perf/util/annotate-arch/annotate-csky.c
+++ b/tools/perf/util/annotate-arch/annotate-csky.c
@@ -2,6 +2,7 @@
 // Copyright (C) 2019 Hangzhou C-SKY Microsystems co.,ltd.
 #include <string.h>
 #include <linux/compiler.h>
+#include <linux/zalloc.h>
 #include "../disasm.h"
 
 static const struct ins_ops *csky__associate_ins_ops(struct arch *arch,
@@ -39,10 +40,17 @@ static const struct ins_ops *csky__associate_ins_ops(struct arch *arch,
 	return ops;
 }
 
-int csky__annotate_init(struct arch *arch, char *cpuid __maybe_unused)
+const struct arch *arch__new_csky(const struct e_machine_and_e_flags *id,
+				  const char *cpuid __maybe_unused)
 {
-	arch->initialized = true;
+	struct arch *arch = zalloc(sizeof(*arch));
+
+	if (!arch)
+		return NULL;
+
+	arch->name = "csky";
+	arch->id = *id;
 	arch->objdump.comment_char = '/';
 	arch->associate_instruction_ops = csky__associate_ins_ops;
-	return 0;
+	return arch;
 }
diff --git a/tools/perf/util/annotate-arch/annotate-loongarch.c b/tools/perf/util/annotate-arch/annotate-loongarch.c
index 6c94cb98a104..3aeab453a059 100644
--- a/tools/perf/util/annotate-arch/annotate-loongarch.c
+++ b/tools/perf/util/annotate-arch/annotate-loongarch.c
@@ -7,6 +7,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include <linux/compiler.h>
+#include <linux/zalloc.h>
 #include "../disasm.h"
 #include "../map.h"
 #include "../maps.h"
@@ -139,13 +140,17 @@ const struct ins_ops *loongarch__associate_ins_ops(struct arch *arch, const char
 	return ops;
 }
 
-int loongarch__annotate_init(struct arch *arch, char *cpuid __maybe_unused)
+const struct arch *arch__new_loongarch(const struct e_machine_and_e_flags *id,
+				       const char *cpuid __maybe_unused)
 {
-	if (!arch->initialized) {
-		arch->associate_instruction_ops = loongarch__associate_ins_ops;
-		arch->initialized = true;
-		arch->objdump.comment_char = '#';
-	}
+	struct arch *arch = zalloc(sizeof(*arch));
 
-	return 0;
+	if (!arch)
+		return NULL;
+
+	arch->name = "loongarch";
+	arch->id = *id;
+	arch->associate_instruction_ops = loongarch__associate_ins_ops;
+	arch->objdump.comment_char = '#';
+	return arch;
 }
diff --git a/tools/perf/util/annotate-arch/annotate-mips.c b/tools/perf/util/annotate-arch/annotate-mips.c
index f14b34ed77d3..e8d1c6c7e9f3 100644
--- a/tools/perf/util/annotate-arch/annotate-mips.c
+++ b/tools/perf/util/annotate-arch/annotate-mips.c
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 #include <string.h>
 #include <linux/compiler.h>
+#include <linux/zalloc.h>
 #include "../disasm.h"
 
 static
@@ -36,13 +37,17 @@ const struct ins_ops *mips__associate_ins_ops(struct arch *arch, const char *nam
 	return ops;
 }
 
-int mips__annotate_init(struct arch *arch, char *cpuid __maybe_unused)
+const struct arch *arch__new_mips(const struct e_machine_and_e_flags *id,
+				  const char *cpuid __maybe_unused)
 {
-	if (!arch->initialized) {
-		arch->associate_instruction_ops = mips__associate_ins_ops;
-		arch->initialized = true;
-		arch->objdump.comment_char = '#';
-	}
+	struct arch *arch = zalloc(sizeof(*arch));
 
-	return 0;
+	if (!arch)
+		return NULL;
+
+	arch->name = "mips";
+	arch->id = *id;
+	arch->objdump.comment_char = '#';
+	arch->associate_instruction_ops = mips__associate_ins_ops;
+	return arch;
 }
diff --git a/tools/perf/util/annotate-arch/annotate-powerpc.c b/tools/perf/util/annotate-arch/annotate-powerpc.c
index 593c138c8104..218207b52581 100644
--- a/tools/perf/util/annotate-arch/annotate-powerpc.c
+++ b/tools/perf/util/annotate-arch/annotate-powerpc.c
@@ -390,17 +390,21 @@ static void update_insn_state_powerpc(struct type_state *state,
 }
 #endif /* HAVE_LIBDW_SUPPORT */
 
-int powerpc__annotate_init(struct arch *arch, char *cpuid __maybe_unused)
+const struct arch *arch__new_powerpc(const struct e_machine_and_e_flags *id,
+				     const char *cpuid __maybe_unused)
 {
-	if (!arch->initialized) {
-		arch->initialized = true;
-		arch->associate_instruction_ops = powerpc__associate_instruction_ops;
-		arch->objdump.comment_char      = '#';
-		annotate_opts.show_asm_raw = true;
+	struct arch *arch = zalloc(sizeof(*arch));
+
+	if (!arch)
+		return NULL;
+
+	arch->name = "powerpc";
+	arch->id = *id;
+	arch->objdump.comment_char = '#';
+	annotate_opts.show_asm_raw = true;
+	arch->associate_instruction_ops = powerpc__associate_instruction_ops;
 #ifdef HAVE_LIBDW_SUPPORT
-		arch->update_insn_state = update_insn_state_powerpc;
+	arch->update_insn_state = update_insn_state_powerpc;
 #endif
-	}
-
-	return 0;
+	return arch;
 }
diff --git a/tools/perf/util/annotate-arch/annotate-riscv64.c b/tools/perf/util/annotate-arch/annotate-riscv64.c
index 15526824037a..29a988fca8c9 100644
--- a/tools/perf/util/annotate-arch/annotate-riscv64.c
+++ b/tools/perf/util/annotate-arch/annotate-riscv64.c
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 #include <string.h>
 #include <linux/compiler.h>
+#include <linux/zalloc.h>
 #include "../disasm.h"
 
 static
@@ -24,13 +25,17 @@ const struct ins_ops *riscv64__associate_ins_ops(struct arch *arch, const char *
 	return ops;
 }
 
-int riscv64__annotate_init(struct arch *arch, char *cpuid __maybe_unused)
+const struct arch *arch__new_riscv64(const struct e_machine_and_e_flags *id,
+				     const char *cpuid __maybe_unused)
 {
-	if (!arch->initialized) {
-		arch->associate_instruction_ops = riscv64__associate_ins_ops;
-		arch->initialized = true;
-		arch->objdump.comment_char = '#';
-	}
+	struct arch *arch = zalloc(sizeof(*arch));
 
-	return 0;
+	if (!arch)
+		return NULL;
+
+	arch->name = "riscv";
+	arch->id = *id;
+	arch->objdump.comment_char = '#';
+	arch->associate_instruction_ops = riscv64__associate_ins_ops;
+	return arch;
 }
diff --git a/tools/perf/util/annotate-arch/annotate-s390.c b/tools/perf/util/annotate-arch/annotate-s390.c
index 47573f0310c1..af9cabd0a586 100644
--- a/tools/perf/util/annotate-arch/annotate-s390.c
+++ b/tools/perf/util/annotate-arch/annotate-s390.c
@@ -148,7 +148,7 @@ static const struct ins_ops *s390__associate_ins_ops(struct arch *arch, const ch
 	return ops;
 }
 
-static int s390__cpuid_parse(struct arch *arch, char *cpuid)
+static int s390__cpuid_parse(struct arch *arch, const char *cpuid)
 {
 	unsigned int family;
 	char model[16], model_c[16], cpumf_v[16], cpumf_a[16];
@@ -169,19 +169,22 @@ static int s390__cpuid_parse(struct arch *arch, char *cpuid)
 	return -1;
 }
 
-int s390__annotate_init(struct arch *arch, char *cpuid __maybe_unused)
+const struct arch *arch__new_s390(const struct e_machine_and_e_flags *id, const char *cpuid)
 {
-	int err = 0;
-
-	if (!arch->initialized) {
-		arch->initialized = true;
-		arch->associate_instruction_ops = s390__associate_ins_ops;
-		if (cpuid) {
-			if (s390__cpuid_parse(arch, cpuid))
-				err = SYMBOL_ANNOTATE_ERRNO__ARCH_INIT_CPUID_PARSING;
+	struct arch *arch = zalloc(sizeof(*arch));
+
+	if (!arch)
+		return NULL;
+
+	arch->name = "s390";
+	arch->id = *id;
+	arch->associate_instruction_ops = s390__associate_ins_ops;
+	if (cpuid) {
+		if (s390__cpuid_parse(arch, cpuid)) {
+			errno = SYMBOL_ANNOTATE_ERRNO__ARCH_INIT_CPUID_PARSING;
+			return NULL;
 		}
-		arch->objdump.comment_char = '#';
 	}
-
-	return err;
+	arch->objdump.comment_char = '#';
+	return arch;
 }
diff --git a/tools/perf/util/annotate-arch/annotate-sparc.c b/tools/perf/util/annotate-arch/annotate-sparc.c
index 66a0174376dd..2f07bc7a56dd 100644
--- a/tools/perf/util/annotate-arch/annotate-sparc.c
+++ b/tools/perf/util/annotate-arch/annotate-sparc.c
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 #include <string.h>
 #include <linux/compiler.h>
+#include <linux/zalloc.h>
 #include "../../util/disasm.h"
 
 static int is_branch_cond(const char *cond)
@@ -160,13 +161,17 @@ static const struct ins_ops *sparc__associate_instruction_ops(struct arch *arch,
 	return ops;
 }
 
-int sparc__annotate_init(struct arch *arch, char *cpuid __maybe_unused)
+const struct arch *arch__new_sparc(const struct e_machine_and_e_flags *id,
+				   const char *cpuid __maybe_unused)
 {
-	if (!arch->initialized) {
-		arch->initialized = true;
-		arch->associate_instruction_ops = sparc__associate_instruction_ops;
-		arch->objdump.comment_char = '#';
-	}
+	struct arch *arch = zalloc(sizeof(*arch));
 
-	return 0;
+	if (!arch)
+		return NULL;
+
+	arch->name = "sparc";
+	arch->id = *id;
+	arch->associate_instruction_ops = sparc__associate_instruction_ops;
+	arch->objdump.comment_char = '#';
+	return arch;
 }
diff --git a/tools/perf/util/annotate-arch/annotate-x86.c b/tools/perf/util/annotate-arch/annotate-x86.c
index 0c7957fe60da..eb9a649ca656 100644
--- a/tools/perf/util/annotate-arch/annotate-x86.c
+++ b/tools/perf/util/annotate-arch/annotate-x86.c
@@ -182,7 +182,7 @@ static bool intel__ins_is_fused(const struct arch *arch, const char *ins1,
 	return false;
 }
 
-static int x86__cpuid_parse(struct arch *arch, char *cpuid)
+static int x86__cpuid_parse(struct arch *arch, const char *cpuid)
 {
 	unsigned int family, model, stepping;
 	int ret;
@@ -777,18 +777,21 @@ retry:
 }
 #endif
 
-int x86__annotate_init(struct arch *arch, char *cpuid)
+const struct arch *arch__new_x86(const struct e_machine_and_e_flags *id, const char *cpuid)
 {
-	int err = 0;
+	struct arch *arch = zalloc(sizeof(*arch));
 
-	if (arch->initialized)
-		return 0;
+	if (!arch)
+		return NULL;
 
+	arch->name = "x86";
+	arch->id = *id;
 	if (cpuid) {
-		if (x86__cpuid_parse(arch, cpuid))
-			err = SYMBOL_ANNOTATE_ERRNO__ARCH_INIT_CPUID_PARSING;
+		if (x86__cpuid_parse(arch, cpuid)) {
+			errno = SYMBOL_ANNOTATE_ERRNO__ARCH_INIT_CPUID_PARSING;
+			return NULL;
+		}
 	}
-
 	arch->instructions = x86__instructions;
 	arch->nr_instructions = ARRAY_SIZE(x86__instructions);
 #ifndef NDEBUG
@@ -810,11 +813,8 @@ int x86__annotate_init(struct arch *arch, char *cpuid)
 	arch->objdump.memory_ref_char = '(';
 	arch->objdump.imm_char = '$';
 	arch->insn_suffix = "bwlq";
-	arch->e_machine = EM_X86_64;
-	arch->e_flags = 0;
-	arch->initialized = true;
 #ifdef HAVE_LIBDW_SUPPORT
 	arch->update_insn_state = update_insn_state_x86;
 #endif
-	return err;
+	return arch;
 }
diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c
index 79702072568b..c16c6dfaa959 100644
--- a/tools/perf/util/annotate.c
+++ b/tools/perf/util/annotate.c
@@ -980,32 +980,27 @@ void symbol__calc_percent(struct symbol *sym, struct evsel *evsel)
 	annotation__calc_percent(notes, evsel, symbol__size(sym));
 }
 
-int evsel__get_arch(struct evsel *evsel, const struct arch **parch)
+int thread__get_arch(struct thread *thread, const struct arch **parch)
 {
-	struct perf_env *env = evsel__env(evsel);
-	const char *arch_name = perf_env__arch(env);
 	const struct arch *arch;
-	int err;
+	struct machine *machine;
+	uint16_t e_machine;
 
-	if (!arch_name) {
+	if (!thread) {
 		*parch = NULL;
-		return errno;
+		return -1;
 	}
 
-	*parch = arch = arch__find(arch_name);
+	machine = maps__machine(thread__maps(thread));
+	e_machine = thread__e_machine(thread, machine);
+	arch = arch__find(e_machine, machine->env ? machine->env->cpuid : NULL);
 	if (arch == NULL) {
-		pr_err("%s: unsupported arch %s\n", __func__, arch_name);
-		return ENOTSUP;
+		pr_err("%s: unsupported arch %d\n", __func__, e_machine);
+		return errno;
 	}
+	if (parch)
+		*parch = arch;
 
-	if (arch->init) {
-		err = arch->init((struct arch *)arch, env ? env->cpuid : NULL);
-		if (err) {
-			pr_err("%s: failed to initialize %s arch priv area\n",
-			       __func__, arch->name);
-			return err;
-		}
-	}
 	return 0;
 }
 
@@ -1020,7 +1015,7 @@ int symbol__annotate(struct map_symbol *ms, struct evsel *evsel,
 	const struct arch *arch = NULL;
 	int err, nr;
 
-	err = evsel__get_arch(evsel, &arch);
+	err = thread__get_arch(ms->thread, &arch);
 	if (err)
 		return err;
 
@@ -1268,7 +1263,7 @@ int hist_entry__annotate_printf(struct hist_entry *he, struct evsel *evsel)
 
 	apd.addr_fmt_width = annotated_source__addr_fmt_width(&notes->src->source,
 							      notes->src->start);
-	evsel__get_arch(evsel, &apd.arch);
+	thread__get_arch(ms->thread, &apd.arch);
 	apd.dbg = dso__debuginfo(dso);
 
 	list_for_each_entry(pos, &notes->src->source, node) {
@@ -1373,7 +1368,7 @@ static int symbol__annotate_fprintf2(struct symbol *sym, FILE *fp,
 	struct annotation_line *al;
 
 	if (annotate_opts.code_with_type) {
-		evsel__get_arch(apd->evsel, &apd->arch);
+		thread__get_arch(apd->he->ms.thread, &apd->arch);
 		apd->dbg = dso__debuginfo(map__dso(apd->he->ms.map));
 	}
 
@@ -2495,7 +2490,7 @@ static int extract_reg_offset(const struct arch *arch, const char *str,
 	if (regname == NULL)
 		return -1;
 
-	op_loc->reg1 = get_dwarf_regnum(regname, arch->e_machine, arch->e_flags);
+	op_loc->reg1 = get_dwarf_regnum(regname, arch->id.e_machine, arch->id.e_flags);
 	free(regname);
 
 	/* Get the second register */
@@ -2508,7 +2503,7 @@ static int extract_reg_offset(const struct arch *arch, const char *str,
 		if (regname == NULL)
 			return -1;
 
-		op_loc->reg2 = get_dwarf_regnum(regname, arch->e_machine, arch->e_flags);
+		op_loc->reg2 = get_dwarf_regnum(regname, arch->id.e_machine, arch->id.e_flags);
 		free(regname);
 	}
 	return 0;
@@ -2607,8 +2602,11 @@ int annotate_get_insn_location(const struct arch *arch, struct disasm_line *dl,
 			if (s == NULL)
 				return -1;
 
-			if (*s == arch->objdump.register_char)
-				op_loc->reg1 = get_dwarf_regnum(s, arch->e_machine, arch->e_flags);
+			if (*s == arch->objdump.register_char) {
+				op_loc->reg1 = get_dwarf_regnum(s,
+								arch->id.e_machine,
+								arch->id.e_flags);
+			}
 			else if (*s == arch->objdump.imm_char) {
 				op_loc->offset = strtol(s + 1, &p, 0);
 				if (p && p != s + 1)
diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h
index 58eaf4b2fa65..696e36dbf013 100644
--- a/tools/perf/util/annotate.h
+++ b/tools/perf/util/annotate.h
@@ -586,5 +586,5 @@ int annotation_br_cntr_entry(char **str, int br_cntr_nr, u64 *br_cntr,
 			     int num_aggr, struct evsel *evsel);
 int annotation_br_cntr_abbr_list(char **str, struct evsel *evsel, bool header);
 
-int evsel__get_arch(struct evsel *evsel, const struct arch **parch);
+int thread__get_arch(struct thread *thread, const struct arch **parch);
 #endif	/* __PERF_ANNOTATE_H */
diff --git a/tools/perf/util/disasm.c b/tools/perf/util/disasm.c
index d81469db0aac..4f60726247d6 100644
--- a/tools/perf/util/disasm.c
+++ b/tools/perf/util/disasm.c
@@ -1,5 +1,9 @@
 // SPDX-License-Identifier: GPL-2.0-only
 #include <ctype.h>
+#include <elf.h>
+#ifndef EF_CSKY_ABIMASK
+#define EF_CSKY_ABIMASK	0XF0000000
+#endif
 #include <errno.h>
 #include <fcntl.h>
 #include <inttypes.h>
@@ -102,112 +106,101 @@ int arch__associate_ins_ops(struct arch *arch, const char *name, const struct in
 	return 0;
 }
 
-static struct arch architectures[] = {
-	{
-		.name = "arc",
-		.init = arc__annotate_init,
-		.e_machine = EM_ARC,
-	},
-	{
-		.name = "arm",
-		.init = arm__annotate_init,
-		.e_machine = EM_ARM,
-	},
-	{
-		.name = "arm64",
-		.init = arm64__annotate_init,
-		.e_machine = EM_AARCH64,
-	},
-	{
-		.name = "csky",
-		.init = csky__annotate_init,
-		.e_machine = EM_CSKY,
-#if defined(__CSKYABIV2__)
-		.e_flags = EF_CSKY_ABIV2,
-#else
-		.e_flags = EF_CSKY_ABIV1,
-#endif
-	},
-	{
-		.name = "mips",
-		.init = mips__annotate_init,
-		.e_machine = EM_MIPS,
-	},
-	{
-		.name = "x86",
-		.init = x86__annotate_init,
-		.e_machine = EM_X86_64, // TODO: EM_386 too.
-	},
-	{
-		.name = "powerpc",
-		.init = powerpc__annotate_init,
-		.e_machine = EM_PPC, // TODO: EM_PPC64 too.
-	},
-	{
-		.name = "riscv64",
-		.init = riscv64__annotate_init,
-		.e_machine = EM_RISCV,
-	},
-	{
-		.name = "s390",
-		.init = s390__annotate_init,
-		.e_machine = EM_S390,
-	},
-	{
-		.name = "sparc",
-		.init = sparc__annotate_init,
-		.e_machine = EM_SPARC,
-	},
-	{
-		.name = "loongarch",
-		.init = loongarch__annotate_init,
-		.e_machine = EM_LOONGARCH,
-	},
-};
+static int e_machine_and_eflags__cmp(const struct e_machine_and_e_flags *val1,
+				     const struct e_machine_and_e_flags *val2)
+{
+	if (val1->e_machine == val2->e_machine) {
+		if (val1->e_machine != EM_CSKY)
+			return 0;
+		if ((val1->e_flags & EF_CSKY_ABIMASK) < (val2->e_flags & EF_CSKY_ABIMASK))
+			return -1;
+		return (val1->e_flags & EF_CSKY_ABIMASK) > (val2->e_flags & EF_CSKY_ABIMASK);
+	}
+	return val1->e_machine < val2->e_machine ? -1 : 1;
+}
 
-static int arch__key_cmp(const void *name, const void *archp)
+static int arch__key_cmp(const void *key, const void *archp)
 {
-	const struct arch *arch = archp;
+	const struct arch *const *arch = archp;
 
-	return strcmp(name, arch->name);
+	return e_machine_and_eflags__cmp(key, &(*arch)->id);
 }
 
 static int arch__cmp(const void *a, const void *b)
 {
-	const struct arch *aa = a;
-	const struct arch *ab = b;
+	const struct arch *const *aa = a;
+	const struct arch *const *ab = b;
 
-	return strcmp(aa->name, ab->name);
+	return e_machine_and_eflags__cmp(&(*aa)->id, &(*ab)->id);
 }
 
-static void arch__sort(void)
+const struct arch *arch__find(uint16_t e_machine, const char *cpuid)
 {
-	const int nmemb = ARRAY_SIZE(architectures);
+	static const struct arch *(*const arch_new_fn[])(const struct e_machine_and_e_flags *id,
+							 const char *cpuid) = {
+		[EM_386]	= arch__new_x86,
+		[EM_ARC]	= arch__new_arc,
+		[EM_ARM]	= arch__new_arm,
+		[EM_AARCH64]	= arch__new_arm64,
+		[EM_CSKY]	= arch__new_csky,
+		[EM_LOONGARCH]	= arch__new_loongarch,
+		[EM_MIPS]	= arch__new_mips,
+		[EM_PPC64]	= arch__new_powerpc,
+		[EM_PPC]	= arch__new_powerpc,
+		[EM_RISCV]	= arch__new_riscv64,
+		[EM_S390]	= arch__new_s390,
+		[EM_SPARC]	= arch__new_sparc,
+		[EM_SPARCV9]	= arch__new_sparc,
+		[EM_X86_64]	= arch__new_x86,
+	};
+	static const struct arch **archs;
+	static size_t num_archs;
+	struct e_machine_and_e_flags key = {
+		.e_machine = e_machine,
+		// TODO: e_flags should really come from the same source as e_machine.
+		.e_flags = EF_HOST,
+	};
+	const struct arch *result = NULL, **tmp;
 
-	qsort(architectures, nmemb, sizeof(struct arch), arch__cmp);
-}
+	if (num_archs > 0) {
+		tmp = bsearch(&key, archs, num_archs, sizeof(*archs), arch__key_cmp);
+		if (tmp)
+			result = *tmp;
+	}
 
-const struct arch *arch__find(const char *name)
-{
-	const int nmemb = ARRAY_SIZE(architectures);
-	static bool sorted;
+	if (result)
+		return result;
 
-	if (!sorted) {
-		arch__sort();
-		sorted = true;
+	if (e_machine >= ARRAY_SIZE(arch_new_fn) || arch_new_fn[e_machine] == NULL) {
+		errno = ENOTSUP;
+		return NULL;
 	}
 
-	return bsearch(name, architectures, nmemb, sizeof(struct arch), arch__key_cmp);
+	tmp = reallocarray(archs, num_archs + 1, sizeof(*archs));
+	if (!tmp)
+		return NULL;
+
+	result = arch_new_fn[e_machine](&key, cpuid);
+	if (!result) {
+		pr_err("%s: failed to initialize %s (%u) arch priv area\n",
+			__func__, result->name, e_machine);
+		free(tmp);
+		return NULL;
+	}
+	archs = tmp;
+	archs[num_archs++] = result;
+	qsort(archs, num_archs, sizeof(*archs), arch__cmp);
+	return result;
 }
 
 bool arch__is_x86(const struct arch *arch)
 {
-	return arch->e_machine == EM_386 || arch->e_machine == EM_X86_64;
+	return arch->id.e_machine == EM_386 || arch->id.e_machine == EM_X86_64;
 }
 
 bool arch__is_powerpc(const struct arch *arch)
 {
-	return arch->e_machine == EM_PPC || arch->e_machine == EM_PPC64;
+	return arch->id.e_machine == EM_PPC || arch->id.e_machine == EM_PPC64;
 }
 
 static void ins_ops__delete(struct ins_operands *ops)
diff --git a/tools/perf/util/disasm.h b/tools/perf/util/disasm.h
index b6a2a30fdf27..2793d48aa04e 100644
--- a/tools/perf/util/disasm.h
+++ b/tools/perf/util/disasm.h
@@ -17,21 +17,23 @@ struct data_loc_info;
 struct type_state;
 struct disasm_line;
 
+struct e_machine_and_e_flags {
+	uint16_t e_machine;
+	uint32_t e_flags;
+};
+
 struct arch {
-	const char	*name;
+	/** @id: ELF machine and flags associated with arch. */
+	struct e_machine_and_e_flags id;
+	/** @name: name such as "x86" or "powerpc". */
+	const char		*name;
 	const struct ins	*instructions;
-	size_t		nr_instructions;
-	size_t		nr_instructions_allocated;
-	const struct ins_ops  *(*associate_instruction_ops)(struct arch *arch, const char *name);
-	bool		sorted_instructions;
-	bool		initialized;
-	const char	*insn_suffix;
-	void		*priv;
-	unsigned int	model;
-	unsigned int	family;
-	int		(*init)(struct arch *arch, char *cpuid);
-	bool		(*ins_is_fused)(const struct arch *arch, const char *ins1,
-					const char *ins2);
+	size_t			nr_instructions;
+	size_t			nr_instructions_allocated;
+	bool			sorted_instructions;
+	const char		*insn_suffix;
+	unsigned int		model;
+	unsigned int		family;
 	struct		{
 		char comment_char;
 		char skip_functions_char;
@@ -39,15 +41,14 @@ struct arch {
 		char memory_ref_char;
 		char imm_char;
 	} objdump;
+	bool		(*ins_is_fused)(const struct arch *arch, const char *ins1,
+					const char *ins2);
+	const struct ins_ops  *(*associate_instruction_ops)(struct arch *arch, const char *name);
 #ifdef HAVE_LIBDW_SUPPORT
 	void		(*update_insn_state)(struct type_state *state,
 				struct data_loc_info *dloc, Dwarf_Die *cu_die,
 				struct disasm_line *dl);
 #endif
-	/** @e_machine: ELF machine associated with arch. */
-	unsigned int e_machine;
-	/** @e_flags: Optional ELF flags associated with arch. */
-	unsigned int e_flags;
 };
 
 struct ins {
@@ -107,7 +108,7 @@ struct annotate_args {
 	char			  *fileloc;
 };
 
-const struct arch *arch__find(const char *name);
+const struct arch *arch__find(uint16_t e_machine, const char *cpuid);
 bool arch__is_x86(const struct arch *arch);
 bool arch__is_powerpc(const struct arch *arch);
 
@@ -121,17 +122,17 @@ extern const struct ins_ops ret_ops;
 
 int arch__associate_ins_ops(struct arch *arch, const char *name, const struct ins_ops *ops);
 
-int arc__annotate_init(struct arch *arch, char *cpuid);
-int arm__annotate_init(struct arch *arch, char *cpuid);
-int arm64__annotate_init(struct arch *arch, char *cpuid);
-int csky__annotate_init(struct arch *arch, char *cpuid);
-int loongarch__annotate_init(struct arch *arch, char *cpuid);
-int mips__annotate_init(struct arch *arch, char *cpuid);
-int powerpc__annotate_init(struct arch *arch, char *cpuid);
-int riscv64__annotate_init(struct arch *arch, char *cpuid);
-int s390__annotate_init(struct arch *arch, char *cpuid);
-int sparc__annotate_init(struct arch *arch, char *cpuid);
-int x86__annotate_init(struct arch *arch, char *cpuid);
+const struct arch *arch__new_arc(const struct e_machine_and_e_flags *id, const char *cpuid);
+const struct arch *arch__new_arm(const struct e_machine_and_e_flags *id, const char *cpuid);
+const struct arch *arch__new_arm64(const struct e_machine_and_e_flags *id, const char *cpuid);
+const struct arch *arch__new_csky(const struct e_machine_and_e_flags *id, const char *cpuid);
+const struct arch *arch__new_loongarch(const struct e_machine_and_e_flags *id, const char *cpuid);
+const struct arch *arch__new_mips(const struct e_machine_and_e_flags *id, const char *cpuid);
+const struct arch *arch__new_powerpc(const struct e_machine_and_e_flags *id, const char *cpuid);
+const struct arch *arch__new_riscv64(const struct e_machine_and_e_flags *id, const char *cpuid);
+const struct arch *arch__new_s390(const struct e_machine_and_e_flags *id, const char *cpuid);
+const struct arch *arch__new_sparc(const struct e_machine_and_e_flags *id, const char *cpuid);
+const struct arch *arch__new_x86(const struct e_machine_and_e_flags *id, const char *cpuid);
 
 const struct ins_ops *ins__find(const struct arch *arch, const char *name, struct disasm_line *dl);
 
-- 
cgit v1.2.3


From dc329efc162ac168e2a0c83d1334608371dd525b Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Thu, 22 Jan 2026 13:35:16 -0800
Subject: perf disasm: Minor layout tweaks for 'struct arch'
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Pack some holes to bring down the overall struct size from 96 to 88
bytes.

Reviewed-by: James Clark <james.clark@linaro.org>
Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Aditya Bodkhe <aditya.b1@linux.ibm.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Alexandre Ghiti <alex@ghiti.fr>
Cc: Athira Rajeev <atrajeev@linux.ibm.com>
Cc: Bill Wendling <morbo@google.com>
Cc: Dr. David Alan Gilbert <linux@treblig.org>
Cc: Guo Ren <guoren@kernel.org>
Cc: Howard Chu <howardchu95@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Julia Lawall <Julia.Lawall@inria.fr>
Cc: Justin Stitt <justinstitt@google.com>
Cc: Krzysztof Łopatowski <krzysztof.m.lopatowski@gmail.com>
Cc: Leo Yan <leo.yan@linux.dev>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Nathan Chancellor <nathan@kernel.org>
Cc: Nick Desaulniers <nick.desaulniers+lkml@gmail.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <pjw@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sergei Trofimovich <slyich@gmail.com>
Cc: Shimin Guo <shimin.guo@skydio.com>
Cc: Suchit Karunakaran <suchitkarunakaran@gmail.com>
Cc: Thomas Falcon <thomas.falcon@intel.com>
Cc: Tianyou Li <tianyou.li@intel.com>
Cc: Will Deacon <will@kernel.org>
Cc: Zecheng Li <zecheng@google.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/disasm.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/disasm.h b/tools/perf/util/disasm.h
index 2793d48aa04e..6a1905f9d4fc 100644
--- a/tools/perf/util/disasm.h
+++ b/tools/perf/util/disasm.h
@@ -18,22 +18,22 @@ struct type_state;
 struct disasm_line;
 
 struct e_machine_and_e_flags {
-	uint16_t e_machine;
 	uint32_t e_flags;
+	uint16_t e_machine;
 };
 
 struct arch {
-	/** @id: ELF machine and flags associated with arch. */
-	struct e_machine_and_e_flags id;
 	/** @name: name such as "x86" or "powerpc". */
 	const char		*name;
 	const struct ins	*instructions;
 	size_t			nr_instructions;
 	size_t			nr_instructions_allocated;
-	bool			sorted_instructions;
 	const char		*insn_suffix;
 	unsigned int		model;
 	unsigned int		family;
+	/** @id: ELF machine and flags associated with arch. */
+	struct e_machine_and_e_flags id;
+	bool			sorted_instructions;
 	struct		{
 		char comment_char;
 		char skip_functions_char;
-- 
cgit v1.2.3


From e786a04b4a5461dd7e2829422314a5a6d5a664d9 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Thu, 22 Jan 2026 09:58:46 -0800
Subject: perf inject: With --convert-callchain ignore the dummy event for
 dwarf stacks

On hybrid systems there is generally >1 event and a dummy event.

The perf inject --convert-callchain option is failing to convert
perf.data files on such systems reporting "--convert-callchain requires
DWARF call graph."

The failing event is the dummy event that doesn't need to be set up for
samples.

As such ignore this event when checking the evsels.

Fixes: 92ea788d2af4e65a ("perf inject: Add --convert-callchain option")
Signed-off-by: Ian Rogers <irogers@google.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-inject.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c
index 2c9456614cde..5b29f4296861 100644
--- a/tools/perf/builtin-inject.c
+++ b/tools/perf/builtin-inject.c
@@ -2729,7 +2729,7 @@ int cmd_inject(int argc, const char **argv)
 		}
 
 		evlist__for_each_entry(inject.session->evlist, evsel) {
-			if (!evsel__has_dwarf_callchain(evsel)) {
+			if (!evsel__has_dwarf_callchain(evsel) && !evsel__is_dummy_event(evsel)) {
 				pr_err("--convert-callchain requires DWARF call graph.\n");
 				goto out_delete;
 			}
-- 
cgit v1.2.3


From c5e47e4d00fbc15f2390bb6ed8d9c21836363291 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Thu, 22 Jan 2026 09:53:37 -0800
Subject: perf tests sched: Avoid error in cleanup on loaded machines

The stop_noploops function will kill the noploop processes that are
running for 10 seconds.

On a loaded machine they may have already terminated meaning the kill
will return an error of no such process.

This doesn't matter and so ignore the error to avoid the test
terminating in the cleanup.

Fixes: 0e22c5ca44e68798 ("perf test: Add sched latency and script shell tests")
Signed-off-by: Ian Rogers <irogers@google.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/tests/shell/sched.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/perf/tests/shell/sched.sh b/tools/perf/tests/shell/sched.sh
index b9b81eaf856e..b9637069adb1 100755
--- a/tools/perf/tests/shell/sched.sh
+++ b/tools/perf/tests/shell/sched.sh
@@ -53,7 +53,7 @@ start_noploops() {
 }
 
 cleanup_noploops() {
-  kill "$PID1" "$PID2"
+  kill "$PID1" "$PID2" || true
 }
 
 test_sched_record() {
-- 
cgit v1.2.3


From f0d98c78f8bf73ce2a9b7793f66cda240fa9ab10 Mon Sep 17 00:00:00 2001
From: Suchit Karunakaran <suchitkarunakaran@gmail.com>
Date: Thu, 22 Jan 2026 22:47:04 +0530
Subject: perf annotate: Fix memcpy size in arch__grow_instructions()

The memcpy() in arch__grow_instructions() is copying the wrong number of
bytes when growing from a non-allocated table.

It should copy arch->nr_instructions * sizeof(struct ins) bytes, not
just arch->nr_instructions bytes.

This bug causes data corruption as only a partial copy of the
instruction table is made, leading to garbage data in most entries and
potential crashes

Fixes: 2a1ff812c40be982 ("perf annotate: Introduce alternative method of keeping instructions table")
Reviewed-by: Ian Rogers <irogers@google.com>
Signed-off-by: Suchit Karunakaran <suchitkarunakaran@gmail.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/disasm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/perf/util/disasm.c b/tools/perf/util/disasm.c
index 4f60726247d6..9b0ba1fc5aec 100644
--- a/tools/perf/util/disasm.c
+++ b/tools/perf/util/disasm.c
@@ -82,7 +82,7 @@ grow_from_non_allocated_table:
 	if (new_instructions == NULL)
 		return -1;
 
-	memcpy(new_instructions, arch->instructions, arch->nr_instructions);
+	memcpy(new_instructions, arch->instructions, arch->nr_instructions * sizeof(struct ins));
 	goto out_update_instructions;
 }
 
-- 
cgit v1.2.3


From 3d06db9bad1ad8e67c3981964cfba224c07fc306 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 20 Jan 2026 18:17:35 -0800
Subject: perf regs: Refactor use of arch__sample_reg_masks() to
 perf_reg_name()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

arch__sample_reg_masks isn't supported on ARM(32), csky, loongarch,
MIPS, RISC-V and s390.

The table returned by the function just has the name of a register
paired with the corresponding sample_regs_user mask value.

For a given perf register we can compute the name with perf_reg_name and
the mask is just 1 left-shifted by the perf register number.

Change __parse_regs to use this method for finding registers rather than
arch__sample_reg_masks, thereby adding __parse_regs support for ARM(32),
csky, loongarch, MIPS, RISC-V and s390.

As arch__sample_reg_masks is then unused, remove the now unneeded
declarations.

Signed-off-by: Ian Rogers <irogers@google.com>
Tested-by: Thomas Richter <tmricht@linux.ibm.com>
Cc: Aditya Bodkhe <aditya.b1@linux.ibm.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Alexandre Ghiti <alex@ghiti.fr>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Athira Rajeev <atrajeev@linux.ibm.com>
Cc: Chun-Tse Shao <ctshao@google.com>
Cc: Dapeng Mi <dapeng1.mi@linux.intel.com>
Cc: Dmitriy Vyukov <dvyukov@google.com>
Cc: Dr. David Alan Gilbert <linux@treblig.org>
Cc: Guo Ren <guoren@kernel.org>
Cc: Haibo Xu <haibo1.xu@intel.com>
Cc: Howard Chu <howardchu95@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Krzysztof Łopatowski <krzysztof.m.lopatowski@gmail.com>
Cc: Leo Yan <leo.yan@linux.dev>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <pjw@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sergei Trofimovich <slyich@gmail.com>
Cc: Shimin Guo <shimin.guo@skydio.com>
Cc: Stephen Brennan <stephen.s.brennan@oracle.com>
Cc: Thomas Falcon <thomas.falcon@intel.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/arch/arm/util/perf_regs.c               |   9 --
 tools/perf/arch/arm64/util/machine.c               |  14 +--
 tools/perf/arch/arm64/util/perf_regs.c             |  45 +-------
 tools/perf/arch/csky/util/perf_regs.c              |   9 --
 tools/perf/arch/loongarch/util/perf_regs.c         |   9 --
 tools/perf/arch/mips/util/perf_regs.c              |   9 --
 tools/perf/arch/powerpc/util/perf_regs.c           |  68 ------------
 tools/perf/arch/riscv/util/perf_regs.c             |   9 --
 tools/perf/arch/s390/util/perf_regs.c              |   9 --
 tools/perf/arch/x86/util/perf_regs.c               |  47 ---------
 .../perf/util/arm64-frame-pointer-unwind-support.c |   3 +-
 tools/perf/util/parse-regs-options.c               | 116 +++++++++++++--------
 tools/perf/util/perf_regs.c                        |   9 --
 tools/perf/util/perf_regs.h                        |  12 ---
 14 files changed, 81 insertions(+), 287 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/arch/arm/util/perf_regs.c b/tools/perf/arch/arm/util/perf_regs.c
index f94a0210c7b7..03a5bc0cf64c 100644
--- a/tools/perf/arch/arm/util/perf_regs.c
+++ b/tools/perf/arch/arm/util/perf_regs.c
@@ -2,10 +2,6 @@
 #include "perf_regs.h"
 #include "../../../util/perf_regs.h"
 
-static const struct sample_reg sample_reg_masks[] = {
-	SMPL_REG_END
-};
-
 uint64_t arch__intr_reg_mask(void)
 {
 	return PERF_REGS_MASK;
@@ -15,8 +11,3 @@ uint64_t arch__user_reg_mask(void)
 {
 	return PERF_REGS_MASK;
 }
-
-const struct sample_reg *arch__sample_reg_masks(void)
-{
-	return sample_reg_masks;
-}
diff --git a/tools/perf/arch/arm64/util/machine.c b/tools/perf/arch/arm64/util/machine.c
index aab1cc2bc283..80fb13c958d9 100644
--- a/tools/perf/arch/arm64/util/machine.c
+++ b/tools/perf/arch/arm64/util/machine.c
@@ -1,18 +1,12 @@
 // SPDX-License-Identifier: GPL-2.0
 
-#include <inttypes.h>
-#include <stdio.h>
-#include <string.h>
-#include "debug.h"
-#include "symbol.h"
-#include "callchain.h"
+#include "callchain.h" // prototype of arch__add_leaf_frame_record_opts
 #include "perf_regs.h"
 #include "record.h"
-#include "util/perf_regs.h"
+
+#define SMPL_REG_MASK(b) (1ULL << (b))
 
 void arch__add_leaf_frame_record_opts(struct record_opts *opts)
 {
-	const struct sample_reg *sample_reg_masks = arch__sample_reg_masks();
-
-	opts->sample_user_regs |= sample_reg_masks[PERF_REG_ARM64_LR].mask;
+	opts->sample_user_regs |= SMPL_REG_MASK(PERF_REG_ARM64_LR);
 }
diff --git a/tools/perf/arch/arm64/util/perf_regs.c b/tools/perf/arch/arm64/util/perf_regs.c
index 09308665e28a..9bb768e1bea1 100644
--- a/tools/perf/arch/arm64/util/perf_regs.c
+++ b/tools/perf/arch/arm64/util/perf_regs.c
@@ -12,48 +12,12 @@
 #include "../../../util/event.h"
 #include "../../../util/perf_regs.h"
 
+#define SMPL_REG_MASK(b) (1ULL << (b))
+
 #ifndef HWCAP_SVE
 #define HWCAP_SVE	(1 << 22)
 #endif
 
-static const struct sample_reg sample_reg_masks[] = {
-	SMPL_REG(x0, PERF_REG_ARM64_X0),
-	SMPL_REG(x1, PERF_REG_ARM64_X1),
-	SMPL_REG(x2, PERF_REG_ARM64_X2),
-	SMPL_REG(x3, PERF_REG_ARM64_X3),
-	SMPL_REG(x4, PERF_REG_ARM64_X4),
-	SMPL_REG(x5, PERF_REG_ARM64_X5),
-	SMPL_REG(x6, PERF_REG_ARM64_X6),
-	SMPL_REG(x7, PERF_REG_ARM64_X7),
-	SMPL_REG(x8, PERF_REG_ARM64_X8),
-	SMPL_REG(x9, PERF_REG_ARM64_X9),
-	SMPL_REG(x10, PERF_REG_ARM64_X10),
-	SMPL_REG(x11, PERF_REG_ARM64_X11),
-	SMPL_REG(x12, PERF_REG_ARM64_X12),
-	SMPL_REG(x13, PERF_REG_ARM64_X13),
-	SMPL_REG(x14, PERF_REG_ARM64_X14),
-	SMPL_REG(x15, PERF_REG_ARM64_X15),
-	SMPL_REG(x16, PERF_REG_ARM64_X16),
-	SMPL_REG(x17, PERF_REG_ARM64_X17),
-	SMPL_REG(x18, PERF_REG_ARM64_X18),
-	SMPL_REG(x19, PERF_REG_ARM64_X19),
-	SMPL_REG(x20, PERF_REG_ARM64_X20),
-	SMPL_REG(x21, PERF_REG_ARM64_X21),
-	SMPL_REG(x22, PERF_REG_ARM64_X22),
-	SMPL_REG(x23, PERF_REG_ARM64_X23),
-	SMPL_REG(x24, PERF_REG_ARM64_X24),
-	SMPL_REG(x25, PERF_REG_ARM64_X25),
-	SMPL_REG(x26, PERF_REG_ARM64_X26),
-	SMPL_REG(x27, PERF_REG_ARM64_X27),
-	SMPL_REG(x28, PERF_REG_ARM64_X28),
-	SMPL_REG(x29, PERF_REG_ARM64_X29),
-	SMPL_REG(lr, PERF_REG_ARM64_LR),
-	SMPL_REG(sp, PERF_REG_ARM64_SP),
-	SMPL_REG(pc, PERF_REG_ARM64_PC),
-	SMPL_REG(vg, PERF_REG_ARM64_VG),
-	SMPL_REG_END
-};
-
 /* %xNUM */
 #define SDT_OP_REGEX1  "^(x[1-2]?[0-9]|3[0-1])$"
 
@@ -175,8 +139,3 @@ uint64_t arch__user_reg_mask(void)
 	}
 	return PERF_REGS_MASK;
 }
-
-const struct sample_reg *arch__sample_reg_masks(void)
-{
-	return sample_reg_masks;
-}
diff --git a/tools/perf/arch/csky/util/perf_regs.c b/tools/perf/arch/csky/util/perf_regs.c
index 6b1665f41180..2cf7a54106e0 100644
--- a/tools/perf/arch/csky/util/perf_regs.c
+++ b/tools/perf/arch/csky/util/perf_regs.c
@@ -2,10 +2,6 @@
 #include "perf_regs.h"
 #include "../../util/perf_regs.h"
 
-static const struct sample_reg sample_reg_masks[] = {
-	SMPL_REG_END
-};
-
 uint64_t arch__intr_reg_mask(void)
 {
 	return PERF_REGS_MASK;
@@ -15,8 +11,3 @@ uint64_t arch__user_reg_mask(void)
 {
 	return PERF_REGS_MASK;
 }
-
-const struct sample_reg *arch__sample_reg_masks(void)
-{
-	return sample_reg_masks;
-}
diff --git a/tools/perf/arch/loongarch/util/perf_regs.c b/tools/perf/arch/loongarch/util/perf_regs.c
index f94a0210c7b7..03a5bc0cf64c 100644
--- a/tools/perf/arch/loongarch/util/perf_regs.c
+++ b/tools/perf/arch/loongarch/util/perf_regs.c
@@ -2,10 +2,6 @@
 #include "perf_regs.h"
 #include "../../../util/perf_regs.h"
 
-static const struct sample_reg sample_reg_masks[] = {
-	SMPL_REG_END
-};
-
 uint64_t arch__intr_reg_mask(void)
 {
 	return PERF_REGS_MASK;
@@ -15,8 +11,3 @@ uint64_t arch__user_reg_mask(void)
 {
 	return PERF_REGS_MASK;
 }
-
-const struct sample_reg *arch__sample_reg_masks(void)
-{
-	return sample_reg_masks;
-}
diff --git a/tools/perf/arch/mips/util/perf_regs.c b/tools/perf/arch/mips/util/perf_regs.c
index 6b1665f41180..2cf7a54106e0 100644
--- a/tools/perf/arch/mips/util/perf_regs.c
+++ b/tools/perf/arch/mips/util/perf_regs.c
@@ -2,10 +2,6 @@
 #include "perf_regs.h"
 #include "../../util/perf_regs.h"
 
-static const struct sample_reg sample_reg_masks[] = {
-	SMPL_REG_END
-};
-
 uint64_t arch__intr_reg_mask(void)
 {
 	return PERF_REGS_MASK;
@@ -15,8 +11,3 @@ uint64_t arch__user_reg_mask(void)
 {
 	return PERF_REGS_MASK;
 }
-
-const struct sample_reg *arch__sample_reg_masks(void)
-{
-	return sample_reg_masks;
-}
diff --git a/tools/perf/arch/powerpc/util/perf_regs.c b/tools/perf/arch/powerpc/util/perf_regs.c
index bd36cfd420a2..779073f7e992 100644
--- a/tools/perf/arch/powerpc/util/perf_regs.c
+++ b/tools/perf/arch/powerpc/util/perf_regs.c
@@ -18,69 +18,6 @@
 #define PVR_POWER10		0x0080
 #define PVR_POWER11		0x0082
 
-static const struct sample_reg sample_reg_masks[] = {
-	SMPL_REG(r0, PERF_REG_POWERPC_R0),
-	SMPL_REG(r1, PERF_REG_POWERPC_R1),
-	SMPL_REG(r2, PERF_REG_POWERPC_R2),
-	SMPL_REG(r3, PERF_REG_POWERPC_R3),
-	SMPL_REG(r4, PERF_REG_POWERPC_R4),
-	SMPL_REG(r5, PERF_REG_POWERPC_R5),
-	SMPL_REG(r6, PERF_REG_POWERPC_R6),
-	SMPL_REG(r7, PERF_REG_POWERPC_R7),
-	SMPL_REG(r8, PERF_REG_POWERPC_R8),
-	SMPL_REG(r9, PERF_REG_POWERPC_R9),
-	SMPL_REG(r10, PERF_REG_POWERPC_R10),
-	SMPL_REG(r11, PERF_REG_POWERPC_R11),
-	SMPL_REG(r12, PERF_REG_POWERPC_R12),
-	SMPL_REG(r13, PERF_REG_POWERPC_R13),
-	SMPL_REG(r14, PERF_REG_POWERPC_R14),
-	SMPL_REG(r15, PERF_REG_POWERPC_R15),
-	SMPL_REG(r16, PERF_REG_POWERPC_R16),
-	SMPL_REG(r17, PERF_REG_POWERPC_R17),
-	SMPL_REG(r18, PERF_REG_POWERPC_R18),
-	SMPL_REG(r19, PERF_REG_POWERPC_R19),
-	SMPL_REG(r20, PERF_REG_POWERPC_R20),
-	SMPL_REG(r21, PERF_REG_POWERPC_R21),
-	SMPL_REG(r22, PERF_REG_POWERPC_R22),
-	SMPL_REG(r23, PERF_REG_POWERPC_R23),
-	SMPL_REG(r24, PERF_REG_POWERPC_R24),
-	SMPL_REG(r25, PERF_REG_POWERPC_R25),
-	SMPL_REG(r26, PERF_REG_POWERPC_R26),
-	SMPL_REG(r27, PERF_REG_POWERPC_R27),
-	SMPL_REG(r28, PERF_REG_POWERPC_R28),
-	SMPL_REG(r29, PERF_REG_POWERPC_R29),
-	SMPL_REG(r30, PERF_REG_POWERPC_R30),
-	SMPL_REG(r31, PERF_REG_POWERPC_R31),
-	SMPL_REG(nip, PERF_REG_POWERPC_NIP),
-	SMPL_REG(msr, PERF_REG_POWERPC_MSR),
-	SMPL_REG(orig_r3, PERF_REG_POWERPC_ORIG_R3),
-	SMPL_REG(ctr, PERF_REG_POWERPC_CTR),
-	SMPL_REG(link, PERF_REG_POWERPC_LINK),
-	SMPL_REG(xer, PERF_REG_POWERPC_XER),
-	SMPL_REG(ccr, PERF_REG_POWERPC_CCR),
-	SMPL_REG(softe, PERF_REG_POWERPC_SOFTE),
-	SMPL_REG(trap, PERF_REG_POWERPC_TRAP),
-	SMPL_REG(dar, PERF_REG_POWERPC_DAR),
-	SMPL_REG(dsisr, PERF_REG_POWERPC_DSISR),
-	SMPL_REG(sier, PERF_REG_POWERPC_SIER),
-	SMPL_REG(mmcra, PERF_REG_POWERPC_MMCRA),
-	SMPL_REG(mmcr0, PERF_REG_POWERPC_MMCR0),
-	SMPL_REG(mmcr1, PERF_REG_POWERPC_MMCR1),
-	SMPL_REG(mmcr2, PERF_REG_POWERPC_MMCR2),
-	SMPL_REG(mmcr3, PERF_REG_POWERPC_MMCR3),
-	SMPL_REG(sier2, PERF_REG_POWERPC_SIER2),
-	SMPL_REG(sier3, PERF_REG_POWERPC_SIER3),
-	SMPL_REG(pmc1, PERF_REG_POWERPC_PMC1),
-	SMPL_REG(pmc2, PERF_REG_POWERPC_PMC2),
-	SMPL_REG(pmc3, PERF_REG_POWERPC_PMC3),
-	SMPL_REG(pmc4, PERF_REG_POWERPC_PMC4),
-	SMPL_REG(pmc5, PERF_REG_POWERPC_PMC5),
-	SMPL_REG(pmc6, PERF_REG_POWERPC_PMC6),
-	SMPL_REG(sdar, PERF_REG_POWERPC_SDAR),
-	SMPL_REG(siar, PERF_REG_POWERPC_SIAR),
-	SMPL_REG_END
-};
-
 /* REG or %rREG */
 #define SDT_OP_REGEX1  "^(%r)?([1-2]?[0-9]|3[0-1])$"
 
@@ -233,8 +170,3 @@ uint64_t arch__user_reg_mask(void)
 {
 	return PERF_REGS_MASK;
 }
-
-const struct sample_reg *arch__sample_reg_masks(void)
-{
-	return sample_reg_masks;
-}
diff --git a/tools/perf/arch/riscv/util/perf_regs.c b/tools/perf/arch/riscv/util/perf_regs.c
index 6b1665f41180..2cf7a54106e0 100644
--- a/tools/perf/arch/riscv/util/perf_regs.c
+++ b/tools/perf/arch/riscv/util/perf_regs.c
@@ -2,10 +2,6 @@
 #include "perf_regs.h"
 #include "../../util/perf_regs.h"
 
-static const struct sample_reg sample_reg_masks[] = {
-	SMPL_REG_END
-};
-
 uint64_t arch__intr_reg_mask(void)
 {
 	return PERF_REGS_MASK;
@@ -15,8 +11,3 @@ uint64_t arch__user_reg_mask(void)
 {
 	return PERF_REGS_MASK;
 }
-
-const struct sample_reg *arch__sample_reg_masks(void)
-{
-	return sample_reg_masks;
-}
diff --git a/tools/perf/arch/s390/util/perf_regs.c b/tools/perf/arch/s390/util/perf_regs.c
index 6b1665f41180..2cf7a54106e0 100644
--- a/tools/perf/arch/s390/util/perf_regs.c
+++ b/tools/perf/arch/s390/util/perf_regs.c
@@ -2,10 +2,6 @@
 #include "perf_regs.h"
 #include "../../util/perf_regs.h"
 
-static const struct sample_reg sample_reg_masks[] = {
-	SMPL_REG_END
-};
-
 uint64_t arch__intr_reg_mask(void)
 {
 	return PERF_REGS_MASK;
@@ -15,8 +11,3 @@ uint64_t arch__user_reg_mask(void)
 {
 	return PERF_REGS_MASK;
 }
-
-const struct sample_reg *arch__sample_reg_masks(void)
-{
-	return sample_reg_masks;
-}
diff --git a/tools/perf/arch/x86/util/perf_regs.c b/tools/perf/arch/x86/util/perf_regs.c
index 12fd93f04802..a7ca4154fdf9 100644
--- a/tools/perf/arch/x86/util/perf_regs.c
+++ b/tools/perf/arch/x86/util/perf_regs.c
@@ -13,48 +13,6 @@
 #include "../../../util/pmu.h"
 #include "../../../util/pmus.h"
 
-static const struct sample_reg sample_reg_masks[] = {
-	SMPL_REG(AX, PERF_REG_X86_AX),
-	SMPL_REG(BX, PERF_REG_X86_BX),
-	SMPL_REG(CX, PERF_REG_X86_CX),
-	SMPL_REG(DX, PERF_REG_X86_DX),
-	SMPL_REG(SI, PERF_REG_X86_SI),
-	SMPL_REG(DI, PERF_REG_X86_DI),
-	SMPL_REG(BP, PERF_REG_X86_BP),
-	SMPL_REG(SP, PERF_REG_X86_SP),
-	SMPL_REG(IP, PERF_REG_X86_IP),
-	SMPL_REG(FLAGS, PERF_REG_X86_FLAGS),
-	SMPL_REG(CS, PERF_REG_X86_CS),
-	SMPL_REG(SS, PERF_REG_X86_SS),
-#ifdef HAVE_ARCH_X86_64_SUPPORT
-	SMPL_REG(R8, PERF_REG_X86_R8),
-	SMPL_REG(R9, PERF_REG_X86_R9),
-	SMPL_REG(R10, PERF_REG_X86_R10),
-	SMPL_REG(R11, PERF_REG_X86_R11),
-	SMPL_REG(R12, PERF_REG_X86_R12),
-	SMPL_REG(R13, PERF_REG_X86_R13),
-	SMPL_REG(R14, PERF_REG_X86_R14),
-	SMPL_REG(R15, PERF_REG_X86_R15),
-#endif
-	SMPL_REG2(XMM0, PERF_REG_X86_XMM0),
-	SMPL_REG2(XMM1, PERF_REG_X86_XMM1),
-	SMPL_REG2(XMM2, PERF_REG_X86_XMM2),
-	SMPL_REG2(XMM3, PERF_REG_X86_XMM3),
-	SMPL_REG2(XMM4, PERF_REG_X86_XMM4),
-	SMPL_REG2(XMM5, PERF_REG_X86_XMM5),
-	SMPL_REG2(XMM6, PERF_REG_X86_XMM6),
-	SMPL_REG2(XMM7, PERF_REG_X86_XMM7),
-	SMPL_REG2(XMM8, PERF_REG_X86_XMM8),
-	SMPL_REG2(XMM9, PERF_REG_X86_XMM9),
-	SMPL_REG2(XMM10, PERF_REG_X86_XMM10),
-	SMPL_REG2(XMM11, PERF_REG_X86_XMM11),
-	SMPL_REG2(XMM12, PERF_REG_X86_XMM12),
-	SMPL_REG2(XMM13, PERF_REG_X86_XMM13),
-	SMPL_REG2(XMM14, PERF_REG_X86_XMM14),
-	SMPL_REG2(XMM15, PERF_REG_X86_XMM15),
-	SMPL_REG_END
-};
-
 struct sdt_name_reg {
 	const char *sdt_name;
 	const char *uprobe_name;
@@ -276,11 +234,6 @@ int arch_sdt_arg_parse_op(char *old_op, char **new_op)
 	return SDT_ARG_VALID;
 }
 
-const struct sample_reg *arch__sample_reg_masks(void)
-{
-	return sample_reg_masks;
-}
-
 uint64_t arch__intr_reg_mask(void)
 {
 	struct perf_event_attr attr = {
diff --git a/tools/perf/util/arm64-frame-pointer-unwind-support.c b/tools/perf/util/arm64-frame-pointer-unwind-support.c
index 958afe8b821e..858ce2b01812 100644
--- a/tools/perf/util/arm64-frame-pointer-unwind-support.c
+++ b/tools/perf/util/arm64-frame-pointer-unwind-support.c
@@ -2,7 +2,6 @@
 #include "arm64-frame-pointer-unwind-support.h"
 #include "callchain.h"
 #include "event.h"
-#include "perf_regs.h" // SMPL_REG_MASK
 #include "unwind.h"
 #include <string.h>
 
@@ -15,6 +14,8 @@ struct entries {
 	size_t length;
 };
 
+#define SMPL_REG_MASK(b) (1ULL << (b))
+
 static bool get_leaf_frame_caller_enabled(struct perf_sample *sample)
 {
 	struct regs_dump *regs;
diff --git a/tools/perf/util/parse-regs-options.c b/tools/perf/util/parse-regs-options.c
index cda1c620968e..c0d0ef9fd495 100644
--- a/tools/perf/util/parse-regs-options.c
+++ b/tools/perf/util/parse-regs-options.c
@@ -5,15 +5,54 @@
 #include <string.h>
 #include <stdio.h>
 #include "util/debug.h"
+#include <dwarf-regs.h>
 #include <subcmd/parse-options.h>
 #include "util/perf_regs.h"
 #include "util/parse-regs-options.h"
 
+static void list_perf_regs(FILE *fp, uint64_t mask)
+{
+	const char *last_name = NULL;
+
+	fprintf(fp, "available registers: ");
+	for (int reg = 0; reg < 64; reg++) {
+		const char *name;
+
+		if (((1ULL << reg) & mask) == 0)
+			continue;
+
+		name = perf_reg_name(reg, EM_HOST);
+		if (name && (!last_name || strcmp(last_name, name)))
+			fprintf(fp, "%s%s", reg > 0 ? " " : "", name);
+		last_name = name;
+	}
+	fputc('\n', fp);
+}
+
+static uint64_t name_to_perf_reg_mask(const char *to_match, uint64_t mask)
+{
+	uint64_t reg_mask = 0;
+
+	for (int reg = 0; reg < 64; reg++) {
+		const char *name;
+
+		if (((1ULL << reg) & mask) == 0)
+			continue;
+
+		name = perf_reg_name(reg, EM_HOST);
+		if (!name)
+			continue;
+
+		if (!strcasecmp(to_match, name))
+			reg_mask |= 1ULL << reg;
+	}
+	return reg_mask;
+}
+
 static int
 __parse_regs(const struct option *opt, const char *str, int unset, bool intr)
 {
 	uint64_t *mode = (uint64_t *)opt->value;
-	const struct sample_reg *r = NULL;
 	char *s, *os = NULL, *p;
 	int ret = -1;
 	uint64_t mask;
@@ -27,50 +66,41 @@ __parse_regs(const struct option *opt, const char *str, int unset, bool intr)
 	if (*mode)
 		return -1;
 
-	if (intr)
-		mask = arch__intr_reg_mask();
-	else
-		mask = arch__user_reg_mask();
-
 	/* str may be NULL in case no arg is passed to -I */
-	if (str) {
-		/* because str is read-only */
-		s = os = strdup(str);
-		if (!s)
-			return -1;
-
-		for (;;) {
-			p = strchr(s, ',');
-			if (p)
-				*p = '\0';
-
-			if (!strcmp(s, "?")) {
-				fprintf(stderr, "available registers: ");
-				for (r = arch__sample_reg_masks(); r->name; r++) {
-					if (r->mask & mask)
-						fprintf(stderr, "%s ", r->name);
-				}
-				fputc('\n', stderr);
-				/* just printing available regs */
-				goto error;
-			}
-			for (r = arch__sample_reg_masks(); r->name; r++) {
-				if ((r->mask & mask) && !strcasecmp(s, r->name))
-					break;
-			}
-			if (!r || !r->name) {
-				ui__warning("Unknown register \"%s\", check man page or run \"perf record %s?\"\n",
-					    s, intr ? "-I" : "--user-regs=");
-				goto error;
-			}
-
-			*mode |= r->mask;
-
-			if (!p)
-				break;
-
-			s = p + 1;
+	if (!str)
+		return -1;
+
+	mask = intr ? arch__intr_reg_mask() : arch__user_reg_mask();
+
+	/* because str is read-only */
+	s = os = strdup(str);
+	if (!s)
+		return -1;
+
+	for (;;) {
+		uint64_t reg_mask;
+
+		p = strchr(s, ',');
+		if (p)
+			*p = '\0';
+
+		if (!strcmp(s, "?")) {
+			list_perf_regs(stderr, mask);
+			goto error;
+		}
+
+		reg_mask = name_to_perf_reg_mask(s, mask);
+		if (reg_mask == 0) {
+			ui__warning("Unknown register \"%s\", check man page or run \"perf record %s?\"\n",
+				s, intr ? "-I" : "--user-regs=");
+			goto error;
 		}
+		*mode |= reg_mask;
+
+		if (!p)
+			break;
+
+		s = p + 1;
 	}
 	ret = 0;
 
diff --git a/tools/perf/util/perf_regs.c b/tools/perf/util/perf_regs.c
index f9723091e673..cd5acee3dc62 100644
--- a/tools/perf/util/perf_regs.c
+++ b/tools/perf/util/perf_regs.c
@@ -23,15 +23,6 @@ uint64_t __weak arch__user_reg_mask(void)
 	return 0;
 }
 
-static const struct sample_reg sample_reg_masks[] = {
-	SMPL_REG_END
-};
-
-const struct sample_reg * __weak arch__sample_reg_masks(void)
-{
-	return sample_reg_masks;
-}
-
 const char *perf_reg_name(int id, uint16_t e_machine)
 {
 	const char *reg_name = NULL;
diff --git a/tools/perf/util/perf_regs.h b/tools/perf/util/perf_regs.h
index 7bfc6a34c02b..2c2a8de6912d 100644
--- a/tools/perf/util/perf_regs.h
+++ b/tools/perf/util/perf_regs.h
@@ -7,17 +7,6 @@
 
 struct regs_dump;
 
-struct sample_reg {
-	const char *name;
-	uint64_t mask;
-};
-
-#define SMPL_REG_MASK(b) (1ULL << (b))
-#define SMPL_REG(n, b) { .name = #n, .mask = SMPL_REG_MASK(b) }
-#define SMPL_REG2_MASK(b) (3ULL << (b))
-#define SMPL_REG2(n, b) { .name = #n, .mask = SMPL_REG2_MASK(b) }
-#define SMPL_REG_END { .name = NULL }
-
 enum {
 	SDT_ARG_VALID = 0,
 	SDT_ARG_SKIP,
@@ -26,7 +15,6 @@ enum {
 int arch_sdt_arg_parse_op(char *old_op, char **new_op);
 uint64_t arch__intr_reg_mask(void);
 uint64_t arch__user_reg_mask(void);
-const struct sample_reg *arch__sample_reg_masks(void);
 
 const char *perf_reg_name(int id, uint16_t e_machine);
 int perf_reg_value(u64 *valp, struct regs_dump *regs, int id);
-- 
cgit v1.2.3


From d8df878140506e7938928195f540c10b1089fdaf Mon Sep 17 00:00:00 2001
From: Yonghong Song <yonghong.song@linux.dev>
Date: Thu, 22 Jan 2026 21:51:22 -0800
Subject: selftests/bpf: Fix task_local_data failure with 64K page

On arm64 systems with 64K pages, the selftest task_local_data has the following
failures:
  ...
  test_task_local_data_basic:PASS:tld_create_key 0 nsec
  test_task_local_data_basic:FAIL:tld_create_key unexpected tld_create_key: actual 0 != expected -28
  ...
  test_task_local_data_basic_thread:PASS:run task_main 0 nsec
  test_task_local_data_basic_thread:FAIL:task_main retval unexpected error: 2 (errno 0)
  test_task_local_data_basic_thread:FAIL:tld_get_data value0 unexpected tld_get_data value0: actual 0 != expected 6268
  ...
  #447/1   task_local_data/task_local_data_basic:FAIL
  ...
  #447/2   task_local_data/task_local_data_race:FAIL
  #447     task_local_data:FAIL

When TLD_DYN_DATA_SIZE is 64K page size, for
  struct tld_meta_u {
       _Atomic __u8 cnt;
       __u16 size;
        struct tld_metadata metadata[];
  };
field 'cnt' would overflow. For example, for 4K page, 'cnt' will
be 4096/64 = 64. But for 64K page, 'cnt' will be 65536/64 = 1024
and 'cnt' is not enough for 1024. To accommodate 64K page,
'_Atomic __u8 cnt' becomes '_Atomic __u16 cnt'. A few other places
are adjusted accordingly.

In test_task_local_data.c, the value for TLD_DYN_DATA_SIZE is changed
from 4096 to (getpagesize() - 8) since the maximum buffer size for
TLD_DYN_DATA_SIZE is (getpagesize() - 8).

Reviewed-by: Alan Maguire <alan.maguire@oracle.com>
Tested-by: Alan Maguire <alan.maguire@oracle.com>
Cc: Amery Hung <ameryhung@gmail.com>
Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
Acked-by: Amery Hung <ameryhung@gmail.com>
Link: https://lore.kernel.org/r/20260123055122.494352-1-yonghong.song@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/prog_tests/task_local_data.h      | 4 ++--
 tools/testing/selftests/bpf/prog_tests/test_task_local_data.c | 2 +-
 tools/testing/selftests/bpf/progs/task_local_data.bpf.h       | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/task_local_data.h b/tools/testing/selftests/bpf/prog_tests/task_local_data.h
index 2de38776a2d4..0f86b9275cf9 100644
--- a/tools/testing/selftests/bpf/prog_tests/task_local_data.h
+++ b/tools/testing/selftests/bpf/prog_tests/task_local_data.h
@@ -94,7 +94,7 @@ struct tld_metadata {
 };
 
 struct tld_meta_u {
-	_Atomic __u8 cnt;
+	_Atomic __u16 cnt;
 	__u16 size;
 	struct tld_metadata metadata[];
 };
@@ -217,7 +217,7 @@ out:
 static tld_key_t __tld_create_key(const char *name, size_t size, bool dyn_data)
 {
 	int err, i, sz, off = 0;
-	__u8 cnt;
+	__u16 cnt;
 
 	if (!TLD_READ_ONCE(tld_meta_p)) {
 		err = __tld_init_meta_p();
diff --git a/tools/testing/selftests/bpf/prog_tests/test_task_local_data.c b/tools/testing/selftests/bpf/prog_tests/test_task_local_data.c
index 9fd6306b455c..9556ad3d986f 100644
--- a/tools/testing/selftests/bpf/prog_tests/test_task_local_data.c
+++ b/tools/testing/selftests/bpf/prog_tests/test_task_local_data.c
@@ -4,7 +4,7 @@
 #include <test_progs.h>
 
 #define TLD_FREE_DATA_ON_THREAD_EXIT
-#define TLD_DYN_DATA_SIZE 4096
+#define TLD_DYN_DATA_SIZE (getpagesize() - 8)
 #include "task_local_data.h"
 
 struct test_tld_struct {
diff --git a/tools/testing/selftests/bpf/progs/task_local_data.bpf.h b/tools/testing/selftests/bpf/progs/task_local_data.bpf.h
index 432fff2af844..fed53d63a7e5 100644
--- a/tools/testing/selftests/bpf/progs/task_local_data.bpf.h
+++ b/tools/testing/selftests/bpf/progs/task_local_data.bpf.h
@@ -80,7 +80,7 @@ struct tld_metadata {
 };
 
 struct tld_meta_u {
-	__u8 cnt;
+	__u16 cnt;
 	__u16 size;
 	struct tld_metadata metadata[TLD_MAX_DATA_CNT];
 };
-- 
cgit v1.2.3


From c7900f225a102219f5fe2c1c93a7dec5467315ee Mon Sep 17 00:00:00 2001
From: Yonghong Song <yonghong.song@linux.dev>
Date: Thu, 22 Jan 2026 21:51:28 -0800
Subject: selftests/bpf: Fix xdp_pull_data failure with 64K page

If the argument 'pull_len' of run_test() is 'PULL_MAX' or
'PULL_MAX | PULL_PLUS_ONE', the eventual pull_len size
will close to the page size. On arm64 systems with 64K pages,
the pull_len size will be close to 64K. But the existing buffer
will be close to 9000 which is not enough to pull.

For those failed run_tests(), make buff size to
  pg_sz + (pg_sz / 2)
This way, there will be enough buffer space to pull
regardless of page size.

Tested-by: Alan Maguire <alan.maguire@oracle.com>
Cc: Amery Hung <ameryhung@gmail.com>
Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
Acked-by: Amery Hung <ameryhung@gmail.com>
Link: https://lore.kernel.org/r/20260123055128.495265-1-yonghong.song@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/prog_tests/xdp_pull_data.c | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_pull_data.c b/tools/testing/selftests/bpf/prog_tests/xdp_pull_data.c
index efa350d04ec5..910dabe95afd 100644
--- a/tools/testing/selftests/bpf/prog_tests/xdp_pull_data.c
+++ b/tools/testing/selftests/bpf/prog_tests/xdp_pull_data.c
@@ -114,12 +114,14 @@ static void test_xdp_pull_data_basic(void)
 {
 	u32 pg_sz, max_meta_len, max_data_len;
 	struct test_xdp_pull_data *skel;
+	int buff_len;
 
 	skel = test_xdp_pull_data__open_and_load();
 	if (!ASSERT_OK_PTR(skel, "test_xdp_pull_data__open_and_load"))
 		return;
 
 	pg_sz = sysconf(_SC_PAGE_SIZE);
+	buff_len = pg_sz + pg_sz / 2;
 
 	if (find_xdp_sizes(skel, pg_sz))
 		goto out;
@@ -140,13 +142,13 @@ static void test_xdp_pull_data_basic(void)
 	run_test(skel, XDP_PASS, pg_sz, 9000, 0, 1025, 1025);
 
 	/* multi-buf pkt, empty linear data area, pull requires memmove */
-	run_test(skel, XDP_PASS, pg_sz, 9000, 0, 0, PULL_MAX);
+	run_test(skel, XDP_PASS, pg_sz, buff_len, 0, 0, PULL_MAX);
 
 	/* multi-buf pkt, no headroom */
-	run_test(skel, XDP_PASS, pg_sz, 9000, max_meta_len, 1024, PULL_MAX);
+	run_test(skel, XDP_PASS, pg_sz, buff_len, max_meta_len, 1024, PULL_MAX);
 
 	/* multi-buf pkt, no tailroom, pull requires memmove */
-	run_test(skel, XDP_PASS, pg_sz, 9000, 0, max_data_len, PULL_MAX);
+	run_test(skel, XDP_PASS, pg_sz, buff_len, 0, max_data_len, PULL_MAX);
 
 	/* Test cases with invalid pull length */
 
@@ -154,18 +156,18 @@ static void test_xdp_pull_data_basic(void)
 	run_test(skel, XDP_DROP, pg_sz, 2048, 0, 2048, 2049);
 
 	/* multi-buf pkt with no space left in linear data area */
-	run_test(skel, XDP_DROP, pg_sz, 9000, max_meta_len, max_data_len,
+	run_test(skel, XDP_DROP, pg_sz, buff_len, max_meta_len, max_data_len,
 		 PULL_MAX | PULL_PLUS_ONE);
 
 	/* multi-buf pkt, empty linear data area */
-	run_test(skel, XDP_DROP, pg_sz, 9000, 0, 0, PULL_MAX | PULL_PLUS_ONE);
+	run_test(skel, XDP_DROP, pg_sz, buff_len, 0, 0, PULL_MAX | PULL_PLUS_ONE);
 
 	/* multi-buf pkt, no headroom */
-	run_test(skel, XDP_DROP, pg_sz, 9000, max_meta_len, 1024,
+	run_test(skel, XDP_DROP, pg_sz, buff_len, max_meta_len, 1024,
 		 PULL_MAX | PULL_PLUS_ONE);
 
 	/* multi-buf pkt, no tailroom */
-	run_test(skel, XDP_DROP, pg_sz, 9000, 0, max_data_len,
+	run_test(skel, XDP_DROP, pg_sz, buff_len, 0, max_data_len,
 		 PULL_MAX | PULL_PLUS_ONE);
 
 out:
-- 
cgit v1.2.3


From 2d419c44658f75e7655794341a95c0687830f3df Mon Sep 17 00:00:00 2001
From: Menglong Dong <menglong8.dong@gmail.com>
Date: Sat, 24 Jan 2026 14:19:56 +0800
Subject: bpf: add fsession support

The fsession is something that similar to kprobe session. It allow to
attach a single BPF program to both the entry and the exit of the target
functions.

Introduce the struct bpf_fsession_link, which allows to add the link to
both the fentry and fexit progs_hlist of the trampoline.

Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
Co-developed-by: Leon Hwang <leon.hwang@linux.dev>
Signed-off-by: Leon Hwang <leon.hwang@linux.dev>
Link: https://lore.kernel.org/r/20260124062008.8657-2-dongml2@chinatelecom.cn
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h                                | 19 ++++++++
 include/uapi/linux/bpf.h                           |  1 +
 kernel/bpf/btf.c                                   |  2 +
 kernel/bpf/syscall.c                               | 18 +++++++-
 kernel/bpf/trampoline.c                            | 53 ++++++++++++++++++----
 kernel/bpf/verifier.c                              | 12 +++--
 net/bpf/test_run.c                                 |  1 +
 net/core/bpf_sk_storage.c                          |  1 +
 tools/include/uapi/linux/bpf.h                     |  1 +
 .../selftests/bpf/prog_tests/tracing_failure.c     |  2 +-
 10 files changed, 97 insertions(+), 13 deletions(-)

(limited to 'tools')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 5936f8e2996f..41228b0add52 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1309,6 +1309,7 @@ enum bpf_tramp_prog_type {
 	BPF_TRAMP_MODIFY_RETURN,
 	BPF_TRAMP_MAX,
 	BPF_TRAMP_REPLACE, /* more than MAX */
+	BPF_TRAMP_FSESSION,
 };
 
 struct bpf_tramp_image {
@@ -1875,6 +1876,11 @@ struct bpf_tracing_link {
 	struct bpf_prog *tgt_prog;
 };
 
+struct bpf_fsession_link {
+	struct bpf_tracing_link link;
+	struct bpf_tramp_link fexit;
+};
+
 struct bpf_raw_tp_link {
 	struct bpf_link link;
 	struct bpf_raw_event_map *btp;
@@ -2169,6 +2175,19 @@ static inline void bpf_struct_ops_desc_release(struct bpf_struct_ops_desc *st_op
 
 #endif
 
+static inline int bpf_fsession_cnt(struct bpf_tramp_links *links)
+{
+	struct bpf_tramp_links fentries = links[BPF_TRAMP_FENTRY];
+	int cnt = 0;
+
+	for (int i = 0; i < links[BPF_TRAMP_FENTRY].nr_links; i++) {
+		if (fentries.links[i]->link.prog->expected_attach_type == BPF_TRACE_FSESSION)
+			cnt++;
+	}
+
+	return cnt;
+}
+
 int bpf_prog_ctx_arg_info_init(struct bpf_prog *prog,
 			       const struct bpf_ctx_arg_aux *info, u32 cnt);
 
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 2a2ade4be60f..44e7dbc278e3 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1145,6 +1145,7 @@ enum bpf_attach_type {
 	BPF_NETKIT_PEER,
 	BPF_TRACE_KPROBE_SESSION,
 	BPF_TRACE_UPROBE_SESSION,
+	BPF_TRACE_FSESSION,
 	__MAX_BPF_ATTACH_TYPE
 };
 
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index d10b3404260f..8959f3bc1e92 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -6219,6 +6219,7 @@ static int btf_validate_prog_ctx_type(struct bpf_verifier_log *log, const struct
 		case BPF_TRACE_FENTRY:
 		case BPF_TRACE_FEXIT:
 		case BPF_MODIFY_RETURN:
+		case BPF_TRACE_FSESSION:
 			/* allow u64* as ctx */
 			if (btf_is_int(t) && t->size == 8)
 				return 0;
@@ -6820,6 +6821,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
 			fallthrough;
 		case BPF_LSM_CGROUP:
 		case BPF_TRACE_FEXIT:
+		case BPF_TRACE_FSESSION:
 			/* When LSM programs are attached to void LSM hooks
 			 * they use FEXIT trampolines and when attached to
 			 * int LSM hooks, they use MODIFY_RETURN trampolines.
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 3c5c03d43f5f..b9184545c3fd 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -3577,6 +3577,7 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog,
 	case BPF_PROG_TYPE_TRACING:
 		if (prog->expected_attach_type != BPF_TRACE_FENTRY &&
 		    prog->expected_attach_type != BPF_TRACE_FEXIT &&
+		    prog->expected_attach_type != BPF_TRACE_FSESSION &&
 		    prog->expected_attach_type != BPF_MODIFY_RETURN) {
 			err = -EINVAL;
 			goto out_put_prog;
@@ -3626,7 +3627,21 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog,
 		key = bpf_trampoline_compute_key(tgt_prog, NULL, btf_id);
 	}
 
-	link = kzalloc(sizeof(*link), GFP_USER);
+	if (prog->expected_attach_type == BPF_TRACE_FSESSION) {
+		struct bpf_fsession_link *fslink;
+
+		fslink = kzalloc(sizeof(*fslink), GFP_USER);
+		if (fslink) {
+			bpf_link_init(&fslink->fexit.link, BPF_LINK_TYPE_TRACING,
+				      &bpf_tracing_link_lops, prog, attach_type);
+			fslink->fexit.cookie = bpf_cookie;
+			link = &fslink->link;
+		} else {
+			link = NULL;
+		}
+	} else {
+		link = kzalloc(sizeof(*link), GFP_USER);
+	}
 	if (!link) {
 		err = -ENOMEM;
 		goto out_put_prog;
@@ -4350,6 +4365,7 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type)
 	case BPF_TRACE_RAW_TP:
 	case BPF_TRACE_FENTRY:
 	case BPF_TRACE_FEXIT:
+	case BPF_TRACE_FSESSION:
 	case BPF_MODIFY_RETURN:
 		return BPF_PROG_TYPE_TRACING;
 	case BPF_LSM_MAC:
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index 2a125d063e62..edf9da43762d 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -109,10 +109,17 @@ bool bpf_prog_has_trampoline(const struct bpf_prog *prog)
 	enum bpf_attach_type eatype = prog->expected_attach_type;
 	enum bpf_prog_type ptype = prog->type;
 
-	return (ptype == BPF_PROG_TYPE_TRACING &&
-		(eatype == BPF_TRACE_FENTRY || eatype == BPF_TRACE_FEXIT ||
-		 eatype == BPF_MODIFY_RETURN)) ||
-		(ptype == BPF_PROG_TYPE_LSM && eatype == BPF_LSM_MAC);
+	switch (ptype) {
+	case BPF_PROG_TYPE_TRACING:
+		if (eatype == BPF_TRACE_FENTRY || eatype == BPF_TRACE_FEXIT ||
+		    eatype == BPF_MODIFY_RETURN || eatype == BPF_TRACE_FSESSION)
+			return true;
+		return false;
+	case BPF_PROG_TYPE_LSM:
+		return eatype == BPF_LSM_MAC;
+	default:
+		return false;
+	}
 }
 
 void bpf_image_ksym_init(void *data, unsigned int size, struct bpf_ksym *ksym)
@@ -559,6 +566,8 @@ static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(struct bpf_prog *prog)
 		return BPF_TRAMP_MODIFY_RETURN;
 	case BPF_TRACE_FEXIT:
 		return BPF_TRAMP_FEXIT;
+	case BPF_TRACE_FSESSION:
+		return BPF_TRAMP_FSESSION;
 	case BPF_LSM_MAC:
 		if (!prog->aux->attach_func_proto->type)
 			/* The function returns void, we cannot modify its
@@ -594,8 +603,10 @@ static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link,
 				      struct bpf_trampoline *tr,
 				      struct bpf_prog *tgt_prog)
 {
+	struct bpf_fsession_link *fslink = NULL;
 	enum bpf_tramp_prog_type kind;
 	struct bpf_tramp_link *link_exiting;
+	struct hlist_head *prog_list;
 	int err = 0;
 	int cnt = 0, i;
 
@@ -621,24 +632,43 @@ static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link,
 					  BPF_MOD_JUMP, NULL,
 					  link->link.prog->bpf_func);
 	}
+	if (kind == BPF_TRAMP_FSESSION) {
+		prog_list = &tr->progs_hlist[BPF_TRAMP_FENTRY];
+		cnt++;
+	} else {
+		prog_list = &tr->progs_hlist[kind];
+	}
 	if (cnt >= BPF_MAX_TRAMP_LINKS)
 		return -E2BIG;
 	if (!hlist_unhashed(&link->tramp_hlist))
 		/* prog already linked */
 		return -EBUSY;
-	hlist_for_each_entry(link_exiting, &tr->progs_hlist[kind], tramp_hlist) {
+	hlist_for_each_entry(link_exiting, prog_list, tramp_hlist) {
 		if (link_exiting->link.prog != link->link.prog)
 			continue;
 		/* prog already linked */
 		return -EBUSY;
 	}
 
-	hlist_add_head(&link->tramp_hlist, &tr->progs_hlist[kind]);
-	tr->progs_cnt[kind]++;
+	hlist_add_head(&link->tramp_hlist, prog_list);
+	if (kind == BPF_TRAMP_FSESSION) {
+		tr->progs_cnt[BPF_TRAMP_FENTRY]++;
+		fslink = container_of(link, struct bpf_fsession_link, link.link);
+		hlist_add_head(&fslink->fexit.tramp_hlist, &tr->progs_hlist[BPF_TRAMP_FEXIT]);
+		tr->progs_cnt[BPF_TRAMP_FEXIT]++;
+	} else {
+		tr->progs_cnt[kind]++;
+	}
 	err = bpf_trampoline_update(tr, true /* lock_direct_mutex */);
 	if (err) {
 		hlist_del_init(&link->tramp_hlist);
-		tr->progs_cnt[kind]--;
+		if (kind == BPF_TRAMP_FSESSION) {
+			tr->progs_cnt[BPF_TRAMP_FENTRY]--;
+			hlist_del_init(&fslink->fexit.tramp_hlist);
+			tr->progs_cnt[BPF_TRAMP_FEXIT]--;
+		} else {
+			tr->progs_cnt[kind]--;
+		}
 	}
 	return err;
 }
@@ -672,6 +702,13 @@ static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link,
 		guard(mutex)(&tgt_prog->aux->ext_mutex);
 		tgt_prog->aux->is_extended = false;
 		return err;
+	} else if (kind == BPF_TRAMP_FSESSION) {
+		struct bpf_fsession_link *fslink =
+			container_of(link, struct bpf_fsession_link, link.link);
+
+		hlist_del_init(&fslink->fexit.tramp_hlist);
+		tr->progs_cnt[BPF_TRAMP_FEXIT]--;
+		kind = BPF_TRAMP_FENTRY;
 	}
 	hlist_del_init(&link->tramp_hlist);
 	tr->progs_cnt[kind]--;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index c7f5234d5fd2..41bbed6418b5 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -17848,6 +17848,7 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char
 		switch (env->prog->expected_attach_type) {
 		case BPF_TRACE_FENTRY:
 		case BPF_TRACE_FEXIT:
+		case BPF_TRACE_FSESSION:
 			range = retval_range(0, 0);
 			break;
 		case BPF_TRACE_RAW_TP:
@@ -23774,6 +23775,7 @@ patch_map_ops_generic:
 		if (prog_type == BPF_PROG_TYPE_TRACING &&
 		    insn->imm == BPF_FUNC_get_func_ret) {
 			if (eatype == BPF_TRACE_FEXIT ||
+			    eatype == BPF_TRACE_FSESSION ||
 			    eatype == BPF_MODIFY_RETURN) {
 				/* Load nr_args from ctx - 8 */
 				insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8);
@@ -24725,7 +24727,8 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
 		if (tgt_prog->type == BPF_PROG_TYPE_TRACING &&
 		    prog_extension &&
 		    (tgt_prog->expected_attach_type == BPF_TRACE_FENTRY ||
-		     tgt_prog->expected_attach_type == BPF_TRACE_FEXIT)) {
+		     tgt_prog->expected_attach_type == BPF_TRACE_FEXIT ||
+		     tgt_prog->expected_attach_type == BPF_TRACE_FSESSION)) {
 			/* Program extensions can extend all program types
 			 * except fentry/fexit. The reason is the following.
 			 * The fentry/fexit programs are used for performance
@@ -24740,7 +24743,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
 			 * beyond reasonable stack size. Hence extending fentry
 			 * is not allowed.
 			 */
-			bpf_log(log, "Cannot extend fentry/fexit\n");
+			bpf_log(log, "Cannot extend fentry/fexit/fsession\n");
 			return -EINVAL;
 		}
 	} else {
@@ -24824,6 +24827,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
 	case BPF_LSM_CGROUP:
 	case BPF_TRACE_FENTRY:
 	case BPF_TRACE_FEXIT:
+	case BPF_TRACE_FSESSION:
 		if (!btf_type_is_func(t)) {
 			bpf_log(log, "attach_btf_id %u is not a function\n",
 				btf_id);
@@ -24990,6 +24994,7 @@ static bool can_be_sleepable(struct bpf_prog *prog)
 		case BPF_TRACE_FEXIT:
 		case BPF_MODIFY_RETURN:
 		case BPF_TRACE_ITER:
+		case BPF_TRACE_FSESSION:
 			return true;
 		default:
 			return false;
@@ -25071,9 +25076,10 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
 			tgt_info.tgt_name);
 		return -EINVAL;
 	} else if ((prog->expected_attach_type == BPF_TRACE_FEXIT ||
+		   prog->expected_attach_type == BPF_TRACE_FSESSION ||
 		   prog->expected_attach_type == BPF_MODIFY_RETURN) &&
 		   btf_id_set_contains(&noreturn_deny, btf_id)) {
-		verbose(env, "Attaching fexit/fmod_ret to __noreturn function '%s' is rejected.\n",
+		verbose(env, "Attaching fexit/fsession/fmod_ret to __noreturn function '%s' is rejected.\n",
 			tgt_info.tgt_name);
 		return -EINVAL;
 	}
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index 26cfcfdc45eb..178c4738e63b 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -685,6 +685,7 @@ int bpf_prog_test_run_tracing(struct bpf_prog *prog,
 	switch (prog->expected_attach_type) {
 	case BPF_TRACE_FENTRY:
 	case BPF_TRACE_FEXIT:
+	case BPF_TRACE_FSESSION:
 		if (bpf_fentry_test1(1) != 2 ||
 		    bpf_fentry_test2(2, 3) != 5 ||
 		    bpf_fentry_test3(4, 5, 6) != 15 ||
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index 850dd736ccd1..de111818f3a0 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -365,6 +365,7 @@ static bool bpf_sk_storage_tracing_allowed(const struct bpf_prog *prog)
 		return true;
 	case BPF_TRACE_FENTRY:
 	case BPF_TRACE_FEXIT:
+	case BPF_TRACE_FSESSION:
 		return !!strncmp(prog->aux->attach_func_name, "bpf_sk_storage",
 				 strlen("bpf_sk_storage"));
 	default:
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index b816bc53d2e1..3ca7d76e05f0 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1145,6 +1145,7 @@ enum bpf_attach_type {
 	BPF_NETKIT_PEER,
 	BPF_TRACE_KPROBE_SESSION,
 	BPF_TRACE_UPROBE_SESSION,
+	BPF_TRACE_FSESSION,
 	__MAX_BPF_ATTACH_TYPE
 };
 
diff --git a/tools/testing/selftests/bpf/prog_tests/tracing_failure.c b/tools/testing/selftests/bpf/prog_tests/tracing_failure.c
index 10e231965589..f9f9e1cb87bf 100644
--- a/tools/testing/selftests/bpf/prog_tests/tracing_failure.c
+++ b/tools/testing/selftests/bpf/prog_tests/tracing_failure.c
@@ -73,7 +73,7 @@ static void test_tracing_deny(void)
 static void test_fexit_noreturns(void)
 {
 	test_tracing_fail_prog("fexit_noreturns",
-			       "Attaching fexit/fmod_ret to __noreturn function 'do_exit' is rejected.");
+			       "Attaching fexit/fsession/fmod_ret to __noreturn function 'do_exit' is rejected.");
 }
 
 void test_tracing_failure(void)
-- 
cgit v1.2.3


From 8fe4dc4f6456b3d2c9e6f8aeb1f978b7bff0f6c8 Mon Sep 17 00:00:00 2001
From: Menglong Dong <menglong8.dong@gmail.com>
Date: Sat, 24 Jan 2026 14:19:58 +0800
Subject: bpf: change prototype of bpf_session_{cookie,is_return}

Add the function argument of "void *ctx" to bpf_session_cookie() and
bpf_session_is_return(), which is a preparation of the next patch.

The two kfunc is seldom used now, so it will not introduce much effect
to change their function prototype.

Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20260124062008.8657-4-dongml2@chinatelecom.cn
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c                                     |  6 +++++-
 kernel/trace/bpf_trace.c                                  |  4 ++--
 tools/testing/selftests/bpf/bpf_kfuncs.h                  |  3 ---
 .../selftests/bpf/progs/kprobe_multi_session_cookie.c     | 15 +++++++--------
 tools/testing/selftests/bpf/progs/uprobe_multi_session.c  |  7 +++----
 .../selftests/bpf/progs/uprobe_multi_session_cookie.c     | 15 +++++++--------
 .../selftests/bpf/progs/uprobe_multi_session_recursive.c  | 11 +++++------
 7 files changed, 29 insertions(+), 32 deletions(-)

(limited to 'tools')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 2081343a848d..0fa73d56cb8b 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -12484,6 +12484,7 @@ enum special_kfunc_type {
 	KF_bpf_arena_alloc_pages,
 	KF_bpf_arena_free_pages,
 	KF_bpf_arena_reserve_pages,
+	KF_bpf_session_is_return,
 };
 
 BTF_ID_LIST(special_kfunc_list)
@@ -12561,6 +12562,7 @@ BTF_ID(func, bpf_task_work_schedule_resume)
 BTF_ID(func, bpf_arena_alloc_pages)
 BTF_ID(func, bpf_arena_free_pages)
 BTF_ID(func, bpf_arena_reserve_pages)
+BTF_ID(func, bpf_session_is_return)
 
 static bool is_task_work_add_kfunc(u32 func_id)
 {
@@ -12615,7 +12617,9 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,
 	struct bpf_reg_state *reg = &regs[regno];
 	bool arg_mem_size = false;
 
-	if (meta->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx])
+	if (meta->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx] ||
+	    meta->func_id == special_kfunc_list[KF_bpf_session_is_return] ||
+	    meta->func_id == special_kfunc_list[KF_bpf_session_cookie])
 		return KF_ARG_PTR_TO_CTX;
 
 	if (argno + 1 < nargs &&
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index d466a1503da3..13f0a2de33b7 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -3323,7 +3323,7 @@ static u64 bpf_uprobe_multi_entry_ip(struct bpf_run_ctx *ctx)
 
 __bpf_kfunc_start_defs();
 
-__bpf_kfunc bool bpf_session_is_return(void)
+__bpf_kfunc bool bpf_session_is_return(void *ctx)
 {
 	struct bpf_session_run_ctx *session_ctx;
 
@@ -3331,7 +3331,7 @@ __bpf_kfunc bool bpf_session_is_return(void)
 	return session_ctx->is_return;
 }
 
-__bpf_kfunc __u64 *bpf_session_cookie(void)
+__bpf_kfunc __u64 *bpf_session_cookie(void *ctx)
 {
 	struct bpf_session_run_ctx *session_ctx;
 
diff --git a/tools/testing/selftests/bpf/bpf_kfuncs.h b/tools/testing/selftests/bpf/bpf_kfuncs.h
index e0189254bb6e..7dad01439391 100644
--- a/tools/testing/selftests/bpf/bpf_kfuncs.h
+++ b/tools/testing/selftests/bpf/bpf_kfuncs.h
@@ -79,9 +79,6 @@ extern int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_ptr,
 				      struct bpf_dynptr *sig_ptr,
 				      struct bpf_key *trusted_keyring) __ksym;
 
-extern bool bpf_session_is_return(void) __ksym __weak;
-extern __u64 *bpf_session_cookie(void) __ksym __weak;
-
 struct dentry;
 /* Description
  *  Returns xattr of a dentry
diff --git a/tools/testing/selftests/bpf/progs/kprobe_multi_session_cookie.c b/tools/testing/selftests/bpf/progs/kprobe_multi_session_cookie.c
index 0835b5edf685..ad627016e3e5 100644
--- a/tools/testing/selftests/bpf/progs/kprobe_multi_session_cookie.c
+++ b/tools/testing/selftests/bpf/progs/kprobe_multi_session_cookie.c
@@ -1,9 +1,8 @@
 // SPDX-License-Identifier: GPL-2.0
-#include <linux/bpf.h>
+#include <vmlinux.h>
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_tracing.h>
 #include <stdbool.h>
-#include "bpf_kfuncs.h"
 
 char _license[] SEC("license") = "GPL";
 
@@ -23,16 +22,16 @@ int BPF_PROG(trigger)
 	return 0;
 }
 
-static int check_cookie(__u64 val, __u64 *result)
+static int check_cookie(struct pt_regs *ctx, __u64 val, __u64 *result)
 {
 	__u64 *cookie;
 
 	if (bpf_get_current_pid_tgid() >> 32 != pid)
 		return 1;
 
-	cookie = bpf_session_cookie();
+	cookie = bpf_session_cookie(ctx);
 
-	if (bpf_session_is_return())
+	if (bpf_session_is_return(ctx))
 		*result = *cookie == val ? val : 0;
 	else
 		*cookie = val;
@@ -42,17 +41,17 @@ static int check_cookie(__u64 val, __u64 *result)
 SEC("kprobe.session/bpf_fentry_test1")
 int test_kprobe_1(struct pt_regs *ctx)
 {
-	return check_cookie(1, &test_kprobe_1_result);
+	return check_cookie(ctx, 1, &test_kprobe_1_result);
 }
 
 SEC("kprobe.session/bpf_fentry_test1")
 int test_kprobe_2(struct pt_regs *ctx)
 {
-	return check_cookie(2, &test_kprobe_2_result);
+	return check_cookie(ctx, 2, &test_kprobe_2_result);
 }
 
 SEC("kprobe.session/bpf_fentry_test1")
 int test_kprobe_3(struct pt_regs *ctx)
 {
-	return check_cookie(3, &test_kprobe_3_result);
+	return check_cookie(ctx, 3, &test_kprobe_3_result);
 }
diff --git a/tools/testing/selftests/bpf/progs/uprobe_multi_session.c b/tools/testing/selftests/bpf/progs/uprobe_multi_session.c
index 30bff90b68dc..6e46bb00ff58 100644
--- a/tools/testing/selftests/bpf/progs/uprobe_multi_session.c
+++ b/tools/testing/selftests/bpf/progs/uprobe_multi_session.c
@@ -1,9 +1,8 @@
 // SPDX-License-Identifier: GPL-2.0
-#include <linux/bpf.h>
+#include <vmlinux.h>
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_tracing.h>
 #include <stdbool.h>
-#include "bpf_kfuncs.h"
 #include "bpf_misc.h"
 
 char _license[] SEC("license") = "GPL";
@@ -51,7 +50,7 @@ static int uprobe_multi_check(void *ctx, bool is_return)
 SEC("uprobe.session//proc/self/exe:uprobe_multi_func_*")
 int uprobe(struct pt_regs *ctx)
 {
-	return uprobe_multi_check(ctx, bpf_session_is_return());
+	return uprobe_multi_check(ctx, bpf_session_is_return(ctx));
 }
 
 static __always_inline bool verify_sleepable_user_copy(void)
@@ -67,5 +66,5 @@ int uprobe_sleepable(struct pt_regs *ctx)
 {
 	if (verify_sleepable_user_copy())
 		uprobe_multi_sleep_result++;
-	return uprobe_multi_check(ctx, bpf_session_is_return());
+	return uprobe_multi_check(ctx, bpf_session_is_return(ctx));
 }
diff --git a/tools/testing/selftests/bpf/progs/uprobe_multi_session_cookie.c b/tools/testing/selftests/bpf/progs/uprobe_multi_session_cookie.c
index 5befdf944dc6..b5db196614a9 100644
--- a/tools/testing/selftests/bpf/progs/uprobe_multi_session_cookie.c
+++ b/tools/testing/selftests/bpf/progs/uprobe_multi_session_cookie.c
@@ -1,9 +1,8 @@
 // SPDX-License-Identifier: GPL-2.0
-#include <linux/bpf.h>
+#include <vmlinux.h>
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_tracing.h>
 #include <stdbool.h>
-#include "bpf_kfuncs.h"
 
 char _license[] SEC("license") = "GPL";
 
@@ -13,16 +12,16 @@ __u64 test_uprobe_1_result = 0;
 __u64 test_uprobe_2_result = 0;
 __u64 test_uprobe_3_result = 0;
 
-static int check_cookie(__u64 val, __u64 *result)
+static int check_cookie(struct pt_regs *ctx, __u64 val, __u64 *result)
 {
 	__u64 *cookie;
 
 	if (bpf_get_current_pid_tgid() >> 32 != pid)
 		return 1;
 
-	cookie = bpf_session_cookie();
+	cookie = bpf_session_cookie(ctx);
 
-	if (bpf_session_is_return())
+	if (bpf_session_is_return(ctx))
 		*result = *cookie == val ? val : 0;
 	else
 		*cookie = val;
@@ -32,17 +31,17 @@ static int check_cookie(__u64 val, __u64 *result)
 SEC("uprobe.session//proc/self/exe:uprobe_multi_func_1")
 int uprobe_1(struct pt_regs *ctx)
 {
-	return check_cookie(1, &test_uprobe_1_result);
+	return check_cookie(ctx, 1, &test_uprobe_1_result);
 }
 
 SEC("uprobe.session//proc/self/exe:uprobe_multi_func_2")
 int uprobe_2(struct pt_regs *ctx)
 {
-	return check_cookie(2, &test_uprobe_2_result);
+	return check_cookie(ctx, 2, &test_uprobe_2_result);
 }
 
 SEC("uprobe.session//proc/self/exe:uprobe_multi_func_3")
 int uprobe_3(struct pt_regs *ctx)
 {
-	return check_cookie(3, &test_uprobe_3_result);
+	return check_cookie(ctx, 3, &test_uprobe_3_result);
 }
diff --git a/tools/testing/selftests/bpf/progs/uprobe_multi_session_recursive.c b/tools/testing/selftests/bpf/progs/uprobe_multi_session_recursive.c
index 8fbcd69fae22..3ce309248a04 100644
--- a/tools/testing/selftests/bpf/progs/uprobe_multi_session_recursive.c
+++ b/tools/testing/selftests/bpf/progs/uprobe_multi_session_recursive.c
@@ -1,9 +1,8 @@
 // SPDX-License-Identifier: GPL-2.0
-#include <linux/bpf.h>
+#include <vmlinux.h>
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_tracing.h>
 #include <stdbool.h>
-#include "bpf_kfuncs.h"
 #include "bpf_misc.h"
 
 char _license[] SEC("license") = "GPL";
@@ -16,11 +15,11 @@ int idx_return = 0;
 __u64 test_uprobe_cookie_entry[6];
 __u64 test_uprobe_cookie_return[3];
 
-static int check_cookie(void)
+static int check_cookie(struct pt_regs *ctx)
 {
-	__u64 *cookie = bpf_session_cookie();
+	__u64 *cookie = bpf_session_cookie(ctx);
 
-	if (bpf_session_is_return()) {
+	if (bpf_session_is_return(ctx)) {
 		if (idx_return >= ARRAY_SIZE(test_uprobe_cookie_return))
 			return 1;
 		test_uprobe_cookie_return[idx_return++] = *cookie;
@@ -40,5 +39,5 @@ int uprobe_recursive(struct pt_regs *ctx)
 	if (bpf_get_current_pid_tgid() >> 32 != pid)
 		return 1;
 
-	return check_cookie();
+	return check_cookie(ctx);
 }
-- 
cgit v1.2.3


From 257c43688b143fd9805cdfef9d2623dde92989e6 Mon Sep 17 00:00:00 2001
From: Menglong Dong <menglong8.dong@gmail.com>
Date: Sat, 24 Jan 2026 14:20:03 +0800
Subject: libbpf: add fsession support

Add BPF_TRACE_FSESSION to libbpf.

Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
Link: https://lore.kernel.org/r/20260124062008.8657-9-dongml2@chinatelecom.cn
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/lib/bpf/bpf.c    | 1 +
 tools/lib/bpf/libbpf.c | 3 +++
 2 files changed, 4 insertions(+)

(limited to 'tools')

diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index 21b57a629916..5846de364209 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -794,6 +794,7 @@ int bpf_link_create(int prog_fd, int target_fd,
 	case BPF_TRACE_FENTRY:
 	case BPF_TRACE_FEXIT:
 	case BPF_MODIFY_RETURN:
+	case BPF_TRACE_FSESSION:
 	case BPF_LSM_MAC:
 		attr.link_create.tracing.cookie = OPTS_GET(opts, tracing.cookie, 0);
 		if (!OPTS_ZEROED(opts, tracing))
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index bbcfd72b07d5..0c8bf0b5cce4 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -115,6 +115,7 @@ static const char * const attach_type_name[] = {
 	[BPF_TRACE_FENTRY]		= "trace_fentry",
 	[BPF_TRACE_FEXIT]		= "trace_fexit",
 	[BPF_MODIFY_RETURN]		= "modify_return",
+	[BPF_TRACE_FSESSION]		= "trace_fsession",
 	[BPF_LSM_MAC]			= "lsm_mac",
 	[BPF_LSM_CGROUP]		= "lsm_cgroup",
 	[BPF_SK_LOOKUP]			= "sk_lookup",
@@ -9859,6 +9860,8 @@ static const struct bpf_sec_def section_defs[] = {
 	SEC_DEF("fentry.s+",		TRACING, BPF_TRACE_FENTRY, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_trace),
 	SEC_DEF("fmod_ret.s+",		TRACING, BPF_MODIFY_RETURN, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_trace),
 	SEC_DEF("fexit.s+",		TRACING, BPF_TRACE_FEXIT, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_trace),
+	SEC_DEF("fsession+",		TRACING, BPF_TRACE_FSESSION, SEC_ATTACH_BTF, attach_trace),
+	SEC_DEF("fsession.s+",		TRACING, BPF_TRACE_FSESSION, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_trace),
 	SEC_DEF("freplace+",		EXT, 0, SEC_ATTACH_BTF, attach_trace),
 	SEC_DEF("lsm+",			LSM, BPF_LSM_MAC, SEC_ATTACH_BTF, attach_lsm),
 	SEC_DEF("lsm.s+",		LSM, BPF_LSM_MAC, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_lsm),
-- 
cgit v1.2.3


From 85fc4be6d811372f8f9a2a131a092735418fdbf2 Mon Sep 17 00:00:00 2001
From: Menglong Dong <menglong8.dong@gmail.com>
Date: Sat, 24 Jan 2026 14:20:04 +0800
Subject: bpftool: add fsession support

Add BPF_TRACE_FSESSION to bpftool.

Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
Link: https://lore.kernel.org/r/20260124062008.8657-10-dongml2@chinatelecom.cn
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/bpf/bpftool/common.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'tools')

diff --git a/tools/bpf/bpftool/common.c b/tools/bpf/bpftool/common.c
index e8daf963ecef..8bfcff9e2f63 100644
--- a/tools/bpf/bpftool/common.c
+++ b/tools/bpf/bpftool/common.c
@@ -1191,6 +1191,7 @@ const char *bpf_attach_type_input_str(enum bpf_attach_type t)
 	case BPF_TRACE_FENTRY:			return "fentry";
 	case BPF_TRACE_FEXIT:			return "fexit";
 	case BPF_MODIFY_RETURN:			return "mod_ret";
+	case BPF_TRACE_FSESSION:		return "fsession";
 	case BPF_SK_REUSEPORT_SELECT:		return "sk_skb_reuseport_select";
 	case BPF_SK_REUSEPORT_SELECT_OR_MIGRATE:	return "sk_skb_reuseport_select_or_migrate";
 	default:	return libbpf_bpf_attach_type_str(t);
-- 
cgit v1.2.3


From f7afef5617b685c3491db3593ca09abc33815774 Mon Sep 17 00:00:00 2001
From: Menglong Dong <menglong8.dong@gmail.com>
Date: Sat, 24 Jan 2026 14:20:05 +0800
Subject: selftests/bpf: add testcases for fsession

Add testcases for BPF_TRACE_FSESSION. The function arguments and return
value are tested both in the entry and exit. And the kfunc
bpf_session_is_ret() is also tested.

Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
Link: https://lore.kernel.org/r/20260124062008.8657-11-dongml2@chinatelecom.cn
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 .../selftests/bpf/prog_tests/fsession_test.c       | 90 ++++++++++++++++++++
 tools/testing/selftests/bpf/progs/fsession_test.c  | 97 ++++++++++++++++++++++
 2 files changed, 187 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/fsession_test.c
 create mode 100644 tools/testing/selftests/bpf/progs/fsession_test.c

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/fsession_test.c b/tools/testing/selftests/bpf/prog_tests/fsession_test.c
new file mode 100644
index 000000000000..75bb42942b67
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/fsession_test.c
@@ -0,0 +1,90 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2025 ChinaTelecom */
+#include <test_progs.h>
+#include "fsession_test.skel.h"
+
+static int check_result(struct fsession_test *skel)
+{
+	LIBBPF_OPTS(bpf_test_run_opts, topts);
+	int err, prog_fd;
+
+	/* Trigger test function calls */
+	prog_fd = bpf_program__fd(skel->progs.test1);
+	err = bpf_prog_test_run_opts(prog_fd, &topts);
+	if (!ASSERT_OK(err, "test_run_opts err"))
+		return err;
+	if (!ASSERT_OK(topts.retval, "test_run_opts retval"))
+		return topts.retval;
+
+	for (int i = 0; i < sizeof(*skel->bss) / sizeof(__u64); i++) {
+		if (!ASSERT_EQ(((__u64 *)skel->bss)[i], 1, "test_result"))
+			return -EINVAL;
+	}
+
+	return 0;
+}
+
+static void test_fsession_basic(void)
+{
+	struct fsession_test *skel = NULL;
+	int err;
+
+	skel = fsession_test__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "fsession_test__open_and_load"))
+		goto cleanup;
+
+	err = fsession_test__attach(skel);
+	if (!ASSERT_OK(err, "fsession_attach"))
+		goto cleanup;
+
+	check_result(skel);
+cleanup:
+	fsession_test__destroy(skel);
+}
+
+static void test_fsession_reattach(void)
+{
+	struct fsession_test *skel = NULL;
+	int err;
+
+	skel = fsession_test__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "fsession_test__open_and_load"))
+		goto cleanup;
+
+	/* first attach */
+	err = fsession_test__attach(skel);
+	if (!ASSERT_OK(err, "fsession_first_attach"))
+		goto cleanup;
+
+	if (check_result(skel))
+		goto cleanup;
+
+	/* detach */
+	fsession_test__detach(skel);
+
+	/* reset counters */
+	memset(skel->bss, 0, sizeof(*skel->bss));
+
+	/* second attach */
+	err = fsession_test__attach(skel);
+	if (!ASSERT_OK(err, "fsession_second_attach"))
+		goto cleanup;
+
+	if (check_result(skel))
+		goto cleanup;
+
+cleanup:
+	fsession_test__destroy(skel);
+}
+
+void test_fsession_test(void)
+{
+#if !defined(__x86_64__)
+	test__skip();
+	return;
+#endif
+	if (test__start_subtest("fsession_test"))
+		test_fsession_basic();
+	if (test__start_subtest("fsession_reattach"))
+		test_fsession_reattach();
+}
diff --git a/tools/testing/selftests/bpf/progs/fsession_test.c b/tools/testing/selftests/bpf/progs/fsession_test.c
new file mode 100644
index 000000000000..0e1b66b2dddc
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/fsession_test.c
@@ -0,0 +1,97 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2025 ChinaTelecom */
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+char _license[] SEC("license") = "GPL";
+
+__u64 test1_entry_result = 0;
+__u64 test1_exit_result = 0;
+
+SEC("fsession/bpf_fentry_test1")
+int BPF_PROG(test1, int a, int ret)
+{
+	bool is_exit = bpf_session_is_return(ctx);
+
+	if (!is_exit) {
+		test1_entry_result = a == 1 && ret == 0;
+		return 0;
+	}
+
+	test1_exit_result = a == 1 && ret == 2;
+	return 0;
+}
+
+__u64 test2_entry_result = 0;
+__u64 test2_exit_result = 0;
+
+SEC("fsession/bpf_fentry_test3")
+int BPF_PROG(test2, char a, int b, __u64 c, int ret)
+{
+	bool is_exit = bpf_session_is_return(ctx);
+
+	if (!is_exit) {
+		test2_entry_result = a == 4 && b == 5 && c == 6 && ret == 0;
+		return 0;
+	}
+
+	test2_exit_result = a == 4 && b == 5 && c == 6 && ret == 15;
+	return 0;
+}
+
+__u64 test3_entry_result = 0;
+__u64 test3_exit_result = 0;
+
+SEC("fsession/bpf_fentry_test4")
+int BPF_PROG(test3, void *a, char b, int c, __u64 d, int ret)
+{
+	bool is_exit = bpf_session_is_return(ctx);
+
+	if (!is_exit) {
+		test3_entry_result = a == (void *)7 && b == 8 && c == 9 && d == 10 && ret == 0;
+		return 0;
+	}
+
+	test3_exit_result = a == (void *)7 && b == 8 && c == 9 && d == 10 && ret == 34;
+	return 0;
+}
+
+__u64 test4_entry_result = 0;
+__u64 test4_exit_result = 0;
+
+SEC("fsession/bpf_fentry_test5")
+int BPF_PROG(test4, __u64 a, void *b, short c, int d, __u64 e, int ret)
+{
+	bool is_exit = bpf_session_is_return(ctx);
+
+	if (!is_exit) {
+		test4_entry_result = a == 11 && b == (void *)12 && c == 13 && d == 14 &&
+			e == 15 && ret == 0;
+		return 0;
+	}
+
+	test4_exit_result = a == 11 && b == (void *)12 && c == 13 && d == 14 &&
+		e == 15 && ret == 65;
+	return 0;
+}
+
+__u64 test5_entry_result = 0;
+__u64 test5_exit_result = 0;
+
+SEC("fsession/bpf_fentry_test7")
+int BPF_PROG(test5, struct bpf_fentry_test_t *arg, int ret)
+{
+	bool is_exit = bpf_session_is_return(ctx);
+
+	if (!is_exit) {
+		if (!arg)
+			test5_entry_result = ret == 0;
+		return 0;
+	}
+
+	if (!arg)
+		test5_exit_result = 1;
+	return 0;
+}
+
-- 
cgit v1.2.3


From a5533a6eaa5b602fe54e53d85f787e09eab4e771 Mon Sep 17 00:00:00 2001
From: Menglong Dong <menglong8.dong@gmail.com>
Date: Sat, 24 Jan 2026 14:20:06 +0800
Subject: selftests/bpf: test bpf_get_func_* for fsession

Test following bpf helper for fsession:
  bpf_get_func_arg()
  bpf_get_func_arg_cnt()
  bpf_get_func_ret()
  bpf_get_func_ip()

Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
Link: https://lore.kernel.org/r/20260124062008.8657-12-dongml2@chinatelecom.cn
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 .../selftests/bpf/prog_tests/get_func_args_test.c  |  1 +
 .../selftests/bpf/prog_tests/get_func_ip_test.c    |  2 ++
 .../selftests/bpf/progs/get_func_args_test.c       | 40 +++++++++++++++++++++-
 .../testing/selftests/bpf/progs/get_func_ip_test.c | 23 +++++++++++++
 4 files changed, 65 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/get_func_args_test.c b/tools/testing/selftests/bpf/prog_tests/get_func_args_test.c
index fadee95d3ae8..96b27de05524 100644
--- a/tools/testing/selftests/bpf/prog_tests/get_func_args_test.c
+++ b/tools/testing/selftests/bpf/prog_tests/get_func_args_test.c
@@ -41,6 +41,7 @@ void test_get_func_args_test(void)
 	ASSERT_EQ(skel->bss->test4_result, 1, "test4_result");
 	ASSERT_EQ(skel->bss->test5_result, 1, "test5_result");
 	ASSERT_EQ(skel->bss->test6_result, 1, "test6_result");
+	ASSERT_EQ(skel->bss->test7_result, 1, "test7_result");
 
 cleanup:
 	get_func_args_test__destroy(skel);
diff --git a/tools/testing/selftests/bpf/prog_tests/get_func_ip_test.c b/tools/testing/selftests/bpf/prog_tests/get_func_ip_test.c
index c40242dfa8fb..7772a0f288d3 100644
--- a/tools/testing/selftests/bpf/prog_tests/get_func_ip_test.c
+++ b/tools/testing/selftests/bpf/prog_tests/get_func_ip_test.c
@@ -46,6 +46,8 @@ static void test_function_entry(void)
 	ASSERT_EQ(skel->bss->test5_result, 1, "test5_result");
 	ASSERT_EQ(skel->bss->test7_result, 1, "test7_result");
 	ASSERT_EQ(skel->bss->test8_result, 1, "test8_result");
+	ASSERT_EQ(skel->bss->test9_entry_result, 1, "test9_entry_result");
+	ASSERT_EQ(skel->bss->test9_exit_result, 1, "test9_exit_result");
 
 cleanup:
 	get_func_ip_test__destroy(skel);
diff --git a/tools/testing/selftests/bpf/progs/get_func_args_test.c b/tools/testing/selftests/bpf/progs/get_func_args_test.c
index 5b7233afef05..0a3236a7a109 100644
--- a/tools/testing/selftests/bpf/progs/get_func_args_test.c
+++ b/tools/testing/selftests/bpf/progs/get_func_args_test.c
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-#include <linux/bpf.h>
+#include <vmlinux.h>
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_tracing.h>
 #include <errno.h>
@@ -165,3 +165,41 @@ int BPF_PROG(tp_test2)
 
 	return 0;
 }
+
+__u64 test7_result = 0;
+#ifdef __TARGET_ARCH_x86
+SEC("fsession/bpf_fentry_test1")
+int BPF_PROG(test7)
+{
+	__u64 cnt = bpf_get_func_arg_cnt(ctx);
+	__u64 a = 0, z = 0, ret = 0;
+	__s64 err;
+
+	test7_result = cnt == 1;
+
+	/* valid arguments */
+	err = bpf_get_func_arg(ctx, 0, &a);
+	test7_result &= err == 0 && ((int) a == 1);
+
+	/* not valid argument */
+	err = bpf_get_func_arg(ctx, 1, &z);
+	test7_result &= err == -EINVAL;
+
+	if (bpf_session_is_return(ctx)) {
+		err = bpf_get_func_ret(ctx, &ret);
+		test7_result &= err == 0 && ret == 2;
+	} else {
+		err = bpf_get_func_ret(ctx, &ret);
+		test7_result &= err == 0 && ret == 0;
+	}
+
+	return 0;
+}
+#else
+SEC("fentry/bpf_fentry_test1")
+int BPF_PROG(test7)
+{
+	test7_result = 1;
+	return 0;
+}
+#endif
diff --git a/tools/testing/selftests/bpf/progs/get_func_ip_test.c b/tools/testing/selftests/bpf/progs/get_func_ip_test.c
index 2011cacdeb18..65f7e1f182bf 100644
--- a/tools/testing/selftests/bpf/progs/get_func_ip_test.c
+++ b/tools/testing/selftests/bpf/progs/get_func_ip_test.c
@@ -103,3 +103,26 @@ int BPF_URETPROBE(test8, int ret)
 	test8_result = (const void *) addr == (const void *) uprobe_trigger;
 	return 0;
 }
+
+__u64 test9_entry_result = 0;
+__u64 test9_exit_result = 0;
+#ifdef __TARGET_ARCH_x86
+SEC("fsession/bpf_fentry_test1")
+int BPF_PROG(test9, int a)
+{
+	__u64 addr = bpf_get_func_ip(ctx);
+
+	if (bpf_session_is_return(ctx))
+		test9_exit_result = (const void *) addr == &bpf_fentry_test1;
+	else
+		test9_entry_result = (const void *) addr == &bpf_fentry_test1;
+	return 0;
+}
+#else
+SEC("fentry/bpf_fentry_test1")
+int BPF_PROG(test9, int a)
+{
+	test9_entry_result = test9_exit_result = 1;
+	return 0;
+}
+#endif
-- 
cgit v1.2.3


From 8909b3fb23e245f8ade903dfcfcc43522cf28a56 Mon Sep 17 00:00:00 2001
From: Menglong Dong <menglong8.dong@gmail.com>
Date: Sat, 24 Jan 2026 14:20:07 +0800
Subject: selftests/bpf: add testcases for fsession cookie

Test session cookie for fsession. Multiple fsession BPF progs is attached
to bpf_fentry_test1() and session cookie is read and write in the
testcase.

bpf_get_func_ip() will influence the layout of the session cookies, so we
test the cookie in two case: with and without bpf_get_func_ip().

Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
Link: https://lore.kernel.org/r/20260124062008.8657-13-dongml2@chinatelecom.cn
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 .../selftests/bpf/prog_tests/fsession_test.c       | 34 +++++++++++
 tools/testing/selftests/bpf/progs/fsession_test.c  | 66 ++++++++++++++++++++++
 2 files changed, 100 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/fsession_test.c b/tools/testing/selftests/bpf/prog_tests/fsession_test.c
index 75bb42942b67..0c4b428e1cee 100644
--- a/tools/testing/selftests/bpf/prog_tests/fsession_test.c
+++ b/tools/testing/selftests/bpf/prog_tests/fsession_test.c
@@ -77,6 +77,38 @@ cleanup:
 	fsession_test__destroy(skel);
 }
 
+static void test_fsession_cookie(void)
+{
+	struct fsession_test *skel = NULL;
+	int err;
+
+	skel = fsession_test__open();
+	if (!ASSERT_OK_PTR(skel, "fsession_test__open"))
+		goto cleanup;
+
+	/*
+	 * The test_fsession_basic() will test the session cookie with
+	 * bpf_get_func_ip() case, so we need only check
+	 * the cookie without bpf_get_func_ip() case here
+	 */
+	bpf_program__set_autoload(skel->progs.test6, false);
+
+	err = fsession_test__load(skel);
+	if (!ASSERT_OK(err, "fsession_test__load"))
+		goto cleanup;
+
+	err = fsession_test__attach(skel);
+	if (!ASSERT_OK(err, "fsession_attach"))
+		goto cleanup;
+
+	skel->bss->test6_entry_result = 1;
+	skel->bss->test6_exit_result = 1;
+
+	check_result(skel);
+cleanup:
+	fsession_test__destroy(skel);
+}
+
 void test_fsession_test(void)
 {
 #if !defined(__x86_64__)
@@ -87,4 +119,6 @@ void test_fsession_test(void)
 		test_fsession_basic();
 	if (test__start_subtest("fsession_reattach"))
 		test_fsession_reattach();
+	if (test__start_subtest("fsession_cookie"))
+		test_fsession_cookie();
 }
diff --git a/tools/testing/selftests/bpf/progs/fsession_test.c b/tools/testing/selftests/bpf/progs/fsession_test.c
index 0e1b66b2dddc..211332bdcccb 100644
--- a/tools/testing/selftests/bpf/progs/fsession_test.c
+++ b/tools/testing/selftests/bpf/progs/fsession_test.c
@@ -95,3 +95,69 @@ int BPF_PROG(test5, struct bpf_fentry_test_t *arg, int ret)
 	return 0;
 }
 
+__u64 test6_entry_result = 0;
+__u64 test6_exit_result = 0;
+SEC("fsession/bpf_fentry_test1")
+int BPF_PROG(test6, int a)
+{
+	__u64 addr = bpf_get_func_ip(ctx);
+
+	if (bpf_session_is_return(ctx))
+		test6_exit_result = (const void *) addr == &bpf_fentry_test1;
+	else
+		test6_entry_result = (const void *) addr == &bpf_fentry_test1;
+	return 0;
+}
+
+__u64 test7_entry_ok = 0;
+__u64 test7_exit_ok = 0;
+SEC("fsession/bpf_fentry_test1")
+int BPF_PROG(test7, int a)
+{
+	volatile __u64 *cookie = bpf_session_cookie(ctx);
+
+	if (!bpf_session_is_return(ctx)) {
+		*cookie = 0xAAAABBBBCCCCDDDDull;
+		test7_entry_ok = *cookie == 0xAAAABBBBCCCCDDDDull;
+		return 0;
+	}
+
+	test7_exit_ok = *cookie == 0xAAAABBBBCCCCDDDDull;
+	return 0;
+}
+
+__u64 test8_entry_ok = 0;
+__u64 test8_exit_ok = 0;
+
+SEC("fsession/bpf_fentry_test1")
+int BPF_PROG(test8, int a)
+{
+	volatile __u64 *cookie = bpf_session_cookie(ctx);
+
+	if (!bpf_session_is_return(ctx)) {
+		*cookie = 0x1111222233334444ull;
+		test8_entry_ok = *cookie == 0x1111222233334444ull;
+		return 0;
+	}
+
+	test8_exit_ok = *cookie == 0x1111222233334444ull;
+	return 0;
+}
+
+__u64 test9_entry_result = 0;
+__u64 test9_exit_result = 0;
+
+SEC("fsession/bpf_fentry_test1")
+int BPF_PROG(test9, int a, int ret)
+{
+	__u64 *cookie = bpf_session_cookie(ctx);
+
+	if (!bpf_session_is_return(ctx)) {
+		test9_entry_result = a == 1 && ret == 0;
+		*cookie = 0x123456ULL;
+		return 0;
+	}
+
+	test9_exit_result = a == 1 && ret == 2 && *cookie == 0x123456ULL;
+	return 0;
+}
-- 
cgit v1.2.3


From cb4bfacfb0110aa1b10ab60c64a3df0e176998c5 Mon Sep 17 00:00:00 2001
From: Menglong Dong <menglong8.dong@gmail.com>
Date: Sat, 24 Jan 2026 14:20:08 +0800
Subject: selftests/bpf: test fsession mixed with fentry and fexit

Test the fsession when it is used together with fentry, fexit.

Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
Link: https://lore.kernel.org/r/20260124062008.8657-14-dongml2@chinatelecom.cn
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/progs/fsession_test.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/progs/fsession_test.c b/tools/testing/selftests/bpf/progs/fsession_test.c
index 211332bdcccb..86e8a2fe467e 100644
--- a/tools/testing/selftests/bpf/progs/fsession_test.c
+++ b/tools/testing/selftests/bpf/progs/fsession_test.c
@@ -161,3 +161,19 @@ int BPF_PROG(test9, int a, int ret)
 	test9_exit_result = a == 1 && ret == 2 && *cookie == 0x123456ULL;
 	return 0;
 }
+
+__u64 test10_result = 0;
+SEC("fexit/bpf_fentry_test1")
+int BPF_PROG(test10, int a, int ret)
+{
+	test10_result = a == 1 && ret == 2;
+	return 0;
+}
+
+__u64 test11_result = 0;
+SEC("fentry/bpf_fentry_test1")
+int BPF_PROG(test11, int a)
+{
+	test11_result = a == 1;
+	return 0;
+}
-- 
cgit v1.2.3


From c31df36bd26a5ed8898bb3fcc8c37ea9157ba784 Mon Sep 17 00:00:00 2001
From: Changwoo Min <changwoo@igalia.com>
Date: Sun, 25 Jan 2026 20:54:12 +0900
Subject: selftests/bpf: Introduce execution context detection helpers

Introduce bpf_in_nmi(), bpf_in_hardirq(), bpf_in_serving_softirq(), and
bpf_in_task() inline helpers in bpf_experimental.h. These allow BPF
programs to query the current execution context with higher granularity
than the existing bpf_in_interrupt() helper.

While BPF programs can often infer their context from attachment points,
subsystems like sched_ext may call the same BPF logic from multiple
contexts (e.g., task-to-task wake-ups vs. interrupt-to-task wake-ups).
These helpers provide a reliable way for logic to branch based on
the current CPU execution state.

Implementing these as BPF-native inline helpers wrapping
get_preempt_count() allows the compiler and JIT to inline the logic. The
implementation accounts for differences in preempt_count layout between
standard and PREEMPT_RT kernels.

Reviewed-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Changwoo Min <changwoo@igalia.com>
Link: https://lore.kernel.org/r/20260125115413.117502-2-changwoo@igalia.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/bpf_experimental.h | 58 ++++++++++++++++++++++++++
 1 file changed, 58 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/bpf_experimental.h b/tools/testing/selftests/bpf/bpf_experimental.h
index 68a49b1f77ae..a39576c8ba04 100644
--- a/tools/testing/selftests/bpf/bpf_experimental.h
+++ b/tools/testing/selftests/bpf/bpf_experimental.h
@@ -610,6 +610,8 @@ extern int bpf_cgroup_read_xattr(struct cgroup *cgroup, const char *name__str,
 #define HARDIRQ_MASK	(__IRQ_MASK(HARDIRQ_BITS) << HARDIRQ_SHIFT)
 #define NMI_MASK	(__IRQ_MASK(NMI_BITS)     << NMI_SHIFT)
 
+#define SOFTIRQ_OFFSET	(1UL << SOFTIRQ_SHIFT)
+
 extern bool CONFIG_PREEMPT_RT __kconfig __weak;
 #ifdef bpf_target_x86
 extern const int __preempt_count __ksym;
@@ -648,4 +650,60 @@ static inline int bpf_in_interrupt(void)
 	       (tsk->softirq_disable_cnt & SOFTIRQ_MASK);
 }
 
+/* Description
+ *	Report whether it is in NMI context. Only works on the following archs:
+ *	* x86
+ *	* arm64
+ */
+static inline int bpf_in_nmi(void)
+{
+	return get_preempt_count() & NMI_MASK;
+}
+
+/* Description
+ *	Report whether it is in hard IRQ context. Only works on the following archs:
+ *	* x86
+ *	* arm64
+ */
+static inline int bpf_in_hardirq(void)
+{
+	return get_preempt_count() & HARDIRQ_MASK;
+}
+
+/* Description
+ *	Report whether it is in softirq context. Only works on the following archs:
+ *	* x86
+ *	* arm64
+ */
+static inline int bpf_in_serving_softirq(void)
+{
+	struct task_struct___preempt_rt *tsk;
+	int pcnt;
+
+	pcnt = get_preempt_count();
+	if (!CONFIG_PREEMPT_RT)
+		return (pcnt & SOFTIRQ_MASK) & SOFTIRQ_OFFSET;
+
+	tsk = (void *) bpf_get_current_task_btf();
+	return (tsk->softirq_disable_cnt & SOFTIRQ_MASK) & SOFTIRQ_OFFSET;
+}
+
+/* Description
+ *	Report whether it is in task context. Only works on the following archs:
+ *	* x86
+ *	* arm64
+ */
+static inline int bpf_in_task(void)
+{
+	struct task_struct___preempt_rt *tsk;
+	int pcnt;
+
+	pcnt = get_preempt_count();
+	if (!CONFIG_PREEMPT_RT)
+		return !(pcnt & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET));
+
+	tsk = (void *) bpf_get_current_task_btf();
+	return !((pcnt & (NMI_MASK | HARDIRQ_MASK)) |
+		 ((tsk->softirq_disable_cnt & SOFTIRQ_MASK) & SOFTIRQ_OFFSET));
+}
 #endif
-- 
cgit v1.2.3


From 221b5e76c1c6e8ad4fa7c95a689e44ff45daab1c Mon Sep 17 00:00:00 2001
From: Changwoo Min <changwoo@igalia.com>
Date: Sun, 25 Jan 2026 20:54:13 +0900
Subject: selftests/bpf: Add tests for execution context helpers

Add a new selftest suite `exe_ctx` to verify the accuracy of the
bpf_in_task(), bpf_in_hardirq(), and bpf_in_serving_softirq() helpers
introduced in bpf_experimental.h.

Testing these execution contexts deterministically requires crossing
context boundaries within a single CPU. To achieve this, the test
implements a "Trigger-Observer" pattern using bpf_testmod:

1. Trigger: A BPF syscall program calls a new bpf_testmod kfunc
   bpf_kfunc_trigger_ctx_check().
2. Task to HardIRQ: The kfunc uses irq_work_queue() to trigger a
   self-IPI on the local CPU.
3. HardIRQ to SoftIRQ: The irq_work handler calls a dummy function
   (observed by BPF fentry) and then schedules a tasklet to
   transition into SoftIRQ context.

The user-space runner ensures determinism by pinning itself to CPU 0
before execution, forcing the entire interrupt chain to remain on a
single core. Dummy noinline functions with compiler barriers are
added to bpf_testmod.c to serve as stable attachment points for
fentry programs. A retry loop is used in user-space to wait for the
asynchronous SoftIRQ to complete.

Note that testing on s390x is avoided because supporting those helpers
purely in BPF on s390x is not possible at this point.

Reviewed-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Changwoo Min <changwoo@igalia.com>
Link: https://lore.kernel.org/r/20260125115413.117502-3-changwoo@igalia.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/DENYLIST.s390x         |  1 +
 tools/testing/selftests/bpf/prog_tests/exe_ctx.c   | 59 ++++++++++++++++++++++
 tools/testing/selftests/bpf/progs/test_ctx.c       | 48 ++++++++++++++++++
 .../testing/selftests/bpf/test_kmods/bpf_testmod.c | 32 ++++++++++++
 .../selftests/bpf/test_kmods/bpf_testmod_kfunc.h   |  4 ++
 5 files changed, 144 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/exe_ctx.c
 create mode 100644 tools/testing/selftests/bpf/progs/test_ctx.c

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/DENYLIST.s390x b/tools/testing/selftests/bpf/DENYLIST.s390x
index a17baf8c6fd7..f7e1e5f5511c 100644
--- a/tools/testing/selftests/bpf/DENYLIST.s390x
+++ b/tools/testing/selftests/bpf/DENYLIST.s390x
@@ -1,4 +1,5 @@
 # TEMPORARY
 # Alphabetical order
+exe_ctx                                  # execution context check (e.g., hardirq, softirq, etc)
 get_stack_raw_tp                         # user_stack corrupted user stack                                             (no backchain userspace)
 stacktrace_build_id                      # compare_map_keys stackid_hmap vs. stackmap err -2 errno 2                   (?)
diff --git a/tools/testing/selftests/bpf/prog_tests/exe_ctx.c b/tools/testing/selftests/bpf/prog_tests/exe_ctx.c
new file mode 100644
index 000000000000..aed6a6ef0876
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/exe_ctx.c
@@ -0,0 +1,59 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2026 Valve Corporation.
+ * Author: Changwoo Min <changwoo@igalia.com>
+ */
+
+#include <test_progs.h>
+#include <sys/syscall.h>
+#include "test_ctx.skel.h"
+
+void test_exe_ctx(void)
+{
+	LIBBPF_OPTS(bpf_test_run_opts, opts);
+	cpu_set_t old_cpuset, target_cpuset;
+	struct test_ctx *skel;
+	int err, prog_fd;
+
+	/* 1. Pin the current process to CPU 0. */
+	if (sched_getaffinity(0, sizeof(old_cpuset), &old_cpuset) == 0) {
+		CPU_ZERO(&target_cpuset);
+		CPU_SET(0, &target_cpuset);
+		ASSERT_OK(sched_setaffinity(0, sizeof(target_cpuset),
+					    &target_cpuset), "setaffinity");
+	}
+
+	skel = test_ctx__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "skel_load"))
+		goto restore_affinity;
+
+	err = test_ctx__attach(skel);
+	if (!ASSERT_OK(err, "skel_attach"))
+		goto cleanup;
+
+	/* 2. When we run this, the kernel will execute the BPF prog on CPU 0. */
+	prog_fd = bpf_program__fd(skel->progs.trigger_all_contexts);
+	err = bpf_prog_test_run_opts(prog_fd, &opts);
+	ASSERT_OK(err, "test_run_trigger");
+
+	/* 3. Wait for the local CPU's softirq/tasklet to finish. */
+	for (int i = 0; i < 1000; i++) {
+		if (skel->bss->count_task > 0 &&
+		    skel->bss->count_hardirq > 0 &&
+		    skel->bss->count_softirq > 0)
+			break;
+		usleep(1000); /* Wait 1ms per iteration, up to 1 sec total */
+	}
+
+	/* On CPU 0, these should now all be non-zero. */
+	ASSERT_GT(skel->bss->count_task, 0, "task_ok");
+	ASSERT_GT(skel->bss->count_hardirq, 0, "hardirq_ok");
+	ASSERT_GT(skel->bss->count_softirq, 0, "softirq_ok");
+
+cleanup:
+	test_ctx__destroy(skel);
+
+restore_affinity:
+	ASSERT_OK(sched_setaffinity(0, sizeof(old_cpuset), &old_cpuset),
+		  "restore_affinity");
+}
diff --git a/tools/testing/selftests/bpf/progs/test_ctx.c b/tools/testing/selftests/bpf/progs/test_ctx.c
new file mode 100644
index 000000000000..7d4995506717
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_ctx.c
@@ -0,0 +1,48 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2026 Valve Corporation.
+ * Author: Changwoo Min <changwoo@igalia.com>
+ */
+
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include "bpf_experimental.h"
+
+char _license[] SEC("license") = "GPL";
+
+extern void bpf_kfunc_trigger_ctx_check(void) __ksym;
+
+int count_hardirq;
+int count_softirq;
+int count_task;
+
+/* Triggered via bpf_prog_test_run from user-space */
+SEC("syscall")
+int trigger_all_contexts(void *ctx)
+{
+	if (bpf_in_task())
+		__sync_fetch_and_add(&count_task, 1);
+
+	/* Trigger the firing of a hardirq and softirq for test. */
+	bpf_kfunc_trigger_ctx_check();
+	return 0;
+}
+
+/* Observer for HardIRQ */
+SEC("fentry/bpf_testmod_test_hardirq_fn")
+int BPF_PROG(on_hardirq)
+{
+	if (bpf_in_hardirq())
+		__sync_fetch_and_add(&count_hardirq, 1);
+	return 0;
+}
+
+/* Observer for SoftIRQ */
+SEC("fentry/bpf_testmod_test_softirq_fn")
+int BPF_PROG(on_softirq)
+{
+	if (bpf_in_serving_softirq())
+		__sync_fetch_and_add(&count_softirq, 1);
+	return 0;
+}
diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c
index 77a81fa8ec6a..186a25ab429a 100644
--- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c
+++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c
@@ -1168,6 +1168,33 @@ __bpf_kfunc int bpf_kfunc_implicit_arg(int a, struct bpf_prog_aux *aux);
 __bpf_kfunc int bpf_kfunc_implicit_arg_legacy(int a, int b, struct bpf_prog_aux *aux);
 __bpf_kfunc int bpf_kfunc_implicit_arg_legacy_impl(int a, int b, struct bpf_prog_aux *aux);
 
+/* hook targets */
+noinline void bpf_testmod_test_hardirq_fn(void) { barrier(); }
+noinline void bpf_testmod_test_softirq_fn(void) { barrier(); }
+
+/* Tasklet for SoftIRQ context */
+static void ctx_check_tasklet_fn(struct tasklet_struct *t)
+{
+	bpf_testmod_test_softirq_fn();
+}
+
+DECLARE_TASKLET(ctx_check_tasklet, ctx_check_tasklet_fn);
+
+/* IRQ Work for HardIRQ context */
+static void ctx_check_irq_fn(struct irq_work *work)
+{
+	bpf_testmod_test_hardirq_fn();
+	tasklet_schedule(&ctx_check_tasklet);
+}
+
+static struct irq_work ctx_check_irq = IRQ_WORK_INIT_HARD(ctx_check_irq_fn);
+
+/* The kfunc trigger */
+__bpf_kfunc void bpf_kfunc_trigger_ctx_check(void)
+{
+	irq_work_queue(&ctx_check_irq);
+}
+
 BTF_KFUNCS_START(bpf_testmod_check_kfunc_ids)
 BTF_ID_FLAGS(func, bpf_testmod_test_mod_kfunc)
 BTF_ID_FLAGS(func, bpf_kfunc_call_test1)
@@ -1213,6 +1240,7 @@ BTF_ID_FLAGS(func, bpf_kfunc_multi_st_ops_test_1_assoc, KF_IMPLICIT_ARGS)
 BTF_ID_FLAGS(func, bpf_kfunc_implicit_arg, KF_IMPLICIT_ARGS)
 BTF_ID_FLAGS(func, bpf_kfunc_implicit_arg_legacy, KF_IMPLICIT_ARGS)
 BTF_ID_FLAGS(func, bpf_kfunc_implicit_arg_legacy_impl)
+BTF_ID_FLAGS(func, bpf_kfunc_trigger_ctx_check)
 BTF_KFUNCS_END(bpf_testmod_check_kfunc_ids)
 
 static int bpf_testmod_ops_init(struct btf *btf)
@@ -1844,6 +1872,10 @@ static void bpf_testmod_exit(void)
 	while (refcount_read(&prog_test_struct.cnt) > 1)
 		msleep(20);
 
+	/* Clean up irqwork and tasklet */
+	irq_work_sync(&ctx_check_irq);
+	tasklet_kill(&ctx_check_tasklet);
+
 	bpf_kfunc_close_sock();
 	sysfs_remove_bin_file(kernel_kobj, &bin_attr_bpf_testmod_file);
 	unregister_bpf_testmod_uprobe();
diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h b/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h
index 10f89f06245f..d5c5454e257e 100644
--- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h
+++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h
@@ -169,4 +169,8 @@ extern int bpf_kfunc_multi_st_ops_test_1_assoc(struct st_ops_args *args) __weak
 struct prog_test_member *bpf_kfunc_get_default_trusted_ptr_test(void) __ksym;
 void bpf_kfunc_put_default_trusted_ptr_test(struct prog_test_member *trusted_ptr) __ksym;
 
+void bpf_testmod_test_hardirq_fn(void);
+void bpf_testmod_test_softirq_fn(void);
+void bpf_kfunc_trigger_ctx_check(void) __ksym;
+
 #endif /* _BPF_TESTMOD_KFUNC_H */
-- 
cgit v1.2.3


From 8e5bcc3a955a2cc4460b391f55d3b49905eb248e Mon Sep 17 00:00:00 2001
From: Alexander Atanasov <alex@zazolabs.com>
Date: Sun, 25 Jan 2026 08:57:46 +0000
Subject: selftests: ublk: add missing gitignore for metadata_size binary

A new utility metadata_size was added in
commit 261b67f4e347 ("selftests: ublk: add utility to get block device metadata size")
but it was not added to .gitignore. Fix that by adding it there.

While at it sort all entries alphabetically and add a SPDX license header.

Reviewed-by: Caleb Sander Mateos <csander@purestorage.com>
Fixes: 261b67f4e347 ("selftests: ublk: add utility to get block device metadata size")
Signed-off-by: Alexander Atanasov <alex@zazolabs.com>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 tools/testing/selftests/ublk/.gitignore | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/ublk/.gitignore b/tools/testing/selftests/ublk/.gitignore
index 8b2871ea7751..e17bd28f27e0 100644
--- a/tools/testing/selftests/ublk/.gitignore
+++ b/tools/testing/selftests/ublk/.gitignore
@@ -1,3 +1,5 @@
-kublk
-/tools
+# SPDX-License-Identifier: GPL-2.0
 *-verify.state
+/tools
+kublk
+metadata_size
-- 
cgit v1.2.3


From 1742272bd3fae6362301d0f11eb9db9030348afc Mon Sep 17 00:00:00 2001
From: Fernando Fernandez Mancera <fmancera@suse.de>
Date: Wed, 21 Jan 2026 20:44:09 +0100
Subject: selftests: net: add ipv6 ping to local address from localhost

Test ipv6 pinging to local configured address and linklocal address from
localhost with -I ::1.

Signed-off-by: Fernando Fernandez Mancera <fmancera@suse.de>
Reviewed-by: David Ahern <dsahern@kernel.org>
Link: https://patch.msgid.link/20260121194409.6749-2-fmancera@suse.de
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/fcnal-test.sh | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/fcnal-test.sh b/tools/testing/selftests/net/fcnal-test.sh
index 844a580ae74e..890c3f8e51bb 100755
--- a/tools/testing/selftests/net/fcnal-test.sh
+++ b/tools/testing/selftests/net/fcnal-test.sh
@@ -2327,6 +2327,13 @@ ipv6_ping_novrf()
 		log_test_addr ${a} $? 2 "ping local, device bind"
 	done
 
+	for a in ${NSA_LO_IP6} ${NSA_LINKIP6}%${NSA_DEV} ${NSA_IP6}
+	do
+		log_start
+		run_cmd ${ping6} -c1 -w1 -I ::1 ${a}
+		log_test_addr ${a} $? 0 "ping local, from localhost"
+	done
+
 	#
 	# ip rule blocks address
 	#
-- 
cgit v1.2.3


From 3c58f03e805f8f9025f09fe393103947dca49c57 Mon Sep 17 00:00:00 2001
From: I-Hsin Cheng <richard120310@gmail.com>
Date: Sat, 24 Jan 2026 20:32:41 +0800
Subject: kselftest/arm64: Add missing file in .gitignore

The binary generated by check_hugetlb_options is missing in .gitignore
under the directory. Add it into the file so it won't be logged into
version control.

Signed-off-by: I-Hsin Cheng <richard120310@gmail.com>
Signed-off-by: Will Deacon <will@kernel.org>
---
 tools/testing/selftests/arm64/mte/.gitignore | 1 +
 1 file changed, 1 insertion(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/arm64/mte/.gitignore b/tools/testing/selftests/arm64/mte/.gitignore
index 052d0f9f92b3..f6937f890039 100644
--- a/tools/testing/selftests/arm64/mte/.gitignore
+++ b/tools/testing/selftests/arm64/mte/.gitignore
@@ -6,3 +6,4 @@ check_mmap_options
 check_prctl
 check_ksm_options
 check_user_mem
+check_hugetlb_options
-- 
cgit v1.2.3


From 87a6e3b6c494ac519548c30b82b0d87b233b9649 Mon Sep 17 00:00:00 2001
From: Khushal Chitturi <kc9282016@gmail.com>
Date: Wed, 19 Nov 2025 01:22:58 +0530
Subject: xdrgen: improve error reporting for invalid void declarations

RFC 4506 defines void as a zero-length type that may appear only as
union arms or as program argument/result types. It cannot be declared
with an identifier, so constructs like "typedef void temp;" are not
valid XDR.

Previously, xdrgen raised a NotImplementedError when it encountered a
void declaration in a typedef. Which was misleading, as the problem is an
invalid RPC specification rather than missing functionality in xdrgen.

This patch replaces the NotImplementedError for _XdrVoid in typedef
handling with a clearer ValueError that specifies incorrect use of void
in the XDR input, making it clear that the issue lies in the RPC
specification being parsed.

Signed-off-by: Khushal Chitturi <kc9282016@gmail.com>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 tools/net/sunrpc/xdrgen/generators/typedef.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/net/sunrpc/xdrgen/generators/typedef.py b/tools/net/sunrpc/xdrgen/generators/typedef.py
index fab72e9d6915..75e3a40e14e1 100644
--- a/tools/net/sunrpc/xdrgen/generators/typedef.py
+++ b/tools/net/sunrpc/xdrgen/generators/typedef.py
@@ -58,7 +58,7 @@ def emit_typedef_declaration(environment: Environment, node: _XdrDeclaration) ->
     elif isinstance(node, _XdrOptionalData):
         raise NotImplementedError("<optional_data> typedef not yet implemented")
     elif isinstance(node, _XdrVoid):
-        raise NotImplementedError("<void> typedef not yet implemented")
+        raise ValueError("invalid void usage in RPC Specification")
     else:
         raise NotImplementedError("typedef: type not recognized")
 
@@ -104,7 +104,7 @@ def emit_type_definition(environment: Environment, node: _XdrDeclaration) -> Non
     elif isinstance(node, _XdrOptionalData):
         raise NotImplementedError("<optional_data> typedef not yet implemented")
     elif isinstance(node, _XdrVoid):
-        raise NotImplementedError("<void> typedef not yet implemented")
+        raise ValueError("invalid void usage in RPC Specification")
     else:
         raise NotImplementedError("typedef: type not recognized")
 
@@ -165,7 +165,7 @@ def emit_typedef_decoder(environment: Environment, node: _XdrDeclaration) -> Non
     elif isinstance(node, _XdrOptionalData):
         raise NotImplementedError("<optional_data> typedef not yet implemented")
     elif isinstance(node, _XdrVoid):
-        raise NotImplementedError("<void> typedef not yet implemented")
+        raise ValueError("invalid void usage in RPC Specification")
     else:
         raise NotImplementedError("typedef: type not recognized")
 
@@ -225,7 +225,7 @@ def emit_typedef_encoder(environment: Environment, node: _XdrDeclaration) -> Non
     elif isinstance(node, _XdrOptionalData):
         raise NotImplementedError("<optional_data> typedef not yet implemented")
     elif isinstance(node, _XdrVoid):
-        raise NotImplementedError("<void> typedef not yet implemented")
+        raise ValueError("invalid void usage in RPC Specification")
     else:
         raise NotImplementedError("typedef: type not recognized")
 
-- 
cgit v1.2.3


From 9654a0388a3abcfa51cb2d010a6624a6f1f46710 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 20 Nov 2025 15:15:52 -0500
Subject: xdrgen: Generate "if" instead of "switch" for boolean union
 enumerators

Eliminate this warning in code generated by xdrgen:

fs/nfsd/nfs3xdr_gen.c:220:2: warning: switch condition has boolean value [-Wswitch-bool]
  220 |         switch (ptr->attributes_follow) {
      |         ^       ~~~~~~~~~~~~~~~~~~~~~~

No more -Wswitch-bool warnings when compiling with W=1.

The generated code is functionally equivalent but somewhat more
idiomatic.

Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202511172336.Y75zj4v6-lkp@intel.com/
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 tools/net/sunrpc/xdrgen/generators/union.py        | 115 +++++++++++++++++----
 .../xdrgen/templates/C/union/decoder/bool_spec.j2  |   7 ++
 .../xdrgen/templates/C/union/encoder/bool_spec.j2  |   7 ++
 3 files changed, 109 insertions(+), 20 deletions(-)
 create mode 100644 tools/net/sunrpc/xdrgen/templates/C/union/decoder/bool_spec.j2
 create mode 100644 tools/net/sunrpc/xdrgen/templates/C/union/encoder/bool_spec.j2

(limited to 'tools')

diff --git a/tools/net/sunrpc/xdrgen/generators/union.py b/tools/net/sunrpc/xdrgen/generators/union.py
index ad1f214ef22a..d15837dae651 100644
--- a/tools/net/sunrpc/xdrgen/generators/union.py
+++ b/tools/net/sunrpc/xdrgen/generators/union.py
@@ -84,6 +84,31 @@ def emit_union_switch_spec_decoder(
     print(template.render(name=node.name, type=node.spec.type_name))
 
 
+def emit_union_arm_decoder(
+    environment: Environment, node: _XdrCaseSpec
+) -> None:
+    """Emit decoder for an XDR union's arm (data only, no case/break)"""
+
+    if isinstance(node.arm, _XdrVoid):
+        return
+    if isinstance(node.arm, _XdrString):
+        type_name = "char *"
+        classifier = ""
+    else:
+        type_name = node.arm.spec.type_name
+        classifier = node.arm.spec.c_classifier
+
+    assert isinstance(node.arm, (_XdrBasic, _XdrString))
+    template = get_jinja2_template(environment, "decoder", node.arm.template)
+    print(
+        template.render(
+            name=node.arm.name,
+            type=type_name,
+            classifier=classifier,
+        )
+    )
+
+
 def emit_union_case_spec_decoder(
     environment: Environment, node: _XdrCaseSpec, big_endian_discriminant: bool
 ) -> None:
@@ -151,19 +176,33 @@ def emit_union_decoder(environment: Environment, node: _XdrUnion) -> None:
     template = get_jinja2_template(environment, "decoder", "open")
     print(template.render(name=node.name))
 
-    emit_union_switch_spec_decoder(environment, node.discriminant)
+    # For boolean discriminants, use if statement instead of switch
+    if node.discriminant.spec.type_name == "bool":
+        template = get_jinja2_template(environment, "decoder", "bool_spec")
+        print(template.render(name=node.discriminant.name, type=node.discriminant.spec.type_name))
 
-    for case in node.cases:
-        emit_union_case_spec_decoder(
-            environment,
-            case,
-            node.discriminant.spec.type_name in big_endian,
-        )
+        # Find and emit the TRUE case
+        for case in node.cases:
+            if case.values and case.values[0] == "TRUE":
+                emit_union_arm_decoder(environment, case)
+                break
 
-    emit_union_default_spec_decoder(environment, node)
+        template = get_jinja2_template(environment, "decoder", "close")
+        print(template.render())
+    else:
+        emit_union_switch_spec_decoder(environment, node.discriminant)
 
-    template = get_jinja2_template(environment, "decoder", "close")
-    print(template.render())
+        for case in node.cases:
+            emit_union_case_spec_decoder(
+                environment,
+                case,
+                node.discriminant.spec.type_name in big_endian,
+            )
+
+        emit_union_default_spec_decoder(environment, node)
+
+        template = get_jinja2_template(environment, "decoder", "close")
+        print(template.render())
 
 
 def emit_union_switch_spec_encoder(
@@ -175,6 +214,28 @@ def emit_union_switch_spec_encoder(
     print(template.render(name=node.name, type=node.spec.type_name))
 
 
+def emit_union_arm_encoder(
+    environment: Environment, node: _XdrCaseSpec
+) -> None:
+    """Emit encoder for an XDR union's arm (data only, no case/break)"""
+
+    if isinstance(node.arm, _XdrVoid):
+        return
+    if isinstance(node.arm, _XdrString):
+        type_name = "char *"
+    else:
+        type_name = node.arm.spec.type_name
+
+    assert isinstance(node.arm, (_XdrBasic, _XdrString))
+    template = get_jinja2_template(environment, "encoder", node.arm.template)
+    print(
+        template.render(
+            name=node.arm.name,
+            type=type_name,
+        )
+    )
+
+
 def emit_union_case_spec_encoder(
     environment: Environment, node: _XdrCaseSpec, big_endian_discriminant: bool
 ) -> None:
@@ -235,19 +296,33 @@ def emit_union_encoder(environment, node: _XdrUnion) -> None:
     template = get_jinja2_template(environment, "encoder", "open")
     print(template.render(name=node.name))
 
-    emit_union_switch_spec_encoder(environment, node.discriminant)
+    # For boolean discriminants, use if statement instead of switch
+    if node.discriminant.spec.type_name == "bool":
+        template = get_jinja2_template(environment, "encoder", "bool_spec")
+        print(template.render(name=node.discriminant.name, type=node.discriminant.spec.type_name))
 
-    for case in node.cases:
-        emit_union_case_spec_encoder(
-            environment,
-            case,
-            node.discriminant.spec.type_name in big_endian,
-        )
+        # Find and emit the TRUE case
+        for case in node.cases:
+            if case.values and case.values[0] == "TRUE":
+                emit_union_arm_encoder(environment, case)
+                break
 
-    emit_union_default_spec_encoder(environment, node)
+        template = get_jinja2_template(environment, "encoder", "close")
+        print(template.render())
+    else:
+        emit_union_switch_spec_encoder(environment, node.discriminant)
 
-    template = get_jinja2_template(environment, "encoder", "close")
-    print(template.render())
+        for case in node.cases:
+            emit_union_case_spec_encoder(
+                environment,
+                case,
+                node.discriminant.spec.type_name in big_endian,
+            )
+
+        emit_union_default_spec_encoder(environment, node)
+
+        template = get_jinja2_template(environment, "encoder", "close")
+        print(template.render())
 
 
 def emit_union_maxsize(environment: Environment, node: _XdrUnion) -> None:
diff --git a/tools/net/sunrpc/xdrgen/templates/C/union/decoder/bool_spec.j2 b/tools/net/sunrpc/xdrgen/templates/C/union/decoder/bool_spec.j2
new file mode 100644
index 000000000000..05ad491f74af
--- /dev/null
+++ b/tools/net/sunrpc/xdrgen/templates/C/union/decoder/bool_spec.j2
@@ -0,0 +1,7 @@
+{# SPDX-License-Identifier: GPL-2.0 #}
+{% if annotate %}
+	/* discriminant {{ name }} */
+{% endif %}
+	if (!xdrgen_decode_{{ type }}(xdr, &ptr->{{ name }}))
+		return false;
+	if (ptr->{{ name }}) {
diff --git a/tools/net/sunrpc/xdrgen/templates/C/union/encoder/bool_spec.j2 b/tools/net/sunrpc/xdrgen/templates/C/union/encoder/bool_spec.j2
new file mode 100644
index 000000000000..e5135ed6471c
--- /dev/null
+++ b/tools/net/sunrpc/xdrgen/templates/C/union/encoder/bool_spec.j2
@@ -0,0 +1,7 @@
+{# SPDX-License-Identifier: GPL-2.0 #}
+{% if annotate %}
+	/* discriminant {{ name }} */
+{% endif %}
+	if (!xdrgen_encode_{{ type }}(xdr, ptr->{{ name }}))
+		return false;
+	if (ptr->{{ name }}) {
-- 
cgit v1.2.3


From 4329010ad9c36775e7092e451c37c24c4f90243f Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 1 Dec 2025 17:19:46 -0500
Subject: xdrgen: Address some checkpatch whitespace complaints

This is a roll-up of three template fixes that eliminate noise from
checkpatch output so that it's easier to spot non-trivial problems.

To follow conventional kernel C style, when a union declaration is
marked with "pragma public", there should be a blank line between
the emitted "union xxx { ... };" and the decoder and encoder
function declarations.

Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 tools/net/sunrpc/xdrgen/templates/C/enum/declaration/enum.j2    | 1 -
 tools/net/sunrpc/xdrgen/templates/C/enum/definition/close.j2    | 1 +
 tools/net/sunrpc/xdrgen/templates/C/enum/definition/close_be.j2 | 1 +
 tools/net/sunrpc/xdrgen/templates/C/union/definition/close.j2   | 1 +
 4 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/net/sunrpc/xdrgen/templates/C/enum/declaration/enum.j2 b/tools/net/sunrpc/xdrgen/templates/C/enum/declaration/enum.j2
index d1405c7c5354..c7ae506076bb 100644
--- a/tools/net/sunrpc/xdrgen/templates/C/enum/declaration/enum.j2
+++ b/tools/net/sunrpc/xdrgen/templates/C/enum/declaration/enum.j2
@@ -1,4 +1,3 @@
 {# SPDX-License-Identifier: GPL-2.0 #}
-
 bool xdrgen_decode_{{ name }}(struct xdr_stream *xdr, {{ name }} *ptr);
 bool xdrgen_encode_{{ name }}(struct xdr_stream *xdr, {{ name }} value);
diff --git a/tools/net/sunrpc/xdrgen/templates/C/enum/definition/close.j2 b/tools/net/sunrpc/xdrgen/templates/C/enum/definition/close.j2
index a07586cbee17..446266ad6d17 100644
--- a/tools/net/sunrpc/xdrgen/templates/C/enum/definition/close.j2
+++ b/tools/net/sunrpc/xdrgen/templates/C/enum/definition/close.j2
@@ -1,3 +1,4 @@
 {# SPDX-License-Identifier: GPL-2.0 #}
 };
+
 typedef enum {{ name }} {{ name }};
diff --git a/tools/net/sunrpc/xdrgen/templates/C/enum/definition/close_be.j2 b/tools/net/sunrpc/xdrgen/templates/C/enum/definition/close_be.j2
index 2c18948bddf7..cfeee2287e68 100644
--- a/tools/net/sunrpc/xdrgen/templates/C/enum/definition/close_be.j2
+++ b/tools/net/sunrpc/xdrgen/templates/C/enum/definition/close_be.j2
@@ -1,3 +1,4 @@
 {# SPDX-License-Identifier: GPL-2.0 #}
 };
+
 typedef __be32 {{ name }};
diff --git a/tools/net/sunrpc/xdrgen/templates/C/union/definition/close.j2 b/tools/net/sunrpc/xdrgen/templates/C/union/definition/close.j2
index 01d716d0099e..5fc1937ba774 100644
--- a/tools/net/sunrpc/xdrgen/templates/C/union/definition/close.j2
+++ b/tools/net/sunrpc/xdrgen/templates/C/union/definition/close.j2
@@ -3,6 +3,7 @@
 };
 {%- if name in public_apis %}
 
+
 bool xdrgen_decode_{{ name }}(struct xdr_stream *xdr, struct {{ name }} *ptr);
 bool xdrgen_encode_{{ name }}(struct xdr_stream *xdr, const struct {{ name }} *ptr);
 {%- endif -%}
-- 
cgit v1.2.3


From bf0fe9ad3d597d8e1378dc9953ca96dfc3addb2b Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 8 Dec 2025 11:15:32 -0500
Subject: xdrgen: Fix struct prefix for typedef types in program wrappers

The program templates for decoder/argument.j2 and encoder/result.j2
unconditionally add 'struct' prefix to all types. This is incorrect
when an RPC protocol specification lists a typedef'd basic type or
an enum as a procedure argument or result (e.g., NFSv2's fhandle or
stat), resulting in compiler errors when building generated C code.

Fixes: 4b132aacb076 ("tools: Add xdrgen")
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 tools/net/sunrpc/xdrgen/generators/__init__.py                  | 3 ++-
 tools/net/sunrpc/xdrgen/templates/C/program/decoder/argument.j2 | 4 ++++
 tools/net/sunrpc/xdrgen/templates/C/program/encoder/result.j2   | 6 ++++++
 3 files changed, 12 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/net/sunrpc/xdrgen/generators/__init__.py b/tools/net/sunrpc/xdrgen/generators/__init__.py
index e22632cf38fb..1d577a986c6c 100644
--- a/tools/net/sunrpc/xdrgen/generators/__init__.py
+++ b/tools/net/sunrpc/xdrgen/generators/__init__.py
@@ -6,7 +6,7 @@ from pathlib import Path
 from jinja2 import Environment, FileSystemLoader, Template
 
 from xdr_ast import _XdrAst, Specification, _RpcProgram, _XdrTypeSpecifier
-from xdr_ast import public_apis, pass_by_reference, get_header_name
+from xdr_ast import public_apis, pass_by_reference, structs, get_header_name
 from xdr_parse import get_xdr_annotate
 
 
@@ -25,6 +25,7 @@ def create_jinja2_environment(language: str, xdr_type: str) -> Environment:
             environment.globals["annotate"] = get_xdr_annotate()
             environment.globals["public_apis"] = public_apis
             environment.globals["pass_by_reference"] = pass_by_reference
+            environment.globals["structs"] = structs
             return environment
         case _:
             raise NotImplementedError("Language not supported")
diff --git a/tools/net/sunrpc/xdrgen/templates/C/program/decoder/argument.j2 b/tools/net/sunrpc/xdrgen/templates/C/program/decoder/argument.j2
index 0b1709cca0d4..19b219dd276d 100644
--- a/tools/net/sunrpc/xdrgen/templates/C/program/decoder/argument.j2
+++ b/tools/net/sunrpc/xdrgen/templates/C/program/decoder/argument.j2
@@ -14,7 +14,11 @@ bool {{ program }}_svc_decode_{{ argument }}(struct svc_rqst *rqstp, struct xdr_
 {% if argument == 'void' %}
 	return xdrgen_decode_void(xdr);
 {% else %}
+{% if argument in structs %}
 	struct {{ argument }} *argp = rqstp->rq_argp;
+{% else %}
+	{{ argument }} *argp = rqstp->rq_argp;
+{% endif %}
 
 	return xdrgen_decode_{{ argument }}(xdr, argp);
 {% endif %}
diff --git a/tools/net/sunrpc/xdrgen/templates/C/program/encoder/result.j2 b/tools/net/sunrpc/xdrgen/templates/C/program/encoder/result.j2
index 6fc61a5d47b7..746592cfda56 100644
--- a/tools/net/sunrpc/xdrgen/templates/C/program/encoder/result.j2
+++ b/tools/net/sunrpc/xdrgen/templates/C/program/encoder/result.j2
@@ -14,8 +14,14 @@ bool {{ program }}_svc_encode_{{ result }}(struct svc_rqst *rqstp, struct xdr_st
 {% if result == 'void' %}
 	return xdrgen_encode_void(xdr);
 {% else %}
+{% if result in structs %}
 	struct {{ result }} *resp = rqstp->rq_resp;
 
 	return xdrgen_encode_{{ result }}(xdr, resp);
+{% else %}
+	{{ result }} *resp = rqstp->rq_resp;
+
+	return xdrgen_encode_{{ result }}(xdr, *resp);
+{% endif %}
 {% endif %}
 }
-- 
cgit v1.2.3


From 288d9ddbb74f52e07b1e2bc628768f7847dcb7e6 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Wed, 10 Dec 2025 09:04:24 -0500
Subject: xdrgen: Emit the program number definition

"xdrgen definitions" was not providing a definition of a symbolic
constant for the RPC program number being defined.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 tools/net/sunrpc/xdrgen/generators/program.py                     | 3 +++
 tools/net/sunrpc/xdrgen/templates/C/program/definition/program.j2 | 5 +++++
 2 files changed, 8 insertions(+)
 create mode 100644 tools/net/sunrpc/xdrgen/templates/C/program/definition/program.j2

(limited to 'tools')

diff --git a/tools/net/sunrpc/xdrgen/generators/program.py b/tools/net/sunrpc/xdrgen/generators/program.py
index ac3cf1694b68..decb092ef02c 100644
--- a/tools/net/sunrpc/xdrgen/generators/program.py
+++ b/tools/net/sunrpc/xdrgen/generators/program.py
@@ -127,6 +127,9 @@ class XdrProgramGenerator(SourceGenerator):
         for version in node.versions:
             emit_version_definitions(self.environment, program, version)
 
+        template = self.environment.get_template("definition/program.j2")
+        print(template.render(name=raw_name, value=node.number))
+
     def emit_declaration(self, node: _RpcProgram) -> None:
         """Emit a declaration pair for each of an RPC programs's procedures"""
         raw_name = node.name
diff --git a/tools/net/sunrpc/xdrgen/templates/C/program/definition/program.j2 b/tools/net/sunrpc/xdrgen/templates/C/program/definition/program.j2
new file mode 100644
index 000000000000..320663ffc37f
--- /dev/null
+++ b/tools/net/sunrpc/xdrgen/templates/C/program/definition/program.j2
@@ -0,0 +1,5 @@
+{# SPDX-License-Identifier: GPL-2.0 #}
+
+#ifndef {{ name }}
+#define {{ name }} ({{ value }})
+#endif
-- 
cgit v1.2.3


From ae78eb497868f335919db83b82eb59849c6cf251 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 16 Dec 2025 11:23:09 -0500
Subject: xdrgen: Implement short (16-bit) integer types

"short" and "unsigned short" types are not defined in RFC 4506, but
are supported by the rpcgen program. An upcoming protocol
specification includes at least one "unsigned short" field, so xdrgen
needs to implement support for these types.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/xdrgen/_builtins.h        | 60 ++++++++++++++++++++++++++
 tools/net/sunrpc/xdrgen/generators/__init__.py |  2 +
 tools/net/sunrpc/xdrgen/grammars/xdr.lark      |  4 ++
 tools/net/sunrpc/xdrgen/xdr_ast.py             |  4 ++
 4 files changed, 70 insertions(+)

(limited to 'tools')

diff --git a/include/linux/sunrpc/xdrgen/_builtins.h b/include/linux/sunrpc/xdrgen/_builtins.h
index 66ca3ece951a..52ed9a9151c4 100644
--- a/include/linux/sunrpc/xdrgen/_builtins.h
+++ b/include/linux/sunrpc/xdrgen/_builtins.h
@@ -46,6 +46,66 @@ xdrgen_encode_bool(struct xdr_stream *xdr, bool val)
 	return true;
 }
 
+/*
+ * De facto (non-standard but commonly implemented) signed short type:
+ *  - Wire sends sign-extended 32-bit value (e.g., 0xFFFFFFFF)
+ *  - be32_to_cpup() returns u32 (0xFFFFFFFF)
+ *  - Explicit (s16) cast truncates to 16 bits (0xFFFF = -1)
+ */
+static inline bool
+xdrgen_decode_short(struct xdr_stream *xdr, s16 *ptr)
+{
+	__be32 *p = xdr_inline_decode(xdr, XDR_UNIT);
+
+	if (unlikely(!p))
+		return false;
+	*ptr = (s16)be32_to_cpup(p);
+	return true;
+}
+
+/*
+ * De facto (non-standard but commonly implemented) signed short type:
+ *  - C integer promotion sign-extends s16 val to int before passing to
+ *    cpu_to_be32()
+ *  - This is well-defined: -1 as s16 -1 as int 0xFFFFFFFF on wire
+ */
+static inline bool
+xdrgen_encode_short(struct xdr_stream *xdr, s16 val)
+{
+	__be32 *p = xdr_reserve_space(xdr, XDR_UNIT);
+
+	if (unlikely(!p))
+		return false;
+	*p = cpu_to_be32(val);
+	return true;
+}
+
+/*
+ * De facto (non-standard but commonly implemented) unsigned short type:
+ * 16-bit integer zero-extended to fill one XDR_UNIT.
+ */
+static inline bool
+xdrgen_decode_unsigned_short(struct xdr_stream *xdr, u16 *ptr)
+{
+	__be32 *p = xdr_inline_decode(xdr, XDR_UNIT);
+
+	if (unlikely(!p))
+		return false;
+	*ptr = (u16)be32_to_cpup(p);
+	return true;
+}
+
+static inline bool
+xdrgen_encode_unsigned_short(struct xdr_stream *xdr, u16 val)
+{
+	__be32 *p = xdr_reserve_space(xdr, XDR_UNIT);
+
+	if (unlikely(!p))
+		return false;
+	*p = cpu_to_be32(val);
+	return true;
+}
+
 static inline bool
 xdrgen_decode_int(struct xdr_stream *xdr, s32 *ptr)
 {
diff --git a/tools/net/sunrpc/xdrgen/generators/__init__.py b/tools/net/sunrpc/xdrgen/generators/__init__.py
index 1d577a986c6c..5c3a4a47ded8 100644
--- a/tools/net/sunrpc/xdrgen/generators/__init__.py
+++ b/tools/net/sunrpc/xdrgen/generators/__init__.py
@@ -59,6 +59,8 @@ def kernel_c_type(spec: _XdrTypeSpecifier) -> str:
     """Return name of C type"""
     builtin_native_c_type = {
         "bool": "bool",
+        "short": "s16",
+        "unsigned_short": "u16",
         "int": "s32",
         "unsigned_int": "u32",
         "long": "s32",
diff --git a/tools/net/sunrpc/xdrgen/grammars/xdr.lark b/tools/net/sunrpc/xdrgen/grammars/xdr.lark
index 7c2c1b8c86d1..b7c664f2acb7 100644
--- a/tools/net/sunrpc/xdrgen/grammars/xdr.lark
+++ b/tools/net/sunrpc/xdrgen/grammars/xdr.lark
@@ -20,9 +20,11 @@ constant                : decimal_constant | hexadecimal_constant | octal_consta
 type_specifier          : unsigned_hyper
                         | unsigned_long
                         | unsigned_int
+                        | unsigned_short
                         | hyper
                         | long
                         | int
+                        | short
                         | float
                         | double
                         | quadruple
@@ -35,9 +37,11 @@ type_specifier          : unsigned_hyper
 unsigned_hyper          : "unsigned" "hyper"
 unsigned_long           : "unsigned" "long"
 unsigned_int            : "unsigned" "int"
+unsigned_short          : "unsigned" "short"
 hyper                   : "hyper"
 long                    : "long"
 int                     : "int"
+short                   : "short"
 float                   : "float"
 double                  : "double"
 quadruple               : "quadruple"
diff --git a/tools/net/sunrpc/xdrgen/xdr_ast.py b/tools/net/sunrpc/xdrgen/xdr_ast.py
index 5233e73c7046..2b5d160a0a60 100644
--- a/tools/net/sunrpc/xdrgen/xdr_ast.py
+++ b/tools/net/sunrpc/xdrgen/xdr_ast.py
@@ -34,6 +34,8 @@ def xdr_quadlen(val: str) -> int:
 symbolic_widths = {
     "void": ["XDR_void"],
     "bool": ["XDR_bool"],
+    "short": ["XDR_short"],
+    "unsigned_short": ["XDR_unsigned_short"],
     "int": ["XDR_int"],
     "unsigned_int": ["XDR_unsigned_int"],
     "long": ["XDR_long"],
@@ -48,6 +50,8 @@ symbolic_widths = {
 max_widths = {
     "void": 0,
     "bool": 1,
+    "short": 1,
+    "unsigned_short": 1,
     "int": 1,
     "unsigned_int": 1,
     "long": 1,
-- 
cgit v1.2.3


From eb1f3b55ac6202a013daf14ed508066947cdafa8 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 22 Dec 2025 09:44:29 -0500
Subject: xdrgen: Remove inclusion of nlm4.h header

The client-side source code template mistakenly includes the
nlm4.h header file, which is specific to the NLM protocol and
should not be present in the generic template that generates
client stubs for all XDR-based protocols.

Fixes: 903a7d37d9ea ("xdrgen: Update the files included in client-side source code")
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 tools/net/sunrpc/xdrgen/templates/C/source_top/client.j2 | 1 -
 1 file changed, 1 deletion(-)

(limited to 'tools')

diff --git a/tools/net/sunrpc/xdrgen/templates/C/source_top/client.j2 b/tools/net/sunrpc/xdrgen/templates/C/source_top/client.j2
index c5518c519854..df3598c38b2c 100644
--- a/tools/net/sunrpc/xdrgen/templates/C/source_top/client.j2
+++ b/tools/net/sunrpc/xdrgen/templates/C/source_top/client.j2
@@ -8,6 +8,5 @@
 #include <linux/sunrpc/xdr.h>
 #include <linux/sunrpc/xdrgen/_defs.h>
 #include <linux/sunrpc/xdrgen/_builtins.h>
-#include <linux/sunrpc/xdrgen/nlm4.h>
 
 #include <linux/sunrpc/clnt.h>
-- 
cgit v1.2.3


From 9abb3549227e4fb70f0d8ba515bf7ddd249ad710 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 22 Dec 2025 09:44:59 -0500
Subject: xdrgen: Improve parse error reporting

The current verbose Lark exception output makes it difficult to
quickly identify and fix syntax errors in XDR specifications. Users
must wade through hundreds of lines of cascading errors to find the
root cause.

Replace this with concise, compiler-style error messages showing
file, line, column, the unexpected token, and the source line with
a caret pointing to the error location.

Before:
  Unexpected token Token('__ANON_1', '+1') at line 14, column 35.
  Expected one of:
          * SEMICOLON
  Previous tokens: [Token('__ANON_0', 'LM_MAXSTRLEN')]
  [hundreds more cascading errors...]

After:
  file.x:14:35: parse error
  Unexpected number '+1'

      const LM_MAXNAMELEN = LM_MAXSTRLEN+1;
                                        ^

The error handler now raises XdrParseError on the first error,
preventing cascading messages that obscure the root cause.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 tools/net/sunrpc/xdrgen/subcmds/declarations.py | 16 ++---
 tools/net/sunrpc/xdrgen/subcmds/definitions.py  | 16 ++---
 tools/net/sunrpc/xdrgen/subcmds/lint.py         | 17 +++--
 tools/net/sunrpc/xdrgen/subcmds/source.py       | 16 ++---
 tools/net/sunrpc/xdrgen/xdr_parse.py            | 86 +++++++++++++++++++++++++
 tools/net/sunrpc/xdrgen/xdrgen                  |  2 -
 6 files changed, 118 insertions(+), 35 deletions(-)

(limited to 'tools')

diff --git a/tools/net/sunrpc/xdrgen/subcmds/declarations.py b/tools/net/sunrpc/xdrgen/subcmds/declarations.py
index c5e8d79986ef..2fd5c255a547 100644
--- a/tools/net/sunrpc/xdrgen/subcmds/declarations.py
+++ b/tools/net/sunrpc/xdrgen/subcmds/declarations.py
@@ -8,7 +8,6 @@ import logging
 
 from argparse import Namespace
 from lark import logger
-from lark.exceptions import UnexpectedInput
 
 from generators.constant import XdrConstantGenerator
 from generators.enum import XdrEnumGenerator
@@ -24,6 +23,7 @@ from xdr_ast import transform_parse_tree, _RpcProgram, Specification
 from xdr_ast import _XdrConstant, _XdrEnum, _XdrPointer
 from xdr_ast import _XdrTypedef, _XdrStruct, _XdrUnion
 from xdr_parse import xdr_parser, set_xdr_annotate
+from xdr_parse import make_error_handler, XdrParseError
 
 logger.setLevel(logging.INFO)
 
@@ -50,19 +50,19 @@ def emit_header_declarations(
         gen.emit_declaration(definition.value)
 
 
-def handle_parse_error(e: UnexpectedInput) -> bool:
-    """Simple parse error reporting, no recovery attempted"""
-    print(e)
-    return True
-
-
 def subcmd(args: Namespace) -> int:
     """Generate definitions and declarations"""
 
     set_xdr_annotate(args.annotate)
     parser = xdr_parser()
     with open(args.filename, encoding="utf-8") as f:
-        parse_tree = parser.parse(f.read(), on_error=handle_parse_error)
+        source = f.read()
+        try:
+            parse_tree = parser.parse(
+                source, on_error=make_error_handler(source, args.filename)
+            )
+        except XdrParseError:
+            return 1
         ast = transform_parse_tree(parse_tree)
 
         gen = XdrHeaderTopGenerator(args.language, args.peer)
diff --git a/tools/net/sunrpc/xdrgen/subcmds/definitions.py b/tools/net/sunrpc/xdrgen/subcmds/definitions.py
index c956e27f37c0..8ea5c57cc37a 100644
--- a/tools/net/sunrpc/xdrgen/subcmds/definitions.py
+++ b/tools/net/sunrpc/xdrgen/subcmds/definitions.py
@@ -8,7 +8,6 @@ import logging
 
 from argparse import Namespace
 from lark import logger
-from lark.exceptions import UnexpectedInput
 
 from generators.constant import XdrConstantGenerator
 from generators.enum import XdrEnumGenerator
@@ -24,6 +23,7 @@ from xdr_ast import transform_parse_tree, Specification
 from xdr_ast import _RpcProgram, _XdrConstant, _XdrEnum, _XdrPointer
 from xdr_ast import _XdrTypedef, _XdrStruct, _XdrUnion
 from xdr_parse import xdr_parser, set_xdr_annotate
+from xdr_parse import make_error_handler, XdrParseError
 
 logger.setLevel(logging.INFO)
 
@@ -69,19 +69,19 @@ def emit_header_maxsize(root: Specification, language: str, peer: str) -> None:
         gen.emit_maxsize(definition.value)
 
 
-def handle_parse_error(e: UnexpectedInput) -> bool:
-    """Simple parse error reporting, no recovery attempted"""
-    print(e)
-    return True
-
-
 def subcmd(args: Namespace) -> int:
     """Generate definitions"""
 
     set_xdr_annotate(args.annotate)
     parser = xdr_parser()
     with open(args.filename, encoding="utf-8") as f:
-        parse_tree = parser.parse(f.read(), on_error=handle_parse_error)
+        source = f.read()
+        try:
+            parse_tree = parser.parse(
+                source, on_error=make_error_handler(source, args.filename)
+            )
+        except XdrParseError:
+            return 1
         ast = transform_parse_tree(parse_tree)
 
         gen = XdrHeaderTopGenerator(args.language, args.peer)
diff --git a/tools/net/sunrpc/xdrgen/subcmds/lint.py b/tools/net/sunrpc/xdrgen/subcmds/lint.py
index 36cc43717d30..2c48fa57c4e5 100644
--- a/tools/net/sunrpc/xdrgen/subcmds/lint.py
+++ b/tools/net/sunrpc/xdrgen/subcmds/lint.py
@@ -8,26 +8,25 @@ import logging
 
 from argparse import Namespace
 from lark import logger
-from lark.exceptions import UnexpectedInput
 
-from xdr_parse import xdr_parser
+from xdr_parse import xdr_parser, make_error_handler, XdrParseError
 from xdr_ast import transform_parse_tree
 
 logger.setLevel(logging.DEBUG)
 
 
-def handle_parse_error(e: UnexpectedInput) -> bool:
-    """Simple parse error reporting, no recovery attempted"""
-    print(e)
-    return True
-
-
 def subcmd(args: Namespace) -> int:
     """Lexical and syntax check of an XDR specification"""
 
     parser = xdr_parser()
     with open(args.filename, encoding="utf-8") as f:
-        parse_tree = parser.parse(f.read(), on_error=handle_parse_error)
+        source = f.read()
+        try:
+            parse_tree = parser.parse(
+                source, on_error=make_error_handler(source, args.filename)
+            )
+        except XdrParseError:
+            return 1
         transform_parse_tree(parse_tree)
 
     return 0
diff --git a/tools/net/sunrpc/xdrgen/subcmds/source.py b/tools/net/sunrpc/xdrgen/subcmds/source.py
index 2024954748f0..bc7d38802df3 100644
--- a/tools/net/sunrpc/xdrgen/subcmds/source.py
+++ b/tools/net/sunrpc/xdrgen/subcmds/source.py
@@ -8,7 +8,6 @@ import logging
 
 from argparse import Namespace
 from lark import logger
-from lark.exceptions import UnexpectedInput
 
 from generators.source_top import XdrSourceTopGenerator
 from generators.enum import XdrEnumGenerator
@@ -23,6 +22,7 @@ from xdr_ast import _XdrAst, _XdrEnum, _XdrPointer
 from xdr_ast import _XdrStruct, _XdrTypedef, _XdrUnion
 
 from xdr_parse import xdr_parser, set_xdr_annotate
+from xdr_parse import make_error_handler, XdrParseError
 
 logger.setLevel(logging.INFO)
 
@@ -92,19 +92,19 @@ def generate_client_source(filename: str, root: Specification, language: str) ->
     # cel: todo: client needs PROC macros
 
 
-def handle_parse_error(e: UnexpectedInput) -> bool:
-    """Simple parse error reporting, no recovery attempted"""
-    print(e)
-    return True
-
-
 def subcmd(args: Namespace) -> int:
     """Generate encoder and decoder functions"""
 
     set_xdr_annotate(args.annotate)
     parser = xdr_parser()
     with open(args.filename, encoding="utf-8") as f:
-        parse_tree = parser.parse(f.read(), on_error=handle_parse_error)
+        source = f.read()
+        try:
+            parse_tree = parser.parse(
+                source, on_error=make_error_handler(source, args.filename)
+            )
+        except XdrParseError:
+            return 1
         ast = transform_parse_tree(parse_tree)
         match args.peer:
             case "server":
diff --git a/tools/net/sunrpc/xdrgen/xdr_parse.py b/tools/net/sunrpc/xdrgen/xdr_parse.py
index 964b44e675df..426513be066c 100644
--- a/tools/net/sunrpc/xdrgen/xdr_parse.py
+++ b/tools/net/sunrpc/xdrgen/xdr_parse.py
@@ -3,12 +3,40 @@
 
 """Common parsing code for xdrgen"""
 
+import sys
+from typing import Callable
+
 from lark import Lark
+from lark.exceptions import UnexpectedInput, UnexpectedToken
 
 
 # Set to True to emit annotation comments in generated source
 annotate = False
 
+# Map internal Lark token names to human-readable names
+TOKEN_NAMES = {
+    "__ANON_0": "identifier",
+    "__ANON_1": "number",
+    "SEMICOLON": "';'",
+    "LBRACE": "'{'",
+    "RBRACE": "'}'",
+    "LPAR": "'('",
+    "RPAR": "')'",
+    "LSQB": "'['",
+    "RSQB": "']'",
+    "LESSTHAN": "'<'",
+    "MORETHAN": "'>'",
+    "EQUAL": "'='",
+    "COLON": "':'",
+    "COMMA": "','",
+    "STAR": "'*'",
+    "$END": "end of file",
+}
+
+
+class XdrParseError(Exception):
+    """Raised when XDR parsing fails"""
+
 
 def set_xdr_annotate(set_it: bool) -> None:
     """Set 'annotate' if --annotate was specified on the command line"""
@@ -21,6 +49,64 @@ def get_xdr_annotate() -> bool:
     return annotate
 
 
+def make_error_handler(source: str, filename: str) -> Callable[[UnexpectedInput], bool]:
+    """Create an error handler that reports the first parse error and aborts.
+
+    Args:
+        source: The XDR source text being parsed
+        filename: The name of the file being parsed
+
+    Returns:
+        An error handler function for use with Lark's on_error parameter
+    """
+    lines = source.splitlines()
+
+    def handle_parse_error(e: UnexpectedInput) -> bool:
+        """Report a parse error with context and abort parsing"""
+        line_num = e.line
+        column = e.column
+        line_text = lines[line_num - 1] if 0 < line_num <= len(lines) else ""
+
+        # Build the error message
+        msg_parts = [f"{filename}:{line_num}:{column}: parse error"]
+
+        # Show what was found vs what was expected
+        if isinstance(e, UnexpectedToken):
+            token = e.token
+            if token.type == "__ANON_0":
+                found = f"identifier '{token.value}'"
+            elif token.type == "__ANON_1":
+                found = f"number '{token.value}'"
+            else:
+                found = f"'{token.value}'"
+            msg_parts.append(f"Unexpected {found}")
+
+            # Provide helpful expected tokens list
+            expected = e.expected
+            if expected:
+                readable = [
+                    TOKEN_NAMES.get(exp, exp.lower().replace("_", " "))
+                    for exp in sorted(expected)
+                ]
+                if len(readable) == 1:
+                    msg_parts.append(f"Expected {readable[0]}")
+                elif len(readable) <= 4:
+                    msg_parts.append(f"Expected one of: {', '.join(readable)}")
+        else:
+            msg_parts.append(str(e).split("\n")[0])
+
+        # Show the offending line with a caret pointing to the error
+        msg_parts.append("")
+        msg_parts.append(f"    {line_text}")
+        prefix = line_text[: column - 1].expandtabs()
+        msg_parts.append(f"    {' ' * len(prefix)}^")
+
+        sys.stderr.write("\n".join(msg_parts) + "\n")
+        raise XdrParseError()
+
+    return handle_parse_error
+
+
 def xdr_parser() -> Lark:
     """Return a Lark parser instance configured with the XDR language grammar"""
 
diff --git a/tools/net/sunrpc/xdrgen/xdrgen b/tools/net/sunrpc/xdrgen/xdrgen
index 3afd0547d67c..e22638f8324b 100755
--- a/tools/net/sunrpc/xdrgen/xdrgen
+++ b/tools/net/sunrpc/xdrgen/xdrgen
@@ -133,7 +133,5 @@ There is NO WARRANTY, to the extent permitted by law.""",
 try:
     if __name__ == "__main__":
         sys.exit(main())
-except SystemExit:
-    sys.exit(0)
 except (KeyboardInterrupt, BrokenPipeError):
     sys.exit(1)
-- 
cgit v1.2.3


From 63a5425ff5e077c54eb2719c735108e2aa1f9eb6 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 26 Dec 2025 10:19:33 -0500
Subject: xdrgen: Extend error reporting to AST transformation phase

Commit 277df18d7df9 ("xdrgen: Improve parse error reporting") added
clean, compiler-style error messages for syntax errors detected during
parsing. However, semantic errors discovered during AST transformation
still produce verbose Python stack traces.

When an XDR specification references an undefined type, the transformer
raises a VisitError wrapping a KeyError. Before this change:

  Traceback (most recent call last):
    File ".../lark/visitors.py", line 124, in _call_userfunc
      return f(children)
    ...
  KeyError: 'fsh4_mode'
  ...
  lark.exceptions.VisitError: Error trying to process rule "basic":
  'fsh4_mode'

After this change:

  file.x:156:2: semantic error
  Undefined type 'fsh4_mode'

      	fsh4_mode	mode;
              ^

The new handle_transform_error() function extracts position information
from the Lark tree node metadata and formats the error consistently with
parse error messages.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 tools/net/sunrpc/xdrgen/subcmds/declarations.py |  8 ++++-
 tools/net/sunrpc/xdrgen/subcmds/definitions.py  |  8 ++++-
 tools/net/sunrpc/xdrgen/subcmds/lint.py         |  8 ++++-
 tools/net/sunrpc/xdrgen/subcmds/source.py       |  8 ++++-
 tools/net/sunrpc/xdrgen/xdr_parse.py            | 40 ++++++++++++++++++++++++-
 5 files changed, 67 insertions(+), 5 deletions(-)

(limited to 'tools')

diff --git a/tools/net/sunrpc/xdrgen/subcmds/declarations.py b/tools/net/sunrpc/xdrgen/subcmds/declarations.py
index 2fd5c255a547..97ffb76a02f1 100644
--- a/tools/net/sunrpc/xdrgen/subcmds/declarations.py
+++ b/tools/net/sunrpc/xdrgen/subcmds/declarations.py
@@ -8,6 +8,7 @@ import logging
 
 from argparse import Namespace
 from lark import logger
+from lark.exceptions import VisitError
 
 from generators.constant import XdrConstantGenerator
 from generators.enum import XdrEnumGenerator
@@ -24,6 +25,7 @@ from xdr_ast import _XdrConstant, _XdrEnum, _XdrPointer
 from xdr_ast import _XdrTypedef, _XdrStruct, _XdrUnion
 from xdr_parse import xdr_parser, set_xdr_annotate
 from xdr_parse import make_error_handler, XdrParseError
+from xdr_parse import handle_transform_error
 
 logger.setLevel(logging.INFO)
 
@@ -63,7 +65,11 @@ def subcmd(args: Namespace) -> int:
             )
         except XdrParseError:
             return 1
-        ast = transform_parse_tree(parse_tree)
+        try:
+            ast = transform_parse_tree(parse_tree)
+        except VisitError as e:
+            handle_transform_error(e, source, args.filename)
+            return 1
 
         gen = XdrHeaderTopGenerator(args.language, args.peer)
         gen.emit_declaration(args.filename, ast)
diff --git a/tools/net/sunrpc/xdrgen/subcmds/definitions.py b/tools/net/sunrpc/xdrgen/subcmds/definitions.py
index 8ea5c57cc37a..d6c2dcd6f78f 100644
--- a/tools/net/sunrpc/xdrgen/subcmds/definitions.py
+++ b/tools/net/sunrpc/xdrgen/subcmds/definitions.py
@@ -8,6 +8,7 @@ import logging
 
 from argparse import Namespace
 from lark import logger
+from lark.exceptions import VisitError
 
 from generators.constant import XdrConstantGenerator
 from generators.enum import XdrEnumGenerator
@@ -24,6 +25,7 @@ from xdr_ast import _RpcProgram, _XdrConstant, _XdrEnum, _XdrPointer
 from xdr_ast import _XdrTypedef, _XdrStruct, _XdrUnion
 from xdr_parse import xdr_parser, set_xdr_annotate
 from xdr_parse import make_error_handler, XdrParseError
+from xdr_parse import handle_transform_error
 
 logger.setLevel(logging.INFO)
 
@@ -82,7 +84,11 @@ def subcmd(args: Namespace) -> int:
             )
         except XdrParseError:
             return 1
-        ast = transform_parse_tree(parse_tree)
+        try:
+            ast = transform_parse_tree(parse_tree)
+        except VisitError as e:
+            handle_transform_error(e, source, args.filename)
+            return 1
 
         gen = XdrHeaderTopGenerator(args.language, args.peer)
         gen.emit_definition(args.filename, ast)
diff --git a/tools/net/sunrpc/xdrgen/subcmds/lint.py b/tools/net/sunrpc/xdrgen/subcmds/lint.py
index 2c48fa57c4e5..e1da49632e62 100644
--- a/tools/net/sunrpc/xdrgen/subcmds/lint.py
+++ b/tools/net/sunrpc/xdrgen/subcmds/lint.py
@@ -8,8 +8,10 @@ import logging
 
 from argparse import Namespace
 from lark import logger
+from lark.exceptions import VisitError
 
 from xdr_parse import xdr_parser, make_error_handler, XdrParseError
+from xdr_parse import handle_transform_error
 from xdr_ast import transform_parse_tree
 
 logger.setLevel(logging.DEBUG)
@@ -27,6 +29,10 @@ def subcmd(args: Namespace) -> int:
             )
         except XdrParseError:
             return 1
-        transform_parse_tree(parse_tree)
+        try:
+            transform_parse_tree(parse_tree)
+        except VisitError as e:
+            handle_transform_error(e, source, args.filename)
+            return 1
 
     return 0
diff --git a/tools/net/sunrpc/xdrgen/subcmds/source.py b/tools/net/sunrpc/xdrgen/subcmds/source.py
index bc7d38802df3..08c883f547d7 100644
--- a/tools/net/sunrpc/xdrgen/subcmds/source.py
+++ b/tools/net/sunrpc/xdrgen/subcmds/source.py
@@ -8,6 +8,7 @@ import logging
 
 from argparse import Namespace
 from lark import logger
+from lark.exceptions import VisitError
 
 from generators.source_top import XdrSourceTopGenerator
 from generators.enum import XdrEnumGenerator
@@ -23,6 +24,7 @@ from xdr_ast import _XdrStruct, _XdrTypedef, _XdrUnion
 
 from xdr_parse import xdr_parser, set_xdr_annotate
 from xdr_parse import make_error_handler, XdrParseError
+from xdr_parse import handle_transform_error
 
 logger.setLevel(logging.INFO)
 
@@ -105,7 +107,11 @@ def subcmd(args: Namespace) -> int:
             )
         except XdrParseError:
             return 1
-        ast = transform_parse_tree(parse_tree)
+        try:
+            ast = transform_parse_tree(parse_tree)
+        except VisitError as e:
+            handle_transform_error(e, source, args.filename)
+            return 1
         match args.peer:
             case "server":
                 generate_server_source(args.filename, ast, args.language)
diff --git a/tools/net/sunrpc/xdrgen/xdr_parse.py b/tools/net/sunrpc/xdrgen/xdr_parse.py
index 426513be066c..38724ad5aea2 100644
--- a/tools/net/sunrpc/xdrgen/xdr_parse.py
+++ b/tools/net/sunrpc/xdrgen/xdr_parse.py
@@ -7,7 +7,7 @@ import sys
 from typing import Callable
 
 from lark import Lark
-from lark.exceptions import UnexpectedInput, UnexpectedToken
+from lark.exceptions import UnexpectedInput, UnexpectedToken, VisitError
 
 
 # Set to True to emit annotation comments in generated source
@@ -107,6 +107,44 @@ def make_error_handler(source: str, filename: str) -> Callable[[UnexpectedInput]
     return handle_parse_error
 
 
+def handle_transform_error(e: VisitError, source: str, filename: str) -> None:
+    """Report a transform error with context.
+
+    Args:
+        e: The VisitError from Lark's transformer
+        source: The XDR source text being parsed
+        filename: The name of the file being parsed
+    """
+    lines = source.splitlines()
+
+    # Extract position from the tree node if available
+    line_num = 0
+    column = 0
+    if hasattr(e.obj, "meta") and e.obj.meta:
+        line_num = e.obj.meta.line
+        column = e.obj.meta.column
+
+    line_text = lines[line_num - 1] if 0 < line_num <= len(lines) else ""
+
+    # Build the error message
+    msg_parts = [f"{filename}:{line_num}:{column}: semantic error"]
+
+    # The original exception is typically a KeyError for undefined types
+    if isinstance(e.orig_exc, KeyError):
+        msg_parts.append(f"Undefined type '{e.orig_exc.args[0]}'")
+    else:
+        msg_parts.append(str(e.orig_exc))
+
+    # Show the offending line with a caret pointing to the error
+    if line_text:
+        msg_parts.append("")
+        msg_parts.append(f"    {line_text}")
+        prefix = line_text[: column - 1].expandtabs()
+        msg_parts.append(f"    {' ' * len(prefix)}^")
+
+    sys.stderr.write("\n".join(msg_parts) + "\n")
+
+
 def xdr_parser() -> Lark:
     """Return a Lark parser instance configured with the XDR language grammar"""
 
-- 
cgit v1.2.3


From 4c53b89032f14577e94d747a3ca0aee63f18d856 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 26 Dec 2025 10:19:34 -0500
Subject: xdrgen: Emit a max_arg_sz macro

struct svc_service has a .vs_xdrsize field that is filled in by
servers for each of their RPC programs. This field is supposed to
contain the size of the largest procedure argument in the RPC
program. This value is also sometimes used to size network
transport buffers.

Currently, server implementations must manually calculate and
hard-code this value, which is error-prone and requires updates
when procedure arguments change.

Update xdrgen to determine which procedure argument structure is
largest, and emit a macro with a well-known name that contains
the size of that structure. Server code then uses this macro when
initializing the .vs_xdrsize field.

For NLM version 4, xdrgen now emits:

    #define NLM4_MAX_ARGS_SZ (NLM4_nlm4_lockargs_sz)

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 tools/net/sunrpc/xdrgen/generators/program.py      | 35 +++++++++++++++++++++-
 tools/net/sunrpc/xdrgen/subcmds/definitions.py     |  2 ++
 .../xdrgen/templates/C/program/maxsize/max_args.j2 |  3 ++
 3 files changed, 39 insertions(+), 1 deletion(-)
 create mode 100644 tools/net/sunrpc/xdrgen/templates/C/program/maxsize/max_args.j2

(limited to 'tools')

diff --git a/tools/net/sunrpc/xdrgen/generators/program.py b/tools/net/sunrpc/xdrgen/generators/program.py
index decb092ef02c..c0cb3f6d3319 100644
--- a/tools/net/sunrpc/xdrgen/generators/program.py
+++ b/tools/net/sunrpc/xdrgen/generators/program.py
@@ -5,8 +5,9 @@
 
 from jinja2 import Environment
 
-from generators import SourceGenerator, create_jinja2_environment
+from generators import SourceGenerator, create_jinja2_environment, get_jinja2_template
 from xdr_ast import _RpcProgram, _RpcVersion, excluded_apis
+from xdr_ast import max_widths, get_header_name
 
 
 def emit_version_definitions(
@@ -169,3 +170,35 @@ class XdrProgramGenerator(SourceGenerator):
                     emit_version_argument_encoders(
                         self.environment, program, version,
                     )
+
+    def emit_maxsize(self, node: _RpcProgram) -> None:
+        """Emit maxsize macro for maximum RPC argument size"""
+        header = get_header_name().upper()
+
+        # Find the largest argument across all versions
+        max_arg_width = 0
+        max_arg_name = None
+        for version in node.versions:
+            for procedure in version.procedures:
+                if procedure.name in excluded_apis:
+                    continue
+                arg_name = procedure.argument.type_name
+                if arg_name == "void":
+                    continue
+                if arg_name not in max_widths:
+                    continue
+                if max_widths[arg_name] > max_arg_width:
+                    max_arg_width = max_widths[arg_name]
+                    max_arg_name = arg_name
+
+        if max_arg_name is None:
+            return
+
+        macro_name = header + "_MAX_ARGS_SZ"
+        template = get_jinja2_template(self.environment, "maxsize", "max_args")
+        print(
+            template.render(
+                macro=macro_name,
+                width=header + "_" + max_arg_name + "_sz",
+            )
+        )
diff --git a/tools/net/sunrpc/xdrgen/subcmds/definitions.py b/tools/net/sunrpc/xdrgen/subcmds/definitions.py
index d6c2dcd6f78f..b17526a03dda 100644
--- a/tools/net/sunrpc/xdrgen/subcmds/definitions.py
+++ b/tools/net/sunrpc/xdrgen/subcmds/definitions.py
@@ -66,6 +66,8 @@ def emit_header_maxsize(root: Specification, language: str, peer: str) -> None:
             gen = XdrStructGenerator(language, peer)
         elif isinstance(definition.value, _XdrUnion):
             gen = XdrUnionGenerator(language, peer)
+        elif isinstance(definition.value, _RpcProgram):
+            gen = XdrProgramGenerator(language, peer)
         else:
             continue
         gen.emit_maxsize(definition.value)
diff --git a/tools/net/sunrpc/xdrgen/templates/C/program/maxsize/max_args.j2 b/tools/net/sunrpc/xdrgen/templates/C/program/maxsize/max_args.j2
new file mode 100644
index 000000000000..9f3bfb47d2f4
--- /dev/null
+++ b/tools/net/sunrpc/xdrgen/templates/C/program/maxsize/max_args.j2
@@ -0,0 +1,3 @@
+{# SPDX-License-Identifier: GPL-2.0 #}
+#define {{ '{:<31}'.format(macro) }} \
+	({{ width }})
-- 
cgit v1.2.3


From 5288993c4d1a8e59310e007aa68cf9b856551cc6 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 26 Dec 2025 10:19:35 -0500
Subject: xdrgen: Add enum value validation to generated decoders

XDR enum decoders generated by xdrgen do not verify that incoming
values are valid members of the enum. Incoming out-of-range values
from malicious or buggy peers propagate through the system
unchecked.

Add validation logic to generated enum decoders using a switch
statement that explicitly lists valid enumerator values. The
compiler optimizes this to a simple range check when enum values
are dense (contiguous), while correctly rejecting invalid values
for sparse enums with gaps in their value ranges.

The --no-enum-validation option on the source subcommand disables
this validation when not needed.

The minimum and maximum fields in _XdrEnum, which were previously
unused placeholders for a range-based validation approach, have
been removed since the switch-based validation handles both dense
and sparse enums correctly.

Because the new mechanism results in substantive changes to
generated code, existing .x files are regenerated. Unrelated white
space and semicolon changes in the generated code are due to recent
commit 1c873a2fd110 ("xdrgen: Don't generate unnecessary semicolon")
and commit 38c4df91242b ("xdrgen: Address some checkpatch whitespace
complaints").

Reviewed-by: NeilBrown <neil@brown.name>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/nfs4xdr_gen.c                              | 105 +++++++++++++++++----
 fs/nfsd/nfs4xdr_gen.h                              |   2 +-
 include/linux/sunrpc/xdrgen/nfs4_1.h               |   8 +-
 tools/net/sunrpc/xdrgen/generators/enum.py         |   9 +-
 tools/net/sunrpc/xdrgen/subcmds/source.py          |   3 +-
 .../sunrpc/xdrgen/templates/C/enum/decoder/enum.j2 |  11 +++
 .../xdrgen/templates/C/enum/decoder/enum_be.j2     |  20 ++++
 tools/net/sunrpc/xdrgen/xdr_ast.py                 |   6 +-
 tools/net/sunrpc/xdrgen/xdr_parse.py               |  14 +++
 tools/net/sunrpc/xdrgen/xdrgen                     |   6 ++
 10 files changed, 156 insertions(+), 28 deletions(-)

(limited to 'tools')

diff --git a/fs/nfsd/nfs4xdr_gen.c b/fs/nfsd/nfs4xdr_gen.c
index a17b5d8e60b3..1e5e2243625c 100644
--- a/fs/nfsd/nfs4xdr_gen.c
+++ b/fs/nfsd/nfs4xdr_gen.c
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 // Generated by xdrgen. Manual edits will be lost.
 // XDR specification file: ../../Documentation/sunrpc/xdr/nfs4_1.x
-// XDR specification modification time: Mon Oct 14 09:10:13 2024
+// XDR specification modification time: Thu Dec 25 13:44:43 2025
 
 #include <linux/sunrpc/svc.h>
 
@@ -11,13 +11,13 @@ static bool __maybe_unused
 xdrgen_decode_int64_t(struct xdr_stream *xdr, int64_t *ptr)
 {
 	return xdrgen_decode_hyper(xdr, ptr);
-};
+}
 
 static bool __maybe_unused
 xdrgen_decode_uint32_t(struct xdr_stream *xdr, uint32_t *ptr)
 {
 	return xdrgen_decode_unsigned_int(xdr, ptr);
-};
+}
 
 static bool __maybe_unused
 xdrgen_decode_bitmap4(struct xdr_stream *xdr, bitmap4 *ptr)
@@ -28,7 +28,7 @@ xdrgen_decode_bitmap4(struct xdr_stream *xdr, bitmap4 *ptr)
 		if (!xdrgen_decode_uint32_t(xdr, &ptr->element[i]))
 			return false;
 	return true;
-};
+}
 
 static bool __maybe_unused
 xdrgen_decode_nfstime4(struct xdr_stream *xdr, struct nfstime4 *ptr)
@@ -38,13 +38,13 @@ xdrgen_decode_nfstime4(struct xdr_stream *xdr, struct nfstime4 *ptr)
 	if (!xdrgen_decode_uint32_t(xdr, &ptr->nseconds))
 		return false;
 	return true;
-};
+}
 
 static bool __maybe_unused
 xdrgen_decode_fattr4_offline(struct xdr_stream *xdr, fattr4_offline *ptr)
 {
 	return xdrgen_decode_bool(xdr, ptr);
-};
+}
 
 static bool __maybe_unused
 xdrgen_decode_open_arguments4(struct xdr_stream *xdr, struct open_arguments4 *ptr)
@@ -60,7 +60,7 @@ xdrgen_decode_open_arguments4(struct xdr_stream *xdr, struct open_arguments4 *pt
 	if (!xdrgen_decode_bitmap4(xdr, &ptr->oa_create_mode))
 		return false;
 	return true;
-};
+}
 
 static bool __maybe_unused
 xdrgen_decode_open_args_share_access4(struct xdr_stream *xdr, open_args_share_access4 *ptr)
@@ -69,6 +69,15 @@ xdrgen_decode_open_args_share_access4(struct xdr_stream *xdr, open_args_share_ac
 
 	if (xdr_stream_decode_u32(xdr, &val) < 0)
 		return false;
+	/* Compiler may optimize to a range check for dense enums */
+	switch (val) {
+	case OPEN_ARGS_SHARE_ACCESS_READ:
+	case OPEN_ARGS_SHARE_ACCESS_WRITE:
+	case OPEN_ARGS_SHARE_ACCESS_BOTH:
+		break;
+	default:
+		return false;
+	}
 	*ptr = val;
 	return true;
 }
@@ -80,6 +89,16 @@ xdrgen_decode_open_args_share_deny4(struct xdr_stream *xdr, open_args_share_deny
 
 	if (xdr_stream_decode_u32(xdr, &val) < 0)
 		return false;
+	/* Compiler may optimize to a range check for dense enums */
+	switch (val) {
+	case OPEN_ARGS_SHARE_DENY_NONE:
+	case OPEN_ARGS_SHARE_DENY_READ:
+	case OPEN_ARGS_SHARE_DENY_WRITE:
+	case OPEN_ARGS_SHARE_DENY_BOTH:
+		break;
+	default:
+		return false;
+	}
 	*ptr = val;
 	return true;
 }
@@ -91,6 +110,19 @@ xdrgen_decode_open_args_share_access_want4(struct xdr_stream *xdr, open_args_sha
 
 	if (xdr_stream_decode_u32(xdr, &val) < 0)
 		return false;
+	/* Compiler may optimize to a range check for dense enums */
+	switch (val) {
+	case OPEN_ARGS_SHARE_ACCESS_WANT_ANY_DELEG:
+	case OPEN_ARGS_SHARE_ACCESS_WANT_NO_DELEG:
+	case OPEN_ARGS_SHARE_ACCESS_WANT_CANCEL:
+	case OPEN_ARGS_SHARE_ACCESS_WANT_SIGNAL_DELEG_WHEN_RESRC_AVAIL:
+	case OPEN_ARGS_SHARE_ACCESS_WANT_PUSH_DELEG_WHEN_UNCONTENDED:
+	case OPEN_ARGS_SHARE_ACCESS_WANT_DELEG_TIMESTAMPS:
+	case OPEN_ARGS_SHARE_ACCESS_WANT_OPEN_XOR_DELEGATION:
+		break;
+	default:
+		return false;
+	}
 	*ptr = val;
 	return true;
 }
@@ -102,6 +134,19 @@ xdrgen_decode_open_args_open_claim4(struct xdr_stream *xdr, open_args_open_claim
 
 	if (xdr_stream_decode_u32(xdr, &val) < 0)
 		return false;
+	/* Compiler may optimize to a range check for dense enums */
+	switch (val) {
+	case OPEN_ARGS_OPEN_CLAIM_NULL:
+	case OPEN_ARGS_OPEN_CLAIM_PREVIOUS:
+	case OPEN_ARGS_OPEN_CLAIM_DELEGATE_CUR:
+	case OPEN_ARGS_OPEN_CLAIM_DELEGATE_PREV:
+	case OPEN_ARGS_OPEN_CLAIM_FH:
+	case OPEN_ARGS_OPEN_CLAIM_DELEG_CUR_FH:
+	case OPEN_ARGS_OPEN_CLAIM_DELEG_PREV_FH:
+		break;
+	default:
+		return false;
+	}
 	*ptr = val;
 	return true;
 }
@@ -113,6 +158,16 @@ xdrgen_decode_open_args_createmode4(struct xdr_stream *xdr, open_args_createmode
 
 	if (xdr_stream_decode_u32(xdr, &val) < 0)
 		return false;
+	/* Compiler may optimize to a range check for dense enums */
+	switch (val) {
+	case OPEN_ARGS_CREATEMODE_UNCHECKED4:
+	case OPEN_ARGS_CREATE_MODE_GUARDED:
+	case OPEN_ARGS_CREATEMODE_EXCLUSIVE4:
+	case OPEN_ARGS_CREATE_MODE_EXCLUSIVE4_1:
+		break;
+	default:
+		return false;
+	}
 	*ptr = val;
 	return true;
 }
@@ -121,19 +176,19 @@ bool
 xdrgen_decode_fattr4_open_arguments(struct xdr_stream *xdr, fattr4_open_arguments *ptr)
 {
 	return xdrgen_decode_open_arguments4(xdr, ptr);
-};
+}
 
 bool
 xdrgen_decode_fattr4_time_deleg_access(struct xdr_stream *xdr, fattr4_time_deleg_access *ptr)
 {
 	return xdrgen_decode_nfstime4(xdr, ptr);
-};
+}
 
 bool
 xdrgen_decode_fattr4_time_deleg_modify(struct xdr_stream *xdr, fattr4_time_deleg_modify *ptr)
 {
 	return xdrgen_decode_nfstime4(xdr, ptr);
-};
+}
 
 static bool __maybe_unused
 xdrgen_decode_open_delegation_type4(struct xdr_stream *xdr, open_delegation_type4 *ptr)
@@ -142,6 +197,18 @@ xdrgen_decode_open_delegation_type4(struct xdr_stream *xdr, open_delegation_type
 
 	if (xdr_stream_decode_u32(xdr, &val) < 0)
 		return false;
+	/* Compiler may optimize to a range check for dense enums */
+	switch (val) {
+	case OPEN_DELEGATE_NONE:
+	case OPEN_DELEGATE_READ:
+	case OPEN_DELEGATE_WRITE:
+	case OPEN_DELEGATE_NONE_EXT:
+	case OPEN_DELEGATE_READ_ATTRS_DELEG:
+	case OPEN_DELEGATE_WRITE_ATTRS_DELEG:
+		break;
+	default:
+		return false;
+	}
 	*ptr = val;
 	return true;
 }
@@ -150,13 +217,13 @@ static bool __maybe_unused
 xdrgen_encode_int64_t(struct xdr_stream *xdr, const int64_t value)
 {
 	return xdrgen_encode_hyper(xdr, value);
-};
+}
 
 static bool __maybe_unused
 xdrgen_encode_uint32_t(struct xdr_stream *xdr, const uint32_t value)
 {
 	return xdrgen_encode_unsigned_int(xdr, value);
-};
+}
 
 static bool __maybe_unused
 xdrgen_encode_bitmap4(struct xdr_stream *xdr, const bitmap4 value)
@@ -167,7 +234,7 @@ xdrgen_encode_bitmap4(struct xdr_stream *xdr, const bitmap4 value)
 		if (!xdrgen_encode_uint32_t(xdr, value.element[i]))
 			return false;
 	return true;
-};
+}
 
 static bool __maybe_unused
 xdrgen_encode_nfstime4(struct xdr_stream *xdr, const struct nfstime4 *value)
@@ -177,13 +244,13 @@ xdrgen_encode_nfstime4(struct xdr_stream *xdr, const struct nfstime4 *value)
 	if (!xdrgen_encode_uint32_t(xdr, value->nseconds))
 		return false;
 	return true;
-};
+}
 
 static bool __maybe_unused
 xdrgen_encode_fattr4_offline(struct xdr_stream *xdr, const fattr4_offline value)
 {
 	return xdrgen_encode_bool(xdr, value);
-};
+}
 
 static bool __maybe_unused
 xdrgen_encode_open_arguments4(struct xdr_stream *xdr, const struct open_arguments4 *value)
@@ -199,7 +266,7 @@ xdrgen_encode_open_arguments4(struct xdr_stream *xdr, const struct open_argument
 	if (!xdrgen_encode_bitmap4(xdr, value->oa_create_mode))
 		return false;
 	return true;
-};
+}
 
 static bool __maybe_unused
 xdrgen_encode_open_args_share_access4(struct xdr_stream *xdr, open_args_share_access4 value)
@@ -235,19 +302,19 @@ bool
 xdrgen_encode_fattr4_open_arguments(struct xdr_stream *xdr, const fattr4_open_arguments *value)
 {
 	return xdrgen_encode_open_arguments4(xdr, value);
-};
+}
 
 bool
 xdrgen_encode_fattr4_time_deleg_access(struct xdr_stream *xdr, const fattr4_time_deleg_access *value)
 {
 	return xdrgen_encode_nfstime4(xdr, value);
-};
+}
 
 bool
 xdrgen_encode_fattr4_time_deleg_modify(struct xdr_stream *xdr, const fattr4_time_deleg_modify *value)
 {
 	return xdrgen_encode_nfstime4(xdr, value);
-};
+}
 
 static bool __maybe_unused
 xdrgen_encode_open_delegation_type4(struct xdr_stream *xdr, open_delegation_type4 value)
diff --git a/fs/nfsd/nfs4xdr_gen.h b/fs/nfsd/nfs4xdr_gen.h
index 41a0033b7256..47437876e803 100644
--- a/fs/nfsd/nfs4xdr_gen.h
+++ b/fs/nfsd/nfs4xdr_gen.h
@@ -1,7 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 /* Generated by xdrgen. Manual edits will be lost. */
 /* XDR specification file: ../../Documentation/sunrpc/xdr/nfs4_1.x */
-/* XDR specification modification time: Mon Oct 14 09:10:13 2024 */
+/* XDR specification modification time: Thu Dec 25 13:44:43 2025 */
 
 #ifndef _LINUX_XDRGEN_NFS4_1_DECL_H
 #define _LINUX_XDRGEN_NFS4_1_DECL_H
diff --git a/include/linux/sunrpc/xdrgen/nfs4_1.h b/include/linux/sunrpc/xdrgen/nfs4_1.h
index cf21a14aa885..352bffda08f7 100644
--- a/include/linux/sunrpc/xdrgen/nfs4_1.h
+++ b/include/linux/sunrpc/xdrgen/nfs4_1.h
@@ -1,7 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 /* Generated by xdrgen. Manual edits will be lost. */
 /* XDR specification file: ../../Documentation/sunrpc/xdr/nfs4_1.x */
-/* XDR specification modification time: Mon Oct 14 09:10:13 2024 */
+/* XDR specification modification time: Thu Dec 25 13:44:43 2025 */
 
 #ifndef _LINUX_XDRGEN_NFS4_1_DEF_H
 #define _LINUX_XDRGEN_NFS4_1_DEF_H
@@ -40,6 +40,7 @@ enum open_args_share_access4 {
 	OPEN_ARGS_SHARE_ACCESS_WRITE = 2,
 	OPEN_ARGS_SHARE_ACCESS_BOTH = 3,
 };
+
 typedef enum open_args_share_access4 open_args_share_access4;
 
 enum open_args_share_deny4 {
@@ -48,6 +49,7 @@ enum open_args_share_deny4 {
 	OPEN_ARGS_SHARE_DENY_WRITE = 2,
 	OPEN_ARGS_SHARE_DENY_BOTH = 3,
 };
+
 typedef enum open_args_share_deny4 open_args_share_deny4;
 
 enum open_args_share_access_want4 {
@@ -59,6 +61,7 @@ enum open_args_share_access_want4 {
 	OPEN_ARGS_SHARE_ACCESS_WANT_DELEG_TIMESTAMPS = 20,
 	OPEN_ARGS_SHARE_ACCESS_WANT_OPEN_XOR_DELEGATION = 21,
 };
+
 typedef enum open_args_share_access_want4 open_args_share_access_want4;
 
 enum open_args_open_claim4 {
@@ -70,6 +73,7 @@ enum open_args_open_claim4 {
 	OPEN_ARGS_OPEN_CLAIM_DELEG_CUR_FH = 5,
 	OPEN_ARGS_OPEN_CLAIM_DELEG_PREV_FH = 6,
 };
+
 typedef enum open_args_open_claim4 open_args_open_claim4;
 
 enum open_args_createmode4 {
@@ -78,6 +82,7 @@ enum open_args_createmode4 {
 	OPEN_ARGS_CREATEMODE_EXCLUSIVE4 = 2,
 	OPEN_ARGS_CREATE_MODE_EXCLUSIVE4_1 = 3,
 };
+
 typedef enum open_args_createmode4 open_args_createmode4;
 
 typedef struct open_arguments4 fattr4_open_arguments;
@@ -124,6 +129,7 @@ enum open_delegation_type4 {
 	OPEN_DELEGATE_READ_ATTRS_DELEG = 4,
 	OPEN_DELEGATE_WRITE_ATTRS_DELEG = 5,
 };
+
 typedef enum open_delegation_type4 open_delegation_type4;
 
 #define NFS4_int64_t_sz                 \
diff --git a/tools/net/sunrpc/xdrgen/generators/enum.py b/tools/net/sunrpc/xdrgen/generators/enum.py
index e62f715d3996..b4ed3ed6431e 100644
--- a/tools/net/sunrpc/xdrgen/generators/enum.py
+++ b/tools/net/sunrpc/xdrgen/generators/enum.py
@@ -5,6 +5,7 @@
 
 from generators import SourceGenerator, create_jinja2_environment
 from xdr_ast import _XdrEnum, public_apis, big_endian, get_header_name
+from xdr_parse import get_xdr_enum_validation
 
 
 class XdrEnumGenerator(SourceGenerator):
@@ -42,7 +43,13 @@ class XdrEnumGenerator(SourceGenerator):
             template = self.environment.get_template("decoder/enum_be.j2")
         else:
             template = self.environment.get_template("decoder/enum.j2")
-        print(template.render(name=node.name))
+        print(
+            template.render(
+                name=node.name,
+                enumerators=node.enumerators,
+                validate=get_xdr_enum_validation(),
+            )
+        )
 
     def emit_encoder(self, node: _XdrEnum) -> None:
         """Emit one encoder function for an XDR enum type"""
diff --git a/tools/net/sunrpc/xdrgen/subcmds/source.py b/tools/net/sunrpc/xdrgen/subcmds/source.py
index 08c883f547d7..6508563494fe 100644
--- a/tools/net/sunrpc/xdrgen/subcmds/source.py
+++ b/tools/net/sunrpc/xdrgen/subcmds/source.py
@@ -22,7 +22,7 @@ from xdr_ast import transform_parse_tree, _RpcProgram, Specification
 from xdr_ast import _XdrAst, _XdrEnum, _XdrPointer
 from xdr_ast import _XdrStruct, _XdrTypedef, _XdrUnion
 
-from xdr_parse import xdr_parser, set_xdr_annotate
+from xdr_parse import xdr_parser, set_xdr_annotate, set_xdr_enum_validation
 from xdr_parse import make_error_handler, XdrParseError
 from xdr_parse import handle_transform_error
 
@@ -98,6 +98,7 @@ def subcmd(args: Namespace) -> int:
     """Generate encoder and decoder functions"""
 
     set_xdr_annotate(args.annotate)
+    set_xdr_enum_validation(not args.no_enum_validation)
     parser = xdr_parser()
     with open(args.filename, encoding="utf-8") as f:
         source = f.read()
diff --git a/tools/net/sunrpc/xdrgen/templates/C/enum/decoder/enum.j2 b/tools/net/sunrpc/xdrgen/templates/C/enum/decoder/enum.j2
index 6482984f1cb7..735a34157fdf 100644
--- a/tools/net/sunrpc/xdrgen/templates/C/enum/decoder/enum.j2
+++ b/tools/net/sunrpc/xdrgen/templates/C/enum/decoder/enum.j2
@@ -14,6 +14,17 @@ xdrgen_decode_{{ name }}(struct xdr_stream *xdr, {{ name }} *ptr)
 
 	if (xdr_stream_decode_u32(xdr, &val) < 0)
 		return false;
+{% if validate and enumerators %}
+	/* Compiler may optimize to a range check for dense enums */
+	switch (val) {
+{% for e in enumerators %}
+	case {{ e.name }}:
+{% endfor %}
+		break;
+	default:
+		return false;
+	}
+{% endif %}
 	*ptr = val;
 	return true;
 }
diff --git a/tools/net/sunrpc/xdrgen/templates/C/enum/decoder/enum_be.j2 b/tools/net/sunrpc/xdrgen/templates/C/enum/decoder/enum_be.j2
index 44c391c10b42..82782a510d47 100644
--- a/tools/net/sunrpc/xdrgen/templates/C/enum/decoder/enum_be.j2
+++ b/tools/net/sunrpc/xdrgen/templates/C/enum/decoder/enum_be.j2
@@ -10,5 +10,25 @@ static bool __maybe_unused
 {% endif %}
 xdrgen_decode_{{ name }}(struct xdr_stream *xdr, {{ name }} *ptr)
 {
+{% if validate and enumerators %}
+	__be32 raw;
+	u32 val;
+
+	if (xdr_stream_decode_be32(xdr, &raw) < 0)
+		return false;
+	val = be32_to_cpu(raw);
+	/* Compiler may optimize to a range check for dense enums */
+	switch (val) {
+{% for e in enumerators %}
+	case {{ e.name }}:
+{% endfor %}
+		break;
+	default:
+		return false;
+	}
+	*ptr = raw;
+	return true;
+{% else %}
 	return xdr_stream_decode_be32(xdr, ptr) == 0;
+{% endif %}
 }
diff --git a/tools/net/sunrpc/xdrgen/xdr_ast.py b/tools/net/sunrpc/xdrgen/xdr_ast.py
index 2b5d160a0a60..dc2fa9fd8ec2 100644
--- a/tools/net/sunrpc/xdrgen/xdr_ast.py
+++ b/tools/net/sunrpc/xdrgen/xdr_ast.py
@@ -330,8 +330,6 @@ class _XdrEnum(_XdrAst):
     """An XDR enum definition"""
 
     name: str
-    minimum: int
-    maximum: int
     enumerators: List[_XdrEnumerator]
 
     def max_width(self) -> int:
@@ -572,8 +570,6 @@ class ParseToAst(Transformer):
         value = children[1].value
         return _XdrConstant(name, value)
 
-    # cel: Python can compute a min() and max() for the enumerator values
-    #      so that the generated code can perform proper range checking.
     def enum(self, children):
         """Instantiate one _XdrEnum object"""
         enum_name = children[0].symbol
@@ -587,7 +583,7 @@ class ParseToAst(Transformer):
             enumerators.append(_XdrEnumerator(name, value))
             i = i + 2
 
-        return _XdrEnum(enum_name, 0, 0, enumerators)
+        return _XdrEnum(enum_name, enumerators)
 
     def fixed_length_opaque(self, children):
         """Instantiate one _XdrFixedLengthOpaque declaration object"""
diff --git a/tools/net/sunrpc/xdrgen/xdr_parse.py b/tools/net/sunrpc/xdrgen/xdr_parse.py
index 38724ad5aea2..241e96c1fdd9 100644
--- a/tools/net/sunrpc/xdrgen/xdr_parse.py
+++ b/tools/net/sunrpc/xdrgen/xdr_parse.py
@@ -13,6 +13,9 @@ from lark.exceptions import UnexpectedInput, UnexpectedToken, VisitError
 # Set to True to emit annotation comments in generated source
 annotate = False
 
+# Set to True to emit enum value validation in decoders
+enum_validation = True
+
 # Map internal Lark token names to human-readable names
 TOKEN_NAMES = {
     "__ANON_0": "identifier",
@@ -49,6 +52,17 @@ def get_xdr_annotate() -> bool:
     return annotate
 
 
+def set_xdr_enum_validation(set_it: bool) -> None:
+    """Set 'enum_validation' based on command line options"""
+    global enum_validation
+    enum_validation = set_it
+
+
+def get_xdr_enum_validation() -> bool:
+    """Return True when enum validation is enabled for decoder generation"""
+    return enum_validation
+
+
 def make_error_handler(source: str, filename: str) -> Callable[[UnexpectedInput], bool]:
     """Create an error handler that reports the first parse error and aborts.
 
diff --git a/tools/net/sunrpc/xdrgen/xdrgen b/tools/net/sunrpc/xdrgen/xdrgen
index e22638f8324b..b2fb43f4a2ec 100755
--- a/tools/net/sunrpc/xdrgen/xdrgen
+++ b/tools/net/sunrpc/xdrgen/xdrgen
@@ -123,6 +123,12 @@ There is NO WARRANTY, to the extent permitted by law.""",
         help="Generate code for client or server side",
         type=str,
     )
+    source_parser.add_argument(
+        "--no-enum-validation",
+        action="store_true",
+        default=False,
+        help="Disable enum value validation in decoders",
+    )
     source_parser.add_argument("filename", help="File containing an XDR specification")
     source_parser.set_defaults(func=source.subcmd)
 
-- 
cgit v1.2.3


From 78980b4c7fcb5ef74b7af65fbef5ce8d718cf791 Mon Sep 17 00:00:00 2001
From: Leon Hwang <leon.hwang@linux.dev>
Date: Mon, 19 Jan 2026 21:34:17 +0800
Subject: selftests/bpf: Harden cpu flags test for lru_percpu_hash map

CI occasionally reports failures in the
percpu_alloc/cpu_flag_lru_percpu_hash selftest, for example:

 First test_progs failure (test_progs_no_alu32-x86_64-llvm-21):
 #264/15 percpu_alloc/cpu_flag_lru_percpu_hash
 ...
 test_percpu_map_op_cpu_flag:FAIL:bpf_map_lookup_batch value on specified cpu unexpected bpf_map_lookup_batch value on specified cpu: actual 0 != expected 3735929054

The unexpected value indicates that an element was removed from the map.
However, the test never calls delete_elem(), so the only possible cause
is LRU eviction.

This can happen when the current task migrates to another CPU: an
update_elem() triggers eviction because there is no available LRU node
on local freelist and global freelist.

Harden the test against this behavior by provisioning sufficient spare
elements. Set max_entries to 'nr_cpus * 2' and restrict the test to using
the first nr_cpus entries, ensuring that updates do not spuriously trigger
LRU eviction.

Signed-off-by: Leon Hwang <leon.hwang@linux.dev>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://patch.msgid.link/20260119133417.19739-1-leon.hwang@linux.dev
---
 tools/testing/selftests/bpf/prog_tests/percpu_alloc.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/percpu_alloc.c b/tools/testing/selftests/bpf/prog_tests/percpu_alloc.c
index c1d0949f093f..a72ae0b29f6e 100644
--- a/tools/testing/selftests/bpf/prog_tests/percpu_alloc.c
+++ b/tools/testing/selftests/bpf/prog_tests/percpu_alloc.c
@@ -236,6 +236,8 @@ static void test_percpu_map_op_cpu_flag(struct bpf_map *map, void *keys, size_t
 		err = bpf_map_update_batch(map_fd, keys, values, &count, &batch_opts);
 		if (!ASSERT_OK(err, "bpf_map_update_batch all_cpus"))
 			goto out;
+		if (!ASSERT_EQ(count, entries, "bpf_map_update_batch count"))
+			goto out;
 
 		/* update values on specified CPU */
 		for (i = 0; i < entries; i++)
@@ -246,6 +248,8 @@ static void test_percpu_map_op_cpu_flag(struct bpf_map *map, void *keys, size_t
 		err = bpf_map_update_batch(map_fd, keys, values, &count, &batch_opts);
 		if (!ASSERT_OK(err, "bpf_map_update_batch specified cpu"))
 			goto out;
+		if (!ASSERT_EQ(count, entries, "bpf_map_update_batch count"))
+			goto out;
 
 		/* lookup values on specified CPU */
 		batch = 0;
@@ -254,6 +258,8 @@ static void test_percpu_map_op_cpu_flag(struct bpf_map *map, void *keys, size_t
 		err = bpf_map_lookup_batch(map_fd, NULL, &batch, keys, values, &count, &batch_opts);
 		if (!ASSERT_TRUE(!err || err == -ENOENT, "bpf_map_lookup_batch specified cpu"))
 			goto out;
+		if (!ASSERT_EQ(count, entries, "bpf_map_lookup_batch count"))
+			goto out;
 
 		for (i = 0; i < entries; i++)
 			if (!ASSERT_EQ(values[i], value,
@@ -269,6 +275,8 @@ static void test_percpu_map_op_cpu_flag(struct bpf_map *map, void *keys, size_t
 					   &batch_opts);
 		if (!ASSERT_TRUE(!err || err == -ENOENT, "bpf_map_lookup_batch all_cpus"))
 			goto out;
+		if (!ASSERT_EQ(count, entries, "bpf_map_lookup_batch count"))
+			goto out;
 
 		for (i = 0; i < entries; i++) {
 			values_row = (void *) values_percpu +
@@ -287,7 +295,6 @@ out:
 	free(values);
 }
 
-
 static void test_percpu_map_cpu_flag(enum bpf_map_type map_type)
 {
 	struct percpu_alloc_array *skel;
@@ -300,7 +307,7 @@ static void test_percpu_map_cpu_flag(enum bpf_map_type map_type)
 	if (!ASSERT_GT(nr_cpus, 0, "libbpf_num_possible_cpus"))
 		return;
 
-	max_entries = nr_cpus + 1;
+	max_entries = nr_cpus * 2;
 	keys = calloc(max_entries, key_sz);
 	if (!ASSERT_OK_PTR(keys, "calloc keys"))
 		return;
@@ -322,7 +329,7 @@ static void test_percpu_map_cpu_flag(enum bpf_map_type map_type)
 	if (!ASSERT_OK(err, "test_percpu_alloc__load"))
 		goto out;
 
-	test_percpu_map_op_cpu_flag(map, keys, key_sz, max_entries - 1, nr_cpus, true);
+	test_percpu_map_op_cpu_flag(map, keys, key_sz, nr_cpus, nr_cpus, true);
 out:
 	percpu_alloc_array__destroy(skel);
 	free(keys);
-- 
cgit v1.2.3


From 096b86ce08332fbcb0ec6ff6714c44899ec03970 Mon Sep 17 00:00:00 2001
From: Leo Yan <leo.yan@arm.com>
Date: Thu, 8 Jan 2026 09:43:24 +0000
Subject: tools headers: Go back to include asm-generic/unistd.h for arm64

The header unistd.h is included under Arm64's uAPI folder (see
tools/arch/arm64/include/uapi/asm/), but it does not include its
dependent header unistd_64.h.

The intention is for unistd_64.h to be generated dynamically using
scripts/Makefile.asm-headers.

However, this dynamic approach causes problems because the header is not
available early enough, even though it is widely included throughout
tools.

Using the perf build as an example:

 1) Feature detection: Perf first runs feature tests.

    The BPF feature program test-bpf.c includes unistd.h.  Since
    unistd_64.h has not been generated yet, the program fails to build,
    and the BPF feature ends up being disabled.

 2) libperf build:

    The libperf Makefile later generates unistd_64.h on the fly, so
    libperf itself builds successfully.

 3) Final perf build:

    Although the perf binary can build successfully using the generated
    header, we never get a chance to build BPF skeleton programs,
    because BPF support was already disabled earlier.

Restore to include asm-generic/unistd.h for fixing the issue.  This
aligns with most architectures (x86 is a special case that keeps
unistd_32.h/unistd_64.h for its particular syscall numbers) and ensures
the header is available from the start.

Fixes: 22f72088ffe69a37 ("tools headers: Update the syscall table with the kernel sources")
Reviewed-by: James Clark <james.clark@linaro.org>
Signed-off-by: Leo Yan <leo.yan@arm.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Ian Rogers <irogers@google.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/arch/arm64/include/uapi/asm/unistd.h | 24 +++++++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/arch/arm64/include/uapi/asm/unistd.h b/tools/arch/arm64/include/uapi/asm/unistd.h
index df36f23876e8..9306726337fe 100644
--- a/tools/arch/arm64/include/uapi/asm/unistd.h
+++ b/tools/arch/arm64/include/uapi/asm/unistd.h
@@ -1,2 +1,24 @@
 /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#include <asm/unistd_64.h>
+/*
+ * Copyright (C) 2012 ARM Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#define __ARCH_WANT_RENAMEAT
+#define __ARCH_WANT_NEW_STAT
+#define __ARCH_WANT_SET_GET_RLIMIT
+#define __ARCH_WANT_TIME32_SYSCALLS
+#define __ARCH_WANT_MEMFD_SECRET
+
+#include <asm-generic/unistd.h>
-- 
cgit v1.2.3


From 129bb23a6f7d022610f902b57d36d69d7d210128 Mon Sep 17 00:00:00 2001
From: Leo Yan <leo.yan@arm.com>
Date: Thu, 8 Jan 2026 09:43:25 +0000
Subject: Revert "perf tools: Fix arm64 build by generating unistd_64.h"

This reverts:

commit 8988c4b91945173a ("perf tools: Fix in-source libperf build")
commit bfb713ea53c746b0 ("perf tools: Fix arm64 build by generating unistd_64.h")

Since we now have a static unistd_64.h for the arm64 build, there is no
need to generate unistd_64.h in libperf.  Revert all patches related to
generating unistd_64.h.

Reviewed-by: James Clark <james.clark@linaro.org>
Signed-off-by: Leo Yan <leo.yan@arm.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Ian Rogers <irogers@google.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/lib/perf/Makefile    | 14 ++------------
 tools/perf/Makefile.config |  1 -
 2 files changed, 2 insertions(+), 13 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/perf/Makefile b/tools/lib/perf/Makefile
index 27e6490f64dc..9692d0742ed0 100644
--- a/tools/lib/perf/Makefile
+++ b/tools/lib/perf/Makefile
@@ -42,7 +42,6 @@ libdir_relative_SQ = $(subst ','\'',$(libdir_relative))
 TEST_ARGS := $(if $(V),-v)
 
 INCLUDES = \
--I$(OUTPUT)arch/$(SRCARCH)/include/generated/uapi \
 -I$(srctree)/tools/lib/perf/include \
 -I$(srctree)/tools/lib/ \
 -I$(srctree)/tools/include \
@@ -100,16 +99,7 @@ $(LIBAPI)-clean:
 	$(call QUIET_CLEAN, libapi)
 	$(Q)$(MAKE) -C $(LIB_DIR) O=$(OUTPUT) clean >/dev/null
 
-uapi-asm := $(OUTPUT)arch/$(SRCARCH)/include/generated/uapi/asm
-ifeq ($(SRCARCH),arm64)
-	syscall-y := $(uapi-asm)/unistd_64.h
-endif
-uapi-asm-generic:
-	$(if $(syscall-y),\
-		$(Q)$(MAKE) -f $(srctree)/scripts/Makefile.asm-headers obj=$(uapi-asm) \
-		generic=include/uapi/asm-generic $(syscall-y),)
-
-$(LIBPERF_IN): uapi-asm-generic FORCE
+$(LIBPERF_IN): FORCE
 	$(Q)$(MAKE) $(build)=libperf
 
 $(LIBPERF_A): $(LIBPERF_IN)
@@ -130,7 +120,7 @@ all: fixdep
 clean: $(LIBAPI)-clean
 	$(call QUIET_CLEAN, libperf) $(RM) $(LIBPERF_A) \
                 *.o *~ *.a *.so *.so.$(VERSION) *.so.$(LIBPERF_VERSION) .*.d .*.cmd tests/*.o LIBPERF-CFLAGS $(LIBPERF_PC) \
-                $(TESTS_STATIC) $(TESTS_SHARED) $(syscall-y)
+                $(TESTS_STATIC) $(TESTS_SHARED)
 
 TESTS_IN = tests-in.o
 
diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config
index 5e4ae775987f..63ca9b2be663 100644
--- a/tools/perf/Makefile.config
+++ b/tools/perf/Makefile.config
@@ -64,7 +64,6 @@ include $(srctree)/tools/scripts/Makefile.arch
 $(call detected_var,SRCARCH)
 
 CFLAGS += -I$(OUTPUT)arch/$(SRCARCH)/include/generated
-CFLAGS += -I$(OUTPUT)libperf/arch/$(SRCARCH)/include/generated/uapi
 
 # Additional ARCH settings for ppc
 ifeq ($(SRCARCH),powerpc)
-- 
cgit v1.2.3


From 9966b382d06733f7467484bb440d6db68b743207 Mon Sep 17 00:00:00 2001
From: Leo Yan <leo.yan@arm.com>
Date: Thu, 8 Jan 2026 09:43:26 +0000
Subject: tools headers: Don't check arm64's unistd.h

The arm64 unistd.h in tools now diverges from the kernel header.
Comparing the two headers is pointless, remove the check.

Reviewed-by: James Clark <james.clark@linaro.org>
Signed-off-by: Leo Yan <leo.yan@arm.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Ian Rogers <irogers@google.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/check-headers.sh | 1 -
 1 file changed, 1 deletion(-)

(limited to 'tools')

diff --git a/tools/perf/check-headers.sh b/tools/perf/check-headers.sh
index e0537f275da2..da3aca87457f 100755
--- a/tools/perf/check-headers.sh
+++ b/tools/perf/check-headers.sh
@@ -54,7 +54,6 @@ declare -a FILES=(
   "arch/s390/include/uapi/asm/kvm.h"
   "arch/s390/include/uapi/asm/sie.h"
   "arch/arm64/include/uapi/asm/kvm.h"
-  "arch/arm64/include/uapi/asm/unistd.h"
   "arch/alpha/include/uapi/asm/errno.h"
   "arch/mips/include/asm/errno.h"
   "arch/mips/include/uapi/asm/errno.h"
-- 
cgit v1.2.3


From dda5f926a1006c735b00ed5c27291fce64236656 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Mon, 26 Jan 2026 17:25:00 -0300
Subject: perf annotate: Fix BUILD_NONDISTRO=1 missing args->ms conversions to
 pointer

Fix a few missing conversions to pointer in the usage of 'struct
annotate_args' 'ms' member in symbol__disassemble_bpf_libbfd().

Fixes: 00419892bac28bf1 ("perf annotate: Fix args leak of map_symbol")
Reviewed-by: Ian Rogers <irogers@google.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/libbfd.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/libbfd.c b/tools/perf/util/libbfd.c
index 79f4528234a9..63ea3fb53e77 100644
--- a/tools/perf/util/libbfd.c
+++ b/tools/perf/util/libbfd.c
@@ -501,7 +501,7 @@ int symbol__disassemble_bpf_libbfd(struct symbol *sym __maybe_unused,
 	struct bpf_prog_info_node *info_node;
 	int len = sym->end - sym->start;
 	disassembler_ftype disassemble;
-	struct map *map = args->ms.map;
+	struct map *map = args->ms->map;
 	struct perf_bpil *info_linear;
 	struct disassemble_info info;
 	struct dso *dso = map__dso(map);
@@ -612,7 +612,7 @@ int symbol__disassemble_bpf_libbfd(struct symbol *sym __maybe_unused,
 			args->line = strdup(srcline);
 			args->line_nr = 0;
 			args->fileloc = NULL;
-			args->ms.sym  = sym;
+			args->ms->sym = sym;
 			dl = disasm_line__new(args);
 			if (dl) {
 				annotation_line__add(&dl->al,
@@ -624,7 +624,7 @@ int symbol__disassemble_bpf_libbfd(struct symbol *sym __maybe_unused,
 		args->line = buf + prev_buf_size;
 		args->line_nr = 0;
 		args->fileloc = NULL;
-		args->ms.sym  = sym;
+		args->ms->sym = sym;
 		dl = disasm_line__new(args);
 		if (dl)
 			annotation_line__add(&dl->al, &notes->src->source);
-- 
cgit v1.2.3


From 008603bda19b29687edce533e4c09acff68c1077 Mon Sep 17 00:00:00 2001
From: Thomas Richter <tmricht@linux.ibm.com>
Date: Mon, 26 Jan 2026 11:18:23 +0100
Subject: perf test: Fix test perf evlist for z/VM s390x

Perf test case 'perf evlist tests' fails on z/VM machines on s390.

The failure is causes by event cycles. This event is not available
on virtualized machines like z/VM on s390.

Change to software event cpu-clock to fix this.

    Output before:
      # ./perf test 78
      79: perf evlist tests              : FAILED!
      #

    Output after:
      # ./perf test 78
      79: perf evlist tests              : Ok
      #

Fixes: b04d2b9199129f4f ("perf test: Fix test case perf evlist tests for s390x")
Reviewed-by: Ian Rogers <irogers@google.com>
Reviewed-by: Jan Polensky <japo@linux.ibm.com>
Signed-off-by: Thomas Richter <tmricht@linux.ibm.com>
Tested-by: Jan Polensky <japo@linux.ibm.com>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Sumanth Korikkar <sumanthk@linux.ibm.com>
Cc: Thomas Richter <tmricht@linux.ibm.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/tests/shell/evlist.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/tests/shell/evlist.sh b/tools/perf/tests/shell/evlist.sh
index 5632be391710..8a22f4171c07 100755
--- a/tools/perf/tests/shell/evlist.sh
+++ b/tools/perf/tests/shell/evlist.sh
@@ -21,13 +21,13 @@ trap trap_cleanup EXIT TERM INT
 
 test_evlist_simple() {
 	echo "Simple evlist test"
-	if ! perf record -e cycles -o "${perfdata}" true 2> /dev/null
+	if ! perf record -e cpu-clock -o "${perfdata}" true 2> /dev/null
 	then
 		echo "Simple evlist [Failed record]"
 		err=1
 		return
 	fi
-	if ! perf evlist -i "${perfdata}" | grep -q "cycles"
+	if ! perf evlist -i "${perfdata}" | grep -q "cpu-clock"
 	then
 		echo "Simple evlist [Failed to list event]"
 		err=1
-- 
cgit v1.2.3


From 76b2cf07a6d2a836108f9c2486d76599f7adf6e8 Mon Sep 17 00:00:00 2001
From: Sandipan Das <sandipan.das@amd.com>
Date: Thu, 22 Jan 2026 13:39:46 +0530
Subject: perf vendor events amd: Fix Zen 5 MAB allocation events

The unit masks for PMCx041 vary across different generations of Zen
processors.

Fix the Zen 5 events based on PMCx041 as they incorrectly use the same
unit masks as that of Zen 4.

Fixes: 45c072f2537ab07b ("perf vendor events amd: Add Zen 5 core events")
Reported-by: Suyash Mahar <smahar@meta.com>
Reviewed-by: Ian Rogers <irogers@google.com>
Signed-off-by: Sandipan Das <sandipan.das@amd.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Ananth Narayan <ananth.narayan@amd.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ravi Bangoria <ravi.bangoria@amd.com>
Cc: Sandipan Das <sandipan.das@amd.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/pmu-events/arch/x86/amdzen5/load-store.json | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/pmu-events/arch/x86/amdzen5/load-store.json b/tools/perf/pmu-events/arch/x86/amdzen5/load-store.json
index ff6627a77805..06bbaea15925 100644
--- a/tools/perf/pmu-events/arch/x86/amdzen5/load-store.json
+++ b/tools/perf/pmu-events/arch/x86/amdzen5/load-store.json
@@ -70,19 +70,19 @@
     "EventName": "ls_mab_alloc.load_store_allocations",
     "EventCode": "0x41",
     "BriefDescription": "Miss Address Buffer (MAB) entries allocated by a Load-Store (LS) pipe for load-store allocations.",
-    "UMask": "0x3f"
+    "UMask": "0x07"
   },
   {
     "EventName": "ls_mab_alloc.hardware_prefetcher_allocations",
     "EventCode": "0x41",
     "BriefDescription": "Miss Address Buffer (MAB) entries allocated by a Load-Store (LS) pipe for hardware prefetcher allocations.",
-    "UMask": "0x40"
+    "UMask": "0x08"
   },
   {
     "EventName": "ls_mab_alloc.all_allocations",
     "EventCode": "0x41",
     "BriefDescription": "Miss Address Buffer (MAB) entries allocated by a Load-Store (LS) pipe for all types of allocations.",
-    "UMask": "0x7f"
+    "UMask": "0x0f"
   },
   {
     "EventName": "ls_dmnd_fills_from_sys.local_l2",
-- 
cgit v1.2.3


From 7d0ebeb6c0f735d4eddc679283a1de1dea2ae878 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Fri, 23 Jan 2026 14:22:06 -0800
Subject: perf dso: Factor out e_machine reading for use in thread

Factor out the resilient e_machine reading code in dso so that it may
be used in thread.

As there is no dso in that case, make the dso optional.

This makes some minor other changes as the swap type from the dso cannot
be ascertained.

Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Aditya Bodkhe <aditya.b1@linux.ibm.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Athira Rajeev <atrajeev@linux.ibm.com>
Cc: Chun-Tse Shao <ctshao@google.com>
Cc: Guo Ren <guoren@kernel.org>
Cc: Howard Chu <howardchu95@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sergei Trofimovich <slyich@gmail.com>
Cc: Shimin Guo <shimin.guo@skydio.com>
Cc: Stephen Brennan <stephen.s.brennan@oracle.com>
Cc: Swapnil Sapkal <swapnil.sapkal@amd.com>
Cc: Tianyou Li <tianyou.li@intel.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/dso.c    | 110 +++++++++++++++++++++++++++++------------------
 tools/perf/util/dso.h    |  10 +++--
 tools/perf/util/thread.c |   5 +--
 3 files changed, 75 insertions(+), 50 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/dso.c b/tools/perf/util/dso.c
index 3b272a6fae24..91c9f7cb9d8c 100644
--- a/tools/perf/util/dso.c
+++ b/tools/perf/util/dso.c
@@ -1203,6 +1203,68 @@ ssize_t dso__data_read_offset(struct dso *dso, struct machine *machine,
 	return data_read_write_offset(dso, machine, offset, data, size, true);
 }
 
+static enum dso_swap_type dso_swap_type__from_elf_data(unsigned char eidata)
+{
+	static const unsigned int endian = 1;
+
+	switch (eidata) {
+	case ELFDATA2LSB:
+		/* We are big endian, DSO is little endian. */
+		return (*(unsigned char const *)&endian != 1) ? DSO_SWAP__YES : DSO_SWAP__NO;
+	case ELFDATA2MSB:
+		/* We are little endian, DSO is big endian. */
+		return (*(unsigned char const *)&endian != 0) ? DSO_SWAP__YES : DSO_SWAP__NO;
+	default:
+		return DSO_SWAP__UNSET;
+	}
+}
+
+/* Reads e_machine from fd, optionally caching data in dso. */
+uint16_t dso__read_e_machine(struct dso *optional_dso, int fd)
+{
+	uint16_t e_machine = EM_NONE;
+	unsigned char e_ident[EI_NIDENT];
+	enum dso_swap_type swap_type;
+
+	_Static_assert(offsetof(Elf32_Ehdr, e_ident) == 0, "Unexpected offset");
+	_Static_assert(offsetof(Elf64_Ehdr, e_ident) == 0, "Unexpected offset");
+	if (pread(fd, &e_ident, sizeof(e_ident), 0) != sizeof(e_ident))
+		return EM_NONE; // Read failed.
+
+	if (memcmp(e_ident, ELFMAG, SELFMAG) != 0)
+		return EM_NONE; // Not an ELF file.
+
+	if (e_ident[EI_CLASS] == ELFCLASSNONE || e_ident[EI_CLASS] >= ELFCLASSNUM)
+		return EM_NONE; // Bad ELF class (32 or 64-bit objects).
+
+	if (e_ident[EI_VERSION] != EV_CURRENT)
+		return EM_NONE; // Bad ELF version.
+
+	swap_type = dso_swap_type__from_elf_data(e_ident[EI_DATA]);
+	if (swap_type == DSO_SWAP__UNSET)
+		return EM_NONE; // Bad ELF data encoding.
+
+	/* Cache the need for swapping. */
+	if (optional_dso) {
+		assert(dso__needs_swap(optional_dso) == DSO_SWAP__UNSET ||
+		       dso__needs_swap(optional_dso) == swap_type);
+		dso__set_needs_swap(optional_dso, swap_type);
+	}
+
+	{
+		_Static_assert(offsetof(Elf32_Ehdr, e_machine) == 18, "Unexpected offset");
+		_Static_assert(offsetof(Elf64_Ehdr, e_machine) == 18, "Unexpected offset");
+		if (pread(fd, &e_machine, sizeof(e_machine), 18) != sizeof(e_machine))
+			return EM_NONE; // e_machine read failed.
+	}
+
+	e_machine = DSO_SWAP_TYPE__SWAP(swap_type, uint16_t, e_machine);
+	if (e_machine >= EM_NUM)
+		return EM_NONE; // Bad ELF machine number.
+
+	return e_machine;
+}
+
 uint16_t dso__e_machine(struct dso *dso, struct machine *machine)
 {
 	uint16_t e_machine = EM_NONE;
@@ -1248,30 +1310,9 @@ uint16_t dso__e_machine(struct dso *dso, struct machine *machine)
 	 */
 	try_to_open_dso(dso, machine);
 	fd = dso__data(dso)->fd;
-	if (fd >= 0) {
-		unsigned char e_ident[EI_NIDENT];
-
-		_Static_assert(offsetof(Elf32_Ehdr, e_ident) == 0, "Unexpected offset");
-		_Static_assert(offsetof(Elf64_Ehdr, e_ident) == 0, "Unexpected offset");
-		if (pread(fd, &e_ident, sizeof(e_ident), 0) == sizeof(e_ident) &&
-		    memcmp(e_ident, ELFMAG, SELFMAG) == 0 &&
-		    e_ident[EI_CLASS] > ELFCLASSNONE && e_ident[EI_CLASS] < ELFCLASSNUM &&
-		    e_ident[EI_DATA] > ELFDATANONE && e_ident[EI_DATA] < ELFDATANUM &&
-		    e_ident[EI_VERSION] == EV_CURRENT) {
-			_Static_assert(offsetof(Elf32_Ehdr, e_machine) == 18, "Unexpected offset");
-			_Static_assert(offsetof(Elf64_Ehdr, e_machine) == 18, "Unexpected offset");
-
-			if (dso__needs_swap(dso) == DSO_SWAP__UNSET)
-				dso__swap_init(dso, e_ident[EI_DATA]);
-
-			if (dso__needs_swap(dso) != DSO_SWAP__UNSET &&
-			    pread(fd, &e_machine, sizeof(e_machine), 18) == sizeof(e_machine) &&
-			    e_machine < EM_NUM)
-				e_machine = DSO__SWAP(dso, uint16_t, e_machine);
-			else
-				e_machine = EM_NONE;
-		}
-	}
+	if (fd >= 0)
+		e_machine = dso__read_e_machine(dso, fd);
+
 	mutex_unlock(dso__data_open_lock());
 	return e_machine;
 }
@@ -1656,28 +1697,13 @@ void dso__put(struct dso *dso)
 
 int dso__swap_init(struct dso *dso, unsigned char eidata)
 {
-	static unsigned int const endian = 1;
-
-	dso__set_needs_swap(dso, DSO_SWAP__NO);
+	enum dso_swap_type type = dso_swap_type__from_elf_data(eidata);
 
-	switch (eidata) {
-	case ELFDATA2LSB:
-		/* We are big endian, DSO is little endian. */
-		if (*(unsigned char const *)&endian != 1)
-			dso__set_needs_swap(dso, DSO_SWAP__YES);
-		break;
-
-	case ELFDATA2MSB:
-		/* We are little endian, DSO is big endian. */
-		if (*(unsigned char const *)&endian != 0)
-			dso__set_needs_swap(dso, DSO_SWAP__YES);
-		break;
-
-	default:
+	dso__set_needs_swap(dso, type);
+	if (type == DSO_SWAP__UNSET) {
 		pr_err("unrecognized DSO data encoding %d\n", eidata);
 		return -EINVAL;
 	}
-
 	return 0;
 }
 
diff --git a/tools/perf/util/dso.h b/tools/perf/util/dso.h
index ac725bc8ea74..a95fee7d634b 100644
--- a/tools/perf/util/dso.h
+++ b/tools/perf/util/dso.h
@@ -160,12 +160,11 @@ enum dso_load_errno {
 	__DSO_LOAD_ERRNO__END,
 };
 
-#define DSO__SWAP(dso, type, val)				\
+#define DSO_SWAP_TYPE__SWAP(swap_type, type, val)		\
 ({								\
 	type ____r = val;					\
-	enum dso_swap_type ___dst = dso__needs_swap(dso);	\
-	BUG_ON(___dst == DSO_SWAP__UNSET);			\
-	if (___dst == DSO_SWAP__YES) {				\
+	BUG_ON(swap_type == DSO_SWAP__UNSET);			\
+	if (swap_type == DSO_SWAP__YES) {			\
 		switch (sizeof(____r)) {			\
 		case 2:						\
 			____r = bswap_16(val);			\
@@ -183,6 +182,8 @@ enum dso_load_errno {
 	____r;							\
 })
 
+#define DSO__SWAP(dso, type, val) DSO_SWAP_TYPE__SWAP(dso__needs_swap(dso), type, val)
+
 #define DSO__DATA_CACHE_SIZE 4096
 #define DSO__DATA_CACHE_MASK ~(DSO__DATA_CACHE_SIZE - 1)
 
@@ -865,6 +866,7 @@ int dso__data_file_size(struct dso *dso, struct machine *machine);
 off_t dso__data_size(struct dso *dso, struct machine *machine);
 ssize_t dso__data_read_offset(struct dso *dso, struct machine *machine,
 			      u64 offset, u8 *data, ssize_t size);
+uint16_t dso__read_e_machine(struct dso *optional_dso, int fd);
 uint16_t dso__e_machine(struct dso *dso, struct machine *machine);
 ssize_t dso__data_read_addr(struct dso *dso, struct map *map,
 			    struct machine *machine, u64 addr,
diff --git a/tools/perf/util/thread.c b/tools/perf/util/thread.c
index aa9c58bbf9d3..3642858e6cbc 100644
--- a/tools/perf/util/thread.c
+++ b/tools/perf/util/thread.c
@@ -458,10 +458,7 @@ static uint16_t read_proc_e_machine_for_pid(pid_t pid)
 	snprintf(path, sizeof(path), "/proc/%d/exe", pid);
 	fd = open(path, O_RDONLY);
 	if (fd >= 0) {
-		_Static_assert(offsetof(Elf32_Ehdr, e_machine) == 18, "Unexpected offset");
-		_Static_assert(offsetof(Elf64_Ehdr, e_machine) == 18, "Unexpected offset");
-		if (pread(fd, &e_machine, sizeof(e_machine), 18) != sizeof(e_machine))
-			e_machine = EM_NONE;
+		e_machine = dso__read_e_machine(/*optional_dso=*/NULL, fd);
 		close(fd);
 	}
 	return e_machine;
-- 
cgit v1.2.3


From 4e66527f8859a6614a3a8afd11778c832a30ebbb Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Fri, 23 Jan 2026 14:22:07 -0800
Subject: perf thread: Add optional e_flags output argument to
 thread__e_machine

The e_flags are needed to accurately compute complete perf register
information for CSKY.

Add the ability to read and have this value associated with a thread.

This change doesn't wire up the use of the e_flags except in disasm
where use already exists but just wasn't set up yet.

Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Aditya Bodkhe <aditya.b1@linux.ibm.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Athira Rajeev <atrajeev@linux.ibm.com>
Cc: Chun-Tse Shao <ctshao@google.com>
Cc: Guo Ren <guoren@kernel.org>
Cc: Howard Chu <howardchu95@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sergei Trofimovich <slyich@gmail.com>
Cc: Shimin Guo <shimin.guo@skydio.com>
Cc: Stephen Brennan <stephen.s.brennan@oracle.com>
Cc: Swapnil Sapkal <swapnil.sapkal@amd.com>
Cc: Tianyou Li <tianyou.li@intel.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-script.c                        | 14 +++--
 tools/perf/builtin-trace.c                         | 12 +++--
 tools/perf/util/annotate.c                         |  5 +-
 tools/perf/util/disasm.c                           |  5 +-
 tools/perf/util/disasm.h                           |  2 +-
 tools/perf/util/dso.c                              | 43 +++++++++++++---
 tools/perf/util/dso.h                              |  4 +-
 .../util/scripting-engines/trace-event-python.c    |  2 +-
 tools/perf/util/session.c                          |  4 +-
 tools/perf/util/thread.c                           | 59 +++++++++++++++-------
 tools/perf/util/thread.h                           | 16 +++++-
 tools/perf/util/unwind-libdw.c                     |  4 +-
 12 files changed, 122 insertions(+), 48 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index 372bede30230..8c0de27a9713 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -2504,11 +2504,17 @@ static void process_event(struct perf_script *script,
 				    symbol_conf.bt_stop_list, fp);
 	}
 
-	if (PRINT_FIELD(IREGS))
-		perf_sample__fprintf_iregs(sample, attr, thread__e_machine(thread, machine), fp);
+	if (PRINT_FIELD(IREGS)) {
+		perf_sample__fprintf_iregs(sample, attr,
+					   thread__e_machine(thread, machine, /*e_flags=*/NULL),
+					   fp);
+	}
 
-	if (PRINT_FIELD(UREGS))
-		perf_sample__fprintf_uregs(sample, attr, thread__e_machine(thread, machine), fp);
+	if (PRINT_FIELD(UREGS)) {
+		perf_sample__fprintf_uregs(sample, attr,
+					   thread__e_machine(thread, machine, /*e_flags=*/NULL),
+					   fp);
+	}
 
 	if (PRINT_FIELD(BRSTACK))
 		perf_sample__fprintf_brstack(sample, thread, evsel, fp);
diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
index 8df5ca44e4f9..311d9da9896a 100644
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -2789,7 +2789,7 @@ static int trace__sys_enter(struct trace *trace, struct evsel *evsel,
 	struct thread_trace *ttrace;
 
 	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
-	e_machine = thread__e_machine(thread, trace->host);
+	e_machine = thread__e_machine(thread, trace->host, /*e_flags=*/NULL);
 	sc = trace__syscall_info(trace, evsel, e_machine, id);
 	if (sc == NULL)
 		goto out_put;
@@ -2868,7 +2868,7 @@ static int trace__fprintf_sys_enter(struct trace *trace, struct evsel *evsel,
 
 
 	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
-	e_machine = thread__e_machine(thread, trace->host);
+	e_machine = thread__e_machine(thread, trace->host, /*e_flags=*/NULL);
 	sc = trace__syscall_info(trace, evsel, e_machine, id);
 	if (sc == NULL)
 		goto out_put;
@@ -2934,7 +2934,7 @@ static int trace__sys_exit(struct trace *trace, struct evsel *evsel,
 	struct thread_trace *ttrace;
 
 	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
-	e_machine = thread__e_machine(thread, trace->host);
+	e_machine = thread__e_machine(thread, trace->host, /*e_flags=*/NULL);
 	sc = trace__syscall_info(trace, evsel, e_machine, id);
 	if (sc == NULL)
 		goto out_put;
@@ -3285,7 +3285,9 @@ static int trace__event_handler(struct trace *trace, struct evsel *evsel,
 
 	if (evsel == trace->syscalls.events.bpf_output) {
 		int id = perf_evsel__sc_tp_uint(evsel, id, sample);
-		int e_machine = thread ? thread__e_machine(thread, trace->host) : EM_HOST;
+		int e_machine = thread
+			? thread__e_machine(thread, trace->host, /*e_flags=*/NULL)
+			: EM_HOST;
 		struct syscall *sc = trace__syscall_info(trace, evsel, e_machine, id);
 
 		if (sc) {
@@ -4916,7 +4918,7 @@ static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trac
 {
 	size_t printed = 0;
 	struct thread_trace *ttrace = thread__priv(thread);
-	int e_machine = thread__e_machine(thread, trace->host);
+	int e_machine = thread__e_machine(thread, trace->host, /*e_flags=*/NULL);
 	double ratio;
 
 	if (ttrace == NULL)
diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c
index c16c6dfaa959..880b1bd300c2 100644
--- a/tools/perf/util/annotate.c
+++ b/tools/perf/util/annotate.c
@@ -984,6 +984,7 @@ int thread__get_arch(struct thread *thread, const struct arch **parch)
 {
 	const struct arch *arch;
 	struct machine *machine;
+	uint32_t e_flags;
 	uint16_t e_machine;
 
 	if (!thread) {
@@ -992,8 +993,8 @@ int thread__get_arch(struct thread *thread, const struct arch **parch)
 	}
 
 	machine = maps__machine(thread__maps(thread));
-	e_machine = thread__e_machine(thread, machine);
-	arch = arch__find(e_machine, machine->env ? machine->env->cpuid : NULL);
+	e_machine = thread__e_machine(thread, machine, &e_flags);
+	arch = arch__find(e_machine, e_flags, machine->env ? machine->env->cpuid : NULL);
 	if (arch == NULL) {
 		pr_err("%s: unsupported arch %d\n", __func__, e_machine);
 		return errno;
diff --git a/tools/perf/util/disasm.c b/tools/perf/util/disasm.c
index 9b0ba1fc5aec..6b36287f30fe 100644
--- a/tools/perf/util/disasm.c
+++ b/tools/perf/util/disasm.c
@@ -134,7 +134,7 @@ static int arch__cmp(const void *a, const void *b)
 	return e_machine_and_eflags__cmp(&(*aa)->id, &(*ab)->id);
 }
 
-const struct arch *arch__find(uint16_t e_machine, const char *cpuid)
+const struct arch *arch__find(uint16_t e_machine, uint32_t e_flags, const char *cpuid)
 {
 	static const struct arch *(*const arch_new_fn[])(const struct e_machine_and_e_flags *id,
 							 const char *cpuid) = {
@@ -157,8 +157,7 @@ const struct arch *arch__find(uint16_t e_machine, const char *cpuid)
 	static size_t num_archs;
 	struct e_machine_and_e_flags key = {
 		.e_machine = e_machine,
-		// TODO: e_flags should really come from the same source as e_machine.
-		.e_flags = EF_HOST,
+		.e_flags = e_flags,
 	};
 	const struct arch *result = NULL, **tmp;
 
diff --git a/tools/perf/util/disasm.h b/tools/perf/util/disasm.h
index 6a1905f9d4fc..a6e478caf61a 100644
--- a/tools/perf/util/disasm.h
+++ b/tools/perf/util/disasm.h
@@ -108,7 +108,7 @@ struct annotate_args {
 	char			  *fileloc;
 };
 
-const struct arch *arch__find(uint16_t e_machine, const char *cpuid);
+const struct arch *arch__find(uint16_t e_machine, uint32_t e_flags, const char *cpuid);
 bool arch__is_x86(const struct arch *arch);
 bool arch__is_powerpc(const struct arch *arch);
 
diff --git a/tools/perf/util/dso.c b/tools/perf/util/dso.c
index 91c9f7cb9d8c..b791e1b6b2cf 100644
--- a/tools/perf/util/dso.c
+++ b/tools/perf/util/dso.c
@@ -1220,14 +1220,20 @@ static enum dso_swap_type dso_swap_type__from_elf_data(unsigned char eidata)
 }
 
 /* Reads e_machine from fd, optionally caching data in dso. */
-uint16_t dso__read_e_machine(struct dso *optional_dso, int fd)
+uint16_t dso__read_e_machine(struct dso *optional_dso, int fd, uint32_t *e_flags)
 {
 	uint16_t e_machine = EM_NONE;
 	unsigned char e_ident[EI_NIDENT];
 	enum dso_swap_type swap_type;
+	bool need_e_flags;
 
-	_Static_assert(offsetof(Elf32_Ehdr, e_ident) == 0, "Unexpected offset");
-	_Static_assert(offsetof(Elf64_Ehdr, e_ident) == 0, "Unexpected offset");
+	if (e_flags)
+		*e_flags = 0;
+
+	{
+		_Static_assert(offsetof(Elf32_Ehdr, e_ident) == 0, "Unexpected offset");
+		_Static_assert(offsetof(Elf64_Ehdr, e_ident) == 0, "Unexpected offset");
+	}
 	if (pread(fd, &e_ident, sizeof(e_ident), 0) != sizeof(e_ident))
 		return EM_NONE; // Read failed.
 
@@ -1254,18 +1260,35 @@ uint16_t dso__read_e_machine(struct dso *optional_dso, int fd)
 	{
 		_Static_assert(offsetof(Elf32_Ehdr, e_machine) == 18, "Unexpected offset");
 		_Static_assert(offsetof(Elf64_Ehdr, e_machine) == 18, "Unexpected offset");
-		if (pread(fd, &e_machine, sizeof(e_machine), 18) != sizeof(e_machine))
-			return EM_NONE; // e_machine read failed.
 	}
+	if (pread(fd, &e_machine, sizeof(e_machine), 18) != sizeof(e_machine))
+		return EM_NONE; // e_machine read failed.
 
 	e_machine = DSO_SWAP_TYPE__SWAP(swap_type, uint16_t, e_machine);
 	if (e_machine >= EM_NUM)
 		return EM_NONE; // Bad ELF machine number.
 
+#ifdef NDEBUG
+	/* In production code the e_flags are only needed on CSKY. */
+	need_e_flags = e_flags && e_machine == EM_CSKY;
+#else
+	/* Debug code will always read the e_flags. */
+	need_e_flags = e_flags != NULL;
+#endif
+	if (need_e_flags) {
+		off_t offset = e_ident[EI_CLASS] == ELFCLASS32
+			? offsetof(Elf32_Ehdr, e_flags)
+			: offsetof(Elf64_Ehdr, e_flags);
+
+		if (pread(fd, e_flags, sizeof(*e_flags), offset) != sizeof(*e_flags)) {
+			*e_flags = 0;
+			return EM_NONE; // e_flags read failed.
+		}
+	}
 	return e_machine;
 }
 
-uint16_t dso__e_machine(struct dso *dso, struct machine *machine)
+uint16_t dso__e_machine(struct dso *dso, struct machine *machine, uint32_t *e_flags)
 {
 	uint16_t e_machine = EM_NONE;
 	int fd;
@@ -1285,6 +1308,8 @@ uint16_t dso__e_machine(struct dso *dso, struct machine *machine)
 	case DSO_BINARY_TYPE__BPF_IMAGE:
 	case DSO_BINARY_TYPE__OOL:
 	case DSO_BINARY_TYPE__JAVA_JIT:
+		if (e_flags)
+			*e_flags = EF_HOST;
 		return EM_HOST;
 	case DSO_BINARY_TYPE__DEBUGLINK:
 	case DSO_BINARY_TYPE__BUILD_ID_CACHE:
@@ -1299,6 +1324,8 @@ uint16_t dso__e_machine(struct dso *dso, struct machine *machine)
 		break;
 	case DSO_BINARY_TYPE__NOT_FOUND:
 	default:
+		if (e_flags)
+			*e_flags = 0;
 		return EM_NONE;
 	}
 
@@ -1311,7 +1338,9 @@ uint16_t dso__e_machine(struct dso *dso, struct machine *machine)
 	try_to_open_dso(dso, machine);
 	fd = dso__data(dso)->fd;
 	if (fd >= 0)
-		e_machine = dso__read_e_machine(dso, fd);
+		e_machine = dso__read_e_machine(dso, fd, e_flags);
+	else if (e_flags)
+		*e_flags = 0;
 
 	mutex_unlock(dso__data_open_lock());
 	return e_machine;
diff --git a/tools/perf/util/dso.h b/tools/perf/util/dso.h
index a95fee7d634b..ede691e9a249 100644
--- a/tools/perf/util/dso.h
+++ b/tools/perf/util/dso.h
@@ -866,8 +866,8 @@ int dso__data_file_size(struct dso *dso, struct machine *machine);
 off_t dso__data_size(struct dso *dso, struct machine *machine);
 ssize_t dso__data_read_offset(struct dso *dso, struct machine *machine,
 			      u64 offset, u8 *data, ssize_t size);
-uint16_t dso__read_e_machine(struct dso *optional_dso, int fd);
-uint16_t dso__e_machine(struct dso *dso, struct machine *machine);
+uint16_t dso__read_e_machine(struct dso *optional_dso, int fd, uint32_t *e_flags);
+uint16_t dso__e_machine(struct dso *dso, struct machine *machine, uint32_t *e_flags);
 ssize_t dso__data_read_addr(struct dso *dso, struct map *map,
 			    struct machine *machine, u64 addr,
 			    u8 *data, ssize_t size);
diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c
index b90edc147796..50f0d16520cc 100644
--- a/tools/perf/util/scripting-engines/trace-event-python.c
+++ b/tools/perf/util/scripting-engines/trace-event-python.c
@@ -925,7 +925,7 @@ static PyObject *get_perf_sample_dict(struct perf_sample *sample,
 
 	if (al->thread) {
 		machine = maps__machine(thread__maps(al->thread));
-		e_machine = thread__e_machine(al->thread, machine);
+		e_machine = thread__e_machine(al->thread, machine, /*e_flags=*/NULL);
 	}
 	if (set_regs_in_dict(dict, sample, evsel, e_machine))
 		Py_FatalError("Failed to setting regs in dict");
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index c0231bc000e7..0e8a128d7c04 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -1124,7 +1124,7 @@ static void dump_sample(struct machine *machine, struct evsel *evsel, union perf
 	if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_REGS_INTR)) {
 		struct thread *thread = machine__find_thread(machine, sample->pid, sample->pid);
 
-		e_machine = thread__e_machine(thread, machine);
+		e_machine = thread__e_machine(thread, machine, /*e_flags=*/NULL);
 	}
 
 	printf("(IP, 0x%x): %d/%d: %#" PRIx64 " period: %" PRIu64 " addr: %#" PRIx64 "\n",
@@ -2965,7 +2965,7 @@ static int perf_session__e_machine_cb(struct thread *thread,
 	uint16_t *result = arg;
 	struct machine *machine = maps__machine(thread__maps(thread));
 
-	*result = thread__e_machine(thread, machine);
+	*result = thread__e_machine(thread, machine, /*e_flags=*/NULL);
 	return *result != EM_NONE ? 1 : 0;
 }
 
diff --git a/tools/perf/util/thread.c b/tools/perf/util/thread.c
index 3642858e6cbc..618f29afb160 100644
--- a/tools/perf/util/thread.c
+++ b/tools/perf/util/thread.c
@@ -449,7 +449,7 @@ void thread__find_cpumode_addr_location(struct thread *thread, u64 addr,
 	}
 }
 
-static uint16_t read_proc_e_machine_for_pid(pid_t pid)
+static uint16_t read_proc_e_machine_for_pid(pid_t pid, uint32_t *e_flags)
 {
 	char path[6 /* "/proc/" */ + 11 /* max length of pid */ + 5 /* "/exe\0" */];
 	int fd;
@@ -458,30 +458,46 @@ static uint16_t read_proc_e_machine_for_pid(pid_t pid)
 	snprintf(path, sizeof(path), "/proc/%d/exe", pid);
 	fd = open(path, O_RDONLY);
 	if (fd >= 0) {
-		e_machine = dso__read_e_machine(/*optional_dso=*/NULL, fd);
+		e_machine = dso__read_e_machine(/*optional_dso=*/NULL, fd, e_flags);
 		close(fd);
 	}
 	return e_machine;
 }
 
-static int thread__e_machine_callback(struct map *map, void *machine)
+struct thread__e_machine_callback_args {
+	struct machine *machine;
+	uint32_t e_flags;
+	uint16_t e_machine;
+};
+
+static int thread__e_machine_callback(struct map *map, void *_args)
 {
+	struct thread__e_machine_callback_args *args = _args;
 	struct dso *dso = map__dso(map);
 
-	_Static_assert(0 == EM_NONE, "Unexpected EM_NONE");
 	if (!dso)
-		return EM_NONE;
+		return 0; // No dso, continue search.
 
-	return dso__e_machine(dso, machine);
+	args->e_machine = dso__e_machine(dso, args->machine, &args->e_flags);
+	return args->e_machine != EM_NONE ? 1 /* stop search */ : 0 /* continue search */;
 }
 
-uint16_t thread__e_machine(struct thread *thread, struct machine *machine)
+uint16_t thread__e_machine(struct thread *thread, struct machine *machine, uint32_t *e_flags)
 {
 	pid_t tid, pid;
 	uint16_t e_machine = RC_CHK_ACCESS(thread)->e_machine;
+	uint32_t local_e_flags = 0;
+	struct thread__e_machine_callback_args args = {
+		.machine = machine,
+		.e_flags = 0,
+		.e_machine = EM_NONE,
+	};
 
-	if (e_machine != EM_NONE)
+	if (e_machine != EM_NONE) {
+		if (e_flags)
+			*e_flags = thread__e_flags(thread);
 		return e_machine;
+	}
 
 	tid = thread__tid(thread);
 	pid = thread__pid(thread);
@@ -489,18 +505,19 @@ uint16_t thread__e_machine(struct thread *thread, struct machine *machine)
 		struct thread *parent = machine__findnew_thread(machine, pid, pid);
 
 		if (parent) {
-			e_machine = thread__e_machine(parent, machine);
+			e_machine = thread__e_machine(parent, machine, &local_e_flags);
 			thread__put(parent);
-			thread__set_e_machine(thread, e_machine);
-			return e_machine;
+			goto out;
 		}
 		/* Something went wrong, fallback. */
 	}
 	/* Reading on the PID thread. First try to find from the maps. */
-	e_machine = maps__for_each_map(thread__maps(thread),
-				       thread__e_machine_callback,
-				       machine);
-	if (e_machine == EM_NONE) {
+	maps__for_each_map(thread__maps(thread), thread__e_machine_callback, &args);
+
+	if (args.e_machine != EM_NONE) {
+		e_machine = args.e_machine;
+		local_e_flags = args.e_flags;
+	} else {
 		/* Maps failed, perhaps we're live with map events disabled. */
 		bool is_live = machine->machines == NULL;
 
@@ -514,12 +531,18 @@ uint16_t thread__e_machine(struct thread *thread, struct machine *machine)
 		}
 		/* Read from /proc/pid/exe if live. */
 		if (is_live)
-			e_machine = read_proc_e_machine_for_pid(pid);
+			e_machine = read_proc_e_machine_for_pid(pid, &local_e_flags);
 	}
-	if (e_machine != EM_NONE)
+out:
+	if (e_machine != EM_NONE) {
 		thread__set_e_machine(thread, e_machine);
-	else
+		thread__set_e_flags(thread, local_e_flags);
+	} else {
 		e_machine = EM_HOST;
+		local_e_flags = EF_HOST;
+	}
+	if (e_flags)
+		*e_flags = local_e_flags;
 	return e_machine;
 }
 
diff --git a/tools/perf/util/thread.h b/tools/perf/util/thread.h
index 310eaea344bb..f5792d3e8a16 100644
--- a/tools/perf/util/thread.h
+++ b/tools/perf/util/thread.h
@@ -60,6 +60,10 @@ DECLARE_RC_STRUCT(thread) {
 	struct srccode_state	srccode_state;
 	bool			filter;
 	int			filter_entry_depth;
+	/**
+	 * @e_flags: The ELF EF_* associated with the thread. Valid if e_machine != EM_NONE.
+	 */
+	uint16_t		e_flags;
 	/**
 	 * @e_machine: The ELF EM_* associated with the thread. EM_NONE if not
 	 * computed.
@@ -307,13 +311,23 @@ static inline void thread__set_filter_entry_depth(struct thread *thread, int dep
 	RC_CHK_ACCESS(thread)->filter_entry_depth = depth;
 }
 
-uint16_t thread__e_machine(struct thread *thread, struct machine *machine);
+uint16_t thread__e_machine(struct thread *thread, struct machine *machine, uint32_t *e_flags);
 
 static inline void thread__set_e_machine(struct thread *thread, uint16_t e_machine)
 {
 	RC_CHK_ACCESS(thread)->e_machine = e_machine;
 }
 
+static inline uint32_t thread__e_flags(const struct thread *thread)
+{
+	return RC_CHK_ACCESS(thread)->e_flags;
+}
+
+static inline void thread__set_e_flags(struct thread *thread, uint32_t e_flags)
+{
+	RC_CHK_ACCESS(thread)->e_flags = e_flags;
+}
+
 
 static inline bool thread__lbr_stitch_enable(const struct thread *thread)
 {
diff --git a/tools/perf/util/unwind-libdw.c b/tools/perf/util/unwind-libdw.c
index 9cb0960ef905..3fdcfa06bf22 100644
--- a/tools/perf/util/unwind-libdw.c
+++ b/tools/perf/util/unwind-libdw.c
@@ -213,7 +213,7 @@ static bool memory_read(Dwfl *dwfl __maybe_unused, Dwarf_Addr addr, Dwarf_Word *
 {
 	struct dwfl_ui_thread_info *dwfl_ui_ti = arg;
 	struct unwind_info *ui = dwfl_ui_ti->ui;
-	uint16_t e_machine = thread__e_machine(ui->thread, ui->machine);
+	uint16_t e_machine = thread__e_machine(ui->thread, ui->machine, /*e_flags=*/NULL);
 	struct stack_dump *stack = &ui->sample->user_stack;
 	u64 start, end;
 	int offset;
@@ -348,7 +348,7 @@ int unwind__get_entries(unwind_entry_cb_t cb, void *arg,
 {
 	struct maps *maps = thread__maps(thread);
 	struct machine *machine = maps__machine(maps);
-	uint16_t e_machine = thread__e_machine(thread, machine);
+	uint16_t e_machine = thread__e_machine(thread, machine, /*e_flags=*/NULL);
 	struct dwfl_ui_thread_info *dwfl_ui_ti;
 	static struct unwind_info *ui;
 	Dwfl *dwfl;
-- 
cgit v1.2.3


From 0403930f7b1534e383116f7122539873dad3c6a6 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Fri, 23 Jan 2026 14:22:08 -0800
Subject: perf perf_regs: Accurately compute register names for CSKY

CSKY needs the e_flags to determine the ABI level and know whether
additional registers are encoded or not. Wire this up now that the
e_flags for a thread can be determined.

Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Aditya Bodkhe <aditya.b1@linux.ibm.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Athira Rajeev <atrajeev@linux.ibm.com>
Cc: Chun-Tse Shao <ctshao@google.com>
Cc: Guo Ren <guoren@kernel.org>
Cc: Howard Chu <howardchu95@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sergei Trofimovich <slyich@gmail.com>
Cc: Shimin Guo <shimin.guo@skydio.com>
Cc: Stephen Brennan <stephen.s.brennan@oracle.com>
Cc: Swapnil Sapkal <swapnil.sapkal@amd.com>
Cc: Tianyou Li <tianyou.li@intel.com>
[ Conditionally define EF_CSKY_ABIMASK and EF_CSKY_ABIV2 for older distros ]
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-script.c                        | 28 +++++++++++++++-------
 tools/perf/util/parse-regs-options.c               |  4 ++--
 tools/perf/util/perf-regs-arch/perf_regs_csky.c    | 19 ++++++++++-----
 tools/perf/util/perf_regs.c                        |  4 ++--
 tools/perf/util/perf_regs.h                        |  4 ++--
 .../util/scripting-engines/trace-event-python.c    | 17 +++++++------
 tools/perf/util/session.c                          | 24 ++++++++++---------
 7 files changed, 62 insertions(+), 38 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index 8c0de27a9713..6ec225c697a4 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -717,7 +717,8 @@ out:
 	return 0;
 }
 
-static int perf_sample__fprintf_regs(struct regs_dump *regs, uint64_t mask, uint16_t e_machine,
+static int perf_sample__fprintf_regs(struct regs_dump *regs, uint64_t mask,
+				     uint16_t e_machine, uint32_t e_flags,
 				     FILE *fp)
 {
 	unsigned i = 0, r;
@@ -730,7 +731,9 @@ static int perf_sample__fprintf_regs(struct regs_dump *regs, uint64_t mask, uint
 
 	for_each_set_bit(r, (unsigned long *) &mask, sizeof(mask) * 8) {
 		u64 val = regs->regs[i++];
-		printed += fprintf(fp, "%5s:0x%"PRIx64" ", perf_reg_name(r, e_machine), val);
+		printed += fprintf(fp, "%5s:0x%"PRIx64" ",
+				   perf_reg_name(r, e_machine, e_flags),
+				   val);
 	}
 
 	return printed;
@@ -787,23 +790,29 @@ tod_scnprintf(struct perf_script *script, char *buf, int buflen,
 }
 
 static int perf_sample__fprintf_iregs(struct perf_sample *sample,
-				      struct perf_event_attr *attr, uint16_t e_machine, FILE *fp)
+				      struct perf_event_attr *attr,
+				      uint16_t e_machine,
+				      uint32_t e_flags,
+				      FILE *fp)
 {
 	if (!sample->intr_regs)
 		return 0;
 
 	return perf_sample__fprintf_regs(perf_sample__intr_regs(sample),
-					 attr->sample_regs_intr, e_machine, fp);
+					 attr->sample_regs_intr, e_machine, e_flags, fp);
 }
 
 static int perf_sample__fprintf_uregs(struct perf_sample *sample,
-				      struct perf_event_attr *attr, uint16_t e_machine, FILE *fp)
+				      struct perf_event_attr *attr,
+				      uint16_t e_machine,
+				      uint32_t e_flags,
+				      FILE *fp)
 {
 	if (!sample->user_regs)
 		return 0;
 
 	return perf_sample__fprintf_regs(perf_sample__user_regs(sample),
-					 attr->sample_regs_user, e_machine, fp);
+					 attr->sample_regs_user, e_machine, e_flags, fp);
 }
 
 static int perf_sample__fprintf_start(struct perf_script *script,
@@ -2418,6 +2427,7 @@ static void process_event(struct perf_script *script,
 	struct evsel_script *es = evsel->priv;
 	FILE *fp = es->fp;
 	char str[PAGE_SIZE_NAME_LEN];
+	uint32_t e_flags;
 
 	if (output[type].fields == 0)
 		return;
@@ -2506,13 +2516,15 @@ static void process_event(struct perf_script *script,
 
 	if (PRINT_FIELD(IREGS)) {
 		perf_sample__fprintf_iregs(sample, attr,
-					   thread__e_machine(thread, machine, /*e_flags=*/NULL),
+					   thread__e_machine(thread, machine, &e_flags),
+					   e_flags,
 					   fp);
 	}
 
 	if (PRINT_FIELD(UREGS)) {
 		perf_sample__fprintf_uregs(sample, attr,
-					   thread__e_machine(thread, machine, /*e_flags=*/NULL),
+					   thread__e_machine(thread, machine, &e_flags),
+					   e_flags,
 					   fp);
 	}
 
diff --git a/tools/perf/util/parse-regs-options.c b/tools/perf/util/parse-regs-options.c
index c0d0ef9fd495..8dd35f50f644 100644
--- a/tools/perf/util/parse-regs-options.c
+++ b/tools/perf/util/parse-regs-options.c
@@ -21,7 +21,7 @@ static void list_perf_regs(FILE *fp, uint64_t mask)
 		if (((1ULL << reg) & mask) == 0)
 			continue;
 
-		name = perf_reg_name(reg, EM_HOST);
+		name = perf_reg_name(reg, EM_HOST, EF_HOST);
 		if (name && (!last_name || strcmp(last_name, name)))
 			fprintf(fp, "%s%s", reg > 0 ? " " : "", name);
 		last_name = name;
@@ -39,7 +39,7 @@ static uint64_t name_to_perf_reg_mask(const char *to_match, uint64_t mask)
 		if (((1ULL << reg) & mask) == 0)
 			continue;
 
-		name = perf_reg_name(reg, EM_HOST);
+		name = perf_reg_name(reg, EM_HOST, EF_HOST);
 		if (!name)
 			continue;
 
diff --git a/tools/perf/util/perf-regs-arch/perf_regs_csky.c b/tools/perf/util/perf-regs-arch/perf_regs_csky.c
index 75b461ef2eba..95808f93d45b 100644
--- a/tools/perf/util/perf-regs-arch/perf_regs_csky.c
+++ b/tools/perf/util/perf-regs-arch/perf_regs_csky.c
@@ -1,10 +1,21 @@
 // SPDX-License-Identifier: GPL-2.0
-
+#include <elf.h>
+#ifndef EF_CSKY_ABIMASK
+#define EF_CSKY_ABIMASK	0XF0000000
+#endif
+#ifndef EF_CSKY_ABIV2
+#define EF_CSKY_ABIV2	0X20000000
+#endif
 #include "../perf_regs.h"
+#undef __CSKYABIV2__
+#define __CSKYABIV2__ 1  // Always want the V2 register definitions.
 #include "../../arch/csky/include/uapi/asm/perf_regs.h"
 
-const char *__perf_reg_name_csky(int id)
+const char *__perf_reg_name_csky(int id, uint32_t e_flags)
 {
+	if (id >= PERF_REG_CSKY_EXREGS0 && (e_flags & EF_CSKY_ABIMASK) == EF_CSKY_ABIV2)
+		return NULL;
+
 	switch (id) {
 	case PERF_REG_CSKY_A0:
 		return "a0";
@@ -40,7 +51,6 @@ const char *__perf_reg_name_csky(int id)
 		return "lr";
 	case PERF_REG_CSKY_PC:
 		return "pc";
-#if defined(__CSKYABIV2__)
 	case PERF_REG_CSKY_EXREGS0:
 		return "exregs0";
 	case PERF_REG_CSKY_EXREGS1:
@@ -77,12 +87,9 @@ const char *__perf_reg_name_csky(int id)
 		return "hi";
 	case PERF_REG_CSKY_LO:
 		return "lo";
-#endif
 	default:
 		return NULL;
 	}
-
-	return NULL;
 }
 
 uint64_t __perf_reg_ip_csky(void)
diff --git a/tools/perf/util/perf_regs.c b/tools/perf/util/perf_regs.c
index cd5acee3dc62..14b7be30ab20 100644
--- a/tools/perf/util/perf_regs.c
+++ b/tools/perf/util/perf_regs.c
@@ -23,7 +23,7 @@ uint64_t __weak arch__user_reg_mask(void)
 	return 0;
 }
 
-const char *perf_reg_name(int id, uint16_t e_machine)
+const char *perf_reg_name(int id, uint16_t e_machine, uint32_t e_flags)
 {
 	const char *reg_name = NULL;
 
@@ -35,7 +35,7 @@ const char *perf_reg_name(int id, uint16_t e_machine)
 		reg_name = __perf_reg_name_arm64(id);
 		break;
 	case EM_CSKY:
-		reg_name = __perf_reg_name_csky(id);
+		reg_name = __perf_reg_name_csky(id, e_flags);
 		break;
 	case EM_LOONGARCH:
 		reg_name = __perf_reg_name_loongarch(id);
diff --git a/tools/perf/util/perf_regs.h b/tools/perf/util/perf_regs.h
index 2c2a8de6912d..ed7c1b1358fa 100644
--- a/tools/perf/util/perf_regs.h
+++ b/tools/perf/util/perf_regs.h
@@ -16,7 +16,7 @@ int arch_sdt_arg_parse_op(char *old_op, char **new_op);
 uint64_t arch__intr_reg_mask(void);
 uint64_t arch__user_reg_mask(void);
 
-const char *perf_reg_name(int id, uint16_t e_machine);
+const char *perf_reg_name(int id, uint16_t e_machine, uint32_t e_flags);
 int perf_reg_value(u64 *valp, struct regs_dump *regs, int id);
 uint64_t perf_arch_reg_ip(uint16_t e_machine);
 uint64_t perf_arch_reg_sp(uint16_t e_machine);
@@ -26,7 +26,7 @@ uint64_t __perf_reg_sp_arm64(void);
 const char *__perf_reg_name_arm(int id);
 uint64_t __perf_reg_ip_arm(void);
 uint64_t __perf_reg_sp_arm(void);
-const char *__perf_reg_name_csky(int id);
+const char *__perf_reg_name_csky(int id, uint32_t e_flags);
 uint64_t __perf_reg_ip_csky(void);
 uint64_t __perf_reg_sp_csky(void);
 const char *__perf_reg_name_loongarch(int id);
diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c
index 50f0d16520cc..62c9c73daef5 100644
--- a/tools/perf/util/scripting-engines/trace-event-python.c
+++ b/tools/perf/util/scripting-engines/trace-event-python.c
@@ -714,7 +714,8 @@ static void set_sample_datasrc_in_dict(PyObject *dict,
 			_PyUnicode_FromString(decode));
 }
 
-static void regs_map(struct regs_dump *regs, uint64_t mask, uint16_t e_machine, char *bf, int size)
+static void regs_map(struct regs_dump *regs, uint64_t mask, uint16_t e_machine, uint32_t e_flags,
+		     char *bf, int size)
 {
 	unsigned int i = 0, r;
 	int printed = 0;
@@ -732,7 +733,7 @@ static void regs_map(struct regs_dump *regs, uint64_t mask, uint16_t e_machine,
 
 		printed += scnprintf(bf + printed, size - printed,
 				     "%5s:0x%" PRIx64 " ",
-				     perf_reg_name(r, e_machine), val);
+				     perf_reg_name(r, e_machine, e_flags), val);
 	}
 }
 
@@ -741,7 +742,8 @@ static void regs_map(struct regs_dump *regs, uint64_t mask, uint16_t e_machine,
 static int set_regs_in_dict(PyObject *dict,
 			     struct perf_sample *sample,
 			     struct evsel *evsel,
-			     uint16_t e_machine)
+			     uint16_t e_machine,
+			     uint32_t e_flags)
 {
 	struct perf_event_attr *attr = &evsel->core.attr;
 
@@ -753,7 +755,7 @@ static int set_regs_in_dict(PyObject *dict,
 		if (!bf)
 			return -1;
 
-		regs_map(sample->intr_regs, attr->sample_regs_intr, e_machine, bf, size);
+		regs_map(sample->intr_regs, attr->sample_regs_intr, e_machine, e_flags, bf, size);
 
 		pydict_set_item_string_decref(dict, "iregs",
 					_PyUnicode_FromString(bf));
@@ -765,7 +767,7 @@ static int set_regs_in_dict(PyObject *dict,
 			if (!bf)
 				return -1;
 		}
-		regs_map(sample->user_regs, attr->sample_regs_user, e_machine, bf, size);
+		regs_map(sample->user_regs, attr->sample_regs_user, e_machine, e_flags, bf, size);
 
 		pydict_set_item_string_decref(dict, "uregs",
 					_PyUnicode_FromString(bf));
@@ -837,6 +839,7 @@ static PyObject *get_perf_sample_dict(struct perf_sample *sample,
 	PyObject *dict, *dict_sample, *brstack, *brstacksym;
 	struct machine *machine;
 	uint16_t e_machine = EM_HOST;
+	uint32_t e_flags = EF_HOST;
 
 	dict = PyDict_New();
 	if (!dict)
@@ -925,9 +928,9 @@ static PyObject *get_perf_sample_dict(struct perf_sample *sample,
 
 	if (al->thread) {
 		machine = maps__machine(thread__maps(al->thread));
-		e_machine = thread__e_machine(al->thread, machine, /*e_flags=*/NULL);
+		e_machine = thread__e_machine(al->thread, machine, &e_flags);
 	}
-	if (set_regs_in_dict(dict, sample, evsel, e_machine))
+	if (set_regs_in_dict(dict, sample, evsel, e_machine, e_flags))
 		Py_FatalError("Failed to setting regs in dict");
 
 	return dict;
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index 0e8a128d7c04..7c7c65b0f536 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -959,7 +959,7 @@ static void branch_stack__printf(struct perf_sample *sample,
 	}
 }
 
-static void regs_dump__printf(u64 mask, u64 *regs, uint16_t e_machine)
+static void regs_dump__printf(u64 mask, u64 *regs, uint16_t e_machine, uint32_t e_flags)
 {
 	unsigned rid, i = 0;
 
@@ -967,7 +967,7 @@ static void regs_dump__printf(u64 mask, u64 *regs, uint16_t e_machine)
 		u64 val = regs[i++];
 
 		printf(".... %-5s 0x%016" PRIx64 "\n",
-		       perf_reg_name(rid, e_machine), val);
+		       perf_reg_name(rid, e_machine, e_flags), val);
 	}
 }
 
@@ -985,7 +985,8 @@ static inline const char *regs_dump_abi(struct regs_dump *d)
 	return regs_abi[d->abi];
 }
 
-static void regs__printf(const char *type, struct regs_dump *regs, uint16_t e_machine)
+static void regs__printf(const char *type, struct regs_dump *regs,
+			 uint16_t e_machine, uint32_t e_flags)
 {
 	u64 mask = regs->mask;
 
@@ -994,10 +995,10 @@ static void regs__printf(const char *type, struct regs_dump *regs, uint16_t e_ma
 	       mask,
 	       regs_dump_abi(regs));
 
-	regs_dump__printf(mask, regs->regs, e_machine);
+	regs_dump__printf(mask, regs->regs, e_machine, e_flags);
 }
 
-static void regs_user__printf(struct perf_sample *sample, uint16_t e_machine)
+static void regs_user__printf(struct perf_sample *sample, uint16_t e_machine, uint32_t e_flags)
 {
 	struct regs_dump *user_regs;
 
@@ -1007,10 +1008,10 @@ static void regs_user__printf(struct perf_sample *sample, uint16_t e_machine)
 	user_regs = perf_sample__user_regs(sample);
 
 	if (user_regs->regs)
-		regs__printf("user", user_regs, e_machine);
+		regs__printf("user", user_regs, e_machine, e_flags);
 }
 
-static void regs_intr__printf(struct perf_sample *sample, uint16_t e_machine)
+static void regs_intr__printf(struct perf_sample *sample, uint16_t e_machine, uint32_t e_flags)
 {
 	struct regs_dump *intr_regs;
 
@@ -1020,7 +1021,7 @@ static void regs_intr__printf(struct perf_sample *sample, uint16_t e_machine)
 	intr_regs = perf_sample__intr_regs(sample);
 
 	if (intr_regs->regs)
-		regs__printf("intr", intr_regs, e_machine);
+		regs__printf("intr", intr_regs, e_machine, e_flags);
 }
 
 static void stack_user__printf(struct stack_dump *dump)
@@ -1115,6 +1116,7 @@ static void dump_sample(struct machine *machine, struct evsel *evsel, union perf
 	u64 sample_type;
 	char str[PAGE_SIZE_NAME_LEN];
 	uint16_t e_machine = EM_NONE;
+	uint32_t e_flags = 0;
 
 	if (!dump_trace)
 		return;
@@ -1124,7 +1126,7 @@ static void dump_sample(struct machine *machine, struct evsel *evsel, union perf
 	if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_REGS_INTR)) {
 		struct thread *thread = machine__find_thread(machine, sample->pid, sample->pid);
 
-		e_machine = thread__e_machine(thread, machine, /*e_flags=*/NULL);
+		e_machine = thread__e_machine(thread, machine, &e_flags);
 	}
 
 	printf("(IP, 0x%x): %d/%d: %#" PRIx64 " period: %" PRIu64 " addr: %#" PRIx64 "\n",
@@ -1138,10 +1140,10 @@ static void dump_sample(struct machine *machine, struct evsel *evsel, union perf
 		branch_stack__printf(sample, evsel);
 
 	if (sample_type & PERF_SAMPLE_REGS_USER)
-		regs_user__printf(sample, e_machine);
+		regs_user__printf(sample, e_machine, e_flags);
 
 	if (sample_type & PERF_SAMPLE_REGS_INTR)
-		regs_intr__printf(sample, e_machine);
+		regs_intr__printf(sample, e_machine, e_flags);
 
 	if (sample_type & PERF_SAMPLE_STACK_USER)
 		stack_user__printf(&sample->user_stack);
-- 
cgit v1.2.3


From 2becdd163ab37c9dca05f31da7e943f59f55e510 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Fri, 23 Jan 2026 14:22:09 -0800
Subject: perf unwind-libdw: Wire up e_flags for CSKY

Wire up the e_flags now it can be read for a thread. The e_flags
encode the CSKY ABI level and this can impact which perf registers
need setting up for unwinding.

Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Aditya Bodkhe <aditya.b1@linux.ibm.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Athira Rajeev <atrajeev@linux.ibm.com>
Cc: Chun-Tse Shao <ctshao@google.com>
Cc: Guo Ren <guoren@kernel.org>
Cc: Howard Chu <howardchu95@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sergei Trofimovich <slyich@gmail.com>
Cc: Shimin Guo <shimin.guo@skydio.com>
Cc: Stephen Brennan <stephen.s.brennan@oracle.com>
Cc: Swapnil Sapkal <swapnil.sapkal@amd.com>
Cc: Tianyou Li <tianyou.li@intel.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/unwind-libdw.c | 9 +++++----
 tools/perf/util/unwind-libdw.h | 1 +
 2 files changed, 6 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/unwind-libdw.c b/tools/perf/util/unwind-libdw.c
index 3fdcfa06bf22..05e8e68bd49c 100644
--- a/tools/perf/util/unwind-libdw.c
+++ b/tools/perf/util/unwind-libdw.c
@@ -213,7 +213,6 @@ static bool memory_read(Dwfl *dwfl __maybe_unused, Dwarf_Addr addr, Dwarf_Word *
 {
 	struct dwfl_ui_thread_info *dwfl_ui_ti = arg;
 	struct unwind_info *ui = dwfl_ui_ti->ui;
-	uint16_t e_machine = thread__e_machine(ui->thread, ui->machine, /*e_flags=*/NULL);
 	struct stack_dump *stack = &ui->sample->user_stack;
 	u64 start, end;
 	int offset;
@@ -223,7 +222,7 @@ static bool memory_read(Dwfl *dwfl __maybe_unused, Dwarf_Addr addr, Dwarf_Word *
 		return false;
 
 	ret = perf_reg_value(&start, ui->sample->user_regs,
-			     perf_arch_reg_sp(e_machine));
+			     perf_arch_reg_sp(ui->e_machine));
 	if (ret)
 		return false;
 
@@ -260,7 +259,7 @@ static bool libdw_set_initial_registers(Dwfl_Thread *thread, void *arg)
 	int max_dwarf_reg = 0;
 	bool ret;
 	uint16_t e_machine = ui->e_machine;
-	int e_flags = 0;
+	int e_flags = ui->e_flags;
 	uint64_t ip_perf_reg = perf_arch_reg_ip(e_machine);
 	Dwarf_Word val = 0;
 
@@ -348,7 +347,8 @@ int unwind__get_entries(unwind_entry_cb_t cb, void *arg,
 {
 	struct maps *maps = thread__maps(thread);
 	struct machine *machine = maps__machine(maps);
-	uint16_t e_machine = thread__e_machine(thread, machine, /*e_flags=*/NULL);
+	uint32_t e_flags = 0;
+	uint16_t e_machine = thread__e_machine(thread, machine, &e_flags);
 	struct dwfl_ui_thread_info *dwfl_ui_ti;
 	static struct unwind_info *ui;
 	Dwfl *dwfl;
@@ -370,6 +370,7 @@ int unwind__get_entries(unwind_entry_cb_t cb, void *arg,
 		.arg		= arg,
 		.max_stack	= max_stack,
 		.e_machine	= e_machine,
+		.e_flags	= e_flags,
 		.best_effort    = best_effort
 	};
 
diff --git a/tools/perf/util/unwind-libdw.h b/tools/perf/util/unwind-libdw.h
index 3dec0ab8bd50..6423bf5a2492 100644
--- a/tools/perf/util/unwind-libdw.h
+++ b/tools/perf/util/unwind-libdw.h
@@ -20,6 +20,7 @@ struct unwind_info {
 	void			*arg;
 	int			max_stack;
 	int			idx;
+	uint32_t		e_flags;
 	uint16_t		e_machine;
 	bool			best_effort;
 	struct unwind_entry	entries[];
-- 
cgit v1.2.3


From 0a6fb6604746c92bccc71867fd0bf3d3294335d1 Mon Sep 17 00:00:00 2001
From: Hrishikesh Suresh <hrishikesh123s@gmail.com>
Date: Sun, 25 Jan 2026 21:06:52 +0100
Subject: perf session: Print all machines in session dump

perf_session__fprintf() prints only the host.

This has been changed to print details of host and all guests, by
traversing through the RB-Tree.

These are visible when using high verbosity (-vvvv) in KVM environments,
during perf report dumps.

Testing:

- Test 1: Record the local machine and guest VM using 'perf kvm record' and
generate the report using 'perf kvm report -vvvv -D'. The dump should show
the threads and other details related to local and guest machine.
    - 1 Ubuntu VM running on Fedora host
    - VM is running a noisy program =>
	$ dd if=/dev/urandom of=/dev/null
    - On host run =>
	$ sudo ./perf kvm --guestvmlinux=/tmp/shared/guest_vmlinux \
                        --guestkallsyms=/tmp/shared/guest_kallsyms \
                        --guestmodules=/tmp/shared/guest_modules \
                        record -a -g -o perf.data.guest
      and exit after a few seconds.
      [ perf record: Woken up 9 times to write data ]
      [ perf record: Captured and wrote 3.150 MB perf.data.guest \
	(29311 samples) ]
    - Generate dump =>
	$ sudo ./perf kvm --guestkallsyms /tmp/shared/guest_kallsyms \
                        report -vvvv -D -i perf.data.guest > output.txt
    - Check for threads associated with guest machine.
      $ grep "Thread 0" output.txt
      Thread 0 swapper
      Thread 0 [guest/0]
    PASS

- Test 2: Record the local machine and guest VM using 'perf kvm record' and
generate the report using 'perf kvm report'. The functions running on
guest VM should be seen in the report.
    - Same setup as Test 1 but the test looks at the performance profile,
      to check if the function names are visible.
    - Peek into profile using =>
	$ sudo ./perf kvm  --guestkallsyms /tmp/shared/guest_kallsyms \
                                    report -i perf.data.guest
    - Samples: 29K of event 'cycles', Event count (approx.): 28711693142
Children   Self  Command  Shared Object            Symbol
35.69%   35.69%  :5820    [guest.kernel.kallsyms]  [g] chacha_permute
11.56%   11.56%  :5820    [guest.kernel.kallsyms]  [g] entry_SYSRETQ_unsXXX
11.12%   11.12%  :5820    [guest.kernel.kallsyms]  [g] syscall_return_viXXX
 7.36%    7.36%  :5820    [guest.kernel.kallsyms]  [g] entry_SYSCALL_64_XXX
 6.07%    6.07%  :5820    [guest.kernel.kallsyms]  [g] chacha_block_generic
 5.40%    5.40%  :5820    [guest.kernel.kallsyms]  [g] _copy_to_iter
 ....
    PASS

- Test 3: Record the local and 2 guest VMs using 'perf kvm record' and
generate the report using 'perf kvm report -vvvv -D'. The dump should show
the threads and other details related to local and guest machines.
    - 1 Ubuntu and 1 Alpine VMs running on Fedora host.
    - Find PIDs of qemu instances and use them during record and report
	$ pgrep qemu
        5816
        25098
    - Record the activity =>
	$ sudo ./perf kvm record -p 5816,25098 -a -g -o perf.data.guests
        Warning:
        PID/TID switch overriding SYSTEM
        [ perf record: Woken up 325927 times to write data ]
        [ perf record: Captured and wrote 3.692 MB perf.data.guests \
	  (57389 samples) ]
    - Generate dump =>
	$ sudo ./perf kvm report -vvvv -D -i perf.data.guests > output.txt
    - Check if the threads related to the local machine and guest VMs
      are present =>
	$ grep "Thread 0" output.txt
        Thread 0 swapper
        Thread 0 [guest/0]
      NOTE: Threads from Ubuntu and Alpine VMs are bundled together and
      appear as one guest machine.
      Looking into output.txt =>
	Threads: 6
	Thread 0 [guest/0]
	Thread 5816 :5816
	Thread 25098 :25098
	Thread 5819 :5819
	Thread 5820 :5820
	Thread 25103 :25103
      To conclude, information is collected for both VMs and not listed
      as two different guest machines.
    PASS

- Test 4: Check if any guest-related information is printed in
perf annotate. This test is included because the command calls
perf_session__fprintf() in its code path when using -vvvv option.
This could be explained by inability / lack of options for 'perf annotate'
to look into guest VM from host machine, due to no option to specify the
guest's kallsyms or modules. A similar explanation for 'perf mem' could
be used, as perf_session__fprintf() is also present in its code path.
    - Run annotate =>
	$ sudo ./perf annotate -i perf.data.guest -vvvv  > output.txt
    - Check for threads from local machine or guest VM =>
	$ grep "Thread 0" output.txt
        Thread 0 swapper

        Threads from local machine are found while threads from guest VM
	are not found. It is possibly because of a lack of a guest kallsyms
	option for DSO matching in perf annotate.
    PASS

- Test 5: Run kvm test available on perf path
    - $ sudo ./perf test kvm
        89: perf kvm tests                                            : Ok
    PASS

Signed-off-by: Hrishikesh Suresh <hrishikesh123s@gmail.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Blake Jones <blakejones@google.com>
Cc: Chun-Tse Shao <ctshao@google.com>
Cc: Dmitriy Vyukov <dvyukov@google.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Leo Yan <leo.yan@arm.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
[ Declare 'nd' in the 'for' line and and 'pos' inside the loop body, to make it more compact ]
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/session.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index 7c7c65b0f536..ae62d5c9889f 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -2730,11 +2730,14 @@ size_t perf_session__fprintf_nr_events(struct perf_session *session, FILE *fp)
 
 size_t perf_session__fprintf(struct perf_session *session, FILE *fp)
 {
-	/*
-	 * FIXME: Here we have to actually print all the machines in this
-	 * session, not just the host...
-	 */
-	return machine__fprintf(&session->machines.host, fp);
+	size_t ret = machine__fprintf(&session->machines.host, fp);
+
+	for (struct rb_node *nd = rb_first_cached(&session->machines.guests); nd; nd = rb_next(nd)) {
+		struct machine *pos = rb_entry(nd, struct machine, rb_node);
+
+		ret += machine__fprintf(pos, fp);
+	}
+	return ret;
 }
 
 void perf_session__dump_kmaps(struct perf_session *session)
-- 
cgit v1.2.3


From f21fae57744607330ae5dabdd996538faac7f5ab Mon Sep 17 00:00:00 2001
From: "Alexis Lothoré (eBPF Foundation)" <alexis.lothore@bootlin.com>
Date: Fri, 23 Jan 2026 09:30:08 +0100
Subject: selftests/bpf: Add a few helpers for bpftool testing
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In order to integrate some bpftool tests into test_progs, define a few
specific helpers that allow to execute bpftool commands, while possibly
retrieving the command output. Those helpers most notably set the
path to the bpftool binary under test. This version checks different
possible paths relative to the directories where the different
test_progs runners are executed, as we want to make sure not to
accidentally use a bootstrap version of the binary.

Signed-off-by: Alexis Lothoré (eBPF Foundation) <alexis.lothore@bootlin.com>
Link: https://lore.kernel.org/r/20260123-bpftool-tests-v4-1-a6653a7f28e7@bootlin.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/Makefile          |  3 +-
 tools/testing/selftests/bpf/bpftool_helpers.c | 74 +++++++++++++++++++++++++++
 tools/testing/selftests/bpf/bpftool_helpers.h | 11 ++++
 3 files changed, 87 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/bpf/bpftool_helpers.c
 create mode 100644 tools/testing/selftests/bpf/bpftool_helpers.h

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 9488d076c740..2cc559ecc723 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -747,7 +747,8 @@ TRUNNER_EXTRA_SOURCES := test_progs.c		\
 			 json_writer.c 		\
 			 $(VERIFY_SIG_HDR)		\
 			 flow_dissector_load.h	\
-			 ip_check_defrag_frags.h
+			 ip_check_defrag_frags.h	\
+			 bpftool_helpers.c
 TRUNNER_LIB_SOURCES := find_bit.c
 TRUNNER_EXTRA_FILES := $(OUTPUT)/urandom_read				\
 		       $(OUTPUT)/liburandom_read.so			\
diff --git a/tools/testing/selftests/bpf/bpftool_helpers.c b/tools/testing/selftests/bpf/bpftool_helpers.c
new file mode 100644
index 000000000000..a5824945a4a5
--- /dev/null
+++ b/tools/testing/selftests/bpf/bpftool_helpers.c
@@ -0,0 +1,74 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include "bpftool_helpers.h"
+#include <unistd.h>
+#include <string.h>
+#include <stdbool.h>
+
+#define BPFTOOL_PATH_MAX_LEN		64
+#define BPFTOOL_FULL_CMD_MAX_LEN	512
+
+#define BPFTOOL_DEFAULT_PATH		"tools/sbin/bpftool"
+
+static int detect_bpftool_path(char *buffer)
+{
+	char tmp[BPFTOOL_PATH_MAX_LEN];
+
+	/* Check default bpftool location (will work if we are running the
+	 * default flavor of test_progs)
+	 */
+	snprintf(tmp, BPFTOOL_PATH_MAX_LEN, "./%s", BPFTOOL_DEFAULT_PATH);
+	if (access(tmp, X_OK) == 0) {
+		strncpy(buffer, tmp, BPFTOOL_PATH_MAX_LEN);
+		return 0;
+	}
+
+	/* Check alternate bpftool location (will work if we are running a
+	 * specific flavor of test_progs, e.g. cpuv4 or no_alu32)
+	 */
+	snprintf(tmp, BPFTOOL_PATH_MAX_LEN, "../%s", BPFTOOL_DEFAULT_PATH);
+	if (access(tmp, X_OK) == 0) {
+		strncpy(buffer, tmp, BPFTOOL_PATH_MAX_LEN);
+		return 0;
+	}
+
+	/* Failed to find bpftool binary */
+	return 1;
+}
+
+static int run_command(char *args, char *output_buf, size_t output_max_len)
+{
+	static char bpftool_path[BPFTOOL_PATH_MAX_LEN] = {0};
+	bool suppress_output = !(output_buf && output_max_len);
+	char command[BPFTOOL_FULL_CMD_MAX_LEN];
+	FILE *f;
+	int ret;
+
+	/* Detect and cache bpftool binary location */
+	if (bpftool_path[0] == 0 && detect_bpftool_path(bpftool_path))
+		return 1;
+
+	ret = snprintf(command, BPFTOOL_FULL_CMD_MAX_LEN, "%s %s%s",
+		       bpftool_path, args,
+		       suppress_output ? " > /dev/null 2>&1" : "");
+
+	f = popen(command, "r");
+	if (!f)
+		return 1;
+
+	if (!suppress_output)
+		fread(output_buf, 1, output_max_len, f);
+	ret = pclose(f);
+
+	return ret;
+}
+
+int run_bpftool_command(char *args)
+{
+	return run_command(args, NULL, 0);
+}
+
+int get_bpftool_command_output(char *args, char *output_buf, size_t output_max_len)
+{
+	return run_command(args, output_buf, output_max_len);
+}
+
diff --git a/tools/testing/selftests/bpf/bpftool_helpers.h b/tools/testing/selftests/bpf/bpftool_helpers.h
new file mode 100644
index 000000000000..dec1ba201410
--- /dev/null
+++ b/tools/testing/selftests/bpf/bpftool_helpers.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#pragma once
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdbool.h>
+
+#define MAX_BPFTOOL_CMD_LEN	(256)
+
+int run_bpftool_command(char *args);
+int get_bpftool_command_output(char *args, char *output_buf, size_t output_max_len);
-- 
cgit v1.2.3


From 1c0b505908a201054dadc87930f550bea2631c1e Mon Sep 17 00:00:00 2001
From: "Alexis Lothoré (eBPF Foundation)" <alexis.lothore@bootlin.com>
Date: Fri, 23 Jan 2026 09:30:09 +0100
Subject: selftests/bpf: convert test_bpftool_metadata.sh into test_progs
 framework
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The test_bpftool_metadata.sh script validates that bpftool properly
returns in its ouptput any metadata generated by bpf programs through
some .rodata sections.

Port this test to the test_progs framework so that it can be executed
automatically in CI. The new test, similarly to the former script,
checks that valid data appears both for textual output and json output,
as well as for both data not used at all and used data. For the json
check part, the expected json string is hardcoded to avoid bringing a
new external dependency (eg: a json deserializer) for test_progs.
As the test is now converted into test_progs, remove the former script.

The newly converted test brings two new subtests:

  #37/1    bpftool_metadata/metadata_unused:OK
  #37/2    bpftool_metadata/metadata_used:OK
  #37      bpftool_metadata:OK
  Summary: 1/2 PASSED, 0 SKIPPED, 0 FAILED

Signed-off-by: Alexis Lothoré (eBPF Foundation) <alexis.lothore@bootlin.com>
Link: https://lore.kernel.org/r/20260123-bpftool-tests-v4-2-a6653a7f28e7@bootlin.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/Makefile               |   1 -
 .../selftests/bpf/prog_tests/bpftool_metadata.c    | 144 +++++++++++++++++++++
 .../testing/selftests/bpf/test_bpftool_metadata.sh |  85 ------------
 3 files changed, 144 insertions(+), 86 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/bpftool_metadata.c
 delete mode 100755 tools/testing/selftests/bpf/test_bpftool_metadata.sh

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 2cc559ecc723..1bb7db1ed6ea 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -109,7 +109,6 @@ TEST_PROGS := test_kmod.sh \
 	test_bpftool_build.sh \
 	test_bpftool.sh \
 	test_bpftool_map.sh \
-	test_bpftool_metadata.sh \
 	test_doc_build.sh \
 	test_xsk.sh \
 	test_xdp_features.sh
diff --git a/tools/testing/selftests/bpf/prog_tests/bpftool_metadata.c b/tools/testing/selftests/bpf/prog_tests/bpftool_metadata.c
new file mode 100644
index 000000000000..408ace90dc7e
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/bpftool_metadata.c
@@ -0,0 +1,144 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <bpftool_helpers.h>
+#include <test_progs.h>
+#include <linux/bpf.h>
+#include <string.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <stdbool.h>
+
+#define BPFFS_DIR	"/sys/fs/bpf/test_metadata"
+#define BPFFS_USED	BPFFS_DIR "/used"
+#define BPFFS_UNUSED	BPFFS_DIR "/unused"
+
+#define BPF_FILE_USED		"metadata_used.bpf.o"
+#define BPF_FILE_UNUSED		"metadata_unused.bpf.o"
+#define METADATA_MAP_NAME	"metadata.rodata"
+
+#define MAX_BPFTOOL_OUTPUT_LEN	(64*1024)
+
+#define MAX_TOKENS_TO_CHECK	3
+static char output[MAX_BPFTOOL_OUTPUT_LEN];
+
+struct test_desc {
+	char *name;
+	char *bpf_prog;
+	char *bpffs_path;
+	char *expected_output[MAX_TOKENS_TO_CHECK];
+	char *expected_output_json[MAX_TOKENS_TO_CHECK];
+	char *metadata_map_name;
+};
+
+static int setup(struct test_desc *test)
+{
+	return mkdir(BPFFS_DIR, 0700);
+}
+
+static void cleanup(struct test_desc *test)
+{
+	unlink(test->bpffs_path);
+	rmdir(BPFFS_DIR);
+}
+
+static int check_metadata(char *buf, char * const *tokens, int count)
+{
+	int i;
+
+	for (i = 0; i < count && tokens[i]; i++)
+		if (!strstr(buf, tokens[i]))
+			return 1;
+
+	return 0;
+}
+
+static void run_test(struct test_desc *test)
+{
+	int ret;
+	char cmd[MAX_BPFTOOL_CMD_LEN];
+
+	ret = snprintf(cmd, MAX_BPFTOOL_CMD_LEN, "prog load %s %s",
+			test->bpf_prog, test->bpffs_path);
+	if (!ASSERT_GT(ret, 0, "format prog insert command"))
+		return;
+	ret = run_bpftool_command(cmd);
+	if (!ASSERT_OK(ret, "load program"))
+		return;
+
+	/* Check output with default format */
+	ret = snprintf(cmd, MAX_BPFTOOL_CMD_LEN, "prog show pinned %s",
+		       test->bpffs_path);
+	if (!ASSERT_GT(ret, 0, "format pinned prog check command"))
+		return;
+	ret = get_bpftool_command_output(cmd, output,
+			MAX_BPFTOOL_OUTPUT_LEN);
+	if (ASSERT_OK(ret, "get program info")) {
+		ret = check_metadata(output, test->expected_output,
+				ARRAY_SIZE(test->expected_output));
+		ASSERT_OK(ret, "find metadata");
+	}
+
+	/* Check output with json format */
+	ret = snprintf(cmd, MAX_BPFTOOL_CMD_LEN, "prog -j show pinned %s",
+		       test->bpffs_path);
+	if (!ASSERT_GT(ret, 0, "format pinned prog check command in json"))
+		return;
+	ret = get_bpftool_command_output(cmd, output,
+					 MAX_BPFTOOL_OUTPUT_LEN);
+	if (ASSERT_OK(ret, "get program info in json")) {
+		ret = check_metadata(output, test->expected_output_json,
+				ARRAY_SIZE(test->expected_output_json));
+		ASSERT_OK(ret, "find metadata in json");
+	}
+
+	/* Check that the corresponding map can be found and accessed */
+	ret = snprintf(cmd, MAX_BPFTOOL_CMD_LEN, "map show name %s",
+		       test->metadata_map_name);
+	if (!ASSERT_GT(ret, 0, "format map check command"))
+		return;
+	ASSERT_OK(run_bpftool_command(cmd), "access metadata map");
+}
+
+static struct test_desc tests[] = {
+	{
+		.name = "metadata_unused",
+		.bpf_prog = BPF_FILE_UNUSED,
+		.bpffs_path = BPFFS_UNUSED,
+		.expected_output = {
+			"a = \"foo\"",
+			"b = 1"
+		},
+		.expected_output_json = {
+			"\"metadata\":{\"a\":\"foo\",\"b\":1}"
+		},
+		.metadata_map_name = METADATA_MAP_NAME
+	},
+	{
+		.name = "metadata_used",
+		.bpf_prog = BPF_FILE_USED,
+		.bpffs_path = BPFFS_USED,
+		.expected_output = {
+			"a = \"bar\"",
+			"b = 2"
+		},
+		.expected_output_json = {
+			"\"metadata\":{\"a\":\"bar\",\"b\":2}"
+		},
+		.metadata_map_name = METADATA_MAP_NAME
+	}
+};
+static const int tests_count = ARRAY_SIZE(tests);
+
+void test_bpftool_metadata(void)
+{
+	int i;
+
+	for (i = 0; i < tests_count; i++) {
+		if (!test__start_subtest(tests[i].name))
+			continue;
+		if (ASSERT_OK(setup(&tests[i]), "setup bpffs pin dir")) {
+			run_test(&tests[i]);
+			cleanup(&tests[i]);
+		}
+	}
+}
diff --git a/tools/testing/selftests/bpf/test_bpftool_metadata.sh b/tools/testing/selftests/bpf/test_bpftool_metadata.sh
deleted file mode 100755
index b5520692f41b..000000000000
--- a/tools/testing/selftests/bpf/test_bpftool_metadata.sh
+++ /dev/null
@@ -1,85 +0,0 @@
-#!/bin/sh
-# SPDX-License-Identifier: GPL-2.0
-
-# Kselftest framework requirement - SKIP code is 4.
-ksft_skip=4
-
-BPF_FILE_USED="metadata_used.bpf.o"
-BPF_FILE_UNUSED="metadata_unused.bpf.o"
-
-TESTNAME=bpftool_metadata
-BPF_FS=$(awk '$3 == "bpf" {print $2; exit}' /proc/mounts)
-BPF_DIR=$BPF_FS/test_$TESTNAME
-
-_cleanup()
-{
-	set +e
-	rm -rf $BPF_DIR 2> /dev/null
-}
-
-cleanup_skip()
-{
-	echo "selftests: $TESTNAME [SKIP]"
-	_cleanup
-
-	exit $ksft_skip
-}
-
-cleanup()
-{
-	if [ "$?" = 0 ]; then
-		echo "selftests: $TESTNAME [PASS]"
-	else
-		echo "selftests: $TESTNAME [FAILED]"
-	fi
-	_cleanup
-}
-
-if [ $(id -u) -ne 0 ]; then
-	echo "selftests: $TESTNAME [SKIP] Need root privileges"
-	exit $ksft_skip
-fi
-
-if [ -z "$BPF_FS" ]; then
-	echo "selftests: $TESTNAME [SKIP] Could not run test without bpffs mounted"
-	exit $ksft_skip
-fi
-
-if ! bpftool version > /dev/null 2>&1; then
-	echo "selftests: $TESTNAME [SKIP] Could not run test without bpftool"
-	exit $ksft_skip
-fi
-
-set -e
-
-trap cleanup_skip EXIT
-
-mkdir $BPF_DIR
-
-trap cleanup EXIT
-
-bpftool prog load $BPF_FILE_UNUSED $BPF_DIR/unused
-
-METADATA_PLAIN="$(bpftool prog)"
-echo "$METADATA_PLAIN" | grep 'a = "foo"' > /dev/null
-echo "$METADATA_PLAIN" | grep 'b = 1' > /dev/null
-
-bpftool prog --json | grep '"metadata":{"a":"foo","b":1}' > /dev/null
-
-bpftool map | grep 'metadata.rodata' > /dev/null
-
-rm $BPF_DIR/unused
-
-bpftool prog load $BPF_FILE_USED $BPF_DIR/used
-
-METADATA_PLAIN="$(bpftool prog)"
-echo "$METADATA_PLAIN" | grep 'a = "bar"' > /dev/null
-echo "$METADATA_PLAIN" | grep 'b = 2' > /dev/null
-
-bpftool prog --json | grep '"metadata":{"a":"bar","b":2}' > /dev/null
-
-bpftool map | grep 'metadata.rodata' > /dev/null
-
-rm $BPF_DIR/used
-
-exit 0
-- 
cgit v1.2.3


From 2d96bbdfd3b5d28306001036c0161fcb1713f964 Mon Sep 17 00:00:00 2001
From: "Alexis Lothoré (eBPF Foundation)" <alexis.lothore@bootlin.com>
Date: Fri, 23 Jan 2026 09:30:10 +0100
Subject: selftests/bpf: convert test_bpftool_map_access.sh into test_progs
 framework
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The test_bpftool_map.sh script tests that maps read/write accesses
are being properly allowed/refused by the kernel depending on a specific
fmod_ret program being attached on security_bpf_map function.

Rewrite this test to integrate it in the test_progs. The
new test spawns a few subtests:

  #36/1    bpftool_maps_access/unprotected_unpinned:OK
  #36/2    bpftool_maps_access/unprotected_pinned:OK
  #36/3    bpftool_maps_access/protected_unpinned:OK
  #36/4    bpftool_maps_access/protected_pinned:OK
  #36/5    bpftool_maps_access/nested_maps:OK
  #36/6    bpftool_maps_access/btf_list:OK
  #36      bpftool_maps_access:OK
  Summary: 1/6 PASSED, 0 SKIPPED, 0 FAILED

Signed-off-by: Alexis Lothoré (eBPF Foundation) <alexis.lothore@bootlin.com>
Acked-by: Quentin Monnet <qmo@kernel.org>
Link: https://lore.kernel.org/r/20260123-bpftool-tests-v4-3-a6653a7f28e7@bootlin.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/Makefile               |   1 -
 .../selftests/bpf/prog_tests/bpftool_maps_access.c | 371 +++++++++++++++++++
 tools/testing/selftests/bpf/test_bpftool_map.sh    | 398 ---------------------
 3 files changed, 371 insertions(+), 399 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/bpftool_maps_access.c
 delete mode 100755 tools/testing/selftests/bpf/test_bpftool_map.sh

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 1bb7db1ed6ea..2c2f68a171ed 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -108,7 +108,6 @@ TEST_PROGS := test_kmod.sh \
 	test_xdping.sh \
 	test_bpftool_build.sh \
 	test_bpftool.sh \
-	test_bpftool_map.sh \
 	test_doc_build.sh \
 	test_xsk.sh \
 	test_xdp_features.sh
diff --git a/tools/testing/selftests/bpf/prog_tests/bpftool_maps_access.c b/tools/testing/selftests/bpf/prog_tests/bpftool_maps_access.c
new file mode 100644
index 000000000000..e0eb869cb1b4
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/bpftool_maps_access.c
@@ -0,0 +1,371 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <stdlib.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <stdint.h>
+#include <sys/stat.h>
+#include <stdbool.h>
+#include <linux/bpf.h>
+#include <bpf/libbpf.h>
+#include <bpftool_helpers.h>
+#include <test_progs.h>
+#include <bpf/bpf.h>
+#include "security_bpf_map.skel.h"
+
+#define PROTECTED_MAP_NAME	"prot_map"
+#define UNPROTECTED_MAP_NAME	"not_prot_map"
+#define BPF_ITER_FILE		"bpf_iter_map_elem.bpf.o"
+#define BPFFS_PIN_DIR		"/sys/fs/bpf/test_bpftool_map"
+#define INNER_MAP_NAME		"inner_map_tt"
+#define OUTER_MAP_NAME		"outer_map_tt"
+
+#define MAP_NAME_MAX_LEN	64
+#define PATH_MAX_LEN		128
+
+enum map_protection {
+	PROTECTED,
+	UNPROTECTED
+};
+
+struct test_desc {
+	char *name;
+	enum map_protection protection;
+	struct bpf_map *map;
+	char *map_name;
+	bool pinned;
+	char pin_path[PATH_MAX_LEN];
+	bool write_must_fail;
+};
+
+static struct security_bpf_map *general_setup(void)
+{
+	struct security_bpf_map *skel;
+	uint32_t key, value;
+	int ret, i;
+
+	skel = security_bpf_map__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "open and load skeleton"))
+		goto end;
+
+	struct bpf_map *maps[] = {skel->maps.prot_map, skel->maps.not_prot_map};
+
+	ret = security_bpf_map__attach(skel);
+	if (!ASSERT_OK(ret, "attach maps security programs"))
+		goto end_destroy;
+
+	for (i = 0; i < sizeof(maps)/sizeof(struct bpf_map *); i++) {
+		for (key = 0; key < 2; key++) {
+			int ret = bpf_map__update_elem(maps[i], &key,
+					sizeof(key), &key, sizeof(key),
+					0);
+			if (!ASSERT_OK(ret, "set initial map value"))
+				goto end_destroy;
+		}
+	}
+
+	key = 0;
+	value = 1;
+	ret = bpf_map__update_elem(skel->maps.prot_status_map, &key,
+			sizeof(key), &value, sizeof(value), 0);
+	if (!ASSERT_OK(ret, "configure map protection"))
+		goto end_destroy;
+
+	if (!ASSERT_OK(mkdir(BPFFS_PIN_DIR, S_IFDIR), "create bpffs pin dir"))
+		goto end_destroy;
+
+	return skel;
+end_destroy:
+	security_bpf_map__destroy(skel);
+end:
+	return NULL;
+}
+
+static void general_cleanup(struct security_bpf_map *skel)
+{
+	rmdir(BPFFS_PIN_DIR);
+	security_bpf_map__destroy(skel);
+}
+
+static void update_test_desc(struct security_bpf_map *skel,
+			      struct test_desc *test)
+{
+	/* Now that the skeleton is loaded, update all missing fields to
+	 * have the subtest properly configured
+	 */
+	if (test->protection == PROTECTED) {
+		test->map = skel->maps.prot_map;
+		test->map_name = PROTECTED_MAP_NAME;
+	} else {
+		test->map = skel->maps.not_prot_map;
+		test->map_name = UNPROTECTED_MAP_NAME;
+	}
+}
+
+static int test_setup(struct security_bpf_map *skel, struct test_desc *desc)
+{
+	int ret;
+
+	update_test_desc(skel, desc);
+
+	if (desc->pinned) {
+		ret = snprintf(desc->pin_path, PATH_MAX_LEN, "%s/%s", BPFFS_PIN_DIR,
+				desc->name);
+		if (!ASSERT_GT(ret, 0, "format pin path"))
+			return 1;
+		ret = bpf_map__pin(desc->map, desc->pin_path);
+		if (!ASSERT_OK(ret, "pin map"))
+			return 1;
+	}
+
+	return 0;
+}
+
+static void test_cleanup(struct test_desc *desc)
+{
+	if (desc->pinned)
+		bpf_map__unpin(desc->map, NULL);
+}
+
+static int lookup_map_value(char *map_handle)
+{
+	char cmd[MAX_BPFTOOL_CMD_LEN];
+	int ret = 0;
+
+	ret = snprintf(cmd, MAX_BPFTOOL_CMD_LEN, "map lookup %s key 0 0 0 0",
+			map_handle);
+	if (!ASSERT_GT(ret, 0, "format map lookup cmd"))
+		return 1;
+	return run_bpftool_command(cmd);
+}
+
+static int read_map_btf_data(char *map_handle)
+{
+	char cmd[MAX_BPFTOOL_CMD_LEN];
+	int ret = 0;
+
+	ret = snprintf(cmd, MAX_BPFTOOL_CMD_LEN, "btf dump map %s",
+			map_handle);
+	if (!ASSERT_GT(ret, 0, "format map btf dump cmd"))
+		return 1;
+	return run_bpftool_command(cmd);
+}
+
+static int write_map_value(char *map_handle)
+{
+	char cmd[MAX_BPFTOOL_CMD_LEN];
+	int ret = 0;
+
+	ret = snprintf(cmd, MAX_BPFTOOL_CMD_LEN,
+		       "map update %s key 0 0 0 0 value 1 1 1 1", map_handle);
+	if (!ASSERT_GT(ret, 0, "format value write cmd"))
+		return 1;
+	return run_bpftool_command(cmd);
+}
+
+static int delete_map_value(char *map_handle)
+{
+	char cmd[MAX_BPFTOOL_CMD_LEN];
+	int ret = 0;
+
+	ret = snprintf(cmd, MAX_BPFTOOL_CMD_LEN,
+		       "map delete %s key 0 0 0 0", map_handle);
+	if (!ASSERT_GT(ret, 0, "format value deletion cmd"))
+		return 1;
+	return run_bpftool_command(cmd);
+}
+
+static int iterate_on_map_values(char *map_handle, char *iter_pin_path)
+{
+	char cmd[MAX_BPFTOOL_CMD_LEN];
+	int ret = 0;
+
+
+	ret = snprintf(cmd, MAX_BPFTOOL_CMD_LEN, "iter pin %s %s map %s",
+		       BPF_ITER_FILE, iter_pin_path, map_handle);
+	if (!ASSERT_GT(ret, 0, "format iterator creation cmd"))
+		return 1;
+	ret = run_bpftool_command(cmd);
+	if (ret)
+		return ret;
+	ret = snprintf(cmd, MAP_NAME_MAX_LEN, "cat %s", iter_pin_path);
+	if (ret < 0)
+		goto cleanup;
+	ret = system(cmd);
+
+cleanup:
+	unlink(iter_pin_path);
+	return ret;
+}
+
+static int create_inner_map(void)
+{
+	char cmd[MAX_BPFTOOL_CMD_LEN];
+	int ret = 0;
+
+	ret = snprintf(
+		cmd, MAX_BPFTOOL_CMD_LEN,
+		"map create %s/%s type array key 4 value 4 entries 4 name %s",
+		BPFFS_PIN_DIR, INNER_MAP_NAME, INNER_MAP_NAME);
+	if (!ASSERT_GT(ret, 0, "format inner map create cmd"))
+		return 1;
+	return run_bpftool_command(cmd);
+}
+
+static int create_outer_map(void)
+{
+	char cmd[MAX_BPFTOOL_CMD_LEN];
+	int ret = 0;
+
+	ret = snprintf(
+		cmd, MAX_BPFTOOL_CMD_LEN,
+		"map create %s/%s type hash_of_maps key 4 value 4 entries 2 name %s inner_map name %s",
+		BPFFS_PIN_DIR, OUTER_MAP_NAME, OUTER_MAP_NAME, INNER_MAP_NAME);
+	if (!ASSERT_GT(ret, 0, "format outer map create cmd"))
+		return 1;
+	return run_bpftool_command(cmd);
+}
+
+static void delete_pinned_map(char *map_name)
+{
+	char pin_path[PATH_MAX_LEN];
+	int ret;
+
+	ret = snprintf(pin_path, PATH_MAX_LEN, "%s/%s", BPFFS_PIN_DIR,
+		       map_name);
+	if (ret >= 0)
+		unlink(pin_path);
+}
+
+static int add_outer_map_entry(int key)
+{
+	char cmd[MAX_BPFTOOL_CMD_LEN];
+	int ret = 0;
+
+	ret = snprintf(
+		cmd, MAX_BPFTOOL_CMD_LEN,
+		"map update pinned %s/%s key %d 0 0 0 value name %s",
+		BPFFS_PIN_DIR, OUTER_MAP_NAME, key, INNER_MAP_NAME);
+	if (!ASSERT_GT(ret, 0, "format outer map value addition cmd"))
+		return 1;
+	return run_bpftool_command(cmd);
+}
+
+static void test_basic_access(struct test_desc *desc)
+{
+	char map_handle[MAP_NAME_MAX_LEN];
+	char iter_pin_path[PATH_MAX_LEN];
+	int ret;
+
+	if (desc->pinned)
+		ret = snprintf(map_handle, MAP_NAME_MAX_LEN, "pinned %s",
+			       desc->pin_path);
+	else
+		ret = snprintf(map_handle, MAP_NAME_MAX_LEN, "name %s",
+			       desc->map_name);
+	if (!ASSERT_GT(ret, 0, "format map handle"))
+		return;
+
+	ret = lookup_map_value(map_handle);
+	ASSERT_OK(ret, "read map value");
+
+	ret = read_map_btf_data(map_handle);
+	ASSERT_OK(ret, "read map btf data");
+
+	ret = write_map_value(map_handle);
+	ASSERT_OK(desc->write_must_fail ? !ret : ret, "write map value");
+
+	ret = delete_map_value(map_handle);
+	ASSERT_OK(desc->write_must_fail ? !ret : ret, "delete map value");
+	/* Restore deleted value */
+	if (!ret)
+		write_map_value(map_handle);
+
+	ret = snprintf(iter_pin_path, PATH_MAX_LEN, "%s/iter", BPFFS_PIN_DIR);
+	if (ASSERT_GT(ret, 0, "format iter pin path")) {
+		ret = iterate_on_map_values(map_handle, iter_pin_path);
+		ASSERT_OK(ret, "iterate on map values");
+	}
+}
+
+static void test_create_nested_maps(void)
+{
+	if (!ASSERT_OK(create_inner_map(), "create inner map"))
+		return;
+	if (!ASSERT_OK(create_outer_map(), "create outer map"))
+		goto end_cleanup_inner;
+	ASSERT_OK(add_outer_map_entry(0), "add a first entry in outer map");
+	ASSERT_OK(add_outer_map_entry(1), "add a second entry in outer map");
+	ASSERT_NEQ(add_outer_map_entry(2), 0, "add a third entry in outer map");
+
+	delete_pinned_map(OUTER_MAP_NAME);
+end_cleanup_inner:
+	delete_pinned_map(INNER_MAP_NAME);
+}
+
+static void test_btf_list(void)
+{
+	ASSERT_OK(run_bpftool_command("btf list"), "list btf data");
+}
+
+static struct test_desc tests[] = {
+	{
+		.name = "unprotected_unpinned",
+		.protection = UNPROTECTED,
+		.map_name = UNPROTECTED_MAP_NAME,
+		.pinned = false,
+		.write_must_fail = false,
+	},
+	{
+		.name = "unprotected_pinned",
+		.protection = UNPROTECTED,
+		.map_name = UNPROTECTED_MAP_NAME,
+		.pinned = true,
+		.write_must_fail = false,
+	},
+	{
+		.name = "protected_unpinned",
+		.protection = PROTECTED,
+		.map_name = UNPROTECTED_MAP_NAME,
+		.pinned = false,
+		.write_must_fail = true,
+	},
+	{
+		.name = "protected_pinned",
+		.protection = PROTECTED,
+		.map_name = UNPROTECTED_MAP_NAME,
+		.pinned = true,
+		.write_must_fail = true,
+	}
+};
+
+static const size_t tests_count = ARRAY_SIZE(tests);
+
+void test_bpftool_maps_access(void)
+{
+	struct security_bpf_map *skel;
+	struct test_desc *current;
+	int i;
+
+	skel = general_setup();
+	if (!ASSERT_OK_PTR(skel, "prepare programs"))
+		goto cleanup;
+
+	for (i = 0; i < tests_count; i++) {
+		current = &tests[i];
+		if (!test__start_subtest(current->name))
+			continue;
+		if (ASSERT_OK(test_setup(skel, current), "subtest setup")) {
+			test_basic_access(current);
+			test_cleanup(current);
+		}
+	}
+	if (test__start_subtest("nested_maps"))
+		test_create_nested_maps();
+	if (test__start_subtest("btf_list"))
+		test_btf_list();
+
+cleanup:
+	general_cleanup(skel);
+}
+
diff --git a/tools/testing/selftests/bpf/test_bpftool_map.sh b/tools/testing/selftests/bpf/test_bpftool_map.sh
deleted file mode 100755
index 515b1df0501e..000000000000
--- a/tools/testing/selftests/bpf/test_bpftool_map.sh
+++ /dev/null
@@ -1,398 +0,0 @@
-#!/bin/sh
-# SPDX-License-Identifier: GPL-2.0
-
-# Kselftest framework requirement - SKIP code is 4.
-ksft_skip=4
-
-TESTNAME="bpftool_map"
-BPF_FILE="security_bpf_map.bpf.o"
-BPF_ITER_FILE="bpf_iter_map_elem.bpf.o"
-PROTECTED_MAP_NAME="prot_map"
-NOT_PROTECTED_MAP_NAME="not_prot_map"
-BPF_FS_TMP_PARENT="/tmp"
-BPF_FS_PARENT=$(awk '$3 == "bpf" {print $2; exit}' /proc/mounts)
-BPF_FS_PARENT=${BPF_FS_PARENT:-$BPF_FS_TMP_PARENT}
-# bpftool will mount bpf file system under BPF_DIR if it is not mounted
-# under BPF_FS_PARENT.
-BPF_DIR="$BPF_FS_PARENT/test_$TESTNAME"
-SCRIPT_DIR=$(dirname $(realpath "$0"))
-BPF_FILE_PATH="$SCRIPT_DIR/$BPF_FILE"
-BPF_ITER_FILE_PATH="$SCRIPT_DIR/$BPF_ITER_FILE"
-BPFTOOL_PATH="bpftool"
-# Assume the script is located under tools/testing/selftests/bpf/
-KDIR_ROOT_DIR=$(realpath "$SCRIPT_DIR"/../../../../)
-
-_cleanup()
-{
-	set +eu
-
-	# If BPF_DIR is a mount point this will not remove the mount point itself.
-	[ -d "$BPF_DIR" ] && rm -rf "$BPF_DIR" 2> /dev/null
-
-	# Unmount if BPF filesystem was temporarily created.
-	if [ "$BPF_FS_PARENT" = "$BPF_FS_TMP_PARENT" ]; then
-		# A loop and recursive unmount are required as bpftool might
-		# create multiple mounts. For example, a bind mount of the directory
-		# to itself. The bind mount is created to change mount propagation
-		# flags on an actual mount point.
-		max_attempts=3
-		attempt=0
-		while mountpoint -q "$BPF_DIR" && [ $attempt -lt $max_attempts ]; do
-			umount -R "$BPF_DIR" 2>/dev/null
-			attempt=$((attempt+1))
-		done
-
-		# The directory still exists. Remove it now.
-		[ -d "$BPF_DIR" ] && rm -rf "$BPF_DIR" 2>/dev/null
-	fi
-}
-
-cleanup_skip()
-{
-	echo "selftests: $TESTNAME [SKIP]"
-	_cleanup
-
-	exit $ksft_skip
-}
-
-cleanup()
-{
-	if [ "$?" = 0 ]; then
-		echo "selftests: $TESTNAME [PASS]"
-	else
-		echo "selftests: $TESTNAME [FAILED]"
-	fi
-	_cleanup
-}
-
-check_root_privileges() {
-	if [ $(id -u) -ne 0 ]; then
-		echo "Need root privileges"
-		exit $ksft_skip
-	fi
-}
-
-# Function to verify bpftool path.
-# Parameters:
-#   $1: bpftool path
-verify_bpftool_path() {
-	local bpftool_path="$1"
-	if ! "$bpftool_path" version > /dev/null 2>&1; then
-		echo "Could not run test without bpftool"
-		exit $ksft_skip
-	fi
-}
-
-# Function to verify BTF support.
-# The test requires BTF support for fmod_ret programs.
-verify_btf_support() {
-	if [ ! -f /sys/kernel/btf/vmlinux ]; then
-		echo "Could not run test without BTF support"
-		exit $ksft_skip
-	fi
-}
-
-# Function to initialize map entries with keys [0..2] and values set to 0.
-# Parameters:
-#  $1: Map name
-#  $2: bpftool path
-initialize_map_entries() {
-	local map_name="$1"
-	local bpftool_path="$2"
-
-	for key in 0 1 2; do
-		"$bpftool_path" map update name "$map_name" key $key 0 0 0 value 0 0 0 $key
-	done
-}
-
-# Test read access to the map.
-# Parameters:
-#   $1: Name command (name/pinned)
-#   $2: Map name
-#   $3: bpftool path
-#   $4: key
-access_for_read() {
-	local name_cmd="$1"
-	local map_name="$2"
-	local bpftool_path="$3"
-	local key="$4"
-
-	# Test read access to the map.
-	if ! "$bpftool_path" map lookup "$name_cmd" "$map_name" key $key 1>/dev/null; then
-		echo " Read access to $key in $map_name failed"
-		exit 1
-	fi
-
-	# Test read access to map's BTF data.
-	if ! "$bpftool_path" btf dump map "$name_cmd" "$map_name" 1>/dev/null; then
-		echo " Read access to $map_name for BTF data failed"
-		exit 1
-	fi
-}
-
-# Test write access to the map.
-# Parameters:
-#   $1: Name command (name/pinned)
-#   $2: Map name
-#   $3: bpftool path
-#   $4: key
-#   $5: Whether write should succeed (true/false)
-access_for_write() {
-	local name_cmd="$1"
-	local map_name="$2"
-	local bpftool_path="$3"
-	local key="$4"
-	local write_should_succeed="$5"
-	local value="1 1 1 1"
-
-	if "$bpftool_path" map update "$name_cmd" "$map_name" key $key value \
-			$value 2>/dev/null; then
-		if [ "$write_should_succeed" = "false" ]; then
-			echo " Write access to $key in $map_name succeeded but should have failed"
-			exit 1
-		fi
-	else
-		if [ "$write_should_succeed" = "true" ]; then
-			echo " Write access to $key in $map_name failed but should have succeeded"
-			exit 1
-		fi
-	fi
-}
-
-# Test entry deletion for the map.
-# Parameters:
-#   $1: Name command (name/pinned)
-#   $2: Map name
-#   $3: bpftool path
-#   $4: key
-#   $5: Whether write should succeed (true/false)
-access_for_deletion() {
-	local name_cmd="$1"
-	local map_name="$2"
-	local bpftool_path="$3"
-	local key="$4"
-	local write_should_succeed="$5"
-	local value="1 1 1 1"
-
-	# Test deletion by key for the map.
-	# Before deleting, check the key exists.
-	if ! "$bpftool_path" map lookup "$name_cmd" "$map_name" key $key 1>/dev/null; then
-		echo " Key $key does not exist in $map_name"
-		exit 1
-	fi
-
-	# Delete by key.
-	if "$bpftool_path" map delete "$name_cmd" "$map_name" key $key 2>/dev/null; then
-		if [ "$write_should_succeed" = "false" ]; then
-			echo " Deletion for $key in $map_name succeeded but should have failed"
-			exit 1
-		fi
-	else
-		if [ "$write_should_succeed" = "true" ]; then
-			echo " Deletion for $key in $map_name failed but should have succeeded"
-			exit 1
-		fi
-	fi
-
-	# After deleting, check the entry existence according to the expected status.
-	if "$bpftool_path" map lookup "$name_cmd" "$map_name" key $key 1>/dev/null; then
-		if [ "$write_should_succeed" = "true" ]; then
-			echo " Key $key for $map_name was not deleted but should have been deleted"
-			exit 1
-		fi
-	else
-		if [ "$write_should_succeed" = "false" ]; then
-			echo "Key $key for $map_name was deleted but should have not been deleted"
-			exit 1
-		fi
-	fi
-
-	# Test creation of map's deleted entry, if deletion was successful.
-	# Otherwise, the entry exists.
-	if "$bpftool_path" map update "$name_cmd" "$map_name" key $key value \
-				$value 2>/dev/null; then
-		if [ "$write_should_succeed" = "false" ]; then
-			echo " Write access to $key in $map_name succeeded after deletion attempt but should have failed"
-			exit 1
-		fi
-	else
-		if [ "$write_should_succeed" = "true" ]; then
-			echo " Write access to $key in $map_name failed after deletion attempt but should have succeeded"
-			exit 1
-		fi
-	fi
-}
-
-# Test map elements iterator.
-# Parameters:
-#   $1: Name command (name/pinned)
-#   $2: Map name
-#   $3: bpftool path
-#   $4: BPF_DIR
-#   $5: bpf iterator object file path
-iterate_map_elem() {
-	local name_cmd="$1"
-	local map_name="$2"
-	local bpftool_path="$3"
-	local bpf_dir="$4"
-	local bpf_file="$5"
-	local pin_path="$bpf_dir/map_iterator"
-
-	"$bpftool_path" iter pin "$bpf_file" "$pin_path" map "$name_cmd" "$map_name"
-	if [ ! -f "$pin_path" ]; then
-		echo " Failed to pin iterator to $pin_path"
-		exit 1
-	fi
-
-	cat "$pin_path" 1>/dev/null
-	rm "$pin_path" 2>/dev/null
-}
-
-# Function to test map access with configurable write expectations
-# Parameters:
-#   $1: Name command (name/pinned)
-#   $2: Map name
-#   $3: bpftool path
-#   $4: key for rw
-#   $5: key to delete
-#   $6: Whether write should succeed (true/false)
-#   $7: BPF_DIR
-#   $8: bpf iterator object file path
-access_map() {
-	local name_cmd="$1"
-	local map_name="$2"
-	local bpftool_path="$3"
-	local key_for_rw="$4"
-	local key_to_del="$5"
-	local write_should_succeed="$6"
-	local bpf_dir="$7"
-	local bpf_iter_file_path="$8"
-
-	access_for_read "$name_cmd" "$map_name" "$bpftool_path" "$key_for_rw"
-	access_for_write "$name_cmd" "$map_name" "$bpftool_path" "$key_for_rw" \
-		"$write_should_succeed"
-	access_for_deletion "$name_cmd" "$map_name" "$bpftool_path" "$key_to_del" \
-		"$write_should_succeed"
-	iterate_map_elem "$name_cmd" "$map_name" "$bpftool_path" "$bpf_dir" \
-		"$bpf_iter_file_path"
-}
-
-# Function to test map access with configurable write expectations
-# Parameters:
-#   $1: Map name
-#   $2: bpftool path
-#   $3: BPF_DIR
-#   $4: Whether write should succeed (true/false)
-#   $5: bpf iterator object file path
-test_map_access() {
-	local map_name="$1"
-	local bpftool_path="$2"
-	local bpf_dir="$3"
-	local pin_path="$bpf_dir/${map_name}_pinned"
-	local write_should_succeed="$4"
-	local bpf_iter_file_path="$5"
-
-	# Test access to the map by name.
-	access_map "name" "$map_name" "$bpftool_path" "0 0 0 0" "1 0 0 0" \
-		"$write_should_succeed" "$bpf_dir" "$bpf_iter_file_path"
-
-	# Pin the map to the BPF filesystem
-	"$bpftool_path" map pin name "$map_name" "$pin_path"
-	if [ ! -e "$pin_path" ]; then
-		echo " Failed to pin $map_name"
-		exit 1
-	fi
-
-	# Test access to the pinned map.
-	access_map "pinned" "$pin_path" "$bpftool_path" "0 0 0 0" "2 0 0 0" \
-		"$write_should_succeed" "$bpf_dir" "$bpf_iter_file_path"
-}
-
-# Function to test map creation and map-of-maps
-# Parameters:
-#   $1: bpftool path
-#   $2: BPF_DIR
-test_map_creation_and_map_of_maps() {
-	local bpftool_path="$1"
-	local bpf_dir="$2"
-	local outer_map_name="outer_map_tt"
-	local inner_map_name="inner_map_tt"
-
-	"$bpftool_path" map create "$bpf_dir/$inner_map_name" type array key 4 \
-		value 4 entries 4 name "$inner_map_name"
-	if [ ! -f "$bpf_dir/$inner_map_name" ]; then
-		echo " Failed to create inner map file at $bpf_dir/$outer_map_name"
-		return 1
-	fi
-
-	"$bpftool_path" map create "$bpf_dir/$outer_map_name" type hash_of_maps \
-		key 4 value 4 entries 2 name "$outer_map_name" inner_map name "$inner_map_name"
-	if [ ! -f "$bpf_dir/$outer_map_name" ]; then
-		echo " Failed to create outer map file at $bpf_dir/$outer_map_name"
-		return 1
-	fi
-
-	# Add entries to the outer map by name and by pinned path.
-	"$bpftool_path" map update pinned "$bpf_dir/$outer_map_name" key 0 0 0 0 \
-		value pinned "$bpf_dir/$inner_map_name"
-	"$bpftool_path" map update name "$outer_map_name" key 1 0 0 0 value \
-		name "$inner_map_name"
-
-	# The outer map should be full by now.
-	# The following map update command is expected to fail.
-	if "$bpftool_path" map update name "$outer_map_name" key 2 0 0 0 value name \
-		"$inner_map_name" 2>/dev/null; then
-		echo " Update for $outer_map_name succeeded but should have failed"
-		exit 1
-	fi
-}
-
-# Function to test map access with the btf list command
-# Parameters:
-#   $1: bpftool path
-test_map_access_with_btf_list() {
-	local bpftool_path="$1"
-
-	# The btf list command iterates over maps for
-	# loaded BPF programs.
-	if ! "$bpftool_path" btf list 1>/dev/null; then
-		echo " Failed to access btf data"
-		exit 1
-	fi
-}
-
-set -eu
-
-trap cleanup_skip EXIT
-
-check_root_privileges
-
-verify_bpftool_path "$BPFTOOL_PATH"
-
-verify_btf_support
-
-trap cleanup EXIT
-
-# Load and attach the BPF programs to control maps access.
-"$BPFTOOL_PATH" prog loadall "$BPF_FILE_PATH" "$BPF_DIR" autoattach
-
-initialize_map_entries "$PROTECTED_MAP_NAME" "$BPFTOOL_PATH"
-initialize_map_entries "$NOT_PROTECTED_MAP_NAME" "$BPFTOOL_PATH"
-
-# Activate the map protection mechanism. Protection status is controlled
-# by a value stored in the prot_status_map at index 0.
-"$BPFTOOL_PATH" map update name prot_status_map key 0 0 0 0 value 1 0 0 0
-
-# Test protected map (write should fail).
-test_map_access "$PROTECTED_MAP_NAME" "$BPFTOOL_PATH" "$BPF_DIR" "false" \
- "$BPF_ITER_FILE_PATH"
-
-# Test not protected map (write should succeed).
-test_map_access "$NOT_PROTECTED_MAP_NAME" "$BPFTOOL_PATH" "$BPF_DIR" "true" \
- "$BPF_ITER_FILE_PATH"
-
-test_map_creation_and_map_of_maps "$BPFTOOL_PATH" "$BPF_DIR"
-
-test_map_access_with_btf_list "$BPFTOOL_PATH"
-
-exit 0
-- 
cgit v1.2.3


From 08e8f1ef3df270daef4ffc9c4bb15669f72d5d2f Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Wed, 14 Jan 2026 22:47:56 -0800
Subject: kernel-chktaint: add reporting for tainted modules

Check all loaded modules and report any that have their 'taint'
flags set.  The tainted module output format is:
 * <module_name> (<taint_flags>)

Example output:

Kernel is "tainted" for the following reasons:
 * externally-built ('out-of-tree') module was loaded  (#12)
 * unsigned module was loaded (#13)
Raw taint value as int/string: 12288/'G           OE      '

Tainted modules:
 * dump_test (OE)

Link: https://lkml.kernel.org/r/20260115064756.531592-1-rdunlap@infradead.org
Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Acked-by: Thorsten Leemhuis <linux@leemhuis.info>
Cc: Jonathan Corbet <corbet@lwn.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/debugging/kernel-chktaint | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/debugging/kernel-chktaint b/tools/debugging/kernel-chktaint
index e7da0909d097..e1571c04afb5 100755
--- a/tools/debugging/kernel-chktaint
+++ b/tools/debugging/kernel-chktaint
@@ -211,9 +211,25 @@ else
 	addout "J"
 	echo " * fwctl's mutating debug interface was used (#19)"
 fi
+echo "Raw taint value as int/string: $taint/'$out'"
+
+# report on any tainted loadable modules
+[ "$1" = "" ] && [ -r /sys/module/ ] && \
+	cnt=`grep [A-Z] /sys/module/*/taint | wc -l` || cnt=0
 
+if [ $cnt -ne 0 ]; then
+	echo
+	echo "Tainted modules:"
+	for dir in `ls /sys/module` ; do
+		if [ -r /sys/module/$dir/taint ]; then
+			modtnt=`cat /sys/module/$dir/taint`
+			[ "$modtnt" = "" ] || echo " * $dir ($modtnt)"
+		fi
+	done
+fi
+
+echo
 echo "For a more detailed explanation of the various taint flags see"
 echo " Documentation/admin-guide/tainted-kernels.rst in the Linux kernel sources"
 echo " or https://kernel.org/doc/html/latest/admin-guide/tainted-kernels.html"
-echo "Raw taint value as int/string: $taint/'$out'"
 #EOF#
-- 
cgit v1.2.3


From a84a1fe0fb2e9bfccb1d5a2929a249960a93264d Mon Sep 17 00:00:00 2001
From: Gal Pressman <gal@nvidia.com>
Date: Sun, 25 Jan 2026 12:55:24 +0200
Subject: selftests: net: fix wrong boolean evaluation in __exit__

The __exit__ method receives ex_type as the exception class when an
exception occurs. The previous code used implicit boolean evaluation:

    terminate = self.terminate or (self._exit_wait and ex_type)
                                                   ^^^^^^^^^^^

In Python, the and operator can be used with non-boolean values, but it
does not always return a boolean result.

This is probably not what we want, because 'self._exit_wait and ex_type'
could return the actual ex_type value (the exception class) rather than
a boolean True when an exception occurs.

Use explicit `ex_type is not None` check to properly evaluate whether
an exception occurred, returning a boolean result.

Reviewed-by: Nimrod Oren <noren@nvidia.com>
Signed-off-by: Gal Pressman <gal@nvidia.com>
Link: https://patch.msgid.link/20260125105524.773993-1-gal@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/lib/py/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/lib/py/utils.py b/tools/testing/selftests/net/lib/py/utils.py
index 37243103aee3..85884f3e827b 100644
--- a/tools/testing/selftests/net/lib/py/utils.py
+++ b/tools/testing/selftests/net/lib/py/utils.py
@@ -160,7 +160,7 @@ class bkg(cmd):
 
     def __exit__(self, ex_type, ex_value, ex_tb):
         # Force termination on exception
-        terminate = self.terminate or (self._exit_wait and ex_type)
+        terminate = self.terminate or (self._exit_wait and ex_type is not None)
         return self.process(terminate=terminate, fail=self.check_fail)
 
 
-- 
cgit v1.2.3


From 53eb797ffc3abe30418b19777922b55fb339fc1f Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Date: Sun, 18 Jan 2026 14:50:41 +0000
Subject: mm/rmap: remove anon_vma_merge() function

This function is confusing, we already have the concept of anon_vma merge
to adjacent VMA's anon_vma's to increase probability of anon_vma
compatibility and therefore VMA merge (see is_mergeable_anon_vma() etc.),
as well as anon_vma reuse, along side the usual VMA merge logic.

We can remove the anon_vma check as it is redundant - a merge would not
have been permitted with removal if the anon_vma's were not the same (and
in the case of an unfaulted/faulted merge, we would have already set the
unfaulted VMA's anon_vma to vp->remove->anon_vma in dup_anon_vma()).

Avoid overloading this term when we're very simply unlinking anon_vma
state from a removed VMA upon merge.

Link: https://lkml.kernel.org/r/56bbe45e309f7af197b1c4f94a9a0c8931ff2d29.1768746221.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Suren Baghdasaryan <surenb@google.com>
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Barry Song <v-songbaohua@oppo.com>
Cc: Chris Li <chriscli@google.com>
Cc: David Hildenbrand <david@kernel.org>
Cc: Harry Yoo <harry.yoo@oracle.com>
Cc: Jann Horn <jannh@google.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Pedro Falcato <pfalcato@suse.de>
Cc: Rik van Riel <riel@surriel.com>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/rmap.h             | 7 -------
 mm/vma.c                         | 2 +-
 tools/testing/vma/vma_internal.h | 5 -----
 3 files changed, 1 insertion(+), 13 deletions(-)

(limited to 'tools')

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index daa92a58585d..832bfc0ccfc6 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -165,13 +165,6 @@ static inline int anon_vma_prepare(struct vm_area_struct *vma)
 	return __anon_vma_prepare(vma);
 }
 
-static inline void anon_vma_merge(struct vm_area_struct *vma,
-				  struct vm_area_struct *next)
-{
-	VM_BUG_ON_VMA(vma->anon_vma != next->anon_vma, vma);
-	unlink_anon_vmas(next);
-}
-
 struct anon_vma *folio_get_anon_vma(const struct folio *folio);
 
 #ifdef CONFIG_MM_ID
diff --git a/mm/vma.c b/mm/vma.c
index f81a5cfcd7cc..6c458c8656b8 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -381,7 +381,7 @@ again:
 			fput(vp->file);
 		}
 		if (vp->remove->anon_vma)
-			anon_vma_merge(vp->vma, vp->remove);
+			unlink_anon_vmas(vp->remove);
 		mm->map_count--;
 		mpol_put(vma_policy(vp->remove));
 		if (!vp->remove2)
diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
index 9f0a9f5ed0fe..93e5792306d9 100644
--- a/tools/testing/vma/vma_internal.h
+++ b/tools/testing/vma/vma_internal.h
@@ -1265,11 +1265,6 @@ static inline void i_mmap_unlock_write(struct address_space *mapping)
 {
 }
 
-static inline void anon_vma_merge(struct vm_area_struct *vma,
-				  struct vm_area_struct *next)
-{
-}
-
 static inline int userfaultfd_unmap_prep(struct vm_area_struct *vma,
 					 unsigned long start,
 					 unsigned long end,
-- 
cgit v1.2.3


From d17f02417a337de0a0c6e763e938ee5e41a97c3d Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Date: Sun, 18 Jan 2026 14:50:45 +0000
Subject: mm/rmap: separate out fork-only logic on anon_vma_clone()

Specify which operation is being performed to anon_vma_clone(), which
allows us to do checks specific to each operation type, as well as to
separate out and make clear that the anon_vma reuse logic is absolutely
specific to fork only.

This opens the door to further refactorings and refinements later as we
have more information to work with.

Link: https://lkml.kernel.org/r/cf7da7a2d973cdc72a1b80dd9a73260519e8fa9f.1768746221.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Reviewed-by: Suren Baghdasaryan <surenb@google.com>
Cc: Barry Song <v-songbaohua@oppo.com>
Cc: Chris Li <chriscli@google.com>
Cc: David Hildenbrand <david@kernel.org>
Cc: Harry Yoo <harry.yoo@oracle.com>
Cc: Jann Horn <jannh@google.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Pedro Falcato <pfalcato@suse.de>
Cc: Rik van Riel <riel@surriel.com>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/internal.h                    | 11 +++++-
 mm/rmap.c                        | 74 +++++++++++++++++++++++++++-------------
 mm/vma.c                         |  6 ++--
 tools/testing/vma/vma_internal.h | 11 +++++-
 4 files changed, 74 insertions(+), 28 deletions(-)

(limited to 'tools')

diff --git a/mm/internal.h b/mm/internal.h
index aac4ec53fe15..5585059f0209 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -244,7 +244,16 @@ static inline void anon_vma_unlock_read(struct anon_vma *anon_vma)
 
 struct anon_vma *folio_get_anon_vma(const struct folio *folio);
 
-int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src);
+/* Operations which modify VMAs. */
+enum vma_operation {
+	VMA_OP_SPLIT,
+	VMA_OP_MERGE_UNFAULTED,
+	VMA_OP_REMAP,
+	VMA_OP_FORK,
+};
+
+int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src,
+	enum vma_operation operation);
 int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma);
 int  __anon_vma_prepare(struct vm_area_struct *vma);
 void unlink_anon_vmas(struct vm_area_struct *vma);
diff --git a/mm/rmap.c b/mm/rmap.c
index a5ce9163454a..6ddbf58111ff 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -232,12 +232,13 @@ int __anon_vma_prepare(struct vm_area_struct *vma)
 }
 
 static void check_anon_vma_clone(struct vm_area_struct *dst,
-				 struct vm_area_struct *src)
+				 struct vm_area_struct *src,
+				 enum vma_operation operation)
 {
 	/* The write lock must be held. */
 	mmap_assert_write_locked(src->vm_mm);
-	/* If not a fork (implied by dst->anon_vma) then must be on same mm. */
-	VM_WARN_ON_ONCE(dst->anon_vma && dst->vm_mm != src->vm_mm);
+	/* If not a fork then must be on same mm. */
+	VM_WARN_ON_ONCE(operation != VMA_OP_FORK && dst->vm_mm != src->vm_mm);
 
 	/* If we have anything to do src->anon_vma must be provided. */
 	VM_WARN_ON_ONCE(!src->anon_vma && !list_empty(&src->anon_vma_chain));
@@ -249,6 +250,40 @@ static void check_anon_vma_clone(struct vm_area_struct *dst,
 	 * must be the same across dst and src.
 	 */
 	VM_WARN_ON_ONCE(dst->anon_vma && dst->anon_vma != src->anon_vma);
+	/*
+	 * Essentially equivalent to above - if not a no-op, we should expect
+	 * dst->anon_vma to be set for everything except a fork.
+	 */
+	VM_WARN_ON_ONCE(operation != VMA_OP_FORK && src->anon_vma &&
+			!dst->anon_vma);
+	/* For the anon_vma to be compatible, it can only be singular. */
+	VM_WARN_ON_ONCE(operation == VMA_OP_MERGE_UNFAULTED &&
+			!list_is_singular(&src->anon_vma_chain));
+#ifdef CONFIG_PER_VMA_LOCK
+	/* Only merging an unfaulted VMA leaves the destination attached. */
+	VM_WARN_ON_ONCE(operation != VMA_OP_MERGE_UNFAULTED &&
+			vma_is_attached(dst));
+#endif
+}
+
+static void maybe_reuse_anon_vma(struct vm_area_struct *dst,
+		struct anon_vma *anon_vma)
+{
+	/* If already populated, nothing to do.*/
+	if (dst->anon_vma)
+		return;
+
+	/*
+	 * We reuse an anon_vma if any linking VMAs were unmapped and it has
+	 * only a single child at most.
+	 */
+	if (anon_vma->num_active_vmas > 0)
+		return;
+	if (anon_vma->num_children > 1)
+		return;
+
+	dst->anon_vma = anon_vma;
+	anon_vma->num_active_vmas++;
 }
 
 static void cleanup_partial_anon_vmas(struct vm_area_struct *vma);
@@ -258,6 +293,7 @@ static void cleanup_partial_anon_vmas(struct vm_area_struct *vma);
  * all of the anon_vma objects contained within @src anon_vma_chain's.
  * @dst: The destination VMA with an empty anon_vma_chain.
  * @src: The source VMA we wish to duplicate.
+ * @operation: The type of operation which resulted in the clone.
  *
  * This is the heart of the VMA side of the anon_vma implementation - we invoke
  * this function whenever we need to set up a new VMA's anon_vma state.
@@ -280,17 +316,17 @@ static void cleanup_partial_anon_vmas(struct vm_area_struct *vma);
  *
  * Returns: 0 on success, -ENOMEM on failure.
  */
-int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
+int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src,
+		   enum vma_operation operation)
 {
 	struct anon_vma_chain *avc, *pavc;
+	struct anon_vma *active_anon_vma = src->anon_vma;
 
-	check_anon_vma_clone(dst, src);
+	check_anon_vma_clone(dst, src, operation);
 
-	if (!src->anon_vma)
+	if (!active_anon_vma)
 		return 0;
 
-	check_anon_vma_clone(dst, src);
-
 	/*
 	 * Allocate AVCs. We don't need an anon_vma lock for this as we
 	 * are not updating the anon_vma rbtree nor are we changing
@@ -318,22 +354,14 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
 		struct anon_vma *anon_vma = avc->anon_vma;
 
 		anon_vma_interval_tree_insert(avc, &anon_vma->rb_root);
-
-		/*
-		 * Reuse existing anon_vma if it has no vma and only one
-		 * anon_vma child.
-		 *
-		 * Root anon_vma is never reused:
-		 * it has self-parent reference and at least one child.
-		 */
-		if (!dst->anon_vma && src->anon_vma &&
-		    anon_vma->num_children < 2 &&
-		    anon_vma->num_active_vmas == 0)
-			dst->anon_vma = anon_vma;
+		if (operation == VMA_OP_FORK)
+			maybe_reuse_anon_vma(dst, anon_vma);
 	}
-	if (dst->anon_vma)
+
+	if (operation != VMA_OP_FORK)
 		dst->anon_vma->num_active_vmas++;
-	anon_vma_unlock_write(src->anon_vma);
+
+	anon_vma_unlock_write(active_anon_vma);
 	return 0;
 
  enomem_failure:
@@ -372,7 +400,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
 	 * First, attach the new VMA to the parent VMA's anon_vmas,
 	 * so rmap can find non-COWed pages in child processes.
 	 */
-	rc = anon_vma_clone(vma, pvma);
+	rc = anon_vma_clone(vma, pvma, VMA_OP_FORK);
 	/* An error arose or an existing anon_vma was reused, all done then. */
 	if (rc || vma->anon_vma) {
 		put_anon_vma(anon_vma);
diff --git a/mm/vma.c b/mm/vma.c
index 6c458c8656b8..3dbe414eff89 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -530,7 +530,7 @@ __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
 	if (err)
 		goto out_free_vmi;
 
-	err = anon_vma_clone(new, vma);
+	err = anon_vma_clone(new, vma, VMA_OP_SPLIT);
 	if (err)
 		goto out_free_mpol;
 
@@ -628,7 +628,7 @@ static int dup_anon_vma(struct vm_area_struct *dst,
 
 		vma_assert_write_locked(dst);
 		dst->anon_vma = src->anon_vma;
-		ret = anon_vma_clone(dst, src);
+		ret = anon_vma_clone(dst, src, VMA_OP_MERGE_UNFAULTED);
 		if (ret)
 			return ret;
 
@@ -1901,7 +1901,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
 		vma_set_range(new_vma, addr, addr + len, pgoff);
 		if (vma_dup_policy(vma, new_vma))
 			goto out_free_vma;
-		if (anon_vma_clone(new_vma, vma))
+		if (anon_vma_clone(new_vma, vma, VMA_OP_REMAP))
 			goto out_free_mempol;
 		if (new_vma->vm_file)
 			get_file(new_vma->vm_file);
diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
index 93e5792306d9..7fa56dcc53a6 100644
--- a/tools/testing/vma/vma_internal.h
+++ b/tools/testing/vma/vma_internal.h
@@ -600,6 +600,14 @@ struct mmap_action {
 	bool hide_from_rmap_until_complete :1;
 };
 
+/* Operations which modify VMAs. */
+enum vma_operation {
+	VMA_OP_SPLIT,
+	VMA_OP_MERGE_UNFAULTED,
+	VMA_OP_REMAP,
+	VMA_OP_FORK,
+};
+
 /*
  * Describes a VMA that is about to be mmap()'ed. Drivers may choose to
  * manipulate mutable fields which will cause those fields to be updated in the
@@ -1157,7 +1165,8 @@ static inline int vma_dup_policy(struct vm_area_struct *src, struct vm_area_stru
 	return 0;
 }
 
-static inline int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
+static inline int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src,
+				 enum vma_operation operation)
 {
 	/* For testing purposes. We indicate that an anon_vma has been cloned. */
 	if (src->anon_vma != NULL) {
-- 
cgit v1.2.3


From 7ce6dfc603ed01044ebe58472a584d9995281ca2 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Mon, 26 Jan 2026 14:05:48 -0800
Subject: perf script: Fix script_fetch_insn for more than just x86

The script_fetch_insn code was only supported on natively running x86.

Implement a crude elf_machine_max_instruction_length function and use to
give an instruction length on more than just x86.

Use the ELF machine to determine the length to use to support
cross-architecture development.

Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Leo Yan <leo.yan@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Shimin Guo <shimin.guo@skydio.com>
Cc: Yujie Liu <yujie.liu@intel.com>
[ Conditionally define EM_CSKY and EM_LOONGARCH for older distros ]
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/arch/x86/util/Build                     |  1 -
 tools/perf/arch/x86/util/archinsn.c                | 27 --------
 tools/perf/builtin-script.c                        | 16 +----
 .../perf/scripts/python/Perf-Trace-Util/Context.c  |  2 +-
 tools/perf/tests/dlfilter-test.c                   |  1 -
 tools/perf/util/archinsn.h                         | 12 ----
 tools/perf/util/dlfilter.c                         |  3 +-
 tools/perf/util/sample.c                           | 77 ++++++++++++++++++++++
 tools/perf/util/sample.h                           |  7 ++
 tools/perf/util/trace-event-scripting.c            | 16 -----
 tools/perf/util/trace-event.h                      |  3 -
 11 files changed, 87 insertions(+), 78 deletions(-)
 delete mode 100644 tools/perf/arch/x86/util/archinsn.c
 delete mode 100644 tools/perf/util/archinsn.h

(limited to 'tools')

diff --git a/tools/perf/arch/x86/util/Build b/tools/perf/arch/x86/util/Build
index fad256252bb9..76127eefde8b 100644
--- a/tools/perf/arch/x86/util/Build
+++ b/tools/perf/arch/x86/util/Build
@@ -14,6 +14,5 @@ perf-util-y += iostat.o
 perf-util-$(CONFIG_LOCAL_LIBUNWIND)    += unwind-libunwind.o
 
 perf-util-y += auxtrace.o
-perf-util-y += archinsn.o
 perf-util-y += intel-pt.o
 perf-util-y += intel-bts.o
diff --git a/tools/perf/arch/x86/util/archinsn.c b/tools/perf/arch/x86/util/archinsn.c
deleted file mode 100644
index 546feda08428..000000000000
--- a/tools/perf/arch/x86/util/archinsn.c
+++ /dev/null
@@ -1,27 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "archinsn.h"
-#include "event.h"
-#include "machine.h"
-#include "thread.h"
-#include "symbol.h"
-#include "../../../../arch/x86/include/asm/insn.h"
-
-void arch_fetch_insn(struct perf_sample *sample,
-		     struct thread *thread,
-		     struct machine *machine)
-{
-	struct insn insn;
-	int len, ret;
-	bool is64bit = false;
-
-	if (!sample->ip)
-		return;
-	len = thread__memcpy(thread, machine, sample->insn, sample->ip, sizeof(sample->insn), &is64bit);
-	if (len <= 0)
-		return;
-
-	ret = insn_decode(&insn, sample->insn, len,
-			  is64bit ? INSN_MODE_64 : INSN_MODE_32);
-	if (ret >= 0 && insn.length <= len)
-		sample->insn_len = insn.length;
-}
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index 6ec225c697a4..69af25780fc5 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -37,7 +37,6 @@
 #include "ui/ui.h"
 #include "print_binary.h"
 #include "print_insn.h"
-#include "archinsn.h"
 #include <linux/bitmap.h>
 #include <linux/compiler.h>
 #include <linux/kernel.h>
@@ -90,7 +89,6 @@ static bool			print_flags;
 static const char		*cpu_list;
 static DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS);
 static int			max_blocks;
-static bool			native_arch;
 static struct dlfilter		*dlfilter;
 static int			dlargc;
 static char			**dlargv;
@@ -1627,7 +1625,7 @@ static int perf_sample__fprintf_insn(struct perf_sample *sample,
 {
 	int printed = 0;
 
-	script_fetch_insn(sample, thread, machine, native_arch);
+	perf_sample__fetch_insn(sample, thread, machine);
 
 	if (PRINT_FIELD(INSNLEN))
 		printed += fprintf(fp, " ilen: %d", sample->insn_len);
@@ -4034,7 +4032,6 @@ int cmd_script(int argc, const char **argv)
 		.set = false,
 		.default_no_sample = true,
 	};
-	struct utsname uts;
 	char *script_path = NULL;
 	const char *dlfilter_file = NULL;
 	const char **__argv;
@@ -4456,17 +4453,6 @@ script_found:
 	if (symbol__init(env) < 0)
 		goto out_delete;
 
-	uname(&uts);
-	if (data.is_pipe) { /* Assume pipe_mode indicates native_arch */
-		native_arch = true;
-	} else if (env->arch) {
-		if (!strcmp(uts.machine, env->arch))
-			native_arch = true;
-		else if (!strcmp(uts.machine, "x86_64") &&
-			 !strcmp(env->arch, "i386"))
-			native_arch = true;
-	}
-
 	script.session = session;
 	script__setup_sample_type(&script);
 
diff --git a/tools/perf/scripts/python/Perf-Trace-Util/Context.c b/tools/perf/scripts/python/Perf-Trace-Util/Context.c
index 60dcfe56d4d9..c19f44610983 100644
--- a/tools/perf/scripts/python/Perf-Trace-Util/Context.c
+++ b/tools/perf/scripts/python/Perf-Trace-Util/Context.c
@@ -93,7 +93,7 @@ static PyObject *perf_sample_insn(PyObject *obj, PyObject *args)
 	if (c->sample->ip && !c->sample->insn_len && thread__maps(c->al->thread)) {
 		struct machine *machine =  maps__machine(thread__maps(c->al->thread));
 
-		script_fetch_insn(c->sample, c->al->thread, machine, /*native_arch=*/true);
+		perf_sample__fetch_insn(c->sample, c->al->thread, machine);
 	}
 	if (!c->sample->insn_len)
 		Py_RETURN_NONE; /* N.B. This is a return statement */
diff --git a/tools/perf/tests/dlfilter-test.c b/tools/perf/tests/dlfilter-test.c
index 80a1c941138d..e63790c61d53 100644
--- a/tools/perf/tests/dlfilter-test.c
+++ b/tools/perf/tests/dlfilter-test.c
@@ -30,7 +30,6 @@
 #include "symbol.h"
 #include "synthetic-events.h"
 #include "util.h"
-#include "archinsn.h"
 #include "dlfilter.h"
 #include "tests.h"
 #include "util/sample.h"
diff --git a/tools/perf/util/archinsn.h b/tools/perf/util/archinsn.h
deleted file mode 100644
index 448cbb6b8d7e..000000000000
--- a/tools/perf/util/archinsn.h
+++ /dev/null
@@ -1,12 +0,0 @@
-#ifndef INSN_H
-#define INSN_H 1
-
-struct perf_sample;
-struct machine;
-struct thread;
-
-void arch_fetch_insn(struct perf_sample *sample,
-		     struct thread *thread,
-		     struct machine *machine);
-
-#endif
diff --git a/tools/perf/util/dlfilter.c b/tools/perf/util/dlfilter.c
index c0afcbd954f8..dc31b5e7149e 100644
--- a/tools/perf/util/dlfilter.c
+++ b/tools/perf/util/dlfilter.c
@@ -234,8 +234,7 @@ static const __u8 *dlfilter__insn(void *ctx, __u32 *len)
 			struct machine *machine = maps__machine(thread__maps(al->thread));
 
 			if (machine)
-				script_fetch_insn(d->sample, al->thread, machine,
-						  /*native_arch=*/true);
+				perf_sample__fetch_insn(d->sample, al->thread, machine);
 		}
 	}
 
diff --git a/tools/perf/util/sample.c b/tools/perf/util/sample.c
index 605fee971f55..6d70a5db00a2 100644
--- a/tools/perf/util/sample.c
+++ b/tools/perf/util/sample.c
@@ -1,9 +1,18 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 #include "sample.h"
 #include "debug.h"
+#include "thread.h"
+#include <elf.h>
+#ifndef EM_CSKY
+#define EM_CSKY		252
+#endif
+#ifndef EM_LOONGARCH
+#define EM_LOONGARCH	258
+#endif
 #include <linux/zalloc.h>
 #include <stdlib.h>
 #include <string.h>
+#include "../../arch/x86/include/asm/insn.h"
 
 void perf_sample__init(struct perf_sample *sample, bool all)
 {
@@ -41,3 +50,71 @@ struct regs_dump *perf_sample__intr_regs(struct perf_sample *sample)
 	}
 	return sample->intr_regs;
 }
+
+static int elf_machine_max_instruction_length(uint16_t e_machine)
+{
+	switch (e_machine) {
+	/* Fixed 4-byte (32-bit) architectures */
+	case EM_AARCH64:
+	case EM_PPC:
+	case EM_PPC64:
+	case EM_MIPS:
+	case EM_SPARC:
+	case EM_SPARCV9:
+	case EM_ALPHA:
+	case EM_LOONGARCH:
+	case EM_PARISC:
+	case EM_SH:
+		return 4;
+
+	/* Variable length or mixed-mode architectures */
+	case EM_ARM:    /* Variable due to Thumb/Thumb-2 */
+	case EM_RISCV:  /* Variable due to Compressed (C) extension */
+	case EM_CSKY:   /* Variable (16 or 32 bit) */
+	case EM_ARC:    /* Variable (ARCompact) */
+		return 4;
+	case EM_S390:   /* Variable (2, 4, or 6 bytes) */
+		return 6;
+	case EM_68K:
+		return 10;
+	case EM_386:
+	case EM_X86_64:
+		return 15;
+	case EM_XTENSA: /* Variable (FLIX) */
+		return 16;
+	default:
+		return MAX_INSN;
+	}
+}
+
+void perf_sample__fetch_insn(struct perf_sample *sample,
+			     struct thread *thread,
+			     struct machine *machine)
+{
+	int ret, len;
+	bool is64bit = false;
+	uint16_t e_machine;
+
+	if (!sample->ip || sample->insn_len != 0)
+		return;
+
+	e_machine = thread__e_machine(thread, machine);
+	len = elf_machine_max_instruction_length(e_machine);
+	len = thread__memcpy(thread, machine, sample->insn,
+			     sample->ip, len,
+			     &is64bit);
+	if (len <= 0)
+		return;
+
+	sample->insn_len = len;
+
+	if (e_machine == EM_386 || e_machine == EM_X86_64) {
+		/* Refine the x86 instruction length with the decoder. */
+		struct insn insn;
+
+		ret = insn_decode(&insn, sample->insn, len,
+				  is64bit ? INSN_MODE_64 : INSN_MODE_32);
+		if (ret >= 0 && insn.length <= len)
+			sample->insn_len = insn.length;
+	}
+}
diff --git a/tools/perf/util/sample.h b/tools/perf/util/sample.h
index a8307b20a9ea..3cce8dd202aa 100644
--- a/tools/perf/util/sample.h
+++ b/tools/perf/util/sample.h
@@ -5,6 +5,9 @@
 #include <linux/perf_event.h>
 #include <linux/types.h>
 
+struct machine;
+struct thread;
+
 /* number of register is bound by the number of bits in regs_dump::mask (64) */
 #define PERF_SAMPLE_REGS_CACHE_SIZE (8 * sizeof(u64))
 
@@ -127,6 +130,10 @@ void perf_sample__exit(struct perf_sample *sample);
 struct regs_dump *perf_sample__user_regs(struct perf_sample *sample);
 struct regs_dump *perf_sample__intr_regs(struct perf_sample *sample);
 
+void perf_sample__fetch_insn(struct perf_sample *sample,
+			     struct thread *thread,
+			     struct machine *machine);
+
 /*
  * raw_data is always 4 bytes from an 8-byte boundary, so subtract 4 to get
  * 8-byte alignment.
diff --git a/tools/perf/util/trace-event-scripting.c b/tools/perf/util/trace-event-scripting.c
index 72abb28b7b5a..fa850e44cb46 100644
--- a/tools/perf/util/trace-event-scripting.c
+++ b/tools/perf/util/trace-event-scripting.c
@@ -13,7 +13,6 @@
 #include <event-parse.h>
 #endif
 
-#include "archinsn.h"
 #include "debug.h"
 #include "event.h"
 #include "trace-event.h"
@@ -274,21 +273,6 @@ void setup_perl_scripting(void)
 #endif
 #endif
 
-#if !defined(__i386__) && !defined(__x86_64__)
-void arch_fetch_insn(struct perf_sample *sample __maybe_unused,
-		     struct thread *thread __maybe_unused,
-		     struct machine *machine __maybe_unused)
-{
-}
-#endif
-
-void script_fetch_insn(struct perf_sample *sample, struct thread *thread,
-		       struct machine *machine, bool native_arch)
-{
-	if (sample->insn_len == 0 && native_arch)
-		arch_fetch_insn(sample, thread, machine);
-}
-
 static const struct {
 	u32 flags;
 	const char *name;
diff --git a/tools/perf/util/trace-event.h b/tools/perf/util/trace-event.h
index 71e680bc3d4b..914d9b69ed62 100644
--- a/tools/perf/util/trace-event.h
+++ b/tools/perf/util/trace-event.h
@@ -116,9 +116,6 @@ extern unsigned int scripting_max_stack;
 struct scripting_ops *script_spec__lookup(const char *spec);
 int script_spec__for_each(int (*cb)(struct scripting_ops *ops, const char *spec));
 
-void script_fetch_insn(struct perf_sample *sample, struct thread *thread,
-		       struct machine *machine, bool native_arch);
-
 void setup_perl_scripting(void);
 void setup_python_scripting(void);
 
-- 
cgit v1.2.3


From f33e7aa42ea79f2142f073df777c01125def45e5 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Mon, 26 Jan 2026 14:05:49 -0800
Subject: perf callchain: Switch callchain_param_setup from an arch to an
 e_machine

Increase use of e_machine by replacing callchain_param_setup's arch
argument to be an e_machine typically read from the session.

Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Leo Yan <leo.yan@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Shimin Guo <shimin.guo@skydio.com>
Cc: Yujie Liu <yujie.liu@intel.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-report.c | 6 ++----
 tools/perf/builtin-script.c | 4 ++--
 tools/perf/util/callchain.c | 4 ++--
 tools/perf/util/callchain.h | 2 +-
 tools/perf/util/sample.c    | 2 +-
 5 files changed, 8 insertions(+), 10 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 2e936928e8c0..810ffd66b11c 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -448,7 +448,7 @@ static int report__setup_sample_type(struct report *rep)
 		}
 	}
 
-	callchain_param_setup(sample_type, perf_env__arch(perf_session__env(rep->session)));
+	callchain_param_setup(sample_type, perf_session__e_machine(session));
 
 	if (rep->stitch_lbr && (callchain_param.record_mode != CALLCHAIN_LBR)) {
 		ui__warning("Can't find LBR callchain. Switch off --stitch-lbr.\n"
@@ -1283,7 +1283,6 @@ static int process_attr(const struct perf_tool *tool __maybe_unused,
 			struct evlist **pevlist)
 {
 	struct perf_session *session;
-	struct perf_env *env;
 	u64 sample_type;
 	int err;
 
@@ -1297,8 +1296,7 @@ static int process_attr(const struct perf_tool *tool __maybe_unused,
 	 */
 	sample_type = evlist__combined_sample_type(*pevlist);
 	session = (*pevlist)->session;
-	env = perf_session__env(session);
-	callchain_param_setup(sample_type, perf_env__arch(env));
+	callchain_param_setup(sample_type, perf_session__e_machine(session));
 	return 0;
 }
 
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index 69af25780fc5..c7d5a325b5cb 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -2859,7 +2859,7 @@ static int process_attr(const struct perf_tool *tool, union perf_event *event,
 	 * on events sample_type.
 	 */
 	sample_type = evlist__combined_sample_type(evlist);
-	callchain_param_setup(sample_type, perf_env__arch(perf_session__env(scr->session)));
+	callchain_param_setup(sample_type, perf_session__e_machine(evsel__session(evsel)));
 
 	/* Enable fields for callchain entries */
 	if (symbol_conf.use_callchain &&
@@ -3834,7 +3834,7 @@ static void script__setup_sample_type(struct perf_script *script)
 	struct perf_session *session = script->session;
 	u64 sample_type = evlist__combined_sample_type(session->evlist);
 
-	callchain_param_setup(sample_type, perf_env__arch(session->machines.host.env));
+	callchain_param_setup(sample_type, perf_session__e_machine(session));
 
 	if (script->stitch_lbr && (callchain_param.record_mode != CALLCHAIN_LBR)) {
 		pr_warning("Can't find LBR callchain. Switch off --stitch-lbr.\n"
diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c
index 515bb8b5da01..8ff0898799ee 100644
--- a/tools/perf/util/callchain.c
+++ b/tools/perf/util/callchain.c
@@ -1681,7 +1681,7 @@ void callchain_cursor_reset(struct callchain_cursor *cursor)
 		map_symbol__exit(&node->ms);
 }
 
-void callchain_param_setup(u64 sample_type, const char *arch)
+void callchain_param_setup(u64 sample_type, uint16_t e_machine)
 {
 	if (symbol_conf.use_callchain || symbol_conf.cumulate_callchain) {
 		if ((sample_type & PERF_SAMPLE_REGS_USER) &&
@@ -1703,7 +1703,7 @@ void callchain_param_setup(u64 sample_type, const char *arch)
 	 * erroneous entries. Always skipping the LR and starting from the FP
 	 * can result in missing entries.
 	 */
-	if (callchain_param.record_mode == CALLCHAIN_FP && !strcmp(arch, "arm64"))
+	if (callchain_param.record_mode == CALLCHAIN_FP && e_machine == EM_AARCH64)
 		dwarf_callchain_users = true;
 }
 
diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h
index 2a52af8c80ac..df54ddb8c0cb 100644
--- a/tools/perf/util/callchain.h
+++ b/tools/perf/util/callchain.h
@@ -303,7 +303,7 @@ int callchain_branch_counts(struct callchain_root *root,
 			    u64 *branch_count, u64 *predicted_count,
 			    u64 *abort_count, u64 *cycles_count);
 
-void callchain_param_setup(u64 sample_type, const char *arch);
+void callchain_param_setup(u64 sample_type, uint16_t e_machine);
 
 bool callchain_cnode_matched(struct callchain_node *base_cnode,
 			     struct callchain_node *pair_cnode);
diff --git a/tools/perf/util/sample.c b/tools/perf/util/sample.c
index 6d70a5db00a2..8f82aaf1aab6 100644
--- a/tools/perf/util/sample.c
+++ b/tools/perf/util/sample.c
@@ -98,7 +98,7 @@ void perf_sample__fetch_insn(struct perf_sample *sample,
 	if (!sample->ip || sample->insn_len != 0)
 		return;
 
-	e_machine = thread__e_machine(thread, machine);
+	e_machine = thread__e_machine(thread, machine, /*e_flags=*/NULL);
 	len = elf_machine_max_instruction_length(e_machine);
 	len = thread__memcpy(thread, machine, sample->insn,
 			     sample->ip, len,
-- 
cgit v1.2.3


From 4b870f62c5079b48a6a19c852f4db5d2569a5239 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Mon, 26 Jan 2026 14:05:50 -0800
Subject: perf thread-stack: Switch thread_stack__init() to use e_machine

The architecture type is used to set the retpoline state.

Rather than use the arch string switch to using the ELF machine that's
readily available within the thread.

Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Leo Yan <leo.yan@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Shimin Guo <shimin.guo@skydio.com>
Cc: Yujie Liu <yujie.liu@intel.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/thread-stack.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/thread-stack.c b/tools/perf/util/thread-stack.c
index c6a0a27b12c2..c5ce741b0744 100644
--- a/tools/perf/util/thread-stack.c
+++ b/tools/perf/util/thread-stack.c
@@ -157,10 +157,10 @@ static int thread_stack__init(struct thread_stack *ts, struct thread *thread,
 
 	if (thread__maps(thread) && maps__machine(thread__maps(thread))) {
 		struct machine *machine = maps__machine(thread__maps(thread));
-		const char *arch = perf_env__arch(machine->env);
+		uint16_t e_machine = thread__e_machine(thread, machine, /*e_flags=*/NULL);
 
 		ts->kernel_start = machine__kernel_start(machine);
-		if (!strcmp(arch, "x86"))
+		if (e_machine == EM_X86_64 || e_machine == EM_386)
 			ts->rstate = X86_RETPOLINE_POSSIBLE;
 	} else {
 		ts->kernel_start = 1ULL << 63;
-- 
cgit v1.2.3


From 23262369e650c9995505eb4f69f16449467e6bfe Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 27 Jan 2026 01:15:47 -0300
Subject: perf disasm: Constify variables storing the result of bsearch() on
 const tables

As newer glibcs will propagate the const attribute of the searched table
to its return.

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/disasm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/perf/util/disasm.c b/tools/perf/util/disasm.c
index 6b36287f30fe..ddcc488f2e5f 100644
--- a/tools/perf/util/disasm.c
+++ b/tools/perf/util/disasm.c
@@ -761,7 +761,7 @@ static void ins__sort(struct arch *arch)
 static const struct ins_ops *__ins__find(const struct arch *arch, const char *name,
 				     struct disasm_line *dl)
 {
-	struct ins *ins;
+	const struct ins *ins;
 	const int nmemb = arch->nr_instructions;
 
 	if (arch__is_powerpc(arch)) {
-- 
cgit v1.2.3


From b42868624c7d00206f77d19a6fbfea73a44ff6f2 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 27 Jan 2026 01:15:47 -0300
Subject: perf metricgroup: Constify variables storing the result of strchr()
 on const tables

As newer glibcs will propagate the const attribute of the searched table
to its return.

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/metricgroup.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/metricgroup.c b/tools/perf/util/metricgroup.c
index 25c75fdbfc52..40a1e14de418 100644
--- a/tools/perf/util/metricgroup.c
+++ b/tools/perf/util/metricgroup.c
@@ -367,7 +367,7 @@ static int setup_metric_events(const char *pmu, struct hashmap *ids,
 static bool match_metric_or_groups(const char *metric_or_groups, const char *sought)
 {
 	int len;
-	char *m;
+	const char *m;
 
 	if (!sought)
 		return false;
@@ -450,11 +450,10 @@ static const char *code_characters = ",-=@";
 
 static int encode_metric_id(struct strbuf *sb, const char *x)
 {
-	char *c;
 	int ret = 0;
 
 	for (; *x; x++) {
-		c = strchr(code_characters, *x);
+		const char *c = strchr(code_characters, *x);
 		if (c) {
 			ret = strbuf_addch(sb, '!');
 			if (ret)
-- 
cgit v1.2.3


From 678ed6b707e4b2db250f255d2f959322896dae65 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 27 Jan 2026 02:03:01 -0300
Subject: perf strlist: Don't write to const memory

Do a strdup to the list string and parse from it, free at the end.

This is to deal with newer glibcs const-correctness.

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/strlist.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/strlist.c b/tools/perf/util/strlist.c
index 8a868cbeffae..98883672fcf4 100644
--- a/tools/perf/util/strlist.c
+++ b/tools/perf/util/strlist.c
@@ -139,21 +139,25 @@ out:
 	return err;
 }
 
-static int strlist__parse_list(struct strlist *slist, const char *s, const char *subst_dir)
+static int strlist__parse_list(struct strlist *slist, const char *list, const char *subst_dir)
 {
-	char *sep;
+	char *sep, *s = strdup(list), *sdup = s;
 	int err;
 
+	if (s == NULL)
+		return -ENOMEM;
+
 	while ((sep = strchr(s, ',')) != NULL) {
 		*sep = '\0';
 		err = strlist__parse_list_entry(slist, s, subst_dir);
-		*sep = ',';
 		if (err != 0)
 			return err;
 		s = sep + 1;
 	}
 
-	return *s ? strlist__parse_list_entry(slist, s, subst_dir) : 0;
+	err = *s ? strlist__parse_list_entry(slist, s, subst_dir) : 0;
+	free(sdup);
+	return err;
 }
 
 struct strlist *strlist__new(const char *list, const struct strlist_config *config)
-- 
cgit v1.2.3


From f1321cce848c558fde4c0c6bcd5e53f3cefd3af2 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 27 Jan 2026 02:09:37 -0300
Subject: perf session: Don't write to memory pointed to a const pointer

Since it is freshly allocated just attribute it to a non-const pointer
and then change it via that pointer.

That way we avoid const-correctness warnings in recent glibc versions.

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/session.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index ae62d5c9889f..d0053618f540 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -2676,7 +2676,7 @@ bool perf_session__has_switch_events(struct perf_session *session)
 
 int map__set_kallsyms_ref_reloc_sym(struct map *map, const char *symbol_name, u64 addr)
 {
-	char *bracket;
+	char *bracket, *name;
 	struct ref_reloc_sym *ref;
 	struct kmap *kmap;
 
@@ -2684,13 +2684,13 @@ int map__set_kallsyms_ref_reloc_sym(struct map *map, const char *symbol_name, u6
 	if (ref == NULL)
 		return -ENOMEM;
 
-	ref->name = strdup(symbol_name);
+	ref->name = name = strdup(symbol_name);
 	if (ref->name == NULL) {
 		free(ref);
 		return -ENOMEM;
 	}
 
-	bracket = strchr(ref->name, ']');
+	bracket = strchr(name, ']');
 	if (bracket)
 		*bracket = '\0';
 
-- 
cgit v1.2.3


From 0e47251e8cc438d5b59fcd86d27efade01976fe1 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 27 Jan 2026 02:15:30 -0300
Subject: perf hwmon_pmu: Constify the variables returning bsearch() on const
 tables

To address const-correctness errors on newer glibcs (-Werror=discarded-qualifiers).

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/hwmon_pmu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/perf/util/hwmon_pmu.c b/tools/perf/util/hwmon_pmu.c
index 279d6b1a47f0..fb3ffa8d32ad 100644
--- a/tools/perf/util/hwmon_pmu.c
+++ b/tools/perf/util/hwmon_pmu.c
@@ -161,7 +161,7 @@ bool parse_hwmon_filename(const char *filename,
 			  bool *alarm)
 {
 	char fn_type[24];
-	const char **elem;
+	const char * const *elem;
 	const char *fn_item = NULL;
 	size_t fn_item_len;
 
-- 
cgit v1.2.3


From 0341eab66ba03a1f439db91f03bccd5b0a360842 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 27 Jan 2026 02:22:00 -0300
Subject: perf tp_pmu: Address const-correctness errors in recent glibcs

To avoid having more variables, just cast the const variable searched to
non-const since the result will not be modified, its only later that
that variable will be used to modify something, but then its non-const
memory being modified, so using a cast is the cheapest thing here.

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/tp_pmu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/perf/util/tp_pmu.c b/tools/perf/util/tp_pmu.c
index eddb9807131a..c2be8c9f9084 100644
--- a/tools/perf/util/tp_pmu.c
+++ b/tools/perf/util/tp_pmu.c
@@ -192,7 +192,7 @@ bool tp_pmu__have_event(struct perf_pmu *pmu __maybe_unused, const char *name)
 	char *dup_name, *colon;
 	int id;
 
-	colon = strchr(name, ':');
+	colon = strchr((char *)name, ':');
 	if (colon == NULL)
 		return false;
 
-- 
cgit v1.2.3


From 97b81df7225830c4db3c17ed1235d2f3eb613d3d Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 27 Jan 2026 01:15:47 -0300
Subject: perf trace-event: Constify variables storing the result of strchr()
 on const tables

As newer glibcs will propagate the const attribute of the searched table
to its return.

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/trace-event-info.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/perf/util/trace-event-info.c b/tools/perf/util/trace-event-info.c
index c8755679281e..45774722f249 100644
--- a/tools/perf/util/trace-event-info.c
+++ b/tools/perf/util/trace-event-info.c
@@ -482,7 +482,7 @@ char *tracepoint_id_to_name(u64 config)
 static struct tracepoint_path *tracepoint_name_to_path(const char *name)
 {
 	struct tracepoint_path *path = zalloc(sizeof(*path));
-	char *str = strchr(name, ':');
+	const char *str = strchr(name, ':');
 
 	if (path == NULL || str == NULL) {
 		free(path);
-- 
cgit v1.2.3


From 0e14cb3b24f8f301cf6490a4493afc98321ed5bb Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 27 Jan 2026 01:15:47 -0300
Subject: perf units: Constify variables storing the result of strchr() on
 const tables

As newer glibcs will propagate the const attribute of the searched table
to its return.

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/units.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/perf/util/units.c b/tools/perf/util/units.c
index 4c6a86e1cb54..0bbacf5a29aa 100644
--- a/tools/perf/util/units.c
+++ b/tools/perf/util/units.c
@@ -12,7 +12,7 @@ unsigned long parse_tag_value(const char *str, struct parse_tag *tags)
 	struct parse_tag *i = tags;
 
 	while (i->tag) {
-		char *s = strchr(str, i->tag);
+		const char *s = strchr(str, i->tag);
 
 		if (s) {
 			unsigned long int value;
-- 
cgit v1.2.3


From 21c0bc9144834e39762dd6fddbb255ebb80cf079 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 27 Jan 2026 01:15:47 -0300
Subject: perf time-utils: Constify variables storing the result of strchr() on
 const tables

As newer glibcs will propagate the const attribute of the searched table
to its return.

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/time-utils.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/time-utils.c b/tools/perf/util/time-utils.c
index 1b91ccd4d523..d43c4577d7eb 100644
--- a/tools/perf/util/time-utils.c
+++ b/tools/perf/util/time-utils.c
@@ -325,7 +325,7 @@ static int percent_comma_split(struct perf_time_interval *ptime_buf, int num,
 }
 
 static int one_percent_convert(struct perf_time_interval *ptime_buf,
-			       const char *ostr, u64 start, u64 end, char *c)
+			       const char *ostr, u64 start, u64 end, const char *c)
 {
 	char *str;
 	int len = strlen(ostr), ret;
@@ -358,7 +358,7 @@ static int one_percent_convert(struct perf_time_interval *ptime_buf,
 int perf_time__percent_parse_str(struct perf_time_interval *ptime_buf, int num,
 				 const char *ostr, u64 start, u64 end)
 {
-	char *c;
+	const char *c;
 
 	/*
 	 * ostr example:
-- 
cgit v1.2.3


From 79bba3a1834e7ba6c437674582cc9f3ae6fb638c Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 27 Jan 2026 01:15:47 -0300
Subject: perf demangle-java: Constify variables storing the result of strchr()
 on const tables

As newer glibcs will propagate the const attribute of the searched table
to its return.

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/demangle-java.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/perf/util/demangle-java.c b/tools/perf/util/demangle-java.c
index ddf33d58bcd3..c3cb327ed562 100644
--- a/tools/perf/util/demangle-java.c
+++ b/tools/perf/util/demangle-java.c
@@ -158,7 +158,7 @@ char *
 java_demangle_sym(const char *str, int flags)
 {
 	char *buf, *ptr;
-	char *p;
+	const char *p;
 	size_t len, l1 = 0;
 
 	if (!str)
-- 
cgit v1.2.3


From 8bf093acb3f1f07d846c86e32308f9f9954ed579 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 27 Jan 2026 01:15:47 -0300
Subject: perf bpf-event: Constify variables storing the result of strchr() on
 const tables

As newer glibcs will propagate the const attribute of the searched table
to its return.

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/bpf-event.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/perf/util/bpf-event.c b/tools/perf/util/bpf-event.c
index 2e6da3ad0a4f..67e7786bb878 100644
--- a/tools/perf/util/bpf-event.c
+++ b/tools/perf/util/bpf-event.c
@@ -733,7 +733,8 @@ kallsyms_process_symbol(void *data, const char *_name,
 			char type __maybe_unused, u64 start)
 {
 	char disp[KSYM_NAME_LEN];
-	char *module, *name;
+	const char *module;
+	char *name;
 	unsigned long id;
 	int err = 0;
 
-- 
cgit v1.2.3


From 68abacb0686651dd3f0bbce2fa94b438afeb2fc4 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 27 Jan 2026 01:15:47 -0300
Subject: perf jitdump: Constify variables storing the result of strchr() on
 const tables

As newer glibcs will propagate the const attribute of the searched table
to its return.

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/jitdump.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/perf/util/jitdump.c b/tools/perf/util/jitdump.c
index d4fe35f9d9a5..e0ce8b904729 100644
--- a/tools/perf/util/jitdump.c
+++ b/tools/perf/util/jitdump.c
@@ -758,7 +758,7 @@ jit_inject(struct jit_buf_desc *jd, const char *path)
 static int
 jit_detect(const char *mmap_name, pid_t pid, struct nsinfo *nsi, bool *in_pidns)
  {
-	char *p;
+	const char *p;
 	char *end = NULL;
 	pid_t pid2;
 
-- 
cgit v1.2.3


From 873e7de9f9a3b67b08b380057b2c7828b0d78cae Mon Sep 17 00:00:00 2001
From: Bobby Eshleman <bobbyeshleman@meta.com>
Date: Wed, 21 Jan 2026 14:11:44 -0800
Subject: selftests/vsock: increase timeout to 1200

Increase the timeout from 300s to 1200s. On a modern bare metal server
my last run showed the new set of tests taking ~400s. Multiply by an
(arbitrary) factor of three to account for slower/nested runners.

Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
Signed-off-by: Bobby Eshleman <bobbyeshleman@meta.com>
Link: https://patch.msgid.link/20260121-vsock-vmtest-v16-4-2859a7512097@meta.com
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 tools/testing/selftests/vsock/settings | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/vsock/settings b/tools/testing/selftests/vsock/settings
index 694d70710ff0..79b65bdf05db 100644
--- a/tools/testing/selftests/vsock/settings
+++ b/tools/testing/selftests/vsock/settings
@@ -1 +1 @@
-timeout=300
+timeout=1200
-- 
cgit v1.2.3


From 423ec6383edba92e78abbb99a776147b3fe7b2ca Mon Sep 17 00:00:00 2001
From: Bobby Eshleman <bobbyeshleman@meta.com>
Date: Wed, 21 Jan 2026 14:11:45 -0800
Subject: selftests/vsock: add namespace helpers to vmtest.sh

Add functions for initializing namespaces with the different vsock NS
modes. Callers can use add_namespaces() and del_namespaces() to create
namespaces global0, global1, local0, and local1.

The add_namespaces() function initializes global0, local0, etc... with
their respective vsock NS mode by toggling child_ns_mode before creating
the namespace.

Remove namespaces upon exiting the program in cleanup(). This is
unlikely to be needed for a healthy run, but it is useful for tests that
are manually killed mid-test.

This patch is in preparation for later namespace tests.

Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
Signed-off-by: Bobby Eshleman <bobbyeshleman@meta.com>
Link: https://patch.msgid.link/20260121-vsock-vmtest-v16-5-2859a7512097@meta.com
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 tools/testing/selftests/vsock/vmtest.sh | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/vsock/vmtest.sh b/tools/testing/selftests/vsock/vmtest.sh
index c7b270dd77a9..c2bdc293b94c 100755
--- a/tools/testing/selftests/vsock/vmtest.sh
+++ b/tools/testing/selftests/vsock/vmtest.sh
@@ -49,6 +49,7 @@ readonly TEST_DESCS=(
 )
 
 readonly USE_SHARED_VM=(vm_server_host_client vm_client_host_server vm_loopback)
+readonly NS_MODES=("local" "global")
 
 VERBOSE=0
 
@@ -103,6 +104,36 @@ check_result() {
 	fi
 }
 
+add_namespaces() {
+	local orig_mode
+	orig_mode=$(cat /proc/sys/net/vsock/child_ns_mode)
+
+	for mode in "${NS_MODES[@]}"; do
+		echo "${mode}" > /proc/sys/net/vsock/child_ns_mode
+		ip netns add "${mode}0" 2>/dev/null
+		ip netns add "${mode}1" 2>/dev/null
+	done
+
+	echo "${orig_mode}" > /proc/sys/net/vsock/child_ns_mode
+}
+
+init_namespaces() {
+	for mode in "${NS_MODES[@]}"; do
+		# we need lo for qemu port forwarding
+		ip netns exec "${mode}0" ip link set dev lo up
+		ip netns exec "${mode}1" ip link set dev lo up
+	done
+}
+
+del_namespaces() {
+	for mode in "${NS_MODES[@]}"; do
+		ip netns del "${mode}0" &>/dev/null
+		ip netns del "${mode}1" &>/dev/null
+		log_host "removed ns ${mode}0"
+		log_host "removed ns ${mode}1"
+	done
+}
+
 vm_ssh() {
 	ssh -q -o UserKnownHostsFile=/dev/null -p ${SSH_HOST_PORT} localhost "$@"
 	return $?
@@ -110,6 +141,7 @@ vm_ssh() {
 
 cleanup() {
 	terminate_pidfiles "${!PIDFILES[@]}"
+	del_namespaces
 }
 
 check_args() {
-- 
cgit v1.2.3


From fd1b41725d585f29029b8d8610a155f26727c18e Mon Sep 17 00:00:00 2001
From: Bobby Eshleman <bobbyeshleman@meta.com>
Date: Wed, 21 Jan 2026 14:11:46 -0800
Subject: selftests/vsock: prepare vm management helpers for namespaces

Add namespace support to vm management, ssh helpers, and vsock_test
wrapper functions. This enables running VMs and test helpers in specific
namespaces, which is required for upcoming namespace isolation tests.

The functions still work correctly within the init ns, though the caller
must now pass "init_ns" explicitly.

No functional changes for existing tests. All have been updated to pass
"init_ns" explicitly.

Affected functions (such as vm_start() and vm_ssh()) now wrap their
commands with 'ip netns exec' when executing commands in non-init
namespaces.

Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
Signed-off-by: Bobby Eshleman <bobbyeshleman@meta.com>
Link: https://patch.msgid.link/20260121-vsock-vmtest-v16-6-2859a7512097@meta.com
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 tools/testing/selftests/vsock/vmtest.sh | 101 ++++++++++++++++++++++----------
 1 file changed, 69 insertions(+), 32 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/vsock/vmtest.sh b/tools/testing/selftests/vsock/vmtest.sh
index c2bdc293b94c..c4d73dd0a4cf 100755
--- a/tools/testing/selftests/vsock/vmtest.sh
+++ b/tools/testing/selftests/vsock/vmtest.sh
@@ -135,7 +135,18 @@ del_namespaces() {
 }
 
 vm_ssh() {
-	ssh -q -o UserKnownHostsFile=/dev/null -p ${SSH_HOST_PORT} localhost "$@"
+	local ns_exec
+
+	if [[ "${1}" == init_ns ]]; then
+		ns_exec=""
+	else
+		ns_exec="ip netns exec ${1}"
+	fi
+
+	shift
+
+	${ns_exec} ssh -q -o UserKnownHostsFile=/dev/null -p "${SSH_HOST_PORT}" localhost "$@"
+
 	return $?
 }
 
@@ -258,10 +269,12 @@ terminate_pidfiles() {
 
 vm_start() {
 	local pidfile=$1
+	local ns=$2
 	local logfile=/dev/null
 	local verbose_opt=""
 	local kernel_opt=""
 	local qemu_opts=""
+	local ns_exec=""
 	local qemu
 
 	qemu=$(command -v "${QEMU}")
@@ -282,7 +295,11 @@ vm_start() {
 		kernel_opt="${KERNEL_CHECKOUT}"
 	fi
 
-	vng \
+	if [[ "${ns}" != "init_ns" ]]; then
+		ns_exec="ip netns exec ${ns}"
+	fi
+
+	${ns_exec} vng \
 		--run \
 		${kernel_opt} \
 		${verbose_opt} \
@@ -297,6 +314,7 @@ vm_start() {
 }
 
 vm_wait_for_ssh() {
+	local ns=$1
 	local i
 
 	i=0
@@ -304,7 +322,8 @@ vm_wait_for_ssh() {
 		if [[ ${i} -gt ${WAIT_PERIOD_MAX} ]]; then
 			die "Timed out waiting for guest ssh"
 		fi
-		if vm_ssh -- true; then
+
+		if vm_ssh "${ns}" -- true; then
 			break
 		fi
 		i=$(( i + 1 ))
@@ -338,30 +357,41 @@ wait_for_listener()
 }
 
 vm_wait_for_listener() {
-	local port=$1
+	local ns=$1
+	local port=$2
 
-	vm_ssh <<EOF
+	vm_ssh "${ns}" <<EOF
 $(declare -f wait_for_listener)
 wait_for_listener ${port} ${WAIT_PERIOD} ${WAIT_PERIOD_MAX}
 EOF
 }
 
 host_wait_for_listener() {
-	local port=$1
+	local ns=$1
+	local port=$2
 
-	wait_for_listener "${port}" "${WAIT_PERIOD}" "${WAIT_PERIOD_MAX}"
+	if [[ "${ns}" == "init_ns" ]]; then
+		wait_for_listener "${port}" "${WAIT_PERIOD}" "${WAIT_PERIOD_MAX}"
+	else
+		ip netns exec "${ns}" bash <<-EOF
+			$(declare -f wait_for_listener)
+			wait_for_listener ${port} ${WAIT_PERIOD} ${WAIT_PERIOD_MAX}
+		EOF
+	fi
 }
 
+
 vm_vsock_test() {
-	local host=$1
-	local cid=$2
-	local port=$3
+	local ns=$1
+	local host=$2
+	local cid=$3
+	local port=$4
 	local rc
 
 	# log output and use pipefail to respect vsock_test errors
 	set -o pipefail
 	if [[ "${host}" != server ]]; then
-		vm_ssh -- "${VSOCK_TEST}" \
+		vm_ssh "${ns}" -- "${VSOCK_TEST}" \
 			--mode=client \
 			--control-host="${host}" \
 			--peer-cid="${cid}" \
@@ -369,7 +399,7 @@ vm_vsock_test() {
 			2>&1 | log_guest
 		rc=$?
 	else
-		vm_ssh -- "${VSOCK_TEST}" \
+		vm_ssh "${ns}" -- "${VSOCK_TEST}" \
 			--mode=server \
 			--peer-cid="${cid}" \
 			--control-port="${port}" \
@@ -381,7 +411,7 @@ vm_vsock_test() {
 			return $rc
 		fi
 
-		vm_wait_for_listener "${port}"
+		vm_wait_for_listener "${ns}" "${port}"
 		rc=$?
 	fi
 	set +o pipefail
@@ -390,22 +420,28 @@ vm_vsock_test() {
 }
 
 host_vsock_test() {
-	local host=$1
-	local cid=$2
-	local port=$3
+	local ns=$1
+	local host=$2
+	local cid=$3
+	local port=$4
 	local rc
 
+	local cmd="${VSOCK_TEST}"
+	if [[ "${ns}" != "init_ns" ]]; then
+		cmd="ip netns exec ${ns} ${cmd}"
+	fi
+
 	# log output and use pipefail to respect vsock_test errors
 	set -o pipefail
 	if [[ "${host}" != server ]]; then
-		${VSOCK_TEST} \
+		${cmd} \
 			--mode=client \
 			--peer-cid="${cid}" \
 			--control-host="${host}" \
 			--control-port="${port}" 2>&1 | log_host
 		rc=$?
 	else
-		${VSOCK_TEST} \
+		${cmd} \
 			--mode=server \
 			--peer-cid="${cid}" \
 			--control-port="${port}" 2>&1 | log_host &
@@ -416,7 +452,7 @@ host_vsock_test() {
 			return $rc
 		fi
 
-		host_wait_for_listener "${port}"
+		host_wait_for_listener "${ns}" "${port}"
 		rc=$?
 	fi
 	set +o pipefail
@@ -460,11 +496,11 @@ log_guest() {
 }
 
 test_vm_server_host_client() {
-	if ! vm_vsock_test "server" 2 "${TEST_GUEST_PORT}"; then
+	if ! vm_vsock_test "init_ns" "server" 2 "${TEST_GUEST_PORT}"; then
 		return "${KSFT_FAIL}"
 	fi
 
-	if ! host_vsock_test "127.0.0.1" "${VSOCK_CID}" "${TEST_HOST_PORT}"; then
+	if ! host_vsock_test "init_ns" "127.0.0.1" "${VSOCK_CID}" "${TEST_HOST_PORT}"; then
 		return "${KSFT_FAIL}"
 	fi
 
@@ -472,11 +508,11 @@ test_vm_server_host_client() {
 }
 
 test_vm_client_host_server() {
-	if ! host_vsock_test "server" "${VSOCK_CID}" "${TEST_HOST_PORT_LISTENER}"; then
+	if ! host_vsock_test "init_ns" "server" "${VSOCK_CID}" "${TEST_HOST_PORT_LISTENER}"; then
 		return "${KSFT_FAIL}"
 	fi
 
-	if ! vm_vsock_test "10.0.2.2" 2 "${TEST_HOST_PORT_LISTENER}"; then
+	if ! vm_vsock_test "init_ns" "10.0.2.2" 2 "${TEST_HOST_PORT_LISTENER}"; then
 		return "${KSFT_FAIL}"
 	fi
 
@@ -486,13 +522,14 @@ test_vm_client_host_server() {
 test_vm_loopback() {
 	local port=60000 # non-forwarded local port
 
-	vm_ssh -- modprobe vsock_loopback &> /dev/null || :
+	vm_ssh "init_ns" -- modprobe vsock_loopback &> /dev/null || :
 
-	if ! vm_vsock_test "server" 1 "${port}"; then
+	if ! vm_vsock_test "init_ns" "server" 1 "${port}"; then
 		return "${KSFT_FAIL}"
 	fi
 
-	if ! vm_vsock_test "127.0.0.1" 1 "${port}"; then
+
+	if ! vm_vsock_test "init_ns" "127.0.0.1" 1 "${port}"; then
 		return "${KSFT_FAIL}"
 	fi
 
@@ -550,8 +587,8 @@ run_shared_vm_test() {
 
 	host_oops_cnt_before=$(dmesg | grep -c -i 'Oops')
 	host_warn_cnt_before=$(dmesg --level=warn | grep -c -i 'vsock')
-	vm_oops_cnt_before=$(vm_ssh -- dmesg | grep -c -i 'Oops')
-	vm_warn_cnt_before=$(vm_ssh -- dmesg --level=warn | grep -c -i 'vsock')
+	vm_oops_cnt_before=$(vm_ssh "init_ns" -- dmesg | grep -c -i 'Oops')
+	vm_warn_cnt_before=$(vm_ssh "init_ns" -- dmesg --level=warn | grep -c -i 'vsock')
 
 	name=$(echo "${1}" | awk '{ print $1 }')
 	eval test_"${name}"
@@ -569,13 +606,13 @@ run_shared_vm_test() {
 		rc=$KSFT_FAIL
 	fi
 
-	vm_oops_cnt_after=$(vm_ssh -- dmesg | grep -i 'Oops' | wc -l)
+	vm_oops_cnt_after=$(vm_ssh "init_ns" -- dmesg | grep -i 'Oops' | wc -l)
 	if [[ ${vm_oops_cnt_after} -gt ${vm_oops_cnt_before} ]]; then
 		echo "FAIL: kernel oops detected on vm" | log_host
 		rc=$KSFT_FAIL
 	fi
 
-	vm_warn_cnt_after=$(vm_ssh -- dmesg --level=warn | grep -c -i 'vsock')
+	vm_warn_cnt_after=$(vm_ssh "init_ns" -- dmesg --level=warn | grep -c -i 'vsock')
 	if [[ ${vm_warn_cnt_after} -gt ${vm_warn_cnt_before} ]]; then
 		echo "FAIL: kernel warning detected on vm" | log_host
 		rc=$KSFT_FAIL
@@ -621,8 +658,8 @@ cnt_total=0
 if shared_vm_tests_requested "${ARGS[@]}"; then
 	log_host "Booting up VM"
 	pidfile="$(create_pidfile)"
-	vm_start "${pidfile}"
-	vm_wait_for_ssh
+	vm_start "${pidfile}" "init_ns"
+	vm_wait_for_ssh "init_ns"
 	log_host "VM booted up"
 
 	run_shared_vm_tests "${ARGS[@]}"
-- 
cgit v1.2.3


From 4e870ac81df7e9628bdc08ff6f957a42e80582ee Mon Sep 17 00:00:00 2001
From: Bobby Eshleman <bobbyeshleman@meta.com>
Date: Wed, 21 Jan 2026 14:11:47 -0800
Subject: selftests/vsock: add vm_dmesg_{warn,oops}_count() helpers

These functions are reused by the VM tests to collect and compare dmesg
warnings and oops counts. The future VM-specific tests use them heavily.
This patches relies on vm_ssh() already supporting namespaces.

Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
Signed-off-by: Bobby Eshleman <bobbyeshleman@meta.com>
Link: https://patch.msgid.link/20260121-vsock-vmtest-v16-7-2859a7512097@meta.com
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 tools/testing/selftests/vsock/vmtest.sh | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/vsock/vmtest.sh b/tools/testing/selftests/vsock/vmtest.sh
index c4d73dd0a4cf..4b5929ffc9eb 100755
--- a/tools/testing/selftests/vsock/vmtest.sh
+++ b/tools/testing/selftests/vsock/vmtest.sh
@@ -380,6 +380,17 @@ host_wait_for_listener() {
 	fi
 }
 
+vm_dmesg_oops_count() {
+	local ns=$1
+
+	vm_ssh "${ns}" -- dmesg 2>/dev/null | grep -c -i 'Oops'
+}
+
+vm_dmesg_warn_count() {
+	local ns=$1
+
+	vm_ssh "${ns}" -- dmesg --level=warn 2>/dev/null | grep -c -i 'vsock'
+}
 
 vm_vsock_test() {
 	local ns=$1
@@ -587,8 +598,8 @@ run_shared_vm_test() {
 
 	host_oops_cnt_before=$(dmesg | grep -c -i 'Oops')
 	host_warn_cnt_before=$(dmesg --level=warn | grep -c -i 'vsock')
-	vm_oops_cnt_before=$(vm_ssh "init_ns" -- dmesg | grep -c -i 'Oops')
-	vm_warn_cnt_before=$(vm_ssh "init_ns" -- dmesg --level=warn | grep -c -i 'vsock')
+	vm_oops_cnt_before=$(vm_dmesg_oops_count "init_ns")
+	vm_warn_cnt_before=$(vm_dmesg_warn_count "init_ns")
 
 	name=$(echo "${1}" | awk '{ print $1 }')
 	eval test_"${name}"
@@ -606,13 +617,13 @@ run_shared_vm_test() {
 		rc=$KSFT_FAIL
 	fi
 
-	vm_oops_cnt_after=$(vm_ssh "init_ns" -- dmesg | grep -i 'Oops' | wc -l)
+	vm_oops_cnt_after=$(vm_dmesg_oops_count "init_ns")
 	if [[ ${vm_oops_cnt_after} -gt ${vm_oops_cnt_before} ]]; then
 		echo "FAIL: kernel oops detected on vm" | log_host
 		rc=$KSFT_FAIL
 	fi
 
-	vm_warn_cnt_after=$(vm_ssh "init_ns" -- dmesg --level=warn | grep -c -i 'vsock')
+	vm_warn_cnt_after=$(vm_dmesg_warn_count "init_ns")
 	if [[ ${vm_warn_cnt_after} -gt ${vm_warn_cnt_before} ]]; then
 		echo "FAIL: kernel warning detected on vm" | log_host
 		rc=$KSFT_FAIL
-- 
cgit v1.2.3


From 7418f3bb3aa289fbf52f93b551e79ba647371f51 Mon Sep 17 00:00:00 2001
From: Bobby Eshleman <bobbyeshleman@meta.com>
Date: Wed, 21 Jan 2026 14:11:48 -0800
Subject: selftests/vsock: use ss to wait for listeners instead of /proc/net

Replace /proc/net parsing with ss(8) for detecting listening sockets in
wait_for_listener() functions and add support for TCP, VSOCK, and Unix
socket protocols.

The previous implementation parsed /proc/net/tcp using awk to detect
listening sockets, but this approach could not support vsock because
vsock does not export socket information to /proc/net/.

Instead, use ss so that we can detect listeners on tcp, vsock, and unix.

The protocol parameter is now required for all wait_for_listener family
functions (wait_for_listener, vm_wait_for_listener,
host_wait_for_listener) to explicitly specify which socket type to wait
for.

ss is added to the dependency check in check_deps().

Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
Signed-off-by: Bobby Eshleman <bobbyeshleman@meta.com>
Link: https://patch.msgid.link/20260121-vsock-vmtest-v16-8-2859a7512097@meta.com
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 tools/testing/selftests/vsock/vmtest.sh | 47 +++++++++++++++++++++------------
 1 file changed, 30 insertions(+), 17 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/vsock/vmtest.sh b/tools/testing/selftests/vsock/vmtest.sh
index 4b5929ffc9eb..0e681d4c3a15 100755
--- a/tools/testing/selftests/vsock/vmtest.sh
+++ b/tools/testing/selftests/vsock/vmtest.sh
@@ -182,7 +182,7 @@ check_args() {
 }
 
 check_deps() {
-	for dep in vng ${QEMU} busybox pkill ssh; do
+	for dep in vng ${QEMU} busybox pkill ssh ss; do
 		if [[ ! -x $(command -v "${dep}") ]]; then
 			echo -e "skip:    dependency ${dep} not found!\n"
 			exit "${KSFT_SKIP}"
@@ -337,21 +337,32 @@ wait_for_listener()
 	local port=$1
 	local interval=$2
 	local max_intervals=$3
-	local protocol=tcp
-	local pattern
+	local protocol=$4
 	local i
 
-	pattern=":$(printf "%04X" "${port}") "
-
-	# for tcp protocol additionally check the socket state
-	[ "${protocol}" = "tcp" ] && pattern="${pattern}0A"
-
 	for i in $(seq "${max_intervals}"); do
-		if awk -v pattern="${pattern}" \
-			'BEGIN {rc=1} $2" "$4 ~ pattern {rc=0} END {exit rc}' \
-			/proc/net/"${protocol}"*; then
+		case "${protocol}" in
+		tcp)
+			if ss --listening --tcp --numeric | grep -q ":${port} "; then
+				break
+			fi
+			;;
+		vsock)
+			if ss --listening --vsock --numeric | grep -q ":${port} "; then
+				break
+			fi
+			;;
+		unix)
+			# For unix sockets, port is actually the socket path
+			if ss --listening --unix | grep -q "${port}"; then
+				break
+			fi
+			;;
+		*)
+			echo "Unknown protocol: ${protocol}" >&2
 			break
-		fi
+			;;
+		esac
 		sleep "${interval}"
 	done
 }
@@ -359,23 +370,25 @@ wait_for_listener()
 vm_wait_for_listener() {
 	local ns=$1
 	local port=$2
+	local protocol=$3
 
 	vm_ssh "${ns}" <<EOF
 $(declare -f wait_for_listener)
-wait_for_listener ${port} ${WAIT_PERIOD} ${WAIT_PERIOD_MAX}
+wait_for_listener ${port} ${WAIT_PERIOD} ${WAIT_PERIOD_MAX} ${protocol}
 EOF
 }
 
 host_wait_for_listener() {
 	local ns=$1
 	local port=$2
+	local protocol=$3
 
 	if [[ "${ns}" == "init_ns" ]]; then
-		wait_for_listener "${port}" "${WAIT_PERIOD}" "${WAIT_PERIOD_MAX}"
+		wait_for_listener "${port}" "${WAIT_PERIOD}" "${WAIT_PERIOD_MAX}" "${protocol}"
 	else
 		ip netns exec "${ns}" bash <<-EOF
 			$(declare -f wait_for_listener)
-			wait_for_listener ${port} ${WAIT_PERIOD} ${WAIT_PERIOD_MAX}
+			wait_for_listener ${port} ${WAIT_PERIOD} ${WAIT_PERIOD_MAX} ${protocol}
 		EOF
 	fi
 }
@@ -422,7 +435,7 @@ vm_vsock_test() {
 			return $rc
 		fi
 
-		vm_wait_for_listener "${ns}" "${port}"
+		vm_wait_for_listener "${ns}" "${port}" "tcp"
 		rc=$?
 	fi
 	set +o pipefail
@@ -463,7 +476,7 @@ host_vsock_test() {
 			return $rc
 		fi
 
-		host_wait_for_listener "${ns}" "${port}"
+		host_wait_for_listener "${ns}" "${port}" "tcp"
 		rc=$?
 	fi
 	set +o pipefail
-- 
cgit v1.2.3


From 06cf7895abf9080c050767c66b95d79d99e2c7e8 Mon Sep 17 00:00:00 2001
From: Bobby Eshleman <bobbyeshleman@meta.com>
Date: Wed, 21 Jan 2026 14:11:49 -0800
Subject: selftests/vsock: add tests for proc sys vsock ns_mode

Add tests for the /proc/sys/net/vsock/{ns_mode,child_ns_mode}
interfaces. Namely, that they accept/report "global" and "local" strings
and enforce their access policies.

Start a convention of commenting the test name over the test
description. Add test name comments over test descriptions that existed
before this convention.

Add a check_netns() function that checks if the test requires namespaces
and if the current kernel supports namespaces. Skip tests that require
namespaces if the system does not have namespace support.

This patch is the first to add tests that do *not* re-use the same
shared VM. For that reason, it adds a run_ns_tests() function to run
these tests and filter out the shared VM tests.

Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
Signed-off-by: Bobby Eshleman <bobbyeshleman@meta.com>
Link: https://patch.msgid.link/20260121-vsock-vmtest-v16-9-2859a7512097@meta.com
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 tools/testing/selftests/vsock/vmtest.sh | 140 +++++++++++++++++++++++++++++++-
 1 file changed, 138 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/vsock/vmtest.sh b/tools/testing/selftests/vsock/vmtest.sh
index 0e681d4c3a15..38785a102236 100755
--- a/tools/testing/selftests/vsock/vmtest.sh
+++ b/tools/testing/selftests/vsock/vmtest.sh
@@ -41,14 +41,38 @@ readonly KERNEL_CMDLINE="\
 	virtme.ssh virtme_ssh_channel=tcp virtme_ssh_user=$USER \
 "
 readonly LOG=$(mktemp /tmp/vsock_vmtest_XXXX.log)
-readonly TEST_NAMES=(vm_server_host_client vm_client_host_server vm_loopback)
+
+# Namespace tests must use the ns_ prefix. This is checked in check_netns() and
+# is used to determine if a test needs namespace setup before test execution.
+readonly TEST_NAMES=(
+	vm_server_host_client
+	vm_client_host_server
+	vm_loopback
+	ns_host_vsock_ns_mode_ok
+	ns_host_vsock_child_ns_mode_ok
+)
 readonly TEST_DESCS=(
+	# vm_server_host_client
 	"Run vsock_test in server mode on the VM and in client mode on the host."
+
+	# vm_client_host_server
 	"Run vsock_test in client mode on the VM and in server mode on the host."
+
+	# vm_loopback
 	"Run vsock_test using the loopback transport in the VM."
+
+	# ns_host_vsock_ns_mode_ok
+	"Check /proc/sys/net/vsock/ns_mode strings on the host."
+
+	# ns_host_vsock_child_ns_mode_ok
+	"Check /proc/sys/net/vsock/ns_mode is read-only and child_ns_mode is writable."
 )
 
-readonly USE_SHARED_VM=(vm_server_host_client vm_client_host_server vm_loopback)
+readonly USE_SHARED_VM=(
+	vm_server_host_client
+	vm_client_host_server
+	vm_loopback
+)
 readonly NS_MODES=("local" "global")
 
 VERBOSE=0
@@ -196,6 +220,20 @@ check_deps() {
 	fi
 }
 
+check_netns() {
+	local tname=$1
+
+	# If the test requires NS support, check if NS support exists
+	# using /proc/self/ns
+	if [[ "${tname}" =~ ^ns_ ]] &&
+	   [[ ! -e /proc/self/ns ]]; then
+		log_host "No NS support detected for test ${tname}"
+		return 1
+	fi
+
+	return 0
+}
+
 check_vng() {
 	local tested_versions
 	local version
@@ -519,6 +557,54 @@ log_guest() {
 	LOG_PREFIX=guest log "$@"
 }
 
+ns_get_mode() {
+	local ns=$1
+
+	ip netns exec "${ns}" cat /proc/sys/net/vsock/ns_mode 2>/dev/null
+}
+
+test_ns_host_vsock_ns_mode_ok() {
+	for mode in "${NS_MODES[@]}"; do
+		local actual
+
+		actual=$(ns_get_mode "${mode}0")
+		if [[ "${actual}" != "${mode}" ]]; then
+			log_host "expected mode ${mode}, got ${actual}"
+			return "${KSFT_FAIL}"
+		fi
+	done
+
+	return "${KSFT_PASS}"
+}
+
+test_ns_host_vsock_child_ns_mode_ok() {
+	local orig_mode
+	local rc
+
+	orig_mode=$(cat /proc/sys/net/vsock/child_ns_mode)
+
+	rc="${KSFT_PASS}"
+	for mode in "${NS_MODES[@]}"; do
+		local ns="${mode}0"
+
+		if echo "${mode}" 2>/dev/null > /proc/sys/net/vsock/ns_mode; then
+			log_host "ns_mode should be read-only but write succeeded"
+			rc="${KSFT_FAIL}"
+			continue
+		fi
+
+		if ! echo "${mode}" > /proc/sys/net/vsock/child_ns_mode; then
+			log_host "child_ns_mode should be writable to ${mode}"
+			rc="${KSFT_FAIL}"
+			continue
+		fi
+	done
+
+	echo "${orig_mode}" > /proc/sys/net/vsock/child_ns_mode
+
+	return "${rc}"
+}
+
 test_vm_server_host_client() {
 	if ! vm_vsock_test "init_ns" "server" 2 "${TEST_GUEST_PORT}"; then
 		return "${KSFT_FAIL}"
@@ -592,6 +678,11 @@ run_shared_vm_tests() {
 			continue
 		fi
 
+		if ! check_netns "${arg}"; then
+			check_result "${KSFT_SKIP}" "${arg}"
+			continue
+		fi
+
 		run_shared_vm_test "${arg}"
 		check_result "$?" "${arg}"
 	done
@@ -645,6 +736,49 @@ run_shared_vm_test() {
 	return "${rc}"
 }
 
+run_ns_tests() {
+	for arg in "${ARGS[@]}"; do
+		if shared_vm_test "${arg}"; then
+			continue
+		fi
+
+		if ! check_netns "${arg}"; then
+			check_result "${KSFT_SKIP}" "${arg}"
+			continue
+		fi
+
+		add_namespaces
+
+		name=$(echo "${arg}" | awk '{ print $1 }')
+		log_host "Executing test_${name}"
+
+		host_oops_before=$(dmesg 2>/dev/null | grep -c -i 'Oops')
+		host_warn_before=$(dmesg --level=warn 2>/dev/null | grep -c -i 'vsock')
+		eval test_"${name}"
+		rc=$?
+
+		host_oops_after=$(dmesg 2>/dev/null | grep -c -i 'Oops')
+		if [[ "${host_oops_after}" -gt "${host_oops_before}" ]]; then
+			echo "FAIL: kernel oops detected on host" | log_host
+			check_result "${KSFT_FAIL}" "${name}"
+			del_namespaces
+			continue
+		fi
+
+		host_warn_after=$(dmesg --level=warn 2>/dev/null | grep -c -i 'vsock')
+		if [[ "${host_warn_after}" -gt "${host_warn_before}" ]]; then
+			echo "FAIL: kernel warning detected on host" | log_host
+			check_result "${KSFT_FAIL}" "${name}"
+			del_namespaces
+			continue
+		fi
+
+		check_result "${rc}" "${name}"
+
+		del_namespaces
+	done
+}
+
 BUILD=0
 QEMU="qemu-system-$(uname -m)"
 
@@ -690,6 +824,8 @@ if shared_vm_tests_requested "${ARGS[@]}"; then
 	terminate_pidfiles "${pidfile}"
 fi
 
+run_ns_tests "${ARGS[@]}"
+
 echo "SUMMARY: PASS=${cnt_pass} SKIP=${cnt_skip} FAIL=${cnt_fail}"
 echo "Log: ${LOG}"
 
-- 
cgit v1.2.3


From 605caec5adc2956263a86b48eecfc52ee5c95dae Mon Sep 17 00:00:00 2001
From: Bobby Eshleman <bobbyeshleman@meta.com>
Date: Wed, 21 Jan 2026 14:11:50 -0800
Subject: selftests/vsock: add namespace tests for CID collisions

Add tests to verify CID collision rules across different vsock namespace
modes.

1. Two VMs with the same CID cannot start in different global namespaces
   (ns_global_same_cid_fails)
2. Two VMs with the same CID can start in different local namespaces
   (ns_local_same_cid_ok)
3. VMs with the same CID can coexist when one is in a global namespace
   and another is in a local namespace (ns_global_local_same_cid_ok and
   ns_local_global_same_cid_ok)

The tests ns_global_local_same_cid_ok and ns_local_global_same_cid_ok
make sure that ordering does not matter.

The tests use a shared helper function namespaces_can_boot_same_cid()
that attempts to start two VMs with identical CIDs in the specified
namespaces and verifies whether VM initialization failed or succeeded.

Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
Signed-off-by: Bobby Eshleman <bobbyeshleman@meta.com>
Link: https://patch.msgid.link/20260121-vsock-vmtest-v16-10-2859a7512097@meta.com
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 tools/testing/selftests/vsock/vmtest.sh | 78 +++++++++++++++++++++++++++++++++
 1 file changed, 78 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/vsock/vmtest.sh b/tools/testing/selftests/vsock/vmtest.sh
index 38785a102236..1bf537410ea6 100755
--- a/tools/testing/selftests/vsock/vmtest.sh
+++ b/tools/testing/selftests/vsock/vmtest.sh
@@ -50,6 +50,10 @@ readonly TEST_NAMES=(
 	vm_loopback
 	ns_host_vsock_ns_mode_ok
 	ns_host_vsock_child_ns_mode_ok
+	ns_global_same_cid_fails
+	ns_local_same_cid_ok
+	ns_global_local_same_cid_ok
+	ns_local_global_same_cid_ok
 )
 readonly TEST_DESCS=(
 	# vm_server_host_client
@@ -66,6 +70,18 @@ readonly TEST_DESCS=(
 
 	# ns_host_vsock_child_ns_mode_ok
 	"Check /proc/sys/net/vsock/ns_mode is read-only and child_ns_mode is writable."
+
+	# ns_global_same_cid_fails
+	"Check QEMU fails to start two VMs with same CID in two different global namespaces."
+
+	# ns_local_same_cid_ok
+	"Check QEMU successfully starts two VMs with same CID in two different local namespaces."
+
+	# ns_global_local_same_cid_ok
+	"Check QEMU successfully starts one VM in a global ns and then another VM in a local ns with the same CID."
+
+	# ns_local_global_same_cid_ok
+	"Check QEMU successfully starts one VM in a local ns and then another VM in a global ns with the same CID."
 )
 
 readonly USE_SHARED_VM=(
@@ -577,6 +593,68 @@ test_ns_host_vsock_ns_mode_ok() {
 	return "${KSFT_PASS}"
 }
 
+namespaces_can_boot_same_cid() {
+	local ns0=$1
+	local ns1=$2
+	local pidfile1 pidfile2
+	local rc
+
+	pidfile1="$(create_pidfile)"
+
+	# The first VM should be able to start. If it can't then we have
+	# problems and need to return non-zero.
+	if ! vm_start "${pidfile1}" "${ns0}"; then
+		return 1
+	fi
+
+	pidfile2="$(create_pidfile)"
+	vm_start "${pidfile2}" "${ns1}"
+	rc=$?
+	terminate_pidfiles "${pidfile1}" "${pidfile2}"
+
+	return "${rc}"
+}
+
+test_ns_global_same_cid_fails() {
+	init_namespaces
+
+	if namespaces_can_boot_same_cid "global0" "global1"; then
+		return "${KSFT_FAIL}"
+	fi
+
+	return "${KSFT_PASS}"
+}
+
+test_ns_local_global_same_cid_ok() {
+	init_namespaces
+
+	if namespaces_can_boot_same_cid "local0" "global0"; then
+		return "${KSFT_PASS}"
+	fi
+
+	return "${KSFT_FAIL}"
+}
+
+test_ns_global_local_same_cid_ok() {
+	init_namespaces
+
+	if namespaces_can_boot_same_cid "global0" "local0"; then
+		return "${KSFT_PASS}"
+	fi
+
+	return "${KSFT_FAIL}"
+}
+
+test_ns_local_same_cid_ok() {
+	init_namespaces
+
+	if namespaces_can_boot_same_cid "local0" "local1"; then
+		return "${KSFT_PASS}"
+	fi
+
+	return "${KSFT_FAIL}"
+}
+
 test_ns_host_vsock_child_ns_mode_ok() {
 	local orig_mode
 	local rc
-- 
cgit v1.2.3


From 0424ee7c3a1721c099544f36580cf6dd1661856d Mon Sep 17 00:00:00 2001
From: Bobby Eshleman <bobbyeshleman@meta.com>
Date: Wed, 21 Jan 2026 14:11:51 -0800
Subject: selftests/vsock: add tests for host <-> vm connectivity with
 namespaces

Add tests to validate namespace correctness using vsock_test and socat.
The vsock_test tool is used to validate expected success tests, but
socat is used for expected failure tests. socat is used to ensure that
connections are rejected outright instead of failing due to some other
socket behavior (as tested in vsock_test). Additionally, socat is
already required for tunneling TCP traffic from vsock_test. Using only
one of the vsock_test tests like 'test_stream_client_close_client' would
have yielded a similar result, but doing so wouldn't remove the socat
dependency.

Additionally, check for the dependency socat. socat needs special
handling beyond just checking if it is on the path because it must be
compiled with support for both vsock and unix. The function
check_socat() checks that this support exists.

Add more padding to test name printf strings because the tests added in
this patch would otherwise overflow.

Add vm_dmesg_* helpers to encapsulate checking dmesg
for oops and warnings.

Add ability to pass extra args to host-side vsock_test so that tests
that cause false positives may be skipped with arg --skip.

Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
Signed-off-by: Bobby Eshleman <bobbyeshleman@meta.com>
Link: https://patch.msgid.link/20260121-vsock-vmtest-v16-11-2859a7512097@meta.com
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 tools/testing/selftests/vsock/vmtest.sh | 572 +++++++++++++++++++++++++++++++-
 1 file changed, 568 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/vsock/vmtest.sh b/tools/testing/selftests/vsock/vmtest.sh
index 1bf537410ea6..a9eaf37bc31b 100755
--- a/tools/testing/selftests/vsock/vmtest.sh
+++ b/tools/testing/selftests/vsock/vmtest.sh
@@ -7,6 +7,7 @@
 #		* virtme-ng
 #		* busybox-static (used by virtme-ng)
 #		* qemu	(used by virtme-ng)
+#		* socat
 #
 # shellcheck disable=SC2317,SC2119
 
@@ -54,6 +55,19 @@ readonly TEST_NAMES=(
 	ns_local_same_cid_ok
 	ns_global_local_same_cid_ok
 	ns_local_global_same_cid_ok
+	ns_diff_global_host_connect_to_global_vm_ok
+	ns_diff_global_host_connect_to_local_vm_fails
+	ns_diff_global_vm_connect_to_global_host_ok
+	ns_diff_global_vm_connect_to_local_host_fails
+	ns_diff_local_host_connect_to_local_vm_fails
+	ns_diff_local_vm_connect_to_local_host_fails
+	ns_diff_global_to_local_loopback_local_fails
+	ns_diff_local_to_global_loopback_fails
+	ns_diff_local_to_local_loopback_fails
+	ns_diff_global_to_global_loopback_ok
+	ns_same_local_loopback_ok
+	ns_same_local_host_connect_to_local_vm_ok
+	ns_same_local_vm_connect_to_local_host_ok
 )
 readonly TEST_DESCS=(
 	# vm_server_host_client
@@ -82,6 +96,45 @@ readonly TEST_DESCS=(
 
 	# ns_local_global_same_cid_ok
 	"Check QEMU successfully starts one VM in a local ns and then another VM in a global ns with the same CID."
+
+	# ns_diff_global_host_connect_to_global_vm_ok
+	"Run vsock_test client in global ns with server in VM in another global ns."
+
+	# ns_diff_global_host_connect_to_local_vm_fails
+	"Run socat to test a process in a global ns fails to connect to a VM in a local ns."
+
+	# ns_diff_global_vm_connect_to_global_host_ok
+	"Run vsock_test client in VM in a global ns with server in another global ns."
+
+	# ns_diff_global_vm_connect_to_local_host_fails
+	"Run socat to test a VM in a global ns fails to connect to a host process in a local ns."
+
+	# ns_diff_local_host_connect_to_local_vm_fails
+	"Run socat to test a host process in a local ns fails to connect to a VM in another local ns."
+
+	# ns_diff_local_vm_connect_to_local_host_fails
+	"Run socat to test a VM in a local ns fails to connect to a host process in another local ns."
+
+	# ns_diff_global_to_local_loopback_local_fails
+	"Run socat to test a loopback vsock in a global ns fails to connect to a vsock in a local ns."
+
+	# ns_diff_local_to_global_loopback_fails
+	"Run socat to test a loopback vsock in a local ns fails to connect to a vsock in a global ns."
+
+	# ns_diff_local_to_local_loopback_fails
+	"Run socat to test a loopback vsock in a local ns fails to connect to a vsock in another local ns."
+
+	# ns_diff_global_to_global_loopback_ok
+	"Run socat to test a loopback vsock in a global ns successfully connects to a vsock in another global ns."
+
+	# ns_same_local_loopback_ok
+	"Run socat to test a loopback vsock in a local ns successfully connects to a vsock in the same ns."
+
+	# ns_same_local_host_connect_to_local_vm_ok
+	"Run vsock_test client in a local ns with server in VM in same ns."
+
+	# ns_same_local_vm_connect_to_local_host_ok
+	"Run vsock_test client in VM in a local ns with server in same ns."
 )
 
 readonly USE_SHARED_VM=(
@@ -112,7 +165,7 @@ usage() {
 	for ((i = 0; i < ${#TEST_NAMES[@]}; i++)); do
 		name=${TEST_NAMES[${i}]}
 		desc=${TEST_DESCS[${i}]}
-		printf "\t%-35s%-35s\n" "${name}" "${desc}"
+		printf "\t%-55s%-35s\n" "${name}" "${desc}"
 	done
 	echo
 
@@ -222,7 +275,7 @@ check_args() {
 }
 
 check_deps() {
-	for dep in vng ${QEMU} busybox pkill ssh ss; do
+	for dep in vng ${QEMU} busybox pkill ssh ss socat; do
 		if [[ ! -x $(command -v "${dep}") ]]; then
 			echo -e "skip:    dependency ${dep} not found!\n"
 			exit "${KSFT_SKIP}"
@@ -273,6 +326,20 @@ check_vng() {
 	fi
 }
 
+check_socat() {
+	local support_string
+
+	support_string="$(socat -V)"
+
+	if [[ "${support_string}" != *"WITH_VSOCK 1"* ]]; then
+		die "err: socat is missing vsock support"
+	fi
+
+	if [[ "${support_string}" != *"WITH_UNIX 1"* ]]; then
+		die "err: socat is missing unix support"
+	fi
+}
+
 handle_build() {
 	if [[ ! "${BUILD}" -eq 1 ]]; then
 		return
@@ -321,6 +388,14 @@ terminate_pidfiles() {
 	done
 }
 
+terminate_pids() {
+	local pid
+
+	for pid in "$@"; do
+		kill -SIGTERM "${pid}" &>/dev/null || :
+	done
+}
+
 vm_start() {
 	local pidfile=$1
 	local ns=$2
@@ -459,6 +534,28 @@ vm_dmesg_warn_count() {
 	vm_ssh "${ns}" -- dmesg --level=warn 2>/dev/null | grep -c -i 'vsock'
 }
 
+vm_dmesg_check() {
+	local pidfile=$1
+	local ns=$2
+	local oops_before=$3
+	local warn_before=$4
+	local oops_after warn_after
+
+	oops_after=$(vm_dmesg_oops_count "${ns}")
+	if [[ "${oops_after}" -gt "${oops_before}" ]]; then
+		echo "FAIL: kernel oops detected on vm in ns ${ns}" | log_host
+		return 1
+	fi
+
+	warn_after=$(vm_dmesg_warn_count "${ns}")
+	if [[ "${warn_after}" -gt "${warn_before}" ]]; then
+		echo "FAIL: kernel warning detected on vm in ns ${ns}" | log_host
+		return 1
+	fi
+
+	return 0
+}
+
 vm_vsock_test() {
 	local ns=$1
 	local host=$2
@@ -502,6 +599,8 @@ host_vsock_test() {
 	local host=$2
 	local cid=$3
 	local port=$4
+	shift 4
+	local extra_args=("$@")
 	local rc
 
 	local cmd="${VSOCK_TEST}"
@@ -516,13 +615,15 @@ host_vsock_test() {
 			--mode=client \
 			--peer-cid="${cid}" \
 			--control-host="${host}" \
-			--control-port="${port}" 2>&1 | log_host
+			--control-port="${port}" \
+			"${extra_args[@]}" 2>&1 | log_host
 		rc=$?
 	else
 		${cmd} \
 			--mode=server \
 			--peer-cid="${cid}" \
-			--control-port="${port}" 2>&1 | log_host &
+			--control-port="${port}" \
+			"${extra_args[@]}" 2>&1 | log_host &
 		rc=$?
 
 		if [[ $rc -ne 0 ]]; then
@@ -593,6 +694,468 @@ test_ns_host_vsock_ns_mode_ok() {
 	return "${KSFT_PASS}"
 }
 
+test_ns_diff_global_host_connect_to_global_vm_ok() {
+	local oops_before warn_before
+	local pids pid pidfile
+	local ns0 ns1 port
+	declare -a pids
+	local unixfile
+	ns0="global0"
+	ns1="global1"
+	port=1234
+	local rc
+
+	init_namespaces
+
+	pidfile="$(create_pidfile)"
+
+	if ! vm_start "${pidfile}" "${ns0}"; then
+		return "${KSFT_FAIL}"
+	fi
+
+	vm_wait_for_ssh "${ns0}"
+	oops_before=$(vm_dmesg_oops_count "${ns0}")
+	warn_before=$(vm_dmesg_warn_count "${ns0}")
+
+	unixfile=$(mktemp -u /tmp/XXXX.sock)
+	ip netns exec "${ns1}" \
+		socat TCP-LISTEN:"${TEST_HOST_PORT}",fork \
+			UNIX-CONNECT:"${unixfile}" &
+	pids+=($!)
+	host_wait_for_listener "${ns1}" "${TEST_HOST_PORT}" "tcp"
+
+	ip netns exec "${ns0}" socat UNIX-LISTEN:"${unixfile}",fork \
+		TCP-CONNECT:localhost:"${TEST_HOST_PORT}" &
+	pids+=($!)
+	host_wait_for_listener "${ns0}" "${unixfile}" "unix"
+
+	vm_vsock_test "${ns0}" "server" 2 "${TEST_GUEST_PORT}"
+	vm_wait_for_listener "${ns0}" "${TEST_GUEST_PORT}" "tcp"
+	host_vsock_test "${ns1}" "127.0.0.1" "${VSOCK_CID}" "${TEST_HOST_PORT}"
+	rc=$?
+
+	vm_dmesg_check "${pidfile}" "${ns0}" "${oops_before}" "${warn_before}"
+	dmesg_rc=$?
+
+	terminate_pids "${pids[@]}"
+	terminate_pidfiles "${pidfile}"
+
+	if [[ "${rc}" -ne 0 ]] || [[ "${dmesg_rc}" -ne 0 ]]; then
+		return "${KSFT_FAIL}"
+	fi
+
+	return "${KSFT_PASS}"
+}
+
+test_ns_diff_global_host_connect_to_local_vm_fails() {
+	local oops_before warn_before
+	local ns0="global0"
+	local ns1="local0"
+	local port=12345
+	local dmesg_rc
+	local pidfile
+	local result
+	local pid
+
+	init_namespaces
+
+	outfile=$(mktemp)
+
+	pidfile="$(create_pidfile)"
+	if ! vm_start "${pidfile}" "${ns1}"; then
+		log_host "failed to start vm (cid=${VSOCK_CID}, ns=${ns0})"
+		return "${KSFT_FAIL}"
+	fi
+
+	vm_wait_for_ssh "${ns1}"
+	oops_before=$(vm_dmesg_oops_count "${ns1}")
+	warn_before=$(vm_dmesg_warn_count "${ns1}")
+
+	vm_ssh "${ns1}" -- socat VSOCK-LISTEN:"${port}" STDOUT > "${outfile}" &
+	vm_wait_for_listener "${ns1}" "${port}" "vsock"
+	echo TEST | ip netns exec "${ns0}" \
+		socat STDIN VSOCK-CONNECT:"${VSOCK_CID}":"${port}" 2>/dev/null
+
+	vm_dmesg_check "${pidfile}" "${ns1}" "${oops_before}" "${warn_before}"
+	dmesg_rc=$?
+
+	terminate_pidfiles "${pidfile}"
+	result=$(cat "${outfile}")
+	rm -f "${outfile}"
+
+	if [[ "${result}" == "TEST" ]] || [[ "${dmesg_rc}" -ne 0 ]]; then
+		return "${KSFT_FAIL}"
+	fi
+
+	return "${KSFT_PASS}"
+}
+
+test_ns_diff_global_vm_connect_to_global_host_ok() {
+	local oops_before warn_before
+	local ns0="global0"
+	local ns1="global1"
+	local port=12345
+	local unixfile
+	local dmesg_rc
+	local pidfile
+	local pids
+	local rc
+
+	init_namespaces
+
+	declare -a pids
+
+	log_host "Setup socat bridge from ns ${ns0} to ns ${ns1} over port ${port}"
+
+	unixfile=$(mktemp -u /tmp/XXXX.sock)
+
+	ip netns exec "${ns0}" \
+		socat TCP-LISTEN:"${port}" UNIX-CONNECT:"${unixfile}" &
+	pids+=($!)
+	host_wait_for_listener "${ns0}" "${port}" "tcp"
+
+	ip netns exec "${ns1}" \
+		socat UNIX-LISTEN:"${unixfile}" TCP-CONNECT:127.0.0.1:"${port}" &
+	pids+=($!)
+	host_wait_for_listener "${ns1}" "${unixfile}" "unix"
+
+	log_host "Launching ${VSOCK_TEST} in ns ${ns1}"
+	host_vsock_test "${ns1}" "server" "${VSOCK_CID}" "${port}"
+
+	pidfile="$(create_pidfile)"
+	if ! vm_start "${pidfile}" "${ns0}"; then
+		log_host "failed to start vm (cid=${cid}, ns=${ns0})"
+		terminate_pids "${pids[@]}"
+		rm -f "${unixfile}"
+		return "${KSFT_FAIL}"
+	fi
+
+	vm_wait_for_ssh "${ns0}"
+
+	oops_before=$(vm_dmesg_oops_count "${ns0}")
+	warn_before=$(vm_dmesg_warn_count "${ns0}")
+
+	vm_vsock_test "${ns0}" "10.0.2.2" 2 "${port}"
+	rc=$?
+
+	vm_dmesg_check "${pidfile}" "${ns0}" "${oops_before}" "${warn_before}"
+	dmesg_rc=$?
+
+	terminate_pidfiles "${pidfile}"
+	terminate_pids "${pids[@]}"
+	rm -f "${unixfile}"
+
+	if [[ "${rc}" -ne 0 ]] || [[ "${dmesg_rc}" -ne 0 ]]; then
+		return "${KSFT_FAIL}"
+	fi
+
+	return "${KSFT_PASS}"
+
+}
+
+test_ns_diff_global_vm_connect_to_local_host_fails() {
+	local ns0="global0"
+	local ns1="local0"
+	local port=12345
+	local oops_before warn_before
+	local dmesg_rc
+	local pidfile
+	local result
+	local pid
+
+	init_namespaces
+
+	log_host "Launching socat in ns ${ns1}"
+	outfile=$(mktemp)
+
+	ip netns exec "${ns1}" socat VSOCK-LISTEN:"${port}" STDOUT &> "${outfile}" &
+	pid=$!
+	host_wait_for_listener "${ns1}" "${port}" "vsock"
+
+	pidfile="$(create_pidfile)"
+	if ! vm_start "${pidfile}" "${ns0}"; then
+		log_host "failed to start vm (cid=${cid}, ns=${ns0})"
+		terminate_pids "${pid}"
+		rm -f "${outfile}"
+		return "${KSFT_FAIL}"
+	fi
+
+	vm_wait_for_ssh "${ns0}"
+
+	oops_before=$(vm_dmesg_oops_count "${ns0}")
+	warn_before=$(vm_dmesg_warn_count "${ns0}")
+
+	vm_ssh "${ns0}" -- \
+		bash -c "echo TEST | socat STDIN VSOCK-CONNECT:2:${port}" 2>&1 | log_guest
+
+	vm_dmesg_check "${pidfile}" "${ns0}" "${oops_before}" "${warn_before}"
+	dmesg_rc=$?
+
+	terminate_pidfiles "${pidfile}"
+	terminate_pids "${pid}"
+
+	result=$(cat "${outfile}")
+	rm -f "${outfile}"
+
+	if [[ "${result}" != TEST ]] && [[ "${dmesg_rc}" -eq 0 ]]; then
+		return "${KSFT_PASS}"
+	fi
+
+	return "${KSFT_FAIL}"
+}
+
+test_ns_diff_local_host_connect_to_local_vm_fails() {
+	local ns0="local0"
+	local ns1="local1"
+	local port=12345
+	local oops_before warn_before
+	local dmesg_rc
+	local pidfile
+	local result
+	local pid
+
+	init_namespaces
+
+	outfile=$(mktemp)
+
+	pidfile="$(create_pidfile)"
+	if ! vm_start "${pidfile}" "${ns1}"; then
+		log_host "failed to start vm (cid=${cid}, ns=${ns0})"
+		return "${KSFT_FAIL}"
+	fi
+
+	vm_wait_for_ssh "${ns1}"
+	oops_before=$(vm_dmesg_oops_count "${ns1}")
+	warn_before=$(vm_dmesg_warn_count "${ns1}")
+
+	vm_ssh "${ns1}" -- socat VSOCK-LISTEN:"${port}" STDOUT > "${outfile}" &
+	vm_wait_for_listener "${ns1}" "${port}" "vsock"
+
+	echo TEST | ip netns exec "${ns0}" \
+		socat STDIN VSOCK-CONNECT:"${VSOCK_CID}":"${port}" 2>/dev/null
+
+	vm_dmesg_check "${pidfile}" "${ns1}" "${oops_before}" "${warn_before}"
+	dmesg_rc=$?
+
+	terminate_pidfiles "${pidfile}"
+
+	result=$(cat "${outfile}")
+	rm -f "${outfile}"
+
+	if [[ "${result}" != TEST ]] && [[ "${dmesg_rc}" -eq 0 ]]; then
+		return "${KSFT_PASS}"
+	fi
+
+	return "${KSFT_FAIL}"
+}
+
+test_ns_diff_local_vm_connect_to_local_host_fails() {
+	local oops_before warn_before
+	local ns0="local0"
+	local ns1="local1"
+	local port=12345
+	local dmesg_rc
+	local pidfile
+	local result
+	local pid
+
+	init_namespaces
+
+	log_host "Launching socat in ns ${ns1}"
+	outfile=$(mktemp)
+	ip netns exec "${ns1}" socat VSOCK-LISTEN:"${port}" STDOUT &> "${outfile}" &
+	pid=$!
+	host_wait_for_listener "${ns1}" "${port}" "vsock"
+
+	pidfile="$(create_pidfile)"
+	if ! vm_start "${pidfile}" "${ns0}"; then
+		log_host "failed to start vm (cid=${cid}, ns=${ns0})"
+		rm -f "${outfile}"
+		return "${KSFT_FAIL}"
+	fi
+
+	vm_wait_for_ssh "${ns0}"
+	oops_before=$(vm_dmesg_oops_count "${ns0}")
+	warn_before=$(vm_dmesg_warn_count "${ns0}")
+
+	vm_ssh "${ns0}" -- \
+		bash -c "echo TEST | socat STDIN VSOCK-CONNECT:2:${port}" 2>&1 | log_guest
+
+	vm_dmesg_check "${pidfile}" "${ns0}" "${oops_before}" "${warn_before}"
+	dmesg_rc=$?
+
+	terminate_pidfiles "${pidfile}"
+	terminate_pids "${pid}"
+
+	result=$(cat "${outfile}")
+	rm -f "${outfile}"
+
+	if [[ "${result}" != TEST ]] && [[ "${dmesg_rc}" -eq 0 ]]; then
+		return "${KSFT_PASS}"
+	fi
+
+	return "${KSFT_FAIL}"
+}
+
+__test_loopback_two_netns() {
+	local ns0=$1
+	local ns1=$2
+	local port=12345
+	local result
+	local pid
+
+	modprobe vsock_loopback &> /dev/null || :
+
+	log_host "Launching socat in ns ${ns1}"
+	outfile=$(mktemp)
+
+	ip netns exec "${ns1}" socat VSOCK-LISTEN:"${port}" STDOUT > "${outfile}" 2>/dev/null &
+	pid=$!
+	host_wait_for_listener "${ns1}" "${port}" "vsock"
+
+	log_host "Launching socat in ns ${ns0}"
+	echo TEST | ip netns exec "${ns0}" socat STDIN VSOCK-CONNECT:1:"${port}" 2>/dev/null
+	terminate_pids "${pid}"
+
+	result=$(cat "${outfile}")
+	rm -f "${outfile}"
+
+	if [[ "${result}" == TEST ]]; then
+		return 0
+	fi
+
+	return 1
+}
+
+test_ns_diff_global_to_local_loopback_local_fails() {
+	init_namespaces
+
+	if ! __test_loopback_two_netns "global0" "local0"; then
+		return "${KSFT_PASS}"
+	fi
+
+	return "${KSFT_FAIL}"
+}
+
+test_ns_diff_local_to_global_loopback_fails() {
+	init_namespaces
+
+	if ! __test_loopback_two_netns "local0" "global0"; then
+		return "${KSFT_PASS}"
+	fi
+
+	return "${KSFT_FAIL}"
+}
+
+test_ns_diff_local_to_local_loopback_fails() {
+	init_namespaces
+
+	if ! __test_loopback_two_netns "local0" "local1"; then
+		return "${KSFT_PASS}"
+	fi
+
+	return "${KSFT_FAIL}"
+}
+
+test_ns_diff_global_to_global_loopback_ok() {
+	init_namespaces
+
+	if __test_loopback_two_netns "global0" "global1"; then
+		return "${KSFT_PASS}"
+	fi
+
+	return "${KSFT_FAIL}"
+}
+
+test_ns_same_local_loopback_ok() {
+	init_namespaces
+
+	if __test_loopback_two_netns "local0" "local0"; then
+		return "${KSFT_PASS}"
+	fi
+
+	return "${KSFT_FAIL}"
+}
+
+test_ns_same_local_host_connect_to_local_vm_ok() {
+	local oops_before warn_before
+	local ns="local0"
+	local port=1234
+	local dmesg_rc
+	local pidfile
+	local rc
+
+	init_namespaces
+
+	pidfile="$(create_pidfile)"
+
+	if ! vm_start "${pidfile}" "${ns}"; then
+		return "${KSFT_FAIL}"
+	fi
+
+	vm_wait_for_ssh "${ns}"
+	oops_before=$(vm_dmesg_oops_count "${ns}")
+	warn_before=$(vm_dmesg_warn_count "${ns}")
+
+	vm_vsock_test "${ns}" "server" 2 "${TEST_GUEST_PORT}"
+
+	# Skip test 29 (transport release use-after-free): This test attempts
+	# binding both G2H and H2G CIDs. Because virtio-vsock (G2H) doesn't
+	# support local namespaces the test will fail when
+	# transport_g2h->stream_allow() returns false. This edge case only
+	# happens for vsock_test in client mode on the host in a local
+	# namespace. This is a false positive.
+	host_vsock_test "${ns}" "127.0.0.1" "${VSOCK_CID}" "${TEST_HOST_PORT}" --skip=29
+	rc=$?
+
+	vm_dmesg_check "${pidfile}" "${ns}" "${oops_before}" "${warn_before}"
+	dmesg_rc=$?
+
+	terminate_pidfiles "${pidfile}"
+
+	if [[ "${rc}" -ne 0 ]] || [[ "${dmesg_rc}" -ne 0 ]]; then
+		return "${KSFT_FAIL}"
+	fi
+
+	return "${KSFT_PASS}"
+}
+
+test_ns_same_local_vm_connect_to_local_host_ok() {
+	local oops_before warn_before
+	local ns="local0"
+	local port=1234
+	local dmesg_rc
+	local pidfile
+	local rc
+
+	init_namespaces
+
+	pidfile="$(create_pidfile)"
+
+	if ! vm_start "${pidfile}" "${ns}"; then
+		return "${KSFT_FAIL}"
+	fi
+
+	vm_wait_for_ssh "${ns}"
+	oops_before=$(vm_dmesg_oops_count "${ns}")
+	warn_before=$(vm_dmesg_warn_count "${ns}")
+
+	host_vsock_test "${ns}" "server" "${VSOCK_CID}" "${port}"
+	vm_vsock_test "${ns}" "10.0.2.2" 2 "${port}"
+	rc=$?
+
+	vm_dmesg_check "${pidfile}" "${ns}" "${oops_before}" "${warn_before}"
+	dmesg_rc=$?
+
+	terminate_pidfiles "${pidfile}"
+
+	if [[ "${rc}" -ne 0 ]] || [[ "${dmesg_rc}" -ne 0 ]]; then
+		return "${KSFT_FAIL}"
+	fi
+
+	return "${KSFT_PASS}"
+}
+
 namespaces_can_boot_same_cid() {
 	local ns0=$1
 	local ns1=$2
@@ -882,6 +1445,7 @@ fi
 check_args "${ARGS[@]}"
 check_deps
 check_vng
+check_socat
 handle_build
 
 echo "1..${#ARGS[@]}"
-- 
cgit v1.2.3


From b3b7b33264c69a3a97723c31a14c7a19b2fce503 Mon Sep 17 00:00:00 2001
From: Bobby Eshleman <bobbyeshleman@meta.com>
Date: Wed, 21 Jan 2026 14:11:52 -0800
Subject: selftests/vsock: add tests for namespace deletion

Add tests that validate vsock sockets are resilient to deleting
namespaces. The vsock sockets should still function normally.

The function check_ns_delete_doesnt_break_connection() is added to
re-use the step-by-step logic of 1) setup connections, 2) delete ns,
3) check that the connections are still ok.

Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
Signed-off-by: Bobby Eshleman <bobbyeshleman@meta.com>
Link: https://patch.msgid.link/20260121-vsock-vmtest-v16-12-2859a7512097@meta.com
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 tools/testing/selftests/vsock/vmtest.sh | 84 +++++++++++++++++++++++++++++++++
 1 file changed, 84 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/vsock/vmtest.sh b/tools/testing/selftests/vsock/vmtest.sh
index a9eaf37bc31b..dc8dbe74a6d0 100755
--- a/tools/testing/selftests/vsock/vmtest.sh
+++ b/tools/testing/selftests/vsock/vmtest.sh
@@ -68,6 +68,9 @@ readonly TEST_NAMES=(
 	ns_same_local_loopback_ok
 	ns_same_local_host_connect_to_local_vm_ok
 	ns_same_local_vm_connect_to_local_host_ok
+	ns_delete_vm_ok
+	ns_delete_host_ok
+	ns_delete_both_ok
 )
 readonly TEST_DESCS=(
 	# vm_server_host_client
@@ -135,6 +138,15 @@ readonly TEST_DESCS=(
 
 	# ns_same_local_vm_connect_to_local_host_ok
 	"Run vsock_test client in VM in a local ns with server in same ns."
+
+	# ns_delete_vm_ok
+	"Check that deleting the VM's namespace does not break the socket connection"
+
+	# ns_delete_host_ok
+	"Check that deleting the host's namespace does not break the socket connection"
+
+	# ns_delete_both_ok
+	"Check that deleting the VM and host's namespaces does not break the socket connection"
 )
 
 readonly USE_SHARED_VM=(
@@ -1287,6 +1299,78 @@ test_vm_loopback() {
 	return "${KSFT_PASS}"
 }
 
+check_ns_delete_doesnt_break_connection() {
+	local pipefile pidfile outfile
+	local ns0="global0"
+	local ns1="global1"
+	local port=12345
+	local pids=()
+	local rc=0
+
+	init_namespaces
+
+	pidfile="$(create_pidfile)"
+	if ! vm_start "${pidfile}" "${ns0}"; then
+		return "${KSFT_FAIL}"
+	fi
+	vm_wait_for_ssh "${ns0}"
+
+	outfile=$(mktemp)
+	vm_ssh "${ns0}" -- \
+		socat VSOCK-LISTEN:"${port}",fork STDOUT > "${outfile}" 2>/dev/null &
+	pids+=($!)
+	vm_wait_for_listener "${ns0}" "${port}" "vsock"
+
+	# We use a pipe here so that we can echo into the pipe instead of using
+	# socat and a unix socket file. We just need a name for the pipe (not a
+	# regular file) so use -u.
+	pipefile=$(mktemp -u /tmp/vmtest_pipe_XXXX)
+	ip netns exec "${ns1}" \
+		socat PIPE:"${pipefile}" VSOCK-CONNECT:"${VSOCK_CID}":"${port}" &
+	pids+=($!)
+
+	timeout "${WAIT_PERIOD}" \
+		bash -c 'while [[ ! -e '"${pipefile}"' ]]; do sleep 1; done; exit 0'
+
+	if [[ "$1" == "vm" ]]; then
+		ip netns del "${ns0}"
+	elif [[ "$1" == "host" ]]; then
+		ip netns del "${ns1}"
+	elif [[ "$1" == "both" ]]; then
+		ip netns del "${ns0}"
+		ip netns del "${ns1}"
+	fi
+
+	echo "TEST" > "${pipefile}"
+
+	timeout "${WAIT_PERIOD}" \
+		bash -c 'while [[ ! -s '"${outfile}"' ]]; do sleep 1; done; exit 0'
+
+	if grep -q "TEST" "${outfile}"; then
+		rc="${KSFT_PASS}"
+	else
+		rc="${KSFT_FAIL}"
+	fi
+
+	terminate_pidfiles "${pidfile}"
+	terminate_pids "${pids[@]}"
+	rm -f "${outfile}" "${pipefile}"
+
+	return "${rc}"
+}
+
+test_ns_delete_vm_ok() {
+	check_ns_delete_doesnt_break_connection "vm"
+}
+
+test_ns_delete_host_ok() {
+	check_ns_delete_doesnt_break_connection "host"
+}
+
+test_ns_delete_both_ok() {
+	check_ns_delete_doesnt_break_connection "both"
+}
+
 shared_vm_test() {
 	local tname
 
-- 
cgit v1.2.3


From fd4eeb30b9e30ca1118a618be0755287bcbb2da9 Mon Sep 17 00:00:00 2001
From: Dmitry Safonov <dima@arista.com>
Date: Mon, 26 Jan 2026 04:57:35 +0000
Subject: objtool: Print bfd_vma as unsigned long long on ia32-x86_64 cross
 build

When objtool is cross-compiled in ia32 container for x86_64 target it
fails with the following errors:

> disas.c: In function 'disas_print_addr_sym':
> disas.c:173:38: error: format '%lx' expects argument of type 'long unsigned int', but argument 3 has type 'bfd_vma' {aka 'long long unsigned int'} [-Werror=format=]
>   173 |                 DINFO_FPRINTF(dinfo, "0x%lx <%s>", addr, symstr);
>       |                                      ^~~~~~~~~~~~  ~~~~
>       |                                                    |
>       |                                                    bfd_vma {aka long long unsigned int}

Provide a correct printf-fmt depending on sizeof(bfd_vma).

Fixes: 5d859dff266f ("objtool: Print symbol during disassembly")
Signed-off-by: Dmitry Safonov <dima@arista.com>
Reviewed-by: Alexandre Chartre <alexandre.chartre@oracle.com>
Link: https://patch.msgid.link/20260126-objtool-ia32-v1-1-bb6feaf17566@arista.com
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
---
 tools/objtool/disas.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

(limited to 'tools')

diff --git a/tools/objtool/disas.c b/tools/objtool/disas.c
index 2b5059f55e40..26f08d41f2b1 100644
--- a/tools/objtool/disas.c
+++ b/tools/objtool/disas.c
@@ -108,6 +108,8 @@ static int sprint_name(char *str, const char *name, unsigned long offset)
 
 #define DINFO_FPRINTF(dinfo, ...)	\
 	((*(dinfo)->fprintf_func)((dinfo)->stream, __VA_ARGS__))
+#define bfd_vma_fmt			\
+	__builtin_choose_expr(sizeof(bfd_vma) == sizeof(unsigned long), "%#lx <%s>", "%#llx <%s>")
 
 static int disas_result_fprintf(struct disas_context *dctx,
 				const char *fmt, va_list ap)
@@ -170,10 +172,10 @@ static void disas_print_addr_sym(struct section *sec, struct symbol *sym,
 
 	if (sym) {
 		sprint_name(symstr, sym->name, addr - sym->offset);
-		DINFO_FPRINTF(dinfo, "0x%lx <%s>", addr, symstr);
+		DINFO_FPRINTF(dinfo, bfd_vma_fmt, addr, symstr);
 	} else {
 		str = offstr(sec, addr);
-		DINFO_FPRINTF(dinfo, "0x%lx <%s>", addr, str);
+		DINFO_FPRINTF(dinfo, bfd_vma_fmt, addr, str);
 		free(str);
 	}
 }
@@ -252,7 +254,7 @@ static void disas_print_addr_reloc(bfd_vma addr, struct disassemble_info *dinfo)
 		 * example: "lea 0x0(%rip),%rdi". The kernel can reference
 		 * the next IP with _THIS_IP_ macro.
 		 */
-		DINFO_FPRINTF(dinfo, "0x%lx <_THIS_IP_>", addr);
+		DINFO_FPRINTF(dinfo, bfd_vma_fmt, addr, "_THIS_IP_");
 		return;
 	}
 
@@ -264,11 +266,11 @@ static void disas_print_addr_reloc(bfd_vma addr, struct disassemble_info *dinfo)
 	 */
 	if (reloc->sym->type == STT_SECTION) {
 		str = offstr(reloc->sym->sec, reloc->sym->offset + offset);
-		DINFO_FPRINTF(dinfo, "0x%lx <%s>", addr, str);
+		DINFO_FPRINTF(dinfo, bfd_vma_fmt, addr, str);
 		free(str);
 	} else {
 		sprint_name(symstr, reloc->sym->name, offset);
-		DINFO_FPRINTF(dinfo, "0x%lx <%s>", addr, symstr);
+		DINFO_FPRINTF(dinfo, bfd_vma_fmt, addr, symstr);
 	}
 }
 
@@ -311,7 +313,7 @@ static void disas_print_address(bfd_vma addr, struct disassemble_info *dinfo)
 	 */
 	sym = insn_call_dest(insn);
 	if (sym && (sym->offset == addr || (sym->offset == 0 && is_reloc))) {
-		DINFO_FPRINTF(dinfo, "0x%lx <%s>", addr, sym->name);
+		DINFO_FPRINTF(dinfo, bfd_vma_fmt, addr, sym->name);
 		return;
 	}
 
-- 
cgit v1.2.3


From d107b3265aa5e61a1e326b2815a767526ddb12ac Mon Sep 17 00:00:00 2001
From: Petr Pavlu <petr.pavlu@suse.com>
Date: Mon, 26 Jan 2026 16:13:48 +0100
Subject: objtool: Replace custom macros in elf.c with shared ones

The source file tools/objtool/elf.c defines the macros ALIGN_UP(),
ALIGN_UP_POW2() and MAX(). These macros unnecessarily duplicate
functionality already available under tools/include/, specifically ALIGN(),
roundup_pow_of_two() and max().

More importantly, the definition of ALIGN_UP_POW2() is incorrect when the
input is 1, as it results in a call to __builtin_clz(0), which produces an
undefined result. This issue impacts the function elf_alloc_reloc(). When
adding the first relocation to a section, the function allocates an
undefined number of relocations.

Replace the custom macros with the shared functionality to resolve these
issues.

Fixes: 2c05ca026218 ("objtool: Add elf_create_reloc() and elf_init_reloc()")
Signed-off-by: Petr Pavlu <petr.pavlu@suse.com>
Link: https://patch.msgid.link/20260126151356.3924887-1-petr.pavlu@suse.com
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
---
 tools/objtool/elf.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

(limited to 'tools')

diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c
index 6a8ed9c62323..2c02c7b49265 100644
--- a/tools/objtool/elf.c
+++ b/tools/objtool/elf.c
@@ -18,15 +18,14 @@
 #include <errno.h>
 #include <libgen.h>
 #include <ctype.h>
+#include <linux/align.h>
+#include <linux/kernel.h>
 #include <linux/interval_tree_generic.h>
+#include <linux/log2.h>
 #include <objtool/builtin.h>
 #include <objtool/elf.h>
 #include <objtool/warn.h>
 
-#define ALIGN_UP(x, align_to) (((x) + ((align_to)-1)) & ~((align_to)-1))
-#define ALIGN_UP_POW2(x) (1U << ((8 * sizeof(x)) - __builtin_clz((x) - 1U)))
-#define MAX(a, b) ((a) > (b) ? (a) : (b))
-
 static inline u32 str_hash(const char *str)
 {
 	return jhash(str, strlen(str), 0);
@@ -1336,7 +1335,7 @@ unsigned int elf_add_string(struct elf *elf, struct section *strtab, const char
 		return -1;
 	}
 
-	offset = ALIGN_UP(strtab->sh.sh_size, strtab->sh.sh_addralign);
+	offset = ALIGN(strtab->sh.sh_size, strtab->sh.sh_addralign);
 
 	if (!elf_add_data(elf, strtab, str, strlen(str) + 1))
 		return -1;
@@ -1378,7 +1377,7 @@ void *elf_add_data(struct elf *elf, struct section *sec, const void *data, size_
 	sec->data->d_size = size;
 	sec->data->d_align = 1;
 
-	offset = ALIGN_UP(sec->sh.sh_size, sec->sh.sh_addralign);
+	offset = ALIGN(sec->sh.sh_size, sec->sh.sh_addralign);
 	sec->sh.sh_size = offset + size;
 
 	mark_sec_changed(elf, sec, true);
@@ -1502,7 +1501,7 @@ static int elf_alloc_reloc(struct elf *elf, struct section *rsec)
 	rsec->data->d_size = nr_relocs_new * elf_rela_size(elf);
 	rsec->sh.sh_size   = rsec->data->d_size;
 
-	nr_alloc = MAX(64, ALIGN_UP_POW2(nr_relocs_new));
+	nr_alloc = max(64UL, roundup_pow_of_two(nr_relocs_new));
 	if (nr_alloc <= rsec->nr_alloc_relocs)
 		return 0;
 
-- 
cgit v1.2.3


From f2dba60339a6299e181671e95293efe312237e2d Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@kernel.org>
Date: Sun, 25 Jan 2026 21:56:39 -0800
Subject: objtool/klp: Fix bug table handling for __WARN_printf()

Running objtool klp-diff on a changed function which uses WARN() can
fail with:

  vmlinux.o: error: objtool: md_run+0x866: failed to convert reloc sym '__bug_table' to its proper format

The problem is that since commit 5b472b6e5bd9 ("x86_64/bug: Implement
__WARN_printf()"), each __WARN_printf() call site now directly
references its bug table entry.  klp-diff errors out when it can't
convert such section-based references to object symbols (because bug
table entries don't have symbols).

Luckily, klp-diff already has code to create symbols for bug table
entries.  Move that code earlier, before function diffing.

Fixes: dd590d4d57eb ("objtool/klp: Introduce klp diff subcommand for diffing object files")
Fixes: 5b472b6e5bd9 ("x86_64/bug: Implement __WARN_printf()")
Reported-by: Song Liu <song@kernel.org>
Tested-by: Song Liu <song@kernel.org>
Link: https://patch.msgid.link/a8e0a714b9da962858842b9aecd63b4900927c88.1769406850.git.jpoimboe@kernel.org
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
---
 tools/objtool/klp-diff.c | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/objtool/klp-diff.c b/tools/objtool/klp-diff.c
index 4d1f9e9977eb..d94531e3f64e 100644
--- a/tools/objtool/klp-diff.c
+++ b/tools/objtool/klp-diff.c
@@ -1425,9 +1425,6 @@ static int clone_special_sections(struct elfs *e)
 {
 	struct section *patched_sec;
 
-	if (create_fake_symbols(e->patched))
-		return -1;
-
 	for_each_sec(e->patched, patched_sec) {
 		if (is_special_section(patched_sec)) {
 			if (clone_special_section(e, patched_sec))
@@ -1704,6 +1701,17 @@ int cmd_klp_diff(int argc, const char **argv)
 	if (!e.out)
 		return -1;
 
+	/*
+	 * Special section fake symbols are needed so that individual special
+	 * section entries can be extracted by clone_special_sections().
+	 *
+	 * Note the fake symbols are also needed by clone_included_functions()
+	 * because __WARN_printf() call sites add references to bug table
+	 * entries in the calling functions.
+	 */
+	if (create_fake_symbols(e.patched))
+		return -1;
+
 	if (clone_included_functions(&e))
 		return -1;
 
-- 
cgit v1.2.3


From 752b807028e63f1473b84eb1350e131eca5e5249 Mon Sep 17 00:00:00 2001
From: Matt Bobrowski <mattbobrowski@google.com>
Date: Tue, 27 Jan 2026 08:51:10 +0000
Subject: bpf: add new BPF_CGROUP_ITER_CHILDREN control option

Currently, the BPF cgroup iterator supports walking descendants in
either pre-order (BPF_CGROUP_ITER_DESCENDANTS_PRE) or post-order
(BPF_CGROUP_ITER_DESCENDANTS_POST). These modes perform an exhaustive
depth-first search (DFS) of the hierarchy. In scenarios where a BPF
program may need to inspect only the direct children of a given parent
cgroup, a full DFS is unnecessarily expensive.

This patch introduces a new BPF cgroup iterator control option,
BPF_CGROUP_ITER_CHILDREN. This control option restricts the traversal
to the immediate children of a specified parent cgroup, allowing for
more targeted and efficient iteration, particularly when exhaustive
depth-first search (DFS) traversal is not required.

Signed-off-by: Matt Bobrowski <mattbobrowski@google.com>
Link: https://lore.kernel.org/r/20260127085112.3608687-1-mattbobrowski@google.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/bpf.h       |  8 ++++++++
 kernel/bpf/cgroup_iter.c       | 26 +++++++++++++++++++++-----
 tools/include/uapi/linux/bpf.h |  8 ++++++++
 3 files changed, 37 insertions(+), 5 deletions(-)

(limited to 'tools')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 44e7dbc278e3..c8d400b7680a 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -119,6 +119,14 @@ enum bpf_cgroup_iter_order {
 	BPF_CGROUP_ITER_DESCENDANTS_PRE,	/* walk descendants in pre-order. */
 	BPF_CGROUP_ITER_DESCENDANTS_POST,	/* walk descendants in post-order. */
 	BPF_CGROUP_ITER_ANCESTORS_UP,		/* walk ancestors upward. */
+	/*
+	 * Walks the immediate children of the specified parent
+	 * cgroup_subsys_state. Unlike BPF_CGROUP_ITER_DESCENDANTS_PRE,
+	 * BPF_CGROUP_ITER_DESCENDANTS_POST, and BPF_CGROUP_ITER_ANCESTORS_UP
+	 * the iterator does not include the specified parent as one of the
+	 * returned iterator elements.
+	 */
+	BPF_CGROUP_ITER_CHILDREN,
 };
 
 union bpf_iter_link_info {
diff --git a/kernel/bpf/cgroup_iter.c b/kernel/bpf/cgroup_iter.c
index f04a468cf6a7..fd51fe3d92cc 100644
--- a/kernel/bpf/cgroup_iter.c
+++ b/kernel/bpf/cgroup_iter.c
@@ -8,12 +8,13 @@
 
 #include "../cgroup/cgroup-internal.h"  /* cgroup_mutex and cgroup_is_dead */
 
-/* cgroup_iter provides four modes of traversal to the cgroup hierarchy.
+/* cgroup_iter provides five modes of traversal to the cgroup hierarchy.
  *
  *  1. Walk the descendants of a cgroup in pre-order.
  *  2. Walk the descendants of a cgroup in post-order.
  *  3. Walk the ancestors of a cgroup.
  *  4. Show the given cgroup only.
+ *  5. Walk the children of a given parent cgroup.
  *
  * For walking descendants, cgroup_iter can walk in either pre-order or
  * post-order. For walking ancestors, the iter walks up from a cgroup to
@@ -78,6 +79,8 @@ static void *cgroup_iter_seq_start(struct seq_file *seq, loff_t *pos)
 		return css_next_descendant_pre(NULL, p->start_css);
 	else if (p->order == BPF_CGROUP_ITER_DESCENDANTS_POST)
 		return css_next_descendant_post(NULL, p->start_css);
+	else if (p->order == BPF_CGROUP_ITER_CHILDREN)
+		return css_next_child(NULL, p->start_css);
 	else /* BPF_CGROUP_ITER_SELF_ONLY and BPF_CGROUP_ITER_ANCESTORS_UP */
 		return p->start_css;
 }
@@ -113,6 +116,8 @@ static void *cgroup_iter_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 		return css_next_descendant_post(curr, p->start_css);
 	else if (p->order == BPF_CGROUP_ITER_ANCESTORS_UP)
 		return curr->parent;
+	else if (p->order == BPF_CGROUP_ITER_CHILDREN)
+		return css_next_child(curr, p->start_css);
 	else  /* BPF_CGROUP_ITER_SELF_ONLY */
 		return NULL;
 }
@@ -200,11 +205,16 @@ static int bpf_iter_attach_cgroup(struct bpf_prog *prog,
 	int order = linfo->cgroup.order;
 	struct cgroup *cgrp;
 
-	if (order != BPF_CGROUP_ITER_DESCENDANTS_PRE &&
-	    order != BPF_CGROUP_ITER_DESCENDANTS_POST &&
-	    order != BPF_CGROUP_ITER_ANCESTORS_UP &&
-	    order != BPF_CGROUP_ITER_SELF_ONLY)
+	switch (order) {
+	case BPF_CGROUP_ITER_DESCENDANTS_PRE:
+	case BPF_CGROUP_ITER_DESCENDANTS_POST:
+	case BPF_CGROUP_ITER_ANCESTORS_UP:
+	case BPF_CGROUP_ITER_SELF_ONLY:
+	case BPF_CGROUP_ITER_CHILDREN:
+		break;
+	default:
 		return -EINVAL;
+	}
 
 	if (fd && id)
 		return -EINVAL;
@@ -257,6 +267,8 @@ show_order:
 		seq_puts(seq, "order: descendants_post\n");
 	else if (aux->cgroup.order == BPF_CGROUP_ITER_ANCESTORS_UP)
 		seq_puts(seq, "order: ancestors_up\n");
+	else if (aux->cgroup.order == BPF_CGROUP_ITER_CHILDREN)
+		seq_puts(seq, "order: children\n");
 	else /* BPF_CGROUP_ITER_SELF_ONLY */
 		seq_puts(seq, "order: self_only\n");
 }
@@ -320,6 +332,7 @@ __bpf_kfunc int bpf_iter_css_new(struct bpf_iter_css *it,
 	case BPF_CGROUP_ITER_DESCENDANTS_PRE:
 	case BPF_CGROUP_ITER_DESCENDANTS_POST:
 	case BPF_CGROUP_ITER_ANCESTORS_UP:
+	case BPF_CGROUP_ITER_CHILDREN:
 		break;
 	default:
 		return -EINVAL;
@@ -345,6 +358,9 @@ __bpf_kfunc struct cgroup_subsys_state *bpf_iter_css_next(struct bpf_iter_css *i
 	case BPF_CGROUP_ITER_DESCENDANTS_POST:
 		kit->pos = css_next_descendant_post(kit->pos, kit->start);
 		break;
+	case BPF_CGROUP_ITER_CHILDREN:
+		kit->pos = css_next_child(kit->pos, kit->start);
+		break;
 	case BPF_CGROUP_ITER_ANCESTORS_UP:
 		kit->pos = kit->pos ? kit->pos->parent : kit->start;
 	}
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 3ca7d76e05f0..5e38b4887de6 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -119,6 +119,14 @@ enum bpf_cgroup_iter_order {
 	BPF_CGROUP_ITER_DESCENDANTS_PRE,	/* walk descendants in pre-order. */
 	BPF_CGROUP_ITER_DESCENDANTS_POST,	/* walk descendants in post-order. */
 	BPF_CGROUP_ITER_ANCESTORS_UP,		/* walk ancestors upward. */
+	/*
+	 * Walks the immediate children of the specified parent
+	 * cgroup_subsys_state. Unlike BPF_CGROUP_ITER_DESCENDANTS_PRE,
+	 * BPF_CGROUP_ITER_DESCENDANTS_POST, and BPF_CGROUP_ITER_ANCESTORS_UP
+	 * the iterator does not include the specified parent as one of the
+	 * returned iterator elements.
+	 */
+	BPF_CGROUP_ITER_CHILDREN,
 };
 
 union bpf_iter_link_info {
-- 
cgit v1.2.3


From 1456ebb291ddee67c9144c8f7f38a6dddcd32ed7 Mon Sep 17 00:00:00 2001
From: Matt Bobrowski <mattbobrowski@google.com>
Date: Tue, 27 Jan 2026 08:51:11 +0000
Subject: selftests/bpf: cover BPF_CGROUP_ITER_CHILDREN control option

Extend some of the existing CSS iterator selftests such that they
cover the newly introduced BPF_CGROUP_ITER_CHILDREN iterator control
option.

Signed-off-by: Matt Bobrowski <mattbobrowski@google.com>
Link: https://lore.kernel.org/r/20260127085112.3608687-2-mattbobrowski@google.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/prog_tests/cgroup_iter.c | 12 ++++++++++++
 tools/testing/selftests/bpf/prog_tests/iters.c       |  8 +++++++-
 tools/testing/selftests/bpf/progs/iters_css.c        |  9 ++++++---
 3 files changed, 25 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_iter.c b/tools/testing/selftests/bpf/prog_tests/cgroup_iter.c
index 574d9a0cdc8e..0f88a9d00a22 100644
--- a/tools/testing/selftests/bpf/prog_tests/cgroup_iter.c
+++ b/tools/testing/selftests/bpf/prog_tests/cgroup_iter.c
@@ -190,6 +190,16 @@ static void test_walk_self_only(struct cgroup_iter *skel)
 			      BPF_CGROUP_ITER_SELF_ONLY, "self_only");
 }
 
+static void test_walk_children(struct cgroup_iter *skel)
+{
+	snprintf(expected_output, sizeof(expected_output),
+		 PROLOGUE "%8llu\n%8llu\n" EPILOGUE, cg_id[CHILD1],
+		 cg_id[CHILD2]);
+
+	read_from_cgroup_iter(skel->progs.cgroup_id_printer, cg_fd[PARENT],
+			      BPF_CGROUP_ITER_CHILDREN, "children");
+}
+
 static void test_walk_dead_self_only(struct cgroup_iter *skel)
 {
 	DECLARE_LIBBPF_OPTS(bpf_iter_attach_opts, opts);
@@ -325,6 +335,8 @@ void test_cgroup_iter(void)
 		test_walk_dead_self_only(skel);
 	if (test__start_subtest("cgroup_iter__self_only_css_task"))
 		test_walk_self_only_css_task();
+	if (test__start_subtest("cgroup_iter__children"))
+		test_walk_children(skel);
 
 out:
 	cgroup_iter__destroy(skel);
diff --git a/tools/testing/selftests/bpf/prog_tests/iters.c b/tools/testing/selftests/bpf/prog_tests/iters.c
index 3cea71f9c500..a539980a2fbe 100644
--- a/tools/testing/selftests/bpf/prog_tests/iters.c
+++ b/tools/testing/selftests/bpf/prog_tests/iters.c
@@ -253,6 +253,11 @@ static void subtest_css_iters(void)
 		{ "/cg1/cg2" },
 		{ "/cg1/cg2/cg3" },
 		{ "/cg1/cg2/cg3/cg4" },
+		{ "/cg1/cg5" },
+		{ "/cg1/cg5/cg6" },
+		{ "/cg1/cg7" },
+		{ "/cg1/cg7/cg8" },
+		{ "/cg1/cg7/cg8/cg9" },
 	};
 	int err, cg_nr = ARRAY_SIZE(cgs);
 	int i;
@@ -284,7 +289,8 @@ static void subtest_css_iters(void)
 
 	ASSERT_EQ(skel->bss->post_order_cnt, cg_nr, "post_order_cnt");
 	ASSERT_EQ(skel->bss->last_cg_id, get_cgroup_id(cgs[0].path), "last_cg_id");
-	ASSERT_EQ(skel->bss->tree_high, cg_nr - 1, "tree_high");
+	ASSERT_EQ(skel->bss->children_cnt, 3, "children_cnt");
+	ASSERT_EQ(skel->bss->tree_high, 3, "tree_high");
 	iters_css__detach(skel);
 cleanup:
 	cleanup_cgroup_environment();
diff --git a/tools/testing/selftests/bpf/progs/iters_css.c b/tools/testing/selftests/bpf/progs/iters_css.c
index ec1f6c2f590b..5a1d87d186a9 100644
--- a/tools/testing/selftests/bpf/progs/iters_css.c
+++ b/tools/testing/selftests/bpf/progs/iters_css.c
@@ -12,8 +12,7 @@ char _license[] SEC("license") = "GPL";
 pid_t target_pid;
 u64 root_cg_id, leaf_cg_id;
 u64 first_cg_id, last_cg_id;
-
-int pre_order_cnt, post_order_cnt, tree_high;
+int pre_order_cnt, post_order_cnt, children_cnt, tree_high;
 
 struct cgroup *bpf_cgroup_from_id(u64 cgid) __ksym;
 void bpf_cgroup_release(struct cgroup *p) __ksym;
@@ -43,7 +42,7 @@ int iter_css_for_each(const void *ctx)
 	}
 	root_css = &root_cgrp->self;
 	leaf_css = &leaf_cgrp->self;
-	pre_order_cnt = post_order_cnt = tree_high = 0;
+	pre_order_cnt = post_order_cnt = children_cnt = tree_high = 0;
 	first_cg_id = last_cg_id = 0;
 
 	bpf_rcu_read_lock();
@@ -60,6 +59,10 @@ int iter_css_for_each(const void *ctx)
 			first_cg_id = cur_cgrp->kn->id;
 	}
 
+	bpf_for_each(css, pos, root_css, BPF_CGROUP_ITER_CHILDREN) {
+		children_cnt++;
+	}
+
 	bpf_for_each(css, pos, leaf_css, BPF_CGROUP_ITER_ANCESTORS_UP)
 		tree_high++;
 
-- 
cgit v1.2.3


From 17e2ce02bf5669dfa659976e93d409228cba98f9 Mon Sep 17 00:00:00 2001
From: Jiayuan Chen <jiayuan.chen@linux.dev>
Date: Sat, 24 Jan 2026 19:32:45 +0800
Subject: selftests/bpf: Add tests for FIONREAD and copied_seq

This commit adds two new test functions: one to reproduce the bug reported
by syzkaller [1], and another to cover the calculation of copied_seq.

The tests primarily involve installing  and uninstalling sockmap on
sockets, then reading data to verify proper functionality.

Additionally, extend the do_test_sockmap_skb_verdict_fionread() function
to support UDP FIONREAD testing.

[1] https://syzkaller.appspot.com/bug?extid=06dbd397158ec0ea4983

Signed-off-by: Jiayuan Chen <jiayuan.chen@linux.dev>
Reviewed-by: Jakub Sitnicki <jakub@cloudflare.com>
Link: https://lore.kernel.org/r/20260124113314.113584-4-jiayuan.chen@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 .../selftests/bpf/prog_tests/sockmap_basic.c       | 294 ++++++++++++++++++++-
 .../selftests/bpf/progs/test_sockmap_pass_prog.c   |  14 +
 2 files changed, 302 insertions(+), 6 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c
index 1e3e4392dcca..256707e7d20d 100644
--- a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c
+++ b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c
@@ -1,7 +1,8 @@
 // SPDX-License-Identifier: GPL-2.0
 // Copyright (c) 2020 Cloudflare
 #include <error.h>
-#include <netinet/tcp.h>
+#include <linux/tcp.h>
+#include <linux/socket.h>
 #include <sys/epoll.h>
 
 #include "test_progs.h"
@@ -22,6 +23,15 @@
 #define TCP_REPAIR_ON		1
 #define TCP_REPAIR_OFF_NO_WP	-1	/* Turn off without window probes */
 
+/**
+ * SOL_TCP is defined in <netinet/tcp.h> (glibc), but the copybuf_address
+ * field of tcp_zerocopy_receive is not yet included in older versions.
+ * This workaround remains necessary until the glibc update propagates.
+ */
+#ifndef SOL_TCP
+#define SOL_TCP 6
+#endif
+
 static int connected_socket_v4(void)
 {
 	struct sockaddr_in addr = {
@@ -536,13 +546,14 @@ out:
 }
 
 
-static void test_sockmap_skb_verdict_fionread(bool pass_prog)
+static void do_test_sockmap_skb_verdict_fionread(int sotype, bool pass_prog)
 {
 	int err, map, verdict, c0 = -1, c1 = -1, p0 = -1, p1 = -1;
 	int expected, zero = 0, sent, recvd, avail;
 	struct test_sockmap_pass_prog *pass = NULL;
 	struct test_sockmap_drop_prog *drop = NULL;
 	char buf[256] = "0123456789";
+	int split_len = sizeof(buf) / 2;
 
 	if (pass_prog) {
 		pass = test_sockmap_pass_prog__open_and_load();
@@ -550,7 +561,10 @@ static void test_sockmap_skb_verdict_fionread(bool pass_prog)
 			return;
 		verdict = bpf_program__fd(pass->progs.prog_skb_verdict);
 		map = bpf_map__fd(pass->maps.sock_map_rx);
-		expected = sizeof(buf);
+		if (sotype == SOCK_DGRAM)
+			expected = split_len; /* FIONREAD for UDP is different from TCP */
+		else
+			expected = sizeof(buf);
 	} else {
 		drop = test_sockmap_drop_prog__open_and_load();
 		if (!ASSERT_OK_PTR(drop, "open_and_load"))
@@ -566,7 +580,7 @@ static void test_sockmap_skb_verdict_fionread(bool pass_prog)
 	if (!ASSERT_OK(err, "bpf_prog_attach"))
 		goto out;
 
-	err = create_socket_pairs(AF_INET, SOCK_STREAM, &c0, &c1, &p0, &p1);
+	err = create_socket_pairs(AF_INET, sotype, &c0, &c1, &p0, &p1);
 	if (!ASSERT_OK(err, "create_socket_pairs()"))
 		goto out;
 
@@ -574,8 +588,9 @@ static void test_sockmap_skb_verdict_fionread(bool pass_prog)
 	if (!ASSERT_OK(err, "bpf_map_update_elem(c1)"))
 		goto out_close;
 
-	sent = xsend(p1, &buf, sizeof(buf), 0);
-	ASSERT_EQ(sent, sizeof(buf), "xsend(p0)");
+	sent = xsend(p1, &buf, split_len, 0);
+	sent += xsend(p1, &buf, sizeof(buf) - split_len, 0);
+	ASSERT_EQ(sent, sizeof(buf), "xsend(p1)");
 	err = ioctl(c1, FIONREAD, &avail);
 	ASSERT_OK(err, "ioctl(FIONREAD) error");
 	ASSERT_EQ(avail, expected, "ioctl(FIONREAD)");
@@ -597,6 +612,12 @@ out:
 		test_sockmap_drop_prog__destroy(drop);
 }
 
+static void test_sockmap_skb_verdict_fionread(bool pass_prog)
+{
+	do_test_sockmap_skb_verdict_fionread(SOCK_STREAM, pass_prog);
+	do_test_sockmap_skb_verdict_fionread(SOCK_DGRAM, pass_prog);
+}
+
 static void test_sockmap_skb_verdict_change_tail(void)
 {
 	struct test_sockmap_change_tail *skel;
@@ -1042,6 +1063,257 @@ close_map:
 	xclose(map);
 }
 
+/* it is used to reproduce WARNING */
+static void test_sockmap_zc(void)
+{
+	int map, err, sent, recvd, zero = 0, one = 1, on = 1;
+	char buf[10] = "0123456789", rcv[11], addr[100];
+	struct test_sockmap_pass_prog *skel = NULL;
+	int c0 = -1, p0 = -1, c1 = -1, p1 = -1;
+	struct tcp_zerocopy_receive zc;
+	socklen_t zc_len = sizeof(zc);
+	struct bpf_program *prog;
+
+	skel = test_sockmap_pass_prog__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "open_and_load"))
+		return;
+
+	if (create_socket_pairs(AF_INET, SOCK_STREAM, &c0, &c1, &p0, &p1))
+		goto end;
+
+	prog = skel->progs.prog_skb_verdict_ingress;
+	map = bpf_map__fd(skel->maps.sock_map_rx);
+
+	err = bpf_prog_attach(bpf_program__fd(prog), map, BPF_SK_SKB_STREAM_VERDICT, 0);
+	if (!ASSERT_OK(err, "bpf_prog_attach"))
+		goto end;
+
+	err = bpf_map_update_elem(map, &zero, &p0, BPF_ANY);
+	if (!ASSERT_OK(err, "bpf_map_update_elem"))
+		goto end;
+
+	err = bpf_map_update_elem(map, &one, &p1, BPF_ANY);
+	if (!ASSERT_OK(err, "bpf_map_update_elem"))
+		goto end;
+
+	sent = xsend(c0, buf, sizeof(buf), 0);
+	if (!ASSERT_EQ(sent, sizeof(buf), "xsend"))
+		goto end;
+
+	/* trigger tcp_bpf_recvmsg_parser and inc copied_seq of p1 */
+	recvd = recv_timeout(p1, rcv, sizeof(rcv), MSG_DONTWAIT, 1);
+	if (!ASSERT_EQ(recvd, sent, "recv_timeout(p1)"))
+		goto end;
+
+	/* uninstall sockmap of p1 */
+	bpf_map_delete_elem(map, &one);
+
+	/* trigger tcp stack and the rcv_nxt of p1 is less than copied_seq */
+	sent = xsend(c1, buf, sizeof(buf) - 1, 0);
+	if (!ASSERT_EQ(sent, sizeof(buf) - 1, "xsend"))
+		goto end;
+
+	err = setsockopt(p1, SOL_SOCKET, SO_ZEROCOPY, &on, sizeof(on));
+	if (!ASSERT_OK(err, "setsockopt"))
+		goto end;
+
+	memset(&zc, 0, sizeof(zc));
+	zc.copybuf_address = (__u64)((unsigned long)addr);
+	zc.copybuf_len = sizeof(addr);
+
+	err = getsockopt(p1, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE, &zc, &zc_len);
+	if (!ASSERT_OK(err, "getsockopt"))
+		goto end;
+
+end:
+	if (c0 >= 0)
+		close(c0);
+	if (p0 >= 0)
+		close(p0);
+	if (c1 >= 0)
+		close(c1);
+	if (p1 >= 0)
+		close(p1);
+	test_sockmap_pass_prog__destroy(skel);
+}
+
+/* it is used to check whether copied_seq of sk is correct */
+static void test_sockmap_copied_seq(bool strp)
+{
+	int i, map, err, sent, recvd, zero = 0, one = 1;
+	struct test_sockmap_pass_prog *skel = NULL;
+	int c0 = -1, p0 = -1, c1 = -1, p1 = -1;
+	char buf[10] = "0123456789", rcv[11];
+	struct bpf_program *prog;
+
+	skel = test_sockmap_pass_prog__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "open_and_load"))
+		return;
+
+	if (create_socket_pairs(AF_INET, SOCK_STREAM, &c0, &c1, &p0, &p1))
+		goto end;
+
+	prog = skel->progs.prog_skb_verdict_ingress;
+	map = bpf_map__fd(skel->maps.sock_map_rx);
+
+	err = bpf_prog_attach(bpf_program__fd(prog), map, BPF_SK_SKB_STREAM_VERDICT, 0);
+	if (!ASSERT_OK(err, "bpf_prog_attach verdict"))
+		goto end;
+
+	if (strp) {
+		prog = skel->progs.prog_skb_verdict_ingress_strp;
+		err = bpf_prog_attach(bpf_program__fd(prog), map, BPF_SK_SKB_STREAM_PARSER, 0);
+		if (!ASSERT_OK(err, "bpf_prog_attach parser"))
+			goto end;
+	}
+
+	err = bpf_map_update_elem(map, &zero, &p0, BPF_ANY);
+	if (!ASSERT_OK(err, "bpf_map_update_elem(p0)"))
+		goto end;
+
+	err = bpf_map_update_elem(map, &one, &p1, BPF_ANY);
+	if (!ASSERT_OK(err, "bpf_map_update_elem(p1)"))
+		goto end;
+
+	/* just trigger sockamp: data sent by c0 will be received by p1 */
+	sent = xsend(c0, buf, sizeof(buf), 0);
+	if (!ASSERT_EQ(sent, sizeof(buf), "xsend(c0), bpf"))
+		goto end;
+
+	/* do partial read */
+	recvd = recv_timeout(p1, rcv, 1, MSG_DONTWAIT, 1);
+	recvd += recv_timeout(p1, rcv + 1, sizeof(rcv) - 1, MSG_DONTWAIT, 1);
+	if (!ASSERT_EQ(recvd, sent, "recv_timeout(p1), bpf") ||
+	    !ASSERT_OK(memcmp(buf, rcv, recvd), "data mismatch"))
+		goto end;
+
+	/* uninstall sockmap of p1 and p0 */
+	err = bpf_map_delete_elem(map, &one);
+	if (!ASSERT_OK(err, "bpf_map_delete_elem(1)"))
+		goto end;
+
+	err = bpf_map_delete_elem(map, &zero);
+	if (!ASSERT_OK(err, "bpf_map_delete_elem(0)"))
+		goto end;
+
+	/* now all sockets become plain socket, they should still work */
+	for (i = 0; i < 5; i++) {
+		/* test copied_seq of p1 by running tcp native stack */
+		sent = xsend(c1, buf, sizeof(buf), 0);
+		if (!ASSERT_EQ(sent, sizeof(buf), "xsend(c1), native"))
+			goto end;
+
+		recvd = recv(p1, rcv, sizeof(rcv), MSG_DONTWAIT);
+		if (!ASSERT_EQ(recvd, sent, "recv_timeout(p1), native"))
+			goto end;
+
+		/* p0 previously redirected skb to p1, we also check copied_seq of p0 */
+		sent = xsend(c0, buf, sizeof(buf), 0);
+		if (!ASSERT_EQ(sent, sizeof(buf), "xsend(c0), native"))
+			goto end;
+
+		recvd = recv(p0, rcv, sizeof(rcv), MSG_DONTWAIT);
+		if (!ASSERT_EQ(recvd, sent, "recv_timeout(p0), native"))
+			goto end;
+	}
+
+end:
+	if (c0 >= 0)
+		close(c0);
+	if (p0 >= 0)
+		close(p0);
+	if (c1 >= 0)
+		close(c1);
+	if (p1 >= 0)
+		close(p1);
+	test_sockmap_pass_prog__destroy(skel);
+}
+
+/* Wait until FIONREAD returns the expected value or timeout */
+static int wait_for_fionread(int fd, int expected, unsigned int timeout_ms)
+{
+	unsigned int elapsed = 0;
+	int avail = 0;
+
+	while (elapsed < timeout_ms) {
+		if (ioctl(fd, FIONREAD, &avail) < 0)
+			return -errno;
+		if (avail >= expected)
+			return avail;
+		usleep(1000);
+		elapsed++;
+	}
+	return avail;
+}
+
+/* it is used to send data to via native stack and BPF redirecting */
+static void test_sockmap_multi_channels(int sotype)
+{
+	int map, err, sent, recvd, zero = 0, one = 1, avail = 0, expected;
+	struct test_sockmap_pass_prog *skel = NULL;
+	int c0 = -1, p0 = -1, c1 = -1, p1 = -1;
+	char buf[10] = "0123456789", rcv[11];
+	struct bpf_program *prog;
+
+	skel = test_sockmap_pass_prog__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "open_and_load"))
+		return;
+
+	err = create_socket_pairs(AF_INET, sotype, &c0, &c1, &p0, &p1);
+	if (err)
+		goto end;
+
+	prog = skel->progs.prog_skb_verdict_ingress;
+	map = bpf_map__fd(skel->maps.sock_map_rx);
+
+	err = bpf_prog_attach(bpf_program__fd(prog), map, BPF_SK_SKB_STREAM_VERDICT, 0);
+	if (!ASSERT_OK(err, "bpf_prog_attach verdict"))
+		goto end;
+
+	err = bpf_map_update_elem(map, &zero, &p0, BPF_ANY);
+	if (!ASSERT_OK(err, "bpf_map_update_elem(p0)"))
+		goto end;
+
+	err = bpf_map_update_elem(map, &one, &p1, BPF_ANY);
+	if (!ASSERT_OK(err, "bpf_map_update_elem"))
+		goto end;
+
+	/* send data to p1 via native stack */
+	sent = xsend(c1, buf, 2, 0);
+	if (!ASSERT_EQ(sent, 2, "xsend(2)"))
+		goto end;
+
+	avail = wait_for_fionread(p1, 2, IO_TIMEOUT_SEC);
+	ASSERT_EQ(avail, 2, "ioctl(FIONREAD) partial return");
+
+	/* send data to p1 via bpf redirecting */
+	sent = xsend(c0, buf + 2, sizeof(buf) - 2, 0);
+	if (!ASSERT_EQ(sent, sizeof(buf) - 2, "xsend(remain-data)"))
+		goto end;
+
+	/* Poll FIONREAD until expected bytes arrive, poll_read() is unreliable
+	 * here since it may return immediately if prior data is already queued.
+	 */
+	expected = sotype == SOCK_DGRAM ? 2 : sizeof(buf);
+	avail = wait_for_fionread(p1, expected, IO_TIMEOUT_SEC);
+	ASSERT_EQ(avail, expected, "ioctl(FIONREAD) full return");
+
+	recvd = recv_timeout(p1, rcv, sizeof(rcv), MSG_DONTWAIT, 1);
+	if (!ASSERT_EQ(recvd, sizeof(buf), "recv_timeout(p1)") ||
+	    !ASSERT_OK(memcmp(buf, rcv, recvd), "data mismatch"))
+		goto end;
+end:
+	if (c0 >= 0)
+		close(c0);
+	if (p0 >= 0)
+		close(p0);
+	if (c1 >= 0)
+		close(c1);
+	if (p1 >= 0)
+		close(p1);
+	test_sockmap_pass_prog__destroy(skel);
+}
+
 void test_sockmap_basic(void)
 {
 	if (test__start_subtest("sockmap create_update_free"))
@@ -1108,4 +1380,14 @@ void test_sockmap_basic(void)
 		test_sockmap_skb_verdict_vsock_poll();
 	if (test__start_subtest("sockmap vsock unconnected"))
 		test_sockmap_vsock_unconnected();
+	if (test__start_subtest("sockmap with zc"))
+		test_sockmap_zc();
+	if (test__start_subtest("sockmap recover"))
+		test_sockmap_copied_seq(false);
+	if (test__start_subtest("sockmap recover with strp"))
+		test_sockmap_copied_seq(true);
+	if (test__start_subtest("sockmap tcp multi channels"))
+		test_sockmap_multi_channels(SOCK_STREAM);
+	if (test__start_subtest("sockmap udp multi channels"))
+		test_sockmap_multi_channels(SOCK_DGRAM);
 }
diff --git a/tools/testing/selftests/bpf/progs/test_sockmap_pass_prog.c b/tools/testing/selftests/bpf/progs/test_sockmap_pass_prog.c
index 69aacc96db36..ef9edca184ea 100644
--- a/tools/testing/selftests/bpf/progs/test_sockmap_pass_prog.c
+++ b/tools/testing/selftests/bpf/progs/test_sockmap_pass_prog.c
@@ -44,4 +44,18 @@ int prog_skb_parser(struct __sk_buff *skb)
 	return SK_PASS;
 }
 
+SEC("sk_skb/stream_verdict")
+int prog_skb_verdict_ingress(struct __sk_buff *skb)
+{
+	int one = 1;
+
+	return bpf_sk_redirect_map(skb, &sock_map_rx, one, BPF_F_INGRESS);
+}
+
+SEC("sk_skb/stream_parser")
+int prog_skb_verdict_ingress_strp(struct __sk_buff *skb)
+{
+	return skb->len;
+}
+
 char _license[] SEC("license") = "GPL";
-- 
cgit v1.2.3


From 612e4022c616eba66ed15e6b7a9924251e0298e8 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 27 Jan 2026 12:43:43 -0300
Subject: perf strlist: Remove dont_dupstr logic, used only once

Ian Rogers noticed that 678ed6b707e4b2db ("perf strlist: Don't write to
const memory") breaks the 'Remove thread map' 'perf test' entry, because
it keeps pointers to the temporary string introduced to avoid touching
the const memory.

This is because the thread_map__new_by_[pt]id_str() were the only
methods using the slist->dont_dupstr knob to keep pointers to the
original const string list, as it uses strtol to parse numbers and it
stops at the comma.

As this is the only case of dont_dupstr use, dupstr being the default,
and it gets in the way of getting rid of the last const-correctness,
remove this knob, with it:

  $ perf test 37
  37: Remove thread map   : Ok
  $

Fixes: 678ed6b707e4b2db ("perf strlist: Don't write to const memory")
Reported-by: Ian Rogers <irogers@google.com>
Tested-by: Ian Rogers <irogers@google.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/strlist.c    | 25 ++++++++-----------------
 tools/perf/util/strlist.h    |  2 --
 tools/perf/util/thread_map.c | 18 ++++++------------
 3 files changed, 14 insertions(+), 31 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/strlist.c b/tools/perf/util/strlist.c
index 98883672fcf4..50add72575e0 100644
--- a/tools/perf/util/strlist.c
+++ b/tools/perf/util/strlist.c
@@ -12,20 +12,16 @@
 #include <linux/zalloc.h>
 
 static
-struct rb_node *strlist__node_new(struct rblist *rblist, const void *entry)
+struct rb_node *strlist__node_new(struct rblist *rblist __maybe_unused, const void *entry)
 {
 	const char *s = entry;
 	struct rb_node *rc = NULL;
-	struct strlist *strlist = container_of(rblist, struct strlist, rblist);
 	struct str_node *snode = malloc(sizeof(*snode));
 
 	if (snode != NULL) {
-		if (strlist->dupstr) {
-			s = strdup(s);
-			if (s == NULL)
-				goto out_delete;
-		}
-		snode->s = s;
+		snode->s = strdup(s);
+		if (snode->s == NULL)
+			goto out_delete;
 		rc = &snode->rb_node;
 	}
 
@@ -36,20 +32,18 @@ out_delete:
 	return NULL;
 }
 
-static void str_node__delete(struct str_node *snode, bool dupstr)
+static void str_node__delete(struct str_node *snode)
 {
-	if (dupstr)
-		zfree((char **)&snode->s);
+	zfree((char **)&snode->s);
 	free(snode);
 }
 
 static
-void strlist__node_delete(struct rblist *rblist, struct rb_node *rb_node)
+void strlist__node_delete(struct rblist *rblist __maybe_unused, struct rb_node *rb_node)
 {
-	struct strlist *slist = container_of(rblist, struct strlist, rblist);
 	struct str_node *snode = container_of(rb_node, struct str_node, rb_node);
 
-	str_node__delete(snode, slist->dupstr);
+	str_node__delete(snode);
 }
 
 static int strlist__node_cmp(struct rb_node *rb_node, const void *entry)
@@ -165,12 +159,10 @@ struct strlist *strlist__new(const char *list, const struct strlist_config *conf
 	struct strlist *slist = malloc(sizeof(*slist));
 
 	if (slist != NULL) {
-		bool dupstr = true;
 		bool file_only = false;
 		const char *dirname = NULL;
 
 		if (config) {
-			dupstr = !config->dont_dupstr;
 			dirname = config->dirname;
 			file_only = config->file_only;
 		}
@@ -180,7 +172,6 @@ struct strlist *strlist__new(const char *list, const struct strlist_config *conf
 		slist->rblist.node_new    = strlist__node_new;
 		slist->rblist.node_delete = strlist__node_delete;
 
-		slist->dupstr	 = dupstr;
 		slist->file_only = file_only;
 
 		if (list && strlist__parse_list(slist, list, dirname) != 0)
diff --git a/tools/perf/util/strlist.h b/tools/perf/util/strlist.h
index 7e82c71dcc42..3e9533e66ca9 100644
--- a/tools/perf/util/strlist.h
+++ b/tools/perf/util/strlist.h
@@ -14,7 +14,6 @@ struct str_node {
 
 struct strlist {
 	struct rblist rblist;
-	bool	      dupstr;
 	bool	      file_only;
 };
 
@@ -24,7 +23,6 @@ struct strlist {
  *             found
  */
 struct strlist_config {
-	bool dont_dupstr;
 	bool file_only;
 	const char *dirname;
 };
diff --git a/tools/perf/util/thread_map.c b/tools/perf/util/thread_map.c
index ca193c1374ed..48c70f149e92 100644
--- a/tools/perf/util/thread_map.c
+++ b/tools/perf/util/thread_map.c
@@ -164,19 +164,16 @@ static struct perf_thread_map *thread_map__new_by_pid_str(const char *pid_str)
 	struct dirent **namelist = NULL;
 	int i, j = 0;
 	pid_t pid, prev_pid = INT_MAX;
-	char *end_ptr;
 	struct str_node *pos;
-	struct strlist_config slist_config = { .dont_dupstr = true, };
-	struct strlist *slist = strlist__new(pid_str, &slist_config);
+	struct strlist *slist = strlist__new(pid_str, NULL);
 
 	if (!slist)
 		return NULL;
 
 	strlist__for_each_entry(pos, slist) {
-		pid = strtol(pos->s, &end_ptr, 10);
+		pid = strtol(pos->s, NULL, 10);
 
-		if (pid == INT_MIN || pid == INT_MAX ||
-		    (*end_ptr != '\0' && *end_ptr != ','))
+		if (pid == INT_MIN || pid == INT_MAX)
 			goto out_free_threads;
 
 		if (pid == prev_pid)
@@ -223,24 +220,21 @@ struct perf_thread_map *thread_map__new_by_tid_str(const char *tid_str)
 	struct perf_thread_map *threads = NULL, *nt;
 	int ntasks = 0;
 	pid_t tid, prev_tid = INT_MAX;
-	char *end_ptr;
 	struct str_node *pos;
-	struct strlist_config slist_config = { .dont_dupstr = true, };
 	struct strlist *slist;
 
 	/* perf-stat expects threads to be generated even if tid not given */
 	if (!tid_str)
 		return perf_thread_map__new_dummy();
 
-	slist = strlist__new(tid_str, &slist_config);
+	slist = strlist__new(tid_str, NULL);
 	if (!slist)
 		return NULL;
 
 	strlist__for_each_entry(pos, slist) {
-		tid = strtol(pos->s, &end_ptr, 10);
+		tid = strtol(pos->s, NULL, 10);
 
-		if (tid == INT_MIN || tid == INT_MAX ||
-		    (*end_ptr != '\0' && *end_ptr != ','))
+		if (tid == INT_MIN || tid == INT_MAX)
 			goto out_free_threads;
 
 		if (tid == prev_tid)
-- 
cgit v1.2.3


From 297c9d96e3085116c5cde18170dba716a1f2591e Mon Sep 17 00:00:00 2001
From: James Clark <james.clark@linaro.org>
Date: Wed, 21 Jan 2026 16:19:40 +0000
Subject: perf jevents: Handle deleted JSONS in out of source builds

Make the source folders a dependency for the generated folder root so
that whenever a file is deleted from the source it will force a new
fresh copy of all the JSON files and avoid stale deleted files.

JSON_DIRS_OUTPUT_ROOT needs to be a dependency of LEGACY_CACHE_JSON so
that the root folder doesn't get cleaned after the legacy JSON is
generated. But this is a no-op with in-source builds as
JSON_DIRS_OUTPUT_ROOT is unset.

JSON_DIRS is added as a dependency of PMU_EVENTS_C which also forces a
re-build for in source builds when JSON files are deleted. This could
have also resulted in stale builds, but never a broken one.

Closes: https://lore.kernel.org/linux-next/aW5XSAo88_LBPSYI@sirena.org.uk/
Fixes: 4bb55de4ff03db3e ("perf jevents: Support copying the source json files to OUTPUT")
Reported-by: Mark Brown <broonie@kernel.org>
Signed-off-by: James Clark <james.clark@linaro.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/pmu-events/Build | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/pmu-events/Build b/tools/perf/pmu-events/Build
index a46ab7b612df..4f9ef624ba70 100644
--- a/tools/perf/pmu-events/Build
+++ b/tools/perf/pmu-events/Build
@@ -1,5 +1,6 @@
 pmu-events-y	+= pmu-events.o
 JSON		=  $(shell find pmu-events/arch -name '*.json' -o -name '*.csv')
+JSON_DIRS	=  $(shell find pmu-events/arch -type d)
 JDIR_TEST	=  pmu-events/arch/test
 JSON_TEST	=  $(shell [ -d $(JDIR_TEST) ] &&			\
 			find $(JDIR_TEST) -name '*.json')
@@ -31,16 +32,23 @@ $(PMU_EVENTS_C): $(EMPTY_PMU_EVENTS_C)
 else
 # Copy checked-in json to OUTPUT for generation if it's an out of source build
 ifneq ($(OUTPUT),)
-$(OUTPUT)pmu-events/arch/%: pmu-events/arch/%
+# Remove all output directories when any source directory timestamp changes
+# so there are no stale deleted files
+JSON_DIRS_ROOT = $(OUTPUT)pmu-events/arch/
+$(JSON_DIRS_ROOT): $(JSON_DIRS)
+	$(Q)$(call echo-cmd,gen)rm -rf $@
+	$(Q)mkdir -p $@
+
+$(OUTPUT)pmu-events/arch/%: pmu-events/arch/% $(JSON_DIRS_ROOT)
 	$(call rule_mkdir)
 	$(Q)$(call echo-cmd,gen)cp $< $@
 endif
 
-$(LEGACY_CACHE_JSON): $(LEGACY_CACHE_PY)
+$(LEGACY_CACHE_JSON): $(LEGACY_CACHE_PY) $(JSON_DIRS_ROOT)
 	$(call rule_mkdir)
 	$(Q)$(call echo-cmd,gen)$(PYTHON) $(LEGACY_CACHE_PY) > $@
 
-GEN_JSON = $(patsubst %,$(OUTPUT)%,$(JSON)) $(LEGACY_CACHE_JSON)
+GEN_JSON = $(patsubst %,$(OUTPUT)%,$(JSON)) $(LEGACY_CACHE_JSON) $(JSON_DIRS)
 
 $(METRIC_TEST_LOG): $(METRIC_TEST_PY) $(METRIC_PY)
 	$(call rule_mkdir)
-- 
cgit v1.2.3


From cc4448d0856d424e52b5f53b2592575598233eac Mon Sep 17 00:00:00 2001
From: Emil Tsalapatis <emil@etsalapatis.com>
Date: Thu, 22 Jan 2026 22:26:03 -0500
Subject: tools/sched_ext: add scx_userland scheduler

Add in the scx_userland scheduler that does vruntime-based
scheduling in userspace code and communicates scheduling
decisions to BPF by accessing and modifying globals through
the skeleton.

Cc: Tejun Heo <tj@kernel.org>
Cc: David Vernet <dvernet@meta.com>
Signed-off-by: Emil Tsalapatis <emil@etsalapatis.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 tools/sched_ext/Makefile           |   2 +-
 tools/sched_ext/scx_userland.bpf.c | 344 +++++++++++++++++++++++++++++
 tools/sched_ext/scx_userland.c     | 437 +++++++++++++++++++++++++++++++++++++
 tools/sched_ext/scx_userland.h     |  17 ++
 4 files changed, 799 insertions(+), 1 deletion(-)
 create mode 100644 tools/sched_ext/scx_userland.bpf.c
 create mode 100644 tools/sched_ext/scx_userland.c
 create mode 100644 tools/sched_ext/scx_userland.h

(limited to 'tools')

diff --git a/tools/sched_ext/Makefile b/tools/sched_ext/Makefile
index e4bda2474060..12043a82a1a9 100644
--- a/tools/sched_ext/Makefile
+++ b/tools/sched_ext/Makefile
@@ -189,7 +189,7 @@ $(INCLUDE_DIR)/%.bpf.skel.h: $(SCXOBJ_DIR)/%.bpf.o $(INCLUDE_DIR)/vmlinux.h $(BP
 
 SCX_COMMON_DEPS := include/scx/common.h include/scx/user_exit_info.h | $(BINDIR)
 
-c-sched-targets = scx_simple scx_cpu0 scx_qmap scx_central scx_flatcg
+c-sched-targets = scx_simple scx_cpu0 scx_qmap scx_central scx_flatcg scx_userland
 
 $(addprefix $(BINDIR)/,$(c-sched-targets)): \
 	$(BINDIR)/%: \
diff --git a/tools/sched_ext/scx_userland.bpf.c b/tools/sched_ext/scx_userland.bpf.c
new file mode 100644
index 000000000000..f29862b89386
--- /dev/null
+++ b/tools/sched_ext/scx_userland.bpf.c
@@ -0,0 +1,344 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * A minimal userland scheduler.
+ *
+ * In terms of scheduling, this provides two different types of behaviors:
+ * 1. A global FIFO scheduling order for _any_ tasks that have CPU affinity.
+ *    All such tasks are direct-dispatched from the kernel, and are never
+ *    enqueued in user space.
+ * 2. A primitive vruntime scheduler that is implemented in user space, for all
+ *    other tasks.
+ *
+ * Some parts of this example user space scheduler could be implemented more
+ * efficiently using more complex and sophisticated data structures. For
+ * example, rather than using BPF_MAP_TYPE_QUEUE's,
+ * BPF_MAP_TYPE_{USER_}RINGBUF's could be used for exchanging messages between
+ * user space and kernel space. Similarly, we use a simple vruntime-sorted list
+ * in user space, but an rbtree could be used instead.
+ *
+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
+ * Copyright (c) 2022 David Vernet <dvernet@meta.com>
+ */
+#include <scx/common.bpf.h>
+#include "scx_userland.h"
+
+/*
+ * Maximum amount of tasks enqueued/dispatched between kernel and user-space.
+ */
+#define MAX_ENQUEUED_TASKS 4096
+
+char _license[] SEC("license") = "GPL";
+
+const volatile s32 usersched_pid;
+
+/* !0 for veristat, set during init */
+const volatile u32 num_possible_cpus = 64;
+
+/* Stats that are printed by user space. */
+u64 nr_failed_enqueues, nr_kernel_enqueues, nr_user_enqueues;
+
+/*
+ * Number of tasks that are queued for scheduling.
+ *
+ * This number is incremented by the BPF component when a task is queued to the
+ * user-space scheduler and it must be decremented by the user-space scheduler
+ * when a task is consumed.
+ */
+volatile u64 nr_queued;
+
+/*
+ * Number of tasks that are waiting for scheduling.
+ *
+ * This number must be updated by the user-space scheduler to keep track if
+ * there is still some scheduling work to do.
+ */
+volatile u64 nr_scheduled;
+
+UEI_DEFINE(uei);
+
+/*
+ * The map containing tasks that are enqueued in user space from the kernel.
+ *
+ * This map is drained by the user space scheduler.
+ */
+struct {
+	__uint(type, BPF_MAP_TYPE_QUEUE);
+	__uint(max_entries, MAX_ENQUEUED_TASKS);
+	__type(value, struct scx_userland_enqueued_task);
+} enqueued SEC(".maps");
+
+/*
+ * The map containing tasks that are dispatched to the kernel from user space.
+ *
+ * Drained by the kernel in userland_dispatch().
+ */
+struct {
+	__uint(type, BPF_MAP_TYPE_QUEUE);
+	__uint(max_entries, MAX_ENQUEUED_TASKS);
+	__type(value, s32);
+} dispatched SEC(".maps");
+
+/* Per-task scheduling context */
+struct task_ctx {
+	bool force_local; /* Dispatch directly to local DSQ */
+};
+
+/* Map that contains task-local storage. */
+struct {
+	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
+	__uint(map_flags, BPF_F_NO_PREALLOC);
+	__type(key, int);
+	__type(value, struct task_ctx);
+} task_ctx_stor SEC(".maps");
+
+/*
+ * Flag used to wake-up the user-space scheduler.
+ */
+static volatile u32 usersched_needed;
+
+/*
+ * Set user-space scheduler wake-up flag (equivalent to an atomic release
+ * operation).
+ */
+static void set_usersched_needed(void)
+{
+	__sync_fetch_and_or(&usersched_needed, 1);
+}
+
+/*
+ * Check and clear user-space scheduler wake-up flag (equivalent to an atomic
+ * acquire operation).
+ */
+static bool test_and_clear_usersched_needed(void)
+{
+	return __sync_fetch_and_and(&usersched_needed, 0) == 1;
+}
+
+static bool is_usersched_task(const struct task_struct *p)
+{
+	return p->pid == usersched_pid;
+}
+
+static bool keep_in_kernel(const struct task_struct *p)
+{
+	return p->nr_cpus_allowed < num_possible_cpus;
+}
+
+static struct task_struct *usersched_task(void)
+{
+	struct task_struct *p;
+
+	p = bpf_task_from_pid(usersched_pid);
+	/*
+	 * Should never happen -- the usersched task should always be managed
+	 * by sched_ext.
+	 */
+	if (!p)
+		scx_bpf_error("Failed to find usersched task %d", usersched_pid);
+
+	return p;
+}
+
+s32 BPF_STRUCT_OPS(userland_select_cpu, struct task_struct *p,
+		   s32 prev_cpu, u64 wake_flags)
+{
+	if (keep_in_kernel(p)) {
+		s32 cpu;
+		struct task_ctx *tctx;
+
+		tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
+		if (!tctx) {
+			scx_bpf_error("Failed to look up task-local storage for %s", p->comm);
+			return -ESRCH;
+		}
+
+		if (p->nr_cpus_allowed == 1 ||
+		    scx_bpf_test_and_clear_cpu_idle(prev_cpu)) {
+			tctx->force_local = true;
+			return prev_cpu;
+		}
+
+		cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
+		if (cpu >= 0) {
+			tctx->force_local = true;
+			return cpu;
+		}
+	}
+
+	return prev_cpu;
+}
+
+static void dispatch_user_scheduler(void)
+{
+	struct task_struct *p;
+
+	p = usersched_task();
+	if (p) {
+		scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0);
+		bpf_task_release(p);
+	}
+}
+
+static void enqueue_task_in_user_space(struct task_struct *p, u64 enq_flags)
+{
+	struct scx_userland_enqueued_task task = {};
+
+	task.pid = p->pid;
+	task.sum_exec_runtime = p->se.sum_exec_runtime;
+	task.weight = p->scx.weight;
+
+	if (bpf_map_push_elem(&enqueued, &task, 0)) {
+		/*
+		 * If we fail to enqueue the task in user space, put it
+		 * directly on the global DSQ.
+		 */
+		__sync_fetch_and_add(&nr_failed_enqueues, 1);
+		scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
+	} else {
+		__sync_fetch_and_add(&nr_user_enqueues, 1);
+		set_usersched_needed();
+	}
+}
+
+void BPF_STRUCT_OPS(userland_enqueue, struct task_struct *p, u64 enq_flags)
+{
+	if (keep_in_kernel(p)) {
+		u64 dsq_id = SCX_DSQ_GLOBAL;
+		struct task_ctx *tctx;
+
+		tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
+		if (!tctx) {
+			scx_bpf_error("Failed to lookup task ctx for %s", p->comm);
+			return;
+		}
+
+		if (tctx->force_local)
+			dsq_id = SCX_DSQ_LOCAL;
+		tctx->force_local = false;
+		scx_bpf_dsq_insert(p, dsq_id, SCX_SLICE_DFL, enq_flags);
+		__sync_fetch_and_add(&nr_kernel_enqueues, 1);
+		return;
+	} else if (!is_usersched_task(p)) {
+		enqueue_task_in_user_space(p, enq_flags);
+	}
+}
+
+void BPF_STRUCT_OPS(userland_dispatch, s32 cpu, struct task_struct *prev)
+{
+	if (test_and_clear_usersched_needed())
+		dispatch_user_scheduler();
+
+	bpf_repeat(MAX_ENQUEUED_TASKS) {
+		s32 pid;
+		struct task_struct *p;
+
+		if (bpf_map_pop_elem(&dispatched, &pid))
+			break;
+
+		/*
+		 * The task could have exited by the time we get around to
+		 * dispatching it. Treat this as a normal occurrence, and simply
+		 * move onto the next iteration.
+		 */
+		p = bpf_task_from_pid(pid);
+		if (!p)
+			continue;
+
+		scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0);
+		bpf_task_release(p);
+	}
+}
+
+/*
+ * A CPU is about to change its idle state. If the CPU is going idle, ensure
+ * that the user-space scheduler has a chance to run if there is any remaining
+ * work to do.
+ */
+void BPF_STRUCT_OPS(userland_update_idle, s32 cpu, bool idle)
+{
+	/*
+	 * Don't do anything if we exit from and idle state, a CPU owner will
+	 * be assigned in .running().
+	 */
+	if (!idle)
+		return;
+	/*
+	 * A CPU is now available, notify the user-space scheduler that tasks
+	 * can be dispatched, if there is at least one task waiting to be
+	 * scheduled, either queued (accounted in nr_queued) or scheduled
+	 * (accounted in nr_scheduled).
+	 *
+	 * NOTE: nr_queued is incremented by the BPF component, more exactly in
+	 * enqueue(), when a task is sent to the user-space scheduler, then
+	 * the scheduler drains the queued tasks (updating nr_queued) and adds
+	 * them to its internal data structures / state; at this point tasks
+	 * become "scheduled" and the user-space scheduler will take care of
+	 * updating nr_scheduled accordingly; lastly tasks will be dispatched
+	 * and the user-space scheduler will update nr_scheduled again.
+	 *
+	 * Checking both counters allows to determine if there is still some
+	 * pending work to do for the scheduler: new tasks have been queued
+	 * since last check, or there are still tasks "queued" or "scheduled"
+	 * since the previous user-space scheduler run. If the counters are
+	 * both zero it is pointless to wake-up the scheduler (even if a CPU
+	 * becomes idle), because there is nothing to do.
+	 *
+	 * Keep in mind that update_idle() doesn't run concurrently with the
+	 * user-space scheduler (that is single-threaded): this function is
+	 * naturally serialized with the user-space scheduler code, therefore
+	 * this check here is also safe from a concurrency perspective.
+	 */
+	if (nr_queued || nr_scheduled) {
+		/*
+		 * Kick the CPU to make it immediately ready to accept
+		 * dispatched tasks.
+		 */
+		set_usersched_needed();
+		scx_bpf_kick_cpu(cpu, 0);
+	}
+}
+
+s32 BPF_STRUCT_OPS(userland_init_task, struct task_struct *p,
+		   struct scx_init_task_args *args)
+{
+	if (bpf_task_storage_get(&task_ctx_stor, p, 0,
+				 BPF_LOCAL_STORAGE_GET_F_CREATE))
+		return 0;
+	else
+		return -ENOMEM;
+}
+
+s32 BPF_STRUCT_OPS(userland_init)
+{
+	if (num_possible_cpus == 0) {
+		scx_bpf_error("User scheduler # CPUs uninitialized (%d)",
+			      num_possible_cpus);
+		return -EINVAL;
+	}
+
+	if (usersched_pid <= 0) {
+		scx_bpf_error("User scheduler pid uninitialized (%d)",
+			      usersched_pid);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+void BPF_STRUCT_OPS(userland_exit, struct scx_exit_info *ei)
+{
+	UEI_RECORD(uei, ei);
+}
+
+SCX_OPS_DEFINE(userland_ops,
+	       .select_cpu		= (void *)userland_select_cpu,
+	       .enqueue			= (void *)userland_enqueue,
+	       .dispatch		= (void *)userland_dispatch,
+	       .update_idle		= (void *)userland_update_idle,
+	       .init_task		= (void *)userland_init_task,
+	       .init			= (void *)userland_init,
+	       .exit			= (void *)userland_exit,
+	       .flags			= SCX_OPS_ENQ_LAST |
+					  SCX_OPS_KEEP_BUILTIN_IDLE,
+	       .name			= "userland");
diff --git a/tools/sched_ext/scx_userland.c b/tools/sched_ext/scx_userland.c
new file mode 100644
index 000000000000..10b31020f44f
--- /dev/null
+++ b/tools/sched_ext/scx_userland.c
@@ -0,0 +1,437 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * A demo sched_ext user space scheduler which provides vruntime semantics
+ * using a simple ordered-list implementation.
+ *
+ * Each CPU in the system resides in a single, global domain. This precludes
+ * the need to do any load balancing between domains. The scheduler could
+ * easily be extended to support multiple domains, with load balancing
+ * happening in user space.
+ *
+ * Any task which has any CPU affinity is scheduled entirely in BPF. This
+ * program only schedules tasks which may run on any CPU.
+ *
+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
+ * Copyright (c) 2022 David Vernet <dvernet@meta.com>
+ */
+#include <stdio.h>
+#include <unistd.h>
+#include <sched.h>
+#include <signal.h>
+#include <assert.h>
+#include <libgen.h>
+#include <pthread.h>
+#include <bpf/bpf.h>
+#include <sys/mman.h>
+#include <sys/queue.h>
+#include <sys/syscall.h>
+
+#include <scx/common.h>
+#include "scx_userland.h"
+#include "scx_userland.bpf.skel.h"
+
+const char help_fmt[] =
+"A minimal userland sched_ext scheduler.\n"
+"\n"
+"See the top-level comment in .bpf.c for more details.\n"
+"\n"
+"Try to reduce `sysctl kernel.pid_max` if this program triggers OOMs.\n"
+"\n"
+"Usage: %s [-b BATCH]\n"
+"\n"
+"  -b BATCH      The number of tasks to batch when dispatching (default: 8)\n"
+"  -v            Print libbpf debug messages\n"
+"  -h            Display this help and exit\n";
+
+/* Defined in UAPI */
+#define SCHED_EXT 7
+
+/* Number of tasks to batch when dispatching to user space. */
+static __u32 batch_size = 8;
+
+static bool verbose;
+static volatile int exit_req;
+static int enqueued_fd, dispatched_fd;
+
+static struct scx_userland *skel;
+static struct bpf_link *ops_link;
+
+/* Stats collected in user space. */
+static __u64 nr_vruntime_enqueues, nr_vruntime_dispatches, nr_vruntime_failed;
+
+/* Number of tasks currently enqueued. */
+static __u64 nr_curr_enqueued;
+
+/* The data structure containing tasks that are enqueued in user space. */
+struct enqueued_task {
+	LIST_ENTRY(enqueued_task) entries;
+	__u64 sum_exec_runtime;
+	double vruntime;
+};
+
+/*
+ * Use a vruntime-sorted list to store tasks. This could easily be extended to
+ * a more optimal data structure, such as an rbtree as is done in CFS. We
+ * currently elect to use a sorted list to simplify the example for
+ * illustrative purposes.
+ */
+LIST_HEAD(listhead, enqueued_task);
+
+/*
+ * A vruntime-sorted list of tasks. The head of the list contains the task with
+ * the lowest vruntime. That is, the task that has the "highest" claim to be
+ * scheduled.
+ */
+static struct listhead vruntime_head = LIST_HEAD_INITIALIZER(vruntime_head);
+
+/*
+ * The main array of tasks. The array is allocated all at once during
+ * initialization, based on /proc/sys/kernel/pid_max, to avoid having to
+ * dynamically allocate memory on the enqueue path, which could cause a
+ * deadlock. A more substantive user space scheduler could e.g. provide a hook
+ * for newly enabled tasks that are passed to the scheduler from the
+ * .prep_enable() callback to allows the scheduler to allocate on safe paths.
+ */
+struct enqueued_task *tasks;
+static int pid_max;
+
+static double min_vruntime;
+
+static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args)
+{
+	if (level == LIBBPF_DEBUG && !verbose)
+		return 0;
+	return vfprintf(stderr, format, args);
+}
+
+static void sigint_handler(int userland)
+{
+	exit_req = 1;
+}
+
+static int get_pid_max(void)
+{
+	FILE *fp;
+	int pid_max;
+
+	fp = fopen("/proc/sys/kernel/pid_max", "r");
+	if (fp == NULL) {
+		fprintf(stderr, "Error opening /proc/sys/kernel/pid_max\n");
+		return -1;
+	}
+	if (fscanf(fp, "%d", &pid_max) != 1) {
+		fprintf(stderr, "Error reading from /proc/sys/kernel/pid_max\n");
+		fclose(fp);
+		return -1;
+	}
+	fclose(fp);
+
+	return pid_max;
+}
+
+static int init_tasks(void)
+{
+	pid_max = get_pid_max();
+	if (pid_max < 0)
+		return pid_max;
+
+	tasks = calloc(pid_max, sizeof(*tasks));
+	if (!tasks) {
+		fprintf(stderr, "Error allocating tasks array\n");
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static __u32 task_pid(const struct enqueued_task *task)
+{
+	return ((uintptr_t)task - (uintptr_t)tasks) / sizeof(*task);
+}
+
+static int dispatch_task(__s32 pid)
+{
+	int err;
+
+	err = bpf_map_update_elem(dispatched_fd, NULL, &pid, 0);
+	if (err) {
+		nr_vruntime_failed++;
+	} else {
+		nr_vruntime_dispatches++;
+	}
+
+	return err;
+}
+
+static struct enqueued_task *get_enqueued_task(__s32 pid)
+{
+	if (pid >= pid_max)
+		return NULL;
+
+	return &tasks[pid];
+}
+
+static double calc_vruntime_delta(__u64 weight, __u64 delta)
+{
+	double weight_f = (double)weight / 100.0;
+	double delta_f = (double)delta;
+
+	return delta_f / weight_f;
+}
+
+static void update_enqueued(struct enqueued_task *enqueued, const struct scx_userland_enqueued_task *bpf_task)
+{
+	__u64 delta;
+
+	delta = bpf_task->sum_exec_runtime - enqueued->sum_exec_runtime;
+
+	enqueued->vruntime += calc_vruntime_delta(bpf_task->weight, delta);
+	if (min_vruntime > enqueued->vruntime)
+		enqueued->vruntime = min_vruntime;
+	enqueued->sum_exec_runtime = bpf_task->sum_exec_runtime;
+}
+
+static int vruntime_enqueue(const struct scx_userland_enqueued_task *bpf_task)
+{
+	struct enqueued_task *curr, *enqueued, *prev;
+
+	curr = get_enqueued_task(bpf_task->pid);
+	if (!curr)
+		return ENOENT;
+
+	update_enqueued(curr, bpf_task);
+	nr_vruntime_enqueues++;
+	nr_curr_enqueued++;
+
+	/*
+	 * Enqueue the task in a vruntime-sorted list. A more optimal data
+	 * structure such as an rbtree could easily be used as well. We elect
+	 * to use a list here simply because it's less code, and thus the
+	 * example is less convoluted and better serves to illustrate what a
+	 * user space scheduler could look like.
+	 */
+
+	if (LIST_EMPTY(&vruntime_head)) {
+		LIST_INSERT_HEAD(&vruntime_head, curr, entries);
+		return 0;
+	}
+
+	LIST_FOREACH(enqueued, &vruntime_head, entries) {
+		if (curr->vruntime <= enqueued->vruntime) {
+			LIST_INSERT_BEFORE(enqueued, curr, entries);
+			return 0;
+		}
+		prev = enqueued;
+	}
+
+	LIST_INSERT_AFTER(prev, curr, entries);
+
+	return 0;
+}
+
+static void drain_enqueued_map(void)
+{
+	while (1) {
+		struct scx_userland_enqueued_task task;
+		int err;
+
+		if (bpf_map_lookup_and_delete_elem(enqueued_fd, NULL, &task)) {
+			skel->bss->nr_queued = 0;
+			skel->bss->nr_scheduled = nr_curr_enqueued;
+			return;
+		}
+
+		err = vruntime_enqueue(&task);
+		if (err) {
+			fprintf(stderr, "Failed to enqueue task %d: %s\n",
+				task.pid, strerror(err));
+			exit_req = 1;
+			return;
+		}
+	}
+}
+
+static void dispatch_batch(void)
+{
+	__u32 i;
+
+	for (i = 0; i < batch_size; i++) {
+		struct enqueued_task *task;
+		int err;
+		__s32 pid;
+
+		task = LIST_FIRST(&vruntime_head);
+		if (!task)
+			break;
+
+		min_vruntime = task->vruntime;
+		pid = task_pid(task);
+		LIST_REMOVE(task, entries);
+		err = dispatch_task(pid);
+		if (err) {
+			/*
+			 * If we fail to dispatch, put the task back to the
+			 * vruntime_head list and stop dispatching additional
+			 * tasks in this batch.
+			 */
+			LIST_INSERT_HEAD(&vruntime_head, task, entries);
+			break;
+		}
+		nr_curr_enqueued--;
+	}
+	skel->bss->nr_scheduled = nr_curr_enqueued;
+}
+
+static void *run_stats_printer(void *arg)
+{
+	while (!exit_req) {
+		__u64 nr_failed_enqueues, nr_kernel_enqueues, nr_user_enqueues, total;
+
+		nr_failed_enqueues = skel->bss->nr_failed_enqueues;
+		nr_kernel_enqueues = skel->bss->nr_kernel_enqueues;
+		nr_user_enqueues = skel->bss->nr_user_enqueues;
+		total = nr_failed_enqueues + nr_kernel_enqueues + nr_user_enqueues;
+
+		printf("o-----------------------o\n");
+		printf("| BPF ENQUEUES          |\n");
+		printf("|-----------------------|\n");
+		printf("|  kern:     %10llu |\n", nr_kernel_enqueues);
+		printf("|  user:     %10llu |\n", nr_user_enqueues);
+		printf("|  failed:   %10llu |\n", nr_failed_enqueues);
+		printf("|  -------------------- |\n");
+		printf("|  total:    %10llu |\n", total);
+		printf("|                       |\n");
+		printf("|-----------------------|\n");
+		printf("| VRUNTIME / USER       |\n");
+		printf("|-----------------------|\n");
+		printf("|  enq:      %10llu |\n", nr_vruntime_enqueues);
+		printf("|  disp:     %10llu |\n", nr_vruntime_dispatches);
+		printf("|  failed:   %10llu |\n", nr_vruntime_failed);
+		printf("o-----------------------o\n");
+		printf("\n\n");
+		fflush(stdout);
+		sleep(1);
+	}
+
+	return NULL;
+}
+
+static int spawn_stats_thread(void)
+{
+	pthread_t stats_printer;
+
+	return pthread_create(&stats_printer, NULL, run_stats_printer, NULL);
+}
+
+static void pre_bootstrap(int argc, char **argv)
+{
+	int err;
+	__u32 opt;
+	struct sched_param sched_param = {
+		.sched_priority = sched_get_priority_max(SCHED_EXT),
+	};
+
+	err = init_tasks();
+	if (err)
+		exit(err);
+
+	libbpf_set_print(libbpf_print_fn);
+	signal(SIGINT, sigint_handler);
+	signal(SIGTERM, sigint_handler);
+
+	/*
+	 * Enforce that the user scheduler task is managed by sched_ext. The
+	 * task eagerly drains the list of enqueued tasks in its main work
+	 * loop, and then yields the CPU. The BPF scheduler only schedules the
+	 * user space scheduler task when at least one other task in the system
+	 * needs to be scheduled.
+	 */
+	err = syscall(__NR_sched_setscheduler, getpid(), SCHED_EXT, &sched_param);
+	SCX_BUG_ON(err, "Failed to set scheduler to SCHED_EXT");
+
+	while ((opt = getopt(argc, argv, "b:vh")) != -1) {
+		switch (opt) {
+		case 'b':
+			batch_size = strtoul(optarg, NULL, 0);
+			break;
+		case 'v':
+			verbose = true;
+			break;
+		default:
+			fprintf(stderr, help_fmt, basename(argv[0]));
+			exit(opt != 'h');
+		}
+	}
+
+	/*
+	 * It's not always safe to allocate in a user space scheduler, as an
+	 * enqueued task could hold a lock that we require in order to be able
+	 * to allocate.
+	 */
+	err = mlockall(MCL_CURRENT | MCL_FUTURE);
+	SCX_BUG_ON(err, "Failed to prefault and lock address space");
+}
+
+static void bootstrap(char *comm)
+{
+	skel = SCX_OPS_OPEN(userland_ops, scx_userland);
+
+	skel->rodata->num_possible_cpus = libbpf_num_possible_cpus();
+	assert(skel->rodata->num_possible_cpus > 0);
+	skel->rodata->usersched_pid = getpid();
+	assert(skel->rodata->usersched_pid > 0);
+
+	SCX_OPS_LOAD(skel, userland_ops, scx_userland, uei);
+
+	enqueued_fd = bpf_map__fd(skel->maps.enqueued);
+	dispatched_fd = bpf_map__fd(skel->maps.dispatched);
+	assert(enqueued_fd > 0);
+	assert(dispatched_fd > 0);
+
+	SCX_BUG_ON(spawn_stats_thread(), "Failed to spawn stats thread");
+
+	ops_link = SCX_OPS_ATTACH(skel, userland_ops, scx_userland);
+}
+
+static void sched_main_loop(void)
+{
+	while (!exit_req) {
+		/*
+		 * Perform the following work in the main user space scheduler
+		 * loop:
+		 *
+		 * 1. Drain all tasks from the enqueued map, and enqueue them
+		 *    to the vruntime sorted list.
+		 *
+		 * 2. Dispatch a batch of tasks from the vruntime sorted list
+		 *    down to the kernel.
+		 *
+		 * 3. Yield the CPU back to the system. The BPF scheduler will
+		 *    reschedule the user space scheduler once another task has
+		 *    been enqueued to user space.
+		 */
+		drain_enqueued_map();
+		dispatch_batch();
+		sched_yield();
+	}
+}
+
+int main(int argc, char **argv)
+{
+	__u64 ecode;
+
+	pre_bootstrap(argc, argv);
+restart:
+	bootstrap(argv[0]);
+	sched_main_loop();
+
+	exit_req = 1;
+	bpf_link__destroy(ops_link);
+	ecode = UEI_REPORT(skel, uei);
+	scx_userland__destroy(skel);
+
+	if (UEI_ECODE_RESTART(ecode))
+		goto restart;
+	return 0;
+}
diff --git a/tools/sched_ext/scx_userland.h b/tools/sched_ext/scx_userland.h
new file mode 100644
index 000000000000..684fb2dd5de9
--- /dev/null
+++ b/tools/sched_ext/scx_userland.h
@@ -0,0 +1,17 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2022 Meta, Inc */
+
+#ifndef __SCX_USERLAND_COMMON_H
+#define __SCX_USERLAND_COMMON_H
+
+/*
+ * An instance of a task that has been enqueued by the kernel for consumption
+ * by a user space global scheduler thread.
+ */
+struct scx_userland_enqueued_task {
+	__s32 pid;
+	u64 sum_exec_runtime;
+	u64 weight;
+};
+
+#endif  // __SCX_USERLAND_COMMON_H
-- 
cgit v1.2.3


From f0262b102c7ce43f3744bdb0278ddf0d15bb1a71 Mon Sep 17 00:00:00 2001
From: Emil Tsalapatis <emil@etsalapatis.com>
Date: Thu, 22 Jan 2026 22:26:04 -0500
Subject: tools/sched_ext: add scx_pair scheduler

Add the scx_pair cgroup-based core scheduler.

Cc: Tejun Heo <tj@kernel.org>
Cc: David Vernet <dvernet@meta.com>
Signed-off-by: Emil Tsalapatis <emil@etsalapatis.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 tools/sched_ext/Makefile       |   2 +-
 tools/sched_ext/scx_pair.bpf.c | 610 +++++++++++++++++++++++++++++++++++++++++
 tools/sched_ext/scx_pair.c     | 180 ++++++++++++
 tools/sched_ext/scx_pair.h     |   9 +
 4 files changed, 800 insertions(+), 1 deletion(-)
 create mode 100644 tools/sched_ext/scx_pair.bpf.c
 create mode 100644 tools/sched_ext/scx_pair.c
 create mode 100644 tools/sched_ext/scx_pair.h

(limited to 'tools')

diff --git a/tools/sched_ext/Makefile b/tools/sched_ext/Makefile
index 12043a82a1a9..208e8f8fe4d8 100644
--- a/tools/sched_ext/Makefile
+++ b/tools/sched_ext/Makefile
@@ -189,7 +189,7 @@ $(INCLUDE_DIR)/%.bpf.skel.h: $(SCXOBJ_DIR)/%.bpf.o $(INCLUDE_DIR)/vmlinux.h $(BP
 
 SCX_COMMON_DEPS := include/scx/common.h include/scx/user_exit_info.h | $(BINDIR)
 
-c-sched-targets = scx_simple scx_cpu0 scx_qmap scx_central scx_flatcg scx_userland
+c-sched-targets = scx_simple scx_cpu0 scx_qmap scx_central scx_flatcg scx_userland scx_pair
 
 $(addprefix $(BINDIR)/,$(c-sched-targets)): \
 	$(BINDIR)/%: \
diff --git a/tools/sched_ext/scx_pair.bpf.c b/tools/sched_ext/scx_pair.bpf.c
new file mode 100644
index 000000000000..267011b57cba
--- /dev/null
+++ b/tools/sched_ext/scx_pair.bpf.c
@@ -0,0 +1,610 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * A demo sched_ext core-scheduler which always makes every sibling CPU pair
+ * execute from the same CPU cgroup.
+ *
+ * This scheduler is a minimal implementation and would need some form of
+ * priority handling both inside each cgroup and across the cgroups to be
+ * practically useful.
+ *
+ * Each CPU in the system is paired with exactly one other CPU, according to a
+ * "stride" value that can be specified when the BPF scheduler program is first
+ * loaded. Throughout the runtime of the scheduler, these CPU pairs guarantee
+ * that they will only ever schedule tasks that belong to the same CPU cgroup.
+ *
+ * Scheduler Initialization
+ * ------------------------
+ *
+ * The scheduler BPF program is first initialized from user space, before it is
+ * enabled. During this initialization process, each CPU on the system is
+ * assigned several values that are constant throughout its runtime:
+ *
+ * 1. *Pair CPU*: The CPU that it synchronizes with when making scheduling
+ *		  decisions. Paired CPUs always schedule tasks from the same
+ *		  CPU cgroup, and synchronize with each other to guarantee
+ *		  that this constraint is not violated.
+ * 2. *Pair ID*:  Each CPU pair is assigned a Pair ID, which is used to access
+ *		  a struct pair_ctx object that is shared between the pair.
+ * 3. *In-pair-index*: An index, 0 or 1, that is assigned to each core in the
+ *		       pair. Each struct pair_ctx has an active_mask field,
+ *		       which is a bitmap used to indicate whether each core
+ *		       in the pair currently has an actively running task.
+ *		       This index specifies which entry in the bitmap corresponds
+ *		       to each CPU in the pair.
+ *
+ * During this initialization, the CPUs are paired according to a "stride" that
+ * may be specified when invoking the user space program that initializes and
+ * loads the scheduler. By default, the stride is 1/2 the total number of CPUs.
+ *
+ * Tasks and cgroups
+ * -----------------
+ *
+ * Every cgroup in the system is registered with the scheduler using the
+ * pair_cgroup_init() callback, and every task in the system is associated with
+ * exactly one cgroup. At a high level, the idea with the pair scheduler is to
+ * always schedule tasks from the same cgroup within a given CPU pair. When a
+ * task is enqueued (i.e. passed to the pair_enqueue() callback function), its
+ * cgroup ID is read from its task struct, and then a corresponding queue map
+ * is used to FIFO-enqueue the task for that cgroup.
+ *
+ * If you look through the implementation of the scheduler, you'll notice that
+ * there is quite a bit of complexity involved with looking up the per-cgroup
+ * FIFO queue that we enqueue tasks in. For example, there is a cgrp_q_idx_hash
+ * BPF hash map that is used to map a cgroup ID to a globally unique ID that's
+ * allocated in the BPF program. This is done because we use separate maps to
+ * store the FIFO queue of tasks, and the length of that map, per cgroup. This
+ * complexity is only present because of current deficiencies in BPF that will
+ * soon be addressed. The main point to keep in mind is that newly enqueued
+ * tasks are added to their cgroup's FIFO queue.
+ *
+ * Dispatching tasks
+ * -----------------
+ *
+ * This section will describe how enqueued tasks are dispatched and scheduled.
+ * Tasks are dispatched in pair_dispatch(), and at a high level the workflow is
+ * as follows:
+ *
+ * 1. Fetch the struct pair_ctx for the current CPU. As mentioned above, this is
+ *    the structure that's used to synchronize amongst the two pair CPUs in their
+ *    scheduling decisions. After any of the following events have occurred:
+ *
+ * - The cgroup's slice run has expired, or
+ * - The cgroup becomes empty, or
+ * - Either CPU in the pair is preempted by a higher priority scheduling class
+ *
+ * The cgroup transitions to the draining state and stops executing new tasks
+ * from the cgroup.
+ *
+ * 2. If the pair is still executing a task, mark the pair_ctx as draining, and
+ *    wait for the pair CPU to be preempted.
+ *
+ * 3. Otherwise, if the pair CPU is not running a task, we can move onto
+ *    scheduling new tasks. Pop the next cgroup id from the top_q queue.
+ *
+ * 4. Pop a task from that cgroup's FIFO task queue, and begin executing it.
+ *
+ * Note again that this scheduling behavior is simple, but the implementation
+ * is complex mostly because this it hits several BPF shortcomings and has to
+ * work around in often awkward ways. Most of the shortcomings are expected to
+ * be resolved in the near future which should allow greatly simplifying this
+ * scheduler.
+ *
+ * Dealing with preemption
+ * -----------------------
+ *
+ * SCX is the lowest priority sched_class, and could be preempted by them at
+ * any time. To address this, the scheduler implements pair_cpu_release() and
+ * pair_cpu_acquire() callbacks which are invoked by the core scheduler when
+ * the scheduler loses and gains control of the CPU respectively.
+ *
+ * In pair_cpu_release(), we mark the pair_ctx as having been preempted, and
+ * then invoke:
+ *
+ * scx_bpf_kick_cpu(pair_cpu, SCX_KICK_PREEMPT | SCX_KICK_WAIT);
+ *
+ * This preempts the pair CPU, and waits until it has re-entered the scheduler
+ * before returning. This is necessary to ensure that the higher priority
+ * sched_class that preempted our scheduler does not schedule a task
+ * concurrently with our pair CPU.
+ *
+ * When the CPU is re-acquired in pair_cpu_acquire(), we unmark the preemption
+ * in the pair_ctx, and send another resched IPI to the pair CPU to re-enable
+ * pair scheduling.
+ *
+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
+ * Copyright (c) 2022 David Vernet <dvernet@meta.com>
+ */
+#include <scx/common.bpf.h>
+#include "scx_pair.h"
+
+char _license[] SEC("license") = "GPL";
+
+/* !0 for veristat, set during init */
+const volatile u32 nr_cpu_ids = 1;
+
+/* a pair of CPUs stay on a cgroup for this duration */
+const volatile u32 pair_batch_dur_ns;
+
+/* cpu ID -> pair cpu ID */
+const volatile s32 RESIZABLE_ARRAY(rodata, pair_cpu);
+
+/* cpu ID -> pair_id */
+const volatile u32 RESIZABLE_ARRAY(rodata, pair_id);
+
+/* CPU ID -> CPU # in the pair (0 or 1) */
+const volatile u32 RESIZABLE_ARRAY(rodata, in_pair_idx);
+
+struct pair_ctx {
+	struct bpf_spin_lock	lock;
+
+	/* the cgroup the pair is currently executing */
+	u64			cgid;
+
+	/* the pair started executing the current cgroup at */
+	u64			started_at;
+
+	/* whether the current cgroup is draining */
+	bool			draining;
+
+	/* the CPUs that are currently active on the cgroup */
+	u32			active_mask;
+
+	/*
+	 * the CPUs that are currently preempted and running tasks in a
+	 * different scheduler.
+	 */
+	u32			preempted_mask;
+};
+
+struct {
+	__uint(type, BPF_MAP_TYPE_ARRAY);
+	__type(key, u32);
+	__type(value, struct pair_ctx);
+} pair_ctx SEC(".maps");
+
+/* queue of cgrp_q's possibly with tasks on them */
+struct {
+	__uint(type, BPF_MAP_TYPE_QUEUE);
+	/*
+	 * Because it's difficult to build strong synchronization encompassing
+	 * multiple non-trivial operations in BPF, this queue is managed in an
+	 * opportunistic way so that we guarantee that a cgroup w/ active tasks
+	 * is always on it but possibly multiple times. Once we have more robust
+	 * synchronization constructs and e.g. linked list, we should be able to
+	 * do this in a prettier way but for now just size it big enough.
+	 */
+	__uint(max_entries, 4 * MAX_CGRPS);
+	__type(value, u64);
+} top_q SEC(".maps");
+
+/* per-cgroup q which FIFOs the tasks from the cgroup */
+struct cgrp_q {
+	__uint(type, BPF_MAP_TYPE_QUEUE);
+	__uint(max_entries, MAX_QUEUED);
+	__type(value, u32);
+};
+
+/*
+ * Ideally, we want to allocate cgrp_q and cgrq_q_len in the cgroup local
+ * storage; however, a cgroup local storage can only be accessed from the BPF
+ * progs attached to the cgroup. For now, work around by allocating array of
+ * cgrp_q's and then allocating per-cgroup indices.
+ *
+ * Another caveat: It's difficult to populate a large array of maps statically
+ * or from BPF. Initialize it from userland.
+ */
+struct {
+	__uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
+	__uint(max_entries, MAX_CGRPS);
+	__type(key, s32);
+	__array(values, struct cgrp_q);
+} cgrp_q_arr SEC(".maps");
+
+static u64 cgrp_q_len[MAX_CGRPS];
+
+/*
+ * This and cgrp_q_idx_hash combine into a poor man's IDR. This likely would be
+ * useful to have as a map type.
+ */
+static u32 cgrp_q_idx_cursor;
+static u64 cgrp_q_idx_busy[MAX_CGRPS];
+
+/*
+ * All added up, the following is what we do:
+ *
+ * 1. When a cgroup is enabled, RR cgroup_q_idx_busy array doing cmpxchg looking
+ *    for a free ID. If not found, fail cgroup creation with -EBUSY.
+ *
+ * 2. Hash the cgroup ID to the allocated cgrp_q_idx in the following
+ *    cgrp_q_idx_hash.
+ *
+ * 3. Whenever a cgrp_q needs to be accessed, first look up the cgrp_q_idx from
+ *    cgrp_q_idx_hash and then access the corresponding entry in cgrp_q_arr.
+ *
+ * This is sadly complicated for something pretty simple. Hopefully, we should
+ * be able to simplify in the future.
+ */
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__uint(max_entries, MAX_CGRPS);
+	__uint(key_size, sizeof(u64));		/* cgrp ID */
+	__uint(value_size, sizeof(s32));	/* cgrp_q idx */
+} cgrp_q_idx_hash SEC(".maps");
+
+/* statistics */
+u64 nr_total, nr_dispatched, nr_missing, nr_kicks, nr_preemptions;
+u64 nr_exps, nr_exp_waits, nr_exp_empty;
+u64 nr_cgrp_next, nr_cgrp_coll, nr_cgrp_empty;
+
+UEI_DEFINE(uei);
+
+void BPF_STRUCT_OPS(pair_enqueue, struct task_struct *p, u64 enq_flags)
+{
+	struct cgroup *cgrp;
+	struct cgrp_q *cgq;
+	s32 pid = p->pid;
+	u64 cgid;
+	u32 *q_idx;
+	u64 *cgq_len;
+
+	__sync_fetch_and_add(&nr_total, 1);
+
+	cgrp = scx_bpf_task_cgroup(p);
+	cgid = cgrp->kn->id;
+	bpf_cgroup_release(cgrp);
+
+	/* find the cgroup's q and push @p into it */
+	q_idx = bpf_map_lookup_elem(&cgrp_q_idx_hash, &cgid);
+	if (!q_idx) {
+		scx_bpf_error("failed to lookup q_idx for cgroup[%llu]", cgid);
+		return;
+	}
+
+	cgq = bpf_map_lookup_elem(&cgrp_q_arr, q_idx);
+	if (!cgq) {
+		scx_bpf_error("failed to lookup q_arr for cgroup[%llu] q_idx[%u]",
+			      cgid, *q_idx);
+		return;
+	}
+
+	if (bpf_map_push_elem(cgq, &pid, 0)) {
+		scx_bpf_error("cgroup[%llu] queue overflow", cgid);
+		return;
+	}
+
+	/* bump q len, if going 0 -> 1, queue cgroup into the top_q */
+	cgq_len = MEMBER_VPTR(cgrp_q_len, [*q_idx]);
+	if (!cgq_len) {
+		scx_bpf_error("MEMBER_VTPR malfunction");
+		return;
+	}
+
+	if (!__sync_fetch_and_add(cgq_len, 1) &&
+	    bpf_map_push_elem(&top_q, &cgid, 0)) {
+		scx_bpf_error("top_q overflow");
+		return;
+	}
+}
+
+static int lookup_pairc_and_mask(s32 cpu, struct pair_ctx **pairc, u32 *mask)
+{
+	u32 *vptr;
+
+	vptr = (u32 *)ARRAY_ELEM_PTR(pair_id, cpu, nr_cpu_ids);
+	if (!vptr)
+		return -EINVAL;
+
+	*pairc = bpf_map_lookup_elem(&pair_ctx, vptr);
+	if (!(*pairc))
+		return -EINVAL;
+
+	vptr = (u32 *)ARRAY_ELEM_PTR(in_pair_idx, cpu, nr_cpu_ids);
+	if (!vptr)
+		return -EINVAL;
+
+	*mask = 1U << *vptr;
+
+	return 0;
+}
+
+__attribute__((noinline))
+static int try_dispatch(s32 cpu)
+{
+	struct pair_ctx *pairc;
+	struct bpf_map *cgq_map;
+	struct task_struct *p;
+	u64 now = scx_bpf_now();
+	bool kick_pair = false;
+	bool expired, pair_preempted;
+	u32 *vptr, in_pair_mask;
+	s32 pid, q_idx;
+	u64 cgid;
+	int ret;
+
+	ret = lookup_pairc_and_mask(cpu, &pairc, &in_pair_mask);
+	if (ret) {
+		scx_bpf_error("failed to lookup pairc and in_pair_mask for cpu[%d]",
+			      cpu);
+		return -ENOENT;
+	}
+
+	bpf_spin_lock(&pairc->lock);
+	pairc->active_mask &= ~in_pair_mask;
+
+	expired = time_before(pairc->started_at + pair_batch_dur_ns, now);
+	if (expired || pairc->draining) {
+		u64 new_cgid = 0;
+
+		__sync_fetch_and_add(&nr_exps, 1);
+
+		/*
+		 * We're done with the current cgid. An obvious optimization
+		 * would be not draining if the next cgroup is the current one.
+		 * For now, be dumb and always expire.
+		 */
+		pairc->draining = true;
+
+		pair_preempted = pairc->preempted_mask;
+		if (pairc->active_mask || pair_preempted) {
+			/*
+			 * The other CPU is still active, or is no longer under
+			 * our control due to e.g. being preempted by a higher
+			 * priority sched_class. We want to wait until this
+			 * cgroup expires, or until control of our pair CPU has
+			 * been returned to us.
+			 *
+			 * If the pair controls its CPU, and the time already
+			 * expired, kick.  When the other CPU arrives at
+			 * dispatch and clears its active mask, it'll push the
+			 * pair to the next cgroup and kick this CPU.
+			 */
+			__sync_fetch_and_add(&nr_exp_waits, 1);
+			bpf_spin_unlock(&pairc->lock);
+			if (expired && !pair_preempted)
+				kick_pair = true;
+			goto out_maybe_kick;
+		}
+
+		bpf_spin_unlock(&pairc->lock);
+
+		/*
+		 * Pick the next cgroup. It'd be easier / cleaner to not drop
+		 * pairc->lock and use stronger synchronization here especially
+		 * given that we'll be switching cgroups significantly less
+		 * frequently than tasks. Unfortunately, bpf_spin_lock can't
+		 * really protect anything non-trivial. Let's do opportunistic
+		 * operations instead.
+		 */
+		bpf_repeat(BPF_MAX_LOOPS) {
+			u32 *q_idx;
+			u64 *cgq_len;
+
+			if (bpf_map_pop_elem(&top_q, &new_cgid)) {
+				/* no active cgroup, go idle */
+				__sync_fetch_and_add(&nr_exp_empty, 1);
+				return 0;
+			}
+
+			q_idx = bpf_map_lookup_elem(&cgrp_q_idx_hash, &new_cgid);
+			if (!q_idx)
+				continue;
+
+			/*
+			 * This is the only place where empty cgroups are taken
+			 * off the top_q.
+			 */
+			cgq_len = MEMBER_VPTR(cgrp_q_len, [*q_idx]);
+			if (!cgq_len || !*cgq_len)
+				continue;
+
+			/*
+			 * If it has any tasks, requeue as we may race and not
+			 * execute it.
+			 */
+			bpf_map_push_elem(&top_q, &new_cgid, 0);
+			break;
+		}
+
+		bpf_spin_lock(&pairc->lock);
+
+		/*
+		 * The other CPU may already have started on a new cgroup while
+		 * we dropped the lock. Make sure that we're still draining and
+		 * start on the new cgroup.
+		 */
+		if (pairc->draining && !pairc->active_mask) {
+			__sync_fetch_and_add(&nr_cgrp_next, 1);
+			pairc->cgid = new_cgid;
+			pairc->started_at = now;
+			pairc->draining = false;
+			kick_pair = true;
+		} else {
+			__sync_fetch_and_add(&nr_cgrp_coll, 1);
+		}
+	}
+
+	cgid = pairc->cgid;
+	pairc->active_mask |= in_pair_mask;
+	bpf_spin_unlock(&pairc->lock);
+
+	/* again, it'd be better to do all these with the lock held, oh well */
+	vptr = bpf_map_lookup_elem(&cgrp_q_idx_hash, &cgid);
+	if (!vptr) {
+		scx_bpf_error("failed to lookup q_idx for cgroup[%llu]", cgid);
+		return -ENOENT;
+	}
+	q_idx = *vptr;
+
+	/* claim one task from cgrp_q w/ q_idx */
+	bpf_repeat(BPF_MAX_LOOPS) {
+		u64 *cgq_len, len;
+
+		cgq_len = MEMBER_VPTR(cgrp_q_len, [q_idx]);
+		if (!cgq_len || !(len = *(volatile u64 *)cgq_len)) {
+			/* the cgroup must be empty, expire and repeat */
+			__sync_fetch_and_add(&nr_cgrp_empty, 1);
+			bpf_spin_lock(&pairc->lock);
+			pairc->draining = true;
+			pairc->active_mask &= ~in_pair_mask;
+			bpf_spin_unlock(&pairc->lock);
+			return -EAGAIN;
+		}
+
+		if (__sync_val_compare_and_swap(cgq_len, len, len - 1) != len)
+			continue;
+
+		break;
+	}
+
+	cgq_map = bpf_map_lookup_elem(&cgrp_q_arr, &q_idx);
+	if (!cgq_map) {
+		scx_bpf_error("failed to lookup cgq_map for cgroup[%llu] q_idx[%d]",
+			      cgid, q_idx);
+		return -ENOENT;
+	}
+
+	if (bpf_map_pop_elem(cgq_map, &pid)) {
+		scx_bpf_error("cgq_map is empty for cgroup[%llu] q_idx[%d]",
+			      cgid, q_idx);
+		return -ENOENT;
+	}
+
+	p = bpf_task_from_pid(pid);
+	if (p) {
+		__sync_fetch_and_add(&nr_dispatched, 1);
+		scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0);
+		bpf_task_release(p);
+	} else {
+		/* we don't handle dequeues, retry on lost tasks */
+		__sync_fetch_and_add(&nr_missing, 1);
+		return -EAGAIN;
+	}
+
+out_maybe_kick:
+	if (kick_pair) {
+		s32 *pair = (s32 *)ARRAY_ELEM_PTR(pair_cpu, cpu, nr_cpu_ids);
+		if (pair) {
+			__sync_fetch_and_add(&nr_kicks, 1);
+			scx_bpf_kick_cpu(*pair, SCX_KICK_PREEMPT);
+		}
+	}
+	return 0;
+}
+
+void BPF_STRUCT_OPS(pair_dispatch, s32 cpu, struct task_struct *prev)
+{
+	bpf_repeat(BPF_MAX_LOOPS) {
+		if (try_dispatch(cpu) != -EAGAIN)
+			break;
+	}
+}
+
+void BPF_STRUCT_OPS(pair_cpu_acquire, s32 cpu, struct scx_cpu_acquire_args *args)
+{
+	int ret;
+	u32 in_pair_mask;
+	struct pair_ctx *pairc;
+	bool kick_pair;
+
+	ret = lookup_pairc_and_mask(cpu, &pairc, &in_pair_mask);
+	if (ret)
+		return;
+
+	bpf_spin_lock(&pairc->lock);
+	pairc->preempted_mask &= ~in_pair_mask;
+	/* Kick the pair CPU, unless it was also preempted. */
+	kick_pair = !pairc->preempted_mask;
+	bpf_spin_unlock(&pairc->lock);
+
+	if (kick_pair) {
+		s32 *pair = (s32 *)ARRAY_ELEM_PTR(pair_cpu, cpu, nr_cpu_ids);
+
+		if (pair) {
+			__sync_fetch_and_add(&nr_kicks, 1);
+			scx_bpf_kick_cpu(*pair, SCX_KICK_PREEMPT);
+		}
+	}
+}
+
+void BPF_STRUCT_OPS(pair_cpu_release, s32 cpu, struct scx_cpu_release_args *args)
+{
+	int ret;
+	u32 in_pair_mask;
+	struct pair_ctx *pairc;
+	bool kick_pair;
+
+	ret = lookup_pairc_and_mask(cpu, &pairc, &in_pair_mask);
+	if (ret)
+		return;
+
+	bpf_spin_lock(&pairc->lock);
+	pairc->preempted_mask |= in_pair_mask;
+	pairc->active_mask &= ~in_pair_mask;
+	/* Kick the pair CPU if it's still running. */
+	kick_pair = pairc->active_mask;
+	pairc->draining = true;
+	bpf_spin_unlock(&pairc->lock);
+
+	if (kick_pair) {
+		s32 *pair = (s32 *)ARRAY_ELEM_PTR(pair_cpu, cpu, nr_cpu_ids);
+
+		if (pair) {
+			__sync_fetch_and_add(&nr_kicks, 1);
+			scx_bpf_kick_cpu(*pair, SCX_KICK_PREEMPT | SCX_KICK_WAIT);
+		}
+	}
+	__sync_fetch_and_add(&nr_preemptions, 1);
+}
+
+s32 BPF_STRUCT_OPS(pair_cgroup_init, struct cgroup *cgrp)
+{
+	u64 cgid = cgrp->kn->id;
+	s32 i, q_idx;
+
+	bpf_for(i, 0, MAX_CGRPS) {
+		q_idx = __sync_fetch_and_add(&cgrp_q_idx_cursor, 1) % MAX_CGRPS;
+		if (!__sync_val_compare_and_swap(&cgrp_q_idx_busy[q_idx], 0, 1))
+			break;
+	}
+	if (i == MAX_CGRPS)
+		return -EBUSY;
+
+	if (bpf_map_update_elem(&cgrp_q_idx_hash, &cgid, &q_idx, BPF_ANY)) {
+		u64 *busy = MEMBER_VPTR(cgrp_q_idx_busy, [q_idx]);
+		if (busy)
+			*busy = 0;
+		return -EBUSY;
+	}
+
+	return 0;
+}
+
+void BPF_STRUCT_OPS(pair_cgroup_exit, struct cgroup *cgrp)
+{
+	u64 cgid = cgrp->kn->id;
+	s32 *q_idx;
+
+	q_idx = bpf_map_lookup_elem(&cgrp_q_idx_hash, &cgid);
+	if (q_idx) {
+		u64 *busy = MEMBER_VPTR(cgrp_q_idx_busy, [*q_idx]);
+		if (busy)
+			*busy = 0;
+		bpf_map_delete_elem(&cgrp_q_idx_hash, &cgid);
+	}
+}
+
+void BPF_STRUCT_OPS(pair_exit, struct scx_exit_info *ei)
+{
+	UEI_RECORD(uei, ei);
+}
+
+SCX_OPS_DEFINE(pair_ops,
+	       .enqueue			= (void *)pair_enqueue,
+	       .dispatch		= (void *)pair_dispatch,
+	       .cpu_acquire		= (void *)pair_cpu_acquire,
+	       .cpu_release		= (void *)pair_cpu_release,
+	       .cgroup_init		= (void *)pair_cgroup_init,
+	       .cgroup_exit		= (void *)pair_cgroup_exit,
+	       .exit			= (void *)pair_exit,
+	       .name			= "pair");
diff --git a/tools/sched_ext/scx_pair.c b/tools/sched_ext/scx_pair.c
new file mode 100644
index 000000000000..d3e97faa6334
--- /dev/null
+++ b/tools/sched_ext/scx_pair.c
@@ -0,0 +1,180 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
+ * Copyright (c) 2022 David Vernet <dvernet@meta.com>
+ */
+#include <stdio.h>
+#include <unistd.h>
+#include <inttypes.h>
+#include <signal.h>
+#include <assert.h>
+#include <libgen.h>
+#include <bpf/bpf.h>
+#include <scx/common.h>
+#include "scx_pair.h"
+#include "scx_pair.bpf.skel.h"
+
+const char help_fmt[] =
+"A demo sched_ext core-scheduler which always makes every sibling CPU pair\n"
+"execute from the same CPU cgroup.\n"
+"\n"
+"See the top-level comment in .bpf.c for more details.\n"
+"\n"
+"Usage: %s [-S STRIDE]\n"
+"\n"
+"  -S STRIDE     Override CPU pair stride (default: nr_cpus_ids / 2)\n"
+"  -v            Print libbpf debug messages\n"
+"  -h            Display this help and exit\n";
+
+static bool verbose;
+static volatile int exit_req;
+
+static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args)
+{
+	if (level == LIBBPF_DEBUG && !verbose)
+		return 0;
+	return vfprintf(stderr, format, args);
+}
+
+static void sigint_handler(int dummy)
+{
+	exit_req = 1;
+}
+
+int main(int argc, char **argv)
+{
+	struct scx_pair *skel;
+	struct bpf_link *link;
+	__u64 seq = 0, ecode;
+	__s32 stride, i, opt, outer_fd;
+
+	libbpf_set_print(libbpf_print_fn);
+	signal(SIGINT, sigint_handler);
+	signal(SIGTERM, sigint_handler);
+restart:
+	skel = SCX_OPS_OPEN(pair_ops, scx_pair);
+
+	skel->rodata->nr_cpu_ids = libbpf_num_possible_cpus();
+	assert(skel->rodata->nr_cpu_ids > 0);
+	skel->rodata->pair_batch_dur_ns = __COMPAT_ENUM_OR_ZERO("scx_public_consts", "SCX_SLICE_DFL");
+
+	/* pair up the earlier half to the latter by default, override with -s */
+	stride = skel->rodata->nr_cpu_ids / 2;
+
+	while ((opt = getopt(argc, argv, "S:vh")) != -1) {
+		switch (opt) {
+		case 'S':
+			stride = strtoul(optarg, NULL, 0);
+			break;
+		case 'v':
+			verbose = true;
+			break;
+		default:
+			fprintf(stderr, help_fmt, basename(argv[0]));
+			return opt != 'h';
+		}
+	}
+
+	bpf_map__set_max_entries(skel->maps.pair_ctx, skel->rodata->nr_cpu_ids / 2);
+
+	/* Resize arrays so their element count is equal to cpu count. */
+	RESIZE_ARRAY(skel, rodata, pair_cpu, skel->rodata->nr_cpu_ids);
+	RESIZE_ARRAY(skel, rodata, pair_id, skel->rodata->nr_cpu_ids);
+	RESIZE_ARRAY(skel, rodata, in_pair_idx, skel->rodata->nr_cpu_ids);
+
+	for (i = 0; i < skel->rodata->nr_cpu_ids; i++)
+		skel->rodata_pair_cpu->pair_cpu[i] = -1;
+
+	printf("Pairs: ");
+	for (i = 0; i < skel->rodata->nr_cpu_ids; i++) {
+		int j = (i + stride) % skel->rodata->nr_cpu_ids;
+
+		if (skel->rodata_pair_cpu->pair_cpu[i] >= 0)
+			continue;
+
+		SCX_BUG_ON(i == j,
+			   "Invalid stride %d - CPU%d wants to be its own pair",
+			   stride, i);
+
+		SCX_BUG_ON(skel->rodata_pair_cpu->pair_cpu[j] >= 0,
+			   "Invalid stride %d - three CPUs (%d, %d, %d) want to be a pair",
+			   stride, i, j, skel->rodata_pair_cpu->pair_cpu[j]);
+
+		skel->rodata_pair_cpu->pair_cpu[i] = j;
+		skel->rodata_pair_cpu->pair_cpu[j] = i;
+		skel->rodata_pair_id->pair_id[i] = i;
+		skel->rodata_pair_id->pair_id[j] = i;
+		skel->rodata_in_pair_idx->in_pair_idx[i] = 0;
+		skel->rodata_in_pair_idx->in_pair_idx[j] = 1;
+
+		printf("[%d, %d] ", i, j);
+	}
+	printf("\n");
+
+	SCX_OPS_LOAD(skel, pair_ops, scx_pair, uei);
+
+	/*
+	 * Populate the cgrp_q_arr map which is an array containing per-cgroup
+	 * queues. It'd probably be better to do this from BPF but there are too
+	 * many to initialize statically and there's no way to dynamically
+	 * populate from BPF.
+	 */
+	outer_fd = bpf_map__fd(skel->maps.cgrp_q_arr);
+	SCX_BUG_ON(outer_fd < 0, "Failed to get outer_fd: %d", outer_fd);
+
+	printf("Initializing");
+        for (i = 0; i < MAX_CGRPS; i++) {
+		__s32 inner_fd;
+
+		if (exit_req)
+			break;
+
+		inner_fd = bpf_map_create(BPF_MAP_TYPE_QUEUE, NULL, 0,
+					  sizeof(__u32), MAX_QUEUED, NULL);
+		SCX_BUG_ON(inner_fd < 0, "Failed to get inner_fd: %d",
+			   inner_fd);
+		SCX_BUG_ON(bpf_map_update_elem(outer_fd, &i, &inner_fd, BPF_ANY),
+			   "Failed to set inner map");
+		close(inner_fd);
+
+		if (!(i % 10))
+			printf(".");
+		fflush(stdout);
+        }
+	printf("\n");
+
+	/*
+	 * Fully initialized, attach and run.
+	 */
+	link = SCX_OPS_ATTACH(skel, pair_ops, scx_pair);
+
+	while (!exit_req && !UEI_EXITED(skel, uei)) {
+		printf("[SEQ %llu]\n", seq++);
+		printf(" total:%10" PRIu64 " dispatch:%10" PRIu64 "   missing:%10" PRIu64 "\n",
+		       skel->bss->nr_total,
+		       skel->bss->nr_dispatched,
+		       skel->bss->nr_missing);
+		printf(" kicks:%10" PRIu64 " preemptions:%7" PRIu64 "\n",
+		       skel->bss->nr_kicks,
+		       skel->bss->nr_preemptions);
+		printf("   exp:%10" PRIu64 " exp_wait:%10" PRIu64 " exp_empty:%10" PRIu64 "\n",
+		       skel->bss->nr_exps,
+		       skel->bss->nr_exp_waits,
+		       skel->bss->nr_exp_empty);
+		printf("cgnext:%10" PRIu64 "   cgcoll:%10" PRIu64 "   cgempty:%10" PRIu64 "\n",
+		       skel->bss->nr_cgrp_next,
+		       skel->bss->nr_cgrp_coll,
+		       skel->bss->nr_cgrp_empty);
+		fflush(stdout);
+		sleep(1);
+	}
+
+	bpf_link__destroy(link);
+	ecode = UEI_REPORT(skel, uei);
+	scx_pair__destroy(skel);
+
+	if (UEI_ECODE_RESTART(ecode))
+		goto restart;
+	return 0;
+}
diff --git a/tools/sched_ext/scx_pair.h b/tools/sched_ext/scx_pair.h
new file mode 100644
index 000000000000..d9666a447d3f
--- /dev/null
+++ b/tools/sched_ext/scx_pair.h
@@ -0,0 +1,9 @@
+#ifndef __SCX_EXAMPLE_PAIR_H
+#define __SCX_EXAMPLE_PAIR_H
+
+enum {
+	MAX_QUEUED		= 4096,
+	MAX_CGRPS		= 4096,
+};
+
+#endif /* __SCX_EXAMPLE_PAIR_H */
-- 
cgit v1.2.3


From 36929ebd17ae66ed3acde9056a9daf611d81a2e5 Mon Sep 17 00:00:00 2001
From: Emil Tsalapatis <emil@etsalapatis.com>
Date: Thu, 22 Jan 2026 22:26:05 -0500
Subject: tools/sched_ext: add arena based scheduler

Add a scheduler that uses BPF arenas to manage task context data.

Signed-off-by: Emil Tsalapatis <emil@etsalapatis.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 tools/sched_ext/Makefile      |   2 +-
 tools/sched_ext/scx_sdt.bpf.c | 710 ++++++++++++++++++++++++++++++++++++++++++
 tools/sched_ext/scx_sdt.c     | 101 ++++++
 tools/sched_ext/scx_sdt.h     | 113 +++++++
 4 files changed, 925 insertions(+), 1 deletion(-)
 create mode 100644 tools/sched_ext/scx_sdt.bpf.c
 create mode 100644 tools/sched_ext/scx_sdt.c
 create mode 100644 tools/sched_ext/scx_sdt.h

(limited to 'tools')

diff --git a/tools/sched_ext/Makefile b/tools/sched_ext/Makefile
index 208e8f8fe4d8..47ad7444677e 100644
--- a/tools/sched_ext/Makefile
+++ b/tools/sched_ext/Makefile
@@ -189,7 +189,7 @@ $(INCLUDE_DIR)/%.bpf.skel.h: $(SCXOBJ_DIR)/%.bpf.o $(INCLUDE_DIR)/vmlinux.h $(BP
 
 SCX_COMMON_DEPS := include/scx/common.h include/scx/user_exit_info.h | $(BINDIR)
 
-c-sched-targets = scx_simple scx_cpu0 scx_qmap scx_central scx_flatcg scx_userland scx_pair
+c-sched-targets = scx_simple scx_cpu0 scx_qmap scx_central scx_flatcg scx_userland scx_pair scx_sdt
 
 $(addprefix $(BINDIR)/,$(c-sched-targets)): \
 	$(BINDIR)/%: \
diff --git a/tools/sched_ext/scx_sdt.bpf.c b/tools/sched_ext/scx_sdt.bpf.c
new file mode 100644
index 000000000000..48ea18614e28
--- /dev/null
+++ b/tools/sched_ext/scx_sdt.bpf.c
@@ -0,0 +1,710 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Arena-based task data scheduler. This is a variation of scx_simple
+ * that uses a combined allocator and indexing structure to organize
+ * task data. Task context allocation is done when a task enters the
+ * scheduler, while freeing is done when it exits. Task contexts are
+ * retrieved from task-local storage, pointing to the allocated memory.
+ *
+ * The main purpose of this scheduler is to demostrate arena memory
+ * management.
+ *
+ * Copyright (c) 2024-2025 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2024-2025 Emil Tsalapatis <etsal@meta.com>
+ * Copyright (c) 2024-2025 Tejun Heo <tj@kernel.org>
+ *
+ */
+#include <scx/common.bpf.h>
+#include <scx/bpf_arena_common.bpf.h>
+
+#include "scx_sdt.h"
+
+char _license[] SEC("license") = "GPL";
+
+UEI_DEFINE(uei);
+
+struct {
+	__uint(type, BPF_MAP_TYPE_ARENA);
+	__uint(map_flags, BPF_F_MMAPABLE);
+#if defined(__TARGET_ARCH_arm64) || defined(__aarch64__)
+	__uint(max_entries, 1 << 16); /* number of pages */
+        __ulong(map_extra, (1ull << 32)); /* start of mmap() region */
+#else
+	__uint(max_entries, 1 << 20); /* number of pages */
+        __ulong(map_extra, (1ull << 44)); /* start of mmap() region */
+#endif
+} arena __weak SEC(".maps");
+
+#define SHARED_DSQ 0
+
+#define DEFINE_SDT_STAT(metric)				\
+static inline void				\
+stat_inc_##metric(struct scx_stats __arena *stats)	\
+{							\
+	cast_kern(stats);				\
+	stats->metric += 1;				\
+}							\
+__u64 stat_##metric;					\
+
+DEFINE_SDT_STAT(enqueue);
+DEFINE_SDT_STAT(init);
+DEFINE_SDT_STAT(exit);
+DEFINE_SDT_STAT(select_idle_cpu);
+DEFINE_SDT_STAT(select_busy_cpu);
+
+/*
+ * Necessary for cond_break/can_loop's semantics. According to kernel commit
+ * 011832b, the loop counter variable must be seen as imprecise and bounded
+ * by the verifier. Initializing it from a constant (e.g., i = 0;), then,
+ * makes it precise and prevents may_goto from helping with converging the
+ * loop. For these loops we must initialize the loop counter from a variable
+ * whose value the verifier cannot reason about when checking the program, so
+ * that the loop counter's value is imprecise.
+ */
+static __u64 zero = 0;
+
+/*
+ * XXX Hack to get the verifier to find the arena for sdt_exit_task.
+ * As of 6.12-rc5, The verifier associates arenas with programs by
+ * checking LD.IMM instruction operands for an arena and populating
+ * the program state with the first instance it finds. This requires
+ * accessing our global arena variable, but scx methods do not necessarily
+ * do so while still using pointers from that arena. Insert a bpf_printk
+ * statement that triggers at most once to generate an LD.IMM instruction
+ * to access the arena and help the verifier.
+ */
+static volatile bool scx_arena_verify_once;
+
+__hidden void scx_arena_subprog_init(void)
+{
+	if (scx_arena_verify_once)
+		return;
+
+	bpf_printk("%s: arena pointer %p", __func__, &arena);
+	scx_arena_verify_once = true;
+}
+
+
+private(LOCK) struct bpf_spin_lock alloc_lock;
+private(POOL_LOCK) struct bpf_spin_lock alloc_pool_lock;
+
+/* allocation pools */
+struct sdt_pool desc_pool;
+struct sdt_pool chunk_pool;
+
+/* Protected by alloc_lock. */
+struct scx_alloc_stats alloc_stats;
+
+
+/* Allocate element from the pool. Must be called with a then pool lock held. */
+static
+void __arena *scx_alloc_from_pool(struct sdt_pool *pool)
+{
+	__u64 elem_size, max_elems;
+	void __arena *slab;
+	void __arena *ptr;
+
+	elem_size = pool->elem_size;
+	max_elems = pool->max_elems;
+
+	/* If the chunk is spent, get a new one. */
+	if (pool->idx >= max_elems) {
+		slab = bpf_arena_alloc_pages(&arena, NULL,
+			div_round_up(max_elems * elem_size, PAGE_SIZE), NUMA_NO_NODE, 0);
+		if (!slab)
+			return NULL;
+
+		pool->slab = slab;
+		pool->idx = 0;
+	}
+
+	ptr = (void __arena *)((__u64) pool->slab + elem_size * pool->idx);
+	pool->idx += 1;
+
+	return ptr;
+}
+
+/* Alloc desc and associated chunk. Called with the allocator spinlock held. */
+static sdt_desc_t *scx_alloc_chunk(void)
+{
+	struct sdt_chunk __arena *chunk;
+	sdt_desc_t *desc;
+	sdt_desc_t *out;
+
+	chunk = scx_alloc_from_pool(&chunk_pool);
+	if (!chunk)
+		return NULL;
+
+	desc = scx_alloc_from_pool(&desc_pool);
+	if (!desc) {
+		/*
+		 * Effectively frees the previous chunk allocation.
+		 * Index cannot be 0, so decrementing is always
+		 * valid.
+		 */
+		chunk_pool.idx -= 1;
+		return NULL;
+	}
+
+	out = desc;
+
+	desc->nr_free = SDT_TASK_ENTS_PER_CHUNK;
+	desc->chunk = chunk;
+
+	alloc_stats.chunk_allocs += 1;
+
+	return out;
+}
+
+static int pool_set_size(struct sdt_pool *pool, __u64 data_size, __u64 nr_pages)
+{
+	if (unlikely(data_size % 8))
+		return -EINVAL;
+
+	if (unlikely(nr_pages == 0))
+		return -EINVAL;
+
+	pool->elem_size = data_size;
+	pool->max_elems = (PAGE_SIZE * nr_pages) / pool->elem_size;
+	/* Populate the pool slab on the first allocation. */
+	pool->idx = pool->max_elems;
+
+	return 0;
+}
+
+/* Initialize both the base pool allocators and the root chunk of the index. */
+__hidden int
+scx_alloc_init(struct scx_allocator *alloc, __u64 data_size)
+{
+	size_t min_chunk_size;
+	int ret;
+
+	_Static_assert(sizeof(struct sdt_chunk) <= PAGE_SIZE,
+		"chunk size must fit into a page");
+
+	ret = pool_set_size(&chunk_pool, sizeof(struct sdt_chunk), 1);
+	if (ret != 0)
+		return ret;
+
+	ret = pool_set_size(&desc_pool, sizeof(struct sdt_desc), 1);
+	if (ret != 0)
+		return ret;
+
+	/* Wrap data into a descriptor and word align. */
+	data_size += sizeof(struct sdt_data);
+	data_size = round_up(data_size, 8);
+
+	/*
+	 * Ensure we allocate large enough chunks from the arena to avoid excessive
+	 * internal fragmentation when turning chunks it into structs.
+	 */
+	min_chunk_size = div_round_up(SDT_TASK_MIN_ELEM_PER_ALLOC * data_size, PAGE_SIZE);
+	ret = pool_set_size(&alloc->pool, data_size, min_chunk_size);
+	if (ret != 0)
+		return ret;
+
+	bpf_spin_lock(&alloc_lock);
+	alloc->root = scx_alloc_chunk();
+	bpf_spin_unlock(&alloc_lock);
+	if (!alloc->root)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static
+int set_idx_state(sdt_desc_t *desc, __u64 pos, bool state)
+{
+	__u64 __arena *allocated = desc->allocated;
+	__u64 bit;
+
+	if (unlikely(pos >= SDT_TASK_ENTS_PER_CHUNK))
+		return -EINVAL;
+
+	bit = (__u64)1 << (pos % 64);
+
+	if (state)
+		allocated[pos / 64] |= bit;
+	else
+		allocated[pos / 64] &= ~bit;
+
+	return 0;
+}
+
+static __noinline
+int mark_nodes_avail(sdt_desc_t *lv_desc[SDT_TASK_LEVELS], __u64 lv_pos[SDT_TASK_LEVELS])
+{
+	sdt_desc_t *desc;
+	__u64 u, level;
+	int ret;
+
+	for (u = zero; u < SDT_TASK_LEVELS && can_loop; u++) {
+		level = SDT_TASK_LEVELS - 1 - u;
+
+		/* Only propagate upwards if we are the parent's only free chunk. */
+		desc = lv_desc[level];
+
+		ret = set_idx_state(desc, lv_pos[level], false);
+		if (unlikely(ret != 0))
+			return ret;
+
+		desc->nr_free += 1;
+		if (desc->nr_free > 1)
+			return 0;
+	}
+
+	return 0;
+}
+
+/*
+ * Free the allocated struct with the given index. Called with the
+ * allocator lock taken.
+ */
+__hidden
+int scx_alloc_free_idx(struct scx_allocator *alloc, __u64 idx)
+{
+	const __u64 mask = (1 << SDT_TASK_ENTS_PER_PAGE_SHIFT) - 1;
+	sdt_desc_t *lv_desc[SDT_TASK_LEVELS];
+	sdt_desc_t * __arena *desc_children;
+	struct sdt_chunk __arena *chunk;
+	sdt_desc_t *desc;
+	struct sdt_data __arena *data;
+	__u64 level, shift, pos;
+	__u64 lv_pos[SDT_TASK_LEVELS];
+	int ret;
+	int i;
+
+	if (!alloc)
+		return 0;
+
+	desc = alloc->root;
+	if (unlikely(!desc))
+		return -EINVAL;
+
+	/* To appease the verifier. */
+	for (level = zero; level < SDT_TASK_LEVELS && can_loop; level++) {
+		lv_desc[level] = NULL;
+		lv_pos[level] = 0;
+	}
+
+	/* Find the leaf node containing the index. */
+	for (level = zero; level < SDT_TASK_LEVELS && can_loop; level++) {
+		shift = (SDT_TASK_LEVELS - 1 - level) * SDT_TASK_ENTS_PER_PAGE_SHIFT;
+		pos = (idx >> shift) & mask;
+
+		lv_desc[level] = desc;
+		lv_pos[level] = pos;
+
+		if (level == SDT_TASK_LEVELS - 1)
+			break;
+
+		chunk = desc->chunk;
+
+		desc_children = (sdt_desc_t * __arena *)chunk->descs;
+		desc = desc_children[pos];
+
+		if (unlikely(!desc))
+			return -EINVAL;
+	}
+
+	chunk = desc->chunk;
+
+	pos = idx & mask;
+	data = chunk->data[pos];
+	if (likely(data)) {
+		data[pos] = (struct sdt_data) {
+			.tid.genn = data->tid.genn + 1,
+		};
+
+		/* Zero out one word at a time. */
+		for (i = zero; i < alloc->pool.elem_size / 8 && can_loop; i++) {
+			data->payload[i] = 0;
+		}
+	}
+
+	ret = mark_nodes_avail(lv_desc, lv_pos);
+	if (unlikely(ret != 0))
+		return ret;
+
+	alloc_stats.active_allocs -= 1;
+	alloc_stats.free_ops += 1;
+
+	return 0;
+}
+
+static inline
+int ffs(__u64 word)
+{
+	unsigned int num = 0;
+
+	if ((word & 0xffffffff) == 0) {
+		num += 32;
+		word >>= 32;
+	}
+
+	if ((word & 0xffff) == 0) {
+		num += 16;
+		word >>= 16;
+	}
+
+	if ((word & 0xff) == 0) {
+		num += 8;
+		word >>= 8;
+	}
+
+	if ((word & 0xf) == 0) {
+		num += 4;
+		word >>= 4;
+	}
+
+	if ((word & 0x3) == 0) {
+		num += 2;
+		word >>= 2;
+	}
+
+	if ((word & 0x1) == 0) {
+		num += 1;
+		word >>= 1;
+	}
+
+	return num;
+}
+
+
+/* find the first empty slot */
+__hidden
+__u64 chunk_find_empty(sdt_desc_t __arg_arena *desc)
+{
+	__u64 freeslots;
+	__u64 i;
+
+	for (i = 0; i < SDT_TASK_CHUNK_BITMAP_U64S; i++) {
+		freeslots = ~desc->allocated[i];
+		if (freeslots == (__u64)0)
+			continue;
+
+		return (i * 64) + ffs(freeslots);
+	}
+
+	return SDT_TASK_ENTS_PER_CHUNK;
+}
+
+/*
+ * Find and return an available idx on the allocator.
+ * Called with the task spinlock held.
+ */
+static sdt_desc_t * desc_find_empty(sdt_desc_t *desc, __u64 *idxp)
+{
+	sdt_desc_t *lv_desc[SDT_TASK_LEVELS];
+	sdt_desc_t * __arena *desc_children;
+	struct sdt_chunk __arena *chunk;
+	sdt_desc_t *tmp;
+	__u64 lv_pos[SDT_TASK_LEVELS];
+	__u64 u, pos, level;
+	__u64 idx = 0;
+	int ret;
+
+	for (level = zero; level < SDT_TASK_LEVELS && can_loop; level++) {
+		pos = chunk_find_empty(desc);
+
+		/* If we error out, something has gone very wrong. */
+		if (unlikely(pos > SDT_TASK_ENTS_PER_CHUNK))
+			return NULL;
+
+		if (pos == SDT_TASK_ENTS_PER_CHUNK)
+			return NULL;
+
+		idx <<= SDT_TASK_ENTS_PER_PAGE_SHIFT;
+		idx |= pos;
+
+		/* Log the levels to complete allocation. */
+		lv_desc[level] = desc;
+		lv_pos[level] = pos;
+
+		/* The rest of the loop is for internal node traversal. */
+		if (level == SDT_TASK_LEVELS - 1)
+			break;
+
+		/* Allocate an internal node if necessary. */
+		chunk = desc->chunk;
+		desc_children = (sdt_desc_t * __arena *)chunk->descs;
+
+		desc = desc_children[pos];
+		if (!desc) {
+			desc = scx_alloc_chunk();
+			if (!desc)
+				return NULL;
+
+			desc_children[pos] = desc;
+		}
+	}
+
+	/*
+	 * Finding the descriptor along with any internal node
+	 * allocations was successful. Update all levels with
+	 * the new allocation.
+	 */
+	bpf_for(u, 0, SDT_TASK_LEVELS) {
+		level = SDT_TASK_LEVELS - 1 - u;
+		tmp = lv_desc[level];
+
+		ret = set_idx_state(tmp, lv_pos[level], true);
+		if (ret != 0)
+			break;
+
+		tmp->nr_free -= 1;
+		if (tmp->nr_free > 0)
+			break;
+	}
+
+	*idxp = idx;
+
+	return desc;
+}
+
+__hidden
+void __arena *scx_alloc(struct scx_allocator *alloc)
+{
+	struct sdt_data __arena *data = NULL;
+	struct sdt_chunk __arena *chunk;
+	sdt_desc_t *desc;
+	__u64 idx, pos;
+
+	if (!alloc)
+		return NULL;
+
+	bpf_spin_lock(&alloc_lock);
+
+	/* We unlock if we encounter an error in the function. */
+	desc = desc_find_empty(alloc->root, &idx);
+	if (unlikely(desc == NULL)) {
+		bpf_spin_unlock(&alloc_lock);
+		return NULL;
+	}
+
+	chunk = desc->chunk;
+
+	/* Populate the leaf node if necessary. */
+	pos = idx & (SDT_TASK_ENTS_PER_CHUNK - 1);
+	data = chunk->data[pos];
+	if (!data) {
+		data = scx_alloc_from_pool(&alloc->pool);
+		if (!data) {
+			scx_alloc_free_idx(alloc, idx);
+			bpf_spin_unlock(&alloc_lock);
+			return NULL;
+		}
+	}
+
+	chunk->data[pos] = data;
+
+	/* The data counts as a chunk */
+	alloc_stats.data_allocs += 1;
+	alloc_stats.alloc_ops += 1;
+	alloc_stats.active_allocs += 1;
+
+	data->tid.idx = idx;
+
+	bpf_spin_unlock(&alloc_lock);
+
+	return data;
+}
+
+/*
+ * Task BPF map entry recording the task's assigned ID and pointing to the data
+ * area allocated in arena.
+ */
+struct scx_task_map_val {
+	union sdt_id		tid;
+	__u64			tptr;
+	struct sdt_data __arena	*data;
+};
+
+struct {
+	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
+	__uint(map_flags, BPF_F_NO_PREALLOC);
+	__type(key, int);
+	__type(value, struct scx_task_map_val);
+} scx_task_map SEC(".maps");
+
+static struct scx_allocator scx_task_allocator;
+
+__hidden
+void __arena *scx_task_alloc(struct task_struct *p)
+{
+	struct sdt_data __arena *data = NULL;
+	struct scx_task_map_val *mval;
+
+	mval = bpf_task_storage_get(&scx_task_map, p, 0,
+				    BPF_LOCAL_STORAGE_GET_F_CREATE);
+	if (!mval)
+		return NULL;
+
+	data = scx_alloc(&scx_task_allocator);
+	if (unlikely(!data))
+		return NULL;
+
+	mval->tid = data->tid;
+	mval->tptr = (__u64) p;
+	mval->data = data;
+
+	return (void __arena *)data->payload;
+}
+
+__hidden
+int scx_task_init(__u64 data_size)
+{
+	return scx_alloc_init(&scx_task_allocator, data_size);
+}
+
+__hidden
+void __arena *scx_task_data(struct task_struct *p)
+{
+	struct sdt_data __arena *data;
+	struct scx_task_map_val *mval;
+
+	scx_arena_subprog_init();
+
+	mval = bpf_task_storage_get(&scx_task_map, p, 0, 0);
+	if (!mval)
+		return NULL;
+
+	data = mval->data;
+
+	return (void __arena *)data->payload;
+}
+
+__hidden
+void scx_task_free(struct task_struct *p)
+{
+	struct scx_task_map_val *mval;
+
+	scx_arena_subprog_init();
+
+	mval = bpf_task_storage_get(&scx_task_map, p, 0, 0);
+	if (!mval)
+		return;
+
+	bpf_spin_lock(&alloc_lock);
+	scx_alloc_free_idx(&scx_task_allocator, mval->tid.idx);
+	bpf_spin_unlock(&alloc_lock);
+
+	bpf_task_storage_delete(&scx_task_map, p);
+}
+
+static inline void
+scx_stat_global_update(struct scx_stats __arena *stats)
+{
+	cast_kern(stats);
+	__sync_fetch_and_add(&stat_enqueue, stats->enqueue);
+	__sync_fetch_and_add(&stat_init, stats->init);
+	__sync_fetch_and_add(&stat_exit, stats->exit);
+	__sync_fetch_and_add(&stat_select_idle_cpu, stats->select_idle_cpu);
+	__sync_fetch_and_add(&stat_select_busy_cpu, stats->select_busy_cpu);
+}
+
+s32 BPF_STRUCT_OPS(sdt_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags)
+{
+	struct scx_stats __arena *stats;
+	bool is_idle = false;
+	s32 cpu;
+
+	stats = scx_task_data(p);
+	if (!stats) {
+		scx_bpf_error("%s: no stats for pid %d", __func__, p->pid);
+		return 0;
+	}
+
+	cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle);
+	if (is_idle) {
+		stat_inc_select_idle_cpu(stats);
+		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
+	} else {
+		stat_inc_select_busy_cpu(stats);
+	}
+
+	return cpu;
+}
+
+void BPF_STRUCT_OPS(sdt_enqueue, struct task_struct *p, u64 enq_flags)
+{
+	struct scx_stats __arena *stats;
+
+	stats = scx_task_data(p);
+	if (!stats) {
+		scx_bpf_error("%s: no stats for pid %d", __func__, p->pid);
+		return;
+	}
+
+	stat_inc_enqueue(stats);
+
+	scx_bpf_dsq_insert(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags);
+}
+
+void BPF_STRUCT_OPS(sdt_dispatch, s32 cpu, struct task_struct *prev)
+{
+	scx_bpf_dsq_move_to_local(SHARED_DSQ);
+}
+
+s32 BPF_STRUCT_OPS_SLEEPABLE(sdt_init_task, struct task_struct *p,
+			     struct scx_init_task_args *args)
+{
+	struct scx_stats __arena *stats;
+
+	stats = scx_task_alloc(p);
+	if (!stats) {
+		scx_bpf_error("arena allocator out of memory");
+		return -ENOMEM;
+	}
+
+	stats->pid = p->pid;
+
+	stat_inc_init(stats);
+
+	return 0;
+}
+
+void BPF_STRUCT_OPS(sdt_exit_task, struct task_struct *p,
+			      struct scx_exit_task_args *args)
+{
+	struct scx_stats __arena *stats;
+
+	stats = scx_task_data(p);
+	if (!stats) {
+		scx_bpf_error("%s: no stats for pid %d", __func__, p->pid);
+		return;
+	}
+
+	stat_inc_exit(stats);
+	scx_stat_global_update(stats);
+
+	scx_task_free(p);
+}
+
+s32 BPF_STRUCT_OPS_SLEEPABLE(sdt_init)
+{
+	int ret;
+
+	ret = scx_task_init(sizeof(struct scx_stats));
+	if (ret < 0) {
+		scx_bpf_error("%s: failed with %d", __func__, ret);
+		return ret;
+	}
+
+	return scx_bpf_create_dsq(SHARED_DSQ, -1);
+}
+
+void BPF_STRUCT_OPS(sdt_exit, struct scx_exit_info *ei)
+{
+	UEI_RECORD(uei, ei);
+}
+
+SCX_OPS_DEFINE(sdt_ops,
+	       .select_cpu		= (void *)sdt_select_cpu,
+	       .enqueue			= (void *)sdt_enqueue,
+	       .dispatch		= (void *)sdt_dispatch,
+	       .init_task		= (void *)sdt_init_task,
+	       .exit_task		= (void *)sdt_exit_task,
+	       .init			= (void *)sdt_init,
+	       .exit			= (void *)sdt_exit,
+	       .name			= "sdt");
diff --git a/tools/sched_ext/scx_sdt.c b/tools/sched_ext/scx_sdt.c
new file mode 100644
index 000000000000..b0363363476d
--- /dev/null
+++ b/tools/sched_ext/scx_sdt.c
@@ -0,0 +1,101 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2024 Emil Tsalapatis <etsal@meta.com>
+ * Copyright (c) 2024 Tejun Heo <tj@kernel.org>
+ * Copyright (c) 2022 David Vernet <dvernet@meta.com>
+ */
+#include <stdio.h>
+#include <unistd.h>
+#include <signal.h>
+#include <libgen.h>
+#include <bpf/bpf.h>
+#include <scx/common.h>
+
+#include "scx_sdt.h"
+#include "scx_sdt.bpf.skel.h"
+
+const char help_fmt[] =
+"A simple arena-based sched_ext scheduler.\n"
+"\n"
+"Modified version of scx_simple that demonstrates arena-based data structures.\n"
+"\n"
+"Usage: %s [-f] [-v]\n"
+"\n"
+"  -v            Print libbpf debug messages\n"
+"  -h            Display this help and exit\n";
+
+static bool verbose;
+static volatile int exit_req;
+
+static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args)
+{
+	if (level == LIBBPF_DEBUG && !verbose)
+		return 0;
+	return vfprintf(stderr, format, args);
+}
+
+static void sigint_handler(int sig)
+{
+	exit_req = 1;
+}
+
+int main(int argc, char **argv)
+{
+	struct scx_sdt *skel;
+	struct bpf_link *link;
+	__u32 opt;
+	__u64 ecode;
+
+	libbpf_set_print(libbpf_print_fn);
+	signal(SIGINT, sigint_handler);
+	signal(SIGTERM, sigint_handler);
+restart:
+	skel = SCX_OPS_OPEN(sdt_ops, scx_sdt);
+
+	while ((opt = getopt(argc, argv, "fvh")) != -1) {
+		switch (opt) {
+		case 'v':
+			verbose = true;
+			break;
+		default:
+			fprintf(stderr, help_fmt, basename(argv[0]));
+			return opt != 'h';
+		}
+	}
+
+	SCX_OPS_LOAD(skel, sdt_ops, scx_sdt, uei);
+	link = SCX_OPS_ATTACH(skel, sdt_ops, scx_sdt);
+
+	while (!exit_req && !UEI_EXITED(skel, uei)) {
+		printf("====SCHEDULING STATS====\n");
+		printf("enqueues=%llu\t", skel->bss->stat_enqueue);
+		printf("inits=%llu\t", skel->bss->stat_init);
+		printf("exits=%llu\t", skel->bss->stat_exit);
+		printf("\n");
+
+		printf("select_idle_cpu=%llu\t", skel->bss->stat_select_idle_cpu);
+		printf("select_busy_cpu=%llu\t", skel->bss->stat_select_busy_cpu);
+		printf("\n");
+
+		printf("====ALLOCATION STATS====\n");
+		printf("chunk allocs=%llu\t", skel->bss->alloc_stats.chunk_allocs);
+		printf("data_allocs=%llu\n", skel->bss->alloc_stats.data_allocs);
+		printf("alloc_ops=%llu\t", skel->bss->alloc_stats.alloc_ops);
+		printf("free_ops=%llu\t", skel->bss->alloc_stats.free_ops);
+		printf("active_allocs=%llu\t", skel->bss->alloc_stats.active_allocs);
+		printf("arena_pages_used=%llu\t", skel->bss->alloc_stats.arena_pages_used);
+		printf("\n\n");
+
+		fflush(stdout);
+		sleep(1);
+	}
+
+	bpf_link__destroy(link);
+	ecode = UEI_REPORT(skel, uei);
+	scx_sdt__destroy(skel);
+
+	if (UEI_ECODE_RESTART(ecode))
+		goto restart;
+	return 0;
+}
diff --git a/tools/sched_ext/scx_sdt.h b/tools/sched_ext/scx_sdt.h
new file mode 100644
index 000000000000..67982ce9bc9b
--- /dev/null
+++ b/tools/sched_ext/scx_sdt.h
@@ -0,0 +1,113 @@
+/*
+ * SPDX-License-Identifier: GPL-2.0
+ * Copyright (c) 2025 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2025 Tejun Heo <tj@kernel.org>
+ * Copyright (c) 2025 Emil Tsalapatis <etsal@meta.com>
+ */
+#pragma once
+
+#ifndef __BPF__
+#define __arena
+#endif /* __BPF__ */
+
+struct scx_alloc_stats {
+	__u64		chunk_allocs;
+	__u64		data_allocs;
+	__u64		alloc_ops;
+	__u64		free_ops;
+	__u64		active_allocs;
+	__u64		arena_pages_used;
+};
+
+struct sdt_pool {
+	void __arena	*slab;
+	__u64		elem_size;
+	__u64		max_elems;
+	__u64		idx;
+};
+
+#ifndef div_round_up
+#define div_round_up(a, b) (((a) + (b) - 1) / (b))
+#endif
+
+#ifndef round_up
+#define round_up(a, b) (div_round_up((a), (b)) * (b))
+#endif
+
+typedef struct sdt_desc __arena sdt_desc_t;
+
+enum sdt_consts {
+	SDT_TASK_ENTS_PER_PAGE_SHIFT	= 9,
+	SDT_TASK_LEVELS			= 3,
+	SDT_TASK_ENTS_PER_CHUNK		= 1 << SDT_TASK_ENTS_PER_PAGE_SHIFT,
+	SDT_TASK_CHUNK_BITMAP_U64S	= div_round_up(SDT_TASK_ENTS_PER_CHUNK, 64),
+	SDT_TASK_MIN_ELEM_PER_ALLOC 	= 8,
+};
+
+union sdt_id {
+	__s64				val;
+	struct {
+		__s32			idx;	/* index in the radix tree */
+		__s32			genn;	/* ++'d on recycle so that it forms unique'ish 64bit ID */
+	};
+};
+
+struct sdt_chunk;
+
+/*
+ * Each index page is described by the following descriptor which carries the
+ * bitmap. This way the actual index can host power-of-two numbers of entries
+ * which makes indexing cheaper.
+ */
+struct sdt_desc {
+	__u64				allocated[SDT_TASK_CHUNK_BITMAP_U64S];
+	__u64				nr_free;
+	struct sdt_chunk __arena	*chunk;
+};
+
+/*
+ * Leaf node containing per-task data.
+ */
+struct sdt_data {
+	union sdt_id			tid;
+	__u64				payload[];
+};
+
+/*
+ * Intermediate node pointing to another intermediate node or leaf node.
+ */
+struct sdt_chunk {
+	union {
+		sdt_desc_t * descs[SDT_TASK_ENTS_PER_CHUNK];
+		struct sdt_data __arena *data[SDT_TASK_ENTS_PER_CHUNK];
+	};
+};
+
+struct scx_allocator {
+	struct sdt_pool	pool;
+	sdt_desc_t	*root;
+};
+
+struct scx_stats {
+	int	seq;
+	pid_t	pid;
+	__u64	enqueue;
+	__u64	exit;
+	__u64	init;
+	__u64	select_busy_cpu;
+	__u64	select_idle_cpu;
+};
+
+#ifdef __BPF__
+
+void __arena *scx_task_data(struct task_struct *p);
+int scx_task_init(__u64 data_size);
+void __arena *scx_task_alloc(struct task_struct *p);
+void scx_task_free(struct task_struct *p);
+void scx_arena_subprog_init(void);
+
+int scx_alloc_init(struct scx_allocator *alloc, __u64 data_size);
+u64 scx_alloc_internal(struct scx_allocator *alloc);
+int scx_alloc_free_idx(struct scx_allocator *alloc, __u64 idx);
+
+#endif /* __BPF__ */
-- 
cgit v1.2.3


From 166e664e702ed96b83df2a87c1ea2138a995b604 Mon Sep 17 00:00:00 2001
From: Junjie Cao <junjie.cao@intel.com>
Date: Mon, 26 Jan 2026 14:15:31 +0800
Subject: selftests: ptp: use KSFT_SKIP exit code for skip scenarios

The kselftest framework defines KSFT_SKIP=4 as the standard exit code
for skipped tests. However, phc.sh currently uses a mix of 'exit 0' and
'exit 1' to indicate skip conditions, which can confuse test harnesses
and CI systems.

This patch introduces ksft_skip=4 variable and unifies all skip exit
paths to use 'exit $ksft_skip', consistent with other selftests like
net/lib.sh and net/fib_nexthops.sh.

Signed-off-by: Junjie Cao <junjie.cao@intel.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://patch.msgid.link/20260126061532.12532-1-junjie.cao@intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/ptp/phc.sh | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/ptp/phc.sh b/tools/testing/selftests/ptp/phc.sh
index ac6e5a6e1d3a..51aad466d989 100755
--- a/tools/testing/selftests/ptp/phc.sh
+++ b/tools/testing/selftests/ptp/phc.sh
@@ -8,17 +8,20 @@ ALL_TESTS="
 "
 DEV=$1
 
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+
 ##############################################################################
 # Sanity checks
 
 if [[ "$(id -u)" -ne 0 ]]; then
 	echo "SKIP: need root privileges"
-	exit 0
+	exit $ksft_skip
 fi
 
 if [[ "$DEV" == "" ]]; then
 	echo "SKIP: PTP device not provided"
-	exit 0
+	exit $ksft_skip
 fi
 
 require_command()
@@ -27,7 +30,7 @@ require_command()
 
 	if [[ ! -x "$(command -v "$cmd")" ]]; then
 		echo "SKIP: $cmd not installed"
-		exit 1
+		exit $ksft_skip
 	fi
 }
 
@@ -37,7 +40,7 @@ phc_sanity()
 
 	if [ $? != 0 ]; then
 		echo "SKIP: unknown clock $DEV: No such device"
-		exit 1
+		exit $ksft_skip
 	fi
 }
 
-- 
cgit v1.2.3


From 239f09e258b906deced5c2a7c1ac8aed301b558b Mon Sep 17 00:00:00 2001
From: Junjie Cao <junjie.cao@intel.com>
Date: Mon, 26 Jan 2026 14:15:32 +0800
Subject: selftests: ptp: treat unsupported PHC operations as skip

Some PTP hardware clock (PHC) devices may return -EOPNOTSUPP for
operations like settime, adjtime, or adjfreq. This commonly occurs
with timestamp-only PHC implementations that don't support full clock
control.

For background, syzbot previously exposed a crash risk when PTP clock
drivers lacked required callbacks[1]. Subsequent work[2] made callback
presence a registration requirement. As a result, some drivers (like
iwlwifi MVM/MLD[3]) now provide stub callbacks that return -EOPNOTSUPP
for unsupported operations.

When phc_ctl encounters such devices, the "Operation not supported"
error should be treated as a skip (device limitation) rather than a
test failure. This patch:
- Adds [SKIP] output handling in log_test()
- Detects "Operation not supported" from phc_ctl and returns ksft_skip
- Returns ksft_skip if all tests are skipped, preventing false-positive
  results when testing timestamp-only PHC implementations

Link: https://lore.kernel.org/netdev/20251028043216.1971292-1-junjie.cao@intel.com/ [1]
Link: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=dfb073d32cac [2]
Link: https://lore.kernel.org/netdev/20251204123204.9316-1-ziyao@disroot.org/ [3]
Signed-off-by: Junjie Cao <junjie.cao@intel.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://patch.msgid.link/20260126061532.12532-2-junjie.cao@intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/ptp/phc.sh | 49 ++++++++++++++++++++++++++++----------
 1 file changed, 37 insertions(+), 12 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/ptp/phc.sh b/tools/testing/selftests/ptp/phc.sh
index 51aad466d989..9f61c1579edf 100755
--- a/tools/testing/selftests/ptp/phc.sh
+++ b/tools/testing/selftests/ptp/phc.sh
@@ -52,6 +52,7 @@ phc_sanity
 
 # Exit status to return at the end. Set in case one of the tests fails.
 EXIT_STATUS=0
+PASS_COUNT=0
 # Per-test return value. Clear at the beginning of each test.
 RET=0
 
@@ -68,12 +69,18 @@ log_test()
 {
 	local test_name=$1
 
+	if [[ $RET -eq $ksft_skip ]]; then
+		printf "TEST: %-60s  [SKIP]\n" "$test_name"
+		return 0
+	fi
+
 	if [[ $RET -ne 0 ]]; then
 		EXIT_STATUS=1
 		printf "TEST: %-60s  [FAIL]\n" "$test_name"
 		return 1
 	fi
 
+	((PASS_COUNT++))
 	printf "TEST: %-60s  [ OK ]\n" "$test_name"
 	return 0
 }
@@ -92,34 +99,49 @@ tests_run()
 
 settime_do()
 {
-	local res
+	local res out
 
-	res=$(phc_ctl $DEV set 0 wait 120.5 get 2> /dev/null \
-		| awk '/clock time is/{print $5}' \
-		| awk -F. '{print $1}')
+	out=$(LC_ALL=C phc_ctl $DEV set 0 wait 120.5 get 2>&1)
+	if [[ $? -ne 0 ]]; then
+		if echo "$out" | grep -qi "Operation not supported"; then
+			return $ksft_skip
+		fi
+		return 1
+	fi
+	res=$(echo "$out" | awk '/clock time is/{print $5}' | awk -F. '{print $1}')
 
 	(( res == 120 ))
 }
 
 adjtime_do()
 {
-	local res
+	local res out
 
-	res=$(phc_ctl $DEV set 0 adj 10 get 2> /dev/null \
-		| awk '/clock time is/{print $5}' \
-		| awk -F. '{print $1}')
+	out=$(LC_ALL=C phc_ctl $DEV set 0 adj 10 get 2>&1)
+	if [[ $? -ne 0 ]]; then
+		if echo "$out" | grep -qi "Operation not supported"; then
+			return $ksft_skip
+		fi
+		return 1
+	fi
+	res=$(echo "$out" | awk '/clock time is/{print $5}' | awk -F. '{print $1}')
 
 	(( res == 10 ))
 }
 
 adjfreq_do()
 {
-	local res
+	local res out
 
 	# Set the clock to be 1% faster
-	res=$(phc_ctl $DEV freq 10000000 set 0 wait 100.5 get 2> /dev/null \
-		| awk '/clock time is/{print $5}' \
-		| awk -F. '{print $1}')
+	out=$(LC_ALL=C phc_ctl $DEV freq 10000000 set 0 wait 100.5 get 2>&1)
+	if [[ $? -ne 0 ]]; then
+		if echo "$out" | grep -qi "Operation not supported"; then
+			return $ksft_skip
+		fi
+		return 1
+	fi
+	res=$(echo "$out" | awk '/clock time is/{print $5}' | awk -F. '{print $1}')
 
 	(( res == 101 ))
 }
@@ -166,4 +188,7 @@ trap cleanup EXIT
 
 tests_run
 
+if [[ $EXIT_STATUS -eq 0 && $PASS_COUNT -eq 0 ]]; then
+	exit $ksft_skip
+fi
 exit $EXIT_STATUS
-- 
cgit v1.2.3


From 944e3f7562c55fa37ebcdd58e5f60f296c81a854 Mon Sep 17 00:00:00 2001
From: Marco Elver <elver@google.com>
Date: Tue, 27 Jan 2026 12:12:06 +0100
Subject: tools: Update context analysis macros in compiler_types.h
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In sync with the main kernel headers, include a stub version of
compiler-context-analysis.h in tools/include/linux/compiler_types.h and
remove the sparse context tracking definitions.

Since tools/ headers are generally self-contained, provide a standalone
tools/include/linux/compiler-context-analysis.h with no-op stubs for now. Also
clean up redundant stubs in tools/testing/shared/linux/kernel.h that are now
redundant.

This fixes build errors in tools/testing/radix-tree/ where headers from
include/linux/ (like cleanup.h) are used directly and expect these
macros to be defined:

| cc -I../shared -I. -I../../include -I../../arch/x86/include -I../../../lib -g -Og -Wall -D_LGPL_SOURCE -fsanitize=address -fsanitize=undefined    -c -o radix-tree.o radix-tree.c
| In file included from ../shared/linux/cleanup.h:2,
|                  from ../shared/linux/../../../../include/linux/idr.h:18,
|                  from ../shared/linux/idr.h:5,
|                  from radix-tree.c:18:
| ../shared/linux/../../../../include/linux/idr.h: In function ‘class_idr_alloc_destructor’:
| ../shared/linux/../../../../include/linux/cleanup.h:283:9: error: expected declaration specifiers before ‘__no_context_analysis’
|   283 |         __no_context_analysis                                           \
|       |         ^~~~~~~~~~~~~~~~~~~~~

Closes: https://lore.kernel.org/oe-lkp/202601261546.d7ae2447-lkp@intel.com
Reported-by: kernel test robot <oliver.sang@intel.com>
Signed-off-by: Marco Elver <elver@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Tested-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Link: https://patch.msgid.link/20260127111428.3747328-1-elver@google.com
---
 tools/include/linux/compiler-context-analysis.h | 42 +++++++++++++++++++++++++
 tools/include/linux/compiler_types.h            | 16 +---------
 tools/testing/shared/linux/kernel.h             |  4 ---
 3 files changed, 43 insertions(+), 19 deletions(-)
 create mode 100644 tools/include/linux/compiler-context-analysis.h

(limited to 'tools')

diff --git a/tools/include/linux/compiler-context-analysis.h b/tools/include/linux/compiler-context-analysis.h
new file mode 100644
index 000000000000..13a9115e9e58
--- /dev/null
+++ b/tools/include/linux/compiler-context-analysis.h
@@ -0,0 +1,42 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _TOOLS_LINUX_COMPILER_CONTEXT_ANALYSIS_H
+#define _TOOLS_LINUX_COMPILER_CONTEXT_ANALYSIS_H
+
+/*
+ * Macros and attributes for compiler-based static context analysis.
+ * No-op stubs for tools.
+ */
+
+#define __guarded_by(...)
+#define __pt_guarded_by(...)
+
+#define context_lock_struct(name, ...)	struct __VA_ARGS__ name
+
+#define __no_context_analysis
+#define __context_unsafe(comment)
+#define context_unsafe(...)		({ __VA_ARGS__; })
+#define context_unsafe_alias(p)
+#define disable_context_analysis()
+#define enable_context_analysis()
+
+#define __must_hold(...)
+#define __must_not_hold(...)
+#define __acquires(...)
+#define __cond_acquires(ret, x)
+#define __releases(...)
+#define __acquire(x)			(void)0
+#define __release(x)			(void)0
+
+#define __must_hold_shared(...)
+#define __acquires_shared(...)
+#define __cond_acquires_shared(ret, x)
+#define __releases_shared(...)
+#define __acquire_shared(x)		(void)0
+#define __release_shared(x)		(void)0
+
+#define __acquire_ret(call, expr)	(call)
+#define __acquire_shared_ret(call, expr) (call)
+#define __acquires_ret
+#define __acquires_shared_ret
+
+#endif /* _TOOLS_LINUX_COMPILER_CONTEXT_ANALYSIS_H */
diff --git a/tools/include/linux/compiler_types.h b/tools/include/linux/compiler_types.h
index 067a5b4e0f7b..14e420467eee 100644
--- a/tools/include/linux/compiler_types.h
+++ b/tools/include/linux/compiler_types.h
@@ -13,21 +13,7 @@
 #define __has_builtin(x) (0)
 #endif
 
-#ifdef __CHECKER__
-/* context/locking */
-# define __must_hold(x)	__attribute__((context(x,1,1)))
-# define __acquires(x)	__attribute__((context(x,0,1)))
-# define __releases(x)	__attribute__((context(x,1,0)))
-# define __acquire(x)	__context__(x,1)
-# define __release(x)	__context__(x,-1)
-#else /* __CHECKER__ */
-/* context/locking */
-# define __must_hold(x)
-# define __acquires(x)
-# define __releases(x)
-# define __acquire(x)	(void)0
-# define __release(x)	(void)0
-#endif /* __CHECKER__ */
+#include <linux/compiler-context-analysis.h>
 
 /* Compiler specific macros. */
 #ifdef __GNUC__
diff --git a/tools/testing/shared/linux/kernel.h b/tools/testing/shared/linux/kernel.h
index c0a2bb785b92..dc2b4ccfb185 100644
--- a/tools/testing/shared/linux/kernel.h
+++ b/tools/testing/shared/linux/kernel.h
@@ -21,9 +21,5 @@
 #define schedule()
 #define PAGE_SHIFT	12
 
-#define __acquires(x)
-#define __releases(x)
-#define __must_hold(x)
-
 #define EXPORT_PER_CPU_SYMBOL_GPL(x)
 #endif /* _KERNEL_H */
-- 
cgit v1.2.3


From 96e004b4bdf9029d137a1b06de1606ff6e263ab8 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Tue, 27 Jan 2026 16:16:14 +0000
Subject: kselftest/arm64: Add a no-SVE loop after SVE in fp-pidbench

Some applications use SVE intermittently, one common case being where SVE
is used during statup (eg, by ld.so) but then rarely if ever during the
main application runtime. Add a repeat of the no SVE loop after we've done
the SVE loops to fp-pidbench to capture results for that.

Signed-off-by: Mark Brown <broonie@kernel.org>
Signed-off-by: Will Deacon <will@kernel.org>
---
 tools/testing/selftests/arm64/fp/fp-pidbench.S | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/arm64/fp/fp-pidbench.S b/tools/testing/selftests/arm64/fp/fp-pidbench.S
index 73830f6bc99b..aeeadc7873dc 100644
--- a/tools/testing/selftests/arm64/fp/fp-pidbench.S
+++ b/tools/testing/selftests/arm64/fp/fp-pidbench.S
@@ -63,6 +63,10 @@ function _start
 	puts	"SVE used per syscall: "
 	test_loop "rdvl x0, #8"
 
+	// Test non-SVE execution after SVE
+	puts	"No SVE after SVE: "
+	test_loop
+
 	//  And we're done
 out:
 	mov	x0, #0
-- 
cgit v1.2.3


From b661d753ce2ee951558db1f2c7b97f32d9431966 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Tue, 27 Jan 2026 16:16:15 +0000
Subject: kselftest/arm64: Raise default number of loops in fp-pidbench

When fp-pidbench was originally written SVE hardware was not widely
available so it was useful to run it in emulation and the default number
of loops was set very low, running for less than a second on actual
hardware. Now that SVE hardware is reasonably available it is very much
less interesting to use emulation, bump the default number of loops up to
even out a bit of the noise on real systems. On the machine I have to hand
this now takes about 15s which is still a toy microbenchmark but perhaps a
bit more useful.

Signed-off-by: Mark Brown <broonie@kernel.org>
Signed-off-by: Will Deacon <will@kernel.org>
---
 tools/testing/selftests/arm64/fp/fp-pidbench.S | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/arm64/fp/fp-pidbench.S b/tools/testing/selftests/arm64/fp/fp-pidbench.S
index aeeadc7873dc..881dfa3b342e 100644
--- a/tools/testing/selftests/arm64/fp/fp-pidbench.S
+++ b/tools/testing/selftests/arm64/fp/fp-pidbench.S
@@ -33,7 +33,7 @@
 function _start
 	puts	"Iterations per test: "
 	mov	x20, #10000
-	lsl	x20, x20, #8
+	lsl	x20, x20, #12
 	mov	x0, x20
 	bl	putdec
 	puts	"\n"
-- 
cgit v1.2.3


From 2a85bbaed06bec2b97dec15bb01cbdbf81dce3e3 Mon Sep 17 00:00:00 2001
From: Swapnil Sapkal <swapnil.sapkal@amd.com>
Date: Tue, 27 Jan 2026 18:49:55 +0000
Subject: perf header: Replace hardcoded max cpus by MAX_NR_CPUS

cpumask and cpulist from cpu-domain header have hardcoded max_cpus value
of 1024.

Current systems have more cpus than this value. Replace it with
MAX_NR_CPUS.

Also define a macro to represent domain name length.

Fixes: d40c68a49f69c9bd ("perf header: Support CPU DOMAIN relation info")
Reported-by: Shrikanth Hegde <sshegde@linux.ibm.com>
Reviewed-by: Shrikanth Hegde <sshegde@linux.ibm.com>
Signed-off-by: Swapnil Sapkal <swapnil.sapkal@amd.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Anubhav Shelat <ashelat@redhat.com>
Cc: Chen Yu <yu.c.chen@intel.com>
Cc: Gautham Shenoy <gautham.shenoy@amd.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@arm.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ravi Bangoria <ravi.bangoria@amd.com>
Cc: Thomas Falcon <thomas.falcon@intel.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/header.c | 5 +++--
 tools/perf/util/util.c   | 3 ++-
 2 files changed, 5 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index eefd1cd73b6a..31c3bab1b10a 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -76,6 +76,7 @@ static const u64 __perf_magic2    = 0x32454c4946524550ULL;
 static const u64 __perf_magic2_sw = 0x50455246494c4532ULL;
 
 #define PERF_MAGIC	__perf_magic2
+#define DNAME_LEN	16
 
 const char perf_version_string[] = PERF_VERSION;
 
@@ -1616,10 +1617,10 @@ static int write_pmu_caps(struct feat_fd *ff,
 
 struct cpu_domain_map **build_cpu_domain_map(u32 *schedstat_version, u32 *max_sched_domains, u32 nr)
 {
+	char dname[DNAME_LEN], cpumask[MAX_NR_CPUS];
 	struct domain_info *domain_info;
 	struct cpu_domain_map **cd_map;
-	char dname[16], cpumask[256];
-	char cpulist[1024];
+	char cpulist[MAX_NR_CPUS];
 	char *line = NULL;
 	u32 cpu, domain;
 	u32 dcount = 0;
diff --git a/tools/perf/util/util.c b/tools/perf/util/util.c
index 03a603fbcd7d..94f5a2ece245 100644
--- a/tools/perf/util/util.c
+++ b/tools/perf/util/util.c
@@ -1,4 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
+#include "perf.h"
 #include "util.h"
 #include "debug.h"
 #include "event.h"
@@ -262,7 +263,7 @@ void cpumask_to_cpulist(char *cpumask, char *cpulist)
 	int i, j, bm_size, nbits;
 	int len = strlen(cpumask);
 	unsigned long *bm;
-	char cpus[1024];
+	char cpus[MAX_NR_CPUS];
 
 	for (i = 0; i < len; i++) {
 		if (cpumask[i] == ',') {
-- 
cgit v1.2.3


From 05134d15375ce9fc57a91453999729d861efe9f9 Mon Sep 17 00:00:00 2001
From: Swapnil Sapkal <swapnil.sapkal@amd.com>
Date: Tue, 27 Jan 2026 18:49:56 +0000
Subject: perf util: Fix NULL check in cpumask_to_cpulist()

The function cpumask_to_cpulist() allocates memory with calloc() and
stores the result in 'bm', but then incorrectly checks 'cpumask' for
NULL instead of 'bm'.

This means that if the allocation fails, the function will dereference a
NULL pointer when trying to access 'bm'.

Fix the check to test the correct variable 'bm'.

Fixes: d40c68a49f69c9bd ("perf header: Support CPU DOMAIN relation info")
Reviewed-by: Shrikanth Hegde <sshegde@linux.ibm.com>
Signed-off-by: Swapnil Sapkal <swapnil.sapkal@amd.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Anubhav Shelat <ashelat@redhat.com>
Cc: Chen Yu <yu.c.chen@intel.com>
Cc: Gautham Shenoy <gautham.shenoy@amd.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@arm.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ravi Bangoria <ravi.bangoria@amd.com>
Cc: Thomas Falcon <thomas.falcon@intel.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/util.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/perf/util/util.c b/tools/perf/util/util.c
index 94f5a2ece245..8b893de35f77 100644
--- a/tools/perf/util/util.c
+++ b/tools/perf/util/util.c
@@ -279,7 +279,7 @@ void cpumask_to_cpulist(char *cpumask, char *cpulist)
 		return;
 
 	bm = calloc(bm_size, sizeof(unsigned long));
-	if (!cpumask)
+	if (!bm)
 		goto free_bm;
 
 	for (i = 0; i < bm_size; i++) {
-- 
cgit v1.2.3


From b03b95b4d71edbb10bfb1588dc59cfea8d13796c Mon Sep 17 00:00:00 2001
From: Swapnil Sapkal <swapnil.sapkal@amd.com>
Date: Tue, 27 Jan 2026 18:49:57 +0000
Subject: perf sched stats: Add NULL check for cd_map

In perf_sched__schedstat_live(), build_cpu_domain_map() returns the
pointer to cpu_domain_map which can also be NULL.

Add NULL check for the same to avoid NULL pointer dereference.

Fixes: 00093b3133984ffe ("perf sched stats: Add support for live mode")
Reviewed-by: Shrikanth Hegde <sshegde@linux.ibm.com>
Signed-off-by: Swapnil Sapkal <swapnil.sapkal@amd.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Anubhav Shelat <ashelat@redhat.com>
Cc: Chen Yu <yu.c.chen@intel.com>
Cc: Gautham Shenoy <gautham.shenoy@amd.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@arm.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ravi Bangoria <ravi.bangoria@amd.com>
Cc: Thomas Falcon <thomas.falcon@intel.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-sched.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'tools')

diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index b190e928117c..f5e449bd6823 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -4714,6 +4714,11 @@ static int perf_sched__schedstat_live(struct perf_sched *sched,
 
 	nr = cpu__max_present_cpu().cpu;
 	cd_map = build_cpu_domain_map(&sv, &md, nr);
+	if (!cd_map) {
+		pr_err("Unable to generate cpu-domain relation info");
+		goto out;
+	}
+
 	show_schedstat_data(&cpu_head, cd_map, NULL, NULL, false);
 	free_cpu_domain_info(cd_map, sv, nr);
 out:
-- 
cgit v1.2.3


From 7284dc7e19fab1ff425579cbb4721fff292ac70e Mon Sep 17 00:00:00 2001
From: Swapnil Sapkal <swapnil.sapkal@amd.com>
Date: Tue, 27 Jan 2026 18:49:58 +0000
Subject: perf sched stats: correct spelling of function name

Replace store_schedtstat_cpu_diff() with store_schedstat_cpu_diff()

Reviewed-by: Shrikanth Hegde <sshegde@linux.ibm.com>
Signed-off-by: Swapnil Sapkal <swapnil.sapkal@amd.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Anubhav Shelat <ashelat@redhat.com>
Cc: Chen Yu <yu.c.chen@intel.com>
Cc: Gautham Shenoy <gautham.shenoy@amd.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@arm.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ravi Bangoria <ravi.bangoria@amd.com>
Cc: Thomas Falcon <thomas.falcon@intel.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-sched.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index f5e449bd6823..1a24c4869331 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -3946,7 +3946,7 @@ static struct schedstat_domain *domain_second_pass;
 static bool after_workload_flag;
 static bool verbose_field;
 
-static void store_schedtstat_cpu_diff(struct schedstat_cpu *after_workload)
+static void store_schedstat_cpu_diff(struct schedstat_cpu *after_workload)
 {
 	struct perf_record_schedstat_cpu *before = cpu_second_pass->cpu_data;
 	struct perf_record_schedstat_cpu *after = after_workload->cpu_data;
@@ -4437,7 +4437,7 @@ static int perf_sched__process_schedstat(const struct perf_tool *tool __maybe_un
 			}
 			domain_second_pass = list_first_entry(&cpu_second_pass->domain_head,
 							      struct schedstat_domain, domain_list);
-			store_schedtstat_cpu_diff(temp);
+			store_schedstat_cpu_diff(temp);
 		}
 	} else if (event->header.type == PERF_RECORD_SCHEDSTAT_DOMAIN) {
 		struct schedstat_cpu *cpu_tail;
-- 
cgit v1.2.3


From f7dc49645346d9d47825b60c4557da6885d48037 Mon Sep 17 00:00:00 2001
From: Swapnil Sapkal <swapnil.sapkal@amd.com>
Date: Tue, 27 Jan 2026 18:49:59 +0000
Subject: perf sched stats: Define macro for SEP_LEN

Define a macro for separator length of the line in perf sched stats
report.

Reviewed-by: Shrikanth Hegde <sshegde@linux.ibm.com>
Signed-off-by: Swapnil Sapkal <swapnil.sapkal@amd.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Anubhav Shelat <ashelat@redhat.com>
Cc: Chen Yu <yu.c.chen@intel.com>
Cc: Gautham Shenoy <gautham.shenoy@amd.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@arm.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ravi Bangoria <ravi.bangoria@amd.com>
Cc: Thomas Falcon <thomas.falcon@intel.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-sched.c | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index 1a24c4869331..3f509cfdd58c 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -55,6 +55,7 @@
 #define SYM_LEN			129
 #define MAX_PID			1024000
 #define MAX_PRIO		140
+#define SEP_LEN			100
 
 static const char *cpu_list;
 static struct perf_cpu_map *user_requested_cpus;
@@ -3997,7 +3998,7 @@ static inline void print_cpu_stats(struct perf_record_schedstat_cpu *cs1,
 		       "PCT_CHANGE1", "PCT_CHANGE2");
 
 	printf("\n");
-	print_separator2(100, "", 0);
+	print_separator2(SEP_LEN, "", 0);
 
 #define CALC_PCT(_x, _y)	((_y) ? ((double)(_x) / (_y)) * 100 : 0.0)
 
@@ -4047,8 +4048,8 @@ static inline void print_domain_stats(struct perf_record_schedstat_domain *ds1,
 #define DOMAIN_CATEGORY(_desc)							\
 	do {									\
 		size_t _len = strlen(_desc);					\
-		size_t _pre_dash_cnt = (100 - _len) / 2;			\
-		size_t _post_dash_cnt = 100 - _len - _pre_dash_cnt;		\
+		size_t _pre_dash_cnt = (SEP_LEN - _len) / 2;			\
+		size_t _post_dash_cnt = SEP_LEN - _len - _pre_dash_cnt;		\
 		print_separator2((int)_pre_dash_cnt, _desc, (int)_post_dash_cnt);\
 	} while (0)
 
@@ -4238,14 +4239,14 @@ static int show_schedstat_data(struct list_head *head1, struct cpu_domain_map **
 	int ret = 0;
 
 	printf("Description\n");
-	print_separator2(100, "", 0);
+	print_separator2(SEP_LEN, "", 0);
 	printf("%-30s-> %s\n", "DESC", "Description of the field");
 	printf("%-30s-> %s\n", "COUNT", "Value of the field");
 	printf("%-30s-> %s\n", "PCT_CHANGE", "Percent change with corresponding base value");
 	printf("%-30s-> %s\n", "AVG_JIFFIES",
 	       "Avg time in jiffies between two consecutive occurrence of event");
 
-	print_separator2(100, "", 0);
+	print_separator2(SEP_LEN, "", 0);
 	printf("\n");
 
 	printf("%-65s: ", "Time elapsed (in jiffies)");
@@ -4286,16 +4287,16 @@ static int show_schedstat_data(struct list_head *head1, struct cpu_domain_map **
 			return -1;
 		}
 
-		print_separator2(100, "", 0);
+		print_separator2(SEP_LEN, "", 0);
 
 		if (is_summary)
 			printf("CPU: <ALL CPUS SUMMARY>\n");
 		else
 			printf("CPU: %d\n", cs1->cpu);
 
-		print_separator2(100, "", 0);
+		print_separator2(SEP_LEN, "", 0);
 		print_cpu_stats(cs1, cs2);
-		print_separator2(100, "", 0);
+		print_separator2(SEP_LEN, "", 0);
 
 		list_for_each_entry(dptr1, &cptr1->domain_head, domain_list) {
 			struct domain_info *dinfo1 = NULL, *dinfo2 = NULL;
@@ -4329,9 +4330,9 @@ static int show_schedstat_data(struct list_head *head1, struct cpu_domain_map **
 
 				printf("%s\n", dinfo1->cpulist);
 			}
-			print_separator2(100, "", 0);
+			print_separator2(SEP_LEN, "", 0);
 			print_domain_stats(ds1, ds2, jiffies1, jiffies2);
-			print_separator2(100, "", 0);
+			print_separator2(SEP_LEN, "", 0);
 
 			if (dptr2)
 				dptr2 = list_next_entry(dptr2, domain_list);
-- 
cgit v1.2.3


From 34b0a58eef04a49376e4103efc5b09f1e33e594a Mon Sep 17 00:00:00 2001
From: Swapnil Sapkal <swapnil.sapkal@amd.com>
Date: Tue, 27 Jan 2026 18:50:00 +0000
Subject: perf sched stats: Fixes in man page

Fix the incorrect description of the schedstats report. Also fix the
spelling errors in man page.

Fixes: 800af362d68945e5 ("perf sched stats: Add details in man page")
Reviewed-by: Shrikanth Hegde <sshegde@linux.ibm.com>
Reported-by: Shrikanth Hegde <sshegde@linux.ibm.com>
Signed-off-by: Swapnil Sapkal <swapnil.sapkal@amd.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Anubhav Shelat <ashelat@redhat.com>
Cc: Chen Yu <yu.c.chen@intel.com>
Cc: Gautham Shenoy <gautham.shenoy@amd.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@arm.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ravi Bangoria <ravi.bangoria@amd.com>
Cc: Thomas Falcon <thomas.falcon@intel.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/Documentation/perf-sched.txt | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/Documentation/perf-sched.txt b/tools/perf/Documentation/perf-sched.txt
index 5bfb7bb6c633..4d9981609c04 100644
--- a/tools/perf/Documentation/perf-sched.txt
+++ b/tools/perf/Documentation/perf-sched.txt
@@ -86,7 +86,7 @@ There are several variants of 'perf sched':
    exposed through the file ``/proc/schedstat``. These counters are enabled or disabled
    via the sysctl governed by the file ``/proc/sys/kernel/sched_schedstats``. These
    counters accounts for many scheduler events such as ``schedule()`` calls, load-balancing
-   events, ``try_to_wakeup()`` call among others. This is useful in understading the
+   events, ``try_to_wakeup()`` call among others. This is useful in understanding the
    scheduler behavior for the workload.
 
    Note: The tool will not give correct results if there is topological reordering or
@@ -100,7 +100,7 @@ There are several variants of 'perf sched':
    A detailed description of the schedstats can be found in the Kernel Documentation:
    https://www.kernel.org/doc/html/latest/scheduler/sched-stats.html
 
-   The result can be interprested as follows:
+   The result can be interpreted as follows:
 
    The `perf sched stats report` starts with description of the columns present in
    the report. These column names are given before cpu and domain stats to improve
@@ -116,7 +116,7 @@ There are several variants of 'perf sched':
    Next is the total profiling time in terms of jiffies:
 
    ----------------------------------------------------------------------------------------------------
-   Time elapsed (in jiffies)                                   :       24537
+   Time elapsed (in jiffies)                                   :        2323
    ----------------------------------------------------------------------------------------------------
 
    Next is CPU scheduling statistics. These are simple diffs of /proc/schedstat CPU lines
@@ -210,7 +210,7 @@ There are several variants of 'perf sched':
 
    While profiling was active, the load-balancer found 28490 times the load
    needs to be balanced on a newly idle CPU 0. Following value encapsulated
-   inside $ is average jiffies between two events (28490 / 24537 = 0.08).
+   inside $ is average jiffies between two events (2323 / 28490 = 0.08).
 
    Next are active_load_balance() stats. alb did not trigger while the
    profiling was active, hence it's all 0s.
-- 
cgit v1.2.3


From a537c0da168a08b0b6a7f7bd9e75f4cc8d45ff57 Mon Sep 17 00:00:00 2001
From: Leo Yan <leo.yan@arm.com>
Date: Fri, 23 Jan 2026 13:32:03 +0000
Subject: tools: Fix bitfield dependency failure
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A perf build failure was reported by Thomas Voegtle on stable kernel
v6.6.120:

    CC      tests/sample-parsing.o
    CC      util/intel-pt-decoder/intel-pt-pkt-decoder.o
    CC      util/perf-regs-arch/perf_regs_csky.o
    CC      util/arm-spe-decoder/arm-spe-pkt-decoder.o
    CC      util/perf-regs-arch/perf_regs_loongarch.o
  In file included from util/arm-spe-decoder/arm-spe-pkt-decoder.h:10,
                   from util/arm-spe-decoder/arm-spe-pkt-decoder.c:14:
  /local/git/linux-stable-rc/tools/include/linux/bitfield.h: In function ‘le16_encode_bits’:
  /local/git/linux-stable-rc/tools/include/linux/bitfield.h:166:31: error: implicit declaration of
  function ‘cpu_to_le16’; did you mean ‘htole16’? [-Werror=implicit-function-declaration]
    ____MAKE_OP(le##size,u##size,cpu_to_le##size,le##size##_to_cpu) \
                                 ^~~~~~~~~
  /local/git/linux-stable-rc/tools/include/linux/bitfield.h:149:9: note: in definition of macro
  ‘____MAKE_OP’
    return to((v & field_mask(field)) * field_multiplier(field)); \
           ^~
  /local/git/linux-stable-rc/tools/include/linux/bitfield.h:170:1: note: in expansion of macro
  ‘__MAKE_OP’
   __MAKE_OP(16)

Fix this by including linux/kernel.h, which provides the required
definitions.

The issue was not found on the mainline due to the relevant C files have
included kernel.h.  It'd be good to merge this change on mainline
as well for robustness.

Closes: https://lore.kernel.org/stable/3a44500b-d7c8-179f-61f6-e51cb50d3512@lio96.de/
Fixes: 64d86c03e1441742 ("perf arm-spe: Extend branch operations")
Reported-by: Hamza Mahfooz <hamzamahfooz@linux.microsoft.com>
Reported-by: Thomas Voegtle <tv@lio96.de>
Signed-off-by: Leo Yan <leo.yan@arm.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Ian Rogers <irogers@google.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Leo Yan <leo.yan@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
To: Sasha Levin <sashal@kernel.org>
Cc: stable@vger.kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/include/linux/bitfield.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'tools')

diff --git a/tools/include/linux/bitfield.h b/tools/include/linux/bitfield.h
index 6093fa6db260..ddf81f24956b 100644
--- a/tools/include/linux/bitfield.h
+++ b/tools/include/linux/bitfield.h
@@ -8,6 +8,7 @@
 #define _LINUX_BITFIELD_H
 
 #include <linux/build_bug.h>
+#include <linux/kernel.h>
 #include <asm/byteorder.h>
 
 /*
-- 
cgit v1.2.3


From 7a0ba3891104da77cfd1a16d41699e0fdf45603a Mon Sep 17 00:00:00 2001
From: Leo Yan <leo.yan@arm.com>
Date: Fri, 23 Jan 2026 13:32:04 +0000
Subject: perf: Remove redundant kernel.h include

Now that the bitfield dependency is resolved, the explicit inclusion of
kernel.h is no longer needed.

Remove the redundant include.

Signed-off-by: Leo Yan <leo.yan@arm.com>
Cc: Ian Rogers <irogers@google.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/arch/arm64/util/header.c | 1 -
 tools/perf/util/cs-etm.c            | 1 -
 2 files changed, 2 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/arch/arm64/util/header.c b/tools/perf/arch/arm64/util/header.c
index f445a2dd6293..cbc0ba101636 100644
--- a/tools/perf/arch/arm64/util/header.c
+++ b/tools/perf/arch/arm64/util/header.c
@@ -1,4 +1,3 @@
-#include <linux/kernel.h>
 #include <linux/bits.h>
 #include <linux/bitfield.h>
 #include <stdio.h>
diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c
index 12b55c2bc2ca..95f439c96180 100644
--- a/tools/perf/util/cs-etm.c
+++ b/tools/perf/util/cs-etm.c
@@ -6,7 +6,6 @@
  * Author: Mathieu Poirier <mathieu.poirier@linaro.org>
  */
 
-#include <linux/kernel.h>
 #include <linux/bitfield.h>
 #include <linux/bitops.h>
 #include <linux/coresight-pmu.h>
-- 
cgit v1.2.3


From 19eab0efe72f02516b9f194a6ad10e7c83a009ae Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 27 Jan 2026 10:44:32 -0800
Subject: perf jevents: Build support for generating metrics from python

Generate extra-metrics.json and extra-metricgroups.json from python
architecture specific scripts. The metrics themselves will be added in
later patches.

If a build takes place in tools/perf/ then extra-metrics.json and
extra-metricgroups.json are generated in that directory and so added
to .gitignore.

If there is an OUTPUT directory then the tools/perf/pmu-events/arch
files are copied to it so the generated extra-metrics.json and
extra-metricgroups.json can be added/generated there.

Signed-off-by: Ian Rogers <irogers@google.com>
Tested-by: Thomas Falcon <thomas.falcon@intel.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Benjamin Gray <bgray@linux.ibm.com>
Cc: Caleb Biggers <caleb.biggers@intel.com>
Cc: Edward Baker <edward.baker@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jing Zhang <renyu.zj@linux.alibaba.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Leo Yan <leo.yan@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Perry Taylor <perry.taylor@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sandipan Das <sandipan.das@amd.com>
Cc: Weilin Wang <weilin.wang@intel.com>
Cc: Xu Yang <xu.yang_2@nxp.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/.gitignore                  |  5 ++++
 tools/perf/Makefile.perf               |  2 ++
 tools/perf/pmu-events/Build            | 52 +++++++++++++++++++++++++++++++++-
 tools/perf/pmu-events/amd_metrics.py   | 42 +++++++++++++++++++++++++++
 tools/perf/pmu-events/arm64_metrics.py | 43 ++++++++++++++++++++++++++++
 tools/perf/pmu-events/intel_metrics.py | 42 +++++++++++++++++++++++++++
 6 files changed, 185 insertions(+), 1 deletion(-)
 create mode 100755 tools/perf/pmu-events/amd_metrics.py
 create mode 100755 tools/perf/pmu-events/arm64_metrics.py
 create mode 100755 tools/perf/pmu-events/intel_metrics.py

(limited to 'tools')

diff --git a/tools/perf/.gitignore b/tools/perf/.gitignore
index 5c59f954f52a..0f9451a6e39c 100644
--- a/tools/perf/.gitignore
+++ b/tools/perf/.gitignore
@@ -43,6 +43,11 @@ pmu-events/metric_test.log
 pmu-events/empty-pmu-events.log
 pmu-events/test-empty-pmu-events.c
 *.shellcheck_log
+pmu-events/arch/**/extra-metrics.json
+pmu-events/arch/**/extra-metricgroups.json
+tests/shell/*.shellcheck_log
+tests/shell/coresight/*.shellcheck_log
+tests/shell/lib/*.shellcheck_log
 feature/
 libapi/
 libbpf/
diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf
index 45d5a59a02cb..b6edc8100c8e 100644
--- a/tools/perf/Makefile.perf
+++ b/tools/perf/Makefile.perf
@@ -1273,6 +1273,8 @@ ifeq ($(OUTPUT),)
 		pmu-events/metric_test.log \
 		pmu-events/test-empty-pmu-events.c \
 		pmu-events/empty-pmu-events.log
+	$(Q)find pmu-events/arch -name 'extra-metrics.json' -delete -o \
+		-name 'extra-metricgroups.json' -delete
 else # When an OUTPUT directory is present, clean up the copied pmu-events/arch directory.
 	$(call QUIET_CLEAN, pmu-events) $(RM) -r $(OUTPUT)pmu-events/arch \
 		$(OUTPUT)pmu-events/pmu-events.c \
diff --git a/tools/perf/pmu-events/Build b/tools/perf/pmu-events/Build
index 4f9ef624ba70..ba2662d441c4 100644
--- a/tools/perf/pmu-events/Build
+++ b/tools/perf/pmu-events/Build
@@ -30,6 +30,10 @@ $(PMU_EVENTS_C): $(EMPTY_PMU_EVENTS_C)
 	$(call rule_mkdir)
 	$(Q)$(call echo-cmd,gen)cp $< $@
 else
+# Functions to extract the model from a extra-metrics.json or extra-metricgroups.json path.
+model_name = $(shell echo $(1)|sed -e 's@.\+/\(.*\)/extra-metric.*\.json@\1@')
+vendor_name = $(shell echo $(1)|sed -e 's@.\+/\(.*\)/[^/]*/extra-metric.*\.json@\1@')
+
 # Copy checked-in json to OUTPUT for generation if it's an out of source build
 ifneq ($(OUTPUT),)
 # Remove all output directories when any source directory timestamp changes
@@ -48,7 +52,53 @@ $(LEGACY_CACHE_JSON): $(LEGACY_CACHE_PY) $(JSON_DIRS_ROOT)
 	$(call rule_mkdir)
 	$(Q)$(call echo-cmd,gen)$(PYTHON) $(LEGACY_CACHE_PY) > $@
 
-GEN_JSON = $(patsubst %,$(OUTPUT)%,$(JSON)) $(LEGACY_CACHE_JSON) $(JSON_DIRS)
+GEN_METRIC_DEPS := pmu-events/metric.py
+
+# Generate AMD Json
+ZENS = $(shell ls -d pmu-events/arch/x86/amdzen*)
+ZEN_METRICS = $(foreach x,$(ZENS),$(OUTPUT)$(x)/extra-metrics.json)
+ZEN_METRICGROUPS = $(foreach x,$(ZENS),$(OUTPUT)$(x)/extra-metricgroups.json)
+
+$(ZEN_METRICS): pmu-events/amd_metrics.py $(GEN_METRIC_DEPS)
+	$(call rule_mkdir)
+	$(Q)$(call echo-cmd,gen)$(PYTHON) $< $(call model_name,$@) arch > $@
+
+$(ZEN_METRICGROUPS): pmu-events/amd_metrics.py $(GEN_METRIC_DEPS)
+	$(call rule_mkdir)
+	$(Q)$(call echo-cmd,gen)$(PYTHON) $< -metricgroups $(call model_name,$@) arch > $@
+
+# Generate ARM Json
+ARMS = $(shell ls -d pmu-events/arch/arm64/arm/*|grep -v cmn)
+ARM_METRICS = $(foreach x,$(ARMS),$(OUTPUT)$(x)/extra-metrics.json)
+ARM_METRICGROUPS = $(foreach x,$(ARMS),$(OUTPUT)$(x)/extra-metricgroups.json)
+
+$(ARM_METRICS): pmu-events/arm64_metrics.py $(GEN_METRIC_DEPS)
+	$(call rule_mkdir)
+	$(Q)$(call echo-cmd,gen)$(PYTHON) $< $(call vendor_name,$@) $(call model_name,$@) arch > $@
+
+$(ARM_METRICGROUPS): pmu-events/arm64_metrics.py $(GEN_METRIC_DEPS)
+	$(call rule_mkdir)
+	$(Q)$(call echo-cmd,gen)$(PYTHON) $< -metricgroups $(call vendor_name,$@) $(call model_name,$@) arch > $@
+
+# Generate Intel Json
+INTELS = $(shell ls -d pmu-events/arch/x86/*|grep -v amdzen|grep -v mapfile.csv)
+INTEL_METRICS = $(foreach x,$(INTELS),$(OUTPUT)$(x)/extra-metrics.json)
+INTEL_METRICGROUPS = $(foreach x,$(INTELS),$(OUTPUT)$(x)/extra-metricgroups.json)
+
+$(INTEL_METRICS): pmu-events/intel_metrics.py $(GEN_METRIC_DEPS)
+	$(call rule_mkdir)
+	$(Q)$(call echo-cmd,gen)$(PYTHON) $< $(call model_name,$@) arch > $@
+
+$(INTEL_METRICGROUPS): pmu-events/intel_metrics.py $(GEN_METRIC_DEPS)
+	$(call rule_mkdir)
+	$(Q)$(call echo-cmd,gen)$(PYTHON) $< -metricgroups $(call model_name,$@) arch > $@
+
+GEN_JSON = $(patsubst %,$(OUTPUT)%,$(JSON)) \
+            $(LEGACY_CACHE_JSON) \
+            $(JSON_DIRS) \
+            $(ZEN_METRICS) $(ZEN_METRICGROUPS) \
+            $(ARM_METRICS) $(ARM_METRICGROUPS) \
+            $(INTEL_METRICS) $(INTEL_METRICGROUPS)
 
 $(METRIC_TEST_LOG): $(METRIC_TEST_PY) $(METRIC_PY)
 	$(call rule_mkdir)
diff --git a/tools/perf/pmu-events/amd_metrics.py b/tools/perf/pmu-events/amd_metrics.py
new file mode 100755
index 000000000000..5f44687d8d20
--- /dev/null
+++ b/tools/perf/pmu-events/amd_metrics.py
@@ -0,0 +1,42 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
+import argparse
+import os
+from metric import (
+    JsonEncodeMetric, JsonEncodeMetricGroupDescriptions, MetricGroup)
+
+# Global command line arguments.
+_args = None
+
+
+def main() -> None:
+    global _args
+
+    def dir_path(path: str) -> str:
+        """Validate path is a directory for argparse."""
+        if os.path.isdir(path):
+            return path
+        raise argparse.ArgumentTypeError(
+            f'\'{path}\' is not a valid directory')
+
+    parser = argparse.ArgumentParser(description="AMD perf json generator")
+    parser.add_argument(
+        "-metricgroups", help="Generate metricgroups data", action='store_true')
+    parser.add_argument("model", help="e.g. amdzen[123]")
+    parser.add_argument(
+        'events_path',
+        type=dir_path,
+        help='Root of tree containing architecture directories containing json files'
+    )
+    _args = parser.parse_args()
+
+    all_metrics = MetricGroup("", [])
+
+    if _args.metricgroups:
+        print(JsonEncodeMetricGroupDescriptions(all_metrics))
+    else:
+        print(JsonEncodeMetric(all_metrics))
+
+
+if __name__ == '__main__':
+    main()
diff --git a/tools/perf/pmu-events/arm64_metrics.py b/tools/perf/pmu-events/arm64_metrics.py
new file mode 100755
index 000000000000..204b3b08c680
--- /dev/null
+++ b/tools/perf/pmu-events/arm64_metrics.py
@@ -0,0 +1,43 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
+import argparse
+import os
+from metric import (
+    JsonEncodeMetric, JsonEncodeMetricGroupDescriptions, MetricGroup)
+
+# Global command line arguments.
+_args = None
+
+
+def main() -> None:
+    global _args
+
+    def dir_path(path: str) -> str:
+        """Validate path is a directory for argparse."""
+        if os.path.isdir(path):
+            return path
+        raise argparse.ArgumentTypeError(
+            f'\'{path}\' is not a valid directory')
+
+    parser = argparse.ArgumentParser(description="ARM perf json generator")
+    parser.add_argument(
+        "-metricgroups", help="Generate metricgroups data", action='store_true')
+    parser.add_argument("vendor", help="e.g. arm")
+    parser.add_argument("model", help="e.g. neoverse-n1")
+    parser.add_argument(
+        'events_path',
+        type=dir_path,
+        help='Root of tree containing architecture directories containing json files'
+    )
+    _args = parser.parse_args()
+
+    all_metrics = MetricGroup("", [])
+
+    if _args.metricgroups:
+        print(JsonEncodeMetricGroupDescriptions(all_metrics))
+    else:
+        print(JsonEncodeMetric(all_metrics))
+
+
+if __name__ == '__main__':
+    main()
diff --git a/tools/perf/pmu-events/intel_metrics.py b/tools/perf/pmu-events/intel_metrics.py
new file mode 100755
index 000000000000..65ada006d05a
--- /dev/null
+++ b/tools/perf/pmu-events/intel_metrics.py
@@ -0,0 +1,42 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
+import argparse
+import os
+from metric import (
+    JsonEncodeMetric, JsonEncodeMetricGroupDescriptions, MetricGroup)
+
+# Global command line arguments.
+_args = None
+
+
+def main() -> None:
+    global _args
+
+    def dir_path(path: str) -> str:
+        """Validate path is a directory for argparse."""
+        if os.path.isdir(path):
+            return path
+        raise argparse.ArgumentTypeError(
+            f'\'{path}\' is not a valid directory')
+
+    parser = argparse.ArgumentParser(description="Intel perf json generator")
+    parser.add_argument(
+        "-metricgroups", help="Generate metricgroups data", action='store_true')
+    parser.add_argument("model", help="e.g. skylakex")
+    parser.add_argument(
+        'events_path',
+        type=dir_path,
+        help='Root of tree containing architecture directories containing json files'
+    )
+    _args = parser.parse_args()
+
+    all_metrics = MetricGroup("", [])
+
+    if _args.metricgroups:
+        print(JsonEncodeMetricGroupDescriptions(all_metrics))
+    else:
+        print(JsonEncodeMetric(all_metrics))
+
+
+if __name__ == '__main__':
+    main()
-- 
cgit v1.2.3


From 6bd6c5ef6c7ae35000179c9db8ecdbc3e9698d72 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 27 Jan 2026 10:44:33 -0800
Subject: perf jevents: Add load event JSON to verify and allow fallbacks

Add a LoadEvents function that loads all event JSON files in a
directory.

In the Event constructor ensure all events are defined in the event JSON
except for legacy events like "cycles".

If the initial event isn't found then legacy_event1 is used, and if that
isn't found legacy_event2 is used.

This allows a single Event to have multiple event names as models will
often rename the same event over time. If the event doesn't exist an
exception is raised.

So that references to metrics can be added, add the MetricRef
class. This doesn't validate as an event name and so provides an
escape hatch for metrics to refer to each other.

Signed-off-by: Ian Rogers <irogers@google.com>
Tested-by: Thomas Falcon <thomas.falcon@intel.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Benjamin Gray <bgray@linux.ibm.com>
Cc: Caleb Biggers <caleb.biggers@intel.com>
Cc: Edward Baker <edward.baker@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jing Zhang <renyu.zj@linux.alibaba.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Leo Yan <leo.yan@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Perry Taylor <perry.taylor@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sandipan Das <sandipan.das@amd.com>
Cc: Weilin Wang <weilin.wang@intel.com>
Cc: Xu Yang <xu.yang_2@nxp.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/pmu-events/Build            | 12 ++---
 tools/perf/pmu-events/amd_metrics.py   |  7 ++-
 tools/perf/pmu-events/arm64_metrics.py |  7 ++-
 tools/perf/pmu-events/intel_metrics.py |  7 ++-
 tools/perf/pmu-events/metric.py        | 83 ++++++++++++++++++++++++++++++++--
 5 files changed, 101 insertions(+), 15 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/pmu-events/Build b/tools/perf/pmu-events/Build
index ba2662d441c4..68227614d0b1 100644
--- a/tools/perf/pmu-events/Build
+++ b/tools/perf/pmu-events/Build
@@ -61,11 +61,11 @@ ZEN_METRICGROUPS = $(foreach x,$(ZENS),$(OUTPUT)$(x)/extra-metricgroups.json)
 
 $(ZEN_METRICS): pmu-events/amd_metrics.py $(GEN_METRIC_DEPS)
 	$(call rule_mkdir)
-	$(Q)$(call echo-cmd,gen)$(PYTHON) $< $(call model_name,$@) arch > $@
+	$(Q)$(call echo-cmd,gen)$(PYTHON) $< $(call model_name,$@) pmu-events/arch > $@
 
 $(ZEN_METRICGROUPS): pmu-events/amd_metrics.py $(GEN_METRIC_DEPS)
 	$(call rule_mkdir)
-	$(Q)$(call echo-cmd,gen)$(PYTHON) $< -metricgroups $(call model_name,$@) arch > $@
+	$(Q)$(call echo-cmd,gen)$(PYTHON) $< -metricgroups $(call model_name,$@) pmu-events/arch > $@
 
 # Generate ARM Json
 ARMS = $(shell ls -d pmu-events/arch/arm64/arm/*|grep -v cmn)
@@ -74,11 +74,11 @@ ARM_METRICGROUPS = $(foreach x,$(ARMS),$(OUTPUT)$(x)/extra-metricgroups.json)
 
 $(ARM_METRICS): pmu-events/arm64_metrics.py $(GEN_METRIC_DEPS)
 	$(call rule_mkdir)
-	$(Q)$(call echo-cmd,gen)$(PYTHON) $< $(call vendor_name,$@) $(call model_name,$@) arch > $@
+	$(Q)$(call echo-cmd,gen)$(PYTHON) $< $(call vendor_name,$@) $(call model_name,$@) pmu-events/arch > $@
 
 $(ARM_METRICGROUPS): pmu-events/arm64_metrics.py $(GEN_METRIC_DEPS)
 	$(call rule_mkdir)
-	$(Q)$(call echo-cmd,gen)$(PYTHON) $< -metricgroups $(call vendor_name,$@) $(call model_name,$@) arch > $@
+	$(Q)$(call echo-cmd,gen)$(PYTHON) $< -metricgroups $(call vendor_name,$@) $(call model_name,$@) pmu-events/arch > $@
 
 # Generate Intel Json
 INTELS = $(shell ls -d pmu-events/arch/x86/*|grep -v amdzen|grep -v mapfile.csv)
@@ -87,11 +87,11 @@ INTEL_METRICGROUPS = $(foreach x,$(INTELS),$(OUTPUT)$(x)/extra-metricgroups.json
 
 $(INTEL_METRICS): pmu-events/intel_metrics.py $(GEN_METRIC_DEPS)
 	$(call rule_mkdir)
-	$(Q)$(call echo-cmd,gen)$(PYTHON) $< $(call model_name,$@) arch > $@
+	$(Q)$(call echo-cmd,gen)$(PYTHON) $< $(call model_name,$@) pmu-events/arch > $@
 
 $(INTEL_METRICGROUPS): pmu-events/intel_metrics.py $(GEN_METRIC_DEPS)
 	$(call rule_mkdir)
-	$(Q)$(call echo-cmd,gen)$(PYTHON) $< -metricgroups $(call model_name,$@) arch > $@
+	$(Q)$(call echo-cmd,gen)$(PYTHON) $< -metricgroups $(call model_name,$@) pmu-events/arch > $@
 
 GEN_JSON = $(patsubst %,$(OUTPUT)%,$(JSON)) \
             $(LEGACY_CACHE_JSON) \
diff --git a/tools/perf/pmu-events/amd_metrics.py b/tools/perf/pmu-events/amd_metrics.py
index 5f44687d8d20..bc91d9c120fa 100755
--- a/tools/perf/pmu-events/amd_metrics.py
+++ b/tools/perf/pmu-events/amd_metrics.py
@@ -2,8 +2,8 @@
 # SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
 import argparse
 import os
-from metric import (
-    JsonEncodeMetric, JsonEncodeMetricGroupDescriptions, MetricGroup)
+from metric import (JsonEncodeMetric, JsonEncodeMetricGroupDescriptions, LoadEvents,
+                    MetricGroup)
 
 # Global command line arguments.
 _args = None
@@ -30,6 +30,9 @@ def main() -> None:
     )
     _args = parser.parse_args()
 
+    directory = f"{_args.events_path}/x86/{_args.model}/"
+    LoadEvents(directory)
+
     all_metrics = MetricGroup("", [])
 
     if _args.metricgroups:
diff --git a/tools/perf/pmu-events/arm64_metrics.py b/tools/perf/pmu-events/arm64_metrics.py
index 204b3b08c680..ac717ca3513a 100755
--- a/tools/perf/pmu-events/arm64_metrics.py
+++ b/tools/perf/pmu-events/arm64_metrics.py
@@ -2,8 +2,8 @@
 # SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
 import argparse
 import os
-from metric import (
-    JsonEncodeMetric, JsonEncodeMetricGroupDescriptions, MetricGroup)
+from metric import (JsonEncodeMetric, JsonEncodeMetricGroupDescriptions, LoadEvents,
+                    MetricGroup)
 
 # Global command line arguments.
 _args = None
@@ -31,6 +31,9 @@ def main() -> None:
     )
     _args = parser.parse_args()
 
+    directory = f"{_args.events_path}/arm64/{_args.vendor}/{_args.model}/"
+    LoadEvents(directory)
+
     all_metrics = MetricGroup("", [])
 
     if _args.metricgroups:
diff --git a/tools/perf/pmu-events/intel_metrics.py b/tools/perf/pmu-events/intel_metrics.py
index 65ada006d05a..b287ef115193 100755
--- a/tools/perf/pmu-events/intel_metrics.py
+++ b/tools/perf/pmu-events/intel_metrics.py
@@ -2,8 +2,8 @@
 # SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
 import argparse
 import os
-from metric import (
-    JsonEncodeMetric, JsonEncodeMetricGroupDescriptions, MetricGroup)
+from metric import (JsonEncodeMetric, JsonEncodeMetricGroupDescriptions, LoadEvents,
+                    MetricGroup)
 
 # Global command line arguments.
 _args = None
@@ -30,6 +30,9 @@ def main() -> None:
     )
     _args = parser.parse_args()
 
+    directory = f"{_args.events_path}/x86/{_args.model}/"
+    LoadEvents(directory)
+
     all_metrics = MetricGroup("", [])
 
     if _args.metricgroups:
diff --git a/tools/perf/pmu-events/metric.py b/tools/perf/pmu-events/metric.py
index dd8fd06940e6..e33e163b2815 100644
--- a/tools/perf/pmu-events/metric.py
+++ b/tools/perf/pmu-events/metric.py
@@ -3,10 +3,56 @@
 import ast
 import decimal
 import json
+import os
 import re
 from enum import Enum
 from typing import Dict, List, Optional, Set, Tuple, Union
 
+all_events = set()
+
+def LoadEvents(directory: str) -> None:
+  """Populate a global set of all known events for the purpose of validating Event names"""
+  global all_events
+  all_events = {
+      "context\\-switches",
+      "cpu\\-cycles",
+      "cycles",
+      "duration_time",
+      "instructions",
+      "l2_itlb_misses",
+  }
+  for file in os.listdir(os.fsencode(directory)):
+    filename = os.fsdecode(file)
+    if filename.endswith(".json"):
+      try:
+        for x in json.load(open(f"{directory}/{filename}")):
+          if "EventName" in x:
+            all_events.add(x["EventName"])
+          elif "ArchStdEvent" in x:
+            all_events.add(x["ArchStdEvent"])
+      except json.decoder.JSONDecodeError:
+        # The generated directory may be the same as the input, which
+        # causes partial json files. Ignore errors.
+        pass
+
+
+def CheckEvent(name: str) -> bool:
+  """Check the event name exists in the set of all loaded events"""
+  global all_events
+  if len(all_events) == 0:
+    # No events loaded so assume any event is good.
+    return True
+
+  if ':' in name:
+    # Remove trailing modifier.
+    name = name[:name.find(':')]
+  elif '/' in name:
+    # Name could begin with a PMU or an event, for now assume it is good.
+    return True
+
+  return name in all_events
+
+
 class MetricConstraint(Enum):
   GROUPED_EVENTS = 0
   NO_GROUP_EVENTS = 1
@@ -317,9 +363,18 @@ def _FixEscapes(s: str) -> str:
 class Event(Expression):
   """An event in an expression."""
 
-  def __init__(self, name: str, legacy_name: str = ''):
-    self.name = _FixEscapes(name)
-    self.legacy_name = _FixEscapes(legacy_name)
+  def __init__(self, *args: str):
+    error = ""
+    for name in args:
+      if CheckEvent(name):
+        self.name = _FixEscapes(name)
+        return
+      if error:
+        error += " or " + name
+      else:
+        error = name
+    global all_events
+    raise Exception(f"No event {error} in:\n{all_events}")
 
   def ToPerfJson(self):
     result = re.sub('/', '@', self.name)
@@ -338,6 +393,28 @@ class Event(Expression):
     return self
 
 
+class MetricRef(Expression):
+  """A metric reference in an expression."""
+
+  def __init__(self, name: str):
+    self.name = _FixEscapes(name)
+
+  def ToPerfJson(self):
+    return self.name
+
+  def ToPython(self):
+    return f'MetricRef(r"{self.name}")'
+
+  def Simplify(self) -> Expression:
+    return self
+
+  def Equals(self, other: Expression) -> bool:
+    return isinstance(other, MetricRef) and self.name == other.name
+
+  def Substitute(self, name: str, expression: Expression) -> Expression:
+    return self
+
+
 class Constant(Expression):
   """A constant within the expression tree."""
 
-- 
cgit v1.2.3


From d10ae3a935e7fdc2b149d81e617069fcde9864e3 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 27 Jan 2026 10:44:34 -0800
Subject: perf jevents: Add RAPL event metric for AMD zen models

Add power per second metrics based on RAPL.

Reviewed-by: Sandipan Das <sandipan.das@amd.com>
Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Benjamin Gray <bgray@linux.ibm.com>
Cc: Caleb Biggers <caleb.biggers@intel.com>
Cc: Edward Baker <edward.baker@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jing Zhang <renyu.zj@linux.alibaba.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Leo Yan <leo.yan@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Perry Taylor <perry.taylor@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Falcon <thomas.falcon@intel.com>
Cc: Weilin Wang <weilin.wang@intel.com>
Cc: Xu Yang <xu.yang_2@nxp.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/pmu-events/amd_metrics.py | 31 ++++++++++++++++++++++++++++---
 1 file changed, 28 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/pmu-events/amd_metrics.py b/tools/perf/pmu-events/amd_metrics.py
index bc91d9c120fa..b6cdeb4f09fe 100755
--- a/tools/perf/pmu-events/amd_metrics.py
+++ b/tools/perf/pmu-events/amd_metrics.py
@@ -1,13 +1,36 @@
 #!/usr/bin/env python3
 # SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
 import argparse
+import math
 import os
-from metric import (JsonEncodeMetric, JsonEncodeMetricGroupDescriptions, LoadEvents,
-                    MetricGroup)
+from metric import (d_ratio, has_event, Event, JsonEncodeMetric, JsonEncodeMetricGroupDescriptions,
+                    LoadEvents, Metric, MetricGroup, Select)
 
 # Global command line arguments.
 _args = None
 
+interval_sec = Event("duration_time")
+
+
+def Rapl() -> MetricGroup:
+    """Processor socket power consumption estimate.
+
+    Use events from the running average power limit (RAPL) driver.
+    """
+    # Watts = joules/second
+    # Currently only energy-pkg is supported by AMD:
+    # https://lore.kernel.org/lkml/20220105185659.643355-1-eranian@google.com/
+    pkg = Event("power/energy\\-pkg/")
+    cond_pkg = Select(pkg, has_event(pkg), math.nan)
+    scale = 2.3283064365386962890625e-10
+    metrics = [
+        Metric("lpm_cpu_power_pkg", "",
+               d_ratio(cond_pkg * scale, interval_sec), "Watts"),
+    ]
+
+    return MetricGroup("lpm_cpu_power", metrics,
+                       description="Processor socket power consumption estimates")
+
 
 def main() -> None:
     global _args
@@ -33,7 +56,9 @@ def main() -> None:
     directory = f"{_args.events_path}/x86/{_args.model}/"
     LoadEvents(directory)
 
-    all_metrics = MetricGroup("", [])
+    all_metrics = MetricGroup("", [
+        Rapl(),
+    ])
 
     if _args.metricgroups:
         print(JsonEncodeMetricGroupDescriptions(all_metrics))
-- 
cgit v1.2.3


From 6da95e1834480f94f99afde0caabc63159eab64d Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 27 Jan 2026 10:44:35 -0800
Subject: perf jevents: Add idle metric for AMD zen models

Compute using the MSR PMU the percentage of wallclock cycles where the
CPUs are in a low power state.

Reviewed-by: Sandipan Das <sandipan.das@amd.com>
Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Benjamin Gray <bgray@linux.ibm.com>
Cc: Caleb Biggers <caleb.biggers@intel.com>
Cc: Edward Baker <edward.baker@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jing Zhang <renyu.zj@linux.alibaba.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Leo Yan <leo.yan@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Perry Taylor <perry.taylor@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Falcon <thomas.falcon@intel.com>
Cc: Weilin Wang <weilin.wang@intel.com>
Cc: Xu Yang <xu.yang_2@nxp.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/pmu-events/amd_metrics.py | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/pmu-events/amd_metrics.py b/tools/perf/pmu-events/amd_metrics.py
index b6cdeb4f09fe..f51a044b8005 100755
--- a/tools/perf/pmu-events/amd_metrics.py
+++ b/tools/perf/pmu-events/amd_metrics.py
@@ -3,8 +3,9 @@
 import argparse
 import math
 import os
-from metric import (d_ratio, has_event, Event, JsonEncodeMetric, JsonEncodeMetricGroupDescriptions,
-                    LoadEvents, Metric, MetricGroup, Select)
+from metric import (d_ratio, has_event, max, Event, JsonEncodeMetric,
+                    JsonEncodeMetricGroupDescriptions, LoadEvents, Metric,
+                    MetricGroup, Select)
 
 # Global command line arguments.
 _args = None
@@ -12,6 +13,16 @@ _args = None
 interval_sec = Event("duration_time")
 
 
+def Idle() -> Metric:
+    cyc = Event("msr/mperf/")
+    tsc = Event("msr/tsc/")
+    low = max(tsc - cyc, 0)
+    return Metric(
+        "lpm_idle",
+        "Percentage of total wallclock cycles where CPUs are in low power state (C1 or deeper sleep state)",
+        d_ratio(low, tsc), "100%")
+
+
 def Rapl() -> MetricGroup:
     """Processor socket power consumption estimate.
 
@@ -57,6 +68,7 @@ def main() -> None:
     LoadEvents(directory)
 
     all_metrics = MetricGroup("", [
+        Idle(),
         Rapl(),
     ])
 
-- 
cgit v1.2.3


From 9c9efc7462487c85a269275655807631fba760fc Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 27 Jan 2026 10:44:36 -0800
Subject: perf jevents: Add upc metric for uops per cycle for AMD

The metric adjusts for whether or not SMT is on.

Reviewed-by: Sandipan Das <sandipan.das@amd.com>
Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Benjamin Gray <bgray@linux.ibm.com>
Cc: Caleb Biggers <caleb.biggers@intel.com>
Cc: Edward Baker <edward.baker@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jing Zhang <renyu.zj@linux.alibaba.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Leo Yan <leo.yan@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Perry Taylor <perry.taylor@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Falcon <thomas.falcon@intel.com>
Cc: Weilin Wang <weilin.wang@intel.com>
Cc: Xu Yang <xu.yang_2@nxp.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/pmu-events/amd_metrics.py | 22 +++++++++++++++++++---
 1 file changed, 19 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/pmu-events/amd_metrics.py b/tools/perf/pmu-events/amd_metrics.py
index f51a044b8005..42e46b33334d 100755
--- a/tools/perf/pmu-events/amd_metrics.py
+++ b/tools/perf/pmu-events/amd_metrics.py
@@ -3,14 +3,26 @@
 import argparse
 import math
 import os
+from typing import Optional
 from metric import (d_ratio, has_event, max, Event, JsonEncodeMetric,
-                    JsonEncodeMetricGroupDescriptions, LoadEvents, Metric,
-                    MetricGroup, Select)
+                    JsonEncodeMetricGroupDescriptions, Literal, LoadEvents,
+                    Metric, MetricGroup, Select)
 
 # Global command line arguments.
 _args = None
-
+_zen_model: int = 1
 interval_sec = Event("duration_time")
+ins = Event("instructions")
+cycles = Event("cycles")
+# Number of CPU cycles scaled for SMT.
+smt_cycles = Select(cycles / 2, Literal("#smt_on"), cycles)
+
+
+def AmdUpc() -> Metric:
+    ops = Event("ex_ret_ops", "ex_ret_cops")
+    upc = d_ratio(ops, smt_cycles)
+    return Metric("lpm_upc", "Micro-ops retired per core cycle (higher is better)",
+                  upc, "uops/cycle")
 
 
 def Idle() -> Metric:
@@ -45,6 +57,7 @@ def Rapl() -> MetricGroup:
 
 def main() -> None:
     global _args
+    global _zen_model
 
     def dir_path(path: str) -> str:
         """Validate path is a directory for argparse."""
@@ -67,7 +80,10 @@ def main() -> None:
     directory = f"{_args.events_path}/x86/{_args.model}/"
     LoadEvents(directory)
 
+    _zen_model = int(_args.model[6:])
+
     all_metrics = MetricGroup("", [
+        AmdUpc(),
         Idle(),
         Rapl(),
     ])
-- 
cgit v1.2.3


From 78067ae26815f8432f8648657ea14c0e3c4f93ad Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 27 Jan 2026 10:44:37 -0800
Subject: perf jevents: Add br metric group for branch statistics on AMD

The br metric group for branches itself comprises metric groups for
total, taken, conditional, fused and far metric groups using JSON
events.

The lack of conditional events on anything but zen2 means this category
is lacking on zen1, zen3 and zen4.

Reviewed-by: Sandipan Das <sandipan.das@amd.com>
Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Benjamin Gray <bgray@linux.ibm.com>
Cc: Caleb Biggers <caleb.biggers@intel.com>
Cc: Edward Baker <edward.baker@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jing Zhang <renyu.zj@linux.alibaba.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Leo Yan <leo.yan@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Perry Taylor <perry.taylor@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Falcon <thomas.falcon@intel.com>
Cc: Weilin Wang <weilin.wang@intel.com>
Cc: Xu Yang <xu.yang_2@nxp.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/pmu-events/amd_metrics.py | 105 +++++++++++++++++++++++++++++++++++
 1 file changed, 105 insertions(+)

(limited to 'tools')

diff --git a/tools/perf/pmu-events/amd_metrics.py b/tools/perf/pmu-events/amd_metrics.py
index 42e46b33334d..38948f63cb52 100755
--- a/tools/perf/pmu-events/amd_metrics.py
+++ b/tools/perf/pmu-events/amd_metrics.py
@@ -18,6 +18,110 @@ cycles = Event("cycles")
 smt_cycles = Select(cycles / 2, Literal("#smt_on"), cycles)
 
 
+def AmdBr():
+    def Total() -> MetricGroup:
+        br = Event("ex_ret_brn")
+        br_m_all = Event("ex_ret_brn_misp")
+        br_clr = Event("ex_ret_brn_cond_misp",
+                       "ex_ret_msprd_brnch_instr_dir_msmtch",
+                       "ex_ret_brn_resync")
+
+        br_r = d_ratio(br, interval_sec)
+        ins_r = d_ratio(ins, br)
+        misp_r = d_ratio(br_m_all, br)
+        clr_r = d_ratio(br_clr, interval_sec)
+
+        return MetricGroup("lpm_br_total", [
+            Metric("lpm_br_total_retired",
+                   "The number of branch instructions retired per second.", br_r,
+                   "insn/s"),
+            Metric(
+                "lpm_br_total_mispred",
+                "The number of branch instructions retired, of any type, that were "
+                "not correctly predicted as a percentage of all branch instrucions.",
+                misp_r, "100%"),
+            Metric("lpm_br_total_insn_between_branches",
+                   "The number of instructions divided by the number of branches.",
+                   ins_r, "insn"),
+            Metric("lpm_br_total_insn_fe_resteers",
+                   "The number of resync branches per second.", clr_r, "req/s")
+        ])
+
+    def Taken() -> MetricGroup:
+        br = Event("ex_ret_brn_tkn")
+        br_m_tk = Event("ex_ret_brn_tkn_misp")
+        br_r = d_ratio(br, interval_sec)
+        ins_r = d_ratio(ins, br)
+        misp_r = d_ratio(br_m_tk, br)
+        return MetricGroup("lpm_br_taken", [
+            Metric("lpm_br_taken_retired",
+                   "The number of taken branches that were retired per second.",
+                   br_r, "insn/s"),
+            Metric(
+                "lpm_br_taken_mispred",
+                "The number of retired taken branch instructions that were "
+                "mispredicted as a percentage of all taken branches.", misp_r,
+                "100%"),
+            Metric(
+                "lpm_br_taken_insn_between_branches",
+                "The number of instructions divided by the number of taken branches.",
+                ins_r, "insn"),
+        ])
+
+    def Conditional() -> Optional[MetricGroup]:
+        global _zen_model
+        br = Event("ex_ret_brn_cond", "ex_ret_cond")
+        br_r = d_ratio(br, interval_sec)
+        ins_r = d_ratio(ins, br)
+
+        metrics = [
+            Metric("lpm_br_cond_retired", "Retired conditional branch instructions.",
+                   br_r, "insn/s"),
+            Metric("lpm_br_cond_insn_between_branches",
+                   "The number of instructions divided by the number of conditional "
+                   "branches.", ins_r, "insn"),
+        ]
+        if _zen_model == 2:
+            br_m_cond = Event("ex_ret_cond_misp")
+            misp_r = d_ratio(br_m_cond, br)
+            metrics += [
+                Metric("lpm_br_cond_mispred",
+                       "Retired conditional branch instructions mispredicted as a "
+                       "percentage of all conditional branches.", misp_r, "100%"),
+            ]
+
+        return MetricGroup("lpm_br_cond", metrics)
+
+    def Fused() -> MetricGroup:
+        br = Event("ex_ret_fused_instr", "ex_ret_fus_brnch_inst")
+        br_r = d_ratio(br, interval_sec)
+        ins_r = d_ratio(ins, br)
+        return MetricGroup("lpm_br_cond", [
+            Metric("lpm_br_fused_retired",
+                   "Retired fused branch instructions per second.", br_r, "insn/s"),
+            Metric(
+                "lpm_br_fused_insn_between_branches",
+                "The number of instructions divided by the number of fused "
+                "branches.", ins_r, "insn"),
+        ])
+
+    def Far() -> MetricGroup:
+        br = Event("ex_ret_brn_far")
+        br_r = d_ratio(br, interval_sec)
+        ins_r = d_ratio(ins, br)
+        return MetricGroup("lpm_br_far", [
+            Metric("lpm_br_far_retired", "Retired far control transfers per second.",
+                   br_r, "insn/s"),
+            Metric(
+                "lpm_br_far_insn_between_branches",
+                "The number of instructions divided by the number of far branches.",
+                ins_r, "insn"),
+        ])
+
+    return MetricGroup("lpm_br", [Total(), Taken(), Conditional(), Fused(), Far()],
+                       description="breakdown of retired branch instructions")
+
+
 def AmdUpc() -> Metric:
     ops = Event("ex_ret_ops", "ex_ret_cops")
     upc = d_ratio(ops, smt_cycles)
@@ -83,6 +187,7 @@ def main() -> None:
     _zen_model = int(_args.model[6:])
 
     all_metrics = MetricGroup("", [
+        AmdBr(),
         AmdUpc(),
         Idle(),
         Rapl(),
-- 
cgit v1.2.3


From e596f329668ec2b5da6ac60fc87c035c5a337d1f Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 27 Jan 2026 10:44:38 -0800
Subject: perf jevents: Add itlb metric group for AMD

Add metrics that give an overview and details of the l1 itlb (zen1,
zen2, zen3) and l2 itlb (all zens).

Reviewed-by: Sandipan Das <sandipan.das@amd.com>
Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Benjamin Gray <bgray@linux.ibm.com>
Cc: Caleb Biggers <caleb.biggers@intel.com>
Cc: Edward Baker <edward.baker@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jing Zhang <renyu.zj@linux.alibaba.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Leo Yan <leo.yan@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Perry Taylor <perry.taylor@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Falcon <thomas.falcon@intel.com>
Cc: Weilin Wang <weilin.wang@intel.com>
Cc: Xu Yang <xu.yang_2@nxp.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/pmu-events/amd_metrics.py | 49 ++++++++++++++++++++++++++++++++++++
 1 file changed, 49 insertions(+)

(limited to 'tools')

diff --git a/tools/perf/pmu-events/amd_metrics.py b/tools/perf/pmu-events/amd_metrics.py
index 38948f63cb52..8fb0b55074a2 100755
--- a/tools/perf/pmu-events/amd_metrics.py
+++ b/tools/perf/pmu-events/amd_metrics.py
@@ -122,6 +122,54 @@ def AmdBr():
                        description="breakdown of retired branch instructions")
 
 
+def AmdItlb():
+    global _zen_model
+    l2h = Event("bp_l1_tlb_miss_l2_tlb_hit", "bp_l1_tlb_miss_l2_hit")
+    l2m = Event("l2_itlb_misses")
+    l2r = l2h + l2m
+
+    itlb_l1_mg = None
+    l1m = l2r
+    if _zen_model <= 3:
+        l1r = Event("ic_fw32")
+        l1h = max(l1r - l1m, 0)
+        itlb_l1_mg = MetricGroup("lpm_itlb_l1", [
+            Metric("lpm_itlb_l1_hits",
+                   "L1 ITLB hits as a perecentage of L1 ITLB accesses.",
+                   d_ratio(l1h, l1h + l1m), "100%"),
+            Metric("lpm_itlb_l1_miss",
+                   "L1 ITLB misses as a perecentage of L1 ITLB accesses.",
+                   d_ratio(l1m, l1h + l1m), "100%"),
+            Metric("lpm_itlb_l1_reqs",
+                   "The number of 32B fetch windows transferred from IC pipe to DE "
+                   "instruction decoder per second.", d_ratio(
+                       l1r, interval_sec),
+                   "windows/sec"),
+        ])
+
+    return MetricGroup("lpm_itlb", [
+        MetricGroup("lpm_itlb_ov", [
+            Metric("lpm_itlb_ov_insn_bt_l1_miss",
+                   "Number of instructions between l1 misses", d_ratio(
+                       ins, l1m), "insns"),
+            Metric("lpm_itlb_ov_insn_bt_l2_miss",
+                   "Number of instructions between l2 misses", d_ratio(
+                       ins, l2m), "insns"),
+        ]),
+        itlb_l1_mg,
+        MetricGroup("lpm_itlb_l2", [
+            Metric("lpm_itlb_l2_hits",
+                   "L2 ITLB hits as a percentage of all L2 ITLB accesses.",
+                   d_ratio(l2h, l2r), "100%"),
+            Metric("lpm_itlb_l2_miss",
+                   "L2 ITLB misses as a percentage of all L2 ITLB accesses.",
+                   d_ratio(l2m, l2r), "100%"),
+            Metric("lpm_itlb_l2_reqs", "ITLB accesses per second.",
+                   d_ratio(l2r, interval_sec), "accesses/sec"),
+        ]),
+    ], description="Instruction TLB breakdown")
+
+
 def AmdUpc() -> Metric:
     ops = Event("ex_ret_ops", "ex_ret_cops")
     upc = d_ratio(ops, smt_cycles)
@@ -188,6 +236,7 @@ def main() -> None:
 
     all_metrics = MetricGroup("", [
         AmdBr(),
+        AmdItlb(),
         AmdUpc(),
         Idle(),
         Rapl(),
-- 
cgit v1.2.3


From c4108b9509f2cc1fab66c64c2de55ac4f9ee38b4 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 27 Jan 2026 10:44:39 -0800
Subject: perf jevents: Add dtlb metric group for AMD

Add metrics that give an overview and details of the dtlb (zen1, zen2,
zen3).

Reviewed-by: Sandipan Das <sandipan.das@amd.com>
Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Benjamin Gray <bgray@linux.ibm.com>
Cc: Caleb Biggers <caleb.biggers@intel.com>
Cc: Edward Baker <edward.baker@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jing Zhang <renyu.zj@linux.alibaba.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Leo Yan <leo.yan@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Perry Taylor <perry.taylor@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Falcon <thomas.falcon@intel.com>
Cc: Weilin Wang <weilin.wang@intel.com>
Cc: Xu Yang <xu.yang_2@nxp.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/pmu-events/amd_metrics.py | 111 +++++++++++++++++++++++++++++++++++
 1 file changed, 111 insertions(+)

(limited to 'tools')

diff --git a/tools/perf/pmu-events/amd_metrics.py b/tools/perf/pmu-events/amd_metrics.py
index 8fb0b55074a2..a4ff88de08b5 100755
--- a/tools/perf/pmu-events/amd_metrics.py
+++ b/tools/perf/pmu-events/amd_metrics.py
@@ -122,6 +122,116 @@ def AmdBr():
                        description="breakdown of retired branch instructions")
 
 
+def AmdDtlb() -> Optional[MetricGroup]:
+    global _zen_model
+    if _zen_model >= 4:
+        return None
+
+    d_dat = Event("ls_dc_accesses") if _zen_model <= 3 else None
+    d_h4k = Event("ls_l1_d_tlb_miss.tlb_reload_4k_l2_hit")
+    d_hcoal = Event(
+        "ls_l1_d_tlb_miss.tlb_reload_coalesced_page_hit") if _zen_model >= 2 else 0
+    d_h2m = Event("ls_l1_d_tlb_miss.tlb_reload_2m_l2_hit")
+    d_h1g = Event("ls_l1_d_tlb_miss.tlb_reload_1g_l2_hit")
+
+    d_m4k = Event("ls_l1_d_tlb_miss.tlb_reload_4k_l2_miss")
+    d_mcoal = Event(
+        "ls_l1_d_tlb_miss.tlb_reload_coalesced_page_miss") if _zen_model >= 2 else 0
+    d_m2m = Event("ls_l1_d_tlb_miss.tlb_reload_2m_l2_miss")
+    d_m1g = Event("ls_l1_d_tlb_miss.tlb_reload_1g_l2_miss")
+
+    d_w0 = Event("ls_tablewalker.dc_type0") if _zen_model <= 3 else None
+    d_w1 = Event("ls_tablewalker.dc_type1") if _zen_model <= 3 else None
+    walks = d_w0 + d_w1
+    walks_r = d_ratio(walks, interval_sec)
+    ins_w = d_ratio(ins, walks)
+    l1 = d_dat
+    l1_r = d_ratio(l1, interval_sec)
+    l2_hits = d_h4k + d_hcoal + d_h2m + d_h1g
+    l2_miss = d_m4k + d_mcoal + d_m2m + d_m1g
+    l2_r = d_ratio(l2_hits + l2_miss, interval_sec)
+    l1_miss = l2_hits + l2_miss + walks
+    l1_hits = max(l1 - l1_miss, 0)
+    ins_l = d_ratio(ins, l1_miss)
+
+    return MetricGroup("lpm_dtlb", [
+        MetricGroup("lpm_dtlb_ov", [
+            Metric("lpm_dtlb_ov_insn_bt_l1_miss",
+                   "DTLB overview: instructions between l1 misses.", ins_l,
+                   "insns"),
+            Metric("lpm_dtlb_ov_insn_bt_walks",
+                   "DTLB overview: instructions between dtlb page table walks.",
+                   ins_w, "insns"),
+        ]),
+        MetricGroup("lpm_dtlb_l1", [
+            Metric("lpm_dtlb_l1_hits",
+                   "DTLB L1 hits as percentage of all DTLB L1 accesses.",
+                   d_ratio(l1_hits, l1), "100%"),
+            Metric("lpm_dtlb_l1_miss",
+                   "DTLB L1 misses as percentage of all DTLB L1 accesses.",
+                   d_ratio(l1_miss, l1), "100%"),
+            Metric("lpm_dtlb_l1_reqs", "DTLB L1 accesses per second.", l1_r,
+                   "insns/s"),
+        ]),
+        MetricGroup("lpm_dtlb_l2", [
+            Metric("lpm_dtlb_l2_hits",
+                   "DTLB L2 hits as percentage of all DTLB L2 accesses.",
+                   d_ratio(l2_hits, l2_hits + l2_miss), "100%"),
+            Metric("lpm_dtlb_l2_miss",
+                   "DTLB L2 misses as percentage of all DTLB L2 accesses.",
+                   d_ratio(l2_miss, l2_hits + l2_miss), "100%"),
+            Metric("lpm_dtlb_l2_reqs", "DTLB L2 accesses per second.", l2_r,
+                   "insns/s"),
+            MetricGroup("lpm_dtlb_l2_4kb", [
+                Metric(
+                    "lpm_dtlb_l2_4kb_hits",
+                    "DTLB L2 4kb page size hits as percentage of all DTLB L2 4kb "
+                    "accesses.", d_ratio(d_h4k, d_h4k + d_m4k), "100%"),
+                Metric(
+                    "lpm_dtlb_l2_4kb_miss",
+                    "DTLB L2 4kb page size misses as percentage of all DTLB L2 4kb"
+                    "accesses.", d_ratio(d_m4k, d_h4k + d_m4k), "100%")
+            ]),
+            MetricGroup("lpm_dtlb_l2_coalesced", [
+                Metric(
+                    "lpm_dtlb_l2_coal_hits",
+                    "DTLB L2 coalesced page (16kb) hits as percentage of all DTLB "
+                    "L2 coalesced accesses.", d_ratio(d_hcoal,
+                                                      d_hcoal + d_mcoal), "100%"),
+                Metric(
+                    "lpm_dtlb_l2_coal_miss",
+                    "DTLB L2 coalesced page (16kb) misses as percentage of all "
+                    "DTLB L2 coalesced accesses.",
+                    d_ratio(d_mcoal, d_hcoal + d_mcoal), "100%")
+            ]),
+            MetricGroup("lpm_dtlb_l2_2mb", [
+                Metric(
+                    "lpm_dtlb_l2_2mb_hits",
+                    "DTLB L2 2mb page size hits as percentage of all DTLB L2 2mb "
+                    "accesses.", d_ratio(d_h2m, d_h2m + d_m2m), "100%"),
+                Metric(
+                    "lpm_dtlb_l2_2mb_miss",
+                    "DTLB L2 2mb page size misses as percentage of all DTLB L2 "
+                    "accesses.", d_ratio(d_m2m, d_h2m + d_m2m), "100%")
+            ]),
+            MetricGroup("lpm_dtlb_l2_1g", [
+                Metric(
+                    "lpm_dtlb_l2_1g_hits",
+                    "DTLB L2 1gb page size hits as percentage of all DTLB L2 1gb "
+                    "accesses.", d_ratio(d_h1g, d_h1g + d_m1g), "100%"),
+                Metric(
+                    "lpm_dtlb_l2_1g_miss",
+                    "DTLB L2 1gb page size misses as percentage of all DTLB L2 "
+                    "1gb accesses.", d_ratio(d_m1g, d_h1g + d_m1g), "100%")
+            ]),
+        ]),
+        MetricGroup("lpm_dtlb_walks", [
+            Metric("lpm_dtlb_walks_reqs", "DTLB page table walks per second.",
+                   walks_r, "walks/s"),
+        ]),
+    ], description="Data TLB metrics")
+
+
 def AmdItlb():
     global _zen_model
     l2h = Event("bp_l1_tlb_miss_l2_tlb_hit", "bp_l1_tlb_miss_l2_hit")
@@ -236,6 +346,7 @@ def main() -> None:
 
     all_metrics = MetricGroup("", [
         AmdBr(),
+        AmdDtlb(),
         AmdItlb(),
         AmdUpc(),
         Idle(),
-- 
cgit v1.2.3


From 5ecb1622d000dffb9245df8d264d9c1e7492874f Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 27 Jan 2026 10:44:40 -0800
Subject: perf jevents: Add uncore l3 metric group for AMD

Metrics use the amd_l3 PMU for access/miss/hit information.

Reviewed-by: Sandipan Das <sandipan.das@amd.com>
Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Benjamin Gray <bgray@linux.ibm.com>
Cc: Caleb Biggers <caleb.biggers@intel.com>
Cc: Edward Baker <edward.baker@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jing Zhang <renyu.zj@linux.alibaba.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Leo Yan <leo.yan@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Perry Taylor <perry.taylor@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Falcon <thomas.falcon@intel.com>
Cc: Weilin Wang <weilin.wang@intel.com>
Cc: Xu Yang <xu.yang_2@nxp.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/pmu-events/amd_metrics.py | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

(limited to 'tools')

diff --git a/tools/perf/pmu-events/amd_metrics.py b/tools/perf/pmu-events/amd_metrics.py
index a4ff88de08b5..d71997177239 100755
--- a/tools/perf/pmu-events/amd_metrics.py
+++ b/tools/perf/pmu-events/amd_metrics.py
@@ -317,6 +317,24 @@ def Rapl() -> MetricGroup:
                        description="Processor socket power consumption estimates")
 
 
+def UncoreL3():
+    acc = Event("l3_lookup_state.all_coherent_accesses_to_l3",
+                "l3_lookup_state.all_l3_req_typs")
+    miss = Event("l3_lookup_state.l3_miss",
+                 "l3_comb_clstr_state.request_miss")
+    acc = max(acc, miss)
+    hits = acc - miss
+
+    return MetricGroup("lpm_l3", [
+        Metric("lpm_l3_accesses", "L3 victim cache accesses",
+               d_ratio(acc, interval_sec), "accesses/sec"),
+        Metric("lpm_l3_hits", "L3 victim cache hit rate",
+               d_ratio(hits, acc), "100%"),
+        Metric("lpm_l3_miss", "L3 victim cache miss rate", d_ratio(miss, acc),
+               "100%"),
+    ], description="L3 cache breakdown per CCX")
+
+
 def main() -> None:
     global _args
     global _zen_model
@@ -351,6 +369,7 @@ def main() -> None:
         AmdUpc(),
         Idle(),
         Rapl(),
+        UncoreL3(),
     ])
 
     if _args.metricgroups:
-- 
cgit v1.2.3


From fb4c0581740d99029761f4ad84fc0dbc48913eea Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 27 Jan 2026 10:44:41 -0800
Subject: perf jevents: Add load store breakdown metrics ldst for AMD

Give breakdown of number of instructions. Use the counter mask (cmask)
to show the number of cycles taken to retire the instructions.

Reviewed-by: Sandipan Das <sandipan.das@amd.com>
Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Benjamin Gray <bgray@linux.ibm.com>
Cc: Caleb Biggers <caleb.biggers@intel.com>
Cc: Edward Baker <edward.baker@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jing Zhang <renyu.zj@linux.alibaba.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Leo Yan <leo.yan@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Perry Taylor <perry.taylor@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Falcon <thomas.falcon@intel.com>
Cc: Weilin Wang <weilin.wang@intel.com>
Cc: Xu Yang <xu.yang_2@nxp.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/pmu-events/amd_metrics.py | 75 ++++++++++++++++++++++++++++++++++++
 1 file changed, 75 insertions(+)

(limited to 'tools')

diff --git a/tools/perf/pmu-events/amd_metrics.py b/tools/perf/pmu-events/amd_metrics.py
index d71997177239..b3de74babe40 100755
--- a/tools/perf/pmu-events/amd_metrics.py
+++ b/tools/perf/pmu-events/amd_metrics.py
@@ -280,6 +280,80 @@ def AmdItlb():
     ], description="Instruction TLB breakdown")
 
 
+def AmdLdSt() -> MetricGroup:
+    ldst_ld = Event("ls_dispatch.pure_ld", "ls_dispatch.ld_dispatch")
+    ldst_st = Event("ls_dispatch.pure_st", "ls_dispatch.store_dispatch")
+    ldst_ldc1 = Event(f"{ldst_ld}/cmask=1/")
+    ldst_stc1 = Event(f"{ldst_st}/cmask=1/")
+    ldst_ldc2 = Event(f"{ldst_ld}/cmask=2/")
+    ldst_stc2 = Event(f"{ldst_st}/cmask=2/")
+    ldst_ldc3 = Event(f"{ldst_ld}/cmask=3/")
+    ldst_stc3 = Event(f"{ldst_st}/cmask=3/")
+    ldst_cyc = Event("ls_not_halted_cyc")
+
+    ld_rate = d_ratio(ldst_ld, interval_sec)
+    st_rate = d_ratio(ldst_st, interval_sec)
+
+    ld_v1 = max(ldst_ldc1 - ldst_ldc2, 0)
+    ld_v2 = max(ldst_ldc2 - ldst_ldc3, 0)
+    ld_v3 = ldst_ldc3
+
+    st_v1 = max(ldst_stc1 - ldst_stc2, 0)
+    st_v2 = max(ldst_stc2 - ldst_stc3, 0)
+    st_v3 = ldst_stc3
+
+    return MetricGroup("lpm_ldst", [
+        MetricGroup("lpm_ldst_total", [
+            Metric("lpm_ldst_total_ld", "Number of loads dispatched per second.",
+                   ld_rate, "insns/sec"),
+            Metric("lpm_ldst_total_st", "Number of stores dispatched per second.",
+                   st_rate, "insns/sec"),
+        ]),
+        MetricGroup("lpm_ldst_percent_insn", [
+            Metric("lpm_ldst_percent_insn_ld",
+                   "Load instructions as a percentage of all instructions.",
+                   d_ratio(ldst_ld, ins), "100%"),
+            Metric("lpm_ldst_percent_insn_st",
+                   "Store instructions as a percentage of all instructions.",
+                   d_ratio(ldst_st, ins), "100%"),
+        ]),
+        MetricGroup("lpm_ldst_ret_loads_per_cycle", [
+            Metric(
+                "lpm_ldst_ret_loads_per_cycle_1",
+                "Load instructions retiring in 1 cycle as a percentage of all "
+                "unhalted cycles.", d_ratio(ld_v1, ldst_cyc), "100%"),
+            Metric(
+                "lpm_ldst_ret_loads_per_cycle_2",
+                "Load instructions retiring in 2 cycles as a percentage of all "
+                "unhalted cycles.", d_ratio(ld_v2, ldst_cyc), "100%"),
+            Metric(
+                "lpm_ldst_ret_loads_per_cycle_3",
+                "Load instructions retiring in 3 or more cycles as a percentage"
+                "of all unhalted cycles.", d_ratio(ld_v3, ldst_cyc), "100%"),
+        ]),
+        MetricGroup("lpm_ldst_ret_stores_per_cycle", [
+            Metric(
+                "lpm_ldst_ret_stores_per_cycle_1",
+                "Store instructions retiring in 1 cycle as a percentage of all "
+                "unhalted cycles.", d_ratio(st_v1, ldst_cyc), "100%"),
+            Metric(
+                "lpm_ldst_ret_stores_per_cycle_2",
+                "Store instructions retiring in 2 cycles as a percentage of all "
+                "unhalted cycles.", d_ratio(st_v2, ldst_cyc), "100%"),
+            Metric(
+                "lpm_ldst_ret_stores_per_cycle_3",
+                "Store instructions retiring in 3 or more cycles as a percentage"
+                "of all unhalted cycles.", d_ratio(st_v3, ldst_cyc), "100%"),
+        ]),
+        MetricGroup("lpm_ldst_insn_bt", [
+            Metric("lpm_ldst_insn_bt_ld", "Number of instructions between loads.",
+                   d_ratio(ins, ldst_ld), "insns"),
+            Metric("lpm_ldst_insn_bt_st", "Number of instructions between stores.",
+                   d_ratio(ins, ldst_st), "insns"),
+        ])
+    ], description="Breakdown of load/store instructions")
+
+
 def AmdUpc() -> Metric:
     ops = Event("ex_ret_ops", "ex_ret_cops")
     upc = d_ratio(ops, smt_cycles)
@@ -366,6 +440,7 @@ def main() -> None:
         AmdBr(),
         AmdDtlb(),
         AmdItlb(),
+        AmdLdSt(),
         AmdUpc(),
         Idle(),
         Rapl(),
-- 
cgit v1.2.3


From 3563030d4f77856bd0a32492fbe9edc3d36248ec Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 27 Jan 2026 10:44:42 -0800
Subject: perf jevents: Add context switch metrics for AMD

Metrics break down context switches for different kinds of
instruction.

Reviewed-by: Sandipan Das <sandipan.das@amd.com>
Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Benjamin Gray <bgray@linux.ibm.com>
Cc: Caleb Biggers <caleb.biggers@intel.com>
Cc: Edward Baker <edward.baker@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jing Zhang <renyu.zj@linux.alibaba.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Leo Yan <leo.yan@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Perry Taylor <perry.taylor@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Falcon <thomas.falcon@intel.com>
Cc: Weilin Wang <weilin.wang@intel.com>
Cc: Xu Yang <xu.yang_2@nxp.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/pmu-events/amd_metrics.py | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

(limited to 'tools')

diff --git a/tools/perf/pmu-events/amd_metrics.py b/tools/perf/pmu-events/amd_metrics.py
index b3de74babe40..83e77ccc059e 100755
--- a/tools/perf/pmu-events/amd_metrics.py
+++ b/tools/perf/pmu-events/amd_metrics.py
@@ -122,6 +122,38 @@ def AmdBr():
                        description="breakdown of retired branch instructions")
 
 
+def AmdCtxSw() -> MetricGroup:
+    cs = Event("context\\-switches")
+    metrics = [
+        Metric("lpm_cs_rate", "Context switches per second",
+               d_ratio(cs, interval_sec), "ctxsw/s")
+    ]
+
+    ev = Event("instructions")
+    metrics.append(Metric("lpm_cs_instr", "Instructions per context switch",
+                          d_ratio(ev, cs), "instr/cs"))
+
+    ev = Event("cycles")
+    metrics.append(Metric("lpm_cs_cycles", "Cycles per context switch",
+                          d_ratio(ev, cs), "cycles/cs"))
+
+    ev = Event("ls_dispatch.pure_ld", "ls_dispatch.ld_dispatch")
+    metrics.append(Metric("lpm_cs_loads", "Loads per context switch",
+                          d_ratio(ev, cs), "loads/cs"))
+
+    ev = Event("ls_dispatch.pure_st", "ls_dispatch.store_dispatch")
+    metrics.append(Metric("lpm_cs_stores", "Stores per context switch",
+                          d_ratio(ev, cs), "stores/cs"))
+
+    ev = Event("ex_ret_brn_tkn")
+    metrics.append(Metric("lpm_cs_br_taken", "Branches taken per context switch",
+                          d_ratio(ev, cs), "br_taken/cs"))
+
+    return MetricGroup("lpm_cs", metrics,
+                       description=("Number of context switches per second, instructions "
+                                    "retired & core cycles between context switches"))
+
+
 def AmdDtlb() -> Optional[MetricGroup]:
     global _zen_model
     if _zen_model >= 4:
@@ -438,6 +470,7 @@ def main() -> None:
 
     all_metrics = MetricGroup("", [
         AmdBr(),
+        AmdCtxSw(),
         AmdDtlb(),
         AmdItlb(),
         AmdLdSt(),
-- 
cgit v1.2.3


From bab90b3b46cd74d5f562b0a7b2bef1222e8960af Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 27 Jan 2026 10:44:43 -0800
Subject: perf jevents: Add RAPL metrics for all Intel models

Add a 'cpu_power' metric group that computes the power consumption
from RAPL events if they are present.

Signed-off-by: Ian Rogers <irogers@google.com>
Tested-by: Thomas Falcon <thomas.falcon@intel.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Benjamin Gray <bgray@linux.ibm.com>
Cc: Caleb Biggers <caleb.biggers@intel.com>
Cc: Edward Baker <edward.baker@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jing Zhang <renyu.zj@linux.alibaba.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Leo Yan <leo.yan@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Perry Taylor <perry.taylor@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sandipan Das <sandipan.das@amd.com>
Cc: Weilin Wang <weilin.wang@intel.com>
Cc: Xu Yang <xu.yang_2@nxp.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/pmu-events/intel_metrics.py | 44 +++++++++++++++++++++++++++++++---
 1 file changed, 41 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/pmu-events/intel_metrics.py b/tools/perf/pmu-events/intel_metrics.py
index b287ef115193..61778deedfff 100755
--- a/tools/perf/pmu-events/intel_metrics.py
+++ b/tools/perf/pmu-events/intel_metrics.py
@@ -1,12 +1,48 @@
 #!/usr/bin/env python3
 # SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
 import argparse
+import math
 import os
-from metric import (JsonEncodeMetric, JsonEncodeMetricGroupDescriptions, LoadEvents,
-                    MetricGroup)
+from metric import (d_ratio, has_event, Event, JsonEncodeMetric, JsonEncodeMetricGroupDescriptions,
+                    LoadEvents, Metric, MetricGroup, Select)
 
 # Global command line arguments.
 _args = None
+interval_sec = Event("duration_time")
+
+
+def Rapl() -> MetricGroup:
+    """Processor power consumption estimate.
+
+    Use events from the running average power limit (RAPL) driver.
+    """
+    # Watts = joules/second
+    pkg = Event("power/energy\\-pkg/")
+    cond_pkg = Select(pkg, has_event(pkg), math.nan)
+    cores = Event("power/energy\\-cores/")
+    cond_cores = Select(cores, has_event(cores), math.nan)
+    ram = Event("power/energy\\-ram/")
+    cond_ram = Select(ram, has_event(ram), math.nan)
+    gpu = Event("power/energy\\-gpu/")
+    cond_gpu = Select(gpu, has_event(gpu), math.nan)
+    psys = Event("power/energy\\-psys/")
+    cond_psys = Select(psys, has_event(psys), math.nan)
+    scale = 2.3283064365386962890625e-10
+    metrics = [
+        Metric("lpm_cpu_power_pkg", "",
+               d_ratio(cond_pkg * scale, interval_sec), "Watts"),
+        Metric("lpm_cpu_power_cores", "",
+               d_ratio(cond_cores * scale, interval_sec), "Watts"),
+        Metric("lpm_cpu_power_ram", "",
+               d_ratio(cond_ram * scale, interval_sec), "Watts"),
+        Metric("lpm_cpu_power_gpu", "",
+               d_ratio(cond_gpu * scale, interval_sec), "Watts"),
+        Metric("lpm_cpu_power_psys", "",
+               d_ratio(cond_psys * scale, interval_sec), "Watts"),
+    ]
+
+    return MetricGroup("lpm_cpu_power", metrics,
+                       description="Running Average Power Limit (RAPL) power consumption estimates")
 
 
 def main() -> None:
@@ -33,7 +69,9 @@ def main() -> None:
     directory = f"{_args.events_path}/x86/{_args.model}/"
     LoadEvents(directory)
 
-    all_metrics = MetricGroup("", [])
+    all_metrics = MetricGroup("", [
+        Rapl(),
+    ])
 
     if _args.metricgroups:
         print(JsonEncodeMetricGroupDescriptions(all_metrics))
-- 
cgit v1.2.3


From 1d519e5aa8ee025a13a822ed87fa6c9f249c63b1 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 27 Jan 2026 10:44:44 -0800
Subject: perf jevents: Add idle metric for Intel models

Compute using the msr PMU the percentage of wallclock cycles where the
CPUs are in a low power state.

Signed-off-by: Ian Rogers <irogers@google.com>
Tested-by: Thomas Falcon <thomas.falcon@intel.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Benjamin Gray <bgray@linux.ibm.com>
Cc: Caleb Biggers <caleb.biggers@intel.com>
Cc: Edward Baker <edward.baker@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jing Zhang <renyu.zj@linux.alibaba.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Leo Yan <leo.yan@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Perry Taylor <perry.taylor@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sandipan Das <sandipan.das@amd.com>
Cc: Weilin Wang <weilin.wang@intel.com>
Cc: Xu Yang <xu.yang_2@nxp.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/pmu-events/intel_metrics.py | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/pmu-events/intel_metrics.py b/tools/perf/pmu-events/intel_metrics.py
index 61778deedfff..0cb7a38ad238 100755
--- a/tools/perf/pmu-events/intel_metrics.py
+++ b/tools/perf/pmu-events/intel_metrics.py
@@ -3,14 +3,25 @@
 import argparse
 import math
 import os
-from metric import (d_ratio, has_event, Event, JsonEncodeMetric, JsonEncodeMetricGroupDescriptions,
-                    LoadEvents, Metric, MetricGroup, Select)
+from metric import (d_ratio, has_event, max, Event, JsonEncodeMetric,
+                    JsonEncodeMetricGroupDescriptions, LoadEvents, Metric,
+                    MetricGroup, Select)
 
 # Global command line arguments.
 _args = None
 interval_sec = Event("duration_time")
 
 
+def Idle() -> Metric:
+    cyc = Event("msr/mperf/")
+    tsc = Event("msr/tsc/")
+    low = max(tsc - cyc, 0)
+    return Metric(
+        "lpm_idle",
+        "Percentage of total wallclock cycles where CPUs are in low power state (C1 or deeper sleep state)",
+        d_ratio(low, tsc), "100%")
+
+
 def Rapl() -> MetricGroup:
     """Processor power consumption estimate.
 
@@ -70,6 +81,7 @@ def main() -> None:
     LoadEvents(directory)
 
     all_metrics = MetricGroup("", [
+        Idle(),
         Rapl(),
     ])
 
-- 
cgit v1.2.3


From 61b7b2ef64f8c9cf48ae1b0bfe2ee0bcb9bb3181 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 27 Jan 2026 10:44:45 -0800
Subject: perf jevents: Add CheckPmu to see if a PMU is in loaded JSON events

CheckPmu can be used to determine if hybrid events are present,
allowing for hybrid conditional metrics/events/pmus to be premised on
the JSON files rather than hard coded tables.

Signed-off-by: Ian Rogers <irogers@google.com>
Tested-by: Thomas Falcon <thomas.falcon@intel.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Benjamin Gray <bgray@linux.ibm.com>
Cc: Caleb Biggers <caleb.biggers@intel.com>
Cc: Edward Baker <edward.baker@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jing Zhang <renyu.zj@linux.alibaba.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Leo Yan <leo.yan@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Perry Taylor <perry.taylor@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sandipan Das <sandipan.das@amd.com>
Cc: Weilin Wang <weilin.wang@intel.com>
Cc: Xu Yang <xu.yang_2@nxp.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/pmu-events/metric.py | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'tools')

diff --git a/tools/perf/pmu-events/metric.py b/tools/perf/pmu-events/metric.py
index e33e163b2815..62d1a1e1d458 100644
--- a/tools/perf/pmu-events/metric.py
+++ b/tools/perf/pmu-events/metric.py
@@ -8,10 +8,12 @@ import re
 from enum import Enum
 from typing import Dict, List, Optional, Set, Tuple, Union
 
+all_pmus = set()
 all_events = set()
 
 def LoadEvents(directory: str) -> None:
   """Populate a global set of all known events for the purpose of validating Event names"""
+  global all_pmus
   global all_events
   all_events = {
       "context\\-switches",
@@ -26,6 +28,8 @@ def LoadEvents(directory: str) -> None:
     if filename.endswith(".json"):
       try:
         for x in json.load(open(f"{directory}/{filename}")):
+          if "Unit" in x:
+            all_pmus.add(x["Unit"])
           if "EventName" in x:
             all_events.add(x["EventName"])
           elif "ArchStdEvent" in x:
@@ -36,6 +40,10 @@ def LoadEvents(directory: str) -> None:
         pass
 
 
+def CheckPmu(name: str) -> bool:
+  return name in all_pmus
+
+
 def CheckEvent(name: str) -> bool:
   """Check the event name exists in the set of all loaded events"""
   global all_events
-- 
cgit v1.2.3


From 17d616b7d98dcc15561b83a7e1c78f304b8cea74 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 27 Jan 2026 10:44:46 -0800
Subject: perf jevents: Add smi metric group for Intel models

Allow duplicated metric to be dropped from JSON files.

Signed-off-by: Ian Rogers <irogers@google.com>
Tested-by: Thomas Falcon <thomas.falcon@intel.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Benjamin Gray <bgray@linux.ibm.com>
Cc: Caleb Biggers <caleb.biggers@intel.com>
Cc: Edward Baker <edward.baker@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jing Zhang <renyu.zj@linux.alibaba.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Leo Yan <leo.yan@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Perry Taylor <perry.taylor@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sandipan Das <sandipan.das@amd.com>
Cc: Weilin Wang <weilin.wang@intel.com>
Cc: Xu Yang <xu.yang_2@nxp.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/pmu-events/intel_metrics.py | 24 ++++++++++++++++++++++--
 1 file changed, 22 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/pmu-events/intel_metrics.py b/tools/perf/pmu-events/intel_metrics.py
index 0cb7a38ad238..94604b1b07d8 100755
--- a/tools/perf/pmu-events/intel_metrics.py
+++ b/tools/perf/pmu-events/intel_metrics.py
@@ -3,9 +3,9 @@
 import argparse
 import math
 import os
-from metric import (d_ratio, has_event, max, Event, JsonEncodeMetric,
+from metric import (d_ratio, has_event, max, CheckPmu, Event, JsonEncodeMetric,
                     JsonEncodeMetricGroupDescriptions, LoadEvents, Metric,
-                    MetricGroup, Select)
+                    MetricGroup, MetricRef, Select)
 
 # Global command line arguments.
 _args = None
@@ -56,6 +56,25 @@ def Rapl() -> MetricGroup:
                        description="Running Average Power Limit (RAPL) power consumption estimates")
 
 
+def Smi() -> MetricGroup:
+    pmu = "<cpu_core or cpu_atom>" if CheckPmu("cpu_core") else "cpu"
+    aperf = Event('msr/aperf/')
+    cycles = Event('cycles')
+    smi_num = Event('msr/smi/')
+    smi_cycles = Select(Select((aperf - cycles) / aperf, smi_num > 0, 0),
+                        has_event(aperf),
+                        0)
+    return MetricGroup('smi', [
+        Metric('smi_num', 'Number of SMI interrupts.',
+               Select(smi_num, has_event(smi_num), 0), 'SMI#'),
+        # Note, the smi_cycles "Event" is really a reference to the metric.
+        Metric('smi_cycles',
+               'Percentage of cycles spent in System Management Interrupts. '
+               f'Requires /sys/bus/event_source/devices/{pmu}/freeze_on_smi to be 1.',
+               smi_cycles, '100%', threshold=(MetricRef('smi_cycles') > 0.10))
+    ], description='System Management Interrupt metrics')
+
+
 def main() -> None:
     global _args
 
@@ -83,6 +102,7 @@ def main() -> None:
     all_metrics = MetricGroup("", [
         Idle(),
         Rapl(),
+        Smi(),
     ])
 
     if _args.metricgroups:
-- 
cgit v1.2.3


From 7eb9fa417c0218fd4e3b8d18894d7b89c15fb8ba Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 27 Jan 2026 10:44:47 -0800
Subject: perf jevents: Mark metrics with experimental events as experimental

When metrics are made with experimental events it is desirable the
metric description also carries this information in case of metric
inaccuracies.

Suggested-by: Perry Taylor <perry.taylor@intel.com>
Signed-off-by: Ian Rogers <irogers@google.com>
Tested-by: Thomas Falcon <thomas.falcon@intel.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Benjamin Gray <bgray@linux.ibm.com>
Cc: Caleb Biggers <caleb.biggers@intel.com>
Cc: Edward Baker <edward.baker@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jing Zhang <renyu.zj@linux.alibaba.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Leo Yan <leo.yan@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sandipan Das <sandipan.das@amd.com>
Cc: Weilin Wang <weilin.wang@intel.com>
Cc: Xu Yang <xu.yang_2@nxp.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/pmu-events/metric.py | 44 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)

(limited to 'tools')

diff --git a/tools/perf/pmu-events/metric.py b/tools/perf/pmu-events/metric.py
index 62d1a1e1d458..2029b6e28365 100644
--- a/tools/perf/pmu-events/metric.py
+++ b/tools/perf/pmu-events/metric.py
@@ -10,11 +10,13 @@ from typing import Dict, List, Optional, Set, Tuple, Union
 
 all_pmus = set()
 all_events = set()
+experimental_events = set()
 
 def LoadEvents(directory: str) -> None:
   """Populate a global set of all known events for the purpose of validating Event names"""
   global all_pmus
   global all_events
+  global experimental_events
   all_events = {
       "context\\-switches",
       "cpu\\-cycles",
@@ -32,6 +34,8 @@ def LoadEvents(directory: str) -> None:
             all_pmus.add(x["Unit"])
           if "EventName" in x:
             all_events.add(x["EventName"])
+            if "Experimental" in x and x["Experimental"] == "1":
+              experimental_events.add(x["EventName"])
           elif "ArchStdEvent" in x:
             all_events.add(x["ArchStdEvent"])
       except json.decoder.JSONDecodeError:
@@ -61,6 +65,18 @@ def CheckEvent(name: str) -> bool:
   return name in all_events
 
 
+def IsExperimentalEvent(name: str) -> bool:
+  global experimental_events
+  if ':' in name:
+    # Remove trailing modifier.
+    name = name[:name.find(':')]
+  elif '/' in name:
+    # Name could begin with a PMU or an event, for now assume it is not experimental.
+    return False
+
+  return name in experimental_events
+
+
 class MetricConstraint(Enum):
   GROUPED_EVENTS = 0
   NO_GROUP_EVENTS = 1
@@ -82,6 +98,10 @@ class Expression:
     """Returns a simplified version of self."""
     raise NotImplementedError()
 
+  def HasExperimentalEvents(self) -> bool:
+    """Are experimental events used in the expression?"""
+    raise NotImplementedError()
+
   def Equals(self, other) -> bool:
     """Returns true when two expressions are the same."""
     raise NotImplementedError()
@@ -249,6 +269,9 @@ class Operator(Expression):
 
     return Operator(self.operator, lhs, rhs)
 
+  def HasExperimentalEvents(self) -> bool:
+    return self.lhs.HasExperimentalEvents() or self.rhs.HasExperimentalEvents()
+
   def Equals(self, other: Expression) -> bool:
     if isinstance(other, Operator):
       return self.operator == other.operator and self.lhs.Equals(
@@ -297,6 +320,10 @@ class Select(Expression):
 
     return Select(true_val, cond, false_val)
 
+  def HasExperimentalEvents(self) -> bool:
+    return (self.cond.HasExperimentalEvents() or self.true_val.HasExperimentalEvents() or
+            self.false_val.HasExperimentalEvents())
+
   def Equals(self, other: Expression) -> bool:
     if isinstance(other, Select):
       return self.cond.Equals(other.cond) and self.false_val.Equals(
@@ -345,6 +372,9 @@ class Function(Expression):
 
     return Function(self.fn, lhs, rhs)
 
+  def HasExperimentalEvents(self) -> bool:
+    return self.lhs.HasExperimentalEvents() or (self.rhs and self.rhs.HasExperimentalEvents())
+
   def Equals(self, other: Expression) -> bool:
     if isinstance(other, Function):
       result = self.fn == other.fn and self.lhs.Equals(other.lhs)
@@ -384,6 +414,9 @@ class Event(Expression):
     global all_events
     raise Exception(f"No event {error} in:\n{all_events}")
 
+  def HasExperimentalEvents(self) -> bool:
+    return IsExperimentalEvent(self.name)
+
   def ToPerfJson(self):
     result = re.sub('/', '@', self.name)
     return result
@@ -416,6 +449,9 @@ class MetricRef(Expression):
   def Simplify(self) -> Expression:
     return self
 
+  def HasExperimentalEvents(self) -> bool:
+    return False
+
   def Equals(self, other: Expression) -> bool:
     return isinstance(other, MetricRef) and self.name == other.name
 
@@ -443,6 +479,9 @@ class Constant(Expression):
   def Simplify(self) -> Expression:
     return self
 
+  def HasExperimentalEvents(self) -> bool:
+    return False
+
   def Equals(self, other: Expression) -> bool:
     return isinstance(other, Constant) and self.value == other.value
 
@@ -465,6 +504,9 @@ class Literal(Expression):
   def Simplify(self) -> Expression:
     return self
 
+  def HasExperimentalEvents(self) -> bool:
+    return False
+
   def Equals(self, other: Expression) -> bool:
     return isinstance(other, Literal) and self.value == other.value
 
@@ -527,6 +569,8 @@ class Metric:
     self.name = name
     self.description = description
     self.expr = expr.Simplify()
+    if self.expr.HasExperimentalEvents():
+      self.description += " (metric should be considered experimental as it contains experimental events)."
     # Workraound valid_only_metric hiding certain metrics based on unit.
     scale_unit = scale_unit.replace('/sec', ' per sec')
     if scale_unit[0].isdigit():
-- 
cgit v1.2.3


From 8c345f35003269eaceeb84b3b14588aeb2bae6f7 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 27 Jan 2026 10:44:48 -0800
Subject: perf jevents: Add tsx metric group for Intel models

Allow duplicated metric to be dropped from JSON files. Detect when TSX
is supported by a model by using the JSON events, use sysfs events at
runtime as hypervisors, etc. may disable TSX.

Add CheckPmu to metric to determine if which PMUs have been associated
with the loaded events.

Signed-off-by: Ian Rogers <irogers@google.com>
Tested-by: Thomas Falcon <thomas.falcon@intel.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Benjamin Gray <bgray@linux.ibm.com>
Cc: Caleb Biggers <caleb.biggers@intel.com>
Cc: Edward Baker <edward.baker@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jing Zhang <renyu.zj@linux.alibaba.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Leo Yan <leo.yan@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Perry Taylor <perry.taylor@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sandipan Das <sandipan.das@amd.com>
Cc: Weilin Wang <weilin.wang@intel.com>
Cc: Xu Yang <xu.yang_2@nxp.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/pmu-events/intel_metrics.py | 50 ++++++++++++++++++++++++++++++++++
 1 file changed, 50 insertions(+)

(limited to 'tools')

diff --git a/tools/perf/pmu-events/intel_metrics.py b/tools/perf/pmu-events/intel_metrics.py
index 94604b1b07d8..05f3d94ec5d5 100755
--- a/tools/perf/pmu-events/intel_metrics.py
+++ b/tools/perf/pmu-events/intel_metrics.py
@@ -3,6 +3,7 @@
 import argparse
 import math
 import os
+from typing import Optional
 from metric import (d_ratio, has_event, max, CheckPmu, Event, JsonEncodeMetric,
                     JsonEncodeMetricGroupDescriptions, LoadEvents, Metric,
                     MetricGroup, MetricRef, Select)
@@ -75,6 +76,54 @@ def Smi() -> MetricGroup:
     ], description='System Management Interrupt metrics')
 
 
+def Tsx() -> Optional[MetricGroup]:
+    pmu = "cpu_core" if CheckPmu("cpu_core") else "cpu"
+    cycles = Event('cycles')
+    cycles_in_tx = Event(f'{pmu}/cycles\\-t/')
+    cycles_in_tx_cp = Event(f'{pmu}/cycles\\-ct/')
+    try:
+        # Test if the tsx event is present in the json, prefer the
+        # sysfs version so that we can detect its presence at runtime.
+        transaction_start = Event("RTM_RETIRED.START")
+        transaction_start = Event(f'{pmu}/tx\\-start/')
+    except:
+        return None
+
+    elision_start = None
+    try:
+        # Elision start isn't supported by all models, but we'll not
+        # generate the tsx_cycles_per_elision metric in that
+        # case. Again, prefer the sysfs encoding of the event.
+        elision_start = Event("HLE_RETIRED.START")
+        elision_start = Event(f'{pmu}/el\\-start/')
+    except:
+        pass
+
+    return MetricGroup('transaction', [
+        Metric('tsx_transactional_cycles',
+               'Percentage of cycles within a transaction region.',
+               Select(cycles_in_tx / cycles, has_event(cycles_in_tx), 0),
+               '100%'),
+        Metric('tsx_aborted_cycles', 'Percentage of cycles in aborted transactions.',
+               Select(max(cycles_in_tx - cycles_in_tx_cp, 0) / cycles,
+                      has_event(cycles_in_tx),
+                      0),
+               '100%'),
+        Metric('tsx_cycles_per_transaction',
+               'Number of cycles within a transaction divided by the number of transactions.',
+               Select(cycles_in_tx / transaction_start,
+                      has_event(cycles_in_tx),
+                      0),
+               "cycles / transaction"),
+        Metric('tsx_cycles_per_elision',
+               'Number of cycles within a transaction divided by the number of elisions.',
+               Select(cycles_in_tx / elision_start,
+                      has_event(elision_start),
+                      0),
+               "cycles / elision") if elision_start else None,
+    ], description="Breakdown of transactional memory statistics")
+
+
 def main() -> None:
     global _args
 
@@ -103,6 +152,7 @@ def main() -> None:
         Idle(),
         Rapl(),
         Smi(),
+        Tsx(),
     ])
 
     if _args.metricgroups:
-- 
cgit v1.2.3


From 37d0b00a1ac85309e63700153049bc16fc446b19 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 27 Jan 2026 10:44:49 -0800
Subject: perf jevents: Add br metric group for branch statistics on Intel

The br metric group for branches itself comprises metric groups for
total, taken, conditional, fused and far metric groups using JSON
events.

Conditional taken and not taken metrics are specific to Icelake and
later generations, so the presence of the event is used to determine
whether the metric should exist.

Signed-off-by: Ian Rogers <irogers@google.com>
Tested-by: Thomas Falcon <thomas.falcon@intel.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Benjamin Gray <bgray@linux.ibm.com>
Cc: Caleb Biggers <caleb.biggers@intel.com>
Cc: Edward Baker <edward.baker@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jing Zhang <renyu.zj@linux.alibaba.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Leo Yan <leo.yan@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Perry Taylor <perry.taylor@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sandipan Das <sandipan.das@amd.com>
Cc: Weilin Wang <weilin.wang@intel.com>
Cc: Xu Yang <xu.yang_2@nxp.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/pmu-events/intel_metrics.py | 138 +++++++++++++++++++++++++++++++++
 1 file changed, 138 insertions(+)

(limited to 'tools')

diff --git a/tools/perf/pmu-events/intel_metrics.py b/tools/perf/pmu-events/intel_metrics.py
index 05f3d94ec5d5..e1944d821248 100755
--- a/tools/perf/pmu-events/intel_metrics.py
+++ b/tools/perf/pmu-events/intel_metrics.py
@@ -124,6 +124,143 @@ def Tsx() -> Optional[MetricGroup]:
     ], description="Breakdown of transactional memory statistics")
 
 
+def IntelBr():
+    ins = Event("instructions")
+
+    def Total() -> MetricGroup:
+        br_all = Event("BR_INST_RETIRED.ALL_BRANCHES", "BR_INST_RETIRED.ANY")
+        br_m_all = Event("BR_MISP_RETIRED.ALL_BRANCHES",
+                         "BR_INST_RETIRED.MISPRED",
+                         "BR_MISP_EXEC.ANY")
+        br_clr = None
+        try:
+            br_clr = Event("BACLEARS.ANY", "BACLEARS.ALL")
+        except:
+            pass
+
+        br_r = d_ratio(br_all, interval_sec)
+        ins_r = d_ratio(ins, br_all)
+        misp_r = d_ratio(br_m_all, br_all)
+        clr_r = d_ratio(br_clr, interval_sec) if br_clr else None
+
+        return MetricGroup("lpm_br_total", [
+            Metric("lpm_br_total_retired",
+                   "The number of branch instructions retired per second.", br_r,
+                   "insn/s"),
+            Metric(
+                "lpm_br_total_mispred",
+                "The number of branch instructions retired, of any type, that were "
+                "not correctly predicted as a percentage of all branch instrucions.",
+                misp_r, "100%"),
+            Metric("lpm_br_total_insn_between_branches",
+                   "The number of instructions divided by the number of branches.",
+                   ins_r, "insn"),
+            Metric("lpm_br_total_insn_fe_resteers",
+                   "The number of resync branches per second.", clr_r, "req/s"
+                   ) if clr_r else None
+        ])
+
+    def Taken() -> MetricGroup:
+        br_all = Event("BR_INST_RETIRED.ALL_BRANCHES", "BR_INST_RETIRED.ANY")
+        br_m_tk = None
+        try:
+            br_m_tk = Event("BR_MISP_RETIRED.NEAR_TAKEN",
+                            "BR_MISP_RETIRED.TAKEN_JCC",
+                            "BR_INST_RETIRED.MISPRED_TAKEN")
+        except:
+            pass
+        br_r = d_ratio(br_all, interval_sec)
+        ins_r = d_ratio(ins, br_all)
+        misp_r = d_ratio(br_m_tk, br_all) if br_m_tk else None
+        return MetricGroup("lpm_br_taken", [
+            Metric("lpm_br_taken_retired",
+                   "The number of taken branches that were retired per second.",
+                   br_r, "insn/s"),
+            Metric(
+                "lpm_br_taken_mispred",
+                "The number of retired taken branch instructions that were "
+                "mispredicted as a percentage of all taken branches.", misp_r,
+                "100%") if misp_r else None,
+            Metric(
+                "lpm_br_taken_insn_between_branches",
+                "The number of instructions divided by the number of taken branches.",
+                ins_r, "insn"),
+        ])
+
+    def Conditional() -> Optional[MetricGroup]:
+        try:
+            br_cond = Event("BR_INST_RETIRED.COND",
+                            "BR_INST_RETIRED.CONDITIONAL",
+                            "BR_INST_RETIRED.TAKEN_JCC")
+            br_m_cond = Event("BR_MISP_RETIRED.COND",
+                              "BR_MISP_RETIRED.CONDITIONAL",
+                              "BR_MISP_RETIRED.TAKEN_JCC")
+        except:
+            return None
+
+        br_cond_nt = None
+        br_m_cond_nt = None
+        try:
+            br_cond_nt = Event("BR_INST_RETIRED.COND_NTAKEN")
+            br_m_cond_nt = Event("BR_MISP_RETIRED.COND_NTAKEN")
+        except:
+            pass
+        br_r = d_ratio(br_cond, interval_sec)
+        ins_r = d_ratio(ins, br_cond)
+        misp_r = d_ratio(br_m_cond, br_cond)
+        taken_metrics = [
+            Metric("lpm_br_cond_retired", "Retired conditional branch instructions.",
+                   br_r, "insn/s"),
+            Metric("lpm_br_cond_insn_between_branches",
+                   "The number of instructions divided by the number of conditional "
+                   "branches.", ins_r, "insn"),
+            Metric("lpm_br_cond_mispred",
+                   "Retired conditional branch instructions mispredicted as a "
+                   "percentage of all conditional branches.", misp_r, "100%"),
+        ]
+        if not br_m_cond_nt:
+            return MetricGroup("lpm_br_cond", taken_metrics)
+
+        br_r = d_ratio(br_cond_nt, interval_sec)
+        ins_r = d_ratio(ins, br_cond_nt)
+        misp_r = d_ratio(br_m_cond_nt, br_cond_nt)
+
+        not_taken_metrics = [
+            Metric("lpm_br_cond_retired", "Retired conditional not taken branch instructions.",
+                   br_r, "insn/s"),
+            Metric("lpm_br_cond_insn_between_branches",
+                   "The number of instructions divided by the number of not taken conditional "
+                   "branches.", ins_r, "insn"),
+            Metric("lpm_br_cond_mispred",
+                   "Retired not taken conditional branch instructions mispredicted as a "
+                   "percentage of all not taken conditional branches.", misp_r, "100%"),
+        ]
+        return MetricGroup("lpm_br_cond", [
+            MetricGroup("lpm_br_cond_nt", not_taken_metrics),
+            MetricGroup("lpm_br_cond_tkn", taken_metrics),
+        ])
+
+    def Far() -> Optional[MetricGroup]:
+        try:
+            br_far = Event("BR_INST_RETIRED.FAR_BRANCH")
+        except:
+            return None
+
+        br_r = d_ratio(br_far, interval_sec)
+        ins_r = d_ratio(ins, br_far)
+        return MetricGroup("lpm_br_far", [
+            Metric("lpm_br_far_retired", "Retired far control transfers per second.",
+                   br_r, "insn/s"),
+            Metric(
+                "lpm_br_far_insn_between_branches",
+                "The number of instructions divided by the number of far branches.",
+                ins_r, "insn"),
+        ])
+
+    return MetricGroup("lpm_br", [Total(), Taken(), Conditional(), Far()],
+                       description="breakdown of retired branch instructions")
+
+
 def main() -> None:
     global _args
 
@@ -153,6 +290,7 @@ def main() -> None:
         Rapl(),
         Smi(),
         Tsx(),
+        IntelBr(),
     ])
 
     if _args.metricgroups:
-- 
cgit v1.2.3


From 397fdb3a24435f55c4c675726d1b214954e7aa53 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 27 Jan 2026 10:44:50 -0800
Subject: perf jevents: Add software prefetch (swpf) metric group for Intel

Add metrics that breakdown software prefetch instruction use.

Signed-off-by: Ian Rogers <irogers@google.com>
Tested-by: Thomas Falcon <thomas.falcon@intel.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Benjamin Gray <bgray@linux.ibm.com>
Cc: Caleb Biggers <caleb.biggers@intel.com>
Cc: Edward Baker <edward.baker@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jing Zhang <renyu.zj@linux.alibaba.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Leo Yan <leo.yan@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Perry Taylor <perry.taylor@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sandipan Das <sandipan.das@amd.com>
Cc: Weilin Wang <weilin.wang@intel.com>
Cc: Xu Yang <xu.yang_2@nxp.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/pmu-events/intel_metrics.py | 66 ++++++++++++++++++++++++++++++++++
 1 file changed, 66 insertions(+)

(limited to 'tools')

diff --git a/tools/perf/pmu-events/intel_metrics.py b/tools/perf/pmu-events/intel_metrics.py
index e1944d821248..919a058c343a 100755
--- a/tools/perf/pmu-events/intel_metrics.py
+++ b/tools/perf/pmu-events/intel_metrics.py
@@ -261,6 +261,71 @@ def IntelBr():
                        description="breakdown of retired branch instructions")
 
 
+def IntelSwpf() -> Optional[MetricGroup]:
+    ins = Event("instructions")
+    try:
+        s_ld = Event("MEM_INST_RETIRED.ALL_LOADS",
+                     "MEM_UOPS_RETIRED.ALL_LOADS")
+        s_nta = Event("SW_PREFETCH_ACCESS.NTA")
+        s_t0 = Event("SW_PREFETCH_ACCESS.T0")
+        s_t1 = Event("SW_PREFETCH_ACCESS.T1_T2")
+        s_w = Event("SW_PREFETCH_ACCESS.PREFETCHW")
+    except:
+        return None
+
+    all_sw = s_nta + s_t0 + s_t1 + s_w
+    swp_r = d_ratio(all_sw, interval_sec)
+    ins_r = d_ratio(ins, all_sw)
+    ld_r = d_ratio(s_ld, all_sw)
+
+    return MetricGroup("lpm_swpf", [
+        MetricGroup("lpm_swpf_totals", [
+            Metric("lpm_swpf_totals_exec", "Software prefetch instructions per second",
+                   swp_r, "swpf/s"),
+            Metric("lpm_swpf_totals_insn_per_pf",
+                   "Average number of instructions between software prefetches",
+                   ins_r, "insn/swpf"),
+            Metric("lpm_swpf_totals_loads_per_pf",
+                   "Average number of loads between software prefetches",
+                   ld_r, "loads/swpf"),
+        ]),
+        MetricGroup("lpm_swpf_bkdwn", [
+            MetricGroup("lpm_swpf_bkdwn_nta", [
+                Metric("lpm_swpf_bkdwn_nta_per_swpf",
+                       "Software prefetch NTA instructions as a percent of all prefetch instructions",
+                       d_ratio(s_nta, all_sw), "100%"),
+                Metric("lpm_swpf_bkdwn_nta_rate",
+                       "Software prefetch NTA instructions per second",
+                       d_ratio(s_nta, interval_sec), "insn/s"),
+            ]),
+            MetricGroup("lpm_swpf_bkdwn_t0", [
+                Metric("lpm_swpf_bkdwn_t0_per_swpf",
+                       "Software prefetch T0 instructions as a percent of all prefetch instructions",
+                       d_ratio(s_t0, all_sw), "100%"),
+                Metric("lpm_swpf_bkdwn_t0_rate",
+                       "Software prefetch T0 instructions per second",
+                       d_ratio(s_t0, interval_sec), "insn/s"),
+            ]),
+            MetricGroup("lpm_swpf_bkdwn_t1_t2", [
+                Metric("lpm_swpf_bkdwn_t1_t2_per_swpf",
+                       "Software prefetch T1 or T2 instructions as a percent of all prefetch instructions",
+                       d_ratio(s_t1, all_sw), "100%"),
+                Metric("lpm_swpf_bkdwn_t1_t2_rate",
+                       "Software prefetch T1 or T2 instructions per second",
+                       d_ratio(s_t1, interval_sec), "insn/s"),
+            ]),
+            MetricGroup("lpm_swpf_bkdwn_w", [
+                Metric("lpm_swpf_bkdwn_w_per_swpf",
+                       "Software prefetch W instructions as a percent of all prefetch instructions",
+                       d_ratio(s_w, all_sw), "100%"),
+                Metric("lpm_swpf_bkdwn_w_rate",
+                       "Software prefetch W instructions per second",
+                       d_ratio(s_w, interval_sec), "insn/s"),
+            ]),
+        ]),
+    ], description="Software prefetch instruction breakdown")
+
+
 def main() -> None:
     global _args
 
@@ -291,6 +356,7 @@ def main() -> None:
         Smi(),
         Tsx(),
         IntelBr(),
+        IntelSwpf(),
     ])
 
     if _args.metricgroups:
-- 
cgit v1.2.3


From cd1c6a487407a350970d1bea9d3e674f2a281179 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 27 Jan 2026 10:44:51 -0800
Subject: perf jevents: Add ports metric group giving utilization on Intel

The ports metric group contains a metric for each port giving its
utilization as a ratio of cycles.

The metrics are created by looking for UOPS_DISPATCHED.PORT events.

Signed-off-by: Ian Rogers <irogers@google.com>
Tested-by: Thomas Falcon <thomas.falcon@intel.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Benjamin Gray <bgray@linux.ibm.com>
Cc: Caleb Biggers <caleb.biggers@intel.com>
Cc: Edward Baker <edward.baker@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jing Zhang <renyu.zj@linux.alibaba.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Leo Yan <leo.yan@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Perry Taylor <perry.taylor@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sandipan Das <sandipan.das@amd.com>
Cc: Weilin Wang <weilin.wang@intel.com>
Cc: Xu Yang <xu.yang_2@nxp.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/pmu-events/intel_metrics.py | 35 ++++++++++++++++++++++++++++++++--
 1 file changed, 33 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/pmu-events/intel_metrics.py b/tools/perf/pmu-events/intel_metrics.py
index 919a058c343a..7fcc0a1c544d 100755
--- a/tools/perf/pmu-events/intel_metrics.py
+++ b/tools/perf/pmu-events/intel_metrics.py
@@ -1,12 +1,14 @@
 #!/usr/bin/env python3
 # SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
 import argparse
+import json
 import math
 import os
+import re
 from typing import Optional
 from metric import (d_ratio, has_event, max, CheckPmu, Event, JsonEncodeMetric,
-                    JsonEncodeMetricGroupDescriptions, LoadEvents, Metric,
-                    MetricGroup, MetricRef, Select)
+                    JsonEncodeMetricGroupDescriptions, Literal, LoadEvents,
+                    Metric, MetricGroup, MetricRef, Select)
 
 # Global command line arguments.
 _args = None
@@ -261,6 +263,34 @@ def IntelBr():
                        description="breakdown of retired branch instructions")
 
 
+def IntelPorts() -> Optional[MetricGroup]:
+    pipeline_events = json.load(
+        open(f"{_args.events_path}/x86/{_args.model}/pipeline.json"))
+
+    core_cycles = Event("CPU_CLK_UNHALTED.THREAD_P_ANY",
+                        "CPU_CLK_UNHALTED.DISTRIBUTED",
+                        "cycles")
+    # Number of CPU cycles scaled for SMT.
+    smt_cycles = Select(core_cycles / 2, Literal("#smt_on"), core_cycles)
+
+    metrics = []
+    for x in pipeline_events:
+        if "EventName" in x and re.search("^UOPS_DISPATCHED.PORT", x["EventName"]):
+            name = x["EventName"]
+            port = re.search(r"(PORT_[0-9].*)", name).group(0).lower()
+            if name.endswith("_CORE"):
+                cyc = core_cycles
+            else:
+                cyc = smt_cycles
+            metrics.append(Metric(f"lpm_{port}", f"{port} utilization (higher is better)",
+                                  d_ratio(Event(name), cyc), "100%"))
+    if len(metrics) == 0:
+        return None
+
+    return MetricGroup("lpm_ports", metrics, "functional unit (port) utilization -- "
+                       "fraction of cycles each port is utilized (higher is better)")
+
+
 def IntelSwpf() -> Optional[MetricGroup]:
     ins = Event("instructions")
     try:
@@ -356,6 +386,7 @@ def main() -> None:
         Smi(),
         Tsx(),
         IntelBr(),
+        IntelPorts(),
         IntelSwpf(),
     ])
 
-- 
cgit v1.2.3


From 7413633e255cb02d59e9451e55fbe9e50310db18 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 27 Jan 2026 10:44:52 -0800
Subject: perf jevents: Add L2 metrics for Intel

Give a breakdown of various L2 counters as metrics, including totals,
reads, hardware prefetcher, RFO, code and evictions.

Signed-off-by: Ian Rogers <irogers@google.com>
Tested-by: Thomas Falcon <thomas.falcon@intel.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Benjamin Gray <bgray@linux.ibm.com>
Cc: Caleb Biggers <caleb.biggers@intel.com>
Cc: Edward Baker <edward.baker@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jing Zhang <renyu.zj@linux.alibaba.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Leo Yan <leo.yan@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Perry Taylor <perry.taylor@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sandipan Das <sandipan.das@amd.com>
Cc: Weilin Wang <weilin.wang@intel.com>
Cc: Xu Yang <xu.yang_2@nxp.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/pmu-events/intel_metrics.py | 170 +++++++++++++++++++++++++++++++++
 1 file changed, 170 insertions(+)

(limited to 'tools')

diff --git a/tools/perf/pmu-events/intel_metrics.py b/tools/perf/pmu-events/intel_metrics.py
index 7fcc0a1c544d..d190d97f4aff 100755
--- a/tools/perf/pmu-events/intel_metrics.py
+++ b/tools/perf/pmu-events/intel_metrics.py
@@ -263,6 +263,175 @@ def IntelBr():
                        description="breakdown of retired branch instructions")
 
 
+def IntelL2() -> Optional[MetricGroup]:
+    try:
+        DC_HIT = Event("L2_RQSTS.DEMAND_DATA_RD_HIT")
+    except:
+        return None
+    try:
+        DC_MISS = Event("L2_RQSTS.DEMAND_DATA_RD_MISS")
+        l2_dmnd_miss = DC_MISS
+        l2_dmnd_rd_all = DC_MISS + DC_HIT
+    except:
+        DC_ALL = Event("L2_RQSTS.ALL_DEMAND_DATA_RD")
+        l2_dmnd_miss = DC_ALL - DC_HIT
+        l2_dmnd_rd_all = DC_ALL
+    l2_dmnd_mrate = d_ratio(l2_dmnd_miss, interval_sec)
+    l2_dmnd_rrate = d_ratio(l2_dmnd_rd_all, interval_sec)
+
+    DC_PFH = None
+    DC_PFM = None
+    l2_pf_all = None
+    l2_pf_mrate = None
+    l2_pf_rrate = None
+    try:
+        DC_PFH = Event("L2_RQSTS.PF_HIT")
+        DC_PFM = Event("L2_RQSTS.PF_MISS")
+        l2_pf_all = DC_PFH + DC_PFM
+        l2_pf_mrate = d_ratio(DC_PFM, interval_sec)
+        l2_pf_rrate = d_ratio(l2_pf_all, interval_sec)
+    except:
+        pass
+
+    DC_RFOH = None
+    DC_RFOM = None
+    l2_rfo_all = None
+    l2_rfo_mrate = None
+    l2_rfo_rrate = None
+    try:
+        DC_RFOH = Event("L2_RQSTS.RFO_HIT")
+        DC_RFOM = Event("L2_RQSTS.RFO_MISS")
+        l2_rfo_all = DC_RFOH + DC_RFOM
+        l2_rfo_mrate = d_ratio(DC_RFOM, interval_sec)
+        l2_rfo_rrate = d_ratio(l2_rfo_all, interval_sec)
+    except:
+        pass
+
+    DC_CH = None
+    try:
+        DC_CH = Event("L2_RQSTS.CODE_RD_HIT")
+    except:
+        pass
+    DC_CM = Event("L2_RQSTS.CODE_RD_MISS")
+    DC_IN = Event("L2_LINES_IN.ALL")
+    DC_OUT_NS = None
+    DC_OUT_S = None
+    l2_lines_out = None
+    l2_out_rate = None
+    wbn = None
+    isd = None
+    try:
+        DC_OUT_NS = Event("L2_LINES_OUT.NON_SILENT",
+                          "L2_LINES_OUT.DEMAND_DIRTY",
+                          "L2_LINES_IN.S")
+        DC_OUT_S = Event("L2_LINES_OUT.SILENT",
+                         "L2_LINES_OUT.DEMAND_CLEAN",
+                         "L2_LINES_IN.I")
+        if DC_OUT_S.name == "L2_LINES_OUT.SILENT" and (
+                args.model.startswith("skylake") or
+                args.model == "cascadelakex"):
+            DC_OUT_S.name = "L2_LINES_OUT.SILENT/any/"
+        # bring is back to per-CPU
+        l2_s = Select(DC_OUT_S / 2, Literal("#smt_on"), DC_OUT_S)
+        l2_ns = DC_OUT_NS
+        l2_lines_out = l2_s + l2_ns
+        l2_out_rate = d_ratio(l2_lines_out, interval_sec)
+        nlr = max(l2_ns - DC_WB_U - DC_WB_D, 0)
+        wbn = d_ratio(nlr, interval_sec)
+        isd = d_ratio(l2_s, interval_sec)
+    except:
+        pass
+    DC_OUT_U = None
+    l2_pf_useless = None
+    l2_useless_rate = None
+    try:
+        DC_OUT_U = Event("L2_LINES_OUT.USELESS_HWPF")
+        l2_pf_useless = DC_OUT_U
+        l2_useless_rate = d_ratio(l2_pf_useless, interval_sec)
+    except:
+        pass
+    DC_WB_U = None
+    DC_WB_D = None
+    wbu = None
+    wbd = None
+    try:
+        DC_WB_U = Event("IDI_MISC.WB_UPGRADE")
+        DC_WB_D = Event("IDI_MISC.WB_DOWNGRADE")
+        wbu = d_ratio(DC_WB_U, interval_sec)
+        wbd = d_ratio(DC_WB_D, interval_sec)
+    except:
+        pass
+
+    l2_lines_in = DC_IN
+    l2_code_all = (DC_CH + DC_CM) if DC_CH else None
+    l2_code_rate = d_ratio(l2_code_all, interval_sec) if DC_CH else None
+    l2_code_miss_rate = d_ratio(DC_CM, interval_sec)
+    l2_in_rate = d_ratio(l2_lines_in, interval_sec)
+
+    return MetricGroup("lpm_l2", [
+        MetricGroup("lpm_l2_totals", [
+            Metric("lpm_l2_totals_in", "L2 cache total in per second",
+                   l2_in_rate, "In/s"),
+            Metric("lpm_l2_totals_out", "L2 cache total out per second",
+                   l2_out_rate, "Out/s") if l2_out_rate else None,
+        ]),
+        MetricGroup("lpm_l2_rd", [
+            Metric("lpm_l2_rd_hits", "L2 cache data read hits",
+                   d_ratio(DC_HIT, l2_dmnd_rd_all), "100%"),
+            Metric("lpm_l2_rd_hits", "L2 cache data read hits",
+                   d_ratio(l2_dmnd_miss, l2_dmnd_rd_all), "100%"),
+            Metric("lpm_l2_rd_requests", "L2 cache data read requests per second",
+                   l2_dmnd_rrate, "requests/s"),
+            Metric("lpm_l2_rd_misses", "L2 cache data read misses per second",
+                   l2_dmnd_mrate, "misses/s"),
+        ]),
+        MetricGroup("lpm_l2_hwpf", [
+            Metric("lpm_l2_hwpf_hits", "L2 cache hardware prefetcher hits",
+                   d_ratio(DC_PFH, l2_pf_all), "100%"),
+            Metric("lpm_l2_hwpf_misses", "L2 cache hardware prefetcher misses",
+                   d_ratio(DC_PFM, l2_pf_all), "100%"),
+            Metric("lpm_l2_hwpf_useless", "L2 cache hardware prefetcher useless prefetches per second",
+                   l2_useless_rate, "100%") if l2_useless_rate else None,
+            Metric("lpm_l2_hwpf_requests", "L2 cache hardware prefetcher requests per second",
+                   l2_pf_rrate, "100%"),
+            Metric("lpm_l2_hwpf_misses", "L2 cache hardware prefetcher misses per second",
+                   l2_pf_mrate, "100%"),
+        ]) if DC_PFH else None,
+        MetricGroup("lpm_l2_rfo", [
+            Metric("lpm_l2_rfo_hits", "L2 cache request for ownership (RFO) hits",
+                   d_ratio(DC_RFOH, l2_rfo_all), "100%"),
+            Metric("lpm_l2_rfo_misses", "L2 cache request for ownership (RFO) misses",
+                   d_ratio(DC_RFOM, l2_rfo_all), "100%"),
+            Metric("lpm_l2_rfo_requests", "L2 cache request for ownership (RFO) requests per second",
+                   l2_rfo_rrate, "requests/s"),
+            Metric("lpm_l2_rfo_misses", "L2 cache request for ownership (RFO) misses per second",
+                   l2_rfo_mrate, "misses/s"),
+        ]) if DC_RFOH else None,
+        MetricGroup("lpm_l2_code", [
+            Metric("lpm_l2_code_hits", "L2 cache code hits",
+                   d_ratio(DC_CH, l2_code_all), "100%") if DC_CH else None,
+            Metric("lpm_l2_code_misses", "L2 cache code misses",
+                   d_ratio(DC_CM, l2_code_all), "100%") if DC_CH else None,
+            Metric("lpm_l2_code_requests", "L2 cache code requests per second",
+                   l2_code_rate, "requests/s") if DC_CH else None,
+            Metric("lpm_l2_code_misses", "L2 cache code misses per second",
+                   l2_code_miss_rate, "misses/s"),
+        ]),
+        MetricGroup("lpm_l2_evict", [
+            MetricGroup("lpm_l2_evict_mef_lines", [
+                Metric("lpm_l2_evict_mef_lines_l3_hot_lru", "L2 evictions M/E/F lines L3 hot LRU per second",
+                       wbu, "HotLRU/s") if wbu else None,
+                Metric("lpm_l2_evict_mef_lines_l3_norm_lru", "L2 evictions M/E/F lines L3 normal LRU per second",
+                       wbn, "NormLRU/s") if wbn else None,
+                Metric("lpm_l2_evict_mef_lines_dropped", "L2 evictions M/E/F lines dropped per second",
+                       wbd, "dropped/s") if wbd else None,
+                Metric("lpm_l2_evict_is_lines_dropped", "L2 evictions I/S lines dropped per second",
+                       isd, "dropped/s") if isd else None,
+            ]),
+        ]),
+    ], description="L2 data cache analysis")
+
+
 def IntelPorts() -> Optional[MetricGroup]:
     pipeline_events = json.load(
         open(f"{_args.events_path}/x86/{_args.model}/pipeline.json"))
@@ -386,6 +555,7 @@ def main() -> None:
         Smi(),
         Tsx(),
         IntelBr(),
+        IntelL2(),
         IntelPorts(),
         IntelSwpf(),
     ])
-- 
cgit v1.2.3


From d80edef23124baffa2cfd61a009933d03d982741 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 27 Jan 2026 10:44:53 -0800
Subject: perf jevents: Add load store breakdown metrics ldst for Intel

Give breakdown of number of instructions. Use the counter mask (cmask)
to show the number of cycles taken to retire the instructions.

Signed-off-by: Ian Rogers <irogers@google.com>
Tested-by: Thomas Falcon <thomas.falcon@intel.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Benjamin Gray <bgray@linux.ibm.com>
Cc: Caleb Biggers <caleb.biggers@intel.com>
Cc: Edward Baker <edward.baker@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jing Zhang <renyu.zj@linux.alibaba.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Leo Yan <leo.yan@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Perry Taylor <perry.taylor@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sandipan Das <sandipan.das@amd.com>
Cc: Weilin Wang <weilin.wang@intel.com>
Cc: Xu Yang <xu.yang_2@nxp.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/pmu-events/intel_metrics.py | 87 +++++++++++++++++++++++++++++++++-
 1 file changed, 86 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/perf/pmu-events/intel_metrics.py b/tools/perf/pmu-events/intel_metrics.py
index d190d97f4aff..19a284b4c520 100755
--- a/tools/perf/pmu-events/intel_metrics.py
+++ b/tools/perf/pmu-events/intel_metrics.py
@@ -8,7 +8,7 @@ import re
 from typing import Optional
 from metric import (d_ratio, has_event, max, CheckPmu, Event, JsonEncodeMetric,
                     JsonEncodeMetricGroupDescriptions, Literal, LoadEvents,
-                    Metric, MetricGroup, MetricRef, Select)
+                    Metric, MetricConstraint, MetricGroup, MetricRef, Select)
 
 # Global command line arguments.
 _args = None
@@ -525,6 +525,90 @@ def IntelSwpf() -> Optional[MetricGroup]:
     ], description="Software prefetch instruction breakdown")
 
 
+def IntelLdSt() -> Optional[MetricGroup]:
+    if _args.model in [
+        "bonnell",
+        "nehalemep",
+        "nehalemex",
+        "westmereep-dp",
+        "westmereep-sp",
+        "westmereex",
+    ]:
+        return None
+    LDST_LD = Event("MEM_INST_RETIRED.ALL_LOADS", "MEM_UOPS_RETIRED.ALL_LOADS")
+    LDST_ST = Event("MEM_INST_RETIRED.ALL_STORES",
+                    "MEM_UOPS_RETIRED.ALL_STORES")
+    LDST_LDC1 = Event(f"{LDST_LD.name}/cmask=1/")
+    LDST_STC1 = Event(f"{LDST_ST.name}/cmask=1/")
+    LDST_LDC2 = Event(f"{LDST_LD.name}/cmask=2/")
+    LDST_STC2 = Event(f"{LDST_ST.name}/cmask=2/")
+    LDST_LDC3 = Event(f"{LDST_LD.name}/cmask=3/")
+    LDST_STC3 = Event(f"{LDST_ST.name}/cmask=3/")
+    ins = Event("instructions")
+    LDST_CYC = Event("CPU_CLK_UNHALTED.THREAD",
+                     "CPU_CLK_UNHALTED.CORE_P",
+                     "CPU_CLK_UNHALTED.THREAD_P")
+    LDST_PRE = None
+    try:
+        LDST_PRE = Event("LOAD_HIT_PREFETCH.SWPF", "LOAD_HIT_PRE.SW_PF")
+    except:
+        pass
+    LDST_AT = None
+    try:
+        LDST_AT = Event("MEM_INST_RETIRED.LOCK_LOADS")
+    except:
+        pass
+    cyc = LDST_CYC
+
+    ld_rate = d_ratio(LDST_LD, interval_sec)
+    st_rate = d_ratio(LDST_ST, interval_sec)
+    pf_rate = d_ratio(LDST_PRE, interval_sec) if LDST_PRE else None
+    at_rate = d_ratio(LDST_AT, interval_sec) if LDST_AT else None
+
+    ldst_ret_constraint = MetricConstraint.GROUPED_EVENTS
+    if LDST_LD.name == "MEM_UOPS_RETIRED.ALL_LOADS":
+        ldst_ret_constraint = MetricConstraint.NO_GROUP_EVENTS_NMI
+
+    return MetricGroup("lpm_ldst", [
+        MetricGroup("lpm_ldst_total", [
+            Metric("lpm_ldst_total_loads", "Load/store instructions total loads",
+                   ld_rate, "loads"),
+            Metric("lpm_ldst_total_stores", "Load/store instructions total stores",
+                   st_rate, "stores"),
+        ]),
+        MetricGroup("lpm_ldst_prcnt", [
+            Metric("lpm_ldst_prcnt_loads", "Percent of all instructions that are loads",
+                   d_ratio(LDST_LD, ins), "100%"),
+            Metric("lpm_ldst_prcnt_stores", "Percent of all instructions that are stores",
+                   d_ratio(LDST_ST, ins), "100%"),
+        ]),
+        MetricGroup("lpm_ldst_ret_lds", [
+            Metric("lpm_ldst_ret_lds_1", "Retired loads in 1 cycle",
+                   d_ratio(max(LDST_LDC1 - LDST_LDC2, 0), cyc), "100%",
+                   constraint=ldst_ret_constraint),
+            Metric("lpm_ldst_ret_lds_2", "Retired loads in 2 cycles",
+                   d_ratio(max(LDST_LDC2 - LDST_LDC3, 0), cyc), "100%",
+                   constraint=ldst_ret_constraint),
+            Metric("lpm_ldst_ret_lds_3", "Retired loads in 3 or more cycles",
+                   d_ratio(LDST_LDC3, cyc), "100%"),
+        ]),
+        MetricGroup("lpm_ldst_ret_sts", [
+            Metric("lpm_ldst_ret_sts_1", "Retired stores in 1 cycle",
+                   d_ratio(max(LDST_STC1 - LDST_STC2, 0), cyc), "100%",
+                   constraint=ldst_ret_constraint),
+            Metric("lpm_ldst_ret_sts_2", "Retired stores in 2 cycles",
+                   d_ratio(max(LDST_STC2 - LDST_STC3, 0), cyc), "100%",
+                   constraint=ldst_ret_constraint),
+            Metric("lpm_ldst_ret_sts_3", "Retired stores in 3 more cycles",
+                   d_ratio(LDST_STC3, cyc), "100%"),
+        ]),
+        Metric("lpm_ldst_ld_hit_swpf", "Load hit software prefetches per second",
+               pf_rate, "swpf/s") if pf_rate else None,
+        Metric("lpm_ldst_atomic_lds", "Atomic loads per second",
+               at_rate, "loads/s") if at_rate else None,
+    ], description="Breakdown of load/store instructions")
+
+
 def main() -> None:
     global _args
 
@@ -556,6 +640,7 @@ def main() -> None:
         Tsx(),
         IntelBr(),
         IntelL2(),
+        IntelLdSt(),
         IntelPorts(),
         IntelSwpf(),
     ])
-- 
cgit v1.2.3


From 59341f4e171170d3e2be50047508569a57eee829 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 27 Jan 2026 10:44:54 -0800
Subject: perf jevents: Add ILP metrics for Intel

Use the counter mask (cmask) to see how many cycles an instruction
takes to retire. Present as a set of ILP metrics.

Signed-off-by: Ian Rogers <irogers@google.com>
Tested-by: Thomas Falcon <thomas.falcon@intel.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Benjamin Gray <bgray@linux.ibm.com>
Cc: Caleb Biggers <caleb.biggers@intel.com>
Cc: Edward Baker <edward.baker@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jing Zhang <renyu.zj@linux.alibaba.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Leo Yan <leo.yan@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Perry Taylor <perry.taylor@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sandipan Das <sandipan.das@amd.com>
Cc: Weilin Wang <weilin.wang@intel.com>
Cc: Xu Yang <xu.yang_2@nxp.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/pmu-events/intel_metrics.py | 40 ++++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)

(limited to 'tools')

diff --git a/tools/perf/pmu-events/intel_metrics.py b/tools/perf/pmu-events/intel_metrics.py
index 19a284b4c520..bc3c50285916 100755
--- a/tools/perf/pmu-events/intel_metrics.py
+++ b/tools/perf/pmu-events/intel_metrics.py
@@ -263,6 +263,45 @@ def IntelBr():
                        description="breakdown of retired branch instructions")
 
 
+def IntelIlp() -> MetricGroup:
+    tsc = Event("msr/tsc/")
+    c0 = Event("msr/mperf/")
+    low = tsc - c0
+    inst_ret = Event("INST_RETIRED.ANY_P")
+    inst_ret_c = [Event(f"{inst_ret.name}/cmask={x}/") for x in range(1, 6)]
+    core_cycles = Event("CPU_CLK_UNHALTED.THREAD_P_ANY",
+                        "CPU_CLK_UNHALTED.DISTRIBUTED",
+                        "cycles")
+    ilp = [d_ratio(max(inst_ret_c[x] - inst_ret_c[x + 1], 0), core_cycles)
+           for x in range(0, 4)]
+    ilp.append(d_ratio(inst_ret_c[4], core_cycles))
+    ilp0 = 1
+    for x in ilp:
+        ilp0 -= x
+    return MetricGroup("lpm_ilp", [
+        Metric("lpm_ilp_idle", "Lower power cycles as a percentage of all cycles",
+               d_ratio(low, tsc), "100%"),
+        Metric("lpm_ilp_inst_ret_0",
+               "Instructions retired in 0 cycles as a percentage of all cycles",
+               ilp0, "100%"),
+        Metric("lpm_ilp_inst_ret_1",
+               "Instructions retired in 1 cycles as a percentage of all cycles",
+               ilp[0], "100%"),
+        Metric("lpm_ilp_inst_ret_2",
+               "Instructions retired in 2 cycles as a percentage of all cycles",
+               ilp[1], "100%"),
+        Metric("lpm_ilp_inst_ret_3",
+               "Instructions retired in 3 cycles as a percentage of all cycles",
+               ilp[2], "100%"),
+        Metric("lpm_ilp_inst_ret_4",
+               "Instructions retired in 4 cycles as a percentage of all cycles",
+               ilp[3], "100%"),
+        Metric("lpm_ilp_inst_ret_5",
+               "Instructions retired in 5 or more cycles as a percentage of all cycles",
+               ilp[4], "100%"),
+    ])
+
+
 def IntelL2() -> Optional[MetricGroup]:
     try:
         DC_HIT = Event("L2_RQSTS.DEMAND_DATA_RD_HIT")
@@ -639,6 +678,7 @@ def main() -> None:
         Smi(),
         Tsx(),
         IntelBr(),
+        IntelIlp(),
         IntelL2(),
         IntelLdSt(),
         IntelPorts(),
-- 
cgit v1.2.3


From 2f3d6ea05deca7c1c653e2e53d73ac5a81378d53 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 27 Jan 2026 10:44:55 -0800
Subject: perf jevents: Add context switch metrics for Intel

Metrics break down context switches for different kinds of
instruction.

Signed-off-by: Ian Rogers <irogers@google.com>
Tested-by: Thomas Falcon <thomas.falcon@intel.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Benjamin Gray <bgray@linux.ibm.com>
Cc: Caleb Biggers <caleb.biggers@intel.com>
Cc: Edward Baker <edward.baker@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jing Zhang <renyu.zj@linux.alibaba.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Leo Yan <leo.yan@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Perry Taylor <perry.taylor@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sandipan Das <sandipan.das@amd.com>
Cc: Weilin Wang <weilin.wang@intel.com>
Cc: Xu Yang <xu.yang_2@nxp.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/pmu-events/intel_metrics.py | 58 ++++++++++++++++++++++++++++++++++
 1 file changed, 58 insertions(+)

(limited to 'tools')

diff --git a/tools/perf/pmu-events/intel_metrics.py b/tools/perf/pmu-events/intel_metrics.py
index bc3c50285916..9cf4bd8ac769 100755
--- a/tools/perf/pmu-events/intel_metrics.py
+++ b/tools/perf/pmu-events/intel_metrics.py
@@ -263,6 +263,63 @@ def IntelBr():
                        description="breakdown of retired branch instructions")
 
 
+def IntelCtxSw() -> MetricGroup:
+    cs = Event("context\\-switches")
+    metrics = [
+        Metric("lpm_cs_rate", "Context switches per second",
+               d_ratio(cs, interval_sec), "ctxsw/s")
+    ]
+
+    ev = Event("instructions")
+    metrics.append(Metric("lpm_cs_instr", "Instructions per context switch",
+                          d_ratio(ev, cs), "instr/cs"))
+
+    ev = Event("cycles")
+    metrics.append(Metric("lpm_cs_cycles", "Cycles per context switch",
+                          d_ratio(ev, cs), "cycles/cs"))
+
+    try:
+        ev = Event("MEM_INST_RETIRED.ALL_LOADS", "MEM_UOPS_RETIRED.ALL_LOADS")
+        metrics.append(Metric("lpm_cs_loads", "Loads per context switch",
+                              d_ratio(ev, cs), "loads/cs"))
+    except:
+        pass
+
+    try:
+        ev = Event("MEM_INST_RETIRED.ALL_STORES",
+                   "MEM_UOPS_RETIRED.ALL_STORES")
+        metrics.append(Metric("lpm_cs_stores", "Stores per context switch",
+                              d_ratio(ev, cs), "stores/cs"))
+    except:
+        pass
+
+    try:
+        ev = Event("BR_INST_RETIRED.NEAR_TAKEN", "BR_INST_RETIRED.TAKEN_JCC")
+        metrics.append(Metric("lpm_cs_br_taken", "Branches taken per context switch",
+                              d_ratio(ev, cs), "br_taken/cs"))
+    except:
+        pass
+
+    try:
+        l2_misses = (Event("L2_RQSTS.DEMAND_DATA_RD_MISS") +
+                     Event("L2_RQSTS.RFO_MISS") +
+                     Event("L2_RQSTS.CODE_RD_MISS"))
+        try:
+            l2_misses += Event("L2_RQSTS.HWPF_MISS",
+                               "L2_RQSTS.L2_PF_MISS", "L2_RQSTS.PF_MISS")
+        except:
+            pass
+
+        metrics.append(Metric("lpm_cs_l2_misses", "L2 misses per context switch",
+                              d_ratio(l2_misses, cs), "l2_misses/cs"))
+    except:
+        pass
+
+    return MetricGroup("lpm_cs", metrics,
+                       description=("Number of context switches per second, instructions "
+                                    "retired & core cycles between context switches"))
+
+
 def IntelIlp() -> MetricGroup:
     tsc = Event("msr/tsc/")
     c0 = Event("msr/mperf/")
@@ -678,6 +735,7 @@ def main() -> None:
         Smi(),
         Tsx(),
         IntelBr(),
+        IntelCtxSw(),
         IntelIlp(),
         IntelL2(),
         IntelLdSt(),
-- 
cgit v1.2.3


From d666f0172ab306fa7b7af38499d51f4941460688 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 27 Jan 2026 10:44:56 -0800
Subject: perf jevents: Add FPU metrics for Intel

Metrics break down of floating point operations.

Signed-off-by: Ian Rogers <irogers@google.com>
Tested-by: Thomas Falcon <thomas.falcon@intel.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Benjamin Gray <bgray@linux.ibm.com>
Cc: Caleb Biggers <caleb.biggers@intel.com>
Cc: Edward Baker <edward.baker@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jing Zhang <renyu.zj@linux.alibaba.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Leo Yan <leo.yan@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Perry Taylor <perry.taylor@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sandipan Das <sandipan.das@amd.com>
Cc: Weilin Wang <weilin.wang@intel.com>
Cc: Xu Yang <xu.yang_2@nxp.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/pmu-events/intel_metrics.py | 97 ++++++++++++++++++++++++++++++++++
 1 file changed, 97 insertions(+)

(limited to 'tools')

diff --git a/tools/perf/pmu-events/intel_metrics.py b/tools/perf/pmu-events/intel_metrics.py
index 9cf4bd8ac769..77b8e10194db 100755
--- a/tools/perf/pmu-events/intel_metrics.py
+++ b/tools/perf/pmu-events/intel_metrics.py
@@ -320,6 +320,102 @@ def IntelCtxSw() -> MetricGroup:
                                     "retired & core cycles between context switches"))
 
 
+def IntelFpu() -> Optional[MetricGroup]:
+    cyc = Event("cycles")
+    try:
+        s_64 = Event("FP_ARITH_INST_RETIRED.SCALAR_SINGLE",
+                     "SIMD_INST_RETIRED.SCALAR_SINGLE")
+    except:
+        return None
+    d_64 = Event("FP_ARITH_INST_RETIRED.SCALAR_DOUBLE",
+                 "SIMD_INST_RETIRED.SCALAR_DOUBLE")
+    s_128 = Event("FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE",
+                  "SIMD_INST_RETIRED.PACKED_SINGLE")
+
+    flop = s_64 + d_64 + 4 * s_128
+
+    d_128 = None
+    s_256 = None
+    d_256 = None
+    s_512 = None
+    d_512 = None
+    try:
+        d_128 = Event("FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE")
+        flop += 2 * d_128
+        s_256 = Event("FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE")
+        flop += 8 * s_256
+        d_256 = Event("FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE")
+        flop += 4 * d_256
+        s_512 = Event("FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE")
+        flop += 16 * s_512
+        d_512 = Event("FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE")
+        flop += 8 * d_512
+    except:
+        pass
+
+    f_assist = Event("ASSISTS.FP", "FP_ASSIST.ANY", "FP_ASSIST.S")
+    if f_assist in [
+        "ASSISTS.FP",
+        "FP_ASSIST.S",
+    ]:
+        f_assist += "/cmask=1/"
+
+    flop_r = d_ratio(flop, interval_sec)
+    flop_c = d_ratio(flop, cyc)
+    nmi_constraint = MetricConstraint.GROUPED_EVENTS
+    if f_assist.name == "ASSISTS.FP":  # Icelake+
+        nmi_constraint = MetricConstraint.NO_GROUP_EVENTS_NMI
+
+    def FpuMetrics(group: str, fl: Optional[Event], mult: int, desc: str) -> Optional[MetricGroup]:
+        if not fl:
+            return None
+
+        f = fl * mult
+        fl_r = d_ratio(f, interval_sec)
+        r_s = d_ratio(fl, interval_sec)
+        return MetricGroup(group, [
+            Metric(f"{group}_of_total", desc + " floating point operations per second",
+                   d_ratio(f, flop), "100%"),
+            Metric(f"{group}_flops", desc + " floating point operations per second",
+                   fl_r, "flops/s"),
+            Metric(f"{group}_ops", desc + " operations per second",
+                   r_s, "ops/s"),
+        ])
+
+    return MetricGroup("lpm_fpu", [
+        MetricGroup("lpm_fpu_total", [
+            Metric("lpm_fpu_total_flops", "Floating point operations per second",
+                   flop_r, "flops/s"),
+            Metric("lpm_fpu_total_flopc", "Floating point operations per cycle",
+                   flop_c, "flops/cycle", constraint=nmi_constraint),
+        ]),
+        MetricGroup("lpm_fpu_64", [
+            FpuMetrics("lpm_fpu_64_single", s_64, 1, "64-bit single"),
+            FpuMetrics("lpm_fpu_64_double", d_64, 1, "64-bit double"),
+        ]),
+        MetricGroup("lpm_fpu_128", [
+            FpuMetrics("lpm_fpu_128_single", s_128,
+                       4, "128-bit packed single"),
+            FpuMetrics("lpm_fpu_128_double", d_128,
+                       2, "128-bit packed double"),
+        ]),
+        MetricGroup("lpm_fpu_256", [
+            FpuMetrics("lpm_fpu_256_single", s_256,
+                       8, "128-bit packed single"),
+            FpuMetrics("lpm_fpu_256_double", d_256,
+                       4, "128-bit packed double"),
+        ]),
+        MetricGroup("lpm_fpu_512", [
+            FpuMetrics("lpm_fpu_512_single", s_512,
+                       16, "128-bit packed single"),
+            FpuMetrics("lpm_fpu_512_double", d_512,
+                       8, "128-bit packed double"),
+        ]),
+        Metric("lpm_fpu_assists", "FP assists as a percentage of cycles",
+               d_ratio(f_assist, cyc), "100%"),
+    ])
+
+
 def IntelIlp() -> MetricGroup:
     tsc = Event("msr/tsc/")
     c0 = Event("msr/mperf/")
@@ -736,6 +832,7 @@ def main() -> None:
         Tsx(),
         IntelBr(),
         IntelCtxSw(),
+        IntelFpu(),
         IntelIlp(),
         IntelL2(),
         IntelLdSt(),
-- 
cgit v1.2.3


From 426b8442898de9afe256e3c415d272808d6b69fb Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 27 Jan 2026 10:44:57 -0800
Subject: perf jevents: Add Miss Level Parallelism (MLP) metric for Intel

Number of outstanding load misses per cycle.

Signed-off-by: Ian Rogers <irogers@google.com>
Tested-by: Thomas Falcon <thomas.falcon@intel.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Benjamin Gray <bgray@linux.ibm.com>
Cc: Caleb Biggers <caleb.biggers@intel.com>
Cc: Edward Baker <edward.baker@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jing Zhang <renyu.zj@linux.alibaba.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Leo Yan <leo.yan@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Perry Taylor <perry.taylor@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sandipan Das <sandipan.das@amd.com>
Cc: Weilin Wang <weilin.wang@intel.com>
Cc: Xu Yang <xu.yang_2@nxp.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/pmu-events/intel_metrics.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'tools')

diff --git a/tools/perf/pmu-events/intel_metrics.py b/tools/perf/pmu-events/intel_metrics.py
index 77b8e10194db..dddeae35e4b4 100755
--- a/tools/perf/pmu-events/intel_metrics.py
+++ b/tools/perf/pmu-events/intel_metrics.py
@@ -624,6 +624,20 @@ def IntelL2() -> Optional[MetricGroup]:
     ], description="L2 data cache analysis")
 
 
+def IntelMlp() -> Optional[Metric]:
+    try:
+        l1d = Event("L1D_PEND_MISS.PENDING")
+        l1dc = Event("L1D_PEND_MISS.PENDING_CYCLES")
+    except:
+        return None
+
+    l1dc = Select(l1dc / 2, Literal("#smt_on"), l1dc)
+    ml = d_ratio(l1d, l1dc)
+    return Metric("lpm_mlp",
+                  "Miss level parallelism - number of outstanding load misses per cycle (higher is better)",
+                  ml, "load_miss_pending/cycle")
+
+
 def IntelPorts() -> Optional[MetricGroup]:
     pipeline_events = json.load(
         open(f"{_args.events_path}/x86/{_args.model}/pipeline.json"))
@@ -836,6 +850,7 @@ def main() -> None:
         IntelIlp(),
         IntelL2(),
         IntelLdSt(),
+        IntelMlp(),
         IntelPorts(),
         IntelSwpf(),
     ])
-- 
cgit v1.2.3


From 130f4245af99e140cc5dc93bdf8e59ffd9cc9d6f Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 27 Jan 2026 10:44:58 -0800
Subject: perf jevents: Add mem_bw metric for Intel

Break down memory bandwidth using uncore counters. For many models
this matches the memory_bandwidth_* metrics, but these metrics aren't
made available on all models.

Add support for free running counters.  Query the event JSON when
determining which what events/counters are available.

Signed-off-by: Ian Rogers <irogers@google.com>
Tested-by: Thomas Falcon <thomas.falcon@intel.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Benjamin Gray <bgray@linux.ibm.com>
Cc: Caleb Biggers <caleb.biggers@intel.com>
Cc: Edward Baker <edward.baker@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jing Zhang <renyu.zj@linux.alibaba.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Leo Yan <leo.yan@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Perry Taylor <perry.taylor@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sandipan Das <sandipan.das@amd.com>
Cc: Weilin Wang <weilin.wang@intel.com>
Cc: Xu Yang <xu.yang_2@nxp.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/pmu-events/intel_metrics.py | 62 ++++++++++++++++++++++++++++++++++
 1 file changed, 62 insertions(+)

(limited to 'tools')

diff --git a/tools/perf/pmu-events/intel_metrics.py b/tools/perf/pmu-events/intel_metrics.py
index dddeae35e4b4..f671d6e4fd67 100755
--- a/tools/perf/pmu-events/intel_metrics.py
+++ b/tools/perf/pmu-events/intel_metrics.py
@@ -815,6 +815,67 @@ def IntelLdSt() -> Optional[MetricGroup]:
     ], description="Breakdown of load/store instructions")
 
 
+def UncoreMemBw() -> Optional[MetricGroup]:
+    mem_events = []
+    try:
+        mem_events = json.load(open(f"{os.path.dirname(os.path.realpath(__file__))}"
+                                    f"/arch/x86/{args.model}/uncore-memory.json"))
+    except:
+        pass
+
+    ddr_rds = 0
+    ddr_wrs = 0
+    ddr_total = 0
+    for x in mem_events:
+        if "EventName" in x:
+            name = x["EventName"]
+            if re.search("^UNC_MC[0-9]+_RDCAS_COUNT_FREERUN", name):
+                ddr_rds += Event(name)
+            elif re.search("^UNC_MC[0-9]+_WRCAS_COUNT_FREERUN", name):
+                ddr_wrs += Event(name)
+            # elif re.search("^UNC_MC[0-9]+_TOTAL_REQCOUNT_FREERUN", name):
+            #  ddr_total += Event(name)
+
+    if ddr_rds == 0:
+        try:
+            ddr_rds = Event("UNC_M_CAS_COUNT.RD")
+            ddr_wrs = Event("UNC_M_CAS_COUNT.WR")
+        except:
+            return None
+
+    ddr_total = ddr_rds + ddr_wrs
+
+    pmm_rds = 0
+    pmm_wrs = 0
+    try:
+        pmm_rds = Event("UNC_M_PMM_RPQ_INSERTS")
+        pmm_wrs = Event("UNC_M_PMM_WPQ_INSERTS")
+    except:
+        pass
+
+    pmm_total = pmm_rds + pmm_wrs
+
+    scale = 64 / 1_000_000
+    return MetricGroup("lpm_mem_bw", [
+        MetricGroup("lpm_mem_bw_ddr", [
+            Metric("lpm_mem_bw_ddr_read", "DDR memory read bandwidth",
+                   d_ratio(ddr_rds, interval_sec), f"{scale}MB/s"),
+            Metric("lpm_mem_bw_ddr_write", "DDR memory write bandwidth",
+                   d_ratio(ddr_wrs, interval_sec), f"{scale}MB/s"),
+            Metric("lpm_mem_bw_ddr_total", "DDR memory write bandwidth",
+                   d_ratio(ddr_total, interval_sec), f"{scale}MB/s"),
+        ], description="DDR Memory Bandwidth"),
+        MetricGroup("lpm_mem_bw_pmm", [
+            Metric("lpm_mem_bw_pmm_read", "PMM memory read bandwidth",
+                   d_ratio(pmm_rds, interval_sec), f"{scale}MB/s"),
+            Metric("lpm_mem_bw_pmm_write", "PMM memory write bandwidth",
+                   d_ratio(pmm_wrs, interval_sec), f"{scale}MB/s"),
+            Metric("lpm_mem_bw_pmm_total", "PMM memory write bandwidth",
+                   d_ratio(pmm_total, interval_sec), f"{scale}MB/s"),
+        ], description="PMM Memory Bandwidth") if pmm_rds != 0 else None,
+    ], description="Memory Bandwidth")
+
+
 def main() -> None:
     global _args
 
@@ -853,6 +914,7 @@ def main() -> None:
         IntelMlp(),
         IntelPorts(),
         IntelSwpf(),
+        UncoreMemBw(),
     ])
 
     if _args.metricgroups:
-- 
cgit v1.2.3


From cde9c1a5d92520a874b333591ac29aef37b0c0cb Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 27 Jan 2026 10:44:59 -0800
Subject: perf jevents: Add local/remote "mem" breakdown metrics for Intel

Breakdown local and remote memory bandwidth, read and writes.

The implementation uses the HA and CHA PMUs present in server models
broadwellde, broadwellx cascadelakex, emeraldrapids, haswellx, icelakex,
ivytown, sapphirerapids and skylakex.

Signed-off-by: Ian Rogers <irogers@google.com>
Tested-by: Thomas Falcon <thomas.falcon@intel.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Benjamin Gray <bgray@linux.ibm.com>
Cc: Caleb Biggers <caleb.biggers@intel.com>
Cc: Edward Baker <edward.baker@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jing Zhang <renyu.zj@linux.alibaba.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Leo Yan <leo.yan@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Perry Taylor <perry.taylor@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sandipan Das <sandipan.das@amd.com>
Cc: Weilin Wang <weilin.wang@intel.com>
Cc: Xu Yang <xu.yang_2@nxp.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/pmu-events/intel_metrics.py | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

(limited to 'tools')

diff --git a/tools/perf/pmu-events/intel_metrics.py b/tools/perf/pmu-events/intel_metrics.py
index f671d6e4fd67..983e5021f3d3 100755
--- a/tools/perf/pmu-events/intel_metrics.py
+++ b/tools/perf/pmu-events/intel_metrics.py
@@ -815,6 +815,36 @@ def IntelLdSt() -> Optional[MetricGroup]:
     ], description="Breakdown of load/store instructions")
 
 
+def UncoreMem() -> Optional[MetricGroup]:
+    try:
+        loc_rds = Event("UNC_CHA_REQUESTS.READS_LOCAL",
+                        "UNC_H_REQUESTS.READS_LOCAL")
+        rem_rds = Event("UNC_CHA_REQUESTS.READS_REMOTE",
+                        "UNC_H_REQUESTS.READS_REMOTE")
+        loc_wrs = Event("UNC_CHA_REQUESTS.WRITES_LOCAL",
+                        "UNC_H_REQUESTS.WRITES_LOCAL")
+        rem_wrs = Event("UNC_CHA_REQUESTS.WRITES_REMOTE",
+                        "UNC_H_REQUESTS.WRITES_REMOTE")
+    except:
+        return None
+
+    scale = 64 / 1_000_000
+    return MetricGroup("lpm_mem", [
+        MetricGroup("lpm_mem_local", [
+            Metric("lpm_mem_local_read", "Local memory read bandwidth not including directory updates",
+                   d_ratio(loc_rds, interval_sec), f"{scale}MB/s"),
+            Metric("lpm_mem_local_write", "Local memory write bandwidth not including directory updates",
+                   d_ratio(loc_wrs, interval_sec), f"{scale}MB/s"),
+        ]),
+        MetricGroup("lpm_mem_remote", [
+            Metric("lpm_mem_remote_read", "Remote memory read bandwidth not including directory updates",
+                   d_ratio(rem_rds, interval_sec), f"{scale}MB/s"),
+            Metric("lpm_mem_remote_write", "Remote memory write bandwidth not including directory updates",
+                   d_ratio(rem_wrs, interval_sec), f"{scale}MB/s"),
+        ]),
+    ], description="Memory Bandwidth breakdown local vs. remote (remote requests in). directory updates not included")
+
+
 def UncoreMemBw() -> Optional[MetricGroup]:
     mem_events = []
     try:
@@ -914,6 +944,7 @@ def main() -> None:
         IntelMlp(),
         IntelPorts(),
         IntelSwpf(),
+        UncoreMem(),
         UncoreMemBw(),
     ])
 
-- 
cgit v1.2.3


From 2166b44be938420934181be34fe20deee6e46441 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 27 Jan 2026 10:45:00 -0800
Subject: perf jevents: Add dir breakdown metrics for Intel

Breakdown directory hit, misses and requests. The implementation uses
the M2M and CHA PMUs present in server models broadwellde, broadwellx
cascadelakex, emeraldrapids, icelakex, sapphirerapids and skylakex.

Signed-off-by: Ian Rogers <irogers@google.com>
Tested-by: Thomas Falcon <thomas.falcon@intel.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Benjamin Gray <bgray@linux.ibm.com>
Cc: Caleb Biggers <caleb.biggers@intel.com>
Cc: Edward Baker <edward.baker@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jing Zhang <renyu.zj@linux.alibaba.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Leo Yan <leo.yan@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Perry Taylor <perry.taylor@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sandipan Das <sandipan.das@amd.com>
Cc: Weilin Wang <weilin.wang@intel.com>
Cc: Xu Yang <xu.yang_2@nxp.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/pmu-events/intel_metrics.py | 36 ++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)

(limited to 'tools')

diff --git a/tools/perf/pmu-events/intel_metrics.py b/tools/perf/pmu-events/intel_metrics.py
index 983e5021f3d3..24ceb7f8719b 100755
--- a/tools/perf/pmu-events/intel_metrics.py
+++ b/tools/perf/pmu-events/intel_metrics.py
@@ -815,6 +815,41 @@ def IntelLdSt() -> Optional[MetricGroup]:
     ], description="Breakdown of load/store instructions")
 
 
+def UncoreDir() -> Optional[MetricGroup]:
+    try:
+        m2m_upd = Event("UNC_M2M_DIRECTORY_UPDATE.ANY")
+        m2m_hits = Event("UNC_M2M_DIRECTORY_HIT.DIRTY_I")
+        # Turn the umask into a ANY rather than DIRTY_I filter.
+        m2m_hits.name += "/umask=0xFF,name=UNC_M2M_DIRECTORY_HIT.ANY/"
+        m2m_miss = Event("UNC_M2M_DIRECTORY_MISS.DIRTY_I")
+        # Turn the umask into a ANY rather than DIRTY_I filter.
+        m2m_miss.name += "/umask=0xFF,name=UNC_M2M_DIRECTORY_MISS.ANY/"
+        cha_upd = Event("UNC_CHA_DIR_UPDATE.HA")
+        # Turn the umask into a ANY rather than HA filter.
+        cha_upd.name += "/umask=3,name=UNC_CHA_DIR_UPDATE.ANY/"
+    except:
+        return None
+
+    m2m_total = m2m_hits + m2m_miss
+    upd = m2m_upd + cha_upd  # in cache lines
+    upd_r = upd / interval_sec
+    look_r = m2m_total / interval_sec
+
+    scale = 64 / 1_000_000  # Cache lines to MB
+    return MetricGroup("lpm_dir", [
+        Metric("lpm_dir_lookup_rate", "",
+               d_ratio(m2m_total, interval_sec), "requests/s"),
+        Metric("lpm_dir_lookup_hits", "",
+               d_ratio(m2m_hits, m2m_total), "100%"),
+        Metric("lpm_dir_lookup_misses", "",
+               d_ratio(m2m_miss, m2m_total), "100%"),
+        Metric("lpm_dir_update_requests", "",
+               d_ratio(m2m_upd + cha_upd, interval_sec), "requests/s"),
+        Metric("lpm_dir_update_bw", "",
+               d_ratio(m2m_upd + cha_upd, interval_sec), f"{scale}MB/s"),
+    ])
+
+
 def UncoreMem() -> Optional[MetricGroup]:
     try:
         loc_rds = Event("UNC_CHA_REQUESTS.READS_LOCAL",
@@ -944,6 +979,7 @@ def main() -> None:
         IntelMlp(),
         IntelPorts(),
         IntelSwpf(),
+        UncoreDir(),
         UncoreMem(),
         UncoreMemBw(),
     ])
-- 
cgit v1.2.3


From 1fee2701a7d35fa7285479c5c2ee6c2d9bd99526 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 27 Jan 2026 10:45:01 -0800
Subject: perf jevents: Add C-State metrics from the PCU PMU for Intel

Use occupancy events fixed in:

  https://lore.kernel.org/lkml/20240226201517.3540187-1-irogers@google.com/

Metrics are at the socket level referring to cores, not hyperthreads.

Signed-off-by: Ian Rogers <irogers@google.com>
Tested-by: Thomas Falcon <thomas.falcon@intel.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Benjamin Gray <bgray@linux.ibm.com>
Cc: Caleb Biggers <caleb.biggers@intel.com>
Cc: Edward Baker <edward.baker@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jing Zhang <renyu.zj@linux.alibaba.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Leo Yan <leo.yan@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Perry Taylor <perry.taylor@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sandipan Das <sandipan.das@amd.com>
Cc: Weilin Wang <weilin.wang@intel.com>
Cc: Xu Yang <xu.yang_2@nxp.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/pmu-events/intel_metrics.py | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

(limited to 'tools')

diff --git a/tools/perf/pmu-events/intel_metrics.py b/tools/perf/pmu-events/intel_metrics.py
index 24ceb7f8719b..118fe0fc05a3 100755
--- a/tools/perf/pmu-events/intel_metrics.py
+++ b/tools/perf/pmu-events/intel_metrics.py
@@ -815,6 +815,35 @@ def IntelLdSt() -> Optional[MetricGroup]:
     ], description="Breakdown of load/store instructions")
 
 
+def UncoreCState() -> Optional[MetricGroup]:
+    try:
+        pcu_ticks = Event("UNC_P_CLOCKTICKS")
+        c0 = Event("UNC_P_POWER_STATE_OCCUPANCY.CORES_C0")
+        c3 = Event("UNC_P_POWER_STATE_OCCUPANCY.CORES_C3")
+        c6 = Event("UNC_P_POWER_STATE_OCCUPANCY.CORES_C6")
+    except:
+        return None
+
+    num_cores = Literal("#num_cores") / Literal("#num_packages")
+
+    max_cycles = pcu_ticks * num_cores
+    total_cycles = c0 + c3 + c6
+
+    # remove fused-off cores which show up in C6/C7.
+    c6 = Select(max(c6 - (total_cycles - max_cycles), 0),
+                total_cycles > max_cycles,
+                c6)
+
+    return MetricGroup("lpm_cstate", [
+        Metric("lpm_cstate_c0", "C-State cores in C0/C1",
+               d_ratio(c0, pcu_ticks), "cores"),
+        Metric("lpm_cstate_c3", "C-State cores in C3",
+               d_ratio(c3, pcu_ticks), "cores"),
+        Metric("lpm_cstate_c6", "C-State cores in C6/C7",
+               d_ratio(c6, pcu_ticks), "cores"),
+    ])
+
+
 def UncoreDir() -> Optional[MetricGroup]:
     try:
         m2m_upd = Event("UNC_M2M_DIRECTORY_UPDATE.ANY")
@@ -979,6 +1008,7 @@ def main() -> None:
         IntelMlp(),
         IntelPorts(),
         IntelSwpf(),
+        UncoreCState(),
         UncoreDir(),
         UncoreMem(),
         UncoreMemBw(),
-- 
cgit v1.2.3


From 6ec3058e709cc63513bafe105bf48b512baabe04 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 27 Jan 2026 10:45:02 -0800
Subject: perf jevents: Add local/remote miss latency metrics for Intel

Derive from CBOX/CHA occupancy and inserts the average latency as is
provided in Intel's uncore performance monitoring reference.

Signed-off-by: Ian Rogers <irogers@google.com>
Tested-by: Thomas Falcon <thomas.falcon@intel.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Benjamin Gray <bgray@linux.ibm.com>
Cc: Caleb Biggers <caleb.biggers@intel.com>
Cc: Edward Baker <edward.baker@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jing Zhang <renyu.zj@linux.alibaba.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Leo Yan <leo.yan@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Perry Taylor <perry.taylor@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sandipan Das <sandipan.das@amd.com>
Cc: Weilin Wang <weilin.wang@intel.com>
Cc: Xu Yang <xu.yang_2@nxp.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/pmu-events/intel_metrics.py | 70 ++++++++++++++++++++++++++++++++--
 1 file changed, 67 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/pmu-events/intel_metrics.py b/tools/perf/pmu-events/intel_metrics.py
index 118fe0fc05a3..037f9b2ea1b6 100755
--- a/tools/perf/pmu-events/intel_metrics.py
+++ b/tools/perf/pmu-events/intel_metrics.py
@@ -6,9 +6,10 @@ import math
 import os
 import re
 from typing import Optional
-from metric import (d_ratio, has_event, max, CheckPmu, Event, JsonEncodeMetric,
-                    JsonEncodeMetricGroupDescriptions, Literal, LoadEvents,
-                    Metric, MetricConstraint, MetricGroup, MetricRef, Select)
+from metric import (d_ratio, has_event, max, source_count, CheckPmu, Event,
+                    JsonEncodeMetric, JsonEncodeMetricGroupDescriptions,
+                    Literal, LoadEvents, Metric, MetricConstraint, MetricGroup,
+                    MetricRef, Select)
 
 # Global command line arguments.
 _args = None
@@ -624,6 +625,68 @@ def IntelL2() -> Optional[MetricGroup]:
     ], description="L2 data cache analysis")
 
 
+def IntelMissLat() -> Optional[MetricGroup]:
+    try:
+        ticks = Event("UNC_CHA_CLOCKTICKS", "UNC_C_CLOCKTICKS")
+        data_rd_loc_occ = Event("UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_LOCAL",
+                                "UNC_CHA_TOR_OCCUPANCY.IA_MISS",
+                                "UNC_C_TOR_OCCUPANCY.MISS_LOCAL_OPCODE",
+                                "UNC_C_TOR_OCCUPANCY.MISS_OPCODE")
+        data_rd_loc_ins = Event("UNC_CHA_TOR_INSERTS.IA_MISS_DRD_LOCAL",
+                                "UNC_CHA_TOR_INSERTS.IA_MISS",
+                                "UNC_C_TOR_INSERTS.MISS_LOCAL_OPCODE",
+                                "UNC_C_TOR_INSERTS.MISS_OPCODE")
+        data_rd_rem_occ = Event("UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_REMOTE",
+                                "UNC_CHA_TOR_OCCUPANCY.IA_MISS",
+                                "UNC_C_TOR_OCCUPANCY.MISS_REMOTE_OPCODE",
+                                "UNC_C_TOR_OCCUPANCY.NID_MISS_OPCODE")
+        data_rd_rem_ins = Event("UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE",
+                                "UNC_CHA_TOR_INSERTS.IA_MISS",
+                                "UNC_C_TOR_INSERTS.MISS_REMOTE_OPCODE",
+                                "UNC_C_TOR_INSERTS.NID_MISS_OPCODE")
+    except:
+        return None
+
+    if (data_rd_loc_occ.name == "UNC_C_TOR_OCCUPANCY.MISS_LOCAL_OPCODE" or
+            data_rd_loc_occ.name == "UNC_C_TOR_OCCUPANCY.MISS_OPCODE"):
+        data_rd = 0x182
+        for e in [data_rd_loc_occ, data_rd_loc_ins, data_rd_rem_occ, data_rd_rem_ins]:
+            e.name += f"/filter_opc={hex(data_rd)}/"
+    elif data_rd_loc_occ.name == "UNC_CHA_TOR_OCCUPANCY.IA_MISS":
+        # Demand Data Read - Full cache-line read requests from core for
+        # lines to be cached in S or E, typically for data
+        demand_data_rd = 0x202
+        #  LLC Prefetch Data - Uncore will first look up the line in the
+        #  LLC; for a cache hit, the LRU will be updated, on a miss, the
+        #  DRd will be initiated
+        llc_prefetch_data = 0x25a
+        local_filter = (f"/filter_opc0={hex(demand_data_rd)},"
+                        f"filter_opc1={hex(llc_prefetch_data)},"
+                        "filter_loc,filter_nm,filter_not_nm/")
+        remote_filter = (f"/filter_opc0={hex(demand_data_rd)},"
+                         f"filter_opc1={hex(llc_prefetch_data)},"
+                         "filter_rem,filter_nm,filter_not_nm/")
+        for e in [data_rd_loc_occ, data_rd_loc_ins]:
+            e.name += local_filter
+        for e in [data_rd_rem_occ, data_rd_rem_ins]:
+            e.name += remote_filter
+    else:
+        assert data_rd_loc_occ.name == "UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_LOCAL", data_rd_loc_occ
+
+    ticks_per_cha = ticks / source_count(data_rd_loc_ins)
+    loc_lat = interval_sec * 1e9 * data_rd_loc_occ / \
+        (ticks_per_cha * data_rd_loc_ins)
+    ticks_per_cha = ticks / source_count(data_rd_rem_ins)
+    rem_lat = interval_sec * 1e9 * data_rd_rem_occ / \
+        (ticks_per_cha * data_rd_rem_ins)
+    return MetricGroup("lpm_miss_lat", [
+        Metric("lpm_miss_lat_loc", "Local to a socket miss latency in nanoseconds",
+               loc_lat, "ns"),
+        Metric("lpm_miss_lat_rem", "Remote to a socket miss latency in nanoseconds",
+               rem_lat, "ns"),
+    ])
+
+
 def IntelMlp() -> Optional[Metric]:
     try:
         l1d = Event("L1D_PEND_MISS.PENDING")
@@ -1005,6 +1068,7 @@ def main() -> None:
         IntelIlp(),
         IntelL2(),
         IntelLdSt(),
+        IntelMissLat(),
         IntelMlp(),
         IntelPorts(),
         IntelSwpf(),
-- 
cgit v1.2.3


From 5dc81578ad77c298248a12de8b5e19923ef2c617 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 27 Jan 2026 10:45:03 -0800
Subject: perf jevents: Add upi_bw metric for Intel

Break down UPI read and write bandwidth using uncore_upi counters.

Signed-off-by: Ian Rogers <irogers@google.com>
Tested-by: Thomas Falcon <thomas.falcon@intel.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Benjamin Gray <bgray@linux.ibm.com>
Cc: Caleb Biggers <caleb.biggers@intel.com>
Cc: Edward Baker <edward.baker@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jing Zhang <renyu.zj@linux.alibaba.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Leo Yan <leo.yan@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Perry Taylor <perry.taylor@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sandipan Das <sandipan.das@amd.com>
Cc: Weilin Wang <weilin.wang@intel.com>
Cc: Xu Yang <xu.yang_2@nxp.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/pmu-events/intel_metrics.py | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

(limited to 'tools')

diff --git a/tools/perf/pmu-events/intel_metrics.py b/tools/perf/pmu-events/intel_metrics.py
index 037f9b2ea1b6..f6bb691dc5bb 100755
--- a/tools/perf/pmu-events/intel_metrics.py
+++ b/tools/perf/pmu-events/intel_metrics.py
@@ -1033,6 +1033,27 @@ def UncoreMemBw() -> Optional[MetricGroup]:
     ], description="Memory Bandwidth")
 
 
+def UncoreUpiBw() -> Optional[MetricGroup]:
+    try:
+        upi_rds = Event("UNC_UPI_RxL_FLITS.ALL_DATA")
+        upi_wrs = Event("UNC_UPI_TxL_FLITS.ALL_DATA")
+    except:
+        return None
+
+    upi_total = upi_rds + upi_wrs
+
+    # From "Uncore Performance Monitoring": When measuring the amount of
+    # bandwidth consumed by transmission of the data (i.e. NOT including
+    # the header), it should be .ALL_DATA / 9 * 64B.
+    scale = (64 / 9) / 1_000_000
+    return MetricGroup("lpm_upi_bw", [
+        Metric("lpm_upi_bw_read", "UPI read bandwidth",
+               d_ratio(upi_rds, interval_sec), f"{scale}MB/s"),
+        Metric("lpm_upi_bw_write", "DDR memory write bandwidth",
+               d_ratio(upi_wrs, interval_sec), f"{scale}MB/s"),
+    ], description="UPI Bandwidth")
+
+
 def main() -> None:
     global _args
 
@@ -1076,6 +1097,7 @@ def main() -> None:
         UncoreDir(),
         UncoreMem(),
         UncoreMemBw(),
+        UncoreUpiBw(),
     ])
 
     if _args.metricgroups:
-- 
cgit v1.2.3


From e74f72a7e21782332bb7b9541634199278f3461b Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 27 Jan 2026 10:45:04 -0800
Subject: perf jevents: Add mesh bandwidth saturation metric for Intel

Memory bandwidth saturation from CBOX/CHA events present in
broadwellde, broadwellx, cascadelakex, haswellx, icelakex, skylakex
and snowridgex.

Signed-off-by: Ian Rogers <irogers@google.com>
Tested-by: Thomas Falcon <thomas.falcon@intel.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Benjamin Gray <bgray@linux.ibm.com>
Cc: Caleb Biggers <caleb.biggers@intel.com>
Cc: Edward Baker <edward.baker@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jing Zhang <renyu.zj@linux.alibaba.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Leo Yan <leo.yan@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Perry Taylor <perry.taylor@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sandipan Das <sandipan.das@amd.com>
Cc: Weilin Wang <weilin.wang@intel.com>
Cc: Xu Yang <xu.yang_2@nxp.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/pmu-events/intel_metrics.py | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'tools')

diff --git a/tools/perf/pmu-events/intel_metrics.py b/tools/perf/pmu-events/intel_metrics.py
index f6bb691dc5bb..d56bab7337df 100755
--- a/tools/perf/pmu-events/intel_metrics.py
+++ b/tools/perf/pmu-events/intel_metrics.py
@@ -1033,6 +1033,22 @@ def UncoreMemBw() -> Optional[MetricGroup]:
     ], description="Memory Bandwidth")
 
 
+def UncoreMemSat() -> Optional[Metric]:
+    try:
+        clocks = Event("UNC_CHA_CLOCKTICKS", "UNC_C_CLOCKTICKS")
+        sat = Event("UNC_CHA_DISTRESS_ASSERTED.VERT", "UNC_CHA_FAST_ASSERTED.VERT",
+                    "UNC_C_FAST_ASSERTED")
+    except:
+        return None
+
+    desc = ("Mesh Bandwidth saturation (% CBOX cycles with FAST signal asserted, "
+            "include QPI bandwidth saturation), lower is better")
+    if "UNC_CHA_" in sat.name:
+        desc = ("Mesh Bandwidth saturation (% CHA cycles with FAST signal asserted, "
+                "include UPI bandwidth saturation), lower is better")
+    return Metric("lpm_mem_sat", desc, d_ratio(sat, clocks), "100%")
+
+
 def UncoreUpiBw() -> Optional[MetricGroup]:
     try:
         upi_rds = Event("UNC_UPI_RxL_FLITS.ALL_DATA")
@@ -1097,6 +1113,7 @@ def main() -> None:
         UncoreDir(),
         UncoreMem(),
         UncoreMemBw(),
+        UncoreMemSat(),
         UncoreUpiBw(),
     ])
 
-- 
cgit v1.2.3


From 82e53e7ae09a054b00cf3afdddf7c378351cf3e0 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 27 Jan 2026 10:45:05 -0800
Subject: perf jevents: Add cycles breakdown metric for arm64/AMD/Intel

Breakdown cycles to user, kernel and guest. Add a common_metrics.py
file for such metrics.

Signed-off-by: Ian Rogers <irogers@google.com>
Tested-by: Thomas Falcon <thomas.falcon@intel.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Benjamin Gray <bgray@linux.ibm.com>
Cc: Caleb Biggers <caleb.biggers@intel.com>
Cc: Edward Baker <edward.baker@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jing Zhang <renyu.zj@linux.alibaba.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Leo Yan <leo.yan@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Perry Taylor <perry.taylor@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sandipan Das <sandipan.das@amd.com>
Cc: Weilin Wang <weilin.wang@intel.com>
Cc: Xu Yang <xu.yang_2@nxp.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/pmu-events/Build             |  2 +-
 tools/perf/pmu-events/amd_metrics.py    |  2 ++
 tools/perf/pmu-events/arm64_metrics.py  |  5 ++++-
 tools/perf/pmu-events/common_metrics.py | 19 +++++++++++++++++++
 tools/perf/pmu-events/intel_metrics.py  |  2 ++
 5 files changed, 28 insertions(+), 2 deletions(-)
 create mode 100644 tools/perf/pmu-events/common_metrics.py

(limited to 'tools')

diff --git a/tools/perf/pmu-events/Build b/tools/perf/pmu-events/Build
index 68227614d0b1..ec964ed05974 100644
--- a/tools/perf/pmu-events/Build
+++ b/tools/perf/pmu-events/Build
@@ -52,7 +52,7 @@ $(LEGACY_CACHE_JSON): $(LEGACY_CACHE_PY) $(JSON_DIRS_ROOT)
 	$(call rule_mkdir)
 	$(Q)$(call echo-cmd,gen)$(PYTHON) $(LEGACY_CACHE_PY) > $@
 
-GEN_METRIC_DEPS := pmu-events/metric.py
+GEN_METRIC_DEPS := pmu-events/metric.py pmu-events/common_metrics.py
 
 # Generate AMD Json
 ZENS = $(shell ls -d pmu-events/arch/x86/amdzen*)
diff --git a/tools/perf/pmu-events/amd_metrics.py b/tools/perf/pmu-events/amd_metrics.py
index 83e77ccc059e..e2defaffde3e 100755
--- a/tools/perf/pmu-events/amd_metrics.py
+++ b/tools/perf/pmu-events/amd_metrics.py
@@ -4,6 +4,7 @@ import argparse
 import math
 import os
 from typing import Optional
+from common_metrics import Cycles
 from metric import (d_ratio, has_event, max, Event, JsonEncodeMetric,
                     JsonEncodeMetricGroupDescriptions, Literal, LoadEvents,
                     Metric, MetricGroup, Select)
@@ -475,6 +476,7 @@ def main() -> None:
         AmdItlb(),
         AmdLdSt(),
         AmdUpc(),
+        Cycles(),
         Idle(),
         Rapl(),
         UncoreL3(),
diff --git a/tools/perf/pmu-events/arm64_metrics.py b/tools/perf/pmu-events/arm64_metrics.py
index ac717ca3513a..4ecda96d11fa 100755
--- a/tools/perf/pmu-events/arm64_metrics.py
+++ b/tools/perf/pmu-events/arm64_metrics.py
@@ -4,6 +4,7 @@ import argparse
 import os
 from metric import (JsonEncodeMetric, JsonEncodeMetricGroupDescriptions, LoadEvents,
                     MetricGroup)
+from common_metrics import Cycles
 
 # Global command line arguments.
 _args = None
@@ -34,7 +35,9 @@ def main() -> None:
     directory = f"{_args.events_path}/arm64/{_args.vendor}/{_args.model}/"
     LoadEvents(directory)
 
-    all_metrics = MetricGroup("", [])
+    all_metrics = MetricGroup("", [
+        Cycles(),
+    ])
 
     if _args.metricgroups:
         print(JsonEncodeMetricGroupDescriptions(all_metrics))
diff --git a/tools/perf/pmu-events/common_metrics.py b/tools/perf/pmu-events/common_metrics.py
new file mode 100644
index 000000000000..fcdfb9d3e648
--- /dev/null
+++ b/tools/perf/pmu-events/common_metrics.py
@@ -0,0 +1,19 @@
+# SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
+from metric import (d_ratio, Event, Metric, MetricGroup)
+
+
+def Cycles() -> MetricGroup:
+    cyc_k = Event("cpu\\-cycles:kHh")  # exclude user and guest
+    cyc_g = Event("cpu\\-cycles:G")   # exclude host
+    cyc_u = Event("cpu\\-cycles:uH")  # exclude kernel, hypervisor and guest
+    cyc = cyc_k + cyc_g + cyc_u
+
+    return MetricGroup("lpm_cycles", [
+        Metric("lpm_cycles_total", "Total number of cycles", cyc, "cycles"),
+        Metric("lpm_cycles_user", "User cycles as a percentage of all cycles",
+               d_ratio(cyc_u, cyc), "100%"),
+        Metric("lpm_cycles_kernel", "Kernel cycles as a percentage of all cycles",
+               d_ratio(cyc_k, cyc), "100%"),
+        Metric("lpm_cycles_guest", "Hypervisor guest cycles as a percentage of all cycles",
+               d_ratio(cyc_g, cyc), "100%"),
+    ], description="cycles breakdown per privilege level (users, kernel, guest)")
diff --git a/tools/perf/pmu-events/intel_metrics.py b/tools/perf/pmu-events/intel_metrics.py
index d56bab7337df..52035433b505 100755
--- a/tools/perf/pmu-events/intel_metrics.py
+++ b/tools/perf/pmu-events/intel_metrics.py
@@ -6,6 +6,7 @@ import math
 import os
 import re
 from typing import Optional
+from common_metrics import Cycles
 from metric import (d_ratio, has_event, max, source_count, CheckPmu, Event,
                     JsonEncodeMetric, JsonEncodeMetricGroupDescriptions,
                     Literal, LoadEvents, Metric, MetricConstraint, MetricGroup,
@@ -1095,6 +1096,7 @@ def main() -> None:
     LoadEvents(directory)
 
     all_metrics = MetricGroup("", [
+        Cycles(),
         Idle(),
         Rapl(),
         Smi(),
-- 
cgit v1.2.3


From e205952db7717557f71f22baa96589f0a56d83c5 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 27 Jan 2026 10:45:06 -0800
Subject: perf jevents: Validate that all names given an Event

Validate they exist in a JSON file from one directory found from one
directory above the model's JSON directory.

This avoids broken fallback encodings being created.

Signed-off-by: Ian Rogers <irogers@google.com>
Tested-by: Thomas Falcon <thomas.falcon@intel.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Benjamin Gray <bgray@linux.ibm.com>
Cc: Caleb Biggers <caleb.biggers@intel.com>
Cc: Edward Baker <edward.baker@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jing Zhang <renyu.zj@linux.alibaba.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Leo Yan <leo.yan@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Perry Taylor <perry.taylor@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sandipan Das <sandipan.das@amd.com>
Cc: Weilin Wang <weilin.wang@intel.com>
Cc: Xu Yang <xu.yang_2@nxp.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/pmu-events/metric.py | 36 ++++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)

(limited to 'tools')

diff --git a/tools/perf/pmu-events/metric.py b/tools/perf/pmu-events/metric.py
index 2029b6e28365..585454828c2f 100644
--- a/tools/perf/pmu-events/metric.py
+++ b/tools/perf/pmu-events/metric.py
@@ -11,12 +11,14 @@ from typing import Dict, List, Optional, Set, Tuple, Union
 all_pmus = set()
 all_events = set()
 experimental_events = set()
+all_events_all_models = set()
 
 def LoadEvents(directory: str) -> None:
   """Populate a global set of all known events for the purpose of validating Event names"""
   global all_pmus
   global all_events
   global experimental_events
+  global all_events_all_models
   all_events = {
       "context\\-switches",
       "cpu\\-cycles",
@@ -42,6 +44,20 @@ def LoadEvents(directory: str) -> None:
         # The generated directory may be the same as the input, which
         # causes partial json files. Ignore errors.
         pass
+  all_events_all_models = all_events.copy()
+  for root, dirs, files in os.walk(directory + ".."):
+    for filename in files:
+      if filename.endswith(".json"):
+        try:
+          for x in json.load(open(f"{root}/{filename}")):
+            if "EventName" in x:
+              all_events_all_models.add(x["EventName"])
+            elif "ArchStdEvent" in x:
+              all_events_all_models.add(x["ArchStdEvent"])
+        except json.decoder.JSONDecodeError:
+          # The generated directory may be the same as the input, which
+          # causes partial json files. Ignore errors.
+          pass
 
 
 def CheckPmu(name: str) -> bool:
@@ -64,6 +80,25 @@ def CheckEvent(name: str) -> bool:
 
   return name in all_events
 
+def CheckEveryEvent(*names: str) -> None:
+  """Check all the events exist in at least one json file"""
+  global all_events_all_models
+  if len(all_events_all_models) == 0:
+    assert len(names) == 1, f"Cannot determine valid events in {names}"
+    # No events loaded so assume any event is good.
+    return
+
+  for name in names:
+    # Remove trailing modifier.
+    if ':' in name:
+      name = name[:name.find(':')]
+    elif '/' in name:
+      name = name[:name.find('/')]
+      if any([name.startswith(x) for x in ['amd', 'arm', 'cpu', 'msr', 'power']]):
+        continue
+    if name not in all_events_all_models:
+      raise Exception(f"Is {name} a named json event?")
+
 
 def IsExperimentalEvent(name: str) -> bool:
   global experimental_events
@@ -403,6 +438,7 @@ class Event(Expression):
 
   def __init__(self, *args: str):
     error = ""
+    CheckEveryEvent(*args)
     for name in args:
       if CheckEvent(name):
         self.name = _FixEscapes(name)
-- 
cgit v1.2.3


From b640d556a2b354863a9962747a01f67f31cbf4d8 Mon Sep 17 00:00:00 2001
From: Mykyta Yatsenko <yatsenko@meta.com>
Date: Wed, 28 Jan 2026 19:05:51 +0000
Subject: selftests/bpf: Remove xxd util dependency

The verification signature header generation requires converting a
binary certificate to a C array. Previously this only worked with
xxd (part of vim-common package).
As xxd may not be available on some systems building selftests, it makes
sense to substitute it with more common utils: hexdump, wc, sed to
generate equivalent C array output.

Tested by generating header with both xxd and hexdump and comparing
them.

Signed-off-by: Mykyta Yatsenko <yatsenko@meta.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Tested-by: Alan Maguire <alan.maguire@oracle.com>
Link: https://lore.kernel.org/bpf/20260128190552.242335-1-mykyta.yatsenko5@gmail.com
---
 tools/testing/selftests/bpf/.gitignore |  1 -
 tools/testing/selftests/bpf/Makefile   | 10 ++++++----
 2 files changed, 6 insertions(+), 5 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore
index b8bf51b7a0b0..a3ea98211ea6 100644
--- a/tools/testing/selftests/bpf/.gitignore
+++ b/tools/testing/selftests/bpf/.gitignore
@@ -23,7 +23,6 @@ test_tcpnotify_user
 test_libbpf
 xdping
 test_cpp
-test_progs_verification_cert
 *.d
 *.subskel.h
 *.skel.h
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 2c2f68a171ed..c6bf4dfb1495 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -720,9 +720,12 @@ $(VERIFICATION_CERT) $(PRIVATE_KEY): $(VERIFY_SIG_SETUP)
 	$(Q)mkdir -p $(BUILD_DIR)
 	$(Q)$(VERIFY_SIG_SETUP) genkey $(BUILD_DIR)
 
+# Generates a header with C array declaration, containing test_progs_verification_cert bytes
 $(VERIFY_SIG_HDR): $(VERIFICATION_CERT)
-	$(Q)ln -fs $< test_progs_verification_cert && \
-	xxd -i test_progs_verification_cert > $@
+	$(Q)(echo "unsigned char test_progs_verification_cert[] = {"; \
+	 hexdump -v -e '12/1 "  0x%02x," "\n"' $< | sed 's/0x  ,//g; $$s/,$$//'; \
+	 echo "};"; \
+	 echo "unsigned int test_progs_verification_cert_len = $$(wc -c < $<);") > $@
 
 # Define test_progs test runner.
 TRUNNER_TESTS_DIR := prog_tests
@@ -898,8 +901,7 @@ EXTRA_CLEAN := $(SCRATCH_DIR) $(HOST_SCRATCH_DIR)			\
 			       *.BTF *.BTF_ids *.BTF.base		\
 			       no_alu32 cpuv4 bpf_gcc			\
 			       liburandom_read.so)			\
-	$(OUTPUT)/FEATURE-DUMP.selftests				\
-	test_progs_verification_cert
+	$(OUTPUT)/FEATURE-DUMP.selftests
 
 .PHONY: docs docs-clean
 
-- 
cgit v1.2.3


From 08a7491843224f8b96518fbe70d9e48163046054 Mon Sep 17 00:00:00 2001
From: Ihor Solodrai <ihor.solodrai@linux.dev>
Date: Wed, 28 Jan 2026 13:12:55 -0800
Subject: bpftool: Fix dependencies for static build

When building selftests/bpf with EXTRA_LDFLAGS=-static the follwoing
error happens:

  LINK    /ws/linux/tools/testing/selftests/bpf/tools/build/bpftool/bootstrap/bpftool
/usr/bin/x86_64-linux-gnu-ld.bfd: /usr/lib/gcc/x86_64-linux-gnu/15/../../../x86_64-linux-gnu/libcrypto.a(libcrypto-lib-dso_dlfcn.o): in function `dlfcn_globallookup':
   [...]
/usr/bin/x86_64-linux-gnu-ld.bfd: /usr/lib/gcc/x86_64-linux-gnu/15/../../../x86_64-linux-gnu/libcrypto.a(libcrypto-lib-c_zlib.o): in function `zlib_oneshot_expand_block':
(.text+0xc64): undefined reference to `uncompress'
/usr/bin/x86_64-linux-gnu-ld.bfd: /usr/lib/gcc/x86_64-linux-gnu/15/../../../x86_64-linux-gnu/libcrypto.a(libcrypto-lib-c_zlib.o): in function `zlib_oneshot_compress_block':
(.text+0xce4): undefined reference to `compress'
collect2: error: ld returned 1 exit status
make[1]: *** [Makefile:252: /ws/linux/tools/testing/selftests/bpf/tools/build/bpftool/bootstrap/bpftool] Error 1
make: *** [Makefile:327: /ws/linux/tools/testing/selftests/bpf/tools/sbin/bpftool] Error 2
make: *** Waiting for unfinished jobs....

This is caused by wrong order of dependencies in the Makefile. Fix it.

Signed-off-by: Ihor Solodrai <ihor.solodrai@linux.dev>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20260128211255.376933-1-ihor.solodrai@linux.dev
---
 tools/bpf/bpftool/Makefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile
index 5442073a2e42..519ea5cb8ab1 100644
--- a/tools/bpf/bpftool/Makefile
+++ b/tools/bpf/bpftool/Makefile
@@ -130,8 +130,8 @@ include $(FEATURES_DUMP)
 endif
 endif
 
-LIBS = $(LIBBPF) -lelf -lz -lcrypto
-LIBS_BOOTSTRAP = $(LIBBPF_BOOTSTRAP) -lelf -lz -lcrypto
+LIBS = $(LIBBPF) -lelf -lcrypto -lz
+LIBS_BOOTSTRAP = $(LIBBPF_BOOTSTRAP) -lelf -lcrypto -lz
 
 ifeq ($(feature-libelf-zstd),1)
 LIBS += -lzstd
-- 
cgit v1.2.3


From 60d2c438c1bb705cdbf74ce8f12e6e141a4719b0 Mon Sep 17 00:00:00 2001
From: Luis Gerhorst <luis.gerhorst@fau.de>
Date: Tue, 27 Jan 2026 12:59:12 +0100
Subject: bpf: Test nospec after dead stack write in helper

Without the fix from the previous commit, the selftest fails:

$ ./tools/testing/selftests/bpf/vmtest.sh -- \
        ./test_progs -t verifier_unpriv
[...]
run_subtest:PASS:obj_open_mem 0 nsec
libbpf: BTF loading error: -EPERM
libbpf: Error loading .BTF into kernel: -EPERM. BTF is optional, ignoring.
libbpf: prog 'unpriv_nospec_after_helper_stack_write': BPF program load failed: -EFAULT
libbpf: prog 'unpriv_nospec_after_helper_stack_write': failed to load: -EFAULT
libbpf: failed to load object 'verifier_unpriv'
run_subtest:FAIL:unexpected_load_failure unexpected error: -14 (errno 14)
VERIFIER LOG:
=============
0: R1=ctx() R10=fp0
0: (b7) r0 = 0                        ; R0=P0
1: (55) if r0 != 0x1 goto pc+6 2: R0=Pscalar() R1=ctx() R10=fp0
2: (b7) r2 = 0                        ; R2=P0
3: (bf) r3 = r10                      ; R3=fp0 R10=fp0
4: (07) r3 += -16                     ; R3=fp-16
5: (b7) r4 = 4                        ; R4=P4
6: (b7) r5 = 0                        ; R5=P0
7: (85) call bpf_skb_load_bytes_relative#68
verifier bug: speculation barrier after jump instruction may not have the desired effect (BPF_CLASS(insn->code) == BPF_JMP || BPF_CLASS(insn->code) == BPF_JMP32)
processed 9 insns (limit 1000000) max_states_per_insn 0 total_states 0 peak_states 0 mark_read 0
=============
[...]

The test is based on the PoC from the report.

Signed-off-by: Luis Gerhorst <luis.gerhorst@fau.de>
Reported-by: Yinhao Hu <dddddd@hust.edu.cn>
Reported-by: Kaiyan Mei <M202472210@hust.edu.cn>
Reported-by: Dongliang Mu <dzm91@hust.edu.cn>
Link: https://lore.kernel.org/bpf/7678017d-b760-4053-a2d8-a6879b0dbeeb@hust.edu.cn/
Link: https://lore.kernel.org/r/20260127115912.3026761-3-luis.gerhorst@fau.de
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 .../testing/selftests/bpf/progs/verifier_unpriv.c  | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/progs/verifier_unpriv.c b/tools/testing/selftests/bpf/progs/verifier_unpriv.c
index 28b4f7035ceb..8ee1243e62a8 100644
--- a/tools/testing/selftests/bpf/progs/verifier_unpriv.c
+++ b/tools/testing/selftests/bpf/progs/verifier_unpriv.c
@@ -950,4 +950,26 @@ l3_%=:	r0 = 0;						\
 "	::: __clobber_all);
 }
 
+SEC("socket")
+__description("unpriv: nospec after dead stack write in helper")
+__success __success_unpriv
+__retval(0)
+/* Dead code sanitizer rewrites the call to `goto -1`. */
+__naked void unpriv_dead_helper_stack_write_nospec_result(void)
+{
+	asm volatile ("					\
+	r0 = 0;						\
+	if r0 != 1 goto l0_%=;				\
+	r2 = 0;						\
+	r3 = r10;					\
+	r3 += -16;					\
+	r4 = 4;						\
+	r5 = 0;						\
+	call %[bpf_skb_load_bytes_relative];		\
+l0_%=:	exit;						\
+"	:
+	: __imm(bpf_skb_load_bytes_relative)
+	: __clobber_all);
+}
+
 char _license[] SEC("license") = "GPL";
-- 
cgit v1.2.3


From 6080d525aba8a6cab9fe4b841ca1fab48a2969c6 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 27 Jan 2026 12:38:28 +0000
Subject: selftest: packetdrill: add tcp_timestamping_tcp_tx_timestamp_bug.pkt

Test tcp_tx_timestamp() behavior after ("tcp: tcp_tx_timestamp()
must look at the rtx queue").

Without the fix, this new test fails like this:

tcp_timestamping_tcp_tx_timestamp_bug.pkt:55: runtime error in recvmsg call: Expected result 0 but got -1 with errno 11 (Resource temporarily unavailable)

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Jason Xing <kerneljasonxing@gmail.com>
Link: https://patch.msgid.link/20260127123828.4098577-3-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../tcp_timestamping_tcp_tx_timestamp_bug.pkt      | 70 ++++++++++++++++++++++
 1 file changed, 70 insertions(+)
 create mode 100644 tools/testing/selftests/net/packetdrill/tcp_timestamping_tcp_tx_timestamp_bug.pkt

(limited to 'tools')

diff --git a/tools/testing/selftests/net/packetdrill/tcp_timestamping_tcp_tx_timestamp_bug.pkt b/tools/testing/selftests/net/packetdrill/tcp_timestamping_tcp_tx_timestamp_bug.pkt
new file mode 100644
index 000000000000..95a1957a2cf9
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_timestamping_tcp_tx_timestamp_bug.pkt
@@ -0,0 +1,70 @@
+// SPDX-License-Identifier: GPL-2.0
+// Test after "tcp: tcp_tx_timestamp() must look at the rtx queue"
+
+// This test is about receiving the SCM_TSTAMP_ACK,
+// we do not care about its SCM_TIMESTAMPING precision.
+--tolerance_usecs=1000000
+
+`./defaults.sh
+sysctl -q net.ipv4.tcp_min_tso_segs=70
+`
+
+// Create a socket and set it to non-blocking.
+    0	socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0	fcntl(3, F_GETFL) = 0x2 (flags O_RDWR)
+   +0	fcntl(3, F_SETFL, O_RDWR|O_NONBLOCK) = 0
+
+// Establish connection and verify that there was no error.
+   +0	connect(3, ..., ...) = -1 EINPROGRESS (Operation now in progress)
+   +0	> S 0:0(0) <mss 1460,sackOK,TS val 100 ecr 0,nop,wscale 8>
++.010	< S. 0:0(0) ack 1 win 65535 <mss 1000,sackOK,TS val 700 ecr 100,nop,wscale 7>
+   +0	> . 1:1(0) ack 1 <nop,nop,TS val 200 ecr 700>
+   +0	getsockopt(3, SOL_SOCKET, SO_ERROR, [0], [4]) = 0
+   +0	setsockopt(3, SOL_SOCKET, SO_SNDBUF, [30000], 4) = 0
+
+   +0   write(3, ..., 9880) = 9880
+   +0   > P. 1:9881(9880) ack 1 <nop,nop,TS val 200 ecr 700>
++.010   < . 1:1(0) ack 9881 win 10000 <nop,nop,TS val 701 ecr 200>
+
+   +0   write(3, ..., 19760) = 19760
+   +0   > P. 9881:29641(19760) ack 1 <nop,nop,TS val 201 ecr 701>
++.010   < . 1:1(0) ack 29641 win 10000 <nop,nop,TS val 702 ecr 201>
+
+   +0   write(3, ..., 39520) = 39520
+   +0   > P. 29641:69161(39520) ack 1 <nop,nop,TS val 202 ecr 702>
++.010   < . 1:1(0) ack 69161 win 10000 <nop,nop,TS val 703 ecr 202>
+
+// One more write to increase cwnd
+   +0	write(3, ..., 79040) = 79040
+   +0	> P. 69161:108681(39520) ack 1 <nop,nop,TS val 203 ecr 703>
+   +0	> P. 108681:148201(39520) ack 1 <nop,nop,TS val 203 ecr 703>
++.010	< . 1:1(0) ack 148201 win 1000 <nop,nop,TS val 704 ecr 203>
+
+   +0	setsockopt(3, SOL_SOCKET, SO_TIMESTAMPING,
+		   [SOF_TIMESTAMPING_TX_ACK | SOF_TIMESTAMPING_SOFTWARE |
+		    SOF_TIMESTAMPING_OPT_ID], 4) = 0
+
+// We have one write filling one skb
+// last byte can not be stored because of our small SO_SNDBUF
+   +0	write(3, ..., 65209) = 65208
+   +0	> P. 148201:213409(65208) ack 1 <nop,nop,TS val 204 ecr 704>
++.010	< . 1:1(0) ack 213409 win 1000 <nop,nop,TS val 705 ecr 204>
+
+// SCM_TSTAMP_ACK should be received after the last ack at
+// t=60ms.
+   +0	recvmsg(3, {msg_name(...)=...,
+		    msg_iov(1)=[{...,0}],
+                    msg_flags=MSG_ERRQUEUE|MSG_TRUNC,
+                    msg_control=[
+			{cmsg_level=SOL_SOCKET,
+			 cmsg_type=SCM_TIMESTAMPING,
+			 cmsg_data={scm_sec=0,scm_nsec=60000000}},
+			{cmsg_level=CMSG_LEVEL_IP,
+			 cmsg_type=CMSG_TYPE_RECVERR,
+			 cmsg_data={ee_errno=ENOMSG,
+				    ee_origin=SO_EE_ORIGIN_TIMESTAMPING,
+				    ee_type=0,
+				    ee_code=0,
+				    ee_info=SCM_TSTAMP_ACK,
+				    ee_data=65207}}
+		    ]}, MSG_ERRQUEUE) = 0
-- 
cgit v1.2.3


From 70de46740b62b83198802bfe6682c5f865c25dc5 Mon Sep 17 00:00:00 2001
From: Daniel Zahka <daniel.zahka@gmail.com>
Date: Tue, 27 Jan 2026 08:30:55 -0800
Subject: selftests: drv-net: psp: fix test flakes from racy connection close

There is a bug in assoc_sk_only_mismatch() and
assoc_sk_only_mismatch_tx() that creates a race condition which
triggers test flakes in later test cases e.g. data_send_bad_key().

The problem is that the client uses the "conn clr" rpc to setup a data
connection with psp_responder, but never uses a matching "data close"
rpc. This creates a race condition where if the client can queue
another data sock request, like in data_send_bad_key(), before the
server can accept the old connection from the backlog we end up in a
situation where we have two connections in the backlog: one for the
closed connection we have received a FIN for, and one for the new PSP
connection which is expecting to do key exchange.

From there the server pops the closed connection from the backlog, but
the data_send_bad_key() test case in psp.py hangs waiting to perform
key exchange.

The fix is to properly use _conn_close, which fill force the server to
remove the closed connection from the backlog before sending the RPC
ack to the client.

Signed-off-by: Daniel Zahka <daniel.zahka@gmail.com>
Link: https://patch.msgid.link/20260127-psp-flaky-test-v1-1-13403e390af3@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/drivers/net/psp.py | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/drivers/net/psp.py b/tools/testing/selftests/drivers/net/psp.py
index 528a421ecf76..864d9fce1094 100755
--- a/tools/testing/selftests/drivers/net/psp.py
+++ b/tools/testing/selftests/drivers/net/psp.py
@@ -266,6 +266,7 @@ def assoc_sk_only_mismatch(cfg):
         the_exception = cm.exception
         ksft_eq(the_exception.nl_msg.extack['bad-attr'], ".dev-id")
         ksft_eq(the_exception.nl_msg.error, -errno.EINVAL)
+        _close_conn(cfg, s)
 
 
 def assoc_sk_only_mismatch_tx(cfg):
@@ -283,6 +284,7 @@ def assoc_sk_only_mismatch_tx(cfg):
         the_exception = cm.exception
         ksft_eq(the_exception.nl_msg.extack['bad-attr'], ".dev-id")
         ksft_eq(the_exception.nl_msg.error, -errno.EINVAL)
+        _close_conn(cfg, s)
 
 
 def assoc_sk_only_unconn(cfg):
-- 
cgit v1.2.3


From 5fc90003de59b0f772a1654f5609fa5f87b4300f Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 27 Jan 2026 17:48:06 +0000
Subject: selftests: drv-net: toeplitz: accept bigger rss keys

/proc/sys/net/core/netdev_rss_key got bigger (256 bytes instead of 52)

Fixes: 37b0ea8fef56 ("net: expand NETDEV_RSS_KEY_LEN to 256 bytes")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20260127174806.886561-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/drivers/net/hw/toeplitz.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/drivers/net/hw/toeplitz.c b/tools/testing/selftests/drivers/net/hw/toeplitz.c
index 285bb17df9c2..cd4bf58a44ee 100644
--- a/tools/testing/selftests/drivers/net/hw/toeplitz.c
+++ b/tools/testing/selftests/drivers/net/hw/toeplitz.c
@@ -59,7 +59,7 @@
 #include "../../../net/lib/ksft.h"
 
 #define TOEPLITZ_KEY_MIN_LEN	40
-#define TOEPLITZ_KEY_MAX_LEN	60
+#define TOEPLITZ_KEY_MAX_LEN	256
 
 #define TOEPLITZ_STR_LEN(K)	(((K) * 3) - 1)	/* hex encoded: AA:BB:CC:...:ZZ */
 #define TOEPLITZ_STR_MIN_LEN	TOEPLITZ_STR_LEN(TOEPLITZ_KEY_MIN_LEN)
-- 
cgit v1.2.3


From 8467458dfa61b37e259e3485a5d3e415d08193c1 Mon Sep 17 00:00:00 2001
From: "Matthieu Baerts (NGI0)" <matttbe@kernel.org>
Date: Tue, 27 Jan 2026 20:27:24 +0100
Subject: selftests: mptcp: check no dup close events after error

This validates the previous commit: subflow closed events are re-sent
with less info when the initial subflow is disconnected after an error
and each time a subflow is closed after that.

In this new test, the userspace PM is involved because that's how it was
discovered, but it is not specific to it. The initial subflow is
terminated with a RESET, and that will cause the subflow disconnect.
Then, a new subflow is initiated, but also got rejected, which cause a
second subflow closed event, but not a third one.

While at it, in case of failure to get the expected amount of events,
the events are printed.

The 'Fixes' tag here below is the same as the one from the previous
commit: this patch here is not fixing anything wrong in the selftests,
but it validates the previous fix for an issue introduced by this commit
ID.

Fixes: d82809b6c5f2 ("mptcp: avoid duplicated SUB_CLOSED events")
Cc: stable@vger.kernel.org
Reviewed-by: Geliang Tang <geliang@kernel.org>
Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Link: https://patch.msgid.link/20260127-net-mptcp-dup-nl-events-v1-2-7f71e1bc4feb@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/mptcp/mptcp_join.sh | 51 +++++++++++++++++++++++++
 1 file changed, 51 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh
index b2e6e548f796..1765714a1e2f 100755
--- a/tools/testing/selftests/net/mptcp/mptcp_join.sh
+++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh
@@ -3872,11 +3872,32 @@ chk_evt_nr()
 	count=$(grep -cw "type:${evt}" "${evts}")
 	if [ "${count}" != "${exp}" ]; then
 		fail_test "got ${count} events, expected ${exp}"
+		cat "${evts}"
 	else
 		print_ok
 	fi
 }
 
+# $1: ns ; $2: event type ; $3: expected count
+wait_event()
+{
+	local ns="${1}"
+	local evt_name="${2}"
+	local exp="${3}"
+
+	local evt="${!evt_name}"
+	local evts="${evts_ns1}"
+	local count
+
+	[ "${ns}" == "ns2" ] && evts="${evts_ns2}"
+
+	for _ in $(seq 100); do
+		count=$(grep -cw "type:${evt}" "${evts}")
+		[ "${count}" -ge "${exp}" ] && break
+		sleep 0.1
+	done
+}
+
 userspace_tests()
 {
 	# userspace pm type prevents add_addr
@@ -4085,6 +4106,36 @@ userspace_tests()
 		kill_events_pids
 		mptcp_lib_kill_group_wait $tests_pid
 	fi
+
+	# userspace pm no duplicated spurious close events after an error
+	if reset_with_events "userspace pm no dup close events after error" &&
+	   continue_if mptcp_lib_has_file '/proc/sys/net/mptcp/pm_type'; then
+		set_userspace_pm $ns2
+		pm_nl_set_limits $ns1 0 2
+		{ timeout_test=120 test_linkfail=128 speed=slow \
+			run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null
+		local tests_pid=$!
+		wait_event ns2 MPTCP_LIB_EVENT_ESTABLISHED 1
+		userspace_pm_add_sf $ns2 10.0.3.2 20
+		chk_mptcp_info subflows 1 subflows 1
+		chk_subflows_total 2 2
+
+		# force quick loss
+		ip netns exec $ns2 sysctl -q net.ipv4.tcp_syn_retries=1
+		if ip netns exec "${ns1}" ${iptables} -A INPUT -s "10.0.1.2" \
+		      -p tcp --tcp-option 30 -j REJECT --reject-with tcp-reset &&
+		   ip netns exec "${ns2}" ${iptables} -A INPUT -d "10.0.1.2" \
+		      -p tcp --tcp-option 30 -j REJECT --reject-with tcp-reset; then
+			wait_event ns2 MPTCP_LIB_EVENT_SUB_CLOSED 1
+			wait_event ns1 MPTCP_LIB_EVENT_SUB_CLOSED 1
+			chk_subflows_total 1 1
+			userspace_pm_add_sf $ns2 10.0.1.2 0
+			wait_event ns2 MPTCP_LIB_EVENT_SUB_CLOSED 2
+			chk_evt_nr ns2 MPTCP_LIB_EVENT_SUB_CLOSED 2
+		fi
+		kill_events_pids
+		mptcp_lib_kill_group_wait $tests_pid
+	fi
 }
 
 endpoint_tests()
-- 
cgit v1.2.3


From 2ef9e3a3845d0a20b62b01f5b731debd0364688d Mon Sep 17 00:00:00 2001
From: "Matthieu Baerts (NGI0)" <matttbe@kernel.org>
Date: Tue, 27 Jan 2026 20:27:26 +0100
Subject: selftests: mptcp: check subflow errors in close events

This validates the previous commit: subflow closed events should contain
an error field when a subflow got closed with an error, e.g. reset or
timeout.

For this test, the chk_evt_nr helper has been extended to check
attributes in the matched events.

In this test, the 2 subflow closed events should have an error.

The 'Fixes' tag here below is the same as the one from the previous
commit: this patch here is not fixing anything wrong in the selftests,
but it validates the previous fix for an issue introduced by this commit
ID.

Fixes: 15cc10453398 ("mptcp: deliver ssk errors to msk")
Cc: stable@vger.kernel.org
Reviewed-by: Geliang Tang <geliang@kernel.org>
Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Link: https://patch.msgid.link/20260127-net-mptcp-dup-nl-events-v1-4-7f71e1bc4feb@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/mptcp/mptcp_join.sh | 23 ++++++++++++++++++++---
 1 file changed, 20 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh
index 1765714a1e2f..3fc29201362a 100755
--- a/tools/testing/selftests/net/mptcp/mptcp_join.sh
+++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh
@@ -3847,21 +3847,28 @@ userspace_pm_chk_get_addr()
 	fi
 }
 
-# $1: ns ; $2: event type ; $3: count
+# $1: ns ; $2: event type ; $3: count ; [ $4: attr ; $5: attr count ]
 chk_evt_nr()
 {
 	local ns=${1}
 	local evt_name="${2}"
 	local exp="${3}"
+	local attr="${4}"
+	local attr_exp="${5}"
 
 	local evts="${evts_ns1}"
 	local evt="${!evt_name}"
+	local attr_name
 	local count
 
+	if [ -n "${attr}" ]; then
+		attr_name=", ${attr}: ${attr_exp}"
+	fi
+
 	evt_name="${evt_name:16}" # without MPTCP_LIB_EVENT_
 	[ "${ns}" == "ns2" ] && evts="${evts_ns2}"
 
-	print_check "event ${ns} ${evt_name} (${exp})"
+	print_check "event ${ns} ${evt_name} (${exp}${attr_name})"
 
 	if [[ "${evt_name}" = "LISTENER_"* ]] &&
 	   ! mptcp_lib_kallsyms_has "mptcp_event_pm_listener$"; then
@@ -3873,6 +3880,16 @@ chk_evt_nr()
 	if [ "${count}" != "${exp}" ]; then
 		fail_test "got ${count} events, expected ${exp}"
 		cat "${evts}"
+		return
+	elif [ -z "${attr}" ]; then
+		print_ok
+		return
+	fi
+
+	count=$(grep -w "type:${evt}" "${evts}" | grep -c ",${attr}:")
+	if [ "${count}" != "${attr_exp}" ]; then
+		fail_test "got ${count} event attributes, expected ${attr_exp}"
+		grep -w "type:${evt}" "${evts}"
 	else
 		print_ok
 	fi
@@ -4131,7 +4148,7 @@ userspace_tests()
 			chk_subflows_total 1 1
 			userspace_pm_add_sf $ns2 10.0.1.2 0
 			wait_event ns2 MPTCP_LIB_EVENT_SUB_CLOSED 2
-			chk_evt_nr ns2 MPTCP_LIB_EVENT_SUB_CLOSED 2
+			chk_evt_nr ns2 MPTCP_LIB_EVENT_SUB_CLOSED 2 error 2
 		fi
 		kill_events_pids
 		mptcp_lib_kill_group_wait $tests_pid
-- 
cgit v1.2.3


From c5d5ecf21fdd9ce91e6116feb3aa83cee73352cc Mon Sep 17 00:00:00 2001
From: "Matthieu Baerts (NGI0)" <matttbe@kernel.org>
Date: Tue, 27 Jan 2026 20:27:27 +0100
Subject: selftests: mptcp: join: fix local endp not being tracked

When running this mptcp_join.sh selftest on older kernel versions not
supporting local endpoints tracking, this test fails because 3 MP_JOIN
ACKs have been received, while only 2 were expected.

It is not clear why only 2 MP_JOIN ACKs were expected on old kernel
versions, while 3 MP_JOIN SYN and SYN+ACK were expected. When testing on
the v5.15.197 kernel, 3 MP_JOIN ACKs are seen, which is also what is
expected in the selftests included in this kernel version, see commit
f4480eaad489 ("selftests: mptcp: add missing join check").

Switch the expected MP_JOIN ACKs to 3. While at it, move this
chk_join_nr helper out of the special condition for older kernel
versions as it is now the same as with more recent ones. Also, invert
the condition to be more logical: what's expected on newer kernel
versions having such helper first.

Fixes: d4c81bbb8600 ("selftests: mptcp: join: support local endpoint being tracked or not")
Cc: stable@vger.kernel.org
Reviewed-by: Mat Martineau <martineau@kernel.org>
Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Link: https://patch.msgid.link/20260127-net-mptcp-dup-nl-events-v1-5-7f71e1bc4feb@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/mptcp/mptcp_join.sh | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh
index 3fc29201362a..e70d3420954f 100755
--- a/tools/testing/selftests/net/mptcp/mptcp_join.sh
+++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh
@@ -2329,17 +2329,16 @@ signal_address_tests()
 		ip netns exec $ns1 sysctl -q net.mptcp.add_addr_timeout=1
 		speed=slow \
 			run_tests $ns1 $ns2 10.0.1.1
+		chk_join_nr 3 3 3
 
 		# It is not directly linked to the commit introducing this
 		# symbol but for the parent one which is linked anyway.
-		if ! mptcp_lib_kallsyms_has "mptcp_pm_subflow_check_next$"; then
-			chk_join_nr 3 3 2
-			chk_add_nr 4 4
-		else
-			chk_join_nr 3 3 3
+		if mptcp_lib_kallsyms_has "mptcp_pm_subflow_check_next$"; then
 			# the server will not signal the address terminating
 			# the MPC subflow
 			chk_add_nr 3 3
+		else
+			chk_add_nr 4 4
 		fi
 	fi
 }
-- 
cgit v1.2.3


From 5e51803521938566bdf099501379a9bdbacb6066 Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo@kernel.org>
Date: Thu, 22 Jan 2026 18:46:17 +0100
Subject: selftests: netfilter: nft_flowtable.sh: Add IP6IP6 flowtable selftest

Similar to IPIP, introduce specific selftest for IP6IP6 flowtable SW
acceleration in nft_flowtable.sh

Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Signed-off-by: Florian Westphal <fw@strlen.de>
---
 .../selftests/net/netfilter/nft_flowtable.sh       | 62 ++++++++++++++++++----
 1 file changed, 53 insertions(+), 9 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/netfilter/nft_flowtable.sh b/tools/testing/selftests/net/netfilter/nft_flowtable.sh
index a68bc882fa4e..14d7f67715ed 100755
--- a/tools/testing/selftests/net/netfilter/nft_flowtable.sh
+++ b/tools/testing/selftests/net/netfilter/nft_flowtable.sh
@@ -592,16 +592,28 @@ ip -net "$nsr1" link set tun0 up
 ip -net "$nsr1" addr add 192.168.100.1/24 dev tun0
 ip netns exec "$nsr1" sysctl net.ipv4.conf.tun0.forwarding=1 > /dev/null
 
+ip -net "$nsr1" link add name tun6 type ip6tnl local fee1:2::1 remote fee1:2::2
+ip -net "$nsr1" link set tun6 up
+ip -net "$nsr1" addr add fee1:3::1/64 dev tun6 nodad
+
 ip -net "$nsr2" link add name tun0 type ipip local 192.168.10.2 remote 192.168.10.1
 ip -net "$nsr2" link set tun0 up
 ip -net "$nsr2" addr add 192.168.100.2/24 dev tun0
 ip netns exec "$nsr2" sysctl net.ipv4.conf.tun0.forwarding=1 > /dev/null
 
+ip -net "$nsr2" link add name tun6 type ip6tnl local fee1:2::2 remote fee1:2::1
+ip -net "$nsr2" link set tun6 up
+ip -net "$nsr2" addr add fee1:3::2/64 dev tun6 nodad
+
 ip -net "$nsr1" route change default via 192.168.100.2
 ip -net "$nsr2" route change default via 192.168.100.1
+ip -6 -net "$nsr1" route change default via fee1:3::2
+ip -6 -net "$nsr2" route change default via fee1:3::1
 ip -net "$ns2" route add default via 10.0.2.1
+ip -6 -net "$ns2" route add default via dead:2::1
 
 ip netns exec "$nsr1" nft -a insert rule inet filter forward 'meta oif tun0 accept'
+ip netns exec "$nsr1" nft -a insert rule inet filter forward 'meta oif tun6 accept'
 ip netns exec "$nsr1" nft -a insert rule inet filter forward \
 	'meta oif "veth0" tcp sport 12345 ct mark set 1 flow add @f1 counter name routed_repl accept'
 
@@ -611,28 +623,51 @@ if ! test_tcp_forwarding_nat "$ns1" "$ns2" 1 "IPIP tunnel"; then
 	ret=1
 fi
 
+if test_tcp_forwarding "$ns1" "$ns2" 1 6 "[dead:2::99]" 12345; then
+	echo "PASS: flow offload for ns1/ns2 IP6IP6 tunnel"
+else
+	echo "FAIL: flow offload for ns1/ns2 with IP6IP6 tunnel" 1>&2
+	ip netns exec "$nsr1" nft list ruleset
+	ret=1
+fi
+
 # Create vlan tagged devices for IPIP traffic.
 ip -net "$nsr1" link add link veth1 name veth1.10 type vlan id 10
 ip -net "$nsr1" link set veth1.10 up
 ip -net "$nsr1" addr add 192.168.20.1/24 dev veth1.10
+ip -net "$nsr1" addr add fee1:4::1/64 dev veth1.10 nodad
 ip netns exec "$nsr1" sysctl net.ipv4.conf.veth1/10.forwarding=1 > /dev/null
 ip netns exec "$nsr1" nft -a insert rule inet filter forward 'meta oif veth1.10 accept'
-ip -net "$nsr1" link add name tun1 type ipip local 192.168.20.1 remote 192.168.20.2
-ip -net "$nsr1" link set tun1 up
-ip -net "$nsr1" addr add 192.168.200.1/24 dev tun1
+
+ip -net "$nsr1" link add name tun0.10 type ipip local 192.168.20.1 remote 192.168.20.2
+ip -net "$nsr1" link set tun0.10 up
+ip -net "$nsr1" addr add 192.168.200.1/24 dev tun0.10
 ip -net "$nsr1" route change default via 192.168.200.2
-ip netns exec "$nsr1" sysctl net.ipv4.conf.tun1.forwarding=1 > /dev/null
-ip netns exec "$nsr1" nft -a insert rule inet filter forward 'meta oif tun1 accept'
+ip netns exec "$nsr1" sysctl net.ipv4.conf.tun0/10.forwarding=1 > /dev/null
+ip netns exec "$nsr1" nft -a insert rule inet filter forward 'meta oif tun0.10 accept'
+
+ip -net "$nsr1" link add name tun6.10 type ip6tnl local fee1:4::1 remote fee1:4::2
+ip -net "$nsr1" link set tun6.10 up
+ip -net "$nsr1" addr add fee1:5::1/64 dev tun6.10 nodad
+ip -6 -net "$nsr1" route change default via fee1:5::2
+ip netns exec "$nsr1" nft -a insert rule inet filter forward 'meta oif tun6.10 accept'
 
 ip -net "$nsr2" link add link veth0 name veth0.10 type vlan id 10
 ip -net "$nsr2" link set veth0.10 up
 ip -net "$nsr2" addr add 192.168.20.2/24 dev veth0.10
+ip -net "$nsr2" addr add fee1:4::2/64 dev veth0.10 nodad
 ip netns exec "$nsr2" sysctl net.ipv4.conf.veth0/10.forwarding=1 > /dev/null
-ip -net "$nsr2" link add name tun1 type ipip local 192.168.20.2 remote 192.168.20.1
-ip -net "$nsr2" link set tun1 up
-ip -net "$nsr2" addr add 192.168.200.2/24 dev tun1
+
+ip -net "$nsr2" link add name tun0.10 type ipip local 192.168.20.2 remote 192.168.20.1
+ip -net "$nsr2" link set tun0.10 up
+ip -net "$nsr2" addr add 192.168.200.2/24 dev tun0.10
 ip -net "$nsr2" route change default via 192.168.200.1
-ip netns exec "$nsr2" sysctl net.ipv4.conf.tun1.forwarding=1 > /dev/null
+ip netns exec "$nsr2" sysctl net.ipv4.conf.tun0/10.forwarding=1 > /dev/null
+
+ip -net "$nsr2" link add name tun6.10 type ip6tnl local fee1:4::2 remote fee1:4::1
+ip -net "$nsr2" link set tun6.10 up
+ip -net "$nsr2" addr add fee1:5::2/64 dev tun6.10 nodad
+ip -6 -net "$nsr2" route change default via fee1:5::1
 
 if ! test_tcp_forwarding_nat "$ns1" "$ns2" 1 "IPIP tunnel over vlan"; then
 	echo "FAIL: flow offload for ns1/ns2 with IPIP tunnel over vlan" 1>&2
@@ -640,10 +675,19 @@ if ! test_tcp_forwarding_nat "$ns1" "$ns2" 1 "IPIP tunnel over vlan"; then
 	ret=1
 fi
 
+if test_tcp_forwarding "$ns1" "$ns2" 1 6 "[dead:2::99]" 12345; then
+	echo "PASS: flow offload for ns1/ns2 IP6IP6 tunnel over vlan"
+else
+	echo "FAIL: flow offload for ns1/ns2 with IP6IP6 tunnel over vlan" 1>&2
+	ip netns exec "$nsr1" nft list ruleset
+	ret=1
+fi
+
 # Restore the previous configuration
 ip -net "$nsr1" route change default via 192.168.10.2
 ip -net "$nsr2" route change default via 192.168.10.1
 ip -net "$ns2" route del default via 10.0.2.1
+ip -6 -net "$ns2" route del default via dead:2::1
 }
 
 # Another test:
-- 
cgit v1.2.3


From 462a94fb8ae8ba0d4d3901c7283b4af052ab8804 Mon Sep 17 00:00:00 2001
From: Paul Walmsley <pjw@kernel.org>
Date: Sun, 25 Jan 2026 21:09:55 -0700
Subject: riscv: hwprobe: add support for RISCV_HWPROBE_KEY_IMA_EXT_1

We've run out of bits to describe RISC-V ISA extensions in our initial
hwprobe key, RISCV_HWPROBE_KEY_IMA_EXT_0.  So, let's add
RISCV_HWPROBE_KEY_IMA_EXT_1, along with the framework to set the
appropriate hwprobe tuple, and add testing for it.

Based on a suggestion from Andrew Jones <andrew.jones@oss.qualcomm.com>,
also fix the documentation for RISCV_HWPROBE_KEY_IMA_EXT_0.

Reviewed-by: Andrew Jones <andrew.jones@oss.qualcomm.com>
Signed-off-by: Paul Walmsley <pjw@kernel.org>
---
 Documentation/arch/riscv/hwprobe.rst               |   6 +-
 arch/riscv/include/asm/hwprobe.h                   |   3 +-
 arch/riscv/include/uapi/asm/hwprobe.h              |   1 +
 arch/riscv/kernel/sys_hwprobe.c                    | 169 ++++++++++++---------
 tools/testing/selftests/riscv/hwprobe/which-cpus.c |  18 ++-
 5 files changed, 121 insertions(+), 76 deletions(-)

(limited to 'tools')

diff --git a/Documentation/arch/riscv/hwprobe.rst b/Documentation/arch/riscv/hwprobe.rst
index 641ec4abb906..c420a8349bc6 100644
--- a/Documentation/arch/riscv/hwprobe.rst
+++ b/Documentation/arch/riscv/hwprobe.rst
@@ -67,7 +67,7 @@ The following keys are defined:
       programs (it may still be executed in userspace via a
       kernel-controlled mechanism such as the vDSO).
 
-* :c:macro:`RISCV_HWPROBE_KEY_IMA_EXT_0`: A bitmask containing the extensions
+* :c:macro:`RISCV_HWPROBE_KEY_IMA_EXT_0`: A bitmask containing extensions
   that are compatible with the :c:macro:`RISCV_HWPROBE_BASE_BEHAVIOR_IMA`:
   base system behavior.
 
@@ -387,3 +387,7 @@ The following keys are defined:
 
 * :c:macro:`RISCV_HWPROBE_KEY_ZICBOP_BLOCK_SIZE`: An unsigned int which
   represents the size of the Zicbop block in bytes.
+
+* :c:macro:`RISCV_HWPROBE_KEY_IMA_EXT_1`: A bitmask containing additional
+  extensions that are compatible with the
+  :c:macro:`RISCV_HWPROBE_BASE_BEHAVIOR_IMA`: base system behavior.
diff --git a/arch/riscv/include/asm/hwprobe.h b/arch/riscv/include/asm/hwprobe.h
index 8c572a464719..8b9f5e1cf4cb 100644
--- a/arch/riscv/include/asm/hwprobe.h
+++ b/arch/riscv/include/asm/hwprobe.h
@@ -8,7 +8,7 @@
 
 #include <uapi/asm/hwprobe.h>
 
-#define RISCV_HWPROBE_MAX_KEY 15
+#define RISCV_HWPROBE_MAX_KEY		16
 
 static inline bool riscv_hwprobe_key_is_valid(__s64 key)
 {
@@ -20,6 +20,7 @@ static inline bool hwprobe_key_is_bitmask(__s64 key)
 	switch (key) {
 	case RISCV_HWPROBE_KEY_BASE_BEHAVIOR:
 	case RISCV_HWPROBE_KEY_IMA_EXT_0:
+	case RISCV_HWPROBE_KEY_IMA_EXT_1:
 	case RISCV_HWPROBE_KEY_CPUPERF_0:
 	case RISCV_HWPROBE_KEY_VENDOR_EXT_THEAD_0:
 	case RISCV_HWPROBE_KEY_VENDOR_EXT_MIPS_0:
diff --git a/arch/riscv/include/uapi/asm/hwprobe.h b/arch/riscv/include/uapi/asm/hwprobe.h
index cd3c126730c3..ed2621a5a47d 100644
--- a/arch/riscv/include/uapi/asm/hwprobe.h
+++ b/arch/riscv/include/uapi/asm/hwprobe.h
@@ -113,6 +113,7 @@ struct riscv_hwprobe {
 #define RISCV_HWPROBE_KEY_VENDOR_EXT_SIFIVE_0	13
 #define RISCV_HWPROBE_KEY_VENDOR_EXT_MIPS_0	14
 #define RISCV_HWPROBE_KEY_ZICBOP_BLOCK_SIZE	15
+#define RISCV_HWPROBE_KEY_IMA_EXT_1		16
 /* Increase RISCV_HWPROBE_MAX_KEY when adding items. */
 
 /* Flags */
diff --git a/arch/riscv/kernel/sys_hwprobe.c b/arch/riscv/kernel/sys_hwprobe.c
index e6787ba7f2fc..53731ace7984 100644
--- a/arch/riscv/kernel/sys_hwprobe.c
+++ b/arch/riscv/kernel/sys_hwprobe.c
@@ -24,6 +24,14 @@
 #include <vdso/vsyscall.h>
 
 
+#define EXT_KEY(isa_arg, ext, pv, missing)					\
+	do {										\
+		if (__riscv_isa_extension_available(isa_arg, RISCV_ISA_EXT_##ext))	\
+			pv |= RISCV_HWPROBE_EXT_##ext;				\
+		else									\
+			missing |= RISCV_HWPROBE_EXT_##ext;				\
+	} while (false)
+
 static void hwprobe_arch_id(struct riscv_hwprobe *pair,
 			    const struct cpumask *cpus)
 {
@@ -93,90 +101,109 @@ static void hwprobe_isa_ext0(struct riscv_hwprobe *pair,
 	for_each_cpu(cpu, cpus) {
 		struct riscv_isainfo *isainfo = &hart_isa[cpu];
 
-#define EXT_KEY(ext)									\
-	do {										\
-		if (__riscv_isa_extension_available(isainfo->isa, RISCV_ISA_EXT_##ext))	\
-			pair->value |= RISCV_HWPROBE_EXT_##ext;				\
-		else									\
-			missing |= RISCV_HWPROBE_EXT_##ext;				\
-	} while (false)
-
 		/*
 		 * Only use EXT_KEY() for extensions which can be exposed to userspace,
 		 * regardless of the kernel's configuration, as no other checks, besides
 		 * presence in the hart_isa bitmap, are made.
 		 */
-		EXT_KEY(ZAAMO);
-		EXT_KEY(ZABHA);
-		EXT_KEY(ZACAS);
-		EXT_KEY(ZALASR);
-		EXT_KEY(ZALRSC);
-		EXT_KEY(ZAWRS);
-		EXT_KEY(ZBA);
-		EXT_KEY(ZBB);
-		EXT_KEY(ZBC);
-		EXT_KEY(ZBKB);
-		EXT_KEY(ZBKC);
-		EXT_KEY(ZBKX);
-		EXT_KEY(ZBS);
-		EXT_KEY(ZCA);
-		EXT_KEY(ZCB);
-		EXT_KEY(ZCLSD);
-		EXT_KEY(ZCMOP);
-		EXT_KEY(ZICBOM);
-		EXT_KEY(ZICBOP);
-		EXT_KEY(ZICBOZ);
-		EXT_KEY(ZICNTR);
-		EXT_KEY(ZICOND);
-		EXT_KEY(ZIHINTNTL);
-		EXT_KEY(ZIHINTPAUSE);
-		EXT_KEY(ZIHPM);
-		EXT_KEY(ZILSD);
-		EXT_KEY(ZIMOP);
-		EXT_KEY(ZKND);
-		EXT_KEY(ZKNE);
-		EXT_KEY(ZKNH);
-		EXT_KEY(ZKSED);
-		EXT_KEY(ZKSH);
-		EXT_KEY(ZKT);
-		EXT_KEY(ZTSO);
+		EXT_KEY(isainfo->isa, ZAAMO, pair->value, missing);
+		EXT_KEY(isainfo->isa, ZABHA, pair->value, missing);
+		EXT_KEY(isainfo->isa, ZACAS, pair->value, missing);
+		EXT_KEY(isainfo->isa, ZALASR, pair->value, missing);
+		EXT_KEY(isainfo->isa, ZALRSC, pair->value, missing);
+		EXT_KEY(isainfo->isa, ZAWRS, pair->value, missing);
+		EXT_KEY(isainfo->isa, ZBA, pair->value, missing);
+		EXT_KEY(isainfo->isa, ZBB, pair->value, missing);
+		EXT_KEY(isainfo->isa, ZBC, pair->value, missing);
+		EXT_KEY(isainfo->isa, ZBKB, pair->value, missing);
+		EXT_KEY(isainfo->isa, ZBKC, pair->value, missing);
+		EXT_KEY(isainfo->isa, ZBKX, pair->value, missing);
+		EXT_KEY(isainfo->isa, ZBS, pair->value, missing);
+		EXT_KEY(isainfo->isa, ZCA, pair->value, missing);
+		EXT_KEY(isainfo->isa, ZCB, pair->value, missing);
+		EXT_KEY(isainfo->isa, ZCLSD, pair->value, missing);
+		EXT_KEY(isainfo->isa, ZCMOP, pair->value, missing);
+		EXT_KEY(isainfo->isa, ZICBOM, pair->value, missing);
+		EXT_KEY(isainfo->isa, ZICBOP, pair->value, missing);
+		EXT_KEY(isainfo->isa, ZICBOZ, pair->value, missing);
+		EXT_KEY(isainfo->isa, ZICNTR, pair->value, missing);
+		EXT_KEY(isainfo->isa, ZICOND, pair->value, missing);
+		EXT_KEY(isainfo->isa, ZIHINTNTL, pair->value, missing);
+		EXT_KEY(isainfo->isa, ZIHINTPAUSE, pair->value, missing);
+		EXT_KEY(isainfo->isa, ZIHPM, pair->value, missing);
+		EXT_KEY(isainfo->isa, ZILSD, pair->value, missing);
+		EXT_KEY(isainfo->isa, ZIMOP, pair->value, missing);
+		EXT_KEY(isainfo->isa, ZKND, pair->value, missing);
+		EXT_KEY(isainfo->isa, ZKNE, pair->value, missing);
+		EXT_KEY(isainfo->isa, ZKNH, pair->value, missing);
+		EXT_KEY(isainfo->isa, ZKSED, pair->value, missing);
+		EXT_KEY(isainfo->isa, ZKSH, pair->value, missing);
+		EXT_KEY(isainfo->isa, ZKT, pair->value, missing);
+		EXT_KEY(isainfo->isa, ZTSO, pair->value, missing);
 
 		/*
 		 * All the following extensions must depend on the kernel
 		 * support of V.
 		 */
 		if (has_vector()) {
-			EXT_KEY(ZVBB);
-			EXT_KEY(ZVBC);
-			EXT_KEY(ZVE32F);
-			EXT_KEY(ZVE32X);
-			EXT_KEY(ZVE64D);
-			EXT_KEY(ZVE64F);
-			EXT_KEY(ZVE64X);
-			EXT_KEY(ZVFBFMIN);
-			EXT_KEY(ZVFBFWMA);
-			EXT_KEY(ZVFH);
-			EXT_KEY(ZVFHMIN);
-			EXT_KEY(ZVKB);
-			EXT_KEY(ZVKG);
-			EXT_KEY(ZVKNED);
-			EXT_KEY(ZVKNHA);
-			EXT_KEY(ZVKNHB);
-			EXT_KEY(ZVKSED);
-			EXT_KEY(ZVKSH);
-			EXT_KEY(ZVKT);
+			EXT_KEY(isainfo->isa, ZVBB, pair->value, missing);
+			EXT_KEY(isainfo->isa, ZVBC, pair->value, missing);
+			EXT_KEY(isainfo->isa, ZVE32F, pair->value, missing);
+			EXT_KEY(isainfo->isa, ZVE32X, pair->value, missing);
+			EXT_KEY(isainfo->isa, ZVE64D, pair->value, missing);
+			EXT_KEY(isainfo->isa, ZVE64F, pair->value, missing);
+			EXT_KEY(isainfo->isa, ZVE64X, pair->value, missing);
+			EXT_KEY(isainfo->isa, ZVFBFMIN, pair->value, missing);
+			EXT_KEY(isainfo->isa, ZVFBFWMA, pair->value, missing);
+			EXT_KEY(isainfo->isa, ZVFH, pair->value, missing);
+			EXT_KEY(isainfo->isa, ZVFHMIN, pair->value, missing);
+			EXT_KEY(isainfo->isa, ZVKB, pair->value, missing);
+			EXT_KEY(isainfo->isa, ZVKG, pair->value, missing);
+			EXT_KEY(isainfo->isa, ZVKNED, pair->value, missing);
+			EXT_KEY(isainfo->isa, ZVKNHA, pair->value, missing);
+			EXT_KEY(isainfo->isa, ZVKNHB, pair->value, missing);
+			EXT_KEY(isainfo->isa, ZVKSED, pair->value, missing);
+			EXT_KEY(isainfo->isa, ZVKSH, pair->value, missing);
+			EXT_KEY(isainfo->isa, ZVKT, pair->value, missing);
 		}
 
-		EXT_KEY(ZCD);
-		EXT_KEY(ZCF);
-		EXT_KEY(ZFA);
-		EXT_KEY(ZFBFMIN);
-		EXT_KEY(ZFH);
-		EXT_KEY(ZFHMIN);
+		EXT_KEY(isainfo->isa, ZCD, pair->value, missing);
+		EXT_KEY(isainfo->isa, ZCF, pair->value, missing);
+		EXT_KEY(isainfo->isa, ZFA, pair->value, missing);
+		EXT_KEY(isainfo->isa, ZFBFMIN, pair->value, missing);
+		EXT_KEY(isainfo->isa, ZFH, pair->value, missing);
+		EXT_KEY(isainfo->isa, ZFHMIN, pair->value, missing);
 
 		if (IS_ENABLED(CONFIG_RISCV_ISA_SUPM))
-			EXT_KEY(SUPM);
-#undef EXT_KEY
+			EXT_KEY(isainfo->isa, SUPM, pair->value, missing);
+	}
+
+	/* Now turn off reporting features if any CPU is missing it. */
+	pair->value &= ~missing;
+}
+
+static void hwprobe_isa_ext1(struct riscv_hwprobe *pair,
+			     const struct cpumask *cpus)
+{
+	int cpu;
+	u64 missing = 0;
+
+	pair->value = 0;
+
+	/*
+	 * Loop through and record extensions that 1) anyone has, and 2) anyone
+	 * doesn't have.
+	 */
+	for_each_cpu(cpu, cpus) {
+		/* struct riscv_isainfo *isainfo = &hart_isa[cpu]; */
+
+		/*
+		 * Only use EXT_KEY() for extensions which can be
+		 * exposed to userspace, regardless of the kernel's
+		 * configuration, as no other checks, besides presence
+		 * in the hart_isa bitmap, are made.
+		 */
+		/* Nothing here yet */
 	}
 
 	/* Now turn off reporting features if any CPU is missing it. */
@@ -287,6 +314,10 @@ static void hwprobe_one_pair(struct riscv_hwprobe *pair,
 		hwprobe_isa_ext0(pair, cpus);
 		break;
 
+	case RISCV_HWPROBE_KEY_IMA_EXT_1:
+		hwprobe_isa_ext1(pair, cpus);
+		break;
+
 	case RISCV_HWPROBE_KEY_CPUPERF_0:
 	case RISCV_HWPROBE_KEY_MISALIGNED_SCALAR_PERF:
 		pair->value = hwprobe_misaligned(cpus);
diff --git a/tools/testing/selftests/riscv/hwprobe/which-cpus.c b/tools/testing/selftests/riscv/hwprobe/which-cpus.c
index 3ab53067e8dd..587feb198c04 100644
--- a/tools/testing/selftests/riscv/hwprobe/which-cpus.c
+++ b/tools/testing/selftests/riscv/hwprobe/which-cpus.c
@@ -83,9 +83,9 @@ static void do_which_cpus(int argc, char **argv, cpu_set_t *cpus)
 
 int main(int argc, char **argv)
 {
-	struct riscv_hwprobe pairs[2];
+	struct riscv_hwprobe pairs[3];
 	cpu_set_t cpus_aff, cpus;
-	__u64 ext0_all;
+	__u64 ext0_all, ext1_all;
 	long rc;
 
 	rc = sched_getaffinity(0, sizeof(cpu_set_t), &cpus_aff);
@@ -112,6 +112,11 @@ int main(int argc, char **argv)
 	assert(rc == 0 && pairs[0].key == RISCV_HWPROBE_KEY_IMA_EXT_0);
 	ext0_all = pairs[0].value;
 
+	pairs[0] = (struct riscv_hwprobe){ .key = RISCV_HWPROBE_KEY_IMA_EXT_1, };
+	rc = riscv_hwprobe(pairs, 1, 0, NULL, 0);
+	assert(rc == 0 && pairs[0].key == RISCV_HWPROBE_KEY_IMA_EXT_1);
+	ext1_all = pairs[0].value;
+
 	pairs[0] = (struct riscv_hwprobe){ .key = RISCV_HWPROBE_KEY_BASE_BEHAVIOR, .value = RISCV_HWPROBE_BASE_BEHAVIOR_IMA, };
 	CPU_ZERO(&cpus);
 	rc = riscv_hwprobe(pairs, 1, 0, (unsigned long *)&cpus, RISCV_HWPROBE_WHICH_CPUS);
@@ -134,20 +139,23 @@ int main(int argc, char **argv)
 
 	pairs[0] = (struct riscv_hwprobe){ .key = RISCV_HWPROBE_KEY_BASE_BEHAVIOR, .value = RISCV_HWPROBE_BASE_BEHAVIOR_IMA, };
 	pairs[1] = (struct riscv_hwprobe){ .key = RISCV_HWPROBE_KEY_IMA_EXT_0, .value = ext0_all, };
+	pairs[2] = (struct riscv_hwprobe){ .key = RISCV_HWPROBE_KEY_IMA_EXT_1, .value = ext1_all, };
 	CPU_ZERO(&cpus);
-	rc = riscv_hwprobe(pairs, 2, sizeof(cpu_set_t), (unsigned long *)&cpus, RISCV_HWPROBE_WHICH_CPUS);
+	rc = riscv_hwprobe(pairs, 3, sizeof(cpu_set_t), (unsigned long *)&cpus, RISCV_HWPROBE_WHICH_CPUS);
 	ksft_test_result(rc == 0 && CPU_COUNT(&cpus) == sysconf(_SC_NPROCESSORS_ONLN), "set all cpus\n");
 
 	pairs[0] = (struct riscv_hwprobe){ .key = RISCV_HWPROBE_KEY_BASE_BEHAVIOR, .value = RISCV_HWPROBE_BASE_BEHAVIOR_IMA, };
 	pairs[1] = (struct riscv_hwprobe){ .key = RISCV_HWPROBE_KEY_IMA_EXT_0, .value = ext0_all, };
+	pairs[2] = (struct riscv_hwprobe){ .key = RISCV_HWPROBE_KEY_IMA_EXT_1, .value = ext1_all, };
 	memcpy(&cpus, &cpus_aff, sizeof(cpu_set_t));
-	rc = riscv_hwprobe(pairs, 2, sizeof(cpu_set_t), (unsigned long *)&cpus, RISCV_HWPROBE_WHICH_CPUS);
+	rc = riscv_hwprobe(pairs, 3, sizeof(cpu_set_t), (unsigned long *)&cpus, RISCV_HWPROBE_WHICH_CPUS);
 	ksft_test_result(rc == 0 && CPU_EQUAL(&cpus, &cpus_aff), "set all affinity cpus\n");
 
 	pairs[0] = (struct riscv_hwprobe){ .key = RISCV_HWPROBE_KEY_BASE_BEHAVIOR, .value = RISCV_HWPROBE_BASE_BEHAVIOR_IMA, };
 	pairs[1] = (struct riscv_hwprobe){ .key = RISCV_HWPROBE_KEY_IMA_EXT_0, .value = ~ext0_all, };
+	pairs[2] = (struct riscv_hwprobe){ .key = RISCV_HWPROBE_KEY_IMA_EXT_1, .value = ~ext1_all, };
 	memcpy(&cpus, &cpus_aff, sizeof(cpu_set_t));
-	rc = riscv_hwprobe(pairs, 2, sizeof(cpu_set_t), (unsigned long *)&cpus, RISCV_HWPROBE_WHICH_CPUS);
+	rc = riscv_hwprobe(pairs, 3, sizeof(cpu_set_t), (unsigned long *)&cpus, RISCV_HWPROBE_WHICH_CPUS);
 	ksft_test_result(rc == 0 && CPU_COUNT(&cpus) == 0, "clear all cpus\n");
 
 	ksft_finished();
-- 
cgit v1.2.3


From d30c1683aaecb93d2ab95685dc4300a33d3cea7a Mon Sep 17 00:00:00 2001
From: Deepak Gupta <debug@rivosinc.com>
Date: Sun, 25 Jan 2026 21:09:56 -0700
Subject: kselftest/riscv: add kselftest for user mode CFI

Add a kselftest for RISC-V control flow integrity implementation for
user mode. There is not a lot going on in the kernel to enable landing
pad for user mode. CFI selftests are intended to be compiled with a
zicfilp and zicfiss enabled compiler. This kselftest simply checks if
landing pads and shadow stacks for the process are enabled or not and
executes ptrace selftests on CFI. The selftest then registers a
SIGSEGV signal handler.  Any control flow violations are reported as
SIGSEGV with si_code = SEGV_CPERR.  The test will fail on receiving
any SEGV_CPERR. The shadow stack part has more changes in the kernel,
and thus there are separate tests for that.

- Exercise 'map_shadow_stack' syscall
- 'fork' test to make sure COW works for shadow stack pages
- gup tests
  Kernel uses FOLL_FORCE when access happens to memory via
  /proc/<pid>/mem. Not breaking that for shadow stack.
- signal test. Make sure signal delivery results in token creation on
  shadow stack and consumes (and verifies) token on sigreturn
- shadow stack protection test. attempts to write using regular store
  instruction on shadow stack memory must result in access faults
- ptrace test: adds landing pad violation, clears ELP and continues

In case the toolchain doesn't support the CFI extension, the CFI
kselftest won't be built.

Test output
===========

"""
TAP version 13
1..5
  This is to ensure shadow stack is indeed enabled and working
  This is to ensure shadow stack is indeed enabled and working
ok 1 shstk fork test
ok 2 map shadow stack syscall
ok 3 shadow stack gup tests
ok 4 shadow stack signal tests
ok 5 memory protections of shadow stack memory
"""

Suggested-by: Charlie Jenkins <charlie@rivosinc.com>
Signed-off-by: Charlie Jenkins <charlie@rivosinc.com>
Signed-off-by: Deepak Gupta <debug@rivosinc.com>
Tested-by: Andreas Korb <andreas.korb@aisec.fraunhofer.de> # QEMU, custom CVA6
Tested-by: Valentin Haudiquet <valentin.haudiquet@canonical.com>
Link: https://patch.msgid.link/20251112-v5_user_cfi_series-v23-28-b55691eacf4f@rivosinc.com
[pjw@kernel.org: updated to apply; cleaned up patch description, code comments]
Signed-off-by: Paul Walmsley <pjw@kernel.org>
---
 tools/testing/selftests/riscv/Makefile          |   2 +-
 tools/testing/selftests/riscv/cfi/.gitignore    |   2 +
 tools/testing/selftests/riscv/cfi/Makefile      |  23 ++
 tools/testing/selftests/riscv/cfi/cfi_rv_test.h |  82 +++++
 tools/testing/selftests/riscv/cfi/cfitests.c    | 173 +++++++++++
 tools/testing/selftests/riscv/cfi/shadowstack.c | 385 ++++++++++++++++++++++++
 tools/testing/selftests/riscv/cfi/shadowstack.h |  27 ++
 7 files changed, 693 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/riscv/cfi/.gitignore
 create mode 100644 tools/testing/selftests/riscv/cfi/Makefile
 create mode 100644 tools/testing/selftests/riscv/cfi/cfi_rv_test.h
 create mode 100644 tools/testing/selftests/riscv/cfi/cfitests.c
 create mode 100644 tools/testing/selftests/riscv/cfi/shadowstack.c
 create mode 100644 tools/testing/selftests/riscv/cfi/shadowstack.h

(limited to 'tools')

diff --git a/tools/testing/selftests/riscv/Makefile b/tools/testing/selftests/riscv/Makefile
index 099b8c1f46f8..5671b4405a12 100644
--- a/tools/testing/selftests/riscv/Makefile
+++ b/tools/testing/selftests/riscv/Makefile
@@ -5,7 +5,7 @@
 ARCH ?= $(shell uname -m 2>/dev/null || echo not)
 
 ifneq (,$(filter $(ARCH),riscv))
-RISCV_SUBTARGETS ?= abi hwprobe mm sigreturn vector
+RISCV_SUBTARGETS ?= abi hwprobe mm sigreturn vector cfi
 else
 RISCV_SUBTARGETS :=
 endif
diff --git a/tools/testing/selftests/riscv/cfi/.gitignore b/tools/testing/selftests/riscv/cfi/.gitignore
new file mode 100644
index 000000000000..c1faf7ca4346
--- /dev/null
+++ b/tools/testing/selftests/riscv/cfi/.gitignore
@@ -0,0 +1,2 @@
+cfitests
+shadowstack
diff --git a/tools/testing/selftests/riscv/cfi/Makefile b/tools/testing/selftests/riscv/cfi/Makefile
new file mode 100644
index 000000000000..96a4dc4b69c3
--- /dev/null
+++ b/tools/testing/selftests/riscv/cfi/Makefile
@@ -0,0 +1,23 @@
+CFLAGS += $(KHDR_INCLUDES)
+CFLAGS += -I$(top_srcdir)/tools/include
+
+CFLAGS += -march=rv64gc_zicfilp_zicfiss -fcf-protection=full
+
+# Check for zicfi* extensions needs cross compiler
+# which is not set until lib.mk is included
+ifeq ($(LLVM)$(CC),cc)
+CC := $(CROSS_COMPILE)gcc
+endif
+
+
+ifeq ($(shell $(CC) $(CFLAGS) -nostdlib -xc /dev/null -o /dev/null > /dev/null 2>&1; echo $$?),0)
+TEST_GEN_PROGS := cfitests
+
+$(OUTPUT)/cfitests: cfitests.c shadowstack.c
+	$(CC) -o$@ $(CFLAGS) $(LDFLAGS) $^
+else
+
+$(shell echo "Toolchain doesn't support CFI, skipping CFI kselftest." >&2)
+endif
+
+include ../../lib.mk
diff --git a/tools/testing/selftests/riscv/cfi/cfi_rv_test.h b/tools/testing/selftests/riscv/cfi/cfi_rv_test.h
new file mode 100644
index 000000000000..1c8043f2b778
--- /dev/null
+++ b/tools/testing/selftests/riscv/cfi/cfi_rv_test.h
@@ -0,0 +1,82 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#ifndef SELFTEST_RISCV_CFI_H
+#define SELFTEST_RISCV_CFI_H
+#include <stddef.h>
+#include <sys/types.h>
+#include "shadowstack.h"
+
+#define CHILD_EXIT_CODE_SSWRITE		10
+#define CHILD_EXIT_CODE_SIG_TEST	11
+
+#define my_syscall5(num, arg1, arg2, arg3, arg4, arg5)			\
+({									\
+	register long _num  __asm__ ("a7") = (num);			\
+	register long _arg1 __asm__ ("a0") = (long)(arg1);		\
+	register long _arg2 __asm__ ("a1") = (long)(arg2);		\
+	register long _arg3 __asm__ ("a2") = (long)(arg3);		\
+	register long _arg4 __asm__ ("a3") = (long)(arg4);		\
+	register long _arg5 __asm__ ("a4") = (long)(arg5);		\
+									\
+	__asm__ volatile(						\
+		"ecall\n"						\
+		: "+r"							\
+		(_arg1)							\
+		: "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5),	\
+		  "r"(_num)						\
+		: "memory", "cc"					\
+	);								\
+	_arg1;								\
+})
+
+#define my_syscall3(num, arg1, arg2, arg3)				\
+({									\
+	register long _num  __asm__ ("a7") = (num);			\
+	register long _arg1 __asm__ ("a0") = (long)(arg1);		\
+	register long _arg2 __asm__ ("a1") = (long)(arg2);		\
+	register long _arg3 __asm__ ("a2") = (long)(arg3);		\
+									\
+	__asm__ volatile(						\
+		"ecall\n"						\
+		: "+r" (_arg1)						\
+		: "r"(_arg2), "r"(_arg3),				\
+		  "r"(_num)						\
+		: "memory", "cc"					\
+	);								\
+	_arg1;								\
+})
+
+#ifndef __NR_prctl
+#define __NR_prctl 167
+#endif
+
+#ifndef __NR_map_shadow_stack
+#define __NR_map_shadow_stack 453
+#endif
+
+#define CSR_SSP 0x011
+
+#ifdef __ASSEMBLY__
+#define __ASM_STR(x)    x
+#else
+#define __ASM_STR(x)    #x
+#endif
+
+#define csr_read(csr)							\
+({									\
+	register unsigned long __v;					\
+	__asm__ __volatile__ ("csrr %0, " __ASM_STR(csr)		\
+				: "=r" (__v) :				\
+				: "memory");				\
+	__v;								\
+})
+
+#define csr_write(csr, val)						\
+({									\
+	unsigned long __v = (unsigned long)(val);			\
+	__asm__ __volatile__ ("csrw " __ASM_STR(csr) ", %0"		\
+				: : "rK" (__v)				\
+				: "memory");				\
+})
+
+#endif
diff --git a/tools/testing/selftests/riscv/cfi/cfitests.c b/tools/testing/selftests/riscv/cfi/cfitests.c
new file mode 100644
index 000000000000..298544854415
--- /dev/null
+++ b/tools/testing/selftests/riscv/cfi/cfitests.c
@@ -0,0 +1,173 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include "../../kselftest.h"
+#include <sys/signal.h>
+#include <asm/ucontext.h>
+#include <linux/prctl.h>
+#include <errno.h>
+#include <linux/ptrace.h>
+#include <sys/wait.h>
+#include <linux/elf.h>
+#include <sys/uio.h>
+#include <asm-generic/unistd.h>
+
+#include "cfi_rv_test.h"
+
+/* do not optimize cfi related test functions */
+#pragma GCC push_options
+#pragma GCC optimize("O0")
+
+void sigsegv_handler(int signum, siginfo_t *si, void *uc)
+{
+	struct ucontext *ctx = (struct ucontext *)uc;
+
+	if (si->si_code == SEGV_CPERR) {
+		ksft_print_msg("Control flow violation happened somewhere\n");
+		ksft_print_msg("PC where violation happened %lx\n", ctx->uc_mcontext.gregs[0]);
+		exit(-1);
+	}
+
+	/* all other cases are expected to be of shadow stack write case */
+	exit(CHILD_EXIT_CODE_SSWRITE);
+}
+
+bool register_signal_handler(void)
+{
+	struct sigaction sa = {};
+
+	sa.sa_sigaction = sigsegv_handler;
+	sa.sa_flags = SA_SIGINFO;
+	if (sigaction(SIGSEGV, &sa, NULL)) {
+		ksft_print_msg("Registering signal handler for landing pad violation failed\n");
+		return false;
+	}
+
+	return true;
+}
+
+long ptrace(int request, pid_t pid, void *addr, void *data);
+
+bool cfi_ptrace_test(void)
+{
+	pid_t pid;
+	int status, ret = 0;
+	unsigned long ptrace_test_num = 0, total_ptrace_tests = 2;
+
+	struct user_cfi_state cfi_reg;
+	struct iovec iov;
+
+	pid = fork();
+
+	if (pid == -1) {
+		ksft_exit_fail_msg("%s: fork failed\n", __func__);
+		exit(1);
+	}
+
+	if (pid == 0) {
+		/* allow to be traced */
+		ptrace(PTRACE_TRACEME, 0, NULL, NULL);
+		raise(SIGSTOP);
+		asm volatile ("la a5, 1f\n"
+			      "jalr a5\n"
+			      "nop\n"
+			      "nop\n"
+			      "1: nop\n"
+			      : : : "a5");
+		exit(11);
+		/* child shouldn't go beyond here */
+	}
+
+	/* parent's code goes here */
+	iov.iov_base = &cfi_reg;
+	iov.iov_len = sizeof(cfi_reg);
+
+	while (ptrace_test_num < total_ptrace_tests) {
+		memset(&cfi_reg, 0, sizeof(cfi_reg));
+		waitpid(pid, &status, 0);
+		if (WIFSTOPPED(status)) {
+			errno = 0;
+			ret = ptrace(PTRACE_GETREGSET, pid, (void *)NT_RISCV_USER_CFI, &iov);
+			if (ret == -1 && errno)
+				ksft_exit_fail_msg("%s: PTRACE_GETREGSET failed\n", __func__);
+		} else {
+			ksft_exit_fail_msg("%s: child didn't stop, failed\n", __func__);
+		}
+
+		switch (ptrace_test_num) {
+#define CFI_ENABLE_MASK (PTRACE_CFI_LP_EN_STATE |	\
+			 PTRACE_CFI_SS_EN_STATE |	\
+			 PTRACE_CFI_SS_PTR_STATE)
+		case 0:
+			if ((cfi_reg.cfi_status.cfi_state & CFI_ENABLE_MASK) != CFI_ENABLE_MASK)
+				ksft_exit_fail_msg("%s: ptrace_getregset failed, %llu\n", __func__,
+						   cfi_reg.cfi_status.cfi_state);
+			if (!cfi_reg.shstk_ptr)
+				ksft_exit_fail_msg("%s: NULL shadow stack pointer, test failed\n",
+						   __func__);
+			break;
+		case 1:
+			if (!(cfi_reg.cfi_status.cfi_state & PTRACE_CFI_ELP_STATE))
+				ksft_exit_fail_msg("%s: elp must have been set\n", __func__);
+			/* clear elp state. not interested in anything else */
+			cfi_reg.cfi_status.cfi_state = 0;
+
+			ret = ptrace(PTRACE_SETREGSET, pid, (void *)NT_RISCV_USER_CFI, &iov);
+			if (ret == -1 && errno)
+				ksft_exit_fail_msg("%s: PTRACE_GETREGSET failed\n", __func__);
+			break;
+		default:
+			ksft_exit_fail_msg("%s: unreachable switch case\n", __func__);
+			break;
+		}
+		ptrace(PTRACE_CONT, pid, NULL, NULL);
+		ptrace_test_num++;
+	}
+
+	waitpid(pid, &status, 0);
+	if (WEXITSTATUS(status) != 11)
+		ksft_print_msg("%s, bad return code from child\n", __func__);
+
+	ksft_print_msg("%s, ptrace test succeeded\n", __func__);
+	return true;
+}
+
+int main(int argc, char *argv[])
+{
+	int ret = 0;
+	unsigned long lpad_status = 0, ss_status = 0;
+
+	ksft_print_header();
+
+	ksft_print_msg("Starting risc-v tests\n");
+
+	/*
+	 * Landing pad test. Not a lot of kernel changes to support landing
+	 * pads for user mode except lighting up a bit in senvcfg via a prctl.
+	 * Enable landing pad support throughout the execution of the test binary.
+	 */
+	ret = my_syscall5(__NR_prctl, PR_GET_INDIR_BR_LP_STATUS, &lpad_status, 0, 0, 0);
+	if (ret)
+		ksft_exit_fail_msg("Get landing pad status failed with %d\n", ret);
+
+	if (!(lpad_status & PR_INDIR_BR_LP_ENABLE))
+		ksft_exit_fail_msg("Landing pad is not enabled, should be enabled via glibc\n");
+
+	ret = my_syscall5(__NR_prctl, PR_GET_SHADOW_STACK_STATUS, &ss_status, 0, 0, 0);
+	if (ret)
+		ksft_exit_fail_msg("Get shadow stack failed with %d\n", ret);
+
+	if (!(ss_status & PR_SHADOW_STACK_ENABLE))
+		ksft_exit_fail_msg("Shadow stack is not enabled, should be enabled via glibc\n");
+
+	if (!register_signal_handler())
+		ksft_exit_fail_msg("Registering signal handler for SIGSEGV failed\n");
+
+	ksft_print_msg("Landing pad and shadow stack are enabled for binary\n");
+	cfi_ptrace_test();
+
+	execute_shadow_stack_tests();
+
+	return 0;
+}
+
+#pragma GCC pop_options
diff --git a/tools/testing/selftests/riscv/cfi/shadowstack.c b/tools/testing/selftests/riscv/cfi/shadowstack.c
new file mode 100644
index 000000000000..f8eed8260a12
--- /dev/null
+++ b/tools/testing/selftests/riscv/cfi/shadowstack.c
@@ -0,0 +1,385 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include "../../kselftest.h"
+#include <sys/wait.h>
+#include <signal.h>
+#include <fcntl.h>
+#include <asm-generic/unistd.h>
+#include <sys/mman.h>
+#include "shadowstack.h"
+#include "cfi_rv_test.h"
+
+static struct shadow_stack_tests shstk_tests[] = {
+	{ "shstk fork test\n", shadow_stack_fork_test },
+	{ "map shadow stack syscall\n", shadow_stack_map_test },
+	{ "shadow stack gup tests\n", shadow_stack_gup_tests },
+	{ "shadow stack signal tests\n", shadow_stack_signal_test},
+	{ "memory protections of shadow stack memory\n", shadow_stack_protection_test }
+};
+
+#define RISCV_SHADOW_STACK_TESTS ARRAY_SIZE(shstk_tests)
+
+/* do not optimize shadow stack related test functions */
+#pragma GCC push_options
+#pragma GCC optimize("O0")
+
+void zar(void)
+{
+	unsigned long ssp = 0;
+
+	ssp = csr_read(CSR_SSP);
+	ksft_print_msg("Spewing out shadow stack ptr: %lx\n"
+			"  This is to ensure shadow stack is indeed enabled and working\n",
+			ssp);
+}
+
+void bar(void)
+{
+	zar();
+}
+
+void foo(void)
+{
+	bar();
+}
+
+void zar_child(void)
+{
+	unsigned long ssp = 0;
+
+	ssp = csr_read(CSR_SSP);
+	ksft_print_msg("Spewing out shadow stack ptr: %lx\n"
+			"  This is to ensure shadow stack is indeed enabled and working\n",
+			ssp);
+}
+
+void bar_child(void)
+{
+	zar_child();
+}
+
+void foo_child(void)
+{
+	bar_child();
+}
+
+typedef void (call_func_ptr)(void);
+/*
+ * call couple of functions to test push/pop.
+ */
+int shadow_stack_call_tests(call_func_ptr fn_ptr, bool parent)
+{
+	ksft_print_msg("dummy calls for sspush and sspopchk in context of %s\n",
+		       parent ? "parent" : "child");
+
+	(fn_ptr)();
+
+	return 0;
+}
+
+/* forks a thread, and ensure shadow stacks fork out */
+bool shadow_stack_fork_test(unsigned long test_num, void *ctx)
+{
+	int pid = 0, child_status = 0, parent_pid = 0, ret = 0;
+	unsigned long ss_status = 0;
+
+	ksft_print_msg("Exercising shadow stack fork test\n");
+
+	ret = my_syscall5(__NR_prctl, PR_GET_SHADOW_STACK_STATUS, &ss_status, 0, 0, 0);
+	if (ret) {
+		ksft_exit_skip("Shadow stack get status prctl failed with errorcode %d\n", ret);
+		return false;
+	}
+
+	if (!(ss_status & PR_SHADOW_STACK_ENABLE))
+		ksft_exit_skip("Shadow stack is not enabled, should be enabled via glibc\n");
+
+	parent_pid = getpid();
+	pid = fork();
+
+	if (pid) {
+		ksft_print_msg("Parent pid %d and child pid %d\n", parent_pid, pid);
+		shadow_stack_call_tests(&foo, true);
+	} else {
+		shadow_stack_call_tests(&foo_child, false);
+	}
+
+	if (pid) {
+		ksft_print_msg("Waiting on child to finish\n");
+		wait(&child_status);
+	} else {
+		/* exit child gracefully */
+		exit(0);
+	}
+
+	if (pid && WIFSIGNALED(child_status)) {
+		ksft_print_msg("Child faulted, fork test failed\n");
+		return false;
+	}
+
+	return true;
+}
+
+/* exercise 'map_shadow_stack', pivot to it and call some functions to ensure it works */
+#define SHADOW_STACK_ALLOC_SIZE 4096
+bool shadow_stack_map_test(unsigned long test_num, void *ctx)
+{
+	unsigned long shdw_addr;
+	int ret = 0;
+
+	ksft_print_msg("Exercising shadow stack map test\n");
+
+	shdw_addr = my_syscall3(__NR_map_shadow_stack, NULL, SHADOW_STACK_ALLOC_SIZE, 0);
+
+	if (((long)shdw_addr) <= 0) {
+		ksft_print_msg("map_shadow_stack failed with error code %d\n",
+			       (int)shdw_addr);
+		return false;
+	}
+
+	ret = munmap((void *)shdw_addr, SHADOW_STACK_ALLOC_SIZE);
+
+	if (ret) {
+		ksft_print_msg("munmap failed with error code %d\n", ret);
+		return false;
+	}
+
+	return true;
+}
+
+/*
+ * shadow stack protection tests. map a shadow stack and
+ * validate all memory protections work on it
+ */
+bool shadow_stack_protection_test(unsigned long test_num, void *ctx)
+{
+	unsigned long shdw_addr;
+	unsigned long *write_addr = NULL;
+	int ret = 0, pid = 0, child_status = 0;
+
+	ksft_print_msg("Exercising shadow stack protection test (WPT)\n");
+
+	shdw_addr = my_syscall3(__NR_map_shadow_stack, NULL, SHADOW_STACK_ALLOC_SIZE, 0);
+
+	if (((long)shdw_addr) <= 0) {
+		ksft_print_msg("map_shadow_stack failed with error code %d\n",
+			       (int)shdw_addr);
+		return false;
+	}
+
+	write_addr = (unsigned long *)shdw_addr;
+	pid = fork();
+
+	/* no child was created, return false */
+	if (pid == -1)
+		return false;
+
+	/*
+	 * try to perform a store from child on shadow stack memory
+	 * it should result in SIGSEGV
+	 */
+	if (!pid) {
+		/* below write must lead to SIGSEGV */
+		*write_addr = 0xdeadbeef;
+	} else {
+		wait(&child_status);
+	}
+
+	/* test fail, if 0xdeadbeef present on shadow stack address */
+	if (*write_addr == 0xdeadbeef) {
+		ksft_print_msg("Shadow stack WPT failed\n");
+		return false;
+	}
+
+	/* if child reached here, then fail */
+	if (!pid) {
+		ksft_print_msg("Shadow stack WPT failed: child reached unreachable state\n");
+		return false;
+	}
+
+	/* if child exited via signal handler but not for write on ss */
+	if (WIFEXITED(child_status) &&
+	    WEXITSTATUS(child_status) != CHILD_EXIT_CODE_SSWRITE) {
+		ksft_print_msg("Shadow stack WPT failed: child wasn't signaled for write\n");
+		return false;
+	}
+
+	ret = munmap(write_addr, SHADOW_STACK_ALLOC_SIZE);
+	if (ret) {
+		ksft_print_msg("Shadow stack WPT failed: munmap failed, error code %d\n",
+			       ret);
+		return false;
+	}
+
+	return true;
+}
+
+#define SS_MAGIC_WRITE_VAL 0xbeefdead
+
+int gup_tests(int mem_fd, unsigned long *shdw_addr)
+{
+	unsigned long val = 0;
+
+	lseek(mem_fd, (unsigned long)shdw_addr, SEEK_SET);
+	if (read(mem_fd, &val, sizeof(val)) < 0) {
+		ksft_print_msg("Reading shadow stack mem via gup failed\n");
+		return 1;
+	}
+
+	val = SS_MAGIC_WRITE_VAL;
+	lseek(mem_fd, (unsigned long)shdw_addr, SEEK_SET);
+	if (write(mem_fd, &val, sizeof(val)) < 0) {
+		ksft_print_msg("Writing shadow stack mem via gup failed\n");
+		return 1;
+	}
+
+	if (*shdw_addr != SS_MAGIC_WRITE_VAL) {
+		ksft_print_msg("GUP write to shadow stack memory failed\n");
+		return 1;
+	}
+
+	return 0;
+}
+
+bool shadow_stack_gup_tests(unsigned long test_num, void *ctx)
+{
+	unsigned long shdw_addr = 0;
+	unsigned long *write_addr = NULL;
+	int fd = 0;
+	bool ret = false;
+
+	ksft_print_msg("Exercising shadow stack gup tests\n");
+	shdw_addr = my_syscall3(__NR_map_shadow_stack, NULL, SHADOW_STACK_ALLOC_SIZE, 0);
+
+	if (((long)shdw_addr) <= 0) {
+		ksft_print_msg("map_shadow_stack failed with error code %d\n", (int)shdw_addr);
+		return false;
+	}
+
+	write_addr = (unsigned long *)shdw_addr;
+
+	fd = open("/proc/self/mem", O_RDWR);
+	if (fd == -1)
+		return false;
+
+	if (gup_tests(fd, write_addr)) {
+		ksft_print_msg("gup tests failed\n");
+		goto out;
+	}
+
+	ret = true;
+out:
+	if (shdw_addr && munmap(write_addr, SHADOW_STACK_ALLOC_SIZE)) {
+		ksft_print_msg("munmap failed with error code %d\n", ret);
+		ret = false;
+	}
+
+	return ret;
+}
+
+volatile bool break_loop;
+
+void sigusr1_handler(int signo)
+{
+	break_loop = true;
+}
+
+bool sigusr1_signal_test(void)
+{
+	struct sigaction sa = {};
+
+	sa.sa_handler = sigusr1_handler;
+	sa.sa_flags = 0;
+	sigemptyset(&sa.sa_mask);
+	if (sigaction(SIGUSR1, &sa, NULL)) {
+		ksft_print_msg("Registering signal handler for SIGUSR1 failed\n");
+		return false;
+	}
+
+	return true;
+}
+
+/*
+ * shadow stack signal test. shadow stack must be enabled.
+ * register a signal, fork another thread which is waiting
+ * on signal. Send a signal from parent to child, verify
+ * that signal was received by child. If not test fails
+ */
+bool shadow_stack_signal_test(unsigned long test_num, void *ctx)
+{
+	int pid = 0, child_status = 0, ret = 0;
+	unsigned long ss_status = 0;
+
+	ksft_print_msg("Exercising shadow stack signal test\n");
+
+	ret = my_syscall5(__NR_prctl, PR_GET_SHADOW_STACK_STATUS, &ss_status, 0, 0, 0);
+	if (ret) {
+		ksft_print_msg("Shadow stack get status prctl failed with errorcode %d\n", ret);
+		return false;
+	}
+
+	if (!(ss_status & PR_SHADOW_STACK_ENABLE))
+		ksft_print_msg("Shadow stack is not enabled, should be enabled via glibc\n");
+
+	/* this should be caught by signal handler and do an exit */
+	if (!sigusr1_signal_test()) {
+		ksft_print_msg("Registering sigusr1 handler failed\n");
+		exit(-1);
+	}
+
+	pid = fork();
+
+	if (pid == -1) {
+		ksft_print_msg("Signal test: fork failed\n");
+		goto out;
+	}
+
+	if (pid == 0) {
+		while (!break_loop)
+			sleep(1);
+
+		exit(11);
+		/* child shouldn't go beyond here */
+	}
+
+	/* send SIGUSR1 to child */
+	kill(pid, SIGUSR1);
+	wait(&child_status);
+
+out:
+
+	return (WIFEXITED(child_status) &&
+		WEXITSTATUS(child_status) == 11);
+}
+
+int execute_shadow_stack_tests(void)
+{
+	int ret = 0;
+	unsigned long test_count = 0;
+	unsigned long shstk_status = 0;
+	bool test_pass = false;
+
+	ksft_print_msg("Executing RISC-V shadow stack self tests\n");
+	ksft_set_plan(RISCV_SHADOW_STACK_TESTS);
+
+	ret = my_syscall5(__NR_prctl, PR_GET_SHADOW_STACK_STATUS, &shstk_status, 0, 0, 0);
+
+	if (ret != 0)
+		ksft_exit_fail_msg("Get shadow stack status failed with %d\n", ret);
+
+	/*
+	 * If we are here that means get shadow stack status succeeded and
+	 * thus shadow stack support is baked in the kernel.
+	 */
+	while (test_count < RISCV_SHADOW_STACK_TESTS) {
+		test_pass = (*shstk_tests[test_count].t_func)(test_count, NULL);
+		ksft_test_result(test_pass, shstk_tests[test_count].name);
+		test_count++;
+	}
+
+	ksft_finished();
+
+	return 0;
+}
+
+#pragma GCC pop_options
diff --git a/tools/testing/selftests/riscv/cfi/shadowstack.h b/tools/testing/selftests/riscv/cfi/shadowstack.h
new file mode 100644
index 000000000000..943a3685905f
--- /dev/null
+++ b/tools/testing/selftests/riscv/cfi/shadowstack.h
@@ -0,0 +1,27 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#ifndef SELFTEST_SHADOWSTACK_TEST_H
+#define SELFTEST_SHADOWSTACK_TEST_H
+#include <stddef.h>
+#include <linux/prctl.h>
+
+/*
+ * A CFI test returns true for success or false for fail.
+ * Takes a test number to index into array, and a void pointer.
+ */
+typedef bool (*shstk_test_func)(unsigned long test_num, void *);
+
+struct shadow_stack_tests {
+	char *name;
+	shstk_test_func t_func;
+};
+
+bool shadow_stack_fork_test(unsigned long test_num, void *ctx);
+bool shadow_stack_map_test(unsigned long test_num, void *ctx);
+bool shadow_stack_protection_test(unsigned long test_num, void *ctx);
+bool shadow_stack_gup_tests(unsigned long test_num, void *ctx);
+bool shadow_stack_signal_test(unsigned long test_num, void *ctx);
+
+int execute_shadow_stack_tests(void);
+
+#endif
-- 
cgit v1.2.3


From 6bc85baba4b08c787a8c9ba1bb0252a83e5c5603 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 9 Jan 2026 11:21:30 -0500
Subject: xdrgen: Implement pass-through lines in specifications

XDR specification files can contain lines prefixed with '%' that
pass through unchanged to generated output. Traditional rpcgen
removes the '%' and emits the remainder verbatim, allowing direct
insertion of C includes, pragma directives, or other language-
specific content into the generated code.

Until now, xdrgen silently discarded these lines during parsing.
This prevented specifications from including necessary headers or
preprocessor directives that might be required for the generated
code to compile correctly.

The grammar now captures pass-through lines instead of ignoring
them. A new AST node type represents pass-through content, and
the AST transformer strips the leading '%' character. Definition
and source generators emit pass-through content in document order,
preserving the original placement within the specification.

This brings xdrgen closer to feature parity with traditional
rpcgen while maintaining the existing document-order processing
model.

Existing generated xdrgen source code has been regenerated.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/nfs4xdr_gen.c                              | 11 +++++-
 fs/nfsd/nfs4xdr_gen.h                              |  2 +-
 include/linux/sunrpc/xdrgen/nfs4_1.h               | 11 +++++-
 tools/net/sunrpc/xdrgen/README                     |  2 --
 tools/net/sunrpc/xdrgen/generators/passthru.py     | 26 +++++++++++++++
 tools/net/sunrpc/xdrgen/grammars/xdr.lark          |  6 ++--
 tools/net/sunrpc/xdrgen/subcmds/declarations.py    |  4 +--
 tools/net/sunrpc/xdrgen/subcmds/definitions.py     |  5 ++-
 tools/net/sunrpc/xdrgen/subcmds/source.py          | 24 +++++++++----
 .../xdrgen/templates/C/passthru/definition.j2      |  3 ++
 .../sunrpc/xdrgen/templates/C/passthru/source.j2   |  3 ++
 tools/net/sunrpc/xdrgen/xdr_ast.py                 | 39 ++++++++++++++++++++--
 12 files changed, 116 insertions(+), 20 deletions(-)
 create mode 100644 tools/net/sunrpc/xdrgen/generators/passthru.py
 create mode 100644 tools/net/sunrpc/xdrgen/templates/C/passthru/definition.j2
 create mode 100644 tools/net/sunrpc/xdrgen/templates/C/passthru/source.j2

(limited to 'tools')

diff --git a/fs/nfsd/nfs4xdr_gen.c b/fs/nfsd/nfs4xdr_gen.c
index 1e5e2243625c..ce5c36e9070a 100644
--- a/fs/nfsd/nfs4xdr_gen.c
+++ b/fs/nfsd/nfs4xdr_gen.c
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 // Generated by xdrgen. Manual edits will be lost.
 // XDR specification file: ../../Documentation/sunrpc/xdr/nfs4_1.x
-// XDR specification modification time: Thu Dec 25 13:44:43 2025
+// XDR specification modification time: Thu Jan  8 23:11:48 2026
 
 #include <linux/sunrpc/svc.h>
 
@@ -178,6 +178,10 @@ xdrgen_decode_fattr4_open_arguments(struct xdr_stream *xdr, fattr4_open_argument
 	return xdrgen_decode_open_arguments4(xdr, ptr);
 }
 
+/*
+ * Determine what OPEN supports.
+ */
+
 bool
 xdrgen_decode_fattr4_time_deleg_access(struct xdr_stream *xdr, fattr4_time_deleg_access *ptr)
 {
@@ -190,6 +194,11 @@ xdrgen_decode_fattr4_time_deleg_modify(struct xdr_stream *xdr, fattr4_time_deleg
 	return xdrgen_decode_nfstime4(xdr, ptr);
 }
 
+/*
+ * New RECOMMENDED Attribute for
+ * delegation caching of times
+ */
+
 static bool __maybe_unused
 xdrgen_decode_open_delegation_type4(struct xdr_stream *xdr, open_delegation_type4 *ptr)
 {
diff --git a/fs/nfsd/nfs4xdr_gen.h b/fs/nfsd/nfs4xdr_gen.h
index 47437876e803..8dfe10246506 100644
--- a/fs/nfsd/nfs4xdr_gen.h
+++ b/fs/nfsd/nfs4xdr_gen.h
@@ -1,7 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 /* Generated by xdrgen. Manual edits will be lost. */
 /* XDR specification file: ../../Documentation/sunrpc/xdr/nfs4_1.x */
-/* XDR specification modification time: Thu Dec 25 13:44:43 2025 */
+/* XDR specification modification time: Thu Jan  8 23:11:48 2026 */
 
 #ifndef _LINUX_XDRGEN_NFS4_1_DECL_H
 #define _LINUX_XDRGEN_NFS4_1_DECL_H
diff --git a/include/linux/sunrpc/xdrgen/nfs4_1.h b/include/linux/sunrpc/xdrgen/nfs4_1.h
index 352bffda08f7..5283739242c7 100644
--- a/include/linux/sunrpc/xdrgen/nfs4_1.h
+++ b/include/linux/sunrpc/xdrgen/nfs4_1.h
@@ -1,7 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 /* Generated by xdrgen. Manual edits will be lost. */
 /* XDR specification file: ../../Documentation/sunrpc/xdr/nfs4_1.x */
-/* XDR specification modification time: Thu Dec 25 13:44:43 2025 */
+/* XDR specification modification time: Thu Jan  8 23:11:48 2026 */
 
 #ifndef _LINUX_XDRGEN_NFS4_1_DEF_H
 #define _LINUX_XDRGEN_NFS4_1_DEF_H
@@ -87,6 +87,10 @@ typedef enum open_args_createmode4 open_args_createmode4;
 
 typedef struct open_arguments4 fattr4_open_arguments;
 
+/*
+ * Determine what OPEN supports.
+ */
+
 enum { FATTR4_OPEN_ARGUMENTS = 86 };
 
 enum { OPEN4_RESULT_NO_OPEN_STATEID = 0x00000010 };
@@ -95,6 +99,11 @@ typedef struct nfstime4 fattr4_time_deleg_access;
 
 typedef struct nfstime4 fattr4_time_deleg_modify;
 
+/*
+ * New RECOMMENDED Attribute for
+ * delegation caching of times
+ */
+
 enum { FATTR4_TIME_DELEG_ACCESS = 84 };
 
 enum { FATTR4_TIME_DELEG_MODIFY = 85 };
diff --git a/tools/net/sunrpc/xdrgen/README b/tools/net/sunrpc/xdrgen/README
index 27218a78ab40..2cf05d1e4cd9 100644
--- a/tools/net/sunrpc/xdrgen/README
+++ b/tools/net/sunrpc/xdrgen/README
@@ -250,8 +250,6 @@ Add more pragma directives:
 Enable something like a #include to dynamically insert the content
 of other specification files
 
-Properly support line-by-line pass-through via the "%" decorator
-
 Build a unit test suite for verifying translation of XDR language
 into compilable code
 
diff --git a/tools/net/sunrpc/xdrgen/generators/passthru.py b/tools/net/sunrpc/xdrgen/generators/passthru.py
new file mode 100644
index 000000000000..cb17bd977f1e
--- /dev/null
+++ b/tools/net/sunrpc/xdrgen/generators/passthru.py
@@ -0,0 +1,26 @@
+#!/usr/bin/env python3
+# ex: set filetype=python:
+
+"""Generate code for XDR pass-through lines"""
+
+from generators import SourceGenerator, create_jinja2_environment
+from xdr_ast import _XdrPassthru
+
+
+class XdrPassthruGenerator(SourceGenerator):
+    """Generate source code for XDR pass-through content"""
+
+    def __init__(self, language: str, peer: str):
+        """Initialize an instance of this class"""
+        self.environment = create_jinja2_environment(language, "passthru")
+        self.peer = peer
+
+    def emit_definition(self, node: _XdrPassthru) -> None:
+        """Emit one pass-through line"""
+        template = self.environment.get_template("definition.j2")
+        print(template.render(content=node.content))
+
+    def emit_decoder(self, node: _XdrPassthru) -> None:
+        """Emit one pass-through line"""
+        template = self.environment.get_template("source.j2")
+        print(template.render(content=node.content))
diff --git a/tools/net/sunrpc/xdrgen/grammars/xdr.lark b/tools/net/sunrpc/xdrgen/grammars/xdr.lark
index b7c664f2acb7..1d2afff98ac5 100644
--- a/tools/net/sunrpc/xdrgen/grammars/xdr.lark
+++ b/tools/net/sunrpc/xdrgen/grammars/xdr.lark
@@ -78,6 +78,9 @@ definition              : constant_def
                         | type_def
                         | program_def
                         | pragma_def
+                        | passthru_def
+
+passthru_def            : PASSTHRU
 
 //
 // RPC program definitions not specified in RFC 4506
@@ -115,8 +118,7 @@ decimal_constant        : /[\+-]?(0|[1-9][0-9]*)/
 hexadecimal_constant    : /0x([a-f]|[A-F]|[0-9])+/
 octal_constant          : /0[0-7]+/
 
-PASSTHRU                : "%" | "%" /.+/
-%ignore PASSTHRU
+PASSTHRU                : /%.*/
 
 %import common.C_COMMENT
 %ignore C_COMMENT
diff --git a/tools/net/sunrpc/xdrgen/subcmds/declarations.py b/tools/net/sunrpc/xdrgen/subcmds/declarations.py
index 97ffb76a02f1..ed83d48d1f68 100644
--- a/tools/net/sunrpc/xdrgen/subcmds/declarations.py
+++ b/tools/net/sunrpc/xdrgen/subcmds/declarations.py
@@ -10,7 +10,6 @@ from argparse import Namespace
 from lark import logger
 from lark.exceptions import VisitError
 
-from generators.constant import XdrConstantGenerator
 from generators.enum import XdrEnumGenerator
 from generators.header_bottom import XdrHeaderBottomGenerator
 from generators.header_top import XdrHeaderTopGenerator
@@ -21,8 +20,7 @@ from generators.struct import XdrStructGenerator
 from generators.union import XdrUnionGenerator
 
 from xdr_ast import transform_parse_tree, _RpcProgram, Specification
-from xdr_ast import _XdrConstant, _XdrEnum, _XdrPointer
-from xdr_ast import _XdrTypedef, _XdrStruct, _XdrUnion
+from xdr_ast import _XdrEnum, _XdrPointer, _XdrTypedef, _XdrStruct, _XdrUnion
 from xdr_parse import xdr_parser, set_xdr_annotate
 from xdr_parse import make_error_handler, XdrParseError
 from xdr_parse import handle_transform_error
diff --git a/tools/net/sunrpc/xdrgen/subcmds/definitions.py b/tools/net/sunrpc/xdrgen/subcmds/definitions.py
index b17526a03dda..a48ca0549382 100644
--- a/tools/net/sunrpc/xdrgen/subcmds/definitions.py
+++ b/tools/net/sunrpc/xdrgen/subcmds/definitions.py
@@ -14,6 +14,7 @@ from generators.constant import XdrConstantGenerator
 from generators.enum import XdrEnumGenerator
 from generators.header_bottom import XdrHeaderBottomGenerator
 from generators.header_top import XdrHeaderTopGenerator
+from generators.passthru import XdrPassthruGenerator
 from generators.pointer import XdrPointerGenerator
 from generators.program import XdrProgramGenerator
 from generators.typedef import XdrTypedefGenerator
@@ -21,7 +22,7 @@ from generators.struct import XdrStructGenerator
 from generators.union import XdrUnionGenerator
 
 from xdr_ast import transform_parse_tree, Specification
-from xdr_ast import _RpcProgram, _XdrConstant, _XdrEnum, _XdrPointer
+from xdr_ast import _RpcProgram, _XdrConstant, _XdrEnum, _XdrPassthru, _XdrPointer
 from xdr_ast import _XdrTypedef, _XdrStruct, _XdrUnion
 from xdr_parse import xdr_parser, set_xdr_annotate
 from xdr_parse import make_error_handler, XdrParseError
@@ -47,6 +48,8 @@ def emit_header_definitions(root: Specification, language: str, peer: str) -> No
             gen = XdrStructGenerator(language, peer)
         elif isinstance(definition.value, _XdrUnion):
             gen = XdrUnionGenerator(language, peer)
+        elif isinstance(definition.value, _XdrPassthru):
+            gen = XdrPassthruGenerator(language, peer)
         else:
             continue
         gen.emit_definition(definition.value)
diff --git a/tools/net/sunrpc/xdrgen/subcmds/source.py b/tools/net/sunrpc/xdrgen/subcmds/source.py
index 6508563494fe..27e8767b1b58 100644
--- a/tools/net/sunrpc/xdrgen/subcmds/source.py
+++ b/tools/net/sunrpc/xdrgen/subcmds/source.py
@@ -12,6 +12,7 @@ from lark.exceptions import VisitError
 
 from generators.source_top import XdrSourceTopGenerator
 from generators.enum import XdrEnumGenerator
+from generators.passthru import XdrPassthruGenerator
 from generators.pointer import XdrPointerGenerator
 from generators.program import XdrProgramGenerator
 from generators.typedef import XdrTypedefGenerator
@@ -19,7 +20,7 @@ from generators.struct import XdrStructGenerator
 from generators.union import XdrUnionGenerator
 
 from xdr_ast import transform_parse_tree, _RpcProgram, Specification
-from xdr_ast import _XdrAst, _XdrEnum, _XdrPointer
+from xdr_ast import _XdrAst, _XdrEnum, _XdrPassthru, _XdrPointer
 from xdr_ast import _XdrStruct, _XdrTypedef, _XdrUnion
 
 from xdr_parse import xdr_parser, set_xdr_annotate, set_xdr_enum_validation
@@ -74,22 +75,31 @@ def generate_server_source(filename: str, root: Specification, language: str) ->
     gen.emit_source(filename, root)
 
     for definition in root.definitions:
-        emit_source_decoder(definition.value, language, "server")
+        if isinstance(definition.value, _XdrPassthru):
+            passthru_gen = XdrPassthruGenerator(language, "server")
+            passthru_gen.emit_decoder(definition.value)
+        else:
+            emit_source_decoder(definition.value, language, "server")
     for definition in root.definitions:
-        emit_source_encoder(definition.value, language, "server")
+        if not isinstance(definition.value, _XdrPassthru):
+            emit_source_encoder(definition.value, language, "server")
 
 
 def generate_client_source(filename: str, root: Specification, language: str) -> None:
-    """Generate server-side source code"""
+    """Generate client-side source code"""
 
     gen = XdrSourceTopGenerator(language, "client")
     gen.emit_source(filename, root)
 
-    print("")
     for definition in root.definitions:
-        emit_source_encoder(definition.value, language, "client")
+        if isinstance(definition.value, _XdrPassthru):
+            passthru_gen = XdrPassthruGenerator(language, "client")
+            passthru_gen.emit_decoder(definition.value)
+        else:
+            emit_source_encoder(definition.value, language, "client")
     for definition in root.definitions:
-        emit_source_decoder(definition.value, language, "client")
+        if not isinstance(definition.value, _XdrPassthru):
+            emit_source_decoder(definition.value, language, "client")
 
     # cel: todo: client needs PROC macros
 
diff --git a/tools/net/sunrpc/xdrgen/templates/C/passthru/definition.j2 b/tools/net/sunrpc/xdrgen/templates/C/passthru/definition.j2
new file mode 100644
index 000000000000..900c7516a29c
--- /dev/null
+++ b/tools/net/sunrpc/xdrgen/templates/C/passthru/definition.j2
@@ -0,0 +1,3 @@
+{# SPDX-License-Identifier: GPL-2.0 #}
+
+{{ content }}
diff --git a/tools/net/sunrpc/xdrgen/templates/C/passthru/source.j2 b/tools/net/sunrpc/xdrgen/templates/C/passthru/source.j2
new file mode 100644
index 000000000000..900c7516a29c
--- /dev/null
+++ b/tools/net/sunrpc/xdrgen/templates/C/passthru/source.j2
@@ -0,0 +1,3 @@
+{# SPDX-License-Identifier: GPL-2.0 #}
+
+{{ content }}
diff --git a/tools/net/sunrpc/xdrgen/xdr_ast.py b/tools/net/sunrpc/xdrgen/xdr_ast.py
index dc2fa9fd8ec2..14bff9477473 100644
--- a/tools/net/sunrpc/xdrgen/xdr_ast.py
+++ b/tools/net/sunrpc/xdrgen/xdr_ast.py
@@ -516,6 +516,13 @@ class _Pragma(_XdrAst):
     """Empty class for pragma directives"""
 
 
+@dataclass
+class _XdrPassthru(_XdrAst):
+    """Passthrough line to emit verbatim in output"""
+
+    content: str
+
+
 @dataclass
 class Definition(_XdrAst, ast_utils.WithMeta):
     """Corresponds to 'definition' in the grammar"""
@@ -738,14 +745,42 @@ class ParseToAst(Transformer):
                 raise NotImplementedError("Directive not supported")
         return _Pragma()
 
+    def passthru_def(self, children):
+        """Instantiate one _XdrPassthru object"""
+        token = children[0]
+        content = token.value[1:]
+        return _XdrPassthru(content)
+
 
 transformer = ast_utils.create_transformer(this_module, ParseToAst())
 
 
+def _merge_consecutive_passthru(definitions: List[Definition]) -> List[Definition]:
+    """Merge consecutive passthru definitions into single nodes"""
+    result = []
+    i = 0
+    while i < len(definitions):
+        if isinstance(definitions[i].value, _XdrPassthru):
+            lines = [definitions[i].value.content]
+            meta = definitions[i].meta
+            j = i + 1
+            while j < len(definitions) and isinstance(definitions[j].value, _XdrPassthru):
+                lines.append(definitions[j].value.content)
+                j += 1
+            merged = _XdrPassthru("\n".join(lines))
+            result.append(Definition(meta, merged))
+            i = j
+        else:
+            result.append(definitions[i])
+            i += 1
+    return result
+
+
 def transform_parse_tree(parse_tree):
     """Transform productions into an abstract syntax tree"""
-
-    return transformer.transform(parse_tree)
+    ast = transformer.transform(parse_tree)
+    ast.definitions = _merge_consecutive_passthru(ast.definitions)
+    return ast
 
 
 def get_header_name() -> str:
-- 
cgit v1.2.3


From c17b9046faf7d1f3b8bb992e4d53da873dc478fc Mon Sep 17 00:00:00 2001
From: Koichiro Den <den@valinux.co.jp>
Date: Sat, 24 Jan 2026 23:50:12 +0900
Subject: selftests: pci_endpoint: Add BAR subrange mapping test case

Add BAR_SUBRANGE_TEST to the pci_endpoint kselftest suite.

The test uses the PCITEST_BAR_SUBRANGE ioctl and will skip when the
chosen BAR is disabled (-ENODATA), when the endpoint/controller does not
support subrange mapping (-EOPNOTSUPP), or when the BAR is reserved for
the test register space (-EBUSY).

Signed-off-by: Koichiro Den <den@valinux.co.jp>
Signed-off-by: Manivannan Sadhasivam <mani@kernel.org>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Link: https://patch.msgid.link/20260124145012.2794108-9-den@valinux.co.jp
---
 .../testing/selftests/pci_endpoint/pci_endpoint_test.c  | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/pci_endpoint/pci_endpoint_test.c b/tools/testing/selftests/pci_endpoint/pci_endpoint_test.c
index 23aac6f97061..eecb776c33af 100644
--- a/tools/testing/selftests/pci_endpoint/pci_endpoint_test.c
+++ b/tools/testing/selftests/pci_endpoint/pci_endpoint_test.c
@@ -70,6 +70,23 @@ TEST_F(pci_ep_bar, BAR_TEST)
 	EXPECT_FALSE(ret) TH_LOG("Test failed for BAR%d", variant->barno);
 }
 
+TEST_F(pci_ep_bar, BAR_SUBRANGE_TEST)
+{
+	int ret;
+
+	pci_ep_ioctl(PCITEST_SET_IRQTYPE, PCITEST_IRQ_TYPE_AUTO);
+	ASSERT_EQ(0, ret) TH_LOG("Can't set AUTO IRQ type");
+
+	pci_ep_ioctl(PCITEST_BAR_SUBRANGE, variant->barno);
+	if (ret == -ENODATA)
+		SKIP(return, "BAR is disabled");
+	if (ret == -EBUSY)
+		SKIP(return, "BAR is test register space");
+	if (ret == -EOPNOTSUPP)
+		SKIP(return, "Subrange map is not supported");
+	EXPECT_FALSE(ret) TH_LOG("Test failed for BAR%d", variant->barno);
+}
+
 FIXTURE(pci_ep_basic)
 {
 	int fd;
-- 
cgit v1.2.3


From b3827c91cc9979fe04d99e016fb9c5f6260f29a0 Mon Sep 17 00:00:00 2001
From: Andre Carvalho <asantostc@gmail.com>
Date: Tue, 27 Jan 2026 19:39:20 +0000
Subject: netconsole: selftests: Move netconsole selftests to separate target

This patch moves netconsole selftests from drivers/net to its own target
in drivers/net/netconsole.

This change helps saving some resources from CI since tests in
drivers/net automatically run against real hardware which are not used
by netconsole tests as they rely solely on netdevsim.

lib_netcons.sh is kept under drivers/net/lib since it is also used by
bonding selftests. Finally, drivers/net config remains unchanged as
netpoll_basic.py requires netconsole (and does leverage real HW testing).

Reviewed-by: Breno Leitao <leitao@debian.org>
Signed-off-by: Andre Carvalho <asantostc@gmail.com>
Link: https://patch.msgid.link/20260127-netcons-selftest-target-v2-1-f509ab65b3bc@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 MAINTAINERS                                        |   2 +-
 tools/testing/selftests/Makefile                   |   1 +
 tools/testing/selftests/drivers/net/Makefile       |   7 -
 .../testing/selftests/drivers/net/netcons_basic.sh |  74 ------
 .../selftests/drivers/net/netcons_cmdline.sh       |  65 -----
 .../drivers/net/netcons_fragmented_msg.sh          | 122 ---------
 .../selftests/drivers/net/netcons_overflow.sh      |  67 -----
 .../selftests/drivers/net/netcons_resume.sh        | 124 ----------
 .../selftests/drivers/net/netcons_sysdata.sh       | 272 ---------------------
 .../selftests/drivers/net/netcons_torture.sh       | 130 ----------
 .../selftests/drivers/net/netconsole/Makefile      |  19 ++
 .../selftests/drivers/net/netconsole/config        |   6 +
 .../drivers/net/netconsole/netcons_basic.sh        |  74 ++++++
 .../drivers/net/netconsole/netcons_cmdline.sh      |  65 +++++
 .../net/netconsole/netcons_fragmented_msg.sh       | 122 +++++++++
 .../drivers/net/netconsole/netcons_overflow.sh     |  67 +++++
 .../drivers/net/netconsole/netcons_resume.sh       | 124 ++++++++++
 .../drivers/net/netconsole/netcons_sysdata.sh      | 272 +++++++++++++++++++++
 .../drivers/net/netconsole/netcons_torture.sh      | 130 ++++++++++
 19 files changed, 881 insertions(+), 862 deletions(-)
 delete mode 100755 tools/testing/selftests/drivers/net/netcons_basic.sh
 delete mode 100755 tools/testing/selftests/drivers/net/netcons_cmdline.sh
 delete mode 100755 tools/testing/selftests/drivers/net/netcons_fragmented_msg.sh
 delete mode 100755 tools/testing/selftests/drivers/net/netcons_overflow.sh
 delete mode 100755 tools/testing/selftests/drivers/net/netcons_resume.sh
 delete mode 100755 tools/testing/selftests/drivers/net/netcons_sysdata.sh
 delete mode 100755 tools/testing/selftests/drivers/net/netcons_torture.sh
 create mode 100644 tools/testing/selftests/drivers/net/netconsole/Makefile
 create mode 100644 tools/testing/selftests/drivers/net/netconsole/config
 create mode 100755 tools/testing/selftests/drivers/net/netconsole/netcons_basic.sh
 create mode 100755 tools/testing/selftests/drivers/net/netconsole/netcons_cmdline.sh
 create mode 100755 tools/testing/selftests/drivers/net/netconsole/netcons_fragmented_msg.sh
 create mode 100755 tools/testing/selftests/drivers/net/netconsole/netcons_overflow.sh
 create mode 100755 tools/testing/selftests/drivers/net/netconsole/netcons_resume.sh
 create mode 100755 tools/testing/selftests/drivers/net/netconsole/netcons_sysdata.sh
 create mode 100755 tools/testing/selftests/drivers/net/netconsole/netcons_torture.sh

(limited to 'tools')

diff --git a/MAINTAINERS b/MAINTAINERS
index ff9e204c5f33..0caa8aee5840 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -18021,7 +18021,7 @@ S:	Maintained
 F:	Documentation/networking/netconsole.rst
 F:	drivers/net/netconsole.c
 F:	tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh
-F:	tools/testing/selftests/drivers/net/netcons\*
+F:	tools/testing/selftests/drivers/net/netconsole/
 
 NETDEVSIM
 M:	Jakub Kicinski <kuba@kernel.org>
diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
index 56e44a98d6a5..450f13ba4cca 100644
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -22,6 +22,7 @@ TARGETS += drivers/ntsync
 TARGETS += drivers/s390x/uvdevice
 TARGETS += drivers/net
 TARGETS += drivers/net/bonding
+TARGETS += drivers/net/netconsole
 TARGETS += drivers/net/team
 TARGETS += drivers/net/virtio_net
 TARGETS += drivers/platform/x86/intel/ifs
diff --git a/tools/testing/selftests/drivers/net/Makefile b/tools/testing/selftests/drivers/net/Makefile
index 3eba569b3366..8154d6d429d3 100644
--- a/tools/testing/selftests/drivers/net/Makefile
+++ b/tools/testing/selftests/drivers/net/Makefile
@@ -15,13 +15,6 @@ TEST_PROGS := \
 	hds.py \
 	napi_id.py \
 	napi_threaded.py \
-	netcons_basic.sh \
-	netcons_cmdline.sh \
-	netcons_fragmented_msg.sh \
-	netcons_overflow.sh \
-	netcons_resume.sh \
-	netcons_sysdata.sh \
-	netcons_torture.sh \
 	netpoll_basic.py \
 	ping.py \
 	psp.py \
diff --git a/tools/testing/selftests/drivers/net/netcons_basic.sh b/tools/testing/selftests/drivers/net/netcons_basic.sh
deleted file mode 100755
index 2022f3061738..000000000000
--- a/tools/testing/selftests/drivers/net/netcons_basic.sh
+++ /dev/null
@@ -1,74 +0,0 @@
-#!/usr/bin/env bash
-# SPDX-License-Identifier: GPL-2.0
-
-# This test creates two netdevsim virtual interfaces, assigns one of them (the
-# "destination interface") to a new namespace, and assigns IP addresses to both
-# interfaces.
-#
-# It listens on the destination interface using socat and configures a dynamic
-# target on netconsole, pointing to the destination IP address.
-#
-# Finally, it checks whether the message was received properly on the
-# destination interface.  Note that this test may pollute the kernel log buffer
-# (dmesg) and relies on dynamic configuration and namespaces being configured.
-#
-# Author: Breno Leitao <leitao@debian.org>
-
-set -euo pipefail
-
-SCRIPTDIR=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")")
-
-source "${SCRIPTDIR}"/lib/sh/lib_netcons.sh
-
-modprobe netdevsim 2> /dev/null || true
-modprobe netconsole 2> /dev/null || true
-
-# The content of kmsg will be save to the following file
-OUTPUT_FILE="/tmp/${TARGET}"
-
-# Check for basic system dependency and exit if not found
-check_for_dependencies
-# Remove the namespace, interfaces and netconsole target on exit
-trap cleanup EXIT
-
-# Run the test twice, with different format modes
-for FORMAT in "basic" "extended"
-do
-	for IP_VERSION in "ipv6" "ipv4"
-	do
-		echo "Running with target mode: ${FORMAT} (${IP_VERSION})"
-		# Set current loglevel to KERN_INFO(6), and default to
-		# KERN_NOTICE(5)
-		echo "6 5" > /proc/sys/kernel/printk
-		# Create one namespace and two interfaces
-		set_network "${IP_VERSION}"
-		# Create a dynamic target for netconsole
-		create_dynamic_target "${FORMAT}"
-		# Only set userdata for extended format
-		if [ "$FORMAT" == "extended" ]
-		then
-			# Set userdata "key" with the "value" value
-			set_user_data
-		fi
-		# Listed for netconsole port inside the namespace and
-		# destination interface
-		listen_port_and_save_to "${OUTPUT_FILE}" "${IP_VERSION}" &
-		# Wait for socat to start and listen to the port.
-		wait_for_port "${NAMESPACE}" "${PORT}" "${IP_VERSION}"
-		# Send the message
-		echo "${MSG}: ${TARGET}" > /dev/kmsg
-		# Wait until socat saves the file to disk
-		busywait "${BUSYWAIT_TIMEOUT}" test -s "${OUTPUT_FILE}"
-
-		# Make sure the message was received in the dst part
-		# and exit
-		validate_result "${OUTPUT_FILE}" "${FORMAT}"
-		# kill socat in case it is still running
-		pkill_socat
-		cleanup
-		echo "${FORMAT} : ${IP_VERSION} : Test passed" >&2
-	done
-done
-
-trap - EXIT
-exit "${ksft_pass}"
diff --git a/tools/testing/selftests/drivers/net/netcons_cmdline.sh b/tools/testing/selftests/drivers/net/netcons_cmdline.sh
deleted file mode 100755
index d1d23dc67f99..000000000000
--- a/tools/testing/selftests/drivers/net/netcons_cmdline.sh
+++ /dev/null
@@ -1,65 +0,0 @@
-#!/usr/bin/env bash
-# SPDX-License-Identifier: GPL-2.0
-
-# This is a selftest to test cmdline arguments on netconsole.
-# It exercises loading of netconsole from cmdline instead of the dynamic
-# reconfiguration. This includes parsing the long netconsole= line and all the
-# flow through init_netconsole().
-#
-# Author: Breno Leitao <leitao@debian.org>
-
-set -euo pipefail
-
-SCRIPTDIR=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")")
-
-source "${SCRIPTDIR}"/lib/sh/lib_netcons.sh
-
-check_netconsole_module
-
-modprobe netdevsim 2> /dev/null || true
-rmmod netconsole 2> /dev/null || true
-
-# Check for basic system dependency and exit if not found
-# check_for_dependencies
-# Set current loglevel to KERN_INFO(6), and default to KERN_NOTICE(5)
-echo "6 5" > /proc/sys/kernel/printk
-# Remove the namespace and network interfaces
-trap do_cleanup EXIT
-# Create one namespace and two interfaces
-set_network
-
-# Run the test twice, with different cmdline parameters
-for BINDMODE in "ifname" "mac"
-do
-	echo "Running with bind mode: ${BINDMODE}" >&2
-	# Create the command line for netconsole, with the configuration from
-	# the function above
-	CMDLINE=$(create_cmdline_str "${BINDMODE}")
-
-	# The content of kmsg will be save to the following file
-	OUTPUT_FILE="/tmp/${TARGET}-${BINDMODE}"
-
-	# Load the module, with the cmdline set
-	modprobe netconsole "${CMDLINE}"
-
-	# Listed for netconsole port inside the namespace and destination
-	# interface
-	listen_port_and_save_to "${OUTPUT_FILE}" &
-	# Wait for socat to start and listen to the port.
-	wait_local_port_listen "${NAMESPACE}" "${PORT}" udp
-	# Send the message
-	echo "${MSG}: ${TARGET}" > /dev/kmsg
-	# Wait until socat saves the file to disk
-	busywait "${BUSYWAIT_TIMEOUT}" test -s "${OUTPUT_FILE}"
-	# Make sure the message was received in the dst part
-	# and exit
-	validate_msg "${OUTPUT_FILE}"
-
-	# kill socat in case it is still running
-	pkill_socat
-	# Unload the module
-	rmmod netconsole
-	echo "${BINDMODE} : Test passed" >&2
-done
-
-exit "${ksft_pass}"
diff --git a/tools/testing/selftests/drivers/net/netcons_fragmented_msg.sh b/tools/testing/selftests/drivers/net/netcons_fragmented_msg.sh
deleted file mode 100755
index 4a71e01a230c..000000000000
--- a/tools/testing/selftests/drivers/net/netcons_fragmented_msg.sh
+++ /dev/null
@@ -1,122 +0,0 @@
-#!/usr/bin/env bash
-# SPDX-License-Identifier: GPL-2.0
-
-# Test netconsole's message fragmentation functionality.
-#
-# When a message exceeds the maximum packet size, netconsole splits it into
-# multiple fragments for transmission. This test verifies:
-#  - Correct fragmentation of large messages
-#  - Proper reassembly of fragments at the receiver
-#  - Preservation of userdata across fragments
-#  - Behavior with and without kernel release version appending
-#
-# Author: Breno Leitao <leitao@debian.org>
-
-set -euo pipefail
-
-SCRIPTDIR=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")")
-
-source "${SCRIPTDIR}"/lib/sh/lib_netcons.sh
-
-modprobe netdevsim 2> /dev/null || true
-modprobe netconsole 2> /dev/null || true
-
-# The content of kmsg will be save to the following file
-OUTPUT_FILE="/tmp/${TARGET}"
-
-# set userdata to a long value. In this case, it is "1-2-3-4...50-"
-USERDATA_VALUE=$(printf -- '%.2s-' {1..60})
-
-# Convert the header string in a regexp, so, we can remove
-# the second header as well.
-# A header looks like "13,468,514729715,-,ncfrag=0/1135;". If
-# release is appended, you might find something like:L
-# "6.13.0-04048-g4f561a87745a,13,468,514729715,-,ncfrag=0/1135;"
-function header_to_regex() {
-	# header is everything before ;
-	local HEADER="${1}"
-	REGEX=$(echo "${HEADER}" | cut -d'=' -f1)
-	echo "${REGEX}=[0-9]*\/[0-9]*;"
-}
-
-# We have two headers in the message. Remove both to get the full message,
-# and extract the full message.
-function extract_msg() {
-	local MSGFILE="${1}"
-	# Extract the header, which is the very first thing that arrives in the
-	# first list.
-	HEADER=$(sed -n '1p' "${MSGFILE}" | cut -d';' -f1)
-	HEADER_REGEX=$(header_to_regex "${HEADER}")
-
-	# Remove the two headers from the received message
-	# This will return the message without any header, similarly to what
-	# was sent.
-	sed "s/""${HEADER_REGEX}""//g" "${MSGFILE}"
-}
-
-# Validate the message, which has two messages glued together.
-# unwrap them to make sure all the characters were transmitted.
-# File will look like the following:
-#  13,468,514729715,-,ncfrag=0/1135;<message>
-#   key=<part of key>-13,468,514729715,-,ncfrag=967/1135;<rest of the key>
-function validate_fragmented_result() {
-	# Discard the netconsole headers, and assemble the full message
-	RCVMSG=$(extract_msg "${1}")
-
-	# check for the main message
-	if ! echo "${RCVMSG}" | grep -q "${MSG}"; then
-		echo "Message body doesn't match." >&2
-		echo "msg received=" "${RCVMSG}" >&2
-		exit "${ksft_fail}"
-	fi
-
-	# check userdata
-	if ! echo "${RCVMSG}" | grep -q "${USERDATA_VALUE}"; then
-		echo "message userdata doesn't match" >&2
-		echo "msg received=" "${RCVMSG}" >&2
-		exit "${ksft_fail}"
-	fi
-	# test passed. hooray
-}
-
-# Check for basic system dependency and exit if not found
-check_for_dependencies
-# Set current loglevel to KERN_INFO(6), and default to KERN_NOTICE(5)
-echo "6 5" > /proc/sys/kernel/printk
-# Remove the namespace, interfaces and netconsole target on exit
-trap cleanup EXIT
-# Create one namespace and two interfaces
-set_network
-# Create a dynamic target for netconsole
-create_dynamic_target
-# Set userdata "key" with the "value" value
-set_user_data
-
-
-# TEST 1: Send message and userdata. They will fragment
-# =======
-MSG=$(printf -- 'MSG%.3s=' {1..150})
-
-# Listen for netconsole port inside the namespace and destination interface
-listen_port_and_save_to "${OUTPUT_FILE}" &
-# Wait for socat to start and listen to the port.
-wait_local_port_listen "${NAMESPACE}" "${PORT}" udp
-# Send the message
-echo "${MSG}: ${TARGET}" > /dev/kmsg
-# Wait until socat saves the file to disk
-busywait "${BUSYWAIT_TIMEOUT}" test -s "${OUTPUT_FILE}"
-# Check if the message was not corrupted
-validate_fragmented_result "${OUTPUT_FILE}"
-
-# TEST 2: Test with smaller message, and without release appended
-# =======
-MSG=$(printf -- 'FOOBAR%.3s=' {1..100})
-# Let's disable release and test again.
-disable_release_append
-
-listen_port_and_save_to "${OUTPUT_FILE}" &
-wait_local_port_listen "${NAMESPACE}" "${PORT}" udp
-echo "${MSG}: ${TARGET}" > /dev/kmsg
-busywait "${BUSYWAIT_TIMEOUT}" test -s "${OUTPUT_FILE}"
-validate_fragmented_result "${OUTPUT_FILE}"
-exit "${ksft_pass}"
diff --git a/tools/testing/selftests/drivers/net/netcons_overflow.sh b/tools/testing/selftests/drivers/net/netcons_overflow.sh
deleted file mode 100755
index 06089643b771..000000000000
--- a/tools/testing/selftests/drivers/net/netcons_overflow.sh
+++ /dev/null
@@ -1,67 +0,0 @@
-#!/usr/bin/env bash
-# SPDX-License-Identifier: GPL-2.0
-
-# This test verifies that users can successfully create up to
-# MAX_USERDATA_ITEMS userdata entries without encountering any failures.
-#
-# Additionally, it tests for expected failure when attempting to exceed this
-# maximum limit.
-#
-# Author: Breno Leitao <leitao@debian.org>
-
-set -euo pipefail
-
-SCRIPTDIR=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")")
-
-source "${SCRIPTDIR}"/lib/sh/lib_netcons.sh
-# This is coming from netconsole code. Check for it in drivers/net/netconsole.c
-MAX_USERDATA_ITEMS=256
-
-# Function to create userdata entries
-function create_userdata_max_entries() {
-	# All these keys should be created without any error
-	for i in $(seq $MAX_USERDATA_ITEMS)
-	do
-		# USERDATA_KEY is used by set_user_data
-		USERDATA_KEY="key"${i}
-		set_user_data
-	done
-}
-
-# Function to verify the entry limit
-function verify_entry_limit() {
-	# Allowing the test to fail without exiting, since the next command
-	# will fail
-	set +e
-	mkdir "${NETCONS_PATH}/userdata/key_that_will_fail" 2> /dev/null
-	ret="$?"
-	set -e
-	if [ "$ret" -eq 0 ];
-	then
-		echo "Adding more than ${MAX_USERDATA_ITEMS} entries in userdata should fail, but it didn't" >&2
-		ls "${NETCONS_PATH}/userdata/" >&2
-		exit "${ksft_fail}"
-	fi
-}
-
-# ========== #
-# Start here #
-# ========== #
-
-modprobe netdevsim 2> /dev/null || true
-modprobe netconsole 2> /dev/null || true
-
-# Check for basic system dependency and exit if not found
-check_for_dependencies
-
-# Remove the namespace, interfaces and netconsole target on exit
-trap cleanup EXIT
-# Create one namespace and two interfaces
-set_network
-# Create a dynamic target for netconsole
-create_dynamic_target
-# populate the maximum number of supported keys in userdata
-create_userdata_max_entries
-# Verify an additional entry is not allowed
-verify_entry_limit
-exit "${ksft_pass}"
diff --git a/tools/testing/selftests/drivers/net/netcons_resume.sh b/tools/testing/selftests/drivers/net/netcons_resume.sh
deleted file mode 100755
index fc5e5e3ad3d4..000000000000
--- a/tools/testing/selftests/drivers/net/netcons_resume.sh
+++ /dev/null
@@ -1,124 +0,0 @@
-#!/usr/bin/env bash
-# SPDX-License-Identifier: GPL-2.0
-
-# This test validates that netconsole is able to resume a target that was
-# deactivated when its interface was removed when the interface is brought
-# back up.
-#
-# The test configures a netconsole target and then removes netdevsim module to
-# cause the interface to disappear. Targets are configured via cmdline to ensure
-# targets bound by interface name and mac address can be resumed.
-# The test verifies that the target moved to disabled state before adding
-# netdevsim and the interface back.
-#
-# Finally, the test verifies that the target is re-enabled automatically and
-# the message is received on the destination interface.
-#
-# Author: Andre Carvalho <asantostc@gmail.com>
-
-set -euo pipefail
-
-SCRIPTDIR=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")")
-
-source "${SCRIPTDIR}"/lib/sh/lib_netcons.sh
-
-SAVED_SRCMAC="" # to be populated later
-SAVED_DSTMAC="" # to be populated later
-
-modprobe netdevsim 2> /dev/null || true
-rmmod netconsole 2> /dev/null || true
-
-check_netconsole_module
-
-function cleanup() {
-	cleanup_netcons "${NETCONS_CONFIGFS}/cmdline0"
-	do_cleanup
-	rmmod netconsole
-}
-
-function trigger_reactivation() {
-	# Add back low level module
-	modprobe netdevsim
-	# Recreate namespace and two interfaces
-	set_network
-	# Restore MACs
-	ip netns exec "${NAMESPACE}" ip link set "${DSTIF}" \
-		address "${SAVED_DSTMAC}"
-	if [ "${BINDMODE}" == "mac" ]; then
-		ip link set dev "${SRCIF}" down
-		ip link set dev "${SRCIF}" address "${SAVED_SRCMAC}"
-		# Rename device in order to trigger target resume, as initial
-		# when device was recreated it didn't have correct mac address.
-		ip link set dev "${SRCIF}" name "${TARGET}"
-	fi
-}
-
-function trigger_deactivation() {
-	# Start by storing mac addresses so we can be restored in reactivate
-	SAVED_DSTMAC=$(ip netns exec "${NAMESPACE}" \
-		cat /sys/class/net/"$DSTIF"/address)
-	SAVED_SRCMAC=$(mac_get "${SRCIF}")
-	# Remove low level module
-	rmmod netdevsim
-}
-
-trap cleanup EXIT
-
-# Run the test twice, with different cmdline parameters
-for BINDMODE in "ifname" "mac"
-do
-	echo "Running with bind mode: ${BINDMODE}" >&2
-	# Set current loglevel to KERN_INFO(6), and default to KERN_NOTICE(5)
-	echo "6 5" > /proc/sys/kernel/printk
-
-	# Create one namespace and two interfaces
-	set_network
-
-	# Create the command line for netconsole, with the configuration from
-	# the function above
-	CMDLINE=$(create_cmdline_str "${BINDMODE}")
-
-	# The content of kmsg will be save to the following file
-	OUTPUT_FILE="/tmp/${TARGET}-${BINDMODE}"
-
-	# Load the module, with the cmdline set
-	modprobe netconsole "${CMDLINE}"
-	# Expose cmdline target in configfs
-	mkdir "${NETCONS_CONFIGFS}/cmdline0"
-
-	# Target should be enabled
-	wait_target_state "cmdline0" "enabled"
-
-	# Trigger deactivation by unloading netdevsim module. Target should be
-	# disabled.
-	trigger_deactivation
-	wait_target_state "cmdline0" "disabled"
-
-	# Trigger reactivation by loading netdevsim, recreating the network and
-	# restoring mac addresses. Target should be re-enabled.
-	trigger_reactivation
-	wait_target_state "cmdline0" "enabled"
-
-	# Listen for netconsole port inside the namespace and destination
-	# interface
-	listen_port_and_save_to "${OUTPUT_FILE}" &
-	# Wait for socat to start and listen to the port.
-	wait_local_port_listen "${NAMESPACE}" "${PORT}" udp
-	# Send the message
-	echo "${MSG}: ${TARGET}" > /dev/kmsg
-	# Wait until socat saves the file to disk
-	busywait "${BUSYWAIT_TIMEOUT}" test -s "${OUTPUT_FILE}"
-	# Make sure the message was received in the dst part
-	# and exit
-	validate_msg "${OUTPUT_FILE}"
-
-	# kill socat in case it is still running
-	pkill_socat
-	# Cleanup & unload the module
-	cleanup
-
-	echo "${BINDMODE} : Test passed" >&2
-done
-
-trap - EXIT
-exit "${EXIT_STATUS}"
diff --git a/tools/testing/selftests/drivers/net/netcons_sysdata.sh b/tools/testing/selftests/drivers/net/netcons_sysdata.sh
deleted file mode 100755
index baf69031089e..000000000000
--- a/tools/testing/selftests/drivers/net/netcons_sysdata.sh
+++ /dev/null
@@ -1,272 +0,0 @@
-#!/usr/bin/env bash
-# SPDX-License-Identifier: GPL-2.0
-
-# A test that makes sure that sysdata runtime CPU data is properly set
-# when a message is sent.
-#
-# There are 3 different tests, every time sent using a random CPU.
-#  - Test #1
-#    * Only enable cpu_nr sysdata feature.
-#  - Test #2
-#    * Keep cpu_nr sysdata feature enable and enable userdata.
-#  - Test #3
-#    * keep userdata enabled, and disable sysdata cpu_nr feature.
-#
-# Author: Breno Leitao <leitao@debian.org>
-
-set -euo pipefail
-
-SCRIPTDIR=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")")
-
-source "${SCRIPTDIR}"/lib/sh/lib_netcons.sh
-
-# Enable the sysdata cpu_nr feature
-function set_cpu_nr() {
-	if [[ ! -f "${NETCONS_PATH}/userdata/cpu_nr_enabled" ]]
-	then
-		echo "Populate CPU configfs path not available in ${NETCONS_PATH}/userdata/cpu_nr_enabled" >&2
-		exit "${ksft_skip}"
-	fi
-
-	echo 1 > "${NETCONS_PATH}/userdata/cpu_nr_enabled"
-}
-
-# Enable the taskname to be appended to sysdata
-function set_taskname() {
-	if [[ ! -f "${NETCONS_PATH}/userdata/taskname_enabled" ]]
-	then
-		echo "Not able to enable taskname sysdata append. Configfs not available in ${NETCONS_PATH}/userdata/taskname_enabled" >&2
-		exit "${ksft_skip}"
-	fi
-
-	echo 1 > "${NETCONS_PATH}/userdata/taskname_enabled"
-}
-
-# Enable the release to be appended to sysdata
-function set_release() {
-	if [[ ! -f "${NETCONS_PATH}/userdata/release_enabled" ]]
-	then
-		echo "Not able to enable release sysdata append. Configfs not available in ${NETCONS_PATH}/userdata/release_enabled" >&2
-		exit "${ksft_skip}"
-	fi
-
-	echo 1 > "${NETCONS_PATH}/userdata/release_enabled"
-}
-
-# Enable the msgid to be appended to sysdata
-function set_msgid() {
-	if [[ ! -f "${NETCONS_PATH}/userdata/msgid_enabled" ]]
-	then
-		echo "Not able to enable msgid sysdata append. Configfs not available in ${NETCONS_PATH}/userdata/msgid_enabled" >&2
-		exit "${ksft_skip}"
-	fi
-
-	echo 1 > "${NETCONS_PATH}/userdata/msgid_enabled"
-}
-
-# Disable the sysdata cpu_nr feature
-function unset_cpu_nr() {
-	echo 0 > "${NETCONS_PATH}/userdata/cpu_nr_enabled"
-}
-
-# Once called, taskname=<..> will not be appended anymore
-function unset_taskname() {
-	echo 0 > "${NETCONS_PATH}/userdata/taskname_enabled"
-}
-
-function unset_release() {
-	echo 0 > "${NETCONS_PATH}/userdata/release_enabled"
-}
-
-function unset_msgid() {
-	echo 0 > "${NETCONS_PATH}/userdata/msgid_enabled"
-}
-
-# Test if MSG contains sysdata
-function validate_sysdata() {
-	# OUTPUT_FILE will contain something like:
-	# 6.11.1-0_fbk0_rc13_509_g30d75cea12f7,13,1822,115075213798,-;netconsole selftest: netcons_gtJHM
-	#  userdatakey=userdatavalue
-	#  cpu=X
-	#  taskname=<taskname>
-	#  msgid=<id>
-
-	# Echo is what this test uses to create the message. See runtest()
-	# function
-	SENDER="echo"
-
-	if [ ! -f "$OUTPUT_FILE" ]; then
-		echo "FAIL: File was not generated." >&2
-		exit "${ksft_fail}"
-	fi
-
-	if ! grep -q "${MSG}" "${OUTPUT_FILE}"; then
-		echo "FAIL: ${MSG} not found in ${OUTPUT_FILE}" >&2
-		cat "${OUTPUT_FILE}" >&2
-		exit "${ksft_fail}"
-	fi
-
-	# Check if cpu=XX exists in the file and matches the one used
-	# in taskset(1)
-	if ! grep -q "cpu=${CPU}\+" "${OUTPUT_FILE}"; then
-		echo "FAIL: 'cpu=${CPU}' not found in ${OUTPUT_FILE}" >&2
-		cat "${OUTPUT_FILE}" >&2
-		exit "${ksft_fail}"
-	fi
-
-	if ! grep -q "taskname=${SENDER}" "${OUTPUT_FILE}"; then
-		echo "FAIL: 'taskname=echo' not found in ${OUTPUT_FILE}" >&2
-		cat "${OUTPUT_FILE}" >&2
-		exit "${ksft_fail}"
-	fi
-
-	if ! grep -q "msgid=[0-9]\+$" "${OUTPUT_FILE}"; then
-		echo "FAIL: 'msgid=<id>' not found in ${OUTPUT_FILE}" >&2
-		cat "${OUTPUT_FILE}" >&2
-		exit "${ksft_fail}"
-	fi
-
-	rm "${OUTPUT_FILE}"
-	pkill_socat
-}
-
-function validate_release() {
-	RELEASE=$(uname -r)
-
-	if [ ! -f "$OUTPUT_FILE" ]; then
-		echo "FAIL: File was not generated." >&2
-		exit "${ksft_fail}"
-	fi
-
-	if ! grep -q "release=${RELEASE}" "${OUTPUT_FILE}"; then
-		echo "FAIL: 'release=${RELEASE}' not found in ${OUTPUT_FILE}" >&2
-		cat "${OUTPUT_FILE}" >&2
-		exit "${ksft_fail}"
-	fi
-}
-
-# Test if MSG content exists in OUTPUT_FILE but no `cpu=` and `taskname=`
-# strings
-function validate_no_sysdata() {
-	if [ ! -f "$OUTPUT_FILE" ]; then
-		echo "FAIL: File was not generated." >&2
-		exit "${ksft_fail}"
-	fi
-
-	if ! grep -q "${MSG}" "${OUTPUT_FILE}"; then
-		echo "FAIL: ${MSG} not found in ${OUTPUT_FILE}" >&2
-		cat "${OUTPUT_FILE}" >&2
-		exit "${ksft_fail}"
-	fi
-
-	if grep -q "cpu=" "${OUTPUT_FILE}"; then
-		echo "FAIL: 'cpu=  found in ${OUTPUT_FILE}" >&2
-		cat "${OUTPUT_FILE}" >&2
-		exit "${ksft_fail}"
-	fi
-
-	if grep -q "taskname=" "${OUTPUT_FILE}"; then
-		echo "FAIL: 'taskname=  found in ${OUTPUT_FILE}" >&2
-		cat "${OUTPUT_FILE}" >&2
-		exit "${ksft_fail}"
-	fi
-
-	if grep -q "release=" "${OUTPUT_FILE}"; then
-		echo "FAIL: 'release=  found in ${OUTPUT_FILE}" >&2
-		cat "${OUTPUT_FILE}" >&2
-		exit "${ksft_fail}"
-	fi
-
-	if grep -q "msgid=" "${OUTPUT_FILE}"; then
-		echo "FAIL: 'msgid=  found in ${OUTPUT_FILE}" >&2
-		cat "${OUTPUT_FILE}" >&2
-		exit "${ksft_fail}"
-	fi
-
-	rm "${OUTPUT_FILE}"
-}
-
-# Start socat, send the message and wait for the file to show up in the file
-# system
-function runtest {
-	# Listen for netconsole port inside the namespace and destination
-	# interface
-	listen_port_and_save_to "${OUTPUT_FILE}" &
-	# Wait for socat to start and listen to the port.
-	wait_local_port_listen "${NAMESPACE}" "${PORT}" udp
-	# Send the message
-	taskset -c "${CPU}" echo "${MSG}: ${TARGET}" > /dev/kmsg
-	# Wait until socat saves the file to disk
-	busywait "${BUSYWAIT_TIMEOUT}" test -s "${OUTPUT_FILE}"
-}
-
-# ========== #
-# Start here #
-# ========== #
-
-modprobe netdevsim 2> /dev/null || true
-modprobe netconsole 2> /dev/null || true
-
-# Check for basic system dependency and exit if not found
-check_for_dependencies
-# This test also depends on taskset(1). Check for it before starting the test
-check_for_taskset
-
-# Set current loglevel to KERN_INFO(6), and default to KERN_NOTICE(5)
-echo "6 5" > /proc/sys/kernel/printk
-# Remove the namespace, interfaces and netconsole target on exit
-trap cleanup EXIT
-# Create one namespace and two interfaces
-set_network
-# Create a dynamic target for netconsole
-create_dynamic_target
-
-#====================================================
-# TEST #1
-# Send message from a random CPU
-#====================================================
-# Random CPU in the system
-CPU=$((RANDOM % $(nproc)))
-OUTPUT_FILE="/tmp/${TARGET}_1"
-MSG="Test #1 from CPU${CPU}"
-# Enable the auto population of cpu_nr
-set_cpu_nr
-# Enable taskname to be appended to sysdata
-set_taskname
-set_release
-set_msgid
-runtest
-# Make sure the message was received in the dst part
-# and exit
-validate_release
-validate_sysdata
-
-#====================================================
-# TEST #2
-# This test now adds userdata together with sysdata
-# ===================================================
-# Get a new random CPU
-CPU=$((RANDOM % $(nproc)))
-OUTPUT_FILE="/tmp/${TARGET}_2"
-MSG="Test #2 from CPU${CPU}"
-set_user_data
-runtest
-validate_release
-validate_sysdata
-
-# ===================================================
-# TEST #3
-# Unset all sysdata, fail if any userdata is set
-# ===================================================
-CPU=$((RANDOM % $(nproc)))
-OUTPUT_FILE="/tmp/${TARGET}_3"
-MSG="Test #3 from CPU${CPU}"
-unset_cpu_nr
-unset_taskname
-unset_release
-unset_msgid
-runtest
-# At this time, cpu= shouldn't be present in the msg
-validate_no_sysdata
-
-exit "${ksft_pass}"
diff --git a/tools/testing/selftests/drivers/net/netcons_torture.sh b/tools/testing/selftests/drivers/net/netcons_torture.sh
deleted file mode 100755
index 2ce9ee3719d1..000000000000
--- a/tools/testing/selftests/drivers/net/netcons_torture.sh
+++ /dev/null
@@ -1,130 +0,0 @@
-#!/usr/bin/env bash
-# SPDX-License-Identifier: GPL-2.0
-
-# Repeatedly send kernel messages, toggles netconsole targets on and off,
-# creates and deletes targets in parallel, and toggles the source interface to
-# simulate stress conditions.
-#
-# This test aims to verify the robustness of netconsole under dynamic
-# configurations and concurrent operations.
-#
-# The major goal is to run this test with LOCKDEP, Kmemleak and KASAN to make
-# sure no issues is reported.
-#
-# Author: Breno Leitao <leitao@debian.org>
-
-set -euo pipefail
-
-SCRIPTDIR=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")")
-
-source "${SCRIPTDIR}"/lib/sh/lib_netcons.sh
-
-# Number of times the main loop run
-ITERATIONS=${1:-150}
-
-# Only test extended format
-FORMAT="extended"
-# And ipv6 only
-IP_VERSION="ipv6"
-
-# Create, enable and delete some targets.
-create_and_delete_random_target() {
-	COUNT=2
-	RND_PREFIX=$(mktemp -u netcons_rnd_XXXX_)
-
-	if [ -d "${NETCONS_CONFIGFS}/${RND_PREFIX}${COUNT}"  ] || \
-	   [ -d "${NETCONS_CONFIGFS}/${RND_PREFIX}0" ]; then
-		echo "Function didn't finish yet, skipping it." >&2
-		return
-	fi
-
-	# enable COUNT targets
-	for i in $(seq ${COUNT})
-	do
-		RND_TARGET="${RND_PREFIX}"${i}
-		RND_TARGET_PATH="${NETCONS_CONFIGFS}"/"${RND_TARGET}"
-
-		# Basic population so the target can come up
-		_create_dynamic_target "${FORMAT}" "${RND_TARGET_PATH}"
-	done
-
-	echo "netconsole selftest: ${COUNT} additional targets were created" > /dev/kmsg
-	# disable them all
-	for i in $(seq ${COUNT})
-	do
-		RND_TARGET="${RND_PREFIX}"${i}
-		RND_TARGET_PATH="${NETCONS_CONFIGFS}"/"${RND_TARGET}"
-		if [[ $(cat "${RND_TARGET_PATH}/enabled") -eq 1 ]]
-		then
-			echo 0 > "${RND_TARGET_PATH}"/enabled
-		fi
-		rmdir "${RND_TARGET_PATH}"
-	done
-}
-
-# Disable and enable the target mid-air, while messages
-# are being transmitted.
-toggle_netcons_target() {
-	for i in $(seq 2)
-	do
-		if [ ! -d "${NETCONS_PATH}" ]
-		then
-			break
-		fi
-		echo 0 > "${NETCONS_PATH}"/enabled 2> /dev/null || true
-		# Try to enable a bit harder, given it might fail to enable
-		# Write to `enabled` might fail depending on the lock, which is
-		# highly contentious here
-		for _ in $(seq 5)
-		do
-			echo 1 > "${NETCONS_PATH}"/enabled 2> /dev/null || true
-		done
-	done
-}
-
-toggle_iface(){
-	ip link set "${SRCIF}" down
-	ip link set "${SRCIF}" up
-}
-
-# Start here
-
-modprobe netdevsim 2> /dev/null || true
-modprobe netconsole 2> /dev/null || true
-
-# Check for basic system dependency and exit if not found
-check_for_dependencies
-# Set current loglevel to KERN_INFO(6), and default to KERN_NOTICE(5)
-echo "6 5" > /proc/sys/kernel/printk
-# Remove the namespace, interfaces and netconsole target on exit
-trap cleanup EXIT
-# Create one namespace and two interfaces
-set_network "${IP_VERSION}"
-# Create a dynamic target for netconsole
-create_dynamic_target "${FORMAT}"
-
-for i in $(seq "$ITERATIONS")
-do
-	for _ in $(seq 10)
-	do
-		echo "${MSG}: ${TARGET} ${i}" > /dev/kmsg
-	done
-	wait
-
-	if (( i % 30 == 0 )); then
-		toggle_netcons_target &
-	fi
-
-	if (( i % 50 == 0 )); then
-		# create some targets, enable them, send msg and disable
-		# all in a parallel thread
-		create_and_delete_random_target &
-	fi
-
-	if (( i % 70 == 0 )); then
-		toggle_iface &
-	fi
-done
-wait
-
-exit "${EXIT_STATUS}"
diff --git a/tools/testing/selftests/drivers/net/netconsole/Makefile b/tools/testing/selftests/drivers/net/netconsole/Makefile
new file mode 100644
index 000000000000..b56c70b7e274
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/netconsole/Makefile
@@ -0,0 +1,19 @@
+# SPDX-License-Identifier: GPL-2.0
+
+TEST_INCLUDES := \
+	../../../net/lib.sh \
+	../lib/sh/lib_netcons.sh \
+# end of TEST_INCLUDES
+
+TEST_PROGS := \
+	netcons_basic.sh \
+	netcons_cmdline.sh \
+	netcons_fragmented_msg.sh \
+	netcons_overflow.sh \
+	netcons_resume.sh \
+	netcons_sysdata.sh \
+	netcons_torture.sh \
+# end of TEST_PROGS
+
+include ../../../lib.mk
+
diff --git a/tools/testing/selftests/drivers/net/netconsole/config b/tools/testing/selftests/drivers/net/netconsole/config
new file mode 100644
index 000000000000..a3f6b0fd44ef
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/netconsole/config
@@ -0,0 +1,6 @@
+CONFIG_CONFIGFS_FS=y
+CONFIG_IPV6=y
+CONFIG_NETCONSOLE=m
+CONFIG_NETCONSOLE_DYNAMIC=y
+CONFIG_NETCONSOLE_EXTENDED_LOG=y
+CONFIG_NETDEVSIM=m
diff --git a/tools/testing/selftests/drivers/net/netconsole/netcons_basic.sh b/tools/testing/selftests/drivers/net/netconsole/netcons_basic.sh
new file mode 100755
index 000000000000..59cf10013ecd
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/netconsole/netcons_basic.sh
@@ -0,0 +1,74 @@
+#!/usr/bin/env bash
+# SPDX-License-Identifier: GPL-2.0
+
+# This test creates two netdevsim virtual interfaces, assigns one of them (the
+# "destination interface") to a new namespace, and assigns IP addresses to both
+# interfaces.
+#
+# It listens on the destination interface using socat and configures a dynamic
+# target on netconsole, pointing to the destination IP address.
+#
+# Finally, it checks whether the message was received properly on the
+# destination interface.  Note that this test may pollute the kernel log buffer
+# (dmesg) and relies on dynamic configuration and namespaces being configured.
+#
+# Author: Breno Leitao <leitao@debian.org>
+
+set -euo pipefail
+
+SCRIPTDIR=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")")
+
+source "${SCRIPTDIR}"/../lib/sh/lib_netcons.sh
+
+modprobe netdevsim 2> /dev/null || true
+modprobe netconsole 2> /dev/null || true
+
+# The content of kmsg will be save to the following file
+OUTPUT_FILE="/tmp/${TARGET}"
+
+# Check for basic system dependency and exit if not found
+check_for_dependencies
+# Remove the namespace, interfaces and netconsole target on exit
+trap cleanup EXIT
+
+# Run the test twice, with different format modes
+for FORMAT in "basic" "extended"
+do
+	for IP_VERSION in "ipv6" "ipv4"
+	do
+		echo "Running with target mode: ${FORMAT} (${IP_VERSION})"
+		# Set current loglevel to KERN_INFO(6), and default to
+		# KERN_NOTICE(5)
+		echo "6 5" > /proc/sys/kernel/printk
+		# Create one namespace and two interfaces
+		set_network "${IP_VERSION}"
+		# Create a dynamic target for netconsole
+		create_dynamic_target "${FORMAT}"
+		# Only set userdata for extended format
+		if [ "$FORMAT" == "extended" ]
+		then
+			# Set userdata "key" with the "value" value
+			set_user_data
+		fi
+		# Listed for netconsole port inside the namespace and
+		# destination interface
+		listen_port_and_save_to "${OUTPUT_FILE}" "${IP_VERSION}" &
+		# Wait for socat to start and listen to the port.
+		wait_for_port "${NAMESPACE}" "${PORT}" "${IP_VERSION}"
+		# Send the message
+		echo "${MSG}: ${TARGET}" > /dev/kmsg
+		# Wait until socat saves the file to disk
+		busywait "${BUSYWAIT_TIMEOUT}" test -s "${OUTPUT_FILE}"
+
+		# Make sure the message was received in the dst part
+		# and exit
+		validate_result "${OUTPUT_FILE}" "${FORMAT}"
+		# kill socat in case it is still running
+		pkill_socat
+		cleanup
+		echo "${FORMAT} : ${IP_VERSION} : Test passed" >&2
+	done
+done
+
+trap - EXIT
+exit "${ksft_pass}"
diff --git a/tools/testing/selftests/drivers/net/netconsole/netcons_cmdline.sh b/tools/testing/selftests/drivers/net/netconsole/netcons_cmdline.sh
new file mode 100755
index 000000000000..96d704b8d9d9
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/netconsole/netcons_cmdline.sh
@@ -0,0 +1,65 @@
+#!/usr/bin/env bash
+# SPDX-License-Identifier: GPL-2.0
+
+# This is a selftest to test cmdline arguments on netconsole.
+# It exercises loading of netconsole from cmdline instead of the dynamic
+# reconfiguration. This includes parsing the long netconsole= line and all the
+# flow through init_netconsole().
+#
+# Author: Breno Leitao <leitao@debian.org>
+
+set -euo pipefail
+
+SCRIPTDIR=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")")
+
+source "${SCRIPTDIR}"/../lib/sh/lib_netcons.sh
+
+check_netconsole_module
+
+modprobe netdevsim 2> /dev/null || true
+rmmod netconsole 2> /dev/null || true
+
+# Check for basic system dependency and exit if not found
+# check_for_dependencies
+# Set current loglevel to KERN_INFO(6), and default to KERN_NOTICE(5)
+echo "6 5" > /proc/sys/kernel/printk
+# Remove the namespace and network interfaces
+trap do_cleanup EXIT
+# Create one namespace and two interfaces
+set_network
+
+# Run the test twice, with different cmdline parameters
+for BINDMODE in "ifname" "mac"
+do
+	echo "Running with bind mode: ${BINDMODE}" >&2
+	# Create the command line for netconsole, with the configuration from
+	# the function above
+	CMDLINE=$(create_cmdline_str "${BINDMODE}")
+
+	# The content of kmsg will be save to the following file
+	OUTPUT_FILE="/tmp/${TARGET}-${BINDMODE}"
+
+	# Load the module, with the cmdline set
+	modprobe netconsole "${CMDLINE}"
+
+	# Listed for netconsole port inside the namespace and destination
+	# interface
+	listen_port_and_save_to "${OUTPUT_FILE}" &
+	# Wait for socat to start and listen to the port.
+	wait_local_port_listen "${NAMESPACE}" "${PORT}" udp
+	# Send the message
+	echo "${MSG}: ${TARGET}" > /dev/kmsg
+	# Wait until socat saves the file to disk
+	busywait "${BUSYWAIT_TIMEOUT}" test -s "${OUTPUT_FILE}"
+	# Make sure the message was received in the dst part
+	# and exit
+	validate_msg "${OUTPUT_FILE}"
+
+	# kill socat in case it is still running
+	pkill_socat
+	# Unload the module
+	rmmod netconsole
+	echo "${BINDMODE} : Test passed" >&2
+done
+
+exit "${ksft_pass}"
diff --git a/tools/testing/selftests/drivers/net/netconsole/netcons_fragmented_msg.sh b/tools/testing/selftests/drivers/net/netconsole/netcons_fragmented_msg.sh
new file mode 100755
index 000000000000..0dc7280c3080
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/netconsole/netcons_fragmented_msg.sh
@@ -0,0 +1,122 @@
+#!/usr/bin/env bash
+# SPDX-License-Identifier: GPL-2.0
+
+# Test netconsole's message fragmentation functionality.
+#
+# When a message exceeds the maximum packet size, netconsole splits it into
+# multiple fragments for transmission. This test verifies:
+#  - Correct fragmentation of large messages
+#  - Proper reassembly of fragments at the receiver
+#  - Preservation of userdata across fragments
+#  - Behavior with and without kernel release version appending
+#
+# Author: Breno Leitao <leitao@debian.org>
+
+set -euo pipefail
+
+SCRIPTDIR=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")")
+
+source "${SCRIPTDIR}"/../lib/sh/lib_netcons.sh
+
+modprobe netdevsim 2> /dev/null || true
+modprobe netconsole 2> /dev/null || true
+
+# The content of kmsg will be save to the following file
+OUTPUT_FILE="/tmp/${TARGET}"
+
+# set userdata to a long value. In this case, it is "1-2-3-4...50-"
+USERDATA_VALUE=$(printf -- '%.2s-' {1..60})
+
+# Convert the header string in a regexp, so, we can remove
+# the second header as well.
+# A header looks like "13,468,514729715,-,ncfrag=0/1135;". If
+# release is appended, you might find something like:L
+# "6.13.0-04048-g4f561a87745a,13,468,514729715,-,ncfrag=0/1135;"
+function header_to_regex() {
+	# header is everything before ;
+	local HEADER="${1}"
+	REGEX=$(echo "${HEADER}" | cut -d'=' -f1)
+	echo "${REGEX}=[0-9]*\/[0-9]*;"
+}
+
+# We have two headers in the message. Remove both to get the full message,
+# and extract the full message.
+function extract_msg() {
+	local MSGFILE="${1}"
+	# Extract the header, which is the very first thing that arrives in the
+	# first list.
+	HEADER=$(sed -n '1p' "${MSGFILE}" | cut -d';' -f1)
+	HEADER_REGEX=$(header_to_regex "${HEADER}")
+
+	# Remove the two headers from the received message
+	# This will return the message without any header, similarly to what
+	# was sent.
+	sed "s/""${HEADER_REGEX}""//g" "${MSGFILE}"
+}
+
+# Validate the message, which has two messages glued together.
+# unwrap them to make sure all the characters were transmitted.
+# File will look like the following:
+#  13,468,514729715,-,ncfrag=0/1135;<message>
+#   key=<part of key>-13,468,514729715,-,ncfrag=967/1135;<rest of the key>
+function validate_fragmented_result() {
+	# Discard the netconsole headers, and assemble the full message
+	RCVMSG=$(extract_msg "${1}")
+
+	# check for the main message
+	if ! echo "${RCVMSG}" | grep -q "${MSG}"; then
+		echo "Message body doesn't match." >&2
+		echo "msg received=" "${RCVMSG}" >&2
+		exit "${ksft_fail}"
+	fi
+
+	# check userdata
+	if ! echo "${RCVMSG}" | grep -q "${USERDATA_VALUE}"; then
+		echo "message userdata doesn't match" >&2
+		echo "msg received=" "${RCVMSG}" >&2
+		exit "${ksft_fail}"
+	fi
+	# test passed. hooray
+}
+
+# Check for basic system dependency and exit if not found
+check_for_dependencies
+# Set current loglevel to KERN_INFO(6), and default to KERN_NOTICE(5)
+echo "6 5" > /proc/sys/kernel/printk
+# Remove the namespace, interfaces and netconsole target on exit
+trap cleanup EXIT
+# Create one namespace and two interfaces
+set_network
+# Create a dynamic target for netconsole
+create_dynamic_target
+# Set userdata "key" with the "value" value
+set_user_data
+
+
+# TEST 1: Send message and userdata. They will fragment
+# =======
+MSG=$(printf -- 'MSG%.3s=' {1..150})
+
+# Listen for netconsole port inside the namespace and destination interface
+listen_port_and_save_to "${OUTPUT_FILE}" &
+# Wait for socat to start and listen to the port.
+wait_local_port_listen "${NAMESPACE}" "${PORT}" udp
+# Send the message
+echo "${MSG}: ${TARGET}" > /dev/kmsg
+# Wait until socat saves the file to disk
+busywait "${BUSYWAIT_TIMEOUT}" test -s "${OUTPUT_FILE}"
+# Check if the message was not corrupted
+validate_fragmented_result "${OUTPUT_FILE}"
+
+# TEST 2: Test with smaller message, and without release appended
+# =======
+MSG=$(printf -- 'FOOBAR%.3s=' {1..100})
+# Let's disable release and test again.
+disable_release_append
+
+listen_port_and_save_to "${OUTPUT_FILE}" &
+wait_local_port_listen "${NAMESPACE}" "${PORT}" udp
+echo "${MSG}: ${TARGET}" > /dev/kmsg
+busywait "${BUSYWAIT_TIMEOUT}" test -s "${OUTPUT_FILE}"
+validate_fragmented_result "${OUTPUT_FILE}"
+exit "${ksft_pass}"
diff --git a/tools/testing/selftests/drivers/net/netconsole/netcons_overflow.sh b/tools/testing/selftests/drivers/net/netconsole/netcons_overflow.sh
new file mode 100755
index 000000000000..a8e43d08c166
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/netconsole/netcons_overflow.sh
@@ -0,0 +1,67 @@
+#!/usr/bin/env bash
+# SPDX-License-Identifier: GPL-2.0
+
+# This test verifies that users can successfully create up to
+# MAX_USERDATA_ITEMS userdata entries without encountering any failures.
+#
+# Additionally, it tests for expected failure when attempting to exceed this
+# maximum limit.
+#
+# Author: Breno Leitao <leitao@debian.org>
+
+set -euo pipefail
+
+SCRIPTDIR=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")")
+
+source "${SCRIPTDIR}"/../lib/sh/lib_netcons.sh
+# This is coming from netconsole code. Check for it in drivers/net/netconsole.c
+MAX_USERDATA_ITEMS=256
+
+# Function to create userdata entries
+function create_userdata_max_entries() {
+	# All these keys should be created without any error
+	for i in $(seq $MAX_USERDATA_ITEMS)
+	do
+		# USERDATA_KEY is used by set_user_data
+		USERDATA_KEY="key"${i}
+		set_user_data
+	done
+}
+
+# Function to verify the entry limit
+function verify_entry_limit() {
+	# Allowing the test to fail without exiting, since the next command
+	# will fail
+	set +e
+	mkdir "${NETCONS_PATH}/userdata/key_that_will_fail" 2> /dev/null
+	ret="$?"
+	set -e
+	if [ "$ret" -eq 0 ];
+	then
+		echo "Adding more than ${MAX_USERDATA_ITEMS} entries in userdata should fail, but it didn't" >&2
+		ls "${NETCONS_PATH}/userdata/" >&2
+		exit "${ksft_fail}"
+	fi
+}
+
+# ========== #
+# Start here #
+# ========== #
+
+modprobe netdevsim 2> /dev/null || true
+modprobe netconsole 2> /dev/null || true
+
+# Check for basic system dependency and exit if not found
+check_for_dependencies
+
+# Remove the namespace, interfaces and netconsole target on exit
+trap cleanup EXIT
+# Create one namespace and two interfaces
+set_network
+# Create a dynamic target for netconsole
+create_dynamic_target
+# populate the maximum number of supported keys in userdata
+create_userdata_max_entries
+# Verify an additional entry is not allowed
+verify_entry_limit
+exit "${ksft_pass}"
diff --git a/tools/testing/selftests/drivers/net/netconsole/netcons_resume.sh b/tools/testing/selftests/drivers/net/netconsole/netcons_resume.sh
new file mode 100755
index 000000000000..cb59cf436dd0
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/netconsole/netcons_resume.sh
@@ -0,0 +1,124 @@
+#!/usr/bin/env bash
+# SPDX-License-Identifier: GPL-2.0
+
+# This test validates that netconsole is able to resume a target that was
+# deactivated when its interface was removed when the interface is brought
+# back up.
+#
+# The test configures a netconsole target and then removes netdevsim module to
+# cause the interface to disappear. Targets are configured via cmdline to ensure
+# targets bound by interface name and mac address can be resumed.
+# The test verifies that the target moved to disabled state before adding
+# netdevsim and the interface back.
+#
+# Finally, the test verifies that the target is re-enabled automatically and
+# the message is received on the destination interface.
+#
+# Author: Andre Carvalho <asantostc@gmail.com>
+
+set -euo pipefail
+
+SCRIPTDIR=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")")
+
+source "${SCRIPTDIR}"/../lib/sh/lib_netcons.sh
+
+SAVED_SRCMAC="" # to be populated later
+SAVED_DSTMAC="" # to be populated later
+
+modprobe netdevsim 2> /dev/null || true
+rmmod netconsole 2> /dev/null || true
+
+check_netconsole_module
+
+function cleanup() {
+	cleanup_netcons "${NETCONS_CONFIGFS}/cmdline0"
+	do_cleanup
+	rmmod netconsole
+}
+
+function trigger_reactivation() {
+	# Add back low level module
+	modprobe netdevsim
+	# Recreate namespace and two interfaces
+	set_network
+	# Restore MACs
+	ip netns exec "${NAMESPACE}" ip link set "${DSTIF}" \
+		address "${SAVED_DSTMAC}"
+	if [ "${BINDMODE}" == "mac" ]; then
+		ip link set dev "${SRCIF}" down
+		ip link set dev "${SRCIF}" address "${SAVED_SRCMAC}"
+		# Rename device in order to trigger target resume, as initial
+		# when device was recreated it didn't have correct mac address.
+		ip link set dev "${SRCIF}" name "${TARGET}"
+	fi
+}
+
+function trigger_deactivation() {
+	# Start by storing mac addresses so we can be restored in reactivate
+	SAVED_DSTMAC=$(ip netns exec "${NAMESPACE}" \
+		cat /sys/class/net/"$DSTIF"/address)
+	SAVED_SRCMAC=$(mac_get "${SRCIF}")
+	# Remove low level module
+	rmmod netdevsim
+}
+
+trap cleanup EXIT
+
+# Run the test twice, with different cmdline parameters
+for BINDMODE in "ifname" "mac"
+do
+	echo "Running with bind mode: ${BINDMODE}" >&2
+	# Set current loglevel to KERN_INFO(6), and default to KERN_NOTICE(5)
+	echo "6 5" > /proc/sys/kernel/printk
+
+	# Create one namespace and two interfaces
+	set_network
+
+	# Create the command line for netconsole, with the configuration from
+	# the function above
+	CMDLINE=$(create_cmdline_str "${BINDMODE}")
+
+	# The content of kmsg will be save to the following file
+	OUTPUT_FILE="/tmp/${TARGET}-${BINDMODE}"
+
+	# Load the module, with the cmdline set
+	modprobe netconsole "${CMDLINE}"
+	# Expose cmdline target in configfs
+	mkdir "${NETCONS_CONFIGFS}/cmdline0"
+
+	# Target should be enabled
+	wait_target_state "cmdline0" "enabled"
+
+	# Trigger deactivation by unloading netdevsim module. Target should be
+	# disabled.
+	trigger_deactivation
+	wait_target_state "cmdline0" "disabled"
+
+	# Trigger reactivation by loading netdevsim, recreating the network and
+	# restoring mac addresses. Target should be re-enabled.
+	trigger_reactivation
+	wait_target_state "cmdline0" "enabled"
+
+	# Listen for netconsole port inside the namespace and destination
+	# interface
+	listen_port_and_save_to "${OUTPUT_FILE}" &
+	# Wait for socat to start and listen to the port.
+	wait_local_port_listen "${NAMESPACE}" "${PORT}" udp
+	# Send the message
+	echo "${MSG}: ${TARGET}" > /dev/kmsg
+	# Wait until socat saves the file to disk
+	busywait "${BUSYWAIT_TIMEOUT}" test -s "${OUTPUT_FILE}"
+	# Make sure the message was received in the dst part
+	# and exit
+	validate_msg "${OUTPUT_FILE}"
+
+	# kill socat in case it is still running
+	pkill_socat
+	# Cleanup & unload the module
+	cleanup
+
+	echo "${BINDMODE} : Test passed" >&2
+done
+
+trap - EXIT
+exit "${EXIT_STATUS}"
diff --git a/tools/testing/selftests/drivers/net/netconsole/netcons_sysdata.sh b/tools/testing/selftests/drivers/net/netconsole/netcons_sysdata.sh
new file mode 100755
index 000000000000..3fb8c4afe3d2
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/netconsole/netcons_sysdata.sh
@@ -0,0 +1,272 @@
+#!/usr/bin/env bash
+# SPDX-License-Identifier: GPL-2.0
+
+# A test that makes sure that sysdata runtime CPU data is properly set
+# when a message is sent.
+#
+# There are 3 different tests, every time sent using a random CPU.
+#  - Test #1
+#    * Only enable cpu_nr sysdata feature.
+#  - Test #2
+#    * Keep cpu_nr sysdata feature enable and enable userdata.
+#  - Test #3
+#    * keep userdata enabled, and disable sysdata cpu_nr feature.
+#
+# Author: Breno Leitao <leitao@debian.org>
+
+set -euo pipefail
+
+SCRIPTDIR=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")")
+
+source "${SCRIPTDIR}"/../lib/sh/lib_netcons.sh
+
+# Enable the sysdata cpu_nr feature
+function set_cpu_nr() {
+	if [[ ! -f "${NETCONS_PATH}/userdata/cpu_nr_enabled" ]]
+	then
+		echo "Populate CPU configfs path not available in ${NETCONS_PATH}/userdata/cpu_nr_enabled" >&2
+		exit "${ksft_skip}"
+	fi
+
+	echo 1 > "${NETCONS_PATH}/userdata/cpu_nr_enabled"
+}
+
+# Enable the taskname to be appended to sysdata
+function set_taskname() {
+	if [[ ! -f "${NETCONS_PATH}/userdata/taskname_enabled" ]]
+	then
+		echo "Not able to enable taskname sysdata append. Configfs not available in ${NETCONS_PATH}/userdata/taskname_enabled" >&2
+		exit "${ksft_skip}"
+	fi
+
+	echo 1 > "${NETCONS_PATH}/userdata/taskname_enabled"
+}
+
+# Enable the release to be appended to sysdata
+function set_release() {
+	if [[ ! -f "${NETCONS_PATH}/userdata/release_enabled" ]]
+	then
+		echo "Not able to enable release sysdata append. Configfs not available in ${NETCONS_PATH}/userdata/release_enabled" >&2
+		exit "${ksft_skip}"
+	fi
+
+	echo 1 > "${NETCONS_PATH}/userdata/release_enabled"
+}
+
+# Enable the msgid to be appended to sysdata
+function set_msgid() {
+	if [[ ! -f "${NETCONS_PATH}/userdata/msgid_enabled" ]]
+	then
+		echo "Not able to enable msgid sysdata append. Configfs not available in ${NETCONS_PATH}/userdata/msgid_enabled" >&2
+		exit "${ksft_skip}"
+	fi
+
+	echo 1 > "${NETCONS_PATH}/userdata/msgid_enabled"
+}
+
+# Disable the sysdata cpu_nr feature
+function unset_cpu_nr() {
+	echo 0 > "${NETCONS_PATH}/userdata/cpu_nr_enabled"
+}
+
+# Once called, taskname=<..> will not be appended anymore
+function unset_taskname() {
+	echo 0 > "${NETCONS_PATH}/userdata/taskname_enabled"
+}
+
+function unset_release() {
+	echo 0 > "${NETCONS_PATH}/userdata/release_enabled"
+}
+
+function unset_msgid() {
+	echo 0 > "${NETCONS_PATH}/userdata/msgid_enabled"
+}
+
+# Test if MSG contains sysdata
+function validate_sysdata() {
+	# OUTPUT_FILE will contain something like:
+	# 6.11.1-0_fbk0_rc13_509_g30d75cea12f7,13,1822,115075213798,-;netconsole selftest: netcons_gtJHM
+	#  userdatakey=userdatavalue
+	#  cpu=X
+	#  taskname=<taskname>
+	#  msgid=<id>
+
+	# Echo is what this test uses to create the message. See runtest()
+	# function
+	SENDER="echo"
+
+	if [ ! -f "$OUTPUT_FILE" ]; then
+		echo "FAIL: File was not generated." >&2
+		exit "${ksft_fail}"
+	fi
+
+	if ! grep -q "${MSG}" "${OUTPUT_FILE}"; then
+		echo "FAIL: ${MSG} not found in ${OUTPUT_FILE}" >&2
+		cat "${OUTPUT_FILE}" >&2
+		exit "${ksft_fail}"
+	fi
+
+	# Check if cpu=XX exists in the file and matches the one used
+	# in taskset(1)
+	if ! grep -q "cpu=${CPU}\+" "${OUTPUT_FILE}"; then
+		echo "FAIL: 'cpu=${CPU}' not found in ${OUTPUT_FILE}" >&2
+		cat "${OUTPUT_FILE}" >&2
+		exit "${ksft_fail}"
+	fi
+
+	if ! grep -q "taskname=${SENDER}" "${OUTPUT_FILE}"; then
+		echo "FAIL: 'taskname=echo' not found in ${OUTPUT_FILE}" >&2
+		cat "${OUTPUT_FILE}" >&2
+		exit "${ksft_fail}"
+	fi
+
+	if ! grep -q "msgid=[0-9]\+$" "${OUTPUT_FILE}"; then
+		echo "FAIL: 'msgid=<id>' not found in ${OUTPUT_FILE}" >&2
+		cat "${OUTPUT_FILE}" >&2
+		exit "${ksft_fail}"
+	fi
+
+	rm "${OUTPUT_FILE}"
+	pkill_socat
+}
+
+function validate_release() {
+	RELEASE=$(uname -r)
+
+	if [ ! -f "$OUTPUT_FILE" ]; then
+		echo "FAIL: File was not generated." >&2
+		exit "${ksft_fail}"
+	fi
+
+	if ! grep -q "release=${RELEASE}" "${OUTPUT_FILE}"; then
+		echo "FAIL: 'release=${RELEASE}' not found in ${OUTPUT_FILE}" >&2
+		cat "${OUTPUT_FILE}" >&2
+		exit "${ksft_fail}"
+	fi
+}
+
+# Test if MSG content exists in OUTPUT_FILE but no `cpu=` and `taskname=`
+# strings
+function validate_no_sysdata() {
+	if [ ! -f "$OUTPUT_FILE" ]; then
+		echo "FAIL: File was not generated." >&2
+		exit "${ksft_fail}"
+	fi
+
+	if ! grep -q "${MSG}" "${OUTPUT_FILE}"; then
+		echo "FAIL: ${MSG} not found in ${OUTPUT_FILE}" >&2
+		cat "${OUTPUT_FILE}" >&2
+		exit "${ksft_fail}"
+	fi
+
+	if grep -q "cpu=" "${OUTPUT_FILE}"; then
+		echo "FAIL: 'cpu=  found in ${OUTPUT_FILE}" >&2
+		cat "${OUTPUT_FILE}" >&2
+		exit "${ksft_fail}"
+	fi
+
+	if grep -q "taskname=" "${OUTPUT_FILE}"; then
+		echo "FAIL: 'taskname=  found in ${OUTPUT_FILE}" >&2
+		cat "${OUTPUT_FILE}" >&2
+		exit "${ksft_fail}"
+	fi
+
+	if grep -q "release=" "${OUTPUT_FILE}"; then
+		echo "FAIL: 'release=  found in ${OUTPUT_FILE}" >&2
+		cat "${OUTPUT_FILE}" >&2
+		exit "${ksft_fail}"
+	fi
+
+	if grep -q "msgid=" "${OUTPUT_FILE}"; then
+		echo "FAIL: 'msgid=  found in ${OUTPUT_FILE}" >&2
+		cat "${OUTPUT_FILE}" >&2
+		exit "${ksft_fail}"
+	fi
+
+	rm "${OUTPUT_FILE}"
+}
+
+# Start socat, send the message and wait for the file to show up in the file
+# system
+function runtest {
+	# Listen for netconsole port inside the namespace and destination
+	# interface
+	listen_port_and_save_to "${OUTPUT_FILE}" &
+	# Wait for socat to start and listen to the port.
+	wait_local_port_listen "${NAMESPACE}" "${PORT}" udp
+	# Send the message
+	taskset -c "${CPU}" echo "${MSG}: ${TARGET}" > /dev/kmsg
+	# Wait until socat saves the file to disk
+	busywait "${BUSYWAIT_TIMEOUT}" test -s "${OUTPUT_FILE}"
+}
+
+# ========== #
+# Start here #
+# ========== #
+
+modprobe netdevsim 2> /dev/null || true
+modprobe netconsole 2> /dev/null || true
+
+# Check for basic system dependency and exit if not found
+check_for_dependencies
+# This test also depends on taskset(1). Check for it before starting the test
+check_for_taskset
+
+# Set current loglevel to KERN_INFO(6), and default to KERN_NOTICE(5)
+echo "6 5" > /proc/sys/kernel/printk
+# Remove the namespace, interfaces and netconsole target on exit
+trap cleanup EXIT
+# Create one namespace and two interfaces
+set_network
+# Create a dynamic target for netconsole
+create_dynamic_target
+
+#====================================================
+# TEST #1
+# Send message from a random CPU
+#====================================================
+# Random CPU in the system
+CPU=$((RANDOM % $(nproc)))
+OUTPUT_FILE="/tmp/${TARGET}_1"
+MSG="Test #1 from CPU${CPU}"
+# Enable the auto population of cpu_nr
+set_cpu_nr
+# Enable taskname to be appended to sysdata
+set_taskname
+set_release
+set_msgid
+runtest
+# Make sure the message was received in the dst part
+# and exit
+validate_release
+validate_sysdata
+
+#====================================================
+# TEST #2
+# This test now adds userdata together with sysdata
+# ===================================================
+# Get a new random CPU
+CPU=$((RANDOM % $(nproc)))
+OUTPUT_FILE="/tmp/${TARGET}_2"
+MSG="Test #2 from CPU${CPU}"
+set_user_data
+runtest
+validate_release
+validate_sysdata
+
+# ===================================================
+# TEST #3
+# Unset all sysdata, fail if any userdata is set
+# ===================================================
+CPU=$((RANDOM % $(nproc)))
+OUTPUT_FILE="/tmp/${TARGET}_3"
+MSG="Test #3 from CPU${CPU}"
+unset_cpu_nr
+unset_taskname
+unset_release
+unset_msgid
+runtest
+# At this time, cpu= shouldn't be present in the msg
+validate_no_sysdata
+
+exit "${ksft_pass}"
diff --git a/tools/testing/selftests/drivers/net/netconsole/netcons_torture.sh b/tools/testing/selftests/drivers/net/netconsole/netcons_torture.sh
new file mode 100755
index 000000000000..33a44adb6f8f
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/netconsole/netcons_torture.sh
@@ -0,0 +1,130 @@
+#!/usr/bin/env bash
+# SPDX-License-Identifier: GPL-2.0
+
+# Repeatedly send kernel messages, toggles netconsole targets on and off,
+# creates and deletes targets in parallel, and toggles the source interface to
+# simulate stress conditions.
+#
+# This test aims to verify the robustness of netconsole under dynamic
+# configurations and concurrent operations.
+#
+# The major goal is to run this test with LOCKDEP, Kmemleak and KASAN to make
+# sure no issues is reported.
+#
+# Author: Breno Leitao <leitao@debian.org>
+
+set -euo pipefail
+
+SCRIPTDIR=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")")
+
+source "${SCRIPTDIR}"/../lib/sh/lib_netcons.sh
+
+# Number of times the main loop run
+ITERATIONS=${1:-150}
+
+# Only test extended format
+FORMAT="extended"
+# And ipv6 only
+IP_VERSION="ipv6"
+
+# Create, enable and delete some targets.
+create_and_delete_random_target() {
+	COUNT=2
+	RND_PREFIX=$(mktemp -u netcons_rnd_XXXX_)
+
+	if [ -d "${NETCONS_CONFIGFS}/${RND_PREFIX}${COUNT}"  ] || \
+	   [ -d "${NETCONS_CONFIGFS}/${RND_PREFIX}0" ]; then
+		echo "Function didn't finish yet, skipping it." >&2
+		return
+	fi
+
+	# enable COUNT targets
+	for i in $(seq ${COUNT})
+	do
+		RND_TARGET="${RND_PREFIX}"${i}
+		RND_TARGET_PATH="${NETCONS_CONFIGFS}"/"${RND_TARGET}"
+
+		# Basic population so the target can come up
+		_create_dynamic_target "${FORMAT}" "${RND_TARGET_PATH}"
+	done
+
+	echo "netconsole selftest: ${COUNT} additional targets were created" > /dev/kmsg
+	# disable them all
+	for i in $(seq ${COUNT})
+	do
+		RND_TARGET="${RND_PREFIX}"${i}
+		RND_TARGET_PATH="${NETCONS_CONFIGFS}"/"${RND_TARGET}"
+		if [[ $(cat "${RND_TARGET_PATH}/enabled") -eq 1 ]]
+		then
+			echo 0 > "${RND_TARGET_PATH}"/enabled
+		fi
+		rmdir "${RND_TARGET_PATH}"
+	done
+}
+
+# Disable and enable the target mid-air, while messages
+# are being transmitted.
+toggle_netcons_target() {
+	for i in $(seq 2)
+	do
+		if [ ! -d "${NETCONS_PATH}" ]
+		then
+			break
+		fi
+		echo 0 > "${NETCONS_PATH}"/enabled 2> /dev/null || true
+		# Try to enable a bit harder, given it might fail to enable
+		# Write to `enabled` might fail depending on the lock, which is
+		# highly contentious here
+		for _ in $(seq 5)
+		do
+			echo 1 > "${NETCONS_PATH}"/enabled 2> /dev/null || true
+		done
+	done
+}
+
+toggle_iface(){
+	ip link set "${SRCIF}" down
+	ip link set "${SRCIF}" up
+}
+
+# Start here
+
+modprobe netdevsim 2> /dev/null || true
+modprobe netconsole 2> /dev/null || true
+
+# Check for basic system dependency and exit if not found
+check_for_dependencies
+# Set current loglevel to KERN_INFO(6), and default to KERN_NOTICE(5)
+echo "6 5" > /proc/sys/kernel/printk
+# Remove the namespace, interfaces and netconsole target on exit
+trap cleanup EXIT
+# Create one namespace and two interfaces
+set_network "${IP_VERSION}"
+# Create a dynamic target for netconsole
+create_dynamic_target "${FORMAT}"
+
+for i in $(seq "$ITERATIONS")
+do
+	for _ in $(seq 10)
+	do
+		echo "${MSG}: ${TARGET} ${i}" > /dev/kmsg
+	done
+	wait
+
+	if (( i % 30 == 0 )); then
+		toggle_netcons_target &
+	fi
+
+	if (( i % 50 == 0 )); then
+		# create some targets, enable them, send msg and disable
+		# all in a parallel thread
+		create_and_delete_random_target &
+	fi
+
+	if (( i % 70 == 0 )); then
+		toggle_iface &
+	fi
+done
+wait
+
+exit "${EXIT_STATUS}"
-- 
cgit v1.2.3


From 7b85c77585409f76609c817b760db60f3bf8fd33 Mon Sep 17 00:00:00 2001
From: Nimrod Oren <noren@nvidia.com>
Date: Wed, 28 Jan 2026 11:02:17 +0200
Subject: selftests: drv-net: rss_flow_label: skip unsupported devices

The test_rss_flow_label_6only test case fails on devices that do not
support IPv6 flow label hashing. Make it skip neatly, consistent with
the behavior of the test_rss_flow_label case.

Reviewed-by: Gal Pressman <gal@nvidia.com>
Signed-off-by: Nimrod Oren <noren@nvidia.com>
Link: https://patch.msgid.link/20260128090217.663366-1-noren@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/drivers/net/hw/rss_flow_label.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/drivers/net/hw/rss_flow_label.py b/tools/testing/selftests/drivers/net/hw/rss_flow_label.py
index 6fa95fe27c47..7dc80070884a 100755
--- a/tools/testing/selftests/drivers/net/hw/rss_flow_label.py
+++ b/tools/testing/selftests/drivers/net/hw/rss_flow_label.py
@@ -145,9 +145,14 @@ def test_rss_flow_label_6only(cfg):
 
     # Try to enable Flow Labels and check again, in case it leaks thru
     initial = _ethtool_get_cfg(cfg, "udp6")
-    changed = initial.replace("l", "") if "l" in initial else initial + "l"
-
-    cmd(f"ethtool -N {cfg.ifname} rx-flow-hash udp6 {changed}")
+    no_lbl = initial.replace("l", "")
+    if "l" not in initial:
+        try:
+            cmd(f"ethtool -N {cfg.ifname} rx-flow-hash udp6 l{no_lbl}")
+        except CmdExitFailure as exc:
+            raise KsftSkipEx("Device doesn't support Flow Label for UDP6") from exc
+    else:
+        cmd(f"ethtool -N {cfg.ifname} rx-flow-hash udp6 {no_lbl}")
     restore = defer(cmd, f"ethtool -N {cfg.ifname} rx-flow-hash udp6 {initial}")
 
     _check_v4_flow_types(cfg)
-- 
cgit v1.2.3


From 15ac1adf0f84a90605121fbe4a6238b24c865f92 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Fri, 30 Jan 2026 09:12:08 +0100
Subject: selftests/bpf: Add test for sleepable program tailcalls

Adding test that makes sure we can't mix sleepable and non-sleepable
bpf programs in the BPF_MAP_TYPE_PROG_ARRAY map and that we can do
tail call in the sleepable program.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Link: https://lore.kernel.org/r/20260130081208.1130204-3-jolsa@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/prog_tests/tailcalls.c | 74 ++++++++++++++++++++++
 .../selftests/bpf/progs/tailcall_sleepable.c       | 43 +++++++++++++
 2 files changed, 117 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/progs/tailcall_sleepable.c

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/tailcalls.c b/tools/testing/selftests/bpf/prog_tests/tailcalls.c
index 0ab36503c3b2..7d534fde0af9 100644
--- a/tools/testing/selftests/bpf/prog_tests/tailcalls.c
+++ b/tools/testing/selftests/bpf/prog_tests/tailcalls.c
@@ -8,6 +8,7 @@
 #include "tailcall_freplace.skel.h"
 #include "tc_bpf2bpf.skel.h"
 #include "tailcall_fail.skel.h"
+#include "tailcall_sleepable.skel.h"
 
 /* test_tailcall_1 checks basic functionality by patching multiple locations
  * in a single program for a single tail call slot with nop->jmp, jmp->nop
@@ -1653,6 +1654,77 @@ static void test_tailcall_failure()
 	RUN_TESTS(tailcall_fail);
 }
 
+noinline void uprobe_sleepable_trigger(void)
+{
+	asm volatile ("");
+}
+
+static void test_tailcall_sleepable(void)
+{
+	LIBBPF_OPTS(bpf_uprobe_opts, opts);
+	struct tailcall_sleepable *skel;
+	int prog_fd, map_fd;
+	int err, key;
+
+	skel = tailcall_sleepable__open();
+	if (!ASSERT_OK_PTR(skel, "tailcall_sleepable__open"))
+		return;
+
+	/*
+	 * Test that we can't load uprobe_normal and uprobe_sleepable_1,
+	 * because they share tailcall map.
+	 */
+	bpf_program__set_autoload(skel->progs.uprobe_normal, true);
+	bpf_program__set_autoload(skel->progs.uprobe_sleepable_1, true);
+
+	err = tailcall_sleepable__load(skel);
+	if (!ASSERT_ERR(err, "tailcall_sleepable__load"))
+		goto out;
+
+	tailcall_sleepable__destroy(skel);
+
+	/*
+	 * Test that we can tail call from sleepable to sleepable program.
+	 */
+	skel = tailcall_sleepable__open();
+	if (!ASSERT_OK_PTR(skel, "tailcall_sleepable__open"))
+		return;
+
+	bpf_program__set_autoload(skel->progs.uprobe_sleepable_1, true);
+	bpf_program__set_autoload(skel->progs.uprobe_sleepable_2, true);
+
+	err = tailcall_sleepable__load(skel);
+	if (!ASSERT_OK(err, "tailcall_sleepable__load"))
+		goto out;
+
+	/* Add sleepable uprobe_sleepable_2 to jmp_table[0]. */
+	key = 0;
+	prog_fd = bpf_program__fd(skel->progs.uprobe_sleepable_2);
+	map_fd = bpf_map__fd(skel->maps.jmp_table);
+	err = bpf_map_update_elem(map_fd, &key, &prog_fd, BPF_ANY);
+	if (!ASSERT_OK(err, "update jmp_table"))
+		goto out;
+
+	skel->bss->my_pid = getpid();
+
+	/* Attach uprobe_sleepable_1 to uprobe_sleepable_trigger and hit it.  */
+	opts.func_name = "uprobe_sleepable_trigger";
+	skel->links.uprobe_sleepable_1 = bpf_program__attach_uprobe_opts(
+						skel->progs.uprobe_sleepable_1,
+						-1,
+						"/proc/self/exe",
+						0 /* offset */,
+						&opts);
+	if (!ASSERT_OK_PTR(skel->links.uprobe_sleepable_1, "bpf_program__attach_uprobe_opts"))
+		goto out;
+
+	uprobe_sleepable_trigger();
+	ASSERT_EQ(skel->bss->executed, 1, "executed");
+
+out:
+	tailcall_sleepable__destroy(skel);
+}
+
 void test_tailcalls(void)
 {
 	if (test__start_subtest("tailcall_1"))
@@ -1707,4 +1779,6 @@ void test_tailcalls(void)
 		test_tailcall_bpf2bpf_freplace();
 	if (test__start_subtest("tailcall_failure"))
 		test_tailcall_failure();
+	if (test__start_subtest("tailcall_sleepable"))
+		test_tailcall_sleepable();
 }
diff --git a/tools/testing/selftests/bpf/progs/tailcall_sleepable.c b/tools/testing/selftests/bpf/progs/tailcall_sleepable.c
new file mode 100644
index 000000000000..d959a9eaaa9c
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/tailcall_sleepable.c
@@ -0,0 +1,43 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include "bpf_misc.h"
+#include "bpf_test_utils.h"
+
+struct {
+	__uint(type, BPF_MAP_TYPE_PROG_ARRAY);
+	__uint(max_entries, 1);
+	__uint(key_size, sizeof(__u32));
+	__array(values, void (void));
+} jmp_table SEC(".maps");
+
+SEC("?uprobe")
+int uprobe_normal(void *ctx)
+{
+	bpf_tail_call_static(ctx, &jmp_table, 0);
+	return 0;
+}
+
+SEC("?uprobe.s")
+int uprobe_sleepable_1(void *ctx)
+{
+	bpf_tail_call_static(ctx, &jmp_table, 0);
+	return 0;
+}
+
+int executed = 0;
+int my_pid = 0;
+
+SEC("?uprobe.s")
+int uprobe_sleepable_2(void *ctx)
+{
+	int pid = bpf_get_current_pid_tgid() >> 32;
+
+	if (pid != my_pid)
+		return 0;
+
+	executed++;
+	return 0;
+}
+
+char __license[] SEC("license") = "GPL";
-- 
cgit v1.2.3


From cd77618c418254b827f2a807b4c27b97088fdb52 Mon Sep 17 00:00:00 2001
From: Changwoo Min <changwoo@igalia.com>
Date: Fri, 30 Jan 2026 11:18:43 +0900
Subject: selftests/bpf: Make bpf get_preempt_count() work for v6.14+ kernels

Recent x86 kernels export __preempt_count as a ksym, while some old kernels
between v6.1 and v6.14 expose the preemption counter via
pcpu_hot.preempt_count. The existing selftest helper unconditionally
dereferenced __preempt_count, which breaks BPF program loading on such old
kernels.

Make the x86 preemption count lookup version-agnostic by:
- Marking __preempt_count and pcpu_hot as weak ksyms.
- Introducing a BTF-described pcpu_hot___local layout with
  preserve_access_index.
- Selecting the appropriate access path at runtime using ksym availability
  and bpf_ksym_exists() and bpf_core_field_exists().

This allows a single BPF binary to run correctly across kernel versions
(e.g., v6.18 vs. v6.13) without relying on compile-time version checks.

Signed-off-by: Changwoo Min <changwoo@igalia.com>
Link: https://lore.kernel.org/r/20260130021843.154885-1-changwoo@igalia.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/bpf_experimental.h | 22 ++++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/bpf_experimental.h b/tools/testing/selftests/bpf/bpf_experimental.h
index a39576c8ba04..4b7210c318dd 100644
--- a/tools/testing/selftests/bpf/bpf_experimental.h
+++ b/tools/testing/selftests/bpf/bpf_experimental.h
@@ -614,7 +614,13 @@ extern int bpf_cgroup_read_xattr(struct cgroup *cgroup, const char *name__str,
 
 extern bool CONFIG_PREEMPT_RT __kconfig __weak;
 #ifdef bpf_target_x86
-extern const int __preempt_count __ksym;
+extern const int __preempt_count __ksym __weak;
+
+struct pcpu_hot___local {
+	int preempt_count;
+} __attribute__((preserve_access_index));
+
+extern struct pcpu_hot___local pcpu_hot __ksym __weak;
 #endif
 
 struct task_struct___preempt_rt {
@@ -624,7 +630,19 @@ struct task_struct___preempt_rt {
 static inline int get_preempt_count(void)
 {
 #if defined(bpf_target_x86)
-	return *(int *) bpf_this_cpu_ptr(&__preempt_count);
+	/* By default, read the per-CPU __preempt_count. */
+	if (bpf_ksym_exists(&__preempt_count))
+		return *(int *) bpf_this_cpu_ptr(&__preempt_count);
+
+	/*
+	 * If __preempt_count does not exist, try to read preempt_count under
+	 * struct pcpu_hot. Between v6.1 and v6.14 -- more specifically,
+	 * [64701838bf057, 46e8fff6d45fe), preempt_count had been managed
+	 * under struct pcpu_hot.
+	 */
+	if (bpf_core_field_exists(pcpu_hot.preempt_count))
+		return ((struct pcpu_hot___local *)
+			bpf_this_cpu_ptr(&pcpu_hot))->preempt_count;
 #elif defined(bpf_target_arm64)
 	return bpf_get_current_task_btf()->thread_info.preempt.count;
 #endif
-- 
cgit v1.2.3


From 0207f94971e72a13380e28022c86da147e8e090f Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Mon, 26 Jan 2026 22:18:34 +0100
Subject: selftests/bpf: Fix kprobe multi stacktrace_ips test

We now include the attached function in the stack trace,
fixing the test accordingly.

Fixes: c9e208fa93cd ("selftests/bpf: Add stacktrace ips test for kprobe_multi/kretprobe_multi")
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20260126211837.472802-4-jolsa@kernel.org
---
 .../testing/selftests/bpf/prog_tests/stacktrace_ips.c | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/stacktrace_ips.c b/tools/testing/selftests/bpf/prog_tests/stacktrace_ips.c
index c9efdd2a5b18..c93718dafd9b 100644
--- a/tools/testing/selftests/bpf/prog_tests/stacktrace_ips.c
+++ b/tools/testing/selftests/bpf/prog_tests/stacktrace_ips.c
@@ -74,11 +74,20 @@ static void test_stacktrace_ips_kprobe_multi(bool retprobe)
 
 	load_kallsyms();
 
-	check_stacktrace_ips(bpf_map__fd(skel->maps.stackmap), skel->bss->stack_key, 4,
-			     ksym_get_addr("bpf_testmod_stacktrace_test_3"),
-			     ksym_get_addr("bpf_testmod_stacktrace_test_2"),
-			     ksym_get_addr("bpf_testmod_stacktrace_test_1"),
-			     ksym_get_addr("bpf_testmod_test_read"));
+	if (retprobe) {
+		check_stacktrace_ips(bpf_map__fd(skel->maps.stackmap), skel->bss->stack_key, 4,
+				     ksym_get_addr("bpf_testmod_stacktrace_test_3"),
+				     ksym_get_addr("bpf_testmod_stacktrace_test_2"),
+				     ksym_get_addr("bpf_testmod_stacktrace_test_1"),
+				     ksym_get_addr("bpf_testmod_test_read"));
+	} else {
+		check_stacktrace_ips(bpf_map__fd(skel->maps.stackmap), skel->bss->stack_key, 5,
+				     ksym_get_addr("bpf_testmod_stacktrace_test"),
+				     ksym_get_addr("bpf_testmod_stacktrace_test_3"),
+				     ksym_get_addr("bpf_testmod_stacktrace_test_2"),
+				     ksym_get_addr("bpf_testmod_stacktrace_test_1"),
+				     ksym_get_addr("bpf_testmod_test_read"));
+	}
 
 cleanup:
 	stacktrace_ips__destroy(skel);
-- 
cgit v1.2.3


From 7373f97e868ad01fc73de8e7b71834eeba25d4f1 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Mon, 26 Jan 2026 22:18:35 +0100
Subject: selftests/bpf: Add stacktrace ips test for kprobe/kretprobe

Adding test that attaches kprobe/kretprobe and verifies the
ORC stacktrace matches expected functions.

The test is only for ORC unwinder to keep it simple.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20260126211837.472802-5-jolsa@kernel.org
---
 .../selftests/bpf/prog_tests/stacktrace_ips.c      | 50 ++++++++++++++++++++++
 tools/testing/selftests/bpf/progs/stacktrace_ips.c |  7 +++
 2 files changed, 57 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/stacktrace_ips.c b/tools/testing/selftests/bpf/prog_tests/stacktrace_ips.c
index c93718dafd9b..852830536109 100644
--- a/tools/testing/selftests/bpf/prog_tests/stacktrace_ips.c
+++ b/tools/testing/selftests/bpf/prog_tests/stacktrace_ips.c
@@ -137,6 +137,52 @@ cleanup:
 	stacktrace_ips__destroy(skel);
 }
 
+static void test_stacktrace_ips_kprobe(bool retprobe)
+{
+	LIBBPF_OPTS(bpf_kprobe_opts, opts,
+		.retprobe = retprobe
+	);
+	LIBBPF_OPTS(bpf_test_run_opts, topts);
+	struct stacktrace_ips *skel;
+
+	skel = stacktrace_ips__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "stacktrace_ips__open_and_load"))
+		return;
+
+	if (!skel->kconfig->CONFIG_UNWINDER_ORC) {
+		test__skip();
+		goto cleanup;
+	}
+
+	skel->links.kprobe_test = bpf_program__attach_kprobe_opts(
+						skel->progs.kprobe_test,
+						"bpf_testmod_stacktrace_test", &opts);
+	if (!ASSERT_OK_PTR(skel->links.kprobe_test, "bpf_program__attach_kprobe_opts"))
+		goto cleanup;
+
+	trigger_module_test_read(1);
+
+	load_kallsyms();
+
+	if (retprobe) {
+		check_stacktrace_ips(bpf_map__fd(skel->maps.stackmap), skel->bss->stack_key, 4,
+				     ksym_get_addr("bpf_testmod_stacktrace_test_3"),
+				     ksym_get_addr("bpf_testmod_stacktrace_test_2"),
+				     ksym_get_addr("bpf_testmod_stacktrace_test_1"),
+				     ksym_get_addr("bpf_testmod_test_read"));
+	} else {
+		check_stacktrace_ips(bpf_map__fd(skel->maps.stackmap), skel->bss->stack_key, 5,
+				     ksym_get_addr("bpf_testmod_stacktrace_test"),
+				     ksym_get_addr("bpf_testmod_stacktrace_test_3"),
+				     ksym_get_addr("bpf_testmod_stacktrace_test_2"),
+				     ksym_get_addr("bpf_testmod_stacktrace_test_1"),
+				     ksym_get_addr("bpf_testmod_test_read"));
+	}
+
+cleanup:
+	stacktrace_ips__destroy(skel);
+}
+
 static void __test_stacktrace_ips(void)
 {
 	if (test__start_subtest("kprobe_multi"))
@@ -145,6 +191,10 @@ static void __test_stacktrace_ips(void)
 		test_stacktrace_ips_kprobe_multi(true);
 	if (test__start_subtest("raw_tp"))
 		test_stacktrace_ips_raw_tp();
+	if (test__start_subtest("kprobe"))
+		test_stacktrace_ips_kprobe(false);
+	if (test__start_subtest("kretprobe"))
+		test_stacktrace_ips_kprobe(true);
 }
 #else
 static void __test_stacktrace_ips(void)
diff --git a/tools/testing/selftests/bpf/progs/stacktrace_ips.c b/tools/testing/selftests/bpf/progs/stacktrace_ips.c
index a96c8150d7f5..cae077a4061b 100644
--- a/tools/testing/selftests/bpf/progs/stacktrace_ips.c
+++ b/tools/testing/selftests/bpf/progs/stacktrace_ips.c
@@ -31,6 +31,13 @@ int unused(void)
 
 __u32 stack_key;
 
+SEC("kprobe")
+int kprobe_test(struct pt_regs *ctx)
+{
+	stack_key = bpf_get_stackid(ctx, &stackmap, 0);
+	return 0;
+}
+
 SEC("kprobe.multi")
 int kprobe_multi_test(struct pt_regs *ctx)
 {
-- 
cgit v1.2.3


From e5d532be4a3b0d1f0a9210c0da2c04a6b4605904 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Mon, 26 Jan 2026 22:18:36 +0100
Subject: selftests/bpf: Add stacktrace ips test for fentry/fexit

Adding test that attaches fentry/fexitand verifies the
ORC stacktrace matches expected functions.

The test is only for ORC unwinder to keep it simple.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20260126211837.472802-6-jolsa@kernel.org
---
 .../selftests/bpf/prog_tests/stacktrace_ips.c      | 51 ++++++++++++++++++++++
 tools/testing/selftests/bpf/progs/stacktrace_ips.c | 20 +++++++++
 2 files changed, 71 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/stacktrace_ips.c b/tools/testing/selftests/bpf/prog_tests/stacktrace_ips.c
index 852830536109..da42b00e3d1f 100644
--- a/tools/testing/selftests/bpf/prog_tests/stacktrace_ips.c
+++ b/tools/testing/selftests/bpf/prog_tests/stacktrace_ips.c
@@ -183,6 +183,53 @@ cleanup:
 	stacktrace_ips__destroy(skel);
 }
 
+static void test_stacktrace_ips_trampoline(bool retprobe)
+{
+	LIBBPF_OPTS(bpf_test_run_opts, topts);
+	struct stacktrace_ips *skel;
+
+	skel = stacktrace_ips__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "stacktrace_ips__open_and_load"))
+		return;
+
+	if (!skel->kconfig->CONFIG_UNWINDER_ORC) {
+		test__skip();
+		goto cleanup;
+	}
+
+	if (retprobe) {
+		skel->links.fexit_test = bpf_program__attach_trace(skel->progs.fexit_test);
+		if (!ASSERT_OK_PTR(skel->links.fexit_test, "bpf_program__attach_trace"))
+			goto cleanup;
+	} else {
+		skel->links.fentry_test = bpf_program__attach_trace(skel->progs.fentry_test);
+		if (!ASSERT_OK_PTR(skel->links.fentry_test, "bpf_program__attach_trace"))
+			goto cleanup;
+	}
+
+	trigger_module_test_read(1);
+
+	load_kallsyms();
+
+	if (retprobe) {
+		check_stacktrace_ips(bpf_map__fd(skel->maps.stackmap), skel->bss->stack_key, 4,
+				     ksym_get_addr("bpf_testmod_stacktrace_test_3"),
+				     ksym_get_addr("bpf_testmod_stacktrace_test_2"),
+				     ksym_get_addr("bpf_testmod_stacktrace_test_1"),
+				     ksym_get_addr("bpf_testmod_test_read"));
+	} else {
+		check_stacktrace_ips(bpf_map__fd(skel->maps.stackmap), skel->bss->stack_key, 5,
+				     ksym_get_addr("bpf_testmod_stacktrace_test"),
+				     ksym_get_addr("bpf_testmod_stacktrace_test_3"),
+				     ksym_get_addr("bpf_testmod_stacktrace_test_2"),
+				     ksym_get_addr("bpf_testmod_stacktrace_test_1"),
+				     ksym_get_addr("bpf_testmod_test_read"));
+	}
+
+cleanup:
+	stacktrace_ips__destroy(skel);
+}
+
 static void __test_stacktrace_ips(void)
 {
 	if (test__start_subtest("kprobe_multi"))
@@ -195,6 +242,10 @@ static void __test_stacktrace_ips(void)
 		test_stacktrace_ips_kprobe(false);
 	if (test__start_subtest("kretprobe"))
 		test_stacktrace_ips_kprobe(true);
+	if (test__start_subtest("fentry"))
+		test_stacktrace_ips_trampoline(false);
+	if (test__start_subtest("fexit"))
+		test_stacktrace_ips_trampoline(true);
 }
 #else
 static void __test_stacktrace_ips(void)
diff --git a/tools/testing/selftests/bpf/progs/stacktrace_ips.c b/tools/testing/selftests/bpf/progs/stacktrace_ips.c
index cae077a4061b..6830f2978613 100644
--- a/tools/testing/selftests/bpf/progs/stacktrace_ips.c
+++ b/tools/testing/selftests/bpf/progs/stacktrace_ips.c
@@ -53,4 +53,24 @@ int rawtp_test(void *ctx)
 	return 0;
 }
 
+SEC("fentry/bpf_testmod_stacktrace_test")
+int fentry_test(struct pt_regs *ctx)
+{
+	/*
+	 * Skip 2 bpf_program/trampoline stack entries:
+	 * - bpf_prog_bd1f7a949f55fb03_fentry_test
+	 * - bpf_trampoline_182536277701
+	 */
+	stack_key = bpf_get_stackid(ctx, &stackmap, 2);
+	return 0;
+}
+
+SEC("fexit/bpf_testmod_stacktrace_test")
+int fexit_test(struct pt_regs *ctx)
+{
+	/* Skip 2 bpf_program/trampoline stack entries, check fentry_test. */
+	stack_key = bpf_get_stackid(ctx, &stackmap, 2);
+	return 0;
+}
+
 char _license[] SEC("license") = "GPL";
-- 
cgit v1.2.3


From 4173b494d93a8057d3ed23e65853cd76b647f870 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Mon, 26 Jan 2026 22:18:37 +0100
Subject: selftests/bpf: Allow to benchmark trigger with stacktrace

Adding support to call bpf_get_stackid helper from trigger programs,
so far added for kprobe multi.

Adding the --stacktrace/-g option to enable it.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20260126211837.472802-7-jolsa@kernel.org
---
 tools/testing/selftests/bpf/bench.c                |  4 ++
 tools/testing/selftests/bpf/bench.h                |  1 +
 tools/testing/selftests/bpf/benchs/bench_trigger.c |  1 +
 tools/testing/selftests/bpf/progs/trigger_bench.c  | 46 +++++++++++++++++-----
 4 files changed, 43 insertions(+), 9 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/bench.c b/tools/testing/selftests/bpf/bench.c
index bd29bb2e6cb5..8368bd3a0665 100644
--- a/tools/testing/selftests/bpf/bench.c
+++ b/tools/testing/selftests/bpf/bench.c
@@ -265,6 +265,7 @@ static const struct argp_option opts[] = {
 	{ "verbose", 'v', NULL, 0, "Verbose debug output"},
 	{ "affinity", 'a', NULL, 0, "Set consumer/producer thread affinity"},
 	{ "quiet", 'q', NULL, 0, "Be more quiet"},
+	{ "stacktrace", 's', NULL, 0, "Get stack trace"},
 	{ "prod-affinity", ARG_PROD_AFFINITY_SET, "CPUSET", 0,
 	  "Set of CPUs for producer threads; implies --affinity"},
 	{ "cons-affinity", ARG_CONS_AFFINITY_SET, "CPUSET", 0,
@@ -350,6 +351,9 @@ static error_t parse_arg(int key, char *arg, struct argp_state *state)
 	case 'q':
 		env.quiet = true;
 		break;
+	case 's':
+		env.stacktrace = true;
+		break;
 	case ARG_PROD_AFFINITY_SET:
 		env.affinity = true;
 		if (parse_num_list(arg, &env.prod_cpus.cpus,
diff --git a/tools/testing/selftests/bpf/bench.h b/tools/testing/selftests/bpf/bench.h
index bea323820ffb..7cf21936e7ed 100644
--- a/tools/testing/selftests/bpf/bench.h
+++ b/tools/testing/selftests/bpf/bench.h
@@ -26,6 +26,7 @@ struct env {
 	bool list;
 	bool affinity;
 	bool quiet;
+	bool stacktrace;
 	int consumer_cnt;
 	int producer_cnt;
 	int nr_cpus;
diff --git a/tools/testing/selftests/bpf/benchs/bench_trigger.c b/tools/testing/selftests/bpf/benchs/bench_trigger.c
index 34018fc3927f..aeec9edd3851 100644
--- a/tools/testing/selftests/bpf/benchs/bench_trigger.c
+++ b/tools/testing/selftests/bpf/benchs/bench_trigger.c
@@ -146,6 +146,7 @@ static void setup_ctx(void)
 	bpf_program__set_autoload(ctx.skel->progs.trigger_driver, true);
 
 	ctx.skel->rodata->batch_iters = args.batch_iters;
+	ctx.skel->rodata->stacktrace = env.stacktrace;
 }
 
 static void load_ctx(void)
diff --git a/tools/testing/selftests/bpf/progs/trigger_bench.c b/tools/testing/selftests/bpf/progs/trigger_bench.c
index 2898b3749d07..4ea0422d1042 100644
--- a/tools/testing/selftests/bpf/progs/trigger_bench.c
+++ b/tools/testing/selftests/bpf/progs/trigger_bench.c
@@ -25,6 +25,34 @@ static __always_inline void inc_counter(void)
 	__sync_add_and_fetch(&hits[cpu & CPU_MASK].value, 1);
 }
 
+volatile const int stacktrace;
+
+typedef __u64 stack_trace_t[128];
+
+struct {
+	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+	 __uint(max_entries, 1);
+	__type(key, __u32);
+	__type(value, stack_trace_t);
+} stack_heap SEC(".maps");
+
+static __always_inline void do_stacktrace(void *ctx)
+{
+	if (!stacktrace)
+		return;
+
+	__u64 *ptr = bpf_map_lookup_elem(&stack_heap, &(__u32){0});
+
+	if (ptr)
+		bpf_get_stack(ctx, ptr, sizeof(stack_trace_t), 0);
+}
+
+static __always_inline void handle(void *ctx)
+{
+	inc_counter();
+	do_stacktrace(ctx);
+}
+
 SEC("?uprobe")
 int bench_trigger_uprobe(void *ctx)
 {
@@ -81,21 +109,21 @@ int trigger_driver_kfunc(void *ctx)
 SEC("?kprobe/bpf_get_numa_node_id")
 int bench_trigger_kprobe(void *ctx)
 {
-	inc_counter();
+	handle(ctx);
 	return 0;
 }
 
 SEC("?kretprobe/bpf_get_numa_node_id")
 int bench_trigger_kretprobe(void *ctx)
 {
-	inc_counter();
+	handle(ctx);
 	return 0;
 }
 
 SEC("?kprobe.multi/bpf_get_numa_node_id")
 int bench_trigger_kprobe_multi(void *ctx)
 {
-	inc_counter();
+	handle(ctx);
 	return 0;
 }
 
@@ -108,7 +136,7 @@ int bench_kprobe_multi_empty(void *ctx)
 SEC("?kretprobe.multi/bpf_get_numa_node_id")
 int bench_trigger_kretprobe_multi(void *ctx)
 {
-	inc_counter();
+	handle(ctx);
 	return 0;
 }
 
@@ -121,34 +149,34 @@ int bench_kretprobe_multi_empty(void *ctx)
 SEC("?fentry/bpf_get_numa_node_id")
 int bench_trigger_fentry(void *ctx)
 {
-	inc_counter();
+	handle(ctx);
 	return 0;
 }
 
 SEC("?fexit/bpf_get_numa_node_id")
 int bench_trigger_fexit(void *ctx)
 {
-	inc_counter();
+	handle(ctx);
 	return 0;
 }
 
 SEC("?fmod_ret/bpf_modify_return_test_tp")
 int bench_trigger_fmodret(void *ctx)
 {
-	inc_counter();
+	handle(ctx);
 	return -22;
 }
 
 SEC("?tp/bpf_test_run/bpf_trigger_tp")
 int bench_trigger_tp(void *ctx)
 {
-	inc_counter();
+	handle(ctx);
 	return 0;
 }
 
 SEC("?raw_tp/bpf_trigger_tp")
 int bench_trigger_rawtp(void *ctx)
 {
-	inc_counter();
+	handle(ctx);
 	return 0;
 }
-- 
cgit v1.2.3


From 3a4d8bed0b47543b2dfce0b1d714b40d68ff2f7e Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Fri, 30 Jan 2026 00:19:52 +0800
Subject: selftests: ublk: derive TID automatically from script name

Add automatic TID derivation in test_common.sh based on the script
filename. The TID is extracted by stripping the "test_" prefix and
".sh" suffix from the script name (e.g., test_loop_01.sh -> loop_01).

This removes the need for each test script to manually define TID,
reducing boilerplate and preventing potential mismatches between
the script name and TID. Scripts can still override TID after
sourcing test_common.sh if needed.

Reviewed-by: Caleb Sander Mateos <csander@purestorage.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 tools/testing/selftests/ublk/test_batch_01.sh   | 1 -
 tools/testing/selftests/ublk/test_batch_02.sh   | 1 -
 tools/testing/selftests/ublk/test_batch_03.sh   | 1 -
 tools/testing/selftests/ublk/test_common.sh     | 5 +++++
 tools/testing/selftests/ublk/test_generic_01.sh | 1 -
 tools/testing/selftests/ublk/test_generic_02.sh | 1 -
 tools/testing/selftests/ublk/test_generic_03.sh | 1 -
 tools/testing/selftests/ublk/test_generic_04.sh | 1 -
 tools/testing/selftests/ublk/test_generic_05.sh | 1 -
 tools/testing/selftests/ublk/test_generic_06.sh | 1 -
 tools/testing/selftests/ublk/test_generic_07.sh | 1 -
 tools/testing/selftests/ublk/test_generic_08.sh | 1 -
 tools/testing/selftests/ublk/test_generic_09.sh | 1 -
 tools/testing/selftests/ublk/test_generic_10.sh | 1 -
 tools/testing/selftests/ublk/test_generic_11.sh | 1 -
 tools/testing/selftests/ublk/test_generic_12.sh | 1 -
 tools/testing/selftests/ublk/test_generic_13.sh | 1 -
 tools/testing/selftests/ublk/test_generic_14.sh | 1 -
 tools/testing/selftests/ublk/test_generic_15.sh | 1 -
 tools/testing/selftests/ublk/test_generic_16.sh | 1 -
 tools/testing/selftests/ublk/test_loop_01.sh    | 1 -
 tools/testing/selftests/ublk/test_loop_02.sh    | 1 -
 tools/testing/selftests/ublk/test_loop_03.sh    | 1 -
 tools/testing/selftests/ublk/test_loop_04.sh    | 1 -
 tools/testing/selftests/ublk/test_loop_05.sh    | 1 -
 tools/testing/selftests/ublk/test_loop_06.sh    | 1 -
 tools/testing/selftests/ublk/test_loop_07.sh    | 1 -
 tools/testing/selftests/ublk/test_loop_08.sh    | 1 -
 tools/testing/selftests/ublk/test_null_01.sh    | 1 -
 tools/testing/selftests/ublk/test_null_02.sh    | 1 -
 tools/testing/selftests/ublk/test_null_03.sh    | 1 -
 tools/testing/selftests/ublk/test_null_04.sh    | 1 -
 tools/testing/selftests/ublk/test_stress_01.sh  | 1 -
 tools/testing/selftests/ublk/test_stress_02.sh  | 1 -
 tools/testing/selftests/ublk/test_stress_03.sh  | 1 -
 tools/testing/selftests/ublk/test_stress_04.sh  | 1 -
 tools/testing/selftests/ublk/test_stress_05.sh  | 1 -
 tools/testing/selftests/ublk/test_stress_06.sh  | 1 -
 tools/testing/selftests/ublk/test_stress_07.sh  | 1 -
 tools/testing/selftests/ublk/test_stress_08.sh  | 1 -
 tools/testing/selftests/ublk/test_stress_09.sh  | 1 -
 tools/testing/selftests/ublk/test_stripe_01.sh  | 1 -
 tools/testing/selftests/ublk/test_stripe_02.sh  | 1 -
 tools/testing/selftests/ublk/test_stripe_03.sh  | 1 -
 tools/testing/selftests/ublk/test_stripe_04.sh  | 1 -
 tools/testing/selftests/ublk/test_stripe_05.sh  | 1 -
 tools/testing/selftests/ublk/test_stripe_06.sh  | 1 -
 47 files changed, 5 insertions(+), 46 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/ublk/test_batch_01.sh b/tools/testing/selftests/ublk/test_batch_01.sh
index 9fa9fff5c62f..a18fb39af8be 100755
--- a/tools/testing/selftests/ublk/test_batch_01.sh
+++ b/tools/testing/selftests/ublk/test_batch_01.sh
@@ -3,7 +3,6 @@
 
 . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
 
-TID="batch_01"
 ERR_CODE=0
 
 if ! _have_feature "BATCH_IO"; then
diff --git a/tools/testing/selftests/ublk/test_batch_02.sh b/tools/testing/selftests/ublk/test_batch_02.sh
index b477f91359e1..7ca384d11987 100755
--- a/tools/testing/selftests/ublk/test_batch_02.sh
+++ b/tools/testing/selftests/ublk/test_batch_02.sh
@@ -3,7 +3,6 @@
 
 . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
 
-TID="batch_02"
 ERR_CODE=0
 
 if ! _have_feature "BATCH_IO"; then
diff --git a/tools/testing/selftests/ublk/test_batch_03.sh b/tools/testing/selftests/ublk/test_batch_03.sh
index 13a2b3d3a1b9..aca9cf144b55 100755
--- a/tools/testing/selftests/ublk/test_batch_03.sh
+++ b/tools/testing/selftests/ublk/test_batch_03.sh
@@ -3,7 +3,6 @@
 
 . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
 
-TID="batch_03"
 ERR_CODE=0
 
 if ! _have_feature "BATCH_IO"; then
diff --git a/tools/testing/selftests/ublk/test_common.sh b/tools/testing/selftests/ublk/test_common.sh
index 7ff6ce79d62c..bbe031c94a29 100755
--- a/tools/testing/selftests/ublk/test_common.sh
+++ b/tools/testing/selftests/ublk/test_common.sh
@@ -1,6 +1,11 @@
 #!/bin/bash
 # SPDX-License-Identifier: GPL-2.0
 
+# Derive TID from script name: test_<type>_<num>.sh -> <type>_<num>
+# Can be overridden in test script after sourcing this file
+TID=$(basename "$0" .sh)
+TID=${TID#test_}
+
 UBLK_SKIP_CODE=4
 
 _have_program() {
diff --git a/tools/testing/selftests/ublk/test_generic_01.sh b/tools/testing/selftests/ublk/test_generic_01.sh
index 21a31cd5491a..26cf3c7ceeb5 100755
--- a/tools/testing/selftests/ublk/test_generic_01.sh
+++ b/tools/testing/selftests/ublk/test_generic_01.sh
@@ -3,7 +3,6 @@
 
 . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
 
-TID="generic_01"
 ERR_CODE=0
 
 if ! _have_program bpftrace; then
diff --git a/tools/testing/selftests/ublk/test_generic_02.sh b/tools/testing/selftests/ublk/test_generic_02.sh
index 12920768b1a0..1d4b1d6e059c 100755
--- a/tools/testing/selftests/ublk/test_generic_02.sh
+++ b/tools/testing/selftests/ublk/test_generic_02.sh
@@ -3,7 +3,6 @@
 
 . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
 
-TID="generic_02"
 ERR_CODE=0
 
 if ! _have_program bpftrace; then
diff --git a/tools/testing/selftests/ublk/test_generic_03.sh b/tools/testing/selftests/ublk/test_generic_03.sh
index b551aa76cb0d..8934ea926762 100755
--- a/tools/testing/selftests/ublk/test_generic_03.sh
+++ b/tools/testing/selftests/ublk/test_generic_03.sh
@@ -3,7 +3,6 @@
 
 . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
 
-TID="generic_03"
 ERR_CODE=0
 
 _prep_test "null" "check dma & segment limits for zero copy"
diff --git a/tools/testing/selftests/ublk/test_generic_04.sh b/tools/testing/selftests/ublk/test_generic_04.sh
index be2292822bbe..2672f9c40fa8 100755
--- a/tools/testing/selftests/ublk/test_generic_04.sh
+++ b/tools/testing/selftests/ublk/test_generic_04.sh
@@ -3,7 +3,6 @@
 
 . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
 
-TID="generic_04"
 ERR_CODE=0
 
 ublk_run_recover_test()
diff --git a/tools/testing/selftests/ublk/test_generic_05.sh b/tools/testing/selftests/ublk/test_generic_05.sh
index 9b7f71c16d82..bda5064bc31f 100755
--- a/tools/testing/selftests/ublk/test_generic_05.sh
+++ b/tools/testing/selftests/ublk/test_generic_05.sh
@@ -3,7 +3,6 @@
 
 . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
 
-TID="generic_05"
 ERR_CODE=0
 
 ublk_run_recover_test()
diff --git a/tools/testing/selftests/ublk/test_generic_06.sh b/tools/testing/selftests/ublk/test_generic_06.sh
index fd42062b7b76..14a05054fcd8 100755
--- a/tools/testing/selftests/ublk/test_generic_06.sh
+++ b/tools/testing/selftests/ublk/test_generic_06.sh
@@ -3,7 +3,6 @@
 
 . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
 
-TID="generic_06"
 ERR_CODE=0
 
 _prep_test "fault_inject" "fast cleanup when all I/Os of one hctx are in server"
diff --git a/tools/testing/selftests/ublk/test_generic_07.sh b/tools/testing/selftests/ublk/test_generic_07.sh
index cba86451fa5e..8dcfd8978f50 100755
--- a/tools/testing/selftests/ublk/test_generic_07.sh
+++ b/tools/testing/selftests/ublk/test_generic_07.sh
@@ -3,7 +3,6 @@
 
 . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
 
-TID="generic_07"
 ERR_CODE=0
 
 if ! _have_program fio; then
diff --git a/tools/testing/selftests/ublk/test_generic_08.sh b/tools/testing/selftests/ublk/test_generic_08.sh
index b222f3a77e12..ce88c31d6b9c 100755
--- a/tools/testing/selftests/ublk/test_generic_08.sh
+++ b/tools/testing/selftests/ublk/test_generic_08.sh
@@ -3,7 +3,6 @@
 
 . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
 
-TID="generic_08"
 ERR_CODE=0
 
 if ! _have_feature "AUTO_BUF_REG"; then
diff --git a/tools/testing/selftests/ublk/test_generic_09.sh b/tools/testing/selftests/ublk/test_generic_09.sh
index bb6f77ca5522..744d0cdaa242 100755
--- a/tools/testing/selftests/ublk/test_generic_09.sh
+++ b/tools/testing/selftests/ublk/test_generic_09.sh
@@ -3,7 +3,6 @@
 
 . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
 
-TID="generic_09"
 ERR_CODE=0
 
 if ! _have_feature "AUTO_BUF_REG"; then
diff --git a/tools/testing/selftests/ublk/test_generic_10.sh b/tools/testing/selftests/ublk/test_generic_10.sh
index abc11c3d416b..4b4293b9081f 100755
--- a/tools/testing/selftests/ublk/test_generic_10.sh
+++ b/tools/testing/selftests/ublk/test_generic_10.sh
@@ -3,7 +3,6 @@
 
 . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
 
-TID="generic_10"
 ERR_CODE=0
 
 if ! _have_feature "UPDATE_SIZE"; then
diff --git a/tools/testing/selftests/ublk/test_generic_11.sh b/tools/testing/selftests/ublk/test_generic_11.sh
index d1f973c8c645..e0dc0b8fe5d6 100755
--- a/tools/testing/selftests/ublk/test_generic_11.sh
+++ b/tools/testing/selftests/ublk/test_generic_11.sh
@@ -3,7 +3,6 @@
 
 . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
 
-TID="generic_11"
 ERR_CODE=0
 
 ublk_run_quiesce_recover()
diff --git a/tools/testing/selftests/ublk/test_generic_12.sh b/tools/testing/selftests/ublk/test_generic_12.sh
index b4046201b4d9..54b81ddfe9f9 100755
--- a/tools/testing/selftests/ublk/test_generic_12.sh
+++ b/tools/testing/selftests/ublk/test_generic_12.sh
@@ -3,7 +3,6 @@
 
 . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
 
-TID="generic_12"
 ERR_CODE=0
 
 if ! _have_program bpftrace; then
diff --git a/tools/testing/selftests/ublk/test_generic_13.sh b/tools/testing/selftests/ublk/test_generic_13.sh
index b7aa90b1cb74..922115aa14f4 100755
--- a/tools/testing/selftests/ublk/test_generic_13.sh
+++ b/tools/testing/selftests/ublk/test_generic_13.sh
@@ -3,7 +3,6 @@
 
 . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
 
-TID="generic_13"
 ERR_CODE=0
 
 _prep_test "null" "check that feature list is complete"
diff --git a/tools/testing/selftests/ublk/test_generic_14.sh b/tools/testing/selftests/ublk/test_generic_14.sh
index cd9b44b97c24..178443394ca5 100755
--- a/tools/testing/selftests/ublk/test_generic_14.sh
+++ b/tools/testing/selftests/ublk/test_generic_14.sh
@@ -3,7 +3,6 @@
 
 . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
 
-TID="generic_14"
 ERR_CODE=0
 
 ublk_run_recover_test()
diff --git a/tools/testing/selftests/ublk/test_generic_15.sh b/tools/testing/selftests/ublk/test_generic_15.sh
index 76379362e0a2..727d0f4610d6 100755
--- a/tools/testing/selftests/ublk/test_generic_15.sh
+++ b/tools/testing/selftests/ublk/test_generic_15.sh
@@ -3,7 +3,6 @@
 
 . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
 
-TID="generic_15"
 ERR_CODE=0
 
 _test_partition_scan_no_hang()
diff --git a/tools/testing/selftests/ublk/test_generic_16.sh b/tools/testing/selftests/ublk/test_generic_16.sh
index e08af7b685c9..42e8d2e16ec9 100755
--- a/tools/testing/selftests/ublk/test_generic_16.sh
+++ b/tools/testing/selftests/ublk/test_generic_16.sh
@@ -3,7 +3,6 @@
 
 . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
 
-TID="generic_16"
 ERR_CODE=0
 
 _prep_test "null" "stop --safe command"
diff --git a/tools/testing/selftests/ublk/test_loop_01.sh b/tools/testing/selftests/ublk/test_loop_01.sh
index 833fa0dbc700..338a235fd82a 100755
--- a/tools/testing/selftests/ublk/test_loop_01.sh
+++ b/tools/testing/selftests/ublk/test_loop_01.sh
@@ -3,7 +3,6 @@
 
 . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
 
-TID="loop_01"
 ERR_CODE=0
 
 if ! _have_program fio; then
diff --git a/tools/testing/selftests/ublk/test_loop_02.sh b/tools/testing/selftests/ublk/test_loop_02.sh
index 874568b3646b..04c52454e2ec 100755
--- a/tools/testing/selftests/ublk/test_loop_02.sh
+++ b/tools/testing/selftests/ublk/test_loop_02.sh
@@ -3,7 +3,6 @@
 
 . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
 
-TID="loop_02"
 ERR_CODE=0
 
 _prep_test "loop" "mkfs & mount & umount"
diff --git a/tools/testing/selftests/ublk/test_loop_03.sh b/tools/testing/selftests/ublk/test_loop_03.sh
index c30f797c6429..6e8f649fe93d 100755
--- a/tools/testing/selftests/ublk/test_loop_03.sh
+++ b/tools/testing/selftests/ublk/test_loop_03.sh
@@ -3,7 +3,6 @@
 
 . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
 
-TID="loop_03"
 ERR_CODE=0
 
 if ! _have_program fio; then
diff --git a/tools/testing/selftests/ublk/test_loop_04.sh b/tools/testing/selftests/ublk/test_loop_04.sh
index b01d75b3214d..9f6774ec0de6 100755
--- a/tools/testing/selftests/ublk/test_loop_04.sh
+++ b/tools/testing/selftests/ublk/test_loop_04.sh
@@ -3,7 +3,6 @@
 
 . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
 
-TID="loop_04"
 ERR_CODE=0
 
 _prep_test "loop" "mkfs & mount & umount with zero copy"
diff --git a/tools/testing/selftests/ublk/test_loop_05.sh b/tools/testing/selftests/ublk/test_loop_05.sh
index de2141533074..2b8d99e007be 100755
--- a/tools/testing/selftests/ublk/test_loop_05.sh
+++ b/tools/testing/selftests/ublk/test_loop_05.sh
@@ -3,7 +3,6 @@
 
 . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
 
-TID="loop_05"
 ERR_CODE=0
 
 if ! _have_program fio; then
diff --git a/tools/testing/selftests/ublk/test_loop_06.sh b/tools/testing/selftests/ublk/test_loop_06.sh
index 1d1a8a725502..e73f6f4844db 100755
--- a/tools/testing/selftests/ublk/test_loop_06.sh
+++ b/tools/testing/selftests/ublk/test_loop_06.sh
@@ -3,7 +3,6 @@
 
 . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
 
-TID="loop_06"
 ERR_CODE=0
 
 if ! _have_program fio; then
diff --git a/tools/testing/selftests/ublk/test_loop_07.sh b/tools/testing/selftests/ublk/test_loop_07.sh
index 493f3fb611a5..264d20e7c530 100755
--- a/tools/testing/selftests/ublk/test_loop_07.sh
+++ b/tools/testing/selftests/ublk/test_loop_07.sh
@@ -3,7 +3,6 @@
 
 . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
 
-TID="loop_07"
 ERR_CODE=0
 
 _prep_test "loop" "mkfs & mount & umount with user copy"
diff --git a/tools/testing/selftests/ublk/test_loop_08.sh b/tools/testing/selftests/ublk/test_loop_08.sh
index ca289cfb2ad4..2caa7ba748fb 100755
--- a/tools/testing/selftests/ublk/test_loop_08.sh
+++ b/tools/testing/selftests/ublk/test_loop_08.sh
@@ -13,7 +13,6 @@ if [[ "$fio_version" =~ fio-[0-9]+\.[0-9]+$ ]]; then
 	exit $UBLK_SKIP_CODE
 fi
 
-TID=loop_08
 
 _prep_test "loop" "end-to-end integrity"
 
diff --git a/tools/testing/selftests/ublk/test_null_01.sh b/tools/testing/selftests/ublk/test_null_01.sh
index c2cb8f7a09fe..eebce8076530 100755
--- a/tools/testing/selftests/ublk/test_null_01.sh
+++ b/tools/testing/selftests/ublk/test_null_01.sh
@@ -3,7 +3,6 @@
 
 . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
 
-TID="null_01"
 ERR_CODE=0
 
 if ! _have_program fio; then
diff --git a/tools/testing/selftests/ublk/test_null_02.sh b/tools/testing/selftests/ublk/test_null_02.sh
index 8accd35beb55..654bdff39664 100755
--- a/tools/testing/selftests/ublk/test_null_02.sh
+++ b/tools/testing/selftests/ublk/test_null_02.sh
@@ -3,7 +3,6 @@
 
 . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
 
-TID="null_02"
 ERR_CODE=0
 
 if ! _have_program fio; then
diff --git a/tools/testing/selftests/ublk/test_null_03.sh b/tools/testing/selftests/ublk/test_null_03.sh
index 0051067b4686..29cd09f06672 100755
--- a/tools/testing/selftests/ublk/test_null_03.sh
+++ b/tools/testing/selftests/ublk/test_null_03.sh
@@ -3,7 +3,6 @@
 
 . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
 
-TID="null_03"
 ERR_CODE=0
 
 if ! _have_program fio; then
diff --git a/tools/testing/selftests/ublk/test_null_04.sh b/tools/testing/selftests/ublk/test_null_04.sh
index 0b0719ea33a3..7491b8c17f00 100755
--- a/tools/testing/selftests/ublk/test_null_04.sh
+++ b/tools/testing/selftests/ublk/test_null_04.sh
@@ -3,7 +3,6 @@
 
 . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
 
-TID=null_04
 
 _prep_test "null" "integrity params"
 
diff --git a/tools/testing/selftests/ublk/test_stress_01.sh b/tools/testing/selftests/ublk/test_stress_01.sh
index 7d3150f057d4..a9322ce496e9 100755
--- a/tools/testing/selftests/ublk/test_stress_01.sh
+++ b/tools/testing/selftests/ublk/test_stress_01.sh
@@ -2,7 +2,6 @@
 # SPDX-License-Identifier: GPL-2.0
 
 . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
-TID="stress_01"
 ERR_CODE=0
 
 ublk_io_and_remove()
diff --git a/tools/testing/selftests/ublk/test_stress_02.sh b/tools/testing/selftests/ublk/test_stress_02.sh
index 4bdd921081e5..6c114194f9c9 100755
--- a/tools/testing/selftests/ublk/test_stress_02.sh
+++ b/tools/testing/selftests/ublk/test_stress_02.sh
@@ -2,7 +2,6 @@
 # SPDX-License-Identifier: GPL-2.0
 
 . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
-TID="stress_02"
 ERR_CODE=0
 
 if ! _have_program fio; then
diff --git a/tools/testing/selftests/ublk/test_stress_03.sh b/tools/testing/selftests/ublk/test_stress_03.sh
index 3ed4c9b2d8c0..4e81ca0db758 100755
--- a/tools/testing/selftests/ublk/test_stress_03.sh
+++ b/tools/testing/selftests/ublk/test_stress_03.sh
@@ -2,7 +2,6 @@
 # SPDX-License-Identifier: GPL-2.0
 
 . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
-TID="stress_03"
 ERR_CODE=0
 
 ublk_io_and_remove()
diff --git a/tools/testing/selftests/ublk/test_stress_04.sh b/tools/testing/selftests/ublk/test_stress_04.sh
index efa8dc33234b..6c6f44b172bc 100755
--- a/tools/testing/selftests/ublk/test_stress_04.sh
+++ b/tools/testing/selftests/ublk/test_stress_04.sh
@@ -2,7 +2,6 @@
 # SPDX-License-Identifier: GPL-2.0
 
 . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
-TID="stress_04"
 ERR_CODE=0
 
 ublk_io_and_kill_daemon()
diff --git a/tools/testing/selftests/ublk/test_stress_05.sh b/tools/testing/selftests/ublk/test_stress_05.sh
index 68a194144302..7e9324de2030 100755
--- a/tools/testing/selftests/ublk/test_stress_05.sh
+++ b/tools/testing/selftests/ublk/test_stress_05.sh
@@ -2,7 +2,6 @@
 # SPDX-License-Identifier: GPL-2.0
 
 . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
-TID="stress_05"
 ERR_CODE=0
 
 if ! _have_program fio; then
diff --git a/tools/testing/selftests/ublk/test_stress_06.sh b/tools/testing/selftests/ublk/test_stress_06.sh
index 37188ec2e1f7..c72e5d0b14be 100755
--- a/tools/testing/selftests/ublk/test_stress_06.sh
+++ b/tools/testing/selftests/ublk/test_stress_06.sh
@@ -2,7 +2,6 @@
 # SPDX-License-Identifier: GPL-2.0
 
 . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
-TID="stress_06"
 ERR_CODE=0
 
 ublk_io_and_remove()
diff --git a/tools/testing/selftests/ublk/test_stress_07.sh b/tools/testing/selftests/ublk/test_stress_07.sh
index fb061fc26d36..04c2764d5238 100755
--- a/tools/testing/selftests/ublk/test_stress_07.sh
+++ b/tools/testing/selftests/ublk/test_stress_07.sh
@@ -2,7 +2,6 @@
 # SPDX-License-Identifier: GPL-2.0
 
 . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
-TID="stress_07"
 ERR_CODE=0
 
 ublk_io_and_kill_daemon()
diff --git a/tools/testing/selftests/ublk/test_stress_08.sh b/tools/testing/selftests/ublk/test_stress_08.sh
index 9abb50ee3d00..37f7d204879a 100755
--- a/tools/testing/selftests/ublk/test_stress_08.sh
+++ b/tools/testing/selftests/ublk/test_stress_08.sh
@@ -2,7 +2,6 @@
 # SPDX-License-Identifier: GPL-2.0
 
 . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
-TID="stress_08"
 ERR_CODE=0
 
 ublk_io_and_remove()
diff --git a/tools/testing/selftests/ublk/test_stress_09.sh b/tools/testing/selftests/ublk/test_stress_09.sh
index 87b92b0a2410..53c1e3b2ab30 100755
--- a/tools/testing/selftests/ublk/test_stress_09.sh
+++ b/tools/testing/selftests/ublk/test_stress_09.sh
@@ -2,7 +2,6 @@
 # SPDX-License-Identifier: GPL-2.0
 
 . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
-TID="stress_09"
 ERR_CODE=0
 
 ublk_io_and_kill_daemon()
diff --git a/tools/testing/selftests/ublk/test_stripe_01.sh b/tools/testing/selftests/ublk/test_stripe_01.sh
index 4e4f0fdf3c9b..3bc821aadad8 100755
--- a/tools/testing/selftests/ublk/test_stripe_01.sh
+++ b/tools/testing/selftests/ublk/test_stripe_01.sh
@@ -3,7 +3,6 @@
 
 . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
 
-TID="stripe_01"
 ERR_CODE=0
 
 if ! _have_program fio; then
diff --git a/tools/testing/selftests/ublk/test_stripe_02.sh b/tools/testing/selftests/ublk/test_stripe_02.sh
index 5820ab2efba4..4a7d2b21a6bf 100755
--- a/tools/testing/selftests/ublk/test_stripe_02.sh
+++ b/tools/testing/selftests/ublk/test_stripe_02.sh
@@ -3,7 +3,6 @@
 
 . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
 
-TID="stripe_02"
 ERR_CODE=0
 
 _prep_test "stripe" "mkfs & mount & umount"
diff --git a/tools/testing/selftests/ublk/test_stripe_03.sh b/tools/testing/selftests/ublk/test_stripe_03.sh
index 20b977e27814..a1c159d54e53 100755
--- a/tools/testing/selftests/ublk/test_stripe_03.sh
+++ b/tools/testing/selftests/ublk/test_stripe_03.sh
@@ -3,7 +3,6 @@
 
 . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
 
-TID="stripe_03"
 ERR_CODE=0
 
 if ! _have_program fio; then
diff --git a/tools/testing/selftests/ublk/test_stripe_04.sh b/tools/testing/selftests/ublk/test_stripe_04.sh
index 1b51ed2f1d84..0c30bd6c2b3b 100755
--- a/tools/testing/selftests/ublk/test_stripe_04.sh
+++ b/tools/testing/selftests/ublk/test_stripe_04.sh
@@ -3,7 +3,6 @@
 
 . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
 
-TID="stripe_04"
 ERR_CODE=0
 
 _prep_test "stripe" "mkfs & mount & umount on zero copy"
diff --git a/tools/testing/selftests/ublk/test_stripe_05.sh b/tools/testing/selftests/ublk/test_stripe_05.sh
index 05d71951d710..6ddfa88ad226 100755
--- a/tools/testing/selftests/ublk/test_stripe_05.sh
+++ b/tools/testing/selftests/ublk/test_stripe_05.sh
@@ -3,7 +3,6 @@
 
 . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
 
-TID="stripe_05"
 ERR_CODE=0
 
 if ! _have_program fio; then
diff --git a/tools/testing/selftests/ublk/test_stripe_06.sh b/tools/testing/selftests/ublk/test_stripe_06.sh
index d06cac7626e2..a2c7bf4cc613 100755
--- a/tools/testing/selftests/ublk/test_stripe_06.sh
+++ b/tools/testing/selftests/ublk/test_stripe_06.sh
@@ -3,7 +3,6 @@
 
 . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
 
-TID="stripe_06"
 ERR_CODE=0
 
 _prep_test "stripe" "mkfs & mount & umount on user copy"
-- 
cgit v1.2.3


From e07a2039b6d4ae3acf8ae39b86be449b7fa18d4a Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Fri, 30 Jan 2026 00:19:53 +0800
Subject: selftests: ublk: add selftest for UBLK_F_NO_AUTO_PART_SCAN

Add test_part_01.sh to test the UBLK_F_NO_AUTO_PART_SCAN feature
flag which allows suppressing automatic partition scanning during
device startup while still allowing manual partition probing.

The test verifies:
- Normal behavior: partitions are auto-detected without the flag
- With flag: partitions are not auto-detected during START_DEV
- Manual scan: blockdev --rereadpt works with the flag

Also update kublk tool to support --no_auto_part_scan option and
recognize the feature flag.

Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 tools/testing/selftests/ublk/Makefile        |   2 +
 tools/testing/selftests/ublk/kublk.c         |   6 +-
 tools/testing/selftests/ublk/kublk.h         |   3 +-
 tools/testing/selftests/ublk/test_part_01.sh | 104 +++++++++++++++++++++++++++
 4 files changed, 113 insertions(+), 2 deletions(-)
 create mode 100755 tools/testing/selftests/ublk/test_part_01.sh

(limited to 'tools')

diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile
index e39a6f871fcc..bc5bd7d1381d 100644
--- a/tools/testing/selftests/ublk/Makefile
+++ b/tools/testing/selftests/ublk/Makefile
@@ -48,6 +48,8 @@ TEST_PROGS += test_stripe_04.sh
 TEST_PROGS += test_stripe_05.sh
 TEST_PROGS += test_stripe_06.sh
 
+TEST_PROGS += test_part_01.sh
+
 TEST_PROGS += test_stress_01.sh
 TEST_PROGS += test_stress_02.sh
 TEST_PROGS += test_stress_03.sh
diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c
index 2da37557e1a9..e8279c4acc40 100644
--- a/tools/testing/selftests/ublk/kublk.c
+++ b/tools/testing/selftests/ublk/kublk.c
@@ -1615,6 +1615,7 @@ static int cmd_dev_get_features(void)
 		FEAT_NAME(UBLK_F_INTEGRITY),
 		FEAT_NAME(UBLK_F_SAFE_STOP_DEV),
 		FEAT_NAME(UBLK_F_BATCH_IO),
+		FEAT_NAME(UBLK_F_NO_AUTO_PART_SCAN),
 	};
 	struct ublk_dev *dev;
 	__u64 features = 0;
@@ -1712,7 +1713,7 @@ static void __cmd_create_help(char *exe, bool recovery)
 	printf("\t[--nthreads threads] [--per_io_tasks]\n");
 	printf("\t[--integrity_capable] [--integrity_reftag] [--metadata_size SIZE] "
 		 "[--pi_offset OFFSET] [--csum_type ip|t10dif|nvme] [--tag_size SIZE]\n");
-	printf("\t[--batch|-b]\n");
+	printf("\t[--batch|-b] [--no_auto_part_scan]\n");
 	printf("\t[target options] [backfile1] [backfile2] ...\n");
 	printf("\tdefault: nr_queues=2(max 32), depth=128(max 1024), dev_id=-1(auto allocation)\n");
 	printf("\tdefault: nthreads=nr_queues");
@@ -1786,6 +1787,7 @@ int main(int argc, char *argv[])
 		{ "tag_size",		1,	NULL,  0 },
 		{ "safe",		0,	NULL,  0 },
 		{ "batch",              0,      NULL, 'b'},
+		{ "no_auto_part_scan",	0,	NULL,  0 },
 		{ 0, 0, 0, 0 }
 	};
 	const struct ublk_tgt_ops *ops = NULL;
@@ -1898,6 +1900,8 @@ int main(int argc, char *argv[])
 				ctx.tag_size = strtoul(optarg, NULL, 0);
 			if (!strcmp(longopts[option_idx].name, "safe"))
 				ctx.safe_stop = 1;
+			if (!strcmp(longopts[option_idx].name, "no_auto_part_scan"))
+				ctx.flags |= UBLK_F_NO_AUTO_PART_SCAN;
 			break;
 		case '?':
 			/*
diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h
index ca97deb5e208..1faeccaaecae 100644
--- a/tools/testing/selftests/ublk/kublk.h
+++ b/tools/testing/selftests/ublk/kublk.h
@@ -78,12 +78,13 @@ struct dev_ctx {
 	unsigned int	auto_zc_fallback:1;
 	unsigned int	per_io_tasks:1;
 	unsigned int	no_ublk_fixed_fd:1;
+	unsigned int	safe_stop:1;
+	unsigned int	no_auto_part_scan:1;
 	__u32 integrity_flags;
 	__u8 metadata_size;
 	__u8 pi_offset;
 	__u8 csum_type;
 	__u8 tag_size;
-	unsigned int	safe_stop:1;
 
 	int _evtfd;
 	int _shmid;
diff --git a/tools/testing/selftests/ublk/test_part_01.sh b/tools/testing/selftests/ublk/test_part_01.sh
new file mode 100755
index 000000000000..8028f6e4b3a5
--- /dev/null
+++ b/tools/testing/selftests/ublk/test_part_01.sh
@@ -0,0 +1,104 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
+
+ERR_CODE=0
+
+format_backing_file()
+{
+	local backing_file=$1
+
+	# Create ublk device to write partition table
+	local tmp_dev=$(_add_ublk_dev -t loop "${backing_file}")
+	[ $? -ne 0 ] && return 1
+
+	# Write partition table with sfdisk
+	sfdisk /dev/ublkb"${tmp_dev}" > /dev/null 2>&1 <<EOF
+label: dos
+start=2048, size=100MiB, type=83
+start=206848, size=100MiB, type=83
+EOF
+	local ret=$?
+
+	"${UBLK_PROG}" del -n "${tmp_dev}"
+
+	return $ret
+}
+
+test_auto_part_scan()
+{
+	local backing_file=$1
+
+	# Create device WITHOUT --no_auto_part_scan
+	local dev_id=$(_add_ublk_dev -t loop "${backing_file}")
+	[ $? -ne 0 ] && return 1
+
+	udevadm settle
+
+	# Partitions should be auto-detected
+	if [ ! -e /dev/ublkb"${dev_id}"p1 ] || [ ! -e /dev/ublkb"${dev_id}"p2 ]; then
+		"${UBLK_PROG}" del -n "${dev_id}"
+		return 1
+	fi
+
+	"${UBLK_PROG}" del -n "${dev_id}"
+	return 0
+}
+
+test_no_auto_part_scan()
+{
+	local backing_file=$1
+
+	# Create device WITH --no_auto_part_scan
+	local dev_id=$(_add_ublk_dev -t loop --no_auto_part_scan "${backing_file}")
+	[ $? -ne 0 ] && return 1
+
+	udevadm settle
+
+	# Partitions should NOT be auto-detected
+	if [ -e /dev/ublkb"${dev_id}"p1 ]; then
+		"${UBLK_PROG}" del -n "${dev_id}"
+		return 1
+	fi
+
+	# Manual scan should work
+	blockdev --rereadpt /dev/ublkb"${dev_id}" > /dev/null 2>&1
+	udevadm settle
+
+	if [ ! -e /dev/ublkb"${dev_id}"p1 ] || [ ! -e /dev/ublkb"${dev_id}"p2 ]; then
+		"${UBLK_PROG}" del -n "${dev_id}"
+		return 1
+	fi
+
+	"${UBLK_PROG}" del -n "${dev_id}"
+	return 0
+}
+
+if ! _have_program sfdisk || ! _have_program blockdev; then
+	exit "$UBLK_SKIP_CODE"
+fi
+
+_prep_test "generic" "test UBLK_F_NO_AUTO_PART_SCAN"
+
+if ! _have_feature "UBLK_F_NO_AUTO_PART_SCAN"; then
+	_cleanup_test "generic"
+	exit "$UBLK_SKIP_CODE"
+fi
+
+
+# Create and format backing file with partition table
+_create_backfile 0 256M
+format_backing_file "${UBLK_BACKFILES[0]}"
+[ $? -ne 0 ] && ERR_CODE=255
+
+# Test normal auto partition scan
+[ "$ERR_CODE" -eq 0 ] && test_auto_part_scan "${UBLK_BACKFILES[0]}"
+[ $? -ne 0 ] && ERR_CODE=255
+
+# Test no auto partition scan with manual scan
+[ "$ERR_CODE" -eq 0 ] && test_no_auto_part_scan "${UBLK_BACKFILES[0]}"
+[ $? -ne 0 ] && ERR_CODE=255
+
+_cleanup_test "generic"
+_show_result $TID $ERR_CODE
-- 
cgit v1.2.3


From 7a30d3dfea4a455d1109d5258fe332f2157071ba Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Fri, 30 Jan 2026 00:19:54 +0800
Subject: selftests: ublk: rename test_generic_15 to test_part_02

This test exercises partition scanning behavior, so move it to
the test_part_* group for consistency.

Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 tools/testing/selftests/ublk/Makefile           |  2 +-
 tools/testing/selftests/ublk/test_generic_15.sh | 67 -------------------------
 tools/testing/selftests/ublk/test_part_02.sh    | 67 +++++++++++++++++++++++++
 3 files changed, 68 insertions(+), 68 deletions(-)
 delete mode 100755 tools/testing/selftests/ublk/test_generic_15.sh
 create mode 100755 tools/testing/selftests/ublk/test_part_02.sh

(limited to 'tools')

diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile
index bc5bd7d1381d..ca8588ed962c 100644
--- a/tools/testing/selftests/ublk/Makefile
+++ b/tools/testing/selftests/ublk/Makefile
@@ -22,7 +22,6 @@ TEST_PROGS += test_generic_11.sh
 TEST_PROGS += test_generic_12.sh
 TEST_PROGS += test_generic_13.sh
 TEST_PROGS += test_generic_14.sh
-TEST_PROGS += test_generic_15.sh
 TEST_PROGS += test_generic_16.sh
 
 TEST_PROGS += test_batch_01.sh
@@ -49,6 +48,7 @@ TEST_PROGS += test_stripe_05.sh
 TEST_PROGS += test_stripe_06.sh
 
 TEST_PROGS += test_part_01.sh
+TEST_PROGS += test_part_02.sh
 
 TEST_PROGS += test_stress_01.sh
 TEST_PROGS += test_stress_02.sh
diff --git a/tools/testing/selftests/ublk/test_generic_15.sh b/tools/testing/selftests/ublk/test_generic_15.sh
deleted file mode 100755
index 727d0f4610d6..000000000000
--- a/tools/testing/selftests/ublk/test_generic_15.sh
+++ /dev/null
@@ -1,67 +0,0 @@
-#!/bin/bash
-# SPDX-License-Identifier: GPL-2.0
-
-. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
-
-ERR_CODE=0
-
-_test_partition_scan_no_hang()
-{
-	local recovery_flag=$1
-	local expected_state=$2
-	local dev_id
-	local state
-	local daemon_pid
-	local start_time
-	local elapsed
-
-	# Create ublk device with fault_inject target and very large delay
-	# to simulate hang during partition table read
-	# --delay_us 60000000 = 60 seconds delay
-	# Use _add_ublk_dev_no_settle to avoid udevadm settle hang waiting
-	# for partition scan events to complete
-	if [ "$recovery_flag" = "yes" ]; then
-		echo "Testing partition scan with recovery support..."
-		dev_id=$(_add_ublk_dev_no_settle -t fault_inject -q 1 -d 1 --delay_us 60000000 -r 1)
-	else
-		echo "Testing partition scan without recovery..."
-		dev_id=$(_add_ublk_dev_no_settle -t fault_inject -q 1 -d 1 --delay_us 60000000)
-	fi
-
-	_check_add_dev "$TID" $?
-
-	# The add command should return quickly because partition scan is async.
-	# Now sleep briefly to let the async partition scan work start and hit
-	# the delay in the fault_inject handler.
-	sleep 1
-
-	# Kill the ublk daemon while partition scan is potentially blocked
-	# And check state transitions properly
-	start_time=${SECONDS}
-	daemon_pid=$(_get_ublk_daemon_pid "${dev_id}")
-	state=$(__ublk_kill_daemon "${dev_id}" "${expected_state}")
-	elapsed=$((SECONDS - start_time))
-
-	# Verify the device transitioned to expected state
-	if [ "$state" != "${expected_state}" ]; then
-		echo "FAIL: Device state is $state, expected ${expected_state}"
-		ERR_CODE=255
-		${UBLK_PROG} del -n "${dev_id}" > /dev/null 2>&1
-		return
-	fi
-	echo "PASS: Device transitioned to ${expected_state} in ${elapsed}s without hanging"
-
-	# Clean up the device
-	${UBLK_PROG} del -n "${dev_id}" > /dev/null 2>&1
-}
-
-_prep_test "partition_scan" "verify async partition scan prevents IO hang"
-
-# Test 1: Without recovery support - should transition to DEAD
-_test_partition_scan_no_hang "no" "DEAD"
-
-# Test 2: With recovery support - should transition to QUIESCED
-_test_partition_scan_no_hang "yes" "QUIESCED"
-
-_cleanup_test "partition_scan"
-_show_result $TID $ERR_CODE
diff --git a/tools/testing/selftests/ublk/test_part_02.sh b/tools/testing/selftests/ublk/test_part_02.sh
new file mode 100755
index 000000000000..727d0f4610d6
--- /dev/null
+++ b/tools/testing/selftests/ublk/test_part_02.sh
@@ -0,0 +1,67 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
+
+ERR_CODE=0
+
+_test_partition_scan_no_hang()
+{
+	local recovery_flag=$1
+	local expected_state=$2
+	local dev_id
+	local state
+	local daemon_pid
+	local start_time
+	local elapsed
+
+	# Create ublk device with fault_inject target and very large delay
+	# to simulate hang during partition table read
+	# --delay_us 60000000 = 60 seconds delay
+	# Use _add_ublk_dev_no_settle to avoid udevadm settle hang waiting
+	# for partition scan events to complete
+	if [ "$recovery_flag" = "yes" ]; then
+		echo "Testing partition scan with recovery support..."
+		dev_id=$(_add_ublk_dev_no_settle -t fault_inject -q 1 -d 1 --delay_us 60000000 -r 1)
+	else
+		echo "Testing partition scan without recovery..."
+		dev_id=$(_add_ublk_dev_no_settle -t fault_inject -q 1 -d 1 --delay_us 60000000)
+	fi
+
+	_check_add_dev "$TID" $?
+
+	# The add command should return quickly because partition scan is async.
+	# Now sleep briefly to let the async partition scan work start and hit
+	# the delay in the fault_inject handler.
+	sleep 1
+
+	# Kill the ublk daemon while partition scan is potentially blocked
+	# And check state transitions properly
+	start_time=${SECONDS}
+	daemon_pid=$(_get_ublk_daemon_pid "${dev_id}")
+	state=$(__ublk_kill_daemon "${dev_id}" "${expected_state}")
+	elapsed=$((SECONDS - start_time))
+
+	# Verify the device transitioned to expected state
+	if [ "$state" != "${expected_state}" ]; then
+		echo "FAIL: Device state is $state, expected ${expected_state}"
+		ERR_CODE=255
+		${UBLK_PROG} del -n "${dev_id}" > /dev/null 2>&1
+		return
+	fi
+	echo "PASS: Device transitioned to ${expected_state} in ${elapsed}s without hanging"
+
+	# Clean up the device
+	${UBLK_PROG} del -n "${dev_id}" > /dev/null 2>&1
+}
+
+_prep_test "partition_scan" "verify async partition scan prevents IO hang"
+
+# Test 1: Without recovery support - should transition to DEAD
+_test_partition_scan_no_hang "no" "DEAD"
+
+# Test 2: With recovery support - should transition to QUIESCED
+_test_partition_scan_no_hang "yes" "QUIESCED"
+
+_cleanup_test "partition_scan"
+_show_result $TID $ERR_CODE
-- 
cgit v1.2.3


From 130975353b1548d76aa9790a4ac7e74bd2a37221 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Fri, 30 Jan 2026 00:19:55 +0800
Subject: selftests: ublk: refactor test_null_04 into separate functions

Encapsulate each test case in its own function that creates the
device, runs checks, and deletes only that device. This avoids
calling _cleanup_test multiple times.

Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 tools/testing/selftests/ublk/test_null_04.sh | 248 ++++++++++-----------------
 1 file changed, 94 insertions(+), 154 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/ublk/test_null_04.sh b/tools/testing/selftests/ublk/test_null_04.sh
index 7491b8c17f00..22328e0f3925 100755
--- a/tools/testing/selftests/ublk/test_null_04.sh
+++ b/tools/testing/selftests/ublk/test_null_04.sh
@@ -3,163 +3,103 @@
 
 . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
 
+ERR_CODE=0
 
-_prep_test "null" "integrity params"
+_check_value() {
+	local name=$1
+	local actual=$2
+	local expected=$3
 
-dev_id=$(_add_ublk_dev -t null -u --metadata_size 8)
-_check_add_dev $TID $?
-metadata_size=$(_get_metadata_size "$dev_id" metadata_size)
-if [ "$metadata_size" != 8 ]; then
-	echo "metadata_size $metadata_size != 8"
-	_show_result $TID 255
-fi
-pi_offset=$(_get_metadata_size "$dev_id" pi_offset)
-if [ "$pi_offset" != 0 ]; then
-	echo "pi_offset $pi_offset != 0"
-	_show_result $TID 255
-fi
-pi_tuple_size=$(_get_metadata_size "$dev_id" pi_tuple_size)
-if [ "$pi_tuple_size" != 0 ]; then
-	echo "pi_tuple_size $pi_tuple_size != 0"
-	_show_result $TID 255
-fi
-capable=$(cat "/sys/block/ublkb$dev_id/integrity/device_is_integrity_capable")
-if [ "$capable" != 0 ]; then
-	echo "device_is_integrity_capable $capable != 0"
-	_show_result $TID 255
-fi
-format=$(cat "/sys/block/ublkb$dev_id/integrity/format")
-if [ "$format" != nop ]; then
-	echo "format $format != nop"
-	_show_result $TID 255
-fi
-protection_interval_bytes=$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes")
-if [ "$protection_interval_bytes" != 512 ]; then
-	echo "protection_interval_bytes $protection_interval_bytes != 512"
-	_show_result $TID 255
-fi
-tag_size=$(cat "/sys/block/ublkb$dev_id/integrity/tag_size")
-if [ "$tag_size" != 0 ]; then
-	echo "tag_size $tag_size != 0"
-	_show_result $TID 255
-fi
-_cleanup_test
+	if [ "$actual" != "$expected" ]; then
+		echo "$name $actual != $expected"
+		ERR_CODE=255
+		return 1
+	fi
+	return 0
+}
 
-dev_id=$(_add_ublk_dev -t null -u --integrity_capable --metadata_size 64 --pi_offset 56 --csum_type ip)
-_check_add_dev $TID $?
-metadata_size=$(_get_metadata_size "$dev_id" metadata_size)
-if [ "$metadata_size" != 64 ]; then
-	echo "metadata_size $metadata_size != 64"
-	_show_result $TID 255
-fi
-pi_offset=$(_get_metadata_size "$dev_id" pi_offset)
-if [ "$pi_offset" != 56 ]; then
-	echo "pi_offset $pi_offset != 56"
-	_show_result $TID 255
-fi
-pi_tuple_size=$(_get_metadata_size "$dev_id" pi_tuple_size)
-if [ "$pi_tuple_size" != 8 ]; then
-	echo "pi_tuple_size $pi_tuple_size != 8"
-	_show_result $TID 255
-fi
-capable=$(cat "/sys/block/ublkb$dev_id/integrity/device_is_integrity_capable")
-if [ "$capable" != 1 ]; then
-	echo "device_is_integrity_capable $capable != 1"
-	_show_result $TID 255
-fi
-format=$(cat "/sys/block/ublkb$dev_id/integrity/format")
-if [ "$format" != T10-DIF-TYPE3-IP ]; then
-	echo "format $format != T10-DIF-TYPE3-IP"
-	_show_result $TID 255
-fi
-protection_interval_bytes=$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes")
-if [ "$protection_interval_bytes" != 512 ]; then
-	echo "protection_interval_bytes $protection_interval_bytes != 512"
-	_show_result $TID 255
-fi
-tag_size=$(cat "/sys/block/ublkb$dev_id/integrity/tag_size")
-if [ "$tag_size" != 0 ]; then
-	echo "tag_size $tag_size != 0"
-	_show_result $TID 255
-fi
-_cleanup_test
+_test_metadata_only() {
+	local dev_id
 
-dev_id=$(_add_ublk_dev -t null -u --integrity_reftag --metadata_size 8 --csum_type t10dif)
-_check_add_dev $TID $?
-metadata_size=$(_get_metadata_size "$dev_id" metadata_size)
-if [ "$metadata_size" != 8 ]; then
-	echo "metadata_size $metadata_size != 8"
-	_show_result $TID 255
-fi
-pi_offset=$(_get_metadata_size "$dev_id" pi_offset)
-if [ "$pi_offset" != 0 ]; then
-	echo "pi_offset $pi_offset != 0"
-	_show_result $TID 255
-fi
-pi_tuple_size=$(_get_metadata_size "$dev_id" pi_tuple_size)
-if [ "$pi_tuple_size" != 8 ]; then
-	echo "pi_tuple_size $pi_tuple_size != 8"
-	_show_result $TID 255
-fi
-capable=$(cat "/sys/block/ublkb$dev_id/integrity/device_is_integrity_capable")
-if [ "$capable" != 0 ]; then
-	echo "device_is_integrity_capable $capable != 0"
-	_show_result $TID 255
-fi
-format=$(cat "/sys/block/ublkb$dev_id/integrity/format")
-if [ "$format" != T10-DIF-TYPE1-CRC ]; then
-	echo "format $format != T10-DIF-TYPE1-CRC"
-	_show_result $TID 255
-fi
-protection_interval_bytes=$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes")
-if [ "$protection_interval_bytes" != 512 ]; then
-	echo "protection_interval_bytes $protection_interval_bytes != 512"
-	_show_result $TID 255
-fi
-tag_size=$(cat "/sys/block/ublkb$dev_id/integrity/tag_size")
-if [ "$tag_size" != 0 ]; then
-	echo "tag_size $tag_size != 0"
-	_show_result $TID 255
-fi
-_cleanup_test
+	dev_id=$(_add_ublk_dev -t null -u --metadata_size 8)
+	_check_add_dev "$TID" $?
 
-dev_id=$(_add_ublk_dev -t null -u --metadata_size 16 --csum_type nvme --tag_size 8)
-_check_add_dev $TID $?
-metadata_size=$(_get_metadata_size "$dev_id" metadata_size)
-if [ "$metadata_size" != 16 ]; then
-	echo "metadata_size $metadata_size != 16"
-	_show_result $TID 255
-fi
-pi_offset=$(_get_metadata_size "$dev_id" pi_offset)
-if [ "$pi_offset" != 0 ]; then
-	echo "pi_offset $pi_offset != 0"
-	_show_result $TID 255
-fi
-pi_tuple_size=$(_get_metadata_size "$dev_id" pi_tuple_size)
-if [ "$pi_tuple_size" != 16 ]; then
-	echo "pi_tuple_size $pi_tuple_size != 16"
-	_show_result $TID 255
-fi
-capable=$(cat "/sys/block/ublkb$dev_id/integrity/device_is_integrity_capable")
-if [ "$capable" != 0 ]; then
-	echo "device_is_integrity_capable $capable != 0"
-	_show_result $TID 255
-fi
-format=$(cat "/sys/block/ublkb$dev_id/integrity/format")
-if [ "$format" != EXT-DIF-TYPE3-CRC64 ]; then
-	echo "format $format != EXT-DIF-TYPE3-CRC64"
-	_show_result $TID 255
-fi
-protection_interval_bytes=$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes")
-if [ "$protection_interval_bytes" != 512 ]; then
-	echo "protection_interval_bytes $protection_interval_bytes != 512"
-	_show_result $TID 255
-fi
-tag_size=$(cat "/sys/block/ublkb$dev_id/integrity/tag_size")
-if [ "$tag_size" != 8 ]; then
-	echo "tag_size $tag_size != 8"
-	_show_result $TID 255
-fi
-_cleanup_test
+	_check_value "metadata_size" "$(_get_metadata_size "$dev_id" metadata_size)" 8 &&
+	_check_value "pi_offset" "$(_get_metadata_size "$dev_id" pi_offset)" 0 &&
+	_check_value "pi_tuple_size" "$(_get_metadata_size "$dev_id" pi_tuple_size)" 0 &&
+	_check_value "device_is_integrity_capable" \
+		"$(cat "/sys/block/ublkb$dev_id/integrity/device_is_integrity_capable")" 0 &&
+	_check_value "format" "$(cat "/sys/block/ublkb$dev_id/integrity/format")" nop &&
+	_check_value "protection_interval_bytes" \
+		"$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes")" 512 &&
+	_check_value "tag_size" "$(cat "/sys/block/ublkb$dev_id/integrity/tag_size")" 0
+
+	${UBLK_PROG} del -n "${dev_id}"
+}
+
+_test_integrity_capable_ip() {
+	local dev_id
+
+	dev_id=$(_add_ublk_dev -t null -u --integrity_capable --metadata_size 64 --pi_offset 56 --csum_type ip)
+	_check_add_dev "$TID" $?
+
+	_check_value "metadata_size" "$(_get_metadata_size "$dev_id" metadata_size)" 64 &&
+	_check_value "pi_offset" "$(_get_metadata_size "$dev_id" pi_offset)" 56 &&
+	_check_value "pi_tuple_size" "$(_get_metadata_size "$dev_id" pi_tuple_size)" 8 &&
+	_check_value "device_is_integrity_capable" \
+		"$(cat "/sys/block/ublkb$dev_id/integrity/device_is_integrity_capable")" 1 &&
+	_check_value "format" "$(cat "/sys/block/ublkb$dev_id/integrity/format")" T10-DIF-TYPE3-IP &&
+	_check_value "protection_interval_bytes" \
+		"$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes")" 512 &&
+	_check_value "tag_size" "$(cat "/sys/block/ublkb$dev_id/integrity/tag_size")" 0
+
+	${UBLK_PROG} del -n "${dev_id}"
+}
+
+_test_integrity_reftag_t10dif() {
+	local dev_id
 
-_show_result $TID 0
+	dev_id=$(_add_ublk_dev -t null -u --integrity_reftag --metadata_size 8 --csum_type t10dif)
+	_check_add_dev "$TID" $?
+
+	_check_value "metadata_size" "$(_get_metadata_size "$dev_id" metadata_size)" 8 &&
+	_check_value "pi_offset" "$(_get_metadata_size "$dev_id" pi_offset)" 0 &&
+	_check_value "pi_tuple_size" "$(_get_metadata_size "$dev_id" pi_tuple_size)" 8 &&
+	_check_value "device_is_integrity_capable" \
+		"$(cat "/sys/block/ublkb$dev_id/integrity/device_is_integrity_capable")" 0 &&
+	_check_value "format" "$(cat "/sys/block/ublkb$dev_id/integrity/format")" T10-DIF-TYPE1-CRC &&
+	_check_value "protection_interval_bytes" \
+		"$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes")" 512 &&
+	_check_value "tag_size" "$(cat "/sys/block/ublkb$dev_id/integrity/tag_size")" 0
+
+	${UBLK_PROG} del -n "${dev_id}"
+}
+
+_test_nvme_csum() {
+	local dev_id
+
+	dev_id=$(_add_ublk_dev -t null -u --metadata_size 16 --csum_type nvme --tag_size 8)
+	_check_add_dev "$TID" $?
+
+	_check_value "metadata_size" "$(_get_metadata_size "$dev_id" metadata_size)" 16 &&
+	_check_value "pi_offset" "$(_get_metadata_size "$dev_id" pi_offset)" 0 &&
+	_check_value "pi_tuple_size" "$(_get_metadata_size "$dev_id" pi_tuple_size)" 16 &&
+	_check_value "device_is_integrity_capable" \
+		"$(cat "/sys/block/ublkb$dev_id/integrity/device_is_integrity_capable")" 0 &&
+	_check_value "format" "$(cat "/sys/block/ublkb$dev_id/integrity/format")" EXT-DIF-TYPE3-CRC64 &&
+	_check_value "protection_interval_bytes" \
+		"$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes")" 512 &&
+	_check_value "tag_size" "$(cat "/sys/block/ublkb$dev_id/integrity/tag_size")" 8
+
+	${UBLK_PROG} del -n "${dev_id}"
+}
+
+_prep_test "null" "integrity params"
+
+_test_metadata_only
+_test_integrity_capable_ip
+_test_integrity_reftag_t10dif
+_test_nvme_csum
+
+_cleanup_test
+_show_result "$TID" $ERR_CODE
-- 
cgit v1.2.3


From 76334de7da404c385e18efb3640ed60ca77a899f Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Fri, 30 Jan 2026 00:19:56 +0800
Subject: selftests: ublk: disable partition scan for integrity tests

The null target doesn't handle IO, so disable partition scan to avoid IO
failures caused by integrity verification during the kernel's partition
table read.

Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 tools/testing/selftests/ublk/test_null_04.sh | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/ublk/test_null_04.sh b/tools/testing/selftests/ublk/test_null_04.sh
index 22328e0f3925..a5599d38583a 100755
--- a/tools/testing/selftests/ublk/test_null_04.sh
+++ b/tools/testing/selftests/ublk/test_null_04.sh
@@ -21,7 +21,7 @@ _check_value() {
 _test_metadata_only() {
 	local dev_id
 
-	dev_id=$(_add_ublk_dev -t null -u --metadata_size 8)
+	dev_id=$(_add_ublk_dev -t null -u --no_auto_part_scan --metadata_size 8)
 	_check_add_dev "$TID" $?
 
 	_check_value "metadata_size" "$(_get_metadata_size "$dev_id" metadata_size)" 8 &&
@@ -40,7 +40,7 @@ _test_metadata_only() {
 _test_integrity_capable_ip() {
 	local dev_id
 
-	dev_id=$(_add_ublk_dev -t null -u --integrity_capable --metadata_size 64 --pi_offset 56 --csum_type ip)
+	dev_id=$(_add_ublk_dev -t null -u --no_auto_part_scan --integrity_capable --metadata_size 64 --pi_offset 56 --csum_type ip)
 	_check_add_dev "$TID" $?
 
 	_check_value "metadata_size" "$(_get_metadata_size "$dev_id" metadata_size)" 64 &&
@@ -59,7 +59,7 @@ _test_integrity_capable_ip() {
 _test_integrity_reftag_t10dif() {
 	local dev_id
 
-	dev_id=$(_add_ublk_dev -t null -u --integrity_reftag --metadata_size 8 --csum_type t10dif)
+	dev_id=$(_add_ublk_dev -t null -u --no_auto_part_scan --integrity_reftag --metadata_size 8 --csum_type t10dif)
 	_check_add_dev "$TID" $?
 
 	_check_value "metadata_size" "$(_get_metadata_size "$dev_id" metadata_size)" 8 &&
@@ -78,7 +78,7 @@ _test_integrity_reftag_t10dif() {
 _test_nvme_csum() {
 	local dev_id
 
-	dev_id=$(_add_ublk_dev -t null -u --metadata_size 16 --csum_type nvme --tag_size 8)
+	dev_id=$(_add_ublk_dev -t null -u --no_auto_part_scan --metadata_size 16 --csum_type nvme --tag_size 8)
 	_check_add_dev "$TID" $?
 
 	_check_value "metadata_size" "$(_get_metadata_size "$dev_id" metadata_size)" 16 &&
-- 
cgit v1.2.3


From 4e0d293af9e37c735aec574c1e69ed71f81f94b2 Mon Sep 17 00:00:00 2001
From: Alexander Atanasov <alex@zazolabs.com>
Date: Fri, 30 Jan 2026 00:19:57 +0800
Subject: selftests: ublk: mark each test start and end time in dmesg

Log test start and end time in dmesg, so generated log messages
during the test run can be linked to specific test from the test
suite.

(switch to `date +%F %T`)

Signed-off-by: Alexander Atanasov <alex@zazolabs.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 tools/testing/selftests/ublk/test_common.sh | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/ublk/test_common.sh b/tools/testing/selftests/ublk/test_common.sh
index bbe031c94a29..dd4eff97610a 100755
--- a/tools/testing/selftests/ublk/test_common.sh
+++ b/tools/testing/selftests/ublk/test_common.sh
@@ -126,6 +126,7 @@ _prep_test() {
 	modprobe ublk_drv > /dev/null 2>&1
 	UBLK_TMP=$(mktemp ublk_test_XXXXX)
 	[ "$UBLK_TEST_QUIET" -eq 0 ] && echo "ublk $type: $*"
+	echo "ublk selftest: $TID starting at $(date '+%F %T')" | tee /dev/kmsg
 }
 
 _remove_test_files()
@@ -170,6 +171,7 @@ _cleanup_test() {
 	"${UBLK_PROG}" del -a
 
 	_remove_files
+	echo "ublk selftest: $TID done at $(date '+%F %T')" | tee /dev/kmsg
 }
 
 _have_feature()
-- 
cgit v1.2.3


From 2feca79ef8df5505b87c00812b9ba263b92c64ed Mon Sep 17 00:00:00 2001
From: Alexander Atanasov <alex@zazolabs.com>
Date: Fri, 30 Jan 2026 00:19:58 +0800
Subject: selftests: ublk: move test temp files into a sub directory

Create and use a temporary directory for the files created during
test runs. If TMPDIR environment variable is set use it as a base
for the temporary directory path.
TMPDIR=/mnt/scratch make run_tests
and
TMPDIR=/mnt/scratch ./test_generic_01.sh
will place test directory under /mnt/scratch

Signed-off-by: Alexander Atanasov <alex@zazolabs.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 tools/testing/selftests/ublk/test_common.sh | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/ublk/test_common.sh b/tools/testing/selftests/ublk/test_common.sh
index dd4eff97610a..21ba51fcc7d7 100755
--- a/tools/testing/selftests/ublk/test_common.sh
+++ b/tools/testing/selftests/ublk/test_common.sh
@@ -48,7 +48,7 @@ _create_backfile() {
 	old_file="${UBLK_BACKFILES[$index]}"
 	[ -f "$old_file" ] && rm -f "$old_file"
 
-	new_file=$(mktemp ublk_file_"${new_size}"_XXXXX)
+	new_file=$(mktemp ${UBLK_TEST_DIR}/ublk_file_"${new_size}"_XXXXX)
 	truncate -s "${new_size}" "${new_file}"
 	UBLK_BACKFILES["$index"]="$new_file"
 }
@@ -65,7 +65,7 @@ _remove_files() {
 _create_tmp_dir() {
 	local my_file;
 
-	my_file=$(mktemp -d ublk_dir_XXXXX)
+	my_file=$(mktemp -d ${UBLK_TEST_DIR}/ublk_dir_XXXXX)
 	echo "$my_file"
 }
 
@@ -124,7 +124,9 @@ _prep_test() {
 	local type=$1
 	shift 1
 	modprobe ublk_drv > /dev/null 2>&1
-	UBLK_TMP=$(mktemp ublk_test_XXXXX)
+	TDIR=$(mktemp -d ${TMPDIR:-.}/ublktest-dir.XXXXXX)
+	export UBLK_TEST_DIR=${TDIR}
+	UBLK_TMP=$(mktemp ${UBLK_TEST_DIR}/ublk_test_XXXXX)
 	[ "$UBLK_TEST_QUIET" -eq 0 ] && echo "ublk $type: $*"
 	echo "ublk selftest: $TID starting at $(date '+%F %T')" | tee /dev/kmsg
 }
@@ -171,6 +173,7 @@ _cleanup_test() {
 	"${UBLK_PROG}" del -a
 
 	_remove_files
+	rmdir ${UBLK_TEST_DIR}
 	echo "ublk selftest: $TID done at $(date '+%F %T')" | tee /dev/kmsg
 }
 
@@ -405,6 +408,8 @@ UBLK_PROG=$(_ublk_test_top_dir)/kublk
 UBLK_TEST_QUIET=1
 UBLK_TEST_SHOW_RESULT=1
 UBLK_BACKFILES=()
+UBLK_TEST_DIR=${TMPDIR:-.}
 export UBLK_PROG
 export UBLK_TEST_QUIET
 export UBLK_TEST_SHOW_RESULT
+export UBLK_TEST_DIR
-- 
cgit v1.2.3


From f0b5b3d6b56f8717e255406366d81bbcd3631660 Mon Sep 17 00:00:00 2001
From: Paul Chaignon <paul.chaignon@gmail.com>
Date: Sat, 31 Jan 2026 17:09:02 +0100
Subject: selftests/bpf: Test access from RO map from xdp_store_bytes

This new test simply checks that helper bpf_xdp_store_bytes can
successfully read from a read-only map.

Signed-off-by: Paul Chaignon <paul.chaignon@gmail.com>
Link: https://lore.kernel.org/r/4fdb934a713b2d7cf133288c77f6cfefe9856440.1769875479.git.paul.chaignon@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/progs/verifier_xdp.c | 35 ++++++++++++++++++++++++
 1 file changed, 35 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/progs/verifier_xdp.c b/tools/testing/selftests/bpf/progs/verifier_xdp.c
index 50768ed179b3..7dc9226aeb34 100644
--- a/tools/testing/selftests/bpf/progs/verifier_xdp.c
+++ b/tools/testing/selftests/bpf/progs/verifier_xdp.c
@@ -5,6 +5,14 @@
 #include <bpf/bpf_helpers.h>
 #include "bpf_misc.h"
 
+struct {
+	__uint(type, BPF_MAP_TYPE_ARRAY);
+	__uint(max_entries, 1);
+	__type(key, int);
+	__type(value, __u64);
+	__uint(map_flags, BPF_F_RDONLY_PROG);
+} map_array_ro SEC(".maps");
+
 SEC("xdp")
 __description("XDP, using ifindex from netdev")
 __success __retval(1)
@@ -21,4 +29,31 @@ l0_%=:	exit;						\
 	: __clobber_all);
 }
 
+SEC("xdp")
+__description("XDP, using xdp_store_bytes from RO map")
+__success __retval(0)
+__naked void xdp_store_bytes_from_ro_map(void)
+{
+	asm volatile ("					\
+	r6 = r1;					\
+	r1 = 0;                                         \
+	*(u64*)(r10 - 8) = r1;                          \
+	r2 = r10;                                       \
+	r2 += -8;                                       \
+	r1 = %[map_array_ro] ll;                        \
+	call %[bpf_map_lookup_elem];                    \
+	if r0 == 0 goto l0_%=;                          \
+	r1 = r6;					\
+	r2 = 0;						\
+	r3 = r0;					\
+	r4 = 8;						\
+	call %[bpf_xdp_store_bytes];			\
+l0_%=:	exit;						\
+"	:
+	: __imm(bpf_map_lookup_elem),
+	  __imm(bpf_xdp_store_bytes),
+	  __imm_addr(map_array_ro)
+	: __clobber_all);
+}
+
 char _license[] SEC("license") = "GPL";
-- 
cgit v1.2.3


From 8798902f2b8bcae6f90229a1a1496b48ddda2972 Mon Sep 17 00:00:00 2001
From: Leon Hwang <leon.hwang@linux.dev>
Date: Sat, 31 Jan 2026 22:49:48 +0800
Subject: bpf: Add bpf_jit_supports_fsession()

The added fsession does not prevent running on those architectures, that
haven't added fsession support.

For example, try to run fsession tests on arm64:

test_fsession_basic:PASS:fsession_test__open_and_load 0 nsec
test_fsession_basic:PASS:fsession_attach 0 nsec
check_result:FAIL:test_run_opts err unexpected error: -14 (errno 14)

In order to prevent such errors, add bpf_jit_supports_fsession() to guard
those architectures.

Fixes: 2d419c44658f ("bpf: add fsession support")
Acked-by: Puranjay Mohan <puranjay@kernel.org>
Tested-by: Puranjay Mohan <puranjay@kernel.org>
Signed-off-by: Leon Hwang <leon.hwang@linux.dev>
Link: https://lore.kernel.org/r/20260131144950.16294-2-leon.hwang@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 arch/x86/net/bpf_jit_comp.c                        |  5 ++++
 include/linux/filter.h                             |  1 +
 kernel/bpf/core.c                                  |  5 ++++
 kernel/bpf/verifier.c                              |  5 ++++
 .../selftests/bpf/prog_tests/fsession_test.c       | 32 ++++++++++++++++------
 5 files changed, 40 insertions(+), 8 deletions(-)

(limited to 'tools')

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 5a075e06cf45..070ba80e39d7 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -4112,3 +4112,8 @@ bool bpf_jit_supports_timed_may_goto(void)
 {
 	return true;
 }
+
+bool bpf_jit_supports_fsession(void)
+{
+	return true;
+}
diff --git a/include/linux/filter.h b/include/linux/filter.h
index fd54fed8f95f..4e1cb4f91f49 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1167,6 +1167,7 @@ bool bpf_jit_supports_arena(void);
 bool bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena);
 bool bpf_jit_supports_private_stack(void);
 bool bpf_jit_supports_timed_may_goto(void);
+bool bpf_jit_supports_fsession(void);
 u64 bpf_arch_uaddress_limit(void);
 void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie);
 u64 arch_bpf_timed_may_goto(void);
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 5ebece600aeb..dc906dfdff94 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -3144,6 +3144,11 @@ bool __weak bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena)
 	return false;
 }
 
+bool __weak bpf_jit_supports_fsession(void)
+{
+	return false;
+}
+
 u64 __weak bpf_arch_uaddress_limit(void)
 {
 #if defined(CONFIG_64BIT) && defined(CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 256cc5c1a7df..6b62b6d57175 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -24828,6 +24828,11 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
 	case BPF_TRACE_FENTRY:
 	case BPF_TRACE_FEXIT:
 	case BPF_TRACE_FSESSION:
+		if (prog->expected_attach_type == BPF_TRACE_FSESSION &&
+		    !bpf_jit_supports_fsession()) {
+			bpf_log(log, "JIT does not support fsession\n");
+			return -EOPNOTSUPP;
+		}
 		if (!btf_type_is_func(t)) {
 			bpf_log(log, "attach_btf_id %u is not a function\n",
 				btf_id);
diff --git a/tools/testing/selftests/bpf/prog_tests/fsession_test.c b/tools/testing/selftests/bpf/prog_tests/fsession_test.c
index 0c4b428e1cee..a299aeb8cc2e 100644
--- a/tools/testing/selftests/bpf/prog_tests/fsession_test.c
+++ b/tools/testing/selftests/bpf/prog_tests/fsession_test.c
@@ -29,8 +29,16 @@ static void test_fsession_basic(void)
 	struct fsession_test *skel = NULL;
 	int err;
 
-	skel = fsession_test__open_and_load();
-	if (!ASSERT_OK_PTR(skel, "fsession_test__open_and_load"))
+	skel = fsession_test__open();
+	if (!ASSERT_OK_PTR(skel, "fsession_test__open"))
+		return;
+
+	err = fsession_test__load(skel);
+	if (err == -EOPNOTSUPP) {
+		test__skip();
+		goto cleanup;
+	}
+	if (!ASSERT_OK(err, "fsession_test__load"))
 		goto cleanup;
 
 	err = fsession_test__attach(skel);
@@ -47,8 +55,16 @@ static void test_fsession_reattach(void)
 	struct fsession_test *skel = NULL;
 	int err;
 
-	skel = fsession_test__open_and_load();
-	if (!ASSERT_OK_PTR(skel, "fsession_test__open_and_load"))
+	skel = fsession_test__open();
+	if (!ASSERT_OK_PTR(skel, "fsession_test__open"))
+		return;
+
+	err = fsession_test__load(skel);
+	if (err == -EOPNOTSUPP) {
+		test__skip();
+		goto cleanup;
+	}
+	if (!ASSERT_OK(err, "fsession_test__load"))
 		goto cleanup;
 
 	/* first attach */
@@ -94,6 +110,10 @@ static void test_fsession_cookie(void)
 	bpf_program__set_autoload(skel->progs.test6, false);
 
 	err = fsession_test__load(skel);
+	if (err == -EOPNOTSUPP) {
+		test__skip();
+		goto cleanup;
+	}
 	if (!ASSERT_OK(err, "fsession_test__load"))
 		goto cleanup;
 
@@ -111,10 +131,6 @@ cleanup:
 
 void test_fsession_test(void)
 {
-#if !defined(__x86_64__)
-	test__skip();
-	return;
-#endif
 	if (test__start_subtest("fsession_test"))
 		test_fsession_basic();
 	if (test__start_subtest("fsession_reattach"))
-- 
cgit v1.2.3


From 7f10da2133b18b0f1bc02d58671883537e212279 Mon Sep 17 00:00:00 2001
From: Leon Hwang <leon.hwang@linux.dev>
Date: Sat, 31 Jan 2026 22:49:50 +0800
Subject: selftests/bpf: Enable get_func_args and get_func_ip tests on arm64

Allow get_func_args, and get_func_ip fsession selftests to run on arm64.

Acked-by: Puranjay Mohan <puranjay@kernel.org>
Tested-by: Puranjay Mohan <puranjay@kernel.org>
Signed-off-by: Leon Hwang <leon.hwang@linux.dev>
Link: https://lore.kernel.org/r/20260131144950.16294-4-leon.hwang@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/progs/get_func_args_test.c | 2 +-
 tools/testing/selftests/bpf/progs/get_func_ip_test.c   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/progs/get_func_args_test.c b/tools/testing/selftests/bpf/progs/get_func_args_test.c
index 0a3236a7a109..180ba5098ca1 100644
--- a/tools/testing/selftests/bpf/progs/get_func_args_test.c
+++ b/tools/testing/selftests/bpf/progs/get_func_args_test.c
@@ -167,7 +167,7 @@ int BPF_PROG(tp_test2)
 }
 
 __u64 test7_result = 0;
-#ifdef __TARGET_ARCH_x86
+#if defined(bpf_target_x86) || defined(bpf_target_arm64)
 SEC("fsession/bpf_fentry_test1")
 int BPF_PROG(test7)
 {
diff --git a/tools/testing/selftests/bpf/progs/get_func_ip_test.c b/tools/testing/selftests/bpf/progs/get_func_ip_test.c
index 65f7e1f182bf..43ff836a8ed8 100644
--- a/tools/testing/selftests/bpf/progs/get_func_ip_test.c
+++ b/tools/testing/selftests/bpf/progs/get_func_ip_test.c
@@ -106,7 +106,7 @@ int BPF_URETPROBE(test8, int ret)
 
 __u64 test9_entry_result = 0;
 __u64 test9_exit_result = 0;
-#ifdef __TARGET_ARCH_x86
+#if defined(bpf_target_x86) || defined(bpf_target_arm64)
 SEC("fsession/bpf_fentry_test1")
 int BPF_PROG(test9, int a)
 {
-- 
cgit v1.2.3


From 5af302a15a1d628a025a78892001fe8afea90c60 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Sun, 1 Feb 2026 00:23:32 +0800
Subject: selftests: ublk: simplify UBLK_TEST_DIR handling

Remove intermediate TDIR variable and set UBLK_TEST_DIR directly
in _prep_test(). Remove default initialization since the directory
is created dynamically when tests run.

Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 tools/testing/selftests/ublk/test_common.sh | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/ublk/test_common.sh b/tools/testing/selftests/ublk/test_common.sh
index 21ba51fcc7d7..8d298a7ee7b1 100755
--- a/tools/testing/selftests/ublk/test_common.sh
+++ b/tools/testing/selftests/ublk/test_common.sh
@@ -124,8 +124,7 @@ _prep_test() {
 	local type=$1
 	shift 1
 	modprobe ublk_drv > /dev/null 2>&1
-	TDIR=$(mktemp -d ${TMPDIR:-.}/ublktest-dir.XXXXXX)
-	export UBLK_TEST_DIR=${TDIR}
+	UBLK_TEST_DIR=$(mktemp -d ${TMPDIR:-.}/ublktest-dir.XXXXXX)
 	UBLK_TMP=$(mktemp ${UBLK_TEST_DIR}/ublk_test_XXXXX)
 	[ "$UBLK_TEST_QUIET" -eq 0 ] && echo "ublk $type: $*"
 	echo "ublk selftest: $TID starting at $(date '+%F %T')" | tee /dev/kmsg
@@ -408,8 +407,6 @@ UBLK_PROG=$(_ublk_test_top_dir)/kublk
 UBLK_TEST_QUIET=1
 UBLK_TEST_SHOW_RESULT=1
 UBLK_BACKFILES=()
-UBLK_TEST_DIR=${TMPDIR:-.}
 export UBLK_PROG
 export UBLK_TEST_QUIET
 export UBLK_TEST_SHOW_RESULT
-export UBLK_TEST_DIR
-- 
cgit v1.2.3


From 842b6520e579b8bd7d6ea09937e1fb7729cce1c5 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Sun, 1 Feb 2026 00:23:33 +0800
Subject: selftests: ublk: refactor test_loop_08 into separate functions

Encapsulate each test case in its own function for better organization
and maintainability:

- _setup_device(): device and backfile initialization
- _test_fill_and_verify(): initial data population
- _test_corrupted_reftag(): reftag corruption detection test
- _test_corrupted_data(): data corruption detection test
- _test_bad_apptag(): apptag mismatch detection test

Also fix temp file creation to use ${UBLK_TEST_DIR}/fio_err_XXXXX instead of
creating in current directory.

Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 tools/testing/selftests/ublk/test_loop_08.sh | 199 ++++++++++++++++-----------
 1 file changed, 115 insertions(+), 84 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/ublk/test_loop_08.sh b/tools/testing/selftests/ublk/test_loop_08.sh
index 2caa7ba748fb..aaf1f52da559 100755
--- a/tools/testing/selftests/ublk/test_loop_08.sh
+++ b/tools/testing/selftests/ublk/test_loop_08.sh
@@ -13,98 +13,129 @@ if [[ "$fio_version" =~ fio-[0-9]+\.[0-9]+$ ]]; then
 	exit $UBLK_SKIP_CODE
 fi
 
+ERR_CODE=0
 
-_prep_test "loop" "end-to-end integrity"
+# Global variables set during device setup
+dev_id=""
+fio_args=""
+fio_err=""
 
-_create_backfile 0 256M
-_create_backfile 1 32M # 256M * (64 integrity bytes / 512 data bytes)
-integrity_params="--integrity_capable --integrity_reftag
-                  --metadata_size 64 --pi_offset 56 --csum_type t10dif"
-dev_id=$(_add_ublk_dev -t loop -u $integrity_params "${UBLK_BACKFILES[@]}")
-_check_add_dev $TID $?
-
-# 1M * (64 integrity bytes / 512 data bytes) = 128K
-fio_args="--ioengine io_uring --direct 1 --bsrange 512-1M --iodepth 32
-          --md_per_io_size 128K --pi_act 0 --pi_chk GUARD,REFTAG,APPTAG
-          --filename /dev/ublkb$dev_id"
-fio --name fill --rw randwrite $fio_args > /dev/null
-err=$?
-if [ $err != 0 ]; then
-	echo "fio fill failed"
-	_show_result $TID $err
-fi
+_setup_device() {
+	_create_backfile 0 256M
+	_create_backfile 1 32M # 256M * (64 integrity bytes / 512 data bytes)
 
-fio --name verify --rw randread $fio_args > /dev/null
-err=$?
-if [ $err != 0 ]; then
-	echo "fio verify failed"
-	_show_result $TID $err
-fi
+	local integrity_params="--integrity_capable --integrity_reftag
+		--metadata_size 64 --pi_offset 56 --csum_type t10dif"
+	dev_id=$(_add_ublk_dev -t loop -u $integrity_params "${UBLK_BACKFILES[@]}")
+	_check_add_dev "$TID" $?
 
-fio_err=$(mktemp fio_err_XXXXX)
+	# 1M * (64 integrity bytes / 512 data bytes) = 128K
+	fio_args="--ioengine io_uring --direct 1 --bsrange 512-1M --iodepth 32
+		--md_per_io_size 128K --pi_act 0 --pi_chk GUARD,REFTAG,APPTAG
+		--filename /dev/ublkb$dev_id"
 
-# Overwrite 4-byte reftag at offset 56 + 4 = 60
-dd_reftag_args="bs=1 seek=60 count=4 oflag=dsync conv=notrunc status=none"
-dd if=/dev/urandom "of=${UBLK_BACKFILES[1]}" $dd_reftag_args
-err=$?
-if [ $err != 0 ]; then
-	echo "dd corrupted_reftag failed"
-	rm -f "$fio_err"
-	_show_result $TID $err
-fi
-if fio --name corrupted_reftag --rw randread $fio_args > /dev/null 2> "$fio_err"; then
-	echo "fio corrupted_reftag unexpectedly succeeded"
-	rm -f "$fio_err"
-	_show_result $TID 255
-fi
-expected_err="REFTAG compare error: LBA: 0 Expected=0, Actual="
-if ! grep -q "$expected_err" "$fio_err"; then
-	echo "fio corrupted_reftag message not found: $expected_err"
-	rm -f "$fio_err"
-	_show_result $TID 255
-fi
-# Reset to 0
-dd if=/dev/zero "of=${UBLK_BACKFILES[1]}" $dd_reftag_args
-err=$?
-if [ $err != 0 ]; then
-	echo "dd restore corrupted_reftag failed"
-	rm -f "$fio_err"
-	_show_result $TID $err
-fi
+	fio_err=$(mktemp "${UBLK_TEST_DIR}"/fio_err_XXXXX)
+}
 
-dd_data_args="bs=512 count=1 oflag=direct,dsync conv=notrunc status=none"
-dd if=/dev/zero "of=${UBLK_BACKFILES[0]}" $dd_data_args
-err=$?
-if [ $err != 0 ]; then
-	echo "dd corrupted_data failed"
-	rm -f "$fio_err"
-	_show_result $TID $err
-fi
-if fio --name corrupted_data --rw randread $fio_args > /dev/null 2> "$fio_err"; then
-	echo "fio corrupted_data unexpectedly succeeded"
-	rm -f "$fio_err"
-	_show_result $TID 255
-fi
-expected_err="Guard compare error: LBA: 0 Expected=0, Actual="
-if ! grep -q "$expected_err" "$fio_err"; then
-	echo "fio corrupted_data message not found: $expected_err"
-	rm -f "$fio_err"
-	_show_result $TID 255
-fi
+_test_fill_and_verify() {
+	fio --name fill --rw randwrite $fio_args > /dev/null
+	if [ $? != 0 ]; then
+		echo "fio fill failed"
+		ERR_CODE=255
+		return 1
+	fi
 
-if fio --name bad_apptag --rw randread $fio_args --apptag 0x4321 > /dev/null 2> "$fio_err"; then
-	echo "fio bad_apptag unexpectedly succeeded"
-	rm -f "$fio_err"
-	_show_result $TID 255
-fi
-expected_err="APPTAG compare error: LBA: [0-9]* Expected=4321, Actual=1234"
-if ! grep -q "$expected_err" "$fio_err"; then
-	echo "fio bad_apptag message not found: $expected_err"
-	rm -f "$fio_err"
-	_show_result $TID 255
-fi
+	fio --name verify --rw randread $fio_args > /dev/null
+	if [ $? != 0 ]; then
+		echo "fio verify failed"
+		ERR_CODE=255
+		return 1
+	fi
+}
+
+_test_corrupted_reftag() {
+	local dd_reftag_args="bs=1 seek=60 count=4 oflag=dsync conv=notrunc status=none"
+	local expected_err="REFTAG compare error: LBA: 0 Expected=0, Actual="
+
+	# Overwrite 4-byte reftag at offset 56 + 4 = 60
+	dd if=/dev/urandom "of=${UBLK_BACKFILES[1]}" $dd_reftag_args
+	if [ $? != 0 ]; then
+		echo "dd corrupted_reftag failed"
+		ERR_CODE=255
+		return 1
+	fi
+
+	if fio --name corrupted_reftag --rw randread $fio_args > /dev/null 2> "$fio_err"; then
+		echo "fio corrupted_reftag unexpectedly succeeded"
+		ERR_CODE=255
+		return 1
+	fi
+
+	if ! grep -q "$expected_err" "$fio_err"; then
+		echo "fio corrupted_reftag message not found: $expected_err"
+		ERR_CODE=255
+		return 1
+	fi
+
+	# Reset to 0
+	dd if=/dev/zero "of=${UBLK_BACKFILES[1]}" $dd_reftag_args
+	if [ $? != 0 ]; then
+		echo "dd restore corrupted_reftag failed"
+		ERR_CODE=255
+		return 1
+	fi
+}
+
+_test_corrupted_data() {
+	local dd_data_args="bs=512 count=1 oflag=direct,dsync conv=notrunc status=none"
+	local expected_err="Guard compare error: LBA: 0 Expected=0, Actual="
+
+	dd if=/dev/zero "of=${UBLK_BACKFILES[0]}" $dd_data_args
+	if [ $? != 0 ]; then
+		echo "dd corrupted_data failed"
+		ERR_CODE=255
+		return 1
+	fi
+
+	if fio --name corrupted_data --rw randread $fio_args > /dev/null 2> "$fio_err"; then
+		echo "fio corrupted_data unexpectedly succeeded"
+		ERR_CODE=255
+		return 1
+	fi
+
+	if ! grep -q "$expected_err" "$fio_err"; then
+		echo "fio corrupted_data message not found: $expected_err"
+		ERR_CODE=255
+		return 1
+	fi
+}
+
+_test_bad_apptag() {
+	local expected_err="APPTAG compare error: LBA: [0-9]* Expected=4321, Actual=1234"
+
+	if fio --name bad_apptag --rw randread $fio_args --apptag 0x4321 > /dev/null 2> "$fio_err"; then
+		echo "fio bad_apptag unexpectedly succeeded"
+		ERR_CODE=255
+		return 1
+	fi
+
+	if ! grep -q "$expected_err" "$fio_err"; then
+		echo "fio bad_apptag message not found: $expected_err"
+		ERR_CODE=255
+		return 1
+	fi
+}
+
+_prep_test "loop" "end-to-end integrity"
+
+_setup_device
+
+_test_fill_and_verify && \
+_test_corrupted_reftag && \
+_test_corrupted_data && \
+_test_bad_apptag
 
 rm -f "$fio_err"
 
 _cleanup_test
-_show_result $TID 0
+_show_result "$TID" $ERR_CODE
-- 
cgit v1.2.3


From 92734a4f3a7a5449b0c7d0160ba658a2b665c31b Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Sun, 1 Feb 2026 00:23:34 +0800
Subject: selftests: ublk: add _ublk_del_dev helper function

Add _ublk_del_dev() to delete a specific ublk device by ID and
use it in all test scripts instead of calling UBLK_PROG directly.

Also remove unused _remove_ublk_devices() function.

Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 tools/testing/selftests/ublk/test_common.sh     | 13 +++++++------
 tools/testing/selftests/ublk/test_generic_16.sh |  4 ++--
 tools/testing/selftests/ublk/test_null_04.sh    |  8 ++++----
 tools/testing/selftests/ublk/test_part_02.sh    |  4 ++--
 4 files changed, 15 insertions(+), 14 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/ublk/test_common.sh b/tools/testing/selftests/ublk/test_common.sh
index 8d298a7ee7b1..0f1fdb0892b4 100755
--- a/tools/testing/selftests/ublk/test_common.sh
+++ b/tools/testing/selftests/ublk/test_common.sh
@@ -106,11 +106,6 @@ _check_root() {
 	fi
 }
 
-_remove_ublk_devices() {
-	${UBLK_PROG} del -a
-	modprobe -r ublk_drv > /dev/null 2>&1
-}
-
 _get_ublk_dev_state() {
 	${UBLK_PROG} list -n "$1" | grep "state" | awk '{print $11}'
 }
@@ -277,10 +272,16 @@ __ublk_kill_daemon()
 	echo "$state"
 }
 
-__remove_ublk_dev_return() {
+_ublk_del_dev() {
 	local dev_id=$1
 
 	${UBLK_PROG} del -n "${dev_id}"
+}
+
+__remove_ublk_dev_return() {
+	local dev_id=$1
+
+	_ublk_del_dev "${dev_id}"
 	local res=$?
 	udevadm settle
 	return ${res}
diff --git a/tools/testing/selftests/ublk/test_generic_16.sh b/tools/testing/selftests/ublk/test_generic_16.sh
index 42e8d2e16ec9..3ef367836ac5 100755
--- a/tools/testing/selftests/ublk/test_generic_16.sh
+++ b/tools/testing/selftests/ublk/test_generic_16.sh
@@ -24,7 +24,7 @@ if ! ${UBLK_PROG} stop -n "${dev_id}" --safe; then
 fi
 
 # Clean up device
-${UBLK_PROG} del -n "${dev_id}" > /dev/null 2>&1
+_ublk_del_dev "${dev_id}" > /dev/null 2>&1
 udevadm settle
 
 # Test 2: stop --safe on device with active opener should fail
@@ -49,7 +49,7 @@ kill $dd_pid 2>/dev/null
 wait $dd_pid 2>/dev/null
 
 # Now device should be idle, regular delete should work
-${UBLK_PROG} del -n "${dev_id}"
+_ublk_del_dev "${dev_id}"
 udevadm settle
 
 _cleanup_test "null"
diff --git a/tools/testing/selftests/ublk/test_null_04.sh b/tools/testing/selftests/ublk/test_null_04.sh
index a5599d38583a..6713b280a6ff 100755
--- a/tools/testing/selftests/ublk/test_null_04.sh
+++ b/tools/testing/selftests/ublk/test_null_04.sh
@@ -34,7 +34,7 @@ _test_metadata_only() {
 		"$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes")" 512 &&
 	_check_value "tag_size" "$(cat "/sys/block/ublkb$dev_id/integrity/tag_size")" 0
 
-	${UBLK_PROG} del -n "${dev_id}"
+	_ublk_del_dev "${dev_id}"
 }
 
 _test_integrity_capable_ip() {
@@ -53,7 +53,7 @@ _test_integrity_capable_ip() {
 		"$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes")" 512 &&
 	_check_value "tag_size" "$(cat "/sys/block/ublkb$dev_id/integrity/tag_size")" 0
 
-	${UBLK_PROG} del -n "${dev_id}"
+	_ublk_del_dev "${dev_id}"
 }
 
 _test_integrity_reftag_t10dif() {
@@ -72,7 +72,7 @@ _test_integrity_reftag_t10dif() {
 		"$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes")" 512 &&
 	_check_value "tag_size" "$(cat "/sys/block/ublkb$dev_id/integrity/tag_size")" 0
 
-	${UBLK_PROG} del -n "${dev_id}"
+	_ublk_del_dev "${dev_id}"
 }
 
 _test_nvme_csum() {
@@ -91,7 +91,7 @@ _test_nvme_csum() {
 		"$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes")" 512 &&
 	_check_value "tag_size" "$(cat "/sys/block/ublkb$dev_id/integrity/tag_size")" 8
 
-	${UBLK_PROG} del -n "${dev_id}"
+	_ublk_del_dev "${dev_id}"
 }
 
 _prep_test "null" "integrity params"
diff --git a/tools/testing/selftests/ublk/test_part_02.sh b/tools/testing/selftests/ublk/test_part_02.sh
index 727d0f4610d6..acd098deda3a 100755
--- a/tools/testing/selftests/ublk/test_part_02.sh
+++ b/tools/testing/selftests/ublk/test_part_02.sh
@@ -46,13 +46,13 @@ _test_partition_scan_no_hang()
 	if [ "$state" != "${expected_state}" ]; then
 		echo "FAIL: Device state is $state, expected ${expected_state}"
 		ERR_CODE=255
-		${UBLK_PROG} del -n "${dev_id}" > /dev/null 2>&1
+		_ublk_del_dev "${dev_id}" > /dev/null 2>&1
 		return
 	fi
 	echo "PASS: Device transitioned to ${expected_state} in ${elapsed}s without hanging"
 
 	# Clean up the device
-	${UBLK_PROG} del -n "${dev_id}" > /dev/null 2>&1
+	_ublk_del_dev "${dev_id}" > /dev/null 2>&1
 }
 
 _prep_test "partition_scan" "verify async partition scan prevents IO hang"
-- 
cgit v1.2.3


From 2021e6109de3e97adfce262c40a657ff206ef495 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Sun, 1 Feb 2026 00:23:35 +0800
Subject: selftests: ublk: track created devices for per-test cleanup

Track device IDs in UBLK_DEVS array when created. Update
_cleanup_test() to only delete devices created by this test
instead of using 'del -a' which removes all devices.

This prepares for running tests concurrently where each test
should only clean up its own devices.

Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 tools/testing/selftests/ublk/test_common.sh | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/ublk/test_common.sh b/tools/testing/selftests/ublk/test_common.sh
index 0f1fdb0892b4..422882c32490 100755
--- a/tools/testing/selftests/ublk/test_common.sh
+++ b/tools/testing/selftests/ublk/test_common.sh
@@ -164,7 +164,12 @@ _check_add_dev()
 }
 
 _cleanup_test() {
-	"${UBLK_PROG}" del -a
+	if [ -f "${UBLK_TEST_DIR}/.ublk_devs" ]; then
+		while read -r dev_id; do
+			${UBLK_PROG} del -n "${dev_id}"
+		done < "${UBLK_TEST_DIR}/.ublk_devs"
+		rm -f "${UBLK_TEST_DIR}/.ublk_devs"
+	fi
 
 	_remove_files
 	rmdir ${UBLK_TEST_DIR}
@@ -205,6 +210,7 @@ _create_ublk_dev() {
 	fi
 
 	if [[ "$dev_id" =~ ^[0-9]+$ ]]; then
+		echo "$dev_id" >> "${UBLK_TEST_DIR}/.ublk_devs"
 		echo "${dev_id}"
 	else
 		return 255
@@ -276,6 +282,11 @@ _ublk_del_dev() {
 	local dev_id=$1
 
 	${UBLK_PROG} del -n "${dev_id}"
+
+	# Remove from tracking file
+	if [ -f "${UBLK_TEST_DIR}/.ublk_devs" ]; then
+		sed -i "/^${dev_id}$/d" "${UBLK_TEST_DIR}/.ublk_devs"
+	fi
 }
 
 __remove_ublk_dev_return() {
-- 
cgit v1.2.3


From b6bbc3bec19efd557f888d78865b627b80b37a32 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Sun, 1 Feb 2026 00:23:36 +0800
Subject: selftests: ublk: add group-based test targets

Add convenient Makefile targets for running specific test groups:
- run_generic, run_batch, run_null, run_loop, run_stripe, run_stress, etc.
- run_all for running all tests

Test groups are auto-detected from TEST_PROGS using pattern matching
(test_<group>_<num>.sh -> group), and targets are generated dynamically
using define/eval templates.

Supports parallel execution via JOBS variable:
- JOBS=1 (default): sequential with kselftest TAP output
- JOBS>1: parallel execution with xargs -P

Usage examples:
  make run_null           # Sequential execution
  make run_stress JOBS=4  # Parallel with 4 jobs
  make run_all JOBS=8     # Run all tests with 8 parallel jobs

With JOBS=8, running time of `make run_all` is reduced to 2m2s from 6m5s
in my test VM.

Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 tools/testing/selftests/ublk/Makefile | 36 +++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile
index ca8588ed962c..37e012d3a8a7 100644
--- a/tools/testing/selftests/ublk/Makefile
+++ b/tools/testing/selftests/ublk/Makefile
@@ -72,3 +72,39 @@ $(OUTPUT)/kublk: $(filter-out $(STANDALONE_UTILS),$(wildcard *.c))
 
 check:
 	shellcheck -x -f gcc *.sh
+
+# Test groups for running subsets of tests
+# JOBS=1 (default): sequential with kselftest TAP output
+# JOBS>1: parallel execution with xargs -P
+# Usage: make run_null JOBS=4
+JOBS ?= 1
+
+# Auto-detect test groups from TEST_PROGS (test_<group>_<num>.sh -> group)
+TEST_GROUPS := $(shell echo "$(TEST_PROGS)" | tr ' ' '\n' | \
+	sed 's/test_\([^_]*\)_.*/\1/' | sort -u)
+
+# Template for group test targets
+# $(1) = group name (e.g., null, generic, stress)
+define RUN_GROUP
+run_$(1): all
+	@if [ $$(JOBS) -gt 1 ]; then \
+		echo $$(filter test_$(1)_%.sh,$$(TEST_PROGS)) | tr ' ' '\n' | \
+			xargs -P $$(JOBS) -n1 sh -c './"$$$$0"' || true; \
+	else \
+		$$(call RUN_TESTS, $$(filter test_$(1)_%.sh,$$(TEST_PROGS))); \
+	fi
+.PHONY: run_$(1)
+endef
+
+# Generate targets for each discovered test group
+$(foreach group,$(TEST_GROUPS),$(eval $(call RUN_GROUP,$(group))))
+
+# Run all tests (parallel when JOBS>1)
+run_all: all
+	@if [ $(JOBS) -gt 1 ]; then \
+		echo $(TEST_PROGS) | tr ' ' '\n' | \
+			xargs -P $(JOBS) -n1 sh -c './"$$0"' || true; \
+	else \
+		$(call RUN_TESTS, $(TEST_PROGS)); \
+	fi
+.PHONY: run_all
-- 
cgit v1.2.3


From 64406dd2f69fe27921c7bf06088871c002cf6186 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Sun, 1 Feb 2026 00:23:37 +0800
Subject: selftests: ublk: add _ublk_sleep helper for parallel execution

Add _ublk_sleep() helper function that uses different sleep times
depending on whether tests run in parallel or sequential mode.

Usage: _ublk_sleep <normal_secs> <parallel_secs>

Export JOBS variable from Makefile so test scripts can detect parallel
execution, and use _ublk_sleep in test_part_02.sh to handle the
partition scan delay (1s normal, 5s parallel).

Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 tools/testing/selftests/ublk/Makefile        |  1 +
 tools/testing/selftests/ublk/test_common.sh  | 10 ++++++++++
 tools/testing/selftests/ublk/test_part_02.sh |  2 +-
 3 files changed, 12 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile
index 37e012d3a8a7..1ceae611acb7 100644
--- a/tools/testing/selftests/ublk/Makefile
+++ b/tools/testing/selftests/ublk/Makefile
@@ -78,6 +78,7 @@ check:
 # JOBS>1: parallel execution with xargs -P
 # Usage: make run_null JOBS=4
 JOBS ?= 1
+export JOBS
 
 # Auto-detect test groups from TEST_PROGS (test_<group>_<num>.sh -> group)
 TEST_GROUPS := $(shell echo "$(TEST_PROGS)" | tr ' ' '\n' | \
diff --git a/tools/testing/selftests/ublk/test_common.sh b/tools/testing/selftests/ublk/test_common.sh
index 422882c32490..bd27a6875c1a 100755
--- a/tools/testing/selftests/ublk/test_common.sh
+++ b/tools/testing/selftests/ublk/test_common.sh
@@ -15,6 +15,16 @@ _have_program() {
 	return 1
 }
 
+# Sleep with awareness of parallel execution.
+# Usage: _ublk_sleep <normal_secs> <parallel_secs>
+_ublk_sleep() {
+	if [ "${JOBS:-1}" -gt 1 ]; then
+		sleep "$2"
+	else
+		sleep "$1"
+	fi
+}
+
 _get_disk_dev_t() {
 	local dev_id=$1
 	local dev
diff --git a/tools/testing/selftests/ublk/test_part_02.sh b/tools/testing/selftests/ublk/test_part_02.sh
index acd098deda3a..7d42ab4d6e83 100755
--- a/tools/testing/selftests/ublk/test_part_02.sh
+++ b/tools/testing/selftests/ublk/test_part_02.sh
@@ -33,7 +33,7 @@ _test_partition_scan_no_hang()
 	# The add command should return quickly because partition scan is async.
 	# Now sleep briefly to let the async partition scan work start and hit
 	# the delay in the fault_inject handler.
-	sleep 1
+	_ublk_sleep 1 5
 
 	# Kill the ublk daemon while partition scan is potentially blocked
 	# And check state transitions properly
-- 
cgit v1.2.3


From 56a08b87f9f2a763cb5546f83b78ebe1e96260af Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Sun, 1 Feb 2026 00:23:38 +0800
Subject: selftests: ublk: increase timeouts for parallel test execution

When running tests in parallel with high JOBS count (e.g., JOBS=64),
the existing timeouts can be insufficient due to system load:

- Increase state wait loops from 20/50 to 100 iterations in
  _recover_ublk_dev(), __ublk_quiesce_dev(), and __ublk_kill_daemon()
  to handle slower state transitions under heavy load

- Add --timeout=20 to udevadm settle calls to prevent indefinite
  hangs when udev event queue is overwhelmed by rapid device
  creation/deletion

Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 tools/testing/selftests/ublk/test_common.sh | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/ublk/test_common.sh b/tools/testing/selftests/ublk/test_common.sh
index bd27a6875c1a..c3afd00783a2 100755
--- a/tools/testing/selftests/ublk/test_common.sh
+++ b/tools/testing/selftests/ublk/test_common.sh
@@ -216,7 +216,7 @@ _create_ublk_dev() {
 	fi
 
 	if [ "$settle" = "yes" ]; then
-		udevadm settle
+		udevadm settle --timeout=20
 	fi
 
 	if [[ "$dev_id" =~ ^[0-9]+$ ]]; then
@@ -240,7 +240,7 @@ _recover_ublk_dev() {
 	local state
 
 	dev_id=$(_create_ublk_dev "recover" "yes" "$@")
-	for ((j=0;j<20;j++)); do
+	for ((j=0;j<100;j++)); do
 		state=$(_get_ublk_dev_state "${dev_id}")
 		[ "$state" == "LIVE" ] && break
 		sleep 1
@@ -260,7 +260,7 @@ __ublk_quiesce_dev()
 		return "$state"
 	fi
 
-	for ((j=0;j<50;j++)); do
+	for ((j=0;j<100;j++)); do
 		state=$(_get_ublk_dev_state "${dev_id}")
 		[ "$state" == "$exp_state" ] && break
 		sleep 1
@@ -279,7 +279,7 @@ __ublk_kill_daemon()
 	daemon_pid=$(_get_ublk_daemon_pid "${dev_id}")
 	state=$(_get_ublk_dev_state "${dev_id}")
 
-	for ((j=0;j<50;j++)); do
+	for ((j=0;j<100;j++)); do
 		[ "$state" == "$exp_state" ] && break
 		kill -9 "$daemon_pid" > /dev/null 2>&1
 		sleep 1
@@ -304,7 +304,7 @@ __remove_ublk_dev_return() {
 
 	_ublk_del_dev "${dev_id}"
 	local res=$?
-	udevadm settle
+	udevadm settle --timeout=20
 	return ${res}
 }
 
-- 
cgit v1.2.3


From d9a36ab302b1c90d8f03a3b13538b8676eb6ed3b Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Sun, 1 Feb 2026 00:23:39 +0800
Subject: selftests: ublk: reorganize tests into integrity and recover groups

Move integrity-focused tests into new 'integrity' group:
- test_null_04.sh -> test_integrity_01.sh
- test_loop_08.sh -> test_integrity_02.sh

Move recovery-focused tests into new 'recover' group:
- test_generic_04.sh -> test_recover_01.sh
- test_generic_05.sh -> test_recover_02.sh
- test_generic_11.sh -> test_recover_03.sh
- test_generic_14.sh -> test_recover_04.sh

Update Makefile to reflect the reorganization.

Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 tools/testing/selftests/ublk/Makefile             |  14 ++-
 tools/testing/selftests/ublk/test_generic_04.sh   |  44 -------
 tools/testing/selftests/ublk/test_generic_05.sh   |  48 --------
 tools/testing/selftests/ublk/test_generic_11.sh   |  43 -------
 tools/testing/selftests/ublk/test_generic_14.sh   |  39 ------
 tools/testing/selftests/ublk/test_integrity_01.sh | 105 ++++++++++++++++
 tools/testing/selftests/ublk/test_integrity_02.sh | 141 ++++++++++++++++++++++
 tools/testing/selftests/ublk/test_loop_08.sh      | 141 ----------------------
 tools/testing/selftests/ublk/test_null_04.sh      | 105 ----------------
 tools/testing/selftests/ublk/test_recover_01.sh   |  44 +++++++
 tools/testing/selftests/ublk/test_recover_02.sh   |  48 ++++++++
 tools/testing/selftests/ublk/test_recover_03.sh   |  43 +++++++
 tools/testing/selftests/ublk/test_recover_04.sh   |  39 ++++++
 13 files changed, 428 insertions(+), 426 deletions(-)
 delete mode 100755 tools/testing/selftests/ublk/test_generic_04.sh
 delete mode 100755 tools/testing/selftests/ublk/test_generic_05.sh
 delete mode 100755 tools/testing/selftests/ublk/test_generic_11.sh
 delete mode 100755 tools/testing/selftests/ublk/test_generic_14.sh
 create mode 100755 tools/testing/selftests/ublk/test_integrity_01.sh
 create mode 100755 tools/testing/selftests/ublk/test_integrity_02.sh
 delete mode 100755 tools/testing/selftests/ublk/test_loop_08.sh
 delete mode 100755 tools/testing/selftests/ublk/test_null_04.sh
 create mode 100755 tools/testing/selftests/ublk/test_recover_01.sh
 create mode 100755 tools/testing/selftests/ublk/test_recover_02.sh
 create mode 100755 tools/testing/selftests/ublk/test_recover_03.sh
 create mode 100755 tools/testing/selftests/ublk/test_recover_04.sh

(limited to 'tools')

diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile
index 1ceae611acb7..a62a06e13006 100644
--- a/tools/testing/selftests/ublk/Makefile
+++ b/tools/testing/selftests/ublk/Makefile
@@ -10,18 +10,14 @@ LDLIBS += -lpthread -lm -luring
 TEST_PROGS := test_generic_01.sh
 TEST_PROGS += test_generic_02.sh
 TEST_PROGS += test_generic_03.sh
-TEST_PROGS += test_generic_04.sh
-TEST_PROGS += test_generic_05.sh
 TEST_PROGS += test_generic_06.sh
 TEST_PROGS += test_generic_07.sh
 
 TEST_PROGS += test_generic_08.sh
 TEST_PROGS += test_generic_09.sh
 TEST_PROGS += test_generic_10.sh
-TEST_PROGS += test_generic_11.sh
 TEST_PROGS += test_generic_12.sh
 TEST_PROGS += test_generic_13.sh
-TEST_PROGS += test_generic_14.sh
 TEST_PROGS += test_generic_16.sh
 
 TEST_PROGS += test_batch_01.sh
@@ -31,7 +27,6 @@ TEST_PROGS += test_batch_03.sh
 TEST_PROGS += test_null_01.sh
 TEST_PROGS += test_null_02.sh
 TEST_PROGS += test_null_03.sh
-TEST_PROGS += test_null_04.sh
 TEST_PROGS += test_loop_01.sh
 TEST_PROGS += test_loop_02.sh
 TEST_PROGS += test_loop_03.sh
@@ -39,7 +34,14 @@ TEST_PROGS += test_loop_04.sh
 TEST_PROGS += test_loop_05.sh
 TEST_PROGS += test_loop_06.sh
 TEST_PROGS += test_loop_07.sh
-TEST_PROGS += test_loop_08.sh
+
+TEST_PROGS += test_integrity_01.sh
+TEST_PROGS += test_integrity_02.sh
+
+TEST_PROGS += test_recover_01.sh
+TEST_PROGS += test_recover_02.sh
+TEST_PROGS += test_recover_03.sh
+TEST_PROGS += test_recover_04.sh
 TEST_PROGS += test_stripe_01.sh
 TEST_PROGS += test_stripe_02.sh
 TEST_PROGS += test_stripe_03.sh
diff --git a/tools/testing/selftests/ublk/test_generic_04.sh b/tools/testing/selftests/ublk/test_generic_04.sh
deleted file mode 100755
index 2672f9c40fa8..000000000000
--- a/tools/testing/selftests/ublk/test_generic_04.sh
+++ /dev/null
@@ -1,44 +0,0 @@
-#!/bin/bash
-# SPDX-License-Identifier: GPL-2.0
-
-. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
-
-ERR_CODE=0
-
-ublk_run_recover_test()
-{
-	run_io_and_recover 256M "kill_daemon" "$@"
-	ERR_CODE=$?
-	if [ ${ERR_CODE} -ne 0 ]; then
-		echo "$TID failure: $*"
-		_show_result $TID $ERR_CODE
-	fi
-}
-
-if ! _have_program fio; then
-	exit "$UBLK_SKIP_CODE"
-fi
-
-_prep_test "recover" "basic recover function verification"
-
-_create_backfile 0 256M
-_create_backfile 1 128M
-_create_backfile 2 128M
-
-ublk_run_recover_test -t null -q 2 -r 1 -b &
-ublk_run_recover_test -t loop -q 2 -r 1 -b "${UBLK_BACKFILES[0]}" &
-ublk_run_recover_test -t stripe -q 2 -r 1 -b "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" &
-wait
-
-ublk_run_recover_test -t null -q 2 -r 1 &
-ublk_run_recover_test -t loop -q 2 -r 1 "${UBLK_BACKFILES[0]}" &
-ublk_run_recover_test -t stripe -q 2 -r 1 "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" &
-wait
-
-ublk_run_recover_test -t null -q 2 -r 1 -i 1 &
-ublk_run_recover_test -t loop -q 2 -r 1 -i 1 "${UBLK_BACKFILES[0]}" &
-ublk_run_recover_test -t stripe -q 2 -r 1 -i 1 "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" &
-wait
-
-_cleanup_test "recover"
-_show_result $TID $ERR_CODE
diff --git a/tools/testing/selftests/ublk/test_generic_05.sh b/tools/testing/selftests/ublk/test_generic_05.sh
deleted file mode 100755
index bda5064bc31f..000000000000
--- a/tools/testing/selftests/ublk/test_generic_05.sh
+++ /dev/null
@@ -1,48 +0,0 @@
-#!/bin/bash
-# SPDX-License-Identifier: GPL-2.0
-
-. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
-
-ERR_CODE=0
-
-ublk_run_recover_test()
-{
-	run_io_and_recover 256M "kill_daemon" "$@"
-	ERR_CODE=$?
-	if [ ${ERR_CODE} -ne 0 ]; then
-		echo "$TID failure: $*"
-		_show_result $TID $ERR_CODE
-	fi
-}
-
-if ! _have_program fio; then
-	exit "$UBLK_SKIP_CODE"
-fi
-
-if ! _have_feature "ZERO_COPY"; then
-	exit "$UBLK_SKIP_CODE"
-fi
-
-_prep_test "recover" "basic recover function verification (zero copy)"
-
-_create_backfile 0 256M
-_create_backfile 1 128M
-_create_backfile 2 128M
-
-ublk_run_recover_test -t null -q 2 -r 1 -z -b &
-ublk_run_recover_test -t loop -q 2 -r 1 -z -b "${UBLK_BACKFILES[0]}" &
-ublk_run_recover_test -t stripe -q 2 -r 1 -z -b "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" &
-wait
-
-ublk_run_recover_test -t null -q 2 -r 1 -z &
-ublk_run_recover_test -t loop -q 2 -r 1 -z "${UBLK_BACKFILES[0]}" &
-ublk_run_recover_test -t stripe -q 2 -r 1 -z "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" &
-wait
-
-ublk_run_recover_test -t null -q 2 -r 1 -z -i 1 &
-ublk_run_recover_test -t loop -q 2 -r 1 -z -i 1 "${UBLK_BACKFILES[0]}" &
-ublk_run_recover_test -t stripe -q 2 -r 1 -z -i 1 "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" &
-wait
-
-_cleanup_test "recover"
-_show_result $TID $ERR_CODE
diff --git a/tools/testing/selftests/ublk/test_generic_11.sh b/tools/testing/selftests/ublk/test_generic_11.sh
deleted file mode 100755
index e0dc0b8fe5d6..000000000000
--- a/tools/testing/selftests/ublk/test_generic_11.sh
+++ /dev/null
@@ -1,43 +0,0 @@
-#!/bin/bash
-# SPDX-License-Identifier: GPL-2.0
-
-. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
-
-ERR_CODE=0
-
-ublk_run_quiesce_recover()
-{
-	run_io_and_recover 256M "quiesce_dev" "$@"
-	ERR_CODE=$?
-	if [ ${ERR_CODE} -ne 0 ]; then
-		echo "$TID failure: $*"
-		_show_result $TID $ERR_CODE
-	fi
-}
-
-if ! _have_feature "QUIESCE"; then
-	exit "$UBLK_SKIP_CODE"
-fi
-
-if ! _have_program fio; then
-	exit "$UBLK_SKIP_CODE"
-fi
-
-_prep_test "quiesce" "basic quiesce & recover function verification"
-
-_create_backfile 0 256M
-_create_backfile 1 128M
-_create_backfile 2 128M
-
-ublk_run_quiesce_recover -t null -q 2 -r 1 &
-ublk_run_quiesce_recover -t loop -q 2 -r 1 "${UBLK_BACKFILES[0]}" &
-ublk_run_quiesce_recover -t stripe -q 2 -r 1 "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" &
-wait
-
-ublk_run_quiesce_recover -t null -q 2 -r 1 -i 1 &
-ublk_run_quiesce_recover -t loop -q 2 -r 1 -i 1 "${UBLK_BACKFILES[0]}" &
-ublk_run_quiesce_recover -t stripe -q 2 -r 1 -i 1 "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" &
-wait
-
-_cleanup_test "quiesce"
-_show_result $TID $ERR_CODE
diff --git a/tools/testing/selftests/ublk/test_generic_14.sh b/tools/testing/selftests/ublk/test_generic_14.sh
deleted file mode 100755
index 178443394ca5..000000000000
--- a/tools/testing/selftests/ublk/test_generic_14.sh
+++ /dev/null
@@ -1,39 +0,0 @@
-#!/bin/bash
-# SPDX-License-Identifier: GPL-2.0
-
-. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
-
-ERR_CODE=0
-
-ublk_run_recover_test()
-{
-	run_io_and_recover 256M "kill_daemon" "$@"
-	ERR_CODE=$?
-	if [ ${ERR_CODE} -ne 0 ]; then
-		echo "$TID failure: $*"
-		_show_result $TID $ERR_CODE
-	fi
-}
-
-if ! _have_program fio; then
-	exit "$UBLK_SKIP_CODE"
-fi
-
-_prep_test "recover" "basic recover function verification (user copy)"
-
-_create_backfile 0 256M
-_create_backfile 1 128M
-_create_backfile 2 128M
-
-ublk_run_recover_test -t null -q 2 -r 1 -u &
-ublk_run_recover_test -t loop -q 2 -r 1 -u "${UBLK_BACKFILES[0]}" &
-ublk_run_recover_test -t stripe -q 2 -r 1 -u "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" &
-wait
-
-ublk_run_recover_test -t null -q 2 -r 1 -u -i 1 &
-ublk_run_recover_test -t loop -q 2 -r 1 -u -i 1 "${UBLK_BACKFILES[0]}" &
-ublk_run_recover_test -t stripe -q 2 -r 1 -u -i 1 "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" &
-wait
-
-_cleanup_test "recover"
-_show_result $TID $ERR_CODE
diff --git a/tools/testing/selftests/ublk/test_integrity_01.sh b/tools/testing/selftests/ublk/test_integrity_01.sh
new file mode 100755
index 000000000000..6713b280a6ff
--- /dev/null
+++ b/tools/testing/selftests/ublk/test_integrity_01.sh
@@ -0,0 +1,105 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
+
+ERR_CODE=0
+
+_check_value() {
+	local name=$1
+	local actual=$2
+	local expected=$3
+
+	if [ "$actual" != "$expected" ]; then
+		echo "$name $actual != $expected"
+		ERR_CODE=255
+		return 1
+	fi
+	return 0
+}
+
+_test_metadata_only() {
+	local dev_id
+
+	dev_id=$(_add_ublk_dev -t null -u --no_auto_part_scan --metadata_size 8)
+	_check_add_dev "$TID" $?
+
+	_check_value "metadata_size" "$(_get_metadata_size "$dev_id" metadata_size)" 8 &&
+	_check_value "pi_offset" "$(_get_metadata_size "$dev_id" pi_offset)" 0 &&
+	_check_value "pi_tuple_size" "$(_get_metadata_size "$dev_id" pi_tuple_size)" 0 &&
+	_check_value "device_is_integrity_capable" \
+		"$(cat "/sys/block/ublkb$dev_id/integrity/device_is_integrity_capable")" 0 &&
+	_check_value "format" "$(cat "/sys/block/ublkb$dev_id/integrity/format")" nop &&
+	_check_value "protection_interval_bytes" \
+		"$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes")" 512 &&
+	_check_value "tag_size" "$(cat "/sys/block/ublkb$dev_id/integrity/tag_size")" 0
+
+	_ublk_del_dev "${dev_id}"
+}
+
+_test_integrity_capable_ip() {
+	local dev_id
+
+	dev_id=$(_add_ublk_dev -t null -u --no_auto_part_scan --integrity_capable --metadata_size 64 --pi_offset 56 --csum_type ip)
+	_check_add_dev "$TID" $?
+
+	_check_value "metadata_size" "$(_get_metadata_size "$dev_id" metadata_size)" 64 &&
+	_check_value "pi_offset" "$(_get_metadata_size "$dev_id" pi_offset)" 56 &&
+	_check_value "pi_tuple_size" "$(_get_metadata_size "$dev_id" pi_tuple_size)" 8 &&
+	_check_value "device_is_integrity_capable" \
+		"$(cat "/sys/block/ublkb$dev_id/integrity/device_is_integrity_capable")" 1 &&
+	_check_value "format" "$(cat "/sys/block/ublkb$dev_id/integrity/format")" T10-DIF-TYPE3-IP &&
+	_check_value "protection_interval_bytes" \
+		"$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes")" 512 &&
+	_check_value "tag_size" "$(cat "/sys/block/ublkb$dev_id/integrity/tag_size")" 0
+
+	_ublk_del_dev "${dev_id}"
+}
+
+_test_integrity_reftag_t10dif() {
+	local dev_id
+
+	dev_id=$(_add_ublk_dev -t null -u --no_auto_part_scan --integrity_reftag --metadata_size 8 --csum_type t10dif)
+	_check_add_dev "$TID" $?
+
+	_check_value "metadata_size" "$(_get_metadata_size "$dev_id" metadata_size)" 8 &&
+	_check_value "pi_offset" "$(_get_metadata_size "$dev_id" pi_offset)" 0 &&
+	_check_value "pi_tuple_size" "$(_get_metadata_size "$dev_id" pi_tuple_size)" 8 &&
+	_check_value "device_is_integrity_capable" \
+		"$(cat "/sys/block/ublkb$dev_id/integrity/device_is_integrity_capable")" 0 &&
+	_check_value "format" "$(cat "/sys/block/ublkb$dev_id/integrity/format")" T10-DIF-TYPE1-CRC &&
+	_check_value "protection_interval_bytes" \
+		"$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes")" 512 &&
+	_check_value "tag_size" "$(cat "/sys/block/ublkb$dev_id/integrity/tag_size")" 0
+
+	_ublk_del_dev "${dev_id}"
+}
+
+_test_nvme_csum() {
+	local dev_id
+
+	dev_id=$(_add_ublk_dev -t null -u --no_auto_part_scan --metadata_size 16 --csum_type nvme --tag_size 8)
+	_check_add_dev "$TID" $?
+
+	_check_value "metadata_size" "$(_get_metadata_size "$dev_id" metadata_size)" 16 &&
+	_check_value "pi_offset" "$(_get_metadata_size "$dev_id" pi_offset)" 0 &&
+	_check_value "pi_tuple_size" "$(_get_metadata_size "$dev_id" pi_tuple_size)" 16 &&
+	_check_value "device_is_integrity_capable" \
+		"$(cat "/sys/block/ublkb$dev_id/integrity/device_is_integrity_capable")" 0 &&
+	_check_value "format" "$(cat "/sys/block/ublkb$dev_id/integrity/format")" EXT-DIF-TYPE3-CRC64 &&
+	_check_value "protection_interval_bytes" \
+		"$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes")" 512 &&
+	_check_value "tag_size" "$(cat "/sys/block/ublkb$dev_id/integrity/tag_size")" 8
+
+	_ublk_del_dev "${dev_id}"
+}
+
+_prep_test "null" "integrity params"
+
+_test_metadata_only
+_test_integrity_capable_ip
+_test_integrity_reftag_t10dif
+_test_nvme_csum
+
+_cleanup_test
+_show_result "$TID" $ERR_CODE
diff --git a/tools/testing/selftests/ublk/test_integrity_02.sh b/tools/testing/selftests/ublk/test_integrity_02.sh
new file mode 100755
index 000000000000..aaf1f52da559
--- /dev/null
+++ b/tools/testing/selftests/ublk/test_integrity_02.sh
@@ -0,0 +1,141 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
+
+if ! _have_program fio; then
+	exit $UBLK_SKIP_CODE
+fi
+
+fio_version=$(fio --version)
+if [[ "$fio_version" =~ fio-[0-9]+\.[0-9]+$ ]]; then
+	echo "Requires development fio version with https://github.com/axboe/fio/pull/1992"
+	exit $UBLK_SKIP_CODE
+fi
+
+ERR_CODE=0
+
+# Global variables set during device setup
+dev_id=""
+fio_args=""
+fio_err=""
+
+_setup_device() {
+	_create_backfile 0 256M
+	_create_backfile 1 32M # 256M * (64 integrity bytes / 512 data bytes)
+
+	local integrity_params="--integrity_capable --integrity_reftag
+		--metadata_size 64 --pi_offset 56 --csum_type t10dif"
+	dev_id=$(_add_ublk_dev -t loop -u $integrity_params "${UBLK_BACKFILES[@]}")
+	_check_add_dev "$TID" $?
+
+	# 1M * (64 integrity bytes / 512 data bytes) = 128K
+	fio_args="--ioengine io_uring --direct 1 --bsrange 512-1M --iodepth 32
+		--md_per_io_size 128K --pi_act 0 --pi_chk GUARD,REFTAG,APPTAG
+		--filename /dev/ublkb$dev_id"
+
+	fio_err=$(mktemp "${UBLK_TEST_DIR}"/fio_err_XXXXX)
+}
+
+_test_fill_and_verify() {
+	fio --name fill --rw randwrite $fio_args > /dev/null
+	if [ $? != 0 ]; then
+		echo "fio fill failed"
+		ERR_CODE=255
+		return 1
+	fi
+
+	fio --name verify --rw randread $fio_args > /dev/null
+	if [ $? != 0 ]; then
+		echo "fio verify failed"
+		ERR_CODE=255
+		return 1
+	fi
+}
+
+_test_corrupted_reftag() {
+	local dd_reftag_args="bs=1 seek=60 count=4 oflag=dsync conv=notrunc status=none"
+	local expected_err="REFTAG compare error: LBA: 0 Expected=0, Actual="
+
+	# Overwrite 4-byte reftag at offset 56 + 4 = 60
+	dd if=/dev/urandom "of=${UBLK_BACKFILES[1]}" $dd_reftag_args
+	if [ $? != 0 ]; then
+		echo "dd corrupted_reftag failed"
+		ERR_CODE=255
+		return 1
+	fi
+
+	if fio --name corrupted_reftag --rw randread $fio_args > /dev/null 2> "$fio_err"; then
+		echo "fio corrupted_reftag unexpectedly succeeded"
+		ERR_CODE=255
+		return 1
+	fi
+
+	if ! grep -q "$expected_err" "$fio_err"; then
+		echo "fio corrupted_reftag message not found: $expected_err"
+		ERR_CODE=255
+		return 1
+	fi
+
+	# Reset to 0
+	dd if=/dev/zero "of=${UBLK_BACKFILES[1]}" $dd_reftag_args
+	if [ $? != 0 ]; then
+		echo "dd restore corrupted_reftag failed"
+		ERR_CODE=255
+		return 1
+	fi
+}
+
+_test_corrupted_data() {
+	local dd_data_args="bs=512 count=1 oflag=direct,dsync conv=notrunc status=none"
+	local expected_err="Guard compare error: LBA: 0 Expected=0, Actual="
+
+	dd if=/dev/zero "of=${UBLK_BACKFILES[0]}" $dd_data_args
+	if [ $? != 0 ]; then
+		echo "dd corrupted_data failed"
+		ERR_CODE=255
+		return 1
+	fi
+
+	if fio --name corrupted_data --rw randread $fio_args > /dev/null 2> "$fio_err"; then
+		echo "fio corrupted_data unexpectedly succeeded"
+		ERR_CODE=255
+		return 1
+	fi
+
+	if ! grep -q "$expected_err" "$fio_err"; then
+		echo "fio corrupted_data message not found: $expected_err"
+		ERR_CODE=255
+		return 1
+	fi
+}
+
+_test_bad_apptag() {
+	local expected_err="APPTAG compare error: LBA: [0-9]* Expected=4321, Actual=1234"
+
+	if fio --name bad_apptag --rw randread $fio_args --apptag 0x4321 > /dev/null 2> "$fio_err"; then
+		echo "fio bad_apptag unexpectedly succeeded"
+		ERR_CODE=255
+		return 1
+	fi
+
+	if ! grep -q "$expected_err" "$fio_err"; then
+		echo "fio bad_apptag message not found: $expected_err"
+		ERR_CODE=255
+		return 1
+	fi
+}
+
+_prep_test "loop" "end-to-end integrity"
+
+_setup_device
+
+_test_fill_and_verify && \
+_test_corrupted_reftag && \
+_test_corrupted_data && \
+_test_bad_apptag
+
+rm -f "$fio_err"
+
+_cleanup_test
+_show_result "$TID" $ERR_CODE
diff --git a/tools/testing/selftests/ublk/test_loop_08.sh b/tools/testing/selftests/ublk/test_loop_08.sh
deleted file mode 100755
index aaf1f52da559..000000000000
--- a/tools/testing/selftests/ublk/test_loop_08.sh
+++ /dev/null
@@ -1,141 +0,0 @@
-#!/bin/bash
-# SPDX-License-Identifier: GPL-2.0
-
-. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
-
-if ! _have_program fio; then
-	exit $UBLK_SKIP_CODE
-fi
-
-fio_version=$(fio --version)
-if [[ "$fio_version" =~ fio-[0-9]+\.[0-9]+$ ]]; then
-	echo "Requires development fio version with https://github.com/axboe/fio/pull/1992"
-	exit $UBLK_SKIP_CODE
-fi
-
-ERR_CODE=0
-
-# Global variables set during device setup
-dev_id=""
-fio_args=""
-fio_err=""
-
-_setup_device() {
-	_create_backfile 0 256M
-	_create_backfile 1 32M # 256M * (64 integrity bytes / 512 data bytes)
-
-	local integrity_params="--integrity_capable --integrity_reftag
-		--metadata_size 64 --pi_offset 56 --csum_type t10dif"
-	dev_id=$(_add_ublk_dev -t loop -u $integrity_params "${UBLK_BACKFILES[@]}")
-	_check_add_dev "$TID" $?
-
-	# 1M * (64 integrity bytes / 512 data bytes) = 128K
-	fio_args="--ioengine io_uring --direct 1 --bsrange 512-1M --iodepth 32
-		--md_per_io_size 128K --pi_act 0 --pi_chk GUARD,REFTAG,APPTAG
-		--filename /dev/ublkb$dev_id"
-
-	fio_err=$(mktemp "${UBLK_TEST_DIR}"/fio_err_XXXXX)
-}
-
-_test_fill_and_verify() {
-	fio --name fill --rw randwrite $fio_args > /dev/null
-	if [ $? != 0 ]; then
-		echo "fio fill failed"
-		ERR_CODE=255
-		return 1
-	fi
-
-	fio --name verify --rw randread $fio_args > /dev/null
-	if [ $? != 0 ]; then
-		echo "fio verify failed"
-		ERR_CODE=255
-		return 1
-	fi
-}
-
-_test_corrupted_reftag() {
-	local dd_reftag_args="bs=1 seek=60 count=4 oflag=dsync conv=notrunc status=none"
-	local expected_err="REFTAG compare error: LBA: 0 Expected=0, Actual="
-
-	# Overwrite 4-byte reftag at offset 56 + 4 = 60
-	dd if=/dev/urandom "of=${UBLK_BACKFILES[1]}" $dd_reftag_args
-	if [ $? != 0 ]; then
-		echo "dd corrupted_reftag failed"
-		ERR_CODE=255
-		return 1
-	fi
-
-	if fio --name corrupted_reftag --rw randread $fio_args > /dev/null 2> "$fio_err"; then
-		echo "fio corrupted_reftag unexpectedly succeeded"
-		ERR_CODE=255
-		return 1
-	fi
-
-	if ! grep -q "$expected_err" "$fio_err"; then
-		echo "fio corrupted_reftag message not found: $expected_err"
-		ERR_CODE=255
-		return 1
-	fi
-
-	# Reset to 0
-	dd if=/dev/zero "of=${UBLK_BACKFILES[1]}" $dd_reftag_args
-	if [ $? != 0 ]; then
-		echo "dd restore corrupted_reftag failed"
-		ERR_CODE=255
-		return 1
-	fi
-}
-
-_test_corrupted_data() {
-	local dd_data_args="bs=512 count=1 oflag=direct,dsync conv=notrunc status=none"
-	local expected_err="Guard compare error: LBA: 0 Expected=0, Actual="
-
-	dd if=/dev/zero "of=${UBLK_BACKFILES[0]}" $dd_data_args
-	if [ $? != 0 ]; then
-		echo "dd corrupted_data failed"
-		ERR_CODE=255
-		return 1
-	fi
-
-	if fio --name corrupted_data --rw randread $fio_args > /dev/null 2> "$fio_err"; then
-		echo "fio corrupted_data unexpectedly succeeded"
-		ERR_CODE=255
-		return 1
-	fi
-
-	if ! grep -q "$expected_err" "$fio_err"; then
-		echo "fio corrupted_data message not found: $expected_err"
-		ERR_CODE=255
-		return 1
-	fi
-}
-
-_test_bad_apptag() {
-	local expected_err="APPTAG compare error: LBA: [0-9]* Expected=4321, Actual=1234"
-
-	if fio --name bad_apptag --rw randread $fio_args --apptag 0x4321 > /dev/null 2> "$fio_err"; then
-		echo "fio bad_apptag unexpectedly succeeded"
-		ERR_CODE=255
-		return 1
-	fi
-
-	if ! grep -q "$expected_err" "$fio_err"; then
-		echo "fio bad_apptag message not found: $expected_err"
-		ERR_CODE=255
-		return 1
-	fi
-}
-
-_prep_test "loop" "end-to-end integrity"
-
-_setup_device
-
-_test_fill_and_verify && \
-_test_corrupted_reftag && \
-_test_corrupted_data && \
-_test_bad_apptag
-
-rm -f "$fio_err"
-
-_cleanup_test
-_show_result "$TID" $ERR_CODE
diff --git a/tools/testing/selftests/ublk/test_null_04.sh b/tools/testing/selftests/ublk/test_null_04.sh
deleted file mode 100755
index 6713b280a6ff..000000000000
--- a/tools/testing/selftests/ublk/test_null_04.sh
+++ /dev/null
@@ -1,105 +0,0 @@
-#!/bin/bash
-# SPDX-License-Identifier: GPL-2.0
-
-. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
-
-ERR_CODE=0
-
-_check_value() {
-	local name=$1
-	local actual=$2
-	local expected=$3
-
-	if [ "$actual" != "$expected" ]; then
-		echo "$name $actual != $expected"
-		ERR_CODE=255
-		return 1
-	fi
-	return 0
-}
-
-_test_metadata_only() {
-	local dev_id
-
-	dev_id=$(_add_ublk_dev -t null -u --no_auto_part_scan --metadata_size 8)
-	_check_add_dev "$TID" $?
-
-	_check_value "metadata_size" "$(_get_metadata_size "$dev_id" metadata_size)" 8 &&
-	_check_value "pi_offset" "$(_get_metadata_size "$dev_id" pi_offset)" 0 &&
-	_check_value "pi_tuple_size" "$(_get_metadata_size "$dev_id" pi_tuple_size)" 0 &&
-	_check_value "device_is_integrity_capable" \
-		"$(cat "/sys/block/ublkb$dev_id/integrity/device_is_integrity_capable")" 0 &&
-	_check_value "format" "$(cat "/sys/block/ublkb$dev_id/integrity/format")" nop &&
-	_check_value "protection_interval_bytes" \
-		"$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes")" 512 &&
-	_check_value "tag_size" "$(cat "/sys/block/ublkb$dev_id/integrity/tag_size")" 0
-
-	_ublk_del_dev "${dev_id}"
-}
-
-_test_integrity_capable_ip() {
-	local dev_id
-
-	dev_id=$(_add_ublk_dev -t null -u --no_auto_part_scan --integrity_capable --metadata_size 64 --pi_offset 56 --csum_type ip)
-	_check_add_dev "$TID" $?
-
-	_check_value "metadata_size" "$(_get_metadata_size "$dev_id" metadata_size)" 64 &&
-	_check_value "pi_offset" "$(_get_metadata_size "$dev_id" pi_offset)" 56 &&
-	_check_value "pi_tuple_size" "$(_get_metadata_size "$dev_id" pi_tuple_size)" 8 &&
-	_check_value "device_is_integrity_capable" \
-		"$(cat "/sys/block/ublkb$dev_id/integrity/device_is_integrity_capable")" 1 &&
-	_check_value "format" "$(cat "/sys/block/ublkb$dev_id/integrity/format")" T10-DIF-TYPE3-IP &&
-	_check_value "protection_interval_bytes" \
-		"$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes")" 512 &&
-	_check_value "tag_size" "$(cat "/sys/block/ublkb$dev_id/integrity/tag_size")" 0
-
-	_ublk_del_dev "${dev_id}"
-}
-
-_test_integrity_reftag_t10dif() {
-	local dev_id
-
-	dev_id=$(_add_ublk_dev -t null -u --no_auto_part_scan --integrity_reftag --metadata_size 8 --csum_type t10dif)
-	_check_add_dev "$TID" $?
-
-	_check_value "metadata_size" "$(_get_metadata_size "$dev_id" metadata_size)" 8 &&
-	_check_value "pi_offset" "$(_get_metadata_size "$dev_id" pi_offset)" 0 &&
-	_check_value "pi_tuple_size" "$(_get_metadata_size "$dev_id" pi_tuple_size)" 8 &&
-	_check_value "device_is_integrity_capable" \
-		"$(cat "/sys/block/ublkb$dev_id/integrity/device_is_integrity_capable")" 0 &&
-	_check_value "format" "$(cat "/sys/block/ublkb$dev_id/integrity/format")" T10-DIF-TYPE1-CRC &&
-	_check_value "protection_interval_bytes" \
-		"$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes")" 512 &&
-	_check_value "tag_size" "$(cat "/sys/block/ublkb$dev_id/integrity/tag_size")" 0
-
-	_ublk_del_dev "${dev_id}"
-}
-
-_test_nvme_csum() {
-	local dev_id
-
-	dev_id=$(_add_ublk_dev -t null -u --no_auto_part_scan --metadata_size 16 --csum_type nvme --tag_size 8)
-	_check_add_dev "$TID" $?
-
-	_check_value "metadata_size" "$(_get_metadata_size "$dev_id" metadata_size)" 16 &&
-	_check_value "pi_offset" "$(_get_metadata_size "$dev_id" pi_offset)" 0 &&
-	_check_value "pi_tuple_size" "$(_get_metadata_size "$dev_id" pi_tuple_size)" 16 &&
-	_check_value "device_is_integrity_capable" \
-		"$(cat "/sys/block/ublkb$dev_id/integrity/device_is_integrity_capable")" 0 &&
-	_check_value "format" "$(cat "/sys/block/ublkb$dev_id/integrity/format")" EXT-DIF-TYPE3-CRC64 &&
-	_check_value "protection_interval_bytes" \
-		"$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes")" 512 &&
-	_check_value "tag_size" "$(cat "/sys/block/ublkb$dev_id/integrity/tag_size")" 8
-
-	_ublk_del_dev "${dev_id}"
-}
-
-_prep_test "null" "integrity params"
-
-_test_metadata_only
-_test_integrity_capable_ip
-_test_integrity_reftag_t10dif
-_test_nvme_csum
-
-_cleanup_test
-_show_result "$TID" $ERR_CODE
diff --git a/tools/testing/selftests/ublk/test_recover_01.sh b/tools/testing/selftests/ublk/test_recover_01.sh
new file mode 100755
index 000000000000..2672f9c40fa8
--- /dev/null
+++ b/tools/testing/selftests/ublk/test_recover_01.sh
@@ -0,0 +1,44 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
+
+ERR_CODE=0
+
+ublk_run_recover_test()
+{
+	run_io_and_recover 256M "kill_daemon" "$@"
+	ERR_CODE=$?
+	if [ ${ERR_CODE} -ne 0 ]; then
+		echo "$TID failure: $*"
+		_show_result $TID $ERR_CODE
+	fi
+}
+
+if ! _have_program fio; then
+	exit "$UBLK_SKIP_CODE"
+fi
+
+_prep_test "recover" "basic recover function verification"
+
+_create_backfile 0 256M
+_create_backfile 1 128M
+_create_backfile 2 128M
+
+ublk_run_recover_test -t null -q 2 -r 1 -b &
+ublk_run_recover_test -t loop -q 2 -r 1 -b "${UBLK_BACKFILES[0]}" &
+ublk_run_recover_test -t stripe -q 2 -r 1 -b "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" &
+wait
+
+ublk_run_recover_test -t null -q 2 -r 1 &
+ublk_run_recover_test -t loop -q 2 -r 1 "${UBLK_BACKFILES[0]}" &
+ublk_run_recover_test -t stripe -q 2 -r 1 "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" &
+wait
+
+ublk_run_recover_test -t null -q 2 -r 1 -i 1 &
+ublk_run_recover_test -t loop -q 2 -r 1 -i 1 "${UBLK_BACKFILES[0]}" &
+ublk_run_recover_test -t stripe -q 2 -r 1 -i 1 "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" &
+wait
+
+_cleanup_test "recover"
+_show_result $TID $ERR_CODE
diff --git a/tools/testing/selftests/ublk/test_recover_02.sh b/tools/testing/selftests/ublk/test_recover_02.sh
new file mode 100755
index 000000000000..bda5064bc31f
--- /dev/null
+++ b/tools/testing/selftests/ublk/test_recover_02.sh
@@ -0,0 +1,48 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
+
+ERR_CODE=0
+
+ublk_run_recover_test()
+{
+	run_io_and_recover 256M "kill_daemon" "$@"
+	ERR_CODE=$?
+	if [ ${ERR_CODE} -ne 0 ]; then
+		echo "$TID failure: $*"
+		_show_result $TID $ERR_CODE
+	fi
+}
+
+if ! _have_program fio; then
+	exit "$UBLK_SKIP_CODE"
+fi
+
+if ! _have_feature "ZERO_COPY"; then
+	exit "$UBLK_SKIP_CODE"
+fi
+
+_prep_test "recover" "basic recover function verification (zero copy)"
+
+_create_backfile 0 256M
+_create_backfile 1 128M
+_create_backfile 2 128M
+
+ublk_run_recover_test -t null -q 2 -r 1 -z -b &
+ublk_run_recover_test -t loop -q 2 -r 1 -z -b "${UBLK_BACKFILES[0]}" &
+ublk_run_recover_test -t stripe -q 2 -r 1 -z -b "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" &
+wait
+
+ublk_run_recover_test -t null -q 2 -r 1 -z &
+ublk_run_recover_test -t loop -q 2 -r 1 -z "${UBLK_BACKFILES[0]}" &
+ublk_run_recover_test -t stripe -q 2 -r 1 -z "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" &
+wait
+
+ublk_run_recover_test -t null -q 2 -r 1 -z -i 1 &
+ublk_run_recover_test -t loop -q 2 -r 1 -z -i 1 "${UBLK_BACKFILES[0]}" &
+ublk_run_recover_test -t stripe -q 2 -r 1 -z -i 1 "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" &
+wait
+
+_cleanup_test "recover"
+_show_result $TID $ERR_CODE
diff --git a/tools/testing/selftests/ublk/test_recover_03.sh b/tools/testing/selftests/ublk/test_recover_03.sh
new file mode 100755
index 000000000000..e0dc0b8fe5d6
--- /dev/null
+++ b/tools/testing/selftests/ublk/test_recover_03.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
+
+ERR_CODE=0
+
+ublk_run_quiesce_recover()
+{
+	run_io_and_recover 256M "quiesce_dev" "$@"
+	ERR_CODE=$?
+	if [ ${ERR_CODE} -ne 0 ]; then
+		echo "$TID failure: $*"
+		_show_result $TID $ERR_CODE
+	fi
+}
+
+if ! _have_feature "QUIESCE"; then
+	exit "$UBLK_SKIP_CODE"
+fi
+
+if ! _have_program fio; then
+	exit "$UBLK_SKIP_CODE"
+fi
+
+_prep_test "quiesce" "basic quiesce & recover function verification"
+
+_create_backfile 0 256M
+_create_backfile 1 128M
+_create_backfile 2 128M
+
+ublk_run_quiesce_recover -t null -q 2 -r 1 &
+ublk_run_quiesce_recover -t loop -q 2 -r 1 "${UBLK_BACKFILES[0]}" &
+ublk_run_quiesce_recover -t stripe -q 2 -r 1 "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" &
+wait
+
+ublk_run_quiesce_recover -t null -q 2 -r 1 -i 1 &
+ublk_run_quiesce_recover -t loop -q 2 -r 1 -i 1 "${UBLK_BACKFILES[0]}" &
+ublk_run_quiesce_recover -t stripe -q 2 -r 1 -i 1 "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" &
+wait
+
+_cleanup_test "quiesce"
+_show_result $TID $ERR_CODE
diff --git a/tools/testing/selftests/ublk/test_recover_04.sh b/tools/testing/selftests/ublk/test_recover_04.sh
new file mode 100755
index 000000000000..178443394ca5
--- /dev/null
+++ b/tools/testing/selftests/ublk/test_recover_04.sh
@@ -0,0 +1,39 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
+
+ERR_CODE=0
+
+ublk_run_recover_test()
+{
+	run_io_and_recover 256M "kill_daemon" "$@"
+	ERR_CODE=$?
+	if [ ${ERR_CODE} -ne 0 ]; then
+		echo "$TID failure: $*"
+		_show_result $TID $ERR_CODE
+	fi
+}
+
+if ! _have_program fio; then
+	exit "$UBLK_SKIP_CODE"
+fi
+
+_prep_test "recover" "basic recover function verification (user copy)"
+
+_create_backfile 0 256M
+_create_backfile 1 128M
+_create_backfile 2 128M
+
+ublk_run_recover_test -t null -q 2 -r 1 -u &
+ublk_run_recover_test -t loop -q 2 -r 1 -u "${UBLK_BACKFILES[0]}" &
+ublk_run_recover_test -t stripe -q 2 -r 1 -u "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" &
+wait
+
+ublk_run_recover_test -t null -q 2 -r 1 -u -i 1 &
+ublk_run_recover_test -t loop -q 2 -r 1 -u -i 1 "${UBLK_BACKFILES[0]}" &
+ublk_run_recover_test -t stripe -q 2 -r 1 -u -i 1 "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" &
+wait
+
+_cleanup_test "recover"
+_show_result $TID $ERR_CODE
-- 
cgit v1.2.3


From 5314d25afbc44d0449fa2519d2c9d7f3c319f74c Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Sun, 1 Feb 2026 00:23:40 +0800
Subject: selftests: ublk: improve I/O ordering test with bpftrace

Remove test_generic_01.sh since block layer may reorder I/O, making
the test prone to false positives. Apply the improvements to
test_generic_02.sh instead, which supposes for covering ublk dispatch
io order.

Rework test_generic_02 to verify that ublk dispatch doesn't reorder I/O
by comparing request start order with completion order using bpftrace.

The bpftrace script now:
- Tracks each request's start sequence number in a map keyed by sector
- On completion, verifies the request's start order matches expected
  completion order
- Reports any out-of-order completions detected

The test script:
- Wait bpftrace BEGIN code block is run
- Pins fio to CPU 0 for deterministic behavior
- Uses block_io_start and block_rq_complete tracepoints
- Checks bpftrace output for reordering errors

Reported-and-tested-by: Alexander Atanasov <alex@zazolabs.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 tools/testing/selftests/ublk/Makefile           |  3 +-
 tools/testing/selftests/ublk/test_generic_01.sh | 47 -------------------------
 tools/testing/selftests/ublk/test_generic_02.sh | 22 ++++++++----
 tools/testing/selftests/ublk/trace/seq_io.bt    | 47 ++++++++++++++++++++-----
 4 files changed, 54 insertions(+), 65 deletions(-)
 delete mode 100755 tools/testing/selftests/ublk/test_generic_01.sh

(limited to 'tools')

diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile
index a62a06e13006..8ac2d4a682a1 100644
--- a/tools/testing/selftests/ublk/Makefile
+++ b/tools/testing/selftests/ublk/Makefile
@@ -7,8 +7,7 @@ endif
 
 LDLIBS += -lpthread -lm -luring
 
-TEST_PROGS := test_generic_01.sh
-TEST_PROGS += test_generic_02.sh
+TEST_PROGS := test_generic_02.sh
 TEST_PROGS += test_generic_03.sh
 TEST_PROGS += test_generic_06.sh
 TEST_PROGS += test_generic_07.sh
diff --git a/tools/testing/selftests/ublk/test_generic_01.sh b/tools/testing/selftests/ublk/test_generic_01.sh
deleted file mode 100755
index 26cf3c7ceeb5..000000000000
--- a/tools/testing/selftests/ublk/test_generic_01.sh
+++ /dev/null
@@ -1,47 +0,0 @@
-#!/bin/bash
-# SPDX-License-Identifier: GPL-2.0
-
-. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
-
-ERR_CODE=0
-
-if ! _have_program bpftrace; then
-	exit "$UBLK_SKIP_CODE"
-fi
-
-if ! _have_program fio; then
-	exit "$UBLK_SKIP_CODE"
-fi
-
-_prep_test "null" "sequential io order"
-
-dev_id=$(_add_ublk_dev -t null)
-_check_add_dev $TID $?
-
-dev_t=$(_get_disk_dev_t "$dev_id")
-bpftrace trace/seq_io.bt "$dev_t" "W" 1 > "$UBLK_TMP" 2>&1 &
-btrace_pid=$!
-sleep 2
-
-if ! kill -0 "$btrace_pid" > /dev/null 2>&1; then
-	_cleanup_test "null"
-	exit "$UBLK_SKIP_CODE"
-fi
-
-# run fio over this ublk disk
-fio --name=write_seq \
-    --filename=/dev/ublkb"${dev_id}" \
-    --ioengine=libaio --iodepth=16 \
-    --rw=write \
-    --size=512M \
-    --direct=1 \
-    --bs=4k > /dev/null 2>&1
-ERR_CODE=$?
-kill "$btrace_pid"
-wait
-if grep -q "io_out_of_order" "$UBLK_TMP"; then
-	cat "$UBLK_TMP"
-	ERR_CODE=255
-fi
-_cleanup_test "null"
-_show_result $TID $ERR_CODE
diff --git a/tools/testing/selftests/ublk/test_generic_02.sh b/tools/testing/selftests/ublk/test_generic_02.sh
index 1d4b1d6e059c..46b657143fd6 100755
--- a/tools/testing/selftests/ublk/test_generic_02.sh
+++ b/tools/testing/selftests/ublk/test_generic_02.sh
@@ -13,7 +13,7 @@ if ! _have_program fio; then
 	exit "$UBLK_SKIP_CODE"
 fi
 
-_prep_test "null" "sequential io order for MQ"
+_prep_test "null" "ublk dispatch won't reorder IO for MQ"
 
 dev_id=$(_add_ublk_dev -t null -q 2)
 _check_add_dev $TID $?
@@ -21,15 +21,20 @@ _check_add_dev $TID $?
 dev_t=$(_get_disk_dev_t "$dev_id")
 bpftrace trace/seq_io.bt "$dev_t" "W" 1 > "$UBLK_TMP" 2>&1 &
 btrace_pid=$!
-sleep 2
 
-if ! kill -0 "$btrace_pid" > /dev/null 2>&1; then
+# Wait for bpftrace probes to be attached (BEGIN block prints BPFTRACE_READY)
+for _ in $(seq 100); do
+	grep -q "BPFTRACE_READY" "$UBLK_TMP" 2>/dev/null && break
+	sleep 0.1
+done
+
+if ! kill -0 "$btrace_pid" 2>/dev/null; then
 	_cleanup_test "null"
 	exit "$UBLK_SKIP_CODE"
 fi
 
-# run fio over this ublk disk
-fio --name=write_seq \
+# run fio over this ublk disk (pinned to CPU 0)
+taskset -c 0 fio --name=write_seq \
     --filename=/dev/ublkb"${dev_id}" \
     --ioengine=libaio --iodepth=16 \
     --rw=write \
@@ -39,8 +44,11 @@ fio --name=write_seq \
 ERR_CODE=$?
 kill "$btrace_pid"
 wait
-if grep -q "io_out_of_order" "$UBLK_TMP"; then
-	cat "$UBLK_TMP"
+
+# Check for out-of-order completions detected by bpftrace
+if grep -q "^out_of_order:" "$UBLK_TMP"; then
+	echo "I/O reordering detected:"
+	grep "^out_of_order:" "$UBLK_TMP"
 	ERR_CODE=255
 fi
 _cleanup_test "null"
diff --git a/tools/testing/selftests/ublk/trace/seq_io.bt b/tools/testing/selftests/ublk/trace/seq_io.bt
index b2f60a92b118..9d36ba35468f 100644
--- a/tools/testing/selftests/ublk/trace/seq_io.bt
+++ b/tools/testing/selftests/ublk/trace/seq_io.bt
@@ -2,23 +2,52 @@
 	$1: 	dev_t
 	$2: 	RWBS
 	$3:     strlen($2)
+
+	Track request order between block_io_start and block_rq_complete.
+	Sequence starts at 1 so 0 means "never seen". On first valid
+	completion, sync complete_seq to handle probe attachment races.
+	block_rq_complete listed first to reduce missed completion window.
 */
+
 BEGIN {
-	@last_rw[$1, str($2)] = (uint64)0;
+	@start_seq = (uint64)1;
+	@complete_seq = (uint64)0;
+	@out_of_order = (uint64)0;
+	@start_order[0] = (uint64)0;
+	delete(@start_order[0]);
+	printf("BPFTRACE_READY\n");
 }
+
 tracepoint:block:block_rq_complete
+/(int64)args.dev == $1 && !strncmp(args.rwbs, str($2), $3)/
 {
-	$dev = $1;
-	if ((int64)args.dev == $1 && !strncmp(args.rwbs, str($2), $3)) {
-		$last = @last_rw[$dev, str($2)];
-		if ((uint64)args.sector != $last) {
-			printf("io_out_of_order: exp %llu actual %llu\n",
-				args.sector, $last);
+	$expected = @start_order[args.sector];
+	if ($expected > 0) {
+		if (@complete_seq == 0) {
+			@complete_seq = $expected;
+		}
+		if ($expected != @complete_seq) {
+			printf("out_of_order: sector %llu started at seq %llu but completed at seq %llu\n",
+				args.sector, $expected, @complete_seq);
+			@out_of_order = @out_of_order + 1;
 		}
-		@last_rw[$dev, str($2)] = (args.sector + args.nr_sector);
+		delete(@start_order[args.sector]);
+		@complete_seq = @complete_seq + 1;
 	}
 }
 
+tracepoint:block:block_io_start
+/(int64)args.dev == $1 && !strncmp(args.rwbs, str($2), $3)/
+{
+	@start_order[args.sector] = @start_seq;
+	@start_seq = @start_seq + 1;
+}
+
 END {
-	clear(@last_rw);
+	printf("total_start: %llu total_complete: %llu out_of_order: %llu\n",
+		@start_seq - 1, @complete_seq, @out_of_order);
+	clear(@start_order);
+	clear(@start_seq);
+	clear(@complete_seq);
+	clear(@out_of_order);
 }
-- 
cgit v1.2.3


From 4ac76c51709dff01b285a2d8afea80ca7ae66d28 Mon Sep 17 00:00:00 2001
From: Kevin Brodsky <kevin.brodsky@arm.com>
Date: Thu, 22 Jan 2026 17:02:16 +0000
Subject: selftests/mm: default KDIR to build directory

Patch series "Various mm kselftests improvements/fixes", v3.

Various improvements/fixes for the mm kselftests:

- Patch 1-3 extend support for more build configurations: out-of-tree
  $KDIR, cross-compilation, etc.

- Patch 4-7 fix issues related to faulting in pages, introducing a new
  helper for that purpose.

- Patch 8 fixes the value returned by pagemap_ioctl (PASS was always
  returned, which explains why the issue fixed in patch 6 went
  unnoticed).

- Patch 9 improves the exit code of pfnmap.

Net results:
- 1 test no longer fails (patch 7)
- 3 tests are no longer skipped (patch 4)
- More accurate return values for whole suites (patch 8, 9)
- Extra tests are more likely to be built (patch 1-3)


This patch (of 9):

KDIR currently defaults to the running kernel's modules directory when
building the page_frag module.  The underlying assumption is that most
users build the kselftests in order to run them against the system they're
built on.

This assumption seems questionable, and there is no guarantee that the
module can actually be built against the running kernel.

Switch the default value of KDIR to the kernel's build directory, i.e.
$(O) if O= or KBUILD_OUTPUT= is used, and the source directory otherwise.
This seems like the least surprising option: the test module is built
against the kernel that has been previously built.

Note: we can't use $(top_srcdir) in mm/Makefile because it is only defined
once lib.mk is included.

Link: https://lkml.kernel.org/r/20260122170224.4056513-1-kevin.brodsky@arm.com
Link: https://lkml.kernel.org/r/20260122170224.4056513-2-kevin.brodsky@arm.com
Signed-off-by: Kevin Brodsky <kevin.brodsky@arm.com>
Cc: David Hildenbrand <david@kernel.org>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Mark Brown <broonie@kernel.org>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Paolo Abeni <pabeni@redhat.com>
Cc: SeongJae Park <sj@kernel.org>
Cc: Usama Anjum <Usama.Anjum@arm.com>
Cc: wang lian <lianux.mm@gmail.com>
Cc: Yunsheng Lin <linyunsheng@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/testing/selftests/mm/Makefile           | 2 +-
 tools/testing/selftests/mm/page_frag/Makefile | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile
index eaf9312097f7..bb93101e339e 100644
--- a/tools/testing/selftests/mm/Makefile
+++ b/tools/testing/selftests/mm/Makefile
@@ -44,7 +44,7 @@ LDLIBS = -lrt -lpthread -lm
 # warnings.
 CFLAGS += -U_FORTIFY_SOURCE
 
-KDIR ?= /lib/modules/$(shell uname -r)/build
+KDIR ?= $(if $(O),$(O),$(realpath ../../../..))
 ifneq (,$(wildcard $(KDIR)/Module.symvers))
 ifneq (,$(wildcard $(KDIR)/include/linux/page_frag_cache.h))
 TEST_GEN_MODS_DIR := page_frag
diff --git a/tools/testing/selftests/mm/page_frag/Makefile b/tools/testing/selftests/mm/page_frag/Makefile
index 8c8bb39ffa28..96e5f646e69b 100644
--- a/tools/testing/selftests/mm/page_frag/Makefile
+++ b/tools/testing/selftests/mm/page_frag/Makefile
@@ -1,5 +1,5 @@
 PAGE_FRAG_TEST_DIR := $(realpath $(dir $(abspath $(lastword $(MAKEFILE_LIST)))))
-KDIR ?= /lib/modules/$(shell uname -r)/build
+KDIR ?= $(if $(O),$(O),$(realpath ../../../../..))
 
 ifeq ($(V),1)
 Q =
-- 
cgit v1.2.3


From 1821be740d2e9329805cafa368e476064fde0789 Mon Sep 17 00:00:00 2001
From: Kevin Brodsky <kevin.brodsky@arm.com>
Date: Thu, 22 Jan 2026 17:02:17 +0000
Subject: selftests/mm: remove flaky header check

Commit 96ed62ea0298 ("mm: page_frag: fix a compile error when kernel is
not compiled") introduced a check to avoid attempting to build the
page_frag module if <linux/page_frag_cache.h> is missing.

Unfortunately this check only works if KDIR points to /lib/modules/...  or
an in-tree kernel build.  It always fails if KDIR points to an out-of-tree
build (i.e.  when the kernel was built with O=...  make) because only
generated headers are present under $KDIR/include/ in that case.

A recent commit switched KDIR to default to the kernel's build directory,
so that check is no longer justified.

Link: https://lkml.kernel.org/r/20260122170224.4056513-3-kevin.brodsky@arm.com
Signed-off-by: Kevin Brodsky <kevin.brodsky@arm.com>
Reviewed-by: Mark Brown <broonie@kernel.org>
Cc: Paolo Abeni <pabeni@redhat.com>
Cc: Yunsheng Lin <linyunsheng@huawei.com>
Cc: David Hildenbrand <david@kernel.org>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: SeongJae Park <sj@kernel.org>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Usama Anjum <Usama.Anjum@arm.com>
Cc: wang lian <lianux.mm@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/testing/selftests/mm/Makefile | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile
index bb93101e339e..4e5c8a330a0c 100644
--- a/tools/testing/selftests/mm/Makefile
+++ b/tools/testing/selftests/mm/Makefile
@@ -46,12 +46,8 @@ CFLAGS += -U_FORTIFY_SOURCE
 
 KDIR ?= $(if $(O),$(O),$(realpath ../../../..))
 ifneq (,$(wildcard $(KDIR)/Module.symvers))
-ifneq (,$(wildcard $(KDIR)/include/linux/page_frag_cache.h))
 TEST_GEN_MODS_DIR := page_frag
 else
-PAGE_FRAG_WARNING = "missing page_frag_cache.h, please use a newer kernel"
-endif
-else
 PAGE_FRAG_WARNING = "missing Module.symvers, please have the kernel built first"
 endif
 
-- 
cgit v1.2.3


From 7f532d19c8be76ad2fcd7ab6b0c9eb618f70966b Mon Sep 17 00:00:00 2001
From: Kevin Brodsky <kevin.brodsky@arm.com>
Date: Thu, 22 Jan 2026 17:02:18 +0000
Subject: selftests/mm: pass down full CC and CFLAGS to check_config.sh

check_config.sh checks that liburing is available by running the compiler
provided as its first argument.  This makes two assumptions:

1. CC consists of only one word
2. No extra flag is required

Unfortunately, there are many situations where these assumptions don't
hold.  For instance:

- When using Clang, CC consists of multiple words
- When cross-compiling, extra flags may be required to allow the
  compiler to find headers

Remove these assumptions by passing down CC and CFLAGS as-is from the
Makefile, so that the same command line is used as when actually building
the tests.

Link: https://lkml.kernel.org/r/20260122170224.4056513-4-kevin.brodsky@arm.com
Signed-off-by: Kevin Brodsky <kevin.brodsky@arm.com>
Reviewed-by: Mark Brown <broonie@kernel.org>
Acked-by: David Hildenbrand (Red Hat) <david@kernel.org>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Paolo Abeni <pabeni@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: SeongJae Park <sj@kernel.org>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Usama Anjum <Usama.Anjum@arm.com>
Cc: wang lian <lianux.mm@gmail.com>
Cc: Yunsheng Lin <linyunsheng@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/testing/selftests/mm/Makefile        | 2 +-
 tools/testing/selftests/mm/check_config.sh | 3 +--
 2 files changed, 2 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile
index 4e5c8a330a0c..de4afc34e3b1 100644
--- a/tools/testing/selftests/mm/Makefile
+++ b/tools/testing/selftests/mm/Makefile
@@ -230,7 +230,7 @@ $(OUTPUT)/migration: LDLIBS += -lnuma
 $(OUTPUT)/rmap: LDLIBS += -lnuma
 
 local_config.mk local_config.h: check_config.sh
-	/bin/sh ./check_config.sh $(CC)
+	CC="$(CC)" CFLAGS="$(CFLAGS)" ./check_config.sh
 
 EXTRA_CLEAN += local_config.mk local_config.h
 
diff --git a/tools/testing/selftests/mm/check_config.sh b/tools/testing/selftests/mm/check_config.sh
index 3954f4746161..b84c82bbf875 100755
--- a/tools/testing/selftests/mm/check_config.sh
+++ b/tools/testing/selftests/mm/check_config.sh
@@ -16,8 +16,7 @@ echo "#include <sys/types.h>"        > $tmpfile_c
 echo "#include <liburing.h>"        >> $tmpfile_c
 echo "int func(void) { return 0; }" >> $tmpfile_c
 
-CC=${1:?"Usage: $0 <compiler> # example compiler: gcc"}
-$CC -c $tmpfile_c -o $tmpfile_o >/dev/null 2>&1
+$CC $CFLAGS -c $tmpfile_c -o $tmpfile_o
 
 if [ -f $tmpfile_o ]; then
     echo "#define LOCAL_CONFIG_HAVE_LIBURING 1"  > $OUTPUT_H_FILE
-- 
cgit v1.2.3


From bce1dabd310e87fefe0645fec9ba98b84d37e418 Mon Sep 17 00:00:00 2001
From: Kevin Brodsky <kevin.brodsky@arm.com>
Date: Thu, 22 Jan 2026 17:02:19 +0000
Subject: selftests/mm: fix usage of FORCE_READ() in cow tests

Commit 5bbc2b785e63 ("selftests/mm: fix FORCE_READ to read input value
correctly") modified FORCE_READ() to take a value instead of a pointer.
It also changed most of the call sites accordingly, but missed many of
them in cow.c.  In those cases, we ended up with the pointer itself being
read, not the memory it points to.

No failure occurred as a result, so it looks like the tests work just fine
without faulting in.  However, the huge_zeropage tests explicitly check
that pages are populated, so those became skipped.

Convert all the remaining FORCE_READ() to fault in the mapped page, as was
originally intended.  This allows the huge_zeropage tests to run again (3
tests in total).

Link: https://lkml.kernel.org/r/20260122170224.4056513-5-kevin.brodsky@arm.com
Fixes: 5bbc2b785e63 ("selftests/mm: fix FORCE_READ to read input value correctly")
Signed-off-by: Kevin Brodsky <kevin.brodsky@arm.com>
Acked-by: SeongJae Park <sj@kernel.org>
Reviewed-by: wang lian <lianux.mm@gmail.com>
Acked-by: David Hildenbrand (Red Hat) <david@kernel.org>
Reviewed-by: Dev Jain <dev.jain@arm.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Mark Brown <broonie@kernel.org>
Cc: Paolo Abeni <pabeni@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Usama Anjum <Usama.Anjum@arm.com>
Cc: Yunsheng Lin <linyunsheng@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/testing/selftests/mm/cow.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/mm/cow.c b/tools/testing/selftests/mm/cow.c
index accfd198dbda..83b3563be26b 100644
--- a/tools/testing/selftests/mm/cow.c
+++ b/tools/testing/selftests/mm/cow.c
@@ -1612,8 +1612,8 @@ static void run_with_huge_zeropage(non_anon_test_fn fn, const char *desc)
 	 * the first sub-page and test if we get another sub-page populated
 	 * automatically.
 	 */
-	FORCE_READ(mem);
-	FORCE_READ(smem);
+	FORCE_READ(*mem);
+	FORCE_READ(*smem);
 	if (!pagemap_is_populated(pagemap_fd, mem + pagesize) ||
 	    !pagemap_is_populated(pagemap_fd, smem + pagesize)) {
 		ksft_test_result_skip("Did not get THPs populated\n");
@@ -1663,8 +1663,8 @@ static void run_with_memfd(non_anon_test_fn fn, const char *desc)
 	}
 
 	/* Fault the page in. */
-	FORCE_READ(mem);
-	FORCE_READ(smem);
+	FORCE_READ(*mem);
+	FORCE_READ(*smem);
 
 	fn(mem, smem, pagesize);
 munmap:
@@ -1719,8 +1719,8 @@ static void run_with_tmpfile(non_anon_test_fn fn, const char *desc)
 	}
 
 	/* Fault the page in. */
-	FORCE_READ(mem);
-	FORCE_READ(smem);
+	FORCE_READ(*mem);
+	FORCE_READ(*smem);
 
 	fn(mem, smem, pagesize);
 munmap:
@@ -1773,8 +1773,8 @@ static void run_with_memfd_hugetlb(non_anon_test_fn fn, const char *desc,
 	}
 
 	/* Fault the page in. */
-	FORCE_READ(mem);
-	FORCE_READ(smem);
+	FORCE_READ(*mem);
+	FORCE_READ(*smem);
 
 	fn(mem, smem, hugetlbsize);
 munmap:
-- 
cgit v1.2.3


From 20d3fac43608a1d7ef71991935abc4456baa1da7 Mon Sep 17 00:00:00 2001
From: Kevin Brodsky <kevin.brodsky@arm.com>
Date: Thu, 22 Jan 2026 17:02:20 +0000
Subject: selftests/mm: check that FORCE_READ() succeeded

Many cow tests rely on FORCE_READ() to populate pages.  Introduce a helper
to make sure that the pages are actually populated, and fail otherwise.

Link: https://lkml.kernel.org/r/20260122170224.4056513-6-kevin.brodsky@arm.com
Signed-off-by: Kevin Brodsky <kevin.brodsky@arm.com>
Suggested-by: David Hildenbrand (Red Hat) <david@kernel.org>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Mark Brown <broonie@kernel.org>
Cc: Paolo Abeni <pabeni@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: SeongJae Park <sj@kernel.org>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Usama Anjum <Usama.Anjum@arm.com>
Cc: wang lian <lianux.mm@gmail.com>
Cc: Yunsheng Lin <linyunsheng@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/testing/selftests/mm/cow.c | 43 ++++++++++++++++++++++++++++++----------
 1 file changed, 33 insertions(+), 10 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/mm/cow.c b/tools/testing/selftests/mm/cow.c
index 83b3563be26b..d9c69c04b67d 100644
--- a/tools/testing/selftests/mm/cow.c
+++ b/tools/testing/selftests/mm/cow.c
@@ -75,6 +75,18 @@ static bool range_is_swapped(void *addr, size_t size)
 	return true;
 }
 
+static bool populate_page_checked(char *addr)
+{
+	bool ret;
+
+	FORCE_READ(*addr);
+	ret = pagemap_is_populated(pagemap_fd, addr);
+	if (!ret)
+		ksft_print_msg("Failed to populate page\n");
+
+	return ret;
+}
+
 struct comm_pipes {
 	int child_ready[2];
 	int parent_ready[2];
@@ -1549,8 +1561,10 @@ static void run_with_zeropage(non_anon_test_fn fn, const char *desc)
 	}
 
 	/* Read from the page to populate the shared zeropage. */
-	FORCE_READ(*mem);
-	FORCE_READ(*smem);
+	if (!populate_page_checked(mem) || !populate_page_checked(smem)) {
+		log_test_result(KSFT_FAIL);
+		goto munmap;
+	}
 
 	fn(mem, smem, pagesize);
 munmap:
@@ -1612,8 +1626,11 @@ static void run_with_huge_zeropage(non_anon_test_fn fn, const char *desc)
 	 * the first sub-page and test if we get another sub-page populated
 	 * automatically.
 	 */
-	FORCE_READ(*mem);
-	FORCE_READ(*smem);
+	if (!populate_page_checked(mem) || !populate_page_checked(smem)) {
+		log_test_result(KSFT_FAIL);
+		goto munmap;
+	}
+
 	if (!pagemap_is_populated(pagemap_fd, mem + pagesize) ||
 	    !pagemap_is_populated(pagemap_fd, smem + pagesize)) {
 		ksft_test_result_skip("Did not get THPs populated\n");
@@ -1663,8 +1680,10 @@ static void run_with_memfd(non_anon_test_fn fn, const char *desc)
 	}
 
 	/* Fault the page in. */
-	FORCE_READ(*mem);
-	FORCE_READ(*smem);
+	if (!populate_page_checked(mem) || !populate_page_checked(smem)) {
+		log_test_result(KSFT_FAIL);
+		goto munmap;
+	}
 
 	fn(mem, smem, pagesize);
 munmap:
@@ -1719,8 +1738,10 @@ static void run_with_tmpfile(non_anon_test_fn fn, const char *desc)
 	}
 
 	/* Fault the page in. */
-	FORCE_READ(*mem);
-	FORCE_READ(*smem);
+	if (!populate_page_checked(mem) || !populate_page_checked(smem)) {
+		log_test_result(KSFT_FAIL);
+		goto munmap;
+	}
 
 	fn(mem, smem, pagesize);
 munmap:
@@ -1773,8 +1794,10 @@ static void run_with_memfd_hugetlb(non_anon_test_fn fn, const char *desc,
 	}
 
 	/* Fault the page in. */
-	FORCE_READ(*mem);
-	FORCE_READ(*smem);
+	if (!populate_page_checked(mem) || !populate_page_checked(smem)) {
+		log_test_result(KSFT_FAIL);
+		goto munmap;
+	}
 
 	fn(mem, smem, hugetlbsize);
 munmap:
-- 
cgit v1.2.3


From dd2b4e04c09808ff921e3460a608537d1a94595d Mon Sep 17 00:00:00 2001
From: Kevin Brodsky <kevin.brodsky@arm.com>
Date: Thu, 22 Jan 2026 17:02:21 +0000
Subject: selftests/mm: introduce helper to read every page

FORCE_READ(*addr) ensures that the compiler will emit a load from addr.
Several tests need to trigger such a load for a range of pages, ensuring
that every page is faulted in, if it wasn't already.

Introduce a new helper force_read_pages() that does exactly that and
replace existing loops with a call to it.

The step size (regular/huge page size) is preserved for all loops, except
in split_huge_page_test.  Reading every byte is unnecessary; we now read
every huge page, matching the following call to check_huge_file().

Link: https://lkml.kernel.org/r/20260122170224.4056513-7-kevin.brodsky@arm.com
Signed-off-by: Kevin Brodsky <kevin.brodsky@arm.com>
Reviewed-by: Dev Jain <dev.jain@arm.com>
Reviewed-by: Muhammad Usama Anjum <usama.anjum@arm.com>
Acked-by: David Hildenbrand (Red Hat) <david@kernel.org>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Mark Brown <broonie@kernel.org>
Cc: Paolo Abeni <pabeni@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: SeongJae Park <sj@kernel.org>
Cc: Shuah Khan <shuah@kernel.org>
Cc: wang lian <lianux.mm@gmail.com>
Cc: Yunsheng Lin <linyunsheng@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/testing/selftests/mm/hugetlb-madvise.c      | 9 +--------
 tools/testing/selftests/mm/pfnmap.c               | 9 +++------
 tools/testing/selftests/mm/split_huge_page_test.c | 6 +-----
 tools/testing/selftests/mm/vm_util.h              | 7 +++++++
 4 files changed, 12 insertions(+), 19 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/mm/hugetlb-madvise.c b/tools/testing/selftests/mm/hugetlb-madvise.c
index 05d9d2805ae4..5b12041fa310 100644
--- a/tools/testing/selftests/mm/hugetlb-madvise.c
+++ b/tools/testing/selftests/mm/hugetlb-madvise.c
@@ -47,14 +47,7 @@ void write_fault_pages(void *addr, unsigned long nr_pages)
 
 void read_fault_pages(void *addr, unsigned long nr_pages)
 {
-	unsigned long i;
-
-	for (i = 0; i < nr_pages; i++) {
-		unsigned long *addr2 =
-			((unsigned long *)(addr + (i * huge_page_size)));
-		/* Prevent the compiler from optimizing out the entire loop: */
-		FORCE_READ(*addr2);
-	}
+	force_read_pages(addr, nr_pages, huge_page_size);
 }
 
 int main(int argc, char **argv)
diff --git a/tools/testing/selftests/mm/pfnmap.c b/tools/testing/selftests/mm/pfnmap.c
index f546dfb10cae..45b5f1cf6019 100644
--- a/tools/testing/selftests/mm/pfnmap.c
+++ b/tools/testing/selftests/mm/pfnmap.c
@@ -35,18 +35,15 @@ static void signal_handler(int sig)
 
 static int test_read_access(char *addr, size_t size, size_t pagesize)
 {
-	size_t offs;
 	int ret;
 
 	if (signal(SIGSEGV, signal_handler) == SIG_ERR)
 		return -EINVAL;
 
 	ret = sigsetjmp(sigjmp_buf_env, 1);
-	if (!ret) {
-		for (offs = 0; offs < size; offs += pagesize)
-			/* Force a read that the compiler cannot optimize out. */
-			*((volatile char *)(addr + offs));
-	}
+	if (!ret)
+		force_read_pages(addr, size/pagesize, pagesize);
+
 	if (signal(SIGSEGV, SIG_DFL) == SIG_ERR)
 		return -EINVAL;
 
diff --git a/tools/testing/selftests/mm/split_huge_page_test.c b/tools/testing/selftests/mm/split_huge_page_test.c
index 40799f3f0213..e0167111bdd1 100644
--- a/tools/testing/selftests/mm/split_huge_page_test.c
+++ b/tools/testing/selftests/mm/split_huge_page_test.c
@@ -652,11 +652,7 @@ static int create_pagecache_thp_and_fd(const char *testfile, size_t fd_size,
 	}
 	madvise(*addr, fd_size, MADV_HUGEPAGE);
 
-	for (size_t i = 0; i < fd_size; i++) {
-		char *addr2 = *addr + i;
-
-		FORCE_READ(*addr2);
-	}
+	force_read_pages(*addr, fd_size / pmd_pagesize, pmd_pagesize);
 
 	if (!check_huge_file(*addr, fd_size / pmd_pagesize, pmd_pagesize)) {
 		ksft_print_msg("No large pagecache folio generated, please provide a filesystem supporting large folio\n");
diff --git a/tools/testing/selftests/mm/vm_util.h b/tools/testing/selftests/mm/vm_util.h
index 6ad32b1830f1..522f7f9050f5 100644
--- a/tools/testing/selftests/mm/vm_util.h
+++ b/tools/testing/selftests/mm/vm_util.h
@@ -54,6 +54,13 @@ static inline unsigned int pshift(void)
 	return __page_shift;
 }
 
+static inline void force_read_pages(char *addr, unsigned int nr_pages,
+				    size_t pagesize)
+{
+	for (unsigned int i = 0; i < nr_pages; i++)
+		FORCE_READ(addr[i * pagesize]);
+}
+
 bool detect_huge_zeropage(void);
 
 /*
-- 
cgit v1.2.3


From 7e938f00b00319510ae097e20b7487dfa578d53f Mon Sep 17 00:00:00 2001
From: Kevin Brodsky <kevin.brodsky@arm.com>
Date: Thu, 22 Jan 2026 17:02:22 +0000
Subject: selftests/mm: fix faulting-in code in pagemap_ioctl test

One of the pagemap_ioctl tests attempts to fault in pages by memcpy()'ing
them to an unused buffer.  This probably worked originally, but since
commit 46036188ea1f ("selftests/mm: build with -O2") the compiler is free
to optimise away that unused buffer and the memcpy() with it.  As a result
there might not be any resident page in the mapping and the test may fail.

We don't need to copy all that memory anyway.  Just fault in every page.

While at it also make sure to compute the number of pages once using
simple integer arithmetic instead of ceilf() and implicit conversions.

Link: https://lkml.kernel.org/r/20260122170224.4056513-8-kevin.brodsky@arm.com
Fixes: 46036188ea1f ("selftests/mm: build with -O2")
Signed-off-by: Kevin Brodsky <kevin.brodsky@arm.com>
Acked-by: David Hildenbrand (Red Hat) <david@kernel.org>
Reviewed-by: Dev Jain <dev.jain@arm.com>
Reviewed-by: Muhammad Usama Anjum <usama.anjum@arm.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Mark Brown <broonie@kernel.org>
Cc: Paolo Abeni <pabeni@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: SeongJae Park <sj@kernel.org>
Cc: Shuah Khan <shuah@kernel.org>
Cc: wang lian <lianux.mm@gmail.com>
Cc: Yunsheng Lin <linyunsheng@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/testing/selftests/mm/pagemap_ioctl.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/mm/pagemap_ioctl.c b/tools/testing/selftests/mm/pagemap_ioctl.c
index 2cb5441f29c7..1896c7d4f72e 100644
--- a/tools/testing/selftests/mm/pagemap_ioctl.c
+++ b/tools/testing/selftests/mm/pagemap_ioctl.c
@@ -1052,11 +1052,10 @@ static void test_simple(void)
 int sanity_tests(void)
 {
 	unsigned long long mem_size, vec_size;
-	long ret, fd, i, buf_size;
+	long ret, fd, i, buf_size, nr_pages;
 	struct page_region *vec;
 	char *mem, *fmem;
 	struct stat sbuf;
-	char *tmp_buf;
 
 	/* 1. wrong operation */
 	mem_size = 10 * page_size;
@@ -1167,14 +1166,14 @@ int sanity_tests(void)
 	if (fmem == MAP_FAILED)
 		ksft_exit_fail_msg("error nomem %d %s\n", errno, strerror(errno));
 
-	tmp_buf = malloc(sbuf.st_size);
-	memcpy(tmp_buf, fmem, sbuf.st_size);
+	nr_pages = (sbuf.st_size + page_size - 1) / page_size;
+	force_read_pages(fmem, nr_pages, page_size);
 
 	ret = pagemap_ioctl(fmem, sbuf.st_size, vec, vec_size, 0, 0,
 			    0, PAGEMAP_NON_WRITTEN_BITS, 0, PAGEMAP_NON_WRITTEN_BITS);
 
 	ksft_test_result(ret >= 0 && vec[0].start == (uintptr_t)fmem &&
-			 LEN(vec[0]) == ceilf((float)sbuf.st_size/page_size) &&
+			 LEN(vec[0]) == nr_pages &&
 			 (vec[0].categories & PAGE_IS_FILE),
 			 "%s Memory mapped file\n", __func__);
 
-- 
cgit v1.2.3


From 148e5879532f835118e00c3040acef077b57721a Mon Sep 17 00:00:00 2001
From: Kevin Brodsky <kevin.brodsky@arm.com>
Date: Thu, 22 Jan 2026 17:02:23 +0000
Subject: selftests/mm: fix exit code in pagemap_ioctl

Make sure pagemap_ioctl exits with an appropriate value:

* If the tests are run, call ksft_finished() to report the right
  status instead of reporting PASS unconditionally.

* Report SKIP if userfaultfd isn't available (in line with other
  tests)

* Report FAIL if we failed to open /proc/self/pagemap, as this file
  has been added a long time ago and doesn't depend on any CONFIG
  option (returning -EINVAL from main() is meaningless)

Link: https://lkml.kernel.org/r/20260122170224.4056513-9-kevin.brodsky@arm.com
Signed-off-by: Kevin Brodsky <kevin.brodsky@arm.com>
Reviewed-by: Ryan Roberts <ryan.roberts@arm.com>
Reviewed-by: Mark Brown <broonie@kernel.org>
Acked-by: David Hildenbrand (Red Hat) <david@kernel.org>
Acked-by: SeongJae Park <sj@kernel.org>
Reviewed-by: wang lian <lianux.mm@gmail.com>
Reviewed-by: Dev Jain <dev.jain@arm.com>
Cc: Usama Anjum <Usama.Anjum@arm.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Paolo Abeni <pabeni@redhat.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Yunsheng Lin <linyunsheng@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/testing/selftests/mm/pagemap_ioctl.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/mm/pagemap_ioctl.c b/tools/testing/selftests/mm/pagemap_ioctl.c
index 1896c7d4f72e..2ca8a7e3c27e 100644
--- a/tools/testing/selftests/mm/pagemap_ioctl.c
+++ b/tools/testing/selftests/mm/pagemap_ioctl.c
@@ -1552,7 +1552,7 @@ int main(int __attribute__((unused)) argc, char *argv[])
 	ksft_print_header();
 
 	if (init_uffd())
-		ksft_exit_pass();
+		ksft_exit_skip("Failed to initialize userfaultfd\n");
 
 	ksft_set_plan(117);
 
@@ -1561,7 +1561,7 @@ int main(int __attribute__((unused)) argc, char *argv[])
 
 	pagemap_fd = open(PAGEMAP, O_RDONLY);
 	if (pagemap_fd < 0)
-		return -EINVAL;
+		ksft_exit_fail_msg("Failed to open " PAGEMAP "\n");
 
 	/* 1. Sanity testing */
 	sanity_tests_sd();
@@ -1733,5 +1733,5 @@ int main(int __attribute__((unused)) argc, char *argv[])
 	zeropfn_tests();
 
 	close(pagemap_fd);
-	ksft_exit_pass();
+	ksft_finished();
 }
-- 
cgit v1.2.3


From fde8353121aa304ee88542f011dd5dc83ced47e4 Mon Sep 17 00:00:00 2001
From: Kevin Brodsky <kevin.brodsky@arm.com>
Date: Thu, 22 Jan 2026 17:02:24 +0000
Subject: selftests/mm: report SKIP in pfnmap if a check fails

pfnmap currently checks the target file in FIXTURE_SETUP(pfnmap), meaning
once for every test, and skips the test if any check fails.

The target file is the same for every test so this is a little overkill.
More importantly, this approach means that the whole suite will report
PASS even if all the tests are skipped because kernel configuration (e.g.
CONFIG_STRICT_DEVMEM=y) prevented /dev/mem from being mapped, for
instance.

Let's ensure that KSFT_SKIP is returned as exit code if any check fails by
performing the checks in pfnmap_init(), run once.  That function also
takes care of finding the offset of the pages to be mapped and saves it in
a global.  The file is now opened only once and the fd saved in a global,
but it is still mapped/unmapped for every test, as some of them modify the
mapping.

Link: https://lkml.kernel.org/r/20260122170224.4056513-10-kevin.brodsky@arm.com
Signed-off-by: Kevin Brodsky <kevin.brodsky@arm.com>
Acked-by: David Hildenbrand (Red Hat) <david@kernel.org>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Mark Brown <broonie@kernel.org>
Cc: Paolo Abeni <pabeni@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: SeongJae Park <sj@kernel.org>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Usama Anjum <Usama.Anjum@arm.com>
Cc: wang lian <lianux.mm@gmail.com>
Cc: Yunsheng Lin <linyunsheng@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/testing/selftests/mm/pfnmap.c | 84 +++++++++++++++++++++++--------------
 1 file changed, 53 insertions(+), 31 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/mm/pfnmap.c b/tools/testing/selftests/mm/pfnmap.c
index 45b5f1cf6019..4f550822385a 100644
--- a/tools/testing/selftests/mm/pfnmap.c
+++ b/tools/testing/selftests/mm/pfnmap.c
@@ -25,8 +25,12 @@
 #include "kselftest_harness.h"
 #include "vm_util.h"
 
+#define DEV_MEM_NPAGES	2
+
 static sigjmp_buf sigjmp_buf_env;
 static char *file = "/dev/mem";
+static off_t file_offset;
+static int fd;
 
 static void signal_handler(int sig)
 {
@@ -88,7 +92,7 @@ static int find_ram_target(off_t *offset,
 			break;
 
 		/* We need two pages. */
-		if (end > start + 2 * pagesize) {
+		if (end > start + DEV_MEM_NPAGES * pagesize) {
 			fclose(file);
 			*offset = start;
 			return 0;
@@ -97,11 +101,48 @@ static int find_ram_target(off_t *offset,
 	return -ENOENT;
 }
 
+static void pfnmap_init(void)
+{
+	size_t pagesize = getpagesize();
+	size_t size = DEV_MEM_NPAGES * pagesize;
+	void *addr;
+
+	if (strncmp(file, "/dev/mem", strlen("/dev/mem")) == 0) {
+		int err = find_ram_target(&file_offset, pagesize);
+
+		if (err)
+			ksft_exit_skip("Cannot find ram target in '/proc/iomem': %s\n",
+				       strerror(-err));
+	} else {
+		file_offset = 0;
+	}
+
+	fd = open(file, O_RDONLY);
+	if (fd < 0)
+		ksft_exit_skip("Cannot open '%s': %s\n", file, strerror(errno));
+
+	/*
+	 * Make sure we can map the file, and perform some basic checks; skip
+	 * the whole suite if anything goes wrong.
+	 * A fresh mapping is then created for every test case by
+	 * FIXTURE_SETUP(pfnmap).
+	 */
+	addr = mmap(NULL, size, PROT_READ, MAP_SHARED, fd, file_offset);
+	if (addr == MAP_FAILED)
+		ksft_exit_skip("Cannot mmap '%s': %s\n", file, strerror(errno));
+
+	if (!check_vmflag_pfnmap(addr))
+		ksft_exit_skip("Invalid file: '%s'. Not pfnmap'ed\n", file);
+
+	if (test_read_access(addr, size, pagesize))
+		ksft_exit_skip("Cannot read-access mmap'ed '%s'\n", file);
+
+	munmap(addr, size);
+}
+
 FIXTURE(pfnmap)
 {
-	off_t offset;
 	size_t pagesize;
-	int dev_mem_fd;
 	char *addr1;
 	size_t size1;
 	char *addr2;
@@ -112,31 +153,10 @@ FIXTURE_SETUP(pfnmap)
 {
 	self->pagesize = getpagesize();
 
-	if (strncmp(file, "/dev/mem", strlen("/dev/mem")) == 0) {
-		/* We'll require two physical pages throughout our tests ... */
-		if (find_ram_target(&self->offset, self->pagesize))
-			SKIP(return,
-				   "Cannot find ram target in '/proc/iomem'\n");
-	} else {
-		self->offset = 0;
-	}
-
-	self->dev_mem_fd = open(file, O_RDONLY);
-	if (self->dev_mem_fd < 0)
-		SKIP(return, "Cannot open '%s'\n", file);
-
-	self->size1 = self->pagesize * 2;
+	self->size1 = DEV_MEM_NPAGES * self->pagesize;
 	self->addr1 = mmap(NULL, self->size1, PROT_READ, MAP_SHARED,
-			   self->dev_mem_fd, self->offset);
-	if (self->addr1 == MAP_FAILED)
-		SKIP(return, "Cannot mmap '%s'\n", file);
-
-	if (!check_vmflag_pfnmap(self->addr1))
-		SKIP(return, "Invalid file: '%s'. Not pfnmap'ed\n", file);
-
-	/* ... and want to be able to read from them. */
-	if (test_read_access(self->addr1, self->size1, self->pagesize))
-		SKIP(return, "Cannot read-access mmap'ed '%s'\n", file);
+			   fd, file_offset);
+	ASSERT_NE(self->addr1, MAP_FAILED);
 
 	self->size2 = 0;
 	self->addr2 = MAP_FAILED;
@@ -148,8 +168,6 @@ FIXTURE_TEARDOWN(pfnmap)
 		munmap(self->addr2, self->size2);
 	if (self->addr1 != MAP_FAILED)
 		munmap(self->addr1, self->size1);
-	if (self->dev_mem_fd >= 0)
-		close(self->dev_mem_fd);
 }
 
 TEST_F(pfnmap, madvise_disallowed)
@@ -189,7 +207,7 @@ TEST_F(pfnmap, munmap_split)
 	 */
 	self->size2 = self->pagesize;
 	self->addr2 = mmap(NULL, self->pagesize, PROT_READ, MAP_SHARED,
-			   self->dev_mem_fd, self->offset);
+			   fd, file_offset);
 	ASSERT_NE(self->addr2, MAP_FAILED);
 }
 
@@ -259,8 +277,12 @@ int main(int argc, char **argv)
 		if (strcmp(argv[i], "--") == 0) {
 			if (i + 1 < argc && strlen(argv[i + 1]) > 0)
 				file = argv[i + 1];
-			return test_harness_run(i, argv);
+			argc = i;
+			break;
 		}
 	}
+
+	pfnmap_init();
+
 	return test_harness_run(argc, argv);
 }
-- 
cgit v1.2.3


From dd2c6ec24fca9235ccd1b9bfd382d0ddb419e41a Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Date: Fri, 16 Jan 2026 13:20:53 +0000
Subject: selftests/mm: remove virtual_address_range test

This self test is asserting internal implementation details and is highly
vulnerable to internal kernel changes as a result.

It is currently failing locally from at least v6.17, and it seems that it
may have been failing for longer in many configurations/hardware as it
skips if e.g.  CONFIG_ANON_VMA_NAME is not specified.

With these skips and the fact that run_vmtests.sh won't run the tests in
certain configurations it is likely we have simply missed this test being
broken in CI for a long while.

I have tried multiple versions of these tests and am unable to find a
working bisect as previous versions of the test fail also.

The tests are essentially mmap()'ing a series of mappings with no hint and
asserting what the get_unmapped_area*() functions will come up with, with
seemingly few checks for what other mappings may already be in place.

It then appears to be mmap()'ing with a hint, and making a series of
similar assertions about the internal implementation details of the
hinting logic.

Commit 0ef3783d7558 ("selftests/mm: add support to test 4PB VA on PPC64"),
commit 3bd6137220bb ("selftests/mm: virtual_address_range: avoid reading
from VM_IO mappings"), and especially commit a005145b9c96 ("selftests/mm:
virtual_address_range: mmap() without PROT_WRITE") are good examples of
the whack-a-mole nature of maintaining this test.

The last commit there being particularly pertinent as it was accounting
for an internal implementation detail change that really should have no
bearing on self-tests, that is commit e93d2521b27f ("x86/vdso: Split
virtual clock pages into dedicated mapping").

The purpose of the mm self-tests are to assert attributes about the API
exposed to users, and to ensure that expectations are met.

This test is emphatically not doing this, rather making a series of
assumptions about internal implementation details and asserting them.

It therefore, sadly, seems that the best course is to remove this test
altogether.

Link: https://lkml.kernel.org/r/20260116132053.857887-1-lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Acked-by: SeongJae Park <sj@kernel.org>
Cc: David Hildenbrand <david@kernel.org>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Mark Brown <broonie@kernel.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/testing/selftests/mm/.gitignore              |   1 -
 tools/testing/selftests/mm/Makefile                |   3 -
 tools/testing/selftests/mm/run_vmtests.sh          |  12 -
 tools/testing/selftests/mm/virtual_address_range.c | 260 ---------------------
 4 files changed, 276 deletions(-)
 delete mode 100644 tools/testing/selftests/mm/virtual_address_range.c

(limited to 'tools')

diff --git a/tools/testing/selftests/mm/.gitignore b/tools/testing/selftests/mm/.gitignore
index c2a8586e51a1..702e5723c35d 100644
--- a/tools/testing/selftests/mm/.gitignore
+++ b/tools/testing/selftests/mm/.gitignore
@@ -32,7 +32,6 @@ uffd-unit-tests
 uffd-wp-mremap
 mlock-intersect-test
 mlock-random-test
-virtual_address_range
 gup_test
 va_128TBswitch
 map_fixed_noreplace
diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile
index de4afc34e3b1..2fdb05e5a56a 100644
--- a/tools/testing/selftests/mm/Makefile
+++ b/tools/testing/selftests/mm/Makefile
@@ -136,9 +136,6 @@ endif
 
 ifneq (,$(filter $(ARCH),arm64 mips64 parisc64 powerpc riscv64 s390x sparc64 x86_64 s390))
 TEST_GEN_FILES += va_high_addr_switch
-ifneq ($(ARCH),riscv64)
-TEST_GEN_FILES += virtual_address_range
-endif
 TEST_GEN_FILES += write_to_hugetlbfs
 endif
 
diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh
index 2dadbfc6e535..452875db532c 100755
--- a/tools/testing/selftests/mm/run_vmtests.sh
+++ b/tools/testing/selftests/mm/run_vmtests.sh
@@ -399,18 +399,6 @@ CATEGORY="hugetlb" run_test ./hugetlb-read-hwpoison
 fi
 
 if [ $VADDR64 -ne 0 ]; then
-
-	# set overcommit_policy as OVERCOMMIT_ALWAYS so that kernel
-	# allows high virtual address allocation requests independent
-	# of platform's physical memory.
-
-	if [ -x ./virtual_address_range ]; then
-		prev_policy=$(cat /proc/sys/vm/overcommit_memory)
-		echo 1 > /proc/sys/vm/overcommit_memory
-		CATEGORY="hugevm" run_test ./virtual_address_range
-		echo $prev_policy > /proc/sys/vm/overcommit_memory
-	fi
-
 	# va high address boundary switch test
 	CATEGORY="hugevm" run_test bash ./va_high_addr_switch.sh
 fi # VADDR64
diff --git a/tools/testing/selftests/mm/virtual_address_range.c b/tools/testing/selftests/mm/virtual_address_range.c
deleted file mode 100644
index 4f0923825ed7..000000000000
--- a/tools/testing/selftests/mm/virtual_address_range.c
+++ /dev/null
@@ -1,260 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright 2017, Anshuman Khandual, IBM Corp.
- *
- * Works on architectures which support 128TB virtual
- * address range and beyond.
- */
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <unistd.h>
-#include <errno.h>
-#include <sys/prctl.h>
-#include <sys/mman.h>
-#include <sys/time.h>
-#include <fcntl.h>
-
-#include "vm_util.h"
-#include "kselftest.h"
-
-/*
- * Maximum address range mapped with a single mmap()
- * call is little bit more than 1GB. Hence 1GB is
- * chosen as the single chunk size for address space
- * mapping.
- */
-
-#define SZ_1GB	(1024 * 1024 * 1024UL)
-#define SZ_1TB	(1024 * 1024 * 1024 * 1024UL)
-
-#define MAP_CHUNK_SIZE	SZ_1GB
-
-/*
- * Address space till 128TB is mapped without any hint
- * and is enabled by default. Address space beyond 128TB
- * till 512TB is obtained by passing hint address as the
- * first argument into mmap() system call.
- *
- * The process heap address space is divided into two
- * different areas one below 128TB and one above 128TB
- * till it reaches 512TB. One with size 128TB and the
- * other being 384TB.
- *
- * On Arm64 the address space is 256TB and support for
- * high mappings up to 4PB virtual address space has
- * been added.
- *
- * On PowerPC64, the address space up to 128TB can be
- * mapped without a hint. Addresses beyond 128TB, up to
- * 4PB, can be mapped with a hint.
- *
- */
-
-#define NR_CHUNKS_128TB   ((128 * SZ_1TB) / MAP_CHUNK_SIZE) /* Number of chunks for 128TB */
-#define NR_CHUNKS_256TB   (NR_CHUNKS_128TB * 2UL)
-#define NR_CHUNKS_384TB   (NR_CHUNKS_128TB * 3UL)
-#define NR_CHUNKS_3840TB  (NR_CHUNKS_128TB * 30UL)
-#define NR_CHUNKS_3968TB  (NR_CHUNKS_128TB * 31UL)
-
-#define ADDR_MARK_128TB  (1UL << 47) /* First address beyond 128TB */
-#define ADDR_MARK_256TB  (1UL << 48) /* First address beyond 256TB */
-
-#ifdef __aarch64__
-#define HIGH_ADDR_MARK  ADDR_MARK_256TB
-#define HIGH_ADDR_SHIFT 49
-#define NR_CHUNKS_LOW   NR_CHUNKS_256TB
-#define NR_CHUNKS_HIGH  NR_CHUNKS_3840TB
-#elif defined(__PPC64__)
-#define HIGH_ADDR_MARK  ADDR_MARK_128TB
-#define HIGH_ADDR_SHIFT 48
-#define NR_CHUNKS_LOW   NR_CHUNKS_128TB
-#define NR_CHUNKS_HIGH  NR_CHUNKS_3968TB
-#else
-#define HIGH_ADDR_MARK  ADDR_MARK_128TB
-#define HIGH_ADDR_SHIFT 48
-#define NR_CHUNKS_LOW   NR_CHUNKS_128TB
-#define NR_CHUNKS_HIGH  NR_CHUNKS_384TB
-#endif
-
-static char *hint_addr(void)
-{
-	int bits = HIGH_ADDR_SHIFT + rand() % (63 - HIGH_ADDR_SHIFT);
-
-	return (char *) (1UL << bits);
-}
-
-static void validate_addr(char *ptr, int high_addr)
-{
-	unsigned long addr = (unsigned long) ptr;
-
-	if (high_addr) {
-		if (addr < HIGH_ADDR_MARK)
-			ksft_exit_fail_msg("Bad address %lx\n", addr);
-		return;
-	}
-
-	if (addr > HIGH_ADDR_MARK)
-		ksft_exit_fail_msg("Bad address %lx\n", addr);
-}
-
-static void mark_range(char *ptr, size_t size)
-{
-	if (prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, ptr, size, "virtual_address_range") == -1) {
-		if (errno == EINVAL) {
-			/* Depends on CONFIG_ANON_VMA_NAME */
-			ksft_test_result_skip("prctl(PR_SET_VMA_ANON_NAME) not supported\n");
-			ksft_finished();
-		} else {
-			ksft_exit_fail_perror("prctl(PR_SET_VMA_ANON_NAME) failed\n");
-		}
-	}
-}
-
-static int is_marked_vma(const char *vma_name)
-{
-	return vma_name && !strcmp(vma_name, "[anon:virtual_address_range]\n");
-}
-
-static int validate_lower_address_hint(void)
-{
-	char *ptr;
-
-	ptr = mmap((void *) (1UL << 45), MAP_CHUNK_SIZE, PROT_READ |
-		   PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
-
-	if (ptr == MAP_FAILED)
-		return 0;
-
-	return 1;
-}
-
-static int validate_complete_va_space(void)
-{
-	unsigned long start_addr, end_addr, prev_end_addr;
-	char line[400];
-	char prot[6];
-	FILE *file;
-	int fd;
-
-	fd = open("va_dump", O_CREAT | O_WRONLY, 0600);
-	unlink("va_dump");
-	if (fd < 0) {
-		ksft_test_result_skip("cannot create or open dump file\n");
-		ksft_finished();
-	}
-
-	file = fopen("/proc/self/maps", "r");
-	if (file == NULL)
-		ksft_exit_fail_msg("cannot open /proc/self/maps\n");
-
-	prev_end_addr = 0;
-	while (fgets(line, sizeof(line), file)) {
-		const char *vma_name = NULL;
-		int vma_name_start = 0;
-		unsigned long hop;
-
-		if (sscanf(line, "%lx-%lx %4s %*s %*s %*s %n",
-			   &start_addr, &end_addr, prot, &vma_name_start) != 3)
-			ksft_exit_fail_msg("cannot parse /proc/self/maps\n");
-
-		if (vma_name_start)
-			vma_name = line + vma_name_start;
-
-		/* end of userspace mappings; ignore vsyscall mapping */
-		if (start_addr & (1UL << 63))
-			return 0;
-
-		/* /proc/self/maps must have gaps less than MAP_CHUNK_SIZE */
-		if (start_addr - prev_end_addr >= MAP_CHUNK_SIZE)
-			return 1;
-
-		prev_end_addr = end_addr;
-
-		if (prot[0] != 'r')
-			continue;
-
-		if (check_vmflag_io((void *)start_addr))
-			continue;
-
-		/*
-		 * Confirm whether MAP_CHUNK_SIZE chunk can be found or not.
-		 * If write succeeds, no need to check MAP_CHUNK_SIZE - 1
-		 * addresses after that. If the address was not held by this
-		 * process, write would fail with errno set to EFAULT.
-		 * Anyways, if write returns anything apart from 1, exit the
-		 * program since that would mean a bug in /proc/self/maps.
-		 */
-		hop = 0;
-		while (start_addr + hop < end_addr) {
-			if (write(fd, (void *)(start_addr + hop), 1) != 1)
-				return 1;
-			lseek(fd, 0, SEEK_SET);
-
-			if (is_marked_vma(vma_name))
-				munmap((char *)(start_addr + hop), MAP_CHUNK_SIZE);
-
-			hop += MAP_CHUNK_SIZE;
-		}
-	}
-	return 0;
-}
-
-int main(int argc, char *argv[])
-{
-	char *ptr[NR_CHUNKS_LOW];
-	char **hptr;
-	char *hint;
-	unsigned long i, lchunks, hchunks;
-
-	ksft_print_header();
-	ksft_set_plan(1);
-
-	for (i = 0; i < NR_CHUNKS_LOW; i++) {
-		ptr[i] = mmap(NULL, MAP_CHUNK_SIZE, PROT_READ,
-			      MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
-
-		if (ptr[i] == MAP_FAILED) {
-			if (validate_lower_address_hint())
-				ksft_exit_fail_msg("mmap unexpectedly succeeded with hint\n");
-			break;
-		}
-
-		mark_range(ptr[i], MAP_CHUNK_SIZE);
-		validate_addr(ptr[i], 0);
-	}
-	lchunks = i;
-	hptr = (char **) calloc(NR_CHUNKS_HIGH, sizeof(char *));
-	if (hptr == NULL) {
-		ksft_test_result_skip("Memory constraint not fulfilled\n");
-		ksft_finished();
-	}
-
-	for (i = 0; i < NR_CHUNKS_HIGH; i++) {
-		hint = hint_addr();
-		hptr[i] = mmap(hint, MAP_CHUNK_SIZE, PROT_READ,
-			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
-
-		if (hptr[i] == MAP_FAILED)
-			break;
-
-		mark_range(hptr[i], MAP_CHUNK_SIZE);
-		validate_addr(hptr[i], 1);
-	}
-	hchunks = i;
-	if (validate_complete_va_space()) {
-		ksft_test_result_fail("BUG in mmap() or /proc/self/maps\n");
-		ksft_finished();
-	}
-
-	for (i = 0; i < lchunks; i++)
-		munmap(ptr[i], MAP_CHUNK_SIZE);
-
-	for (i = 0; i < hchunks; i++)
-		munmap(hptr[i], MAP_CHUNK_SIZE);
-
-	free(hptr);
-
-	ksft_test_result_pass("Test\n");
-	ksft_finished();
-}
-- 
cgit v1.2.3


From 94a62284ede0250e48c886416041ad65907ee917 Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Fri, 16 Jan 2026 18:07:24 -0800
Subject: selftests/damon/sysfs_memcg_path_leak.sh: use kmemleak

Patch series "selftests/damon: improve leak detection and wss estimation
reliability".

Two DAMON selftets, namely 'sysfs_memcg_leak' and
'sysfs_update_schemes_tried_regions_wss_estimation' frequently show
intermittent failures due to their unreliable leak detection and working
set size estimation.  Make those more reliable.


This patch (of 5):

sysfs_memcg_path_leak.sh determines if the memory leak has happened by
seeing if Slab size on /proc/meminfo increases more than expected after an
action.  Depending on the system and background workloads, the reasonable
expectation varies.  For the reason, the test frequently shows
intermittent failures.  Use kmemleak, which is much more reliable and
correct, instead.

Link: https://lkml.kernel.org/r/20260117020731.226785-1-sj@kernel.org
Link: https://lkml.kernel.org/r/20260117020731.226785-2-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Shuah Khan <shuah@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 .../selftests/damon/sysfs_memcg_path_leak.sh       | 26 ++++++++++++----------
 1 file changed, 14 insertions(+), 12 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/damon/sysfs_memcg_path_leak.sh b/tools/testing/selftests/damon/sysfs_memcg_path_leak.sh
index 64c5d8c518a4..33a7ff43ed6c 100755
--- a/tools/testing/selftests/damon/sysfs_memcg_path_leak.sh
+++ b/tools/testing/selftests/damon/sysfs_memcg_path_leak.sh
@@ -14,6 +14,13 @@ then
 	exit $ksft_skip
 fi
 
+kmemleak="/sys/kernel/debug/kmemleak"
+if [ ! -f "$kmemleak" ]
+then
+	echo "$kmemleak not found"
+	exit $ksft_skip
+fi
+
 # ensure filter directory
 echo 1 > "$damon_sysfs/kdamonds/nr_kdamonds"
 echo 1 > "$damon_sysfs/kdamonds/0/contexts/nr_contexts"
@@ -22,22 +29,17 @@ echo 1 > "$damon_sysfs/kdamonds/0/contexts/0/schemes/0/filters/nr_filters"
 
 filter_dir="$damon_sysfs/kdamonds/0/contexts/0/schemes/0/filters/0"
 
-before_kb=$(grep Slab /proc/meminfo | awk '{print $2}')
-
-# try to leak 3000 KiB
-for i in {1..102400};
+# try to leak 128 times
+for i in {1..128};
 do
 	echo "012345678901234567890123456789" > "$filter_dir/memcg_path"
 done
 
-after_kb=$(grep Slab /proc/meminfo | awk '{print $2}')
-# expect up to 1500 KiB free from other tasks memory
-expected_after_kb_max=$((before_kb + 1500))
-
-if [ "$after_kb" -gt "$expected_after_kb_max" ]
+echo scan > "$kmemleak"
+kmemleak_report=$(cat "$kmemleak")
+if [ "$kmemleak_report" = "" ]
 then
-	echo "maybe memcg_path are leaking: $before_kb -> $after_kb"
-	exit 1
-else
 	exit 0
 fi
+echo "$kmemleak_report"
+exit 1
-- 
cgit v1.2.3


From 891d206e27dc1a684e460b079d2b53e17135d693 Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Fri, 16 Jan 2026 18:07:25 -0800
Subject: selftests/damon/wss_estimation: test for up to 160 MiB working set
 size

DAMON reads and writes Accessed bits of page tables without manual TLB
flush for two reasons.  First, it minimizes the overhead.  Second, real
systems that need DAMON are expected to be memory intensive enough to
cause periodic TLB flushes.  For test setups that use small test
workloads, however, the system's TLB could be big enough to cover whole or
most accesses of the test workload.  In this case, no page table walk
happens and DAMON cannot show any access from the test workload.

The test workload for DAMON's working set size estimation selftest is such
a case.  It accesses only 10 MiB working set, and it turned out there are
test setups that have TLBs large enough to cover the 10 MiB data accesses.
As a result, the test fails depending on the test machine.

Make it more reliable by trying larger working sets up to 160 MiB when it
fails.

Link: https://lkml.kernel.org/r/20260117020731.226785-3-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Shuah Khan <shuah@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 ..._update_schemes_tried_regions_wss_estimation.py | 29 +++++++++++++++++-----
 1 file changed, 23 insertions(+), 6 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/damon/sysfs_update_schemes_tried_regions_wss_estimation.py b/tools/testing/selftests/damon/sysfs_update_schemes_tried_regions_wss_estimation.py
index 90ad7409a7a6..bf48ef8e5241 100755
--- a/tools/testing/selftests/damon/sysfs_update_schemes_tried_regions_wss_estimation.py
+++ b/tools/testing/selftests/damon/sysfs_update_schemes_tried_regions_wss_estimation.py
@@ -6,9 +6,8 @@ import time
 
 import _damon_sysfs
 
-def main():
-    # access two 10 MiB memory regions, 2 second per each
-    sz_region = 10 * 1024 * 1024
+def pass_wss_estimation(sz_region):
+    # access two regions of given size, 2 seocnds per each region
     proc = subprocess.Popen(['./access_memory', '2', '%d' % sz_region, '2000'])
     kdamonds = _damon_sysfs.Kdamonds([_damon_sysfs.Kdamond(
             contexts=[_damon_sysfs.DamonCtx(
@@ -36,20 +35,38 @@ def main():
 
         wss_collected.append(
                 kdamonds.kdamonds[0].contexts[0].schemes[0].tried_bytes)
+    err = kdamonds.stop()
+    if err is not None:
+        print('kdamond stop failed: %s' % err)
+        exit(1)
 
     wss_collected.sort()
     acceptable_error_rate = 0.2
     for percentile in [50, 75]:
         sample = wss_collected[int(len(wss_collected) * percentile / 100)]
         error_rate = abs(sample - sz_region) / sz_region
-        print('%d-th percentile (%d) error %f' %
-                (percentile, sample, error_rate))
+        print('%d-th percentile error %f (expect %d, result %d)' %
+                (percentile, error_rate, sz_region, sample))
         if error_rate > acceptable_error_rate:
             print('the error rate is not acceptable (> %f)' %
                     acceptable_error_rate)
             print('samples are as below')
             print('\n'.join(['%d' % wss for wss in wss_collected]))
-            exit(1)
+            return False
+    return True
+
+def main():
+    # DAMON doesn't flush TLB.  If the system has large TLB that can cover
+    # whole test working set, DAMON cannot see the access.  Test up to 160 MiB
+    # test working set.
+    sz_region_mb = 10
+    max_sz_region_mb = 160
+    while sz_region_mb <= max_sz_region_mb:
+        test_pass = pass_wss_estimation(sz_region_mb * 1024 * 1024)
+        if test_pass is True:
+            exit(0)
+        sz_region_mb *= 2
+    exit(1)
 
 if __name__ == '__main__':
     main()
-- 
cgit v1.2.3


From 514d1bcb58e0ef93fafa4f9c3035d604a4219867 Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Fri, 16 Jan 2026 18:07:26 -0800
Subject: selftests/damon/access_memory: add repeat mode

'access_memory' is an artificial memory access generator program that is
used for a few DAMON selftests.  It accesses a given number of regions one
by one only once, and exits.  Depending on systems, the test workload may
exit faster than expected, making the tests unreliable.  For reliable
control of the artificial memory access pattern, add a mode to make it
repeat running.

Link: https://lkml.kernel.org/r/20260117020731.226785-4-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Shuah Khan <shuah@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/testing/selftests/damon/access_memory.c | 29 +++++++++++++++++++--------
 1 file changed, 21 insertions(+), 8 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/damon/access_memory.c b/tools/testing/selftests/damon/access_memory.c
index 56b17e8fe1be..567793b11107 100644
--- a/tools/testing/selftests/damon/access_memory.c
+++ b/tools/testing/selftests/damon/access_memory.c
@@ -8,6 +8,11 @@
 #include <string.h>
 #include <time.h>
 
+enum access_mode {
+	ACCESS_MODE_ONCE,
+	ACCESS_MODE_REPEAT,
+};
+
 int main(int argc, char *argv[])
 {
 	char **regions;
@@ -15,10 +20,12 @@ int main(int argc, char *argv[])
 	int nr_regions;
 	int sz_region;
 	int access_time_ms;
+	enum access_mode mode = ACCESS_MODE_ONCE;
+
 	int i;
 
-	if (argc != 4) {
-		printf("Usage: %s <number> <size (bytes)> <time (ms)>\n",
+	if (argc < 4) {
+		printf("Usage: %s <number> <size (bytes)> <time (ms)> [mode]\n",
 				argv[0]);
 		return -1;
 	}
@@ -27,15 +34,21 @@ int main(int argc, char *argv[])
 	sz_region = atoi(argv[2]);
 	access_time_ms = atoi(argv[3]);
 
+	if (argc > 4 && !strcmp(argv[4], "repeat"))
+		mode = ACCESS_MODE_REPEAT;
+
 	regions = malloc(sizeof(*regions) * nr_regions);
 	for (i = 0; i < nr_regions; i++)
 		regions[i] = malloc(sz_region);
 
-	for (i = 0; i < nr_regions; i++) {
-		start_clock = clock();
-		while ((clock() - start_clock) * 1000 / CLOCKS_PER_SEC <
-				access_time_ms)
-			memset(regions[i], i, sz_region);
-	}
+	do {
+		for (i = 0; i < nr_regions; i++) {
+			start_clock = clock();
+			while ((clock() - start_clock) * 1000 / CLOCKS_PER_SEC
+					< access_time_ms)
+				memset(regions[i], i, sz_region);
+		}
+	} while (mode == ACCESS_MODE_REPEAT);
+
 	return 0;
 }
-- 
cgit v1.2.3


From 57525e596bdbf2cb125df8b45902530f219ba444 Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Fri, 16 Jan 2026 18:07:27 -0800
Subject: selftests/damon/wss_estimation: ensure number of collected wss

DAMON selftest for working set size estimation collects DAMON's working
set size measurements of the running artificial memory access generator
program until the program is finished.  Depending on how quickly the
program finishes, and how quickly DAMON starts, the number of collected
working set size measurements may vary, and make the test results
unreliable.  Ensure it collects 40 measurements by using the repeat mode
of the artificial memory access generator program, and finish the
measurements only after the desired number of collections are made.

Link: https://lkml.kernel.org/r/20260117020731.226785-5-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Shuah Khan <shuah@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 .../damon/sysfs_update_schemes_tried_regions_wss_estimation.py      | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/damon/sysfs_update_schemes_tried_regions_wss_estimation.py b/tools/testing/selftests/damon/sysfs_update_schemes_tried_regions_wss_estimation.py
index bf48ef8e5241..cdccb9f0f855 100755
--- a/tools/testing/selftests/damon/sysfs_update_schemes_tried_regions_wss_estimation.py
+++ b/tools/testing/selftests/damon/sysfs_update_schemes_tried_regions_wss_estimation.py
@@ -8,7 +8,8 @@ import _damon_sysfs
 
 def pass_wss_estimation(sz_region):
     # access two regions of given size, 2 seocnds per each region
-    proc = subprocess.Popen(['./access_memory', '2', '%d' % sz_region, '2000'])
+    proc = subprocess.Popen(
+            ['./access_memory', '2', '%d' % sz_region, '2000', 'repeat'])
     kdamonds = _damon_sysfs.Kdamonds([_damon_sysfs.Kdamond(
             contexts=[_damon_sysfs.DamonCtx(
                 ops='vaddr',
@@ -26,7 +27,7 @@ def pass_wss_estimation(sz_region):
         exit(1)
 
     wss_collected = []
-    while proc.poll() == None:
+    while proc.poll() is None and len(wss_collected) < 40:
         time.sleep(0.1)
         err = kdamonds.kdamonds[0].update_schemes_tried_bytes()
         if err != None:
@@ -35,6 +36,7 @@ def pass_wss_estimation(sz_region):
 
         wss_collected.append(
                 kdamonds.kdamonds[0].contexts[0].schemes[0].tried_bytes)
+    proc.terminate()
     err = kdamonds.stop()
     if err is not None:
         print('kdamond stop failed: %s' % err)
-- 
cgit v1.2.3


From 6f06f86a6f219037a7617e3044e1c2120798320e Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Fri, 16 Jan 2026 18:07:28 -0800
Subject: selftests/damon/wss_estimation: deduplicate failed samples output

When the test fails, it shows whole sampled working set size measurements.
The purpose is showing the distribution of the measured values, to let
the tester know if it was just intermittent failure.  Multiple same values
on the output are therefore unnecessary.  It was not a big deal since the
test was failing only once in the past.  But the test can now fail
multiple times with increased working set size, until it passes or the
working set size reaches a limit.  Hence the noisy output can be quite
long and annoying.  Print only the deduplicated distribution information.

Link: https://lkml.kernel.org/r/20260117020731.226785-6-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Shuah Khan <shuah@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 .../damon/sysfs_update_schemes_tried_regions_wss_estimation.py      | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/damon/sysfs_update_schemes_tried_regions_wss_estimation.py b/tools/testing/selftests/damon/sysfs_update_schemes_tried_regions_wss_estimation.py
index cdccb9f0f855..35c724a63f6c 100755
--- a/tools/testing/selftests/damon/sysfs_update_schemes_tried_regions_wss_estimation.py
+++ b/tools/testing/selftests/damon/sysfs_update_schemes_tried_regions_wss_estimation.py
@@ -53,7 +53,11 @@ def pass_wss_estimation(sz_region):
             print('the error rate is not acceptable (> %f)' %
                     acceptable_error_rate)
             print('samples are as below')
-            print('\n'.join(['%d' % wss for wss in wss_collected]))
+            for idx, wss in enumerate(wss_collected):
+                if idx < len(wss_collected) - 1 and \
+                        wss_collected[idx + 1] == wss:
+                    continue
+                print('%d/%d: %d' % (idx, len(wss_collected), wss))
             return False
     return True
 
-- 
cgit v1.2.3


From 6ce964c02f1cb49b4dbb76507948c004d5a0b4fe Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Fri, 23 Jan 2026 22:39:24 +0000
Subject: selftests/mm: have the harness run each test category separately

At present the mm selftests are integrated into the kselftest harness by
having it run run_vmtest.sh and letting it pick it's default set of tests
to invoke, rather than by telling the kselftest framework about each test
program individually as is more standard.  This has some unfortunate
interactions with the kselftest harness:

 - If any of the tests hangs the harness will kill the entire mm
   selftests run rather than just the individual test, meaning no
   further tests get run.
 - The timeout applied by the harness is applied to the whole run rather
   than an individual test which frequently leads to the suite not being
   completed in production testing.

Deploy a crude but effective mitigation for these issues by telling the
kselftest framework to run each of the test categories that run_vmtests.sh
has separately.  Since kselftest really wants to run test programs this is
done by providing a trivial wrapper script for each categorty that invokes
run_vmtest.sh, this is not a thing of great elegence but it is clear and
simple.  Since run_vmtests.sh is doing runtime support detection, scenario
enumeration and setup for many of the tests we can't consistently tell the
framework about the individual test programs.

This has the side effect of reordering the tests, hopefully the testing
is not overly sensitive to this.

Link: https://lkml.kernel.org/r/20260123-selftests-mm-run-suites-separately-v2-1-3e934edacbfa@kernel.org
Signed-off-by: Mark Brown <broonie@kernel.org>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: David Hildenbrand <david@kernel.org>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Leon Romanovsky <leon@kernel.org>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/testing/selftests/mm/Makefile                | 33 +++++++++++++++++++++-
 tools/testing/selftests/mm/ksft_compaction.sh      |  4 +++
 tools/testing/selftests/mm/ksft_cow.sh             |  4 +++
 tools/testing/selftests/mm/ksft_gup_test.sh        |  4 +++
 tools/testing/selftests/mm/ksft_hmm.sh             |  4 +++
 tools/testing/selftests/mm/ksft_hugetlb.sh         |  4 +++
 tools/testing/selftests/mm/ksft_hugevm.sh          |  4 +++
 tools/testing/selftests/mm/ksft_ksm.sh             |  4 +++
 tools/testing/selftests/mm/ksft_ksm_numa.sh        |  4 +++
 tools/testing/selftests/mm/ksft_madv_guard.sh      |  4 +++
 tools/testing/selftests/mm/ksft_madv_populate.sh   |  4 +++
 tools/testing/selftests/mm/ksft_mdwe.sh            |  4 +++
 tools/testing/selftests/mm/ksft_memfd_secret.sh    |  4 +++
 tools/testing/selftests/mm/ksft_migration.sh       |  4 +++
 tools/testing/selftests/mm/ksft_mkdirty.sh         |  4 +++
 tools/testing/selftests/mm/ksft_mlock.sh           |  4 +++
 tools/testing/selftests/mm/ksft_mmap.sh            |  4 +++
 tools/testing/selftests/mm/ksft_mremap.sh          |  4 +++
 tools/testing/selftests/mm/ksft_page_frag.sh       |  4 +++
 tools/testing/selftests/mm/ksft_pagemap.sh         |  4 +++
 tools/testing/selftests/mm/ksft_pfnmap.sh          |  4 +++
 tools/testing/selftests/mm/ksft_pkey.sh            |  4 +++
 tools/testing/selftests/mm/ksft_process_madv.sh    |  4 +++
 .../testing/selftests/mm/ksft_process_mrelease.sh  |  4 +++
 tools/testing/selftests/mm/ksft_rmap.sh            |  4 +++
 tools/testing/selftests/mm/ksft_soft_dirty.sh      |  4 +++
 tools/testing/selftests/mm/ksft_thp.sh             |  4 +++
 tools/testing/selftests/mm/ksft_userfaultfd.sh     |  4 +++
 tools/testing/selftests/mm/ksft_vma_merge.sh       |  4 +++
 tools/testing/selftests/mm/ksft_vmalloc.sh         |  4 +++
 tools/testing/selftests/mm/run_vmtests.sh          |  4 +++
 31 files changed, 152 insertions(+), 1 deletion(-)
 create mode 100755 tools/testing/selftests/mm/ksft_compaction.sh
 create mode 100755 tools/testing/selftests/mm/ksft_cow.sh
 create mode 100755 tools/testing/selftests/mm/ksft_gup_test.sh
 create mode 100755 tools/testing/selftests/mm/ksft_hmm.sh
 create mode 100755 tools/testing/selftests/mm/ksft_hugetlb.sh
 create mode 100755 tools/testing/selftests/mm/ksft_hugevm.sh
 create mode 100755 tools/testing/selftests/mm/ksft_ksm.sh
 create mode 100755 tools/testing/selftests/mm/ksft_ksm_numa.sh
 create mode 100755 tools/testing/selftests/mm/ksft_madv_guard.sh
 create mode 100755 tools/testing/selftests/mm/ksft_madv_populate.sh
 create mode 100755 tools/testing/selftests/mm/ksft_mdwe.sh
 create mode 100755 tools/testing/selftests/mm/ksft_memfd_secret.sh
 create mode 100755 tools/testing/selftests/mm/ksft_migration.sh
 create mode 100755 tools/testing/selftests/mm/ksft_mkdirty.sh
 create mode 100755 tools/testing/selftests/mm/ksft_mlock.sh
 create mode 100755 tools/testing/selftests/mm/ksft_mmap.sh
 create mode 100755 tools/testing/selftests/mm/ksft_mremap.sh
 create mode 100755 tools/testing/selftests/mm/ksft_page_frag.sh
 create mode 100755 tools/testing/selftests/mm/ksft_pagemap.sh
 create mode 100755 tools/testing/selftests/mm/ksft_pfnmap.sh
 create mode 100755 tools/testing/selftests/mm/ksft_pkey.sh
 create mode 100755 tools/testing/selftests/mm/ksft_process_madv.sh
 create mode 100755 tools/testing/selftests/mm/ksft_process_mrelease.sh
 create mode 100755 tools/testing/selftests/mm/ksft_rmap.sh
 create mode 100755 tools/testing/selftests/mm/ksft_soft_dirty.sh
 create mode 100755 tools/testing/selftests/mm/ksft_thp.sh
 create mode 100755 tools/testing/selftests/mm/ksft_userfaultfd.sh
 create mode 100755 tools/testing/selftests/mm/ksft_vma_merge.sh
 create mode 100755 tools/testing/selftests/mm/ksft_vmalloc.sh

(limited to 'tools')

diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile
index 2fdb05e5a56a..905f1e034963 100644
--- a/tools/testing/selftests/mm/Makefile
+++ b/tools/testing/selftests/mm/Makefile
@@ -1,6 +1,10 @@
 # SPDX-License-Identifier: GPL-2.0
 # Makefile for mm selftests
 
+# IMPORTANT: If you add a new test CATEGORY please add a simple wrapper
+# script so kunit knows to run it, and add it to the list below.
+# If you do not YOUR TESTS WILL NOT RUN IN THE CI.
+
 LOCAL_HDRS += $(selfdir)/mm/local_config.h $(top_srcdir)/mm/gup_test.h
 LOCAL_HDRS += $(selfdir)/mm/mseal_helpers.h
 
@@ -139,7 +143,33 @@ TEST_GEN_FILES += va_high_addr_switch
 TEST_GEN_FILES += write_to_hugetlbfs
 endif
 
-TEST_PROGS := run_vmtests.sh
+TEST_PROGS += ksft_compaction.sh
+TEST_PROGS += ksft_cow.sh
+TEST_PROGS += ksft_gup_test.sh
+TEST_PROGS += ksft_hmm.sh
+TEST_PROGS += ksft_hugetlb.sh
+TEST_PROGS += ksft_hugevm.sh
+TEST_PROGS += ksft_ksm.sh
+TEST_PROGS += ksft_ksm_numa.sh
+TEST_PROGS += ksft_madv_guard.sh
+TEST_PROGS += ksft_madv_populate.sh
+TEST_PROGS += ksft_memfd_secret.sh
+TEST_PROGS += ksft_migration.sh
+TEST_PROGS += ksft_mkdirty.sh
+TEST_PROGS += ksft_mlock.sh
+TEST_PROGS += ksft_mmap.sh
+TEST_PROGS += ksft_mremap.sh
+TEST_PROGS += ksft_pagemap.sh
+TEST_PROGS += ksft_pfnmap.sh
+TEST_PROGS += ksft_pkey.sh
+TEST_PROGS += ksft_process_madv.sh
+TEST_PROGS += ksft_process_mrelease.sh
+TEST_PROGS += ksft_rmap.sh
+TEST_PROGS += ksft_soft_dirty.sh
+TEST_PROGS += ksft_thp.sh
+TEST_PROGS += ksft_userfaultfd.sh
+TEST_PROGS += ksft_vma_merge.sh
+TEST_PROGS += ksft_vmalloc.sh
 
 TEST_FILES := test_vmalloc.sh
 TEST_FILES += test_hmm.sh
@@ -147,6 +177,7 @@ TEST_FILES += va_high_addr_switch.sh
 TEST_FILES += charge_reserved_hugetlb.sh
 TEST_FILES += hugetlb_reparenting_test.sh
 TEST_FILES += test_page_frag.sh
+TEST_FILES += run_vmtests.sh
 
 # required by charge_reserved_hugetlb.sh
 TEST_FILES += write_hugetlb_memory.sh
diff --git a/tools/testing/selftests/mm/ksft_compaction.sh b/tools/testing/selftests/mm/ksft_compaction.sh
new file mode 100755
index 000000000000..1f38f4228a34
--- /dev/null
+++ b/tools/testing/selftests/mm/ksft_compaction.sh
@@ -0,0 +1,4 @@
+#!/bin/sh -e
+# SPDX-License-Identifier: GPL-2.0
+
+./run_vmtests.sh -t compaction
diff --git a/tools/testing/selftests/mm/ksft_cow.sh b/tools/testing/selftests/mm/ksft_cow.sh
new file mode 100755
index 000000000000..1e03a95fd5f6
--- /dev/null
+++ b/tools/testing/selftests/mm/ksft_cow.sh
@@ -0,0 +1,4 @@
+#!/bin/sh -e
+# SPDX-License-Identifier: GPL-2.0
+
+./run_vmtests.sh -t cow
diff --git a/tools/testing/selftests/mm/ksft_gup_test.sh b/tools/testing/selftests/mm/ksft_gup_test.sh
new file mode 100755
index 000000000000..09e586d2f446
--- /dev/null
+++ b/tools/testing/selftests/mm/ksft_gup_test.sh
@@ -0,0 +1,4 @@
+#!/bin/sh -e
+# SPDX-License-Identifier: GPL-2.0
+
+./run_vmtests.sh -t gup_test
diff --git a/tools/testing/selftests/mm/ksft_hmm.sh b/tools/testing/selftests/mm/ksft_hmm.sh
new file mode 100755
index 000000000000..0a7b04f454d5
--- /dev/null
+++ b/tools/testing/selftests/mm/ksft_hmm.sh
@@ -0,0 +1,4 @@
+#!/bin/sh -e
+# SPDX-License-Identifier: GPL-2.0
+
+./run_vmtests.sh -t hmm
diff --git a/tools/testing/selftests/mm/ksft_hugetlb.sh b/tools/testing/selftests/mm/ksft_hugetlb.sh
new file mode 100755
index 000000000000..4f92974a4eb5
--- /dev/null
+++ b/tools/testing/selftests/mm/ksft_hugetlb.sh
@@ -0,0 +1,4 @@
+#!/bin/sh -e
+# SPDX-License-Identifier: GPL-2.0
+
+./run_vmtests.sh -t hugetlb
diff --git a/tools/testing/selftests/mm/ksft_hugevm.sh b/tools/testing/selftests/mm/ksft_hugevm.sh
new file mode 100755
index 000000000000..377967fe9c91
--- /dev/null
+++ b/tools/testing/selftests/mm/ksft_hugevm.sh
@@ -0,0 +1,4 @@
+#!/bin/sh -e
+# SPDX-License-Identifier: GPL-2.0
+
+./run_vmtests.sh -t hugevm
diff --git a/tools/testing/selftests/mm/ksft_ksm.sh b/tools/testing/selftests/mm/ksft_ksm.sh
new file mode 100755
index 000000000000..f6a6fe13a3b0
--- /dev/null
+++ b/tools/testing/selftests/mm/ksft_ksm.sh
@@ -0,0 +1,4 @@
+#!/bin/sh -e
+# SPDX-License-Identifier: GPL-2.0
+
+./run_vmtests.sh -t ksm
diff --git a/tools/testing/selftests/mm/ksft_ksm_numa.sh b/tools/testing/selftests/mm/ksft_ksm_numa.sh
new file mode 100755
index 000000000000..144b41a5e3bb
--- /dev/null
+++ b/tools/testing/selftests/mm/ksft_ksm_numa.sh
@@ -0,0 +1,4 @@
+#!/bin/sh -e
+# SPDX-License-Identifier: GPL-2.0
+
+./run_vmtests.sh -t ksm_numa
diff --git a/tools/testing/selftests/mm/ksft_madv_guard.sh b/tools/testing/selftests/mm/ksft_madv_guard.sh
new file mode 100755
index 000000000000..2d810c049182
--- /dev/null
+++ b/tools/testing/selftests/mm/ksft_madv_guard.sh
@@ -0,0 +1,4 @@
+#!/bin/sh -e
+# SPDX-License-Identifier: GPL-2.0
+
+./run_vmtests.sh -t madv_guard
diff --git a/tools/testing/selftests/mm/ksft_madv_populate.sh b/tools/testing/selftests/mm/ksft_madv_populate.sh
new file mode 100755
index 000000000000..127e22ed02c4
--- /dev/null
+++ b/tools/testing/selftests/mm/ksft_madv_populate.sh
@@ -0,0 +1,4 @@
+#!/bin/sh -e
+# SPDX-License-Identifier: GPL-2.0
+
+./run_vmtests.sh -t madv_populate
diff --git a/tools/testing/selftests/mm/ksft_mdwe.sh b/tools/testing/selftests/mm/ksft_mdwe.sh
new file mode 100755
index 000000000000..3dcae95ddabc
--- /dev/null
+++ b/tools/testing/selftests/mm/ksft_mdwe.sh
@@ -0,0 +1,4 @@
+#!/bin/sh -e
+# SPDX-License-Identifier: GPL-2.0
+
+./run_vmtests.sh -t mdwe
diff --git a/tools/testing/selftests/mm/ksft_memfd_secret.sh b/tools/testing/selftests/mm/ksft_memfd_secret.sh
new file mode 100755
index 000000000000..56e82dd648a7
--- /dev/null
+++ b/tools/testing/selftests/mm/ksft_memfd_secret.sh
@@ -0,0 +1,4 @@
+#!/bin/sh -e
+# SPDX-License-Identifier: GPL-2.0
+
+./run_vmtests.sh -t memfd_secret
diff --git a/tools/testing/selftests/mm/ksft_migration.sh b/tools/testing/selftests/mm/ksft_migration.sh
new file mode 100755
index 000000000000..7cf37c72d26e
--- /dev/null
+++ b/tools/testing/selftests/mm/ksft_migration.sh
@@ -0,0 +1,4 @@
+#!/bin/sh -e
+# SPDX-License-Identifier: GPL-2.0
+
+./run_vmtests.sh -t migration
diff --git a/tools/testing/selftests/mm/ksft_mkdirty.sh b/tools/testing/selftests/mm/ksft_mkdirty.sh
new file mode 100755
index 000000000000..dd6332df3204
--- /dev/null
+++ b/tools/testing/selftests/mm/ksft_mkdirty.sh
@@ -0,0 +1,4 @@
+#!/bin/sh -e
+# SPDX-License-Identifier: GPL-2.0
+
+./run_vmtests.sh -t mkdirty
diff --git a/tools/testing/selftests/mm/ksft_mlock.sh b/tools/testing/selftests/mm/ksft_mlock.sh
new file mode 100755
index 000000000000..1e25ab9fdc8b
--- /dev/null
+++ b/tools/testing/selftests/mm/ksft_mlock.sh
@@ -0,0 +1,4 @@
+#!/bin/sh -e
+# SPDX-License-Identifier: GPL-2.0
+
+./run_vmtests.sh -t mlock
diff --git a/tools/testing/selftests/mm/ksft_mmap.sh b/tools/testing/selftests/mm/ksft_mmap.sh
new file mode 100755
index 000000000000..2c3137ae8bc8
--- /dev/null
+++ b/tools/testing/selftests/mm/ksft_mmap.sh
@@ -0,0 +1,4 @@
+#!/bin/sh -e
+# SPDX-License-Identifier: GPL-2.0
+
+./run_vmtests.sh -t mmap
diff --git a/tools/testing/selftests/mm/ksft_mremap.sh b/tools/testing/selftests/mm/ksft_mremap.sh
new file mode 100755
index 000000000000..4101670d0e19
--- /dev/null
+++ b/tools/testing/selftests/mm/ksft_mremap.sh
@@ -0,0 +1,4 @@
+#!/bin/sh -e
+# SPDX-License-Identifier: GPL-2.0
+
+./run_vmtests.sh -t mremap
diff --git a/tools/testing/selftests/mm/ksft_page_frag.sh b/tools/testing/selftests/mm/ksft_page_frag.sh
new file mode 100755
index 000000000000..216e20ffe390
--- /dev/null
+++ b/tools/testing/selftests/mm/ksft_page_frag.sh
@@ -0,0 +1,4 @@
+#!/bin/sh -e
+# SPDX-License-Identifier: GPL-2.0
+
+./run_vmtests.sh -t page_frag
diff --git a/tools/testing/selftests/mm/ksft_pagemap.sh b/tools/testing/selftests/mm/ksft_pagemap.sh
new file mode 100755
index 000000000000..b8d270fdd43e
--- /dev/null
+++ b/tools/testing/selftests/mm/ksft_pagemap.sh
@@ -0,0 +1,4 @@
+#!/bin/sh -e
+# SPDX-License-Identifier: GPL-2.0
+
+./run_vmtests.sh -t pagemap
diff --git a/tools/testing/selftests/mm/ksft_pfnmap.sh b/tools/testing/selftests/mm/ksft_pfnmap.sh
new file mode 100755
index 000000000000..75758de968bb
--- /dev/null
+++ b/tools/testing/selftests/mm/ksft_pfnmap.sh
@@ -0,0 +1,4 @@
+#!/bin/sh -e
+# SPDX-License-Identifier: GPL-2.0
+
+./run_vmtests.sh -t pfnmap
diff --git a/tools/testing/selftests/mm/ksft_pkey.sh b/tools/testing/selftests/mm/ksft_pkey.sh
new file mode 100755
index 000000000000..ac944233b7f7
--- /dev/null
+++ b/tools/testing/selftests/mm/ksft_pkey.sh
@@ -0,0 +1,4 @@
+#!/bin/sh -e
+# SPDX-License-Identifier: GPL-2.0
+
+./run_vmtests.sh -t pkey
diff --git a/tools/testing/selftests/mm/ksft_process_madv.sh b/tools/testing/selftests/mm/ksft_process_madv.sh
new file mode 100755
index 000000000000..2c3137ae8bc8
--- /dev/null
+++ b/tools/testing/selftests/mm/ksft_process_madv.sh
@@ -0,0 +1,4 @@
+#!/bin/sh -e
+# SPDX-License-Identifier: GPL-2.0
+
+./run_vmtests.sh -t mmap
diff --git a/tools/testing/selftests/mm/ksft_process_mrelease.sh b/tools/testing/selftests/mm/ksft_process_mrelease.sh
new file mode 100755
index 000000000000..f560aa5e4218
--- /dev/null
+++ b/tools/testing/selftests/mm/ksft_process_mrelease.sh
@@ -0,0 +1,4 @@
+#!/bin/sh -e
+# SPDX-License-Identifier: GPL-2.0
+
+./run_vmtests.sh -t process_mrelease
diff --git a/tools/testing/selftests/mm/ksft_rmap.sh b/tools/testing/selftests/mm/ksft_rmap.sh
new file mode 100755
index 000000000000..974742b9b02f
--- /dev/null
+++ b/tools/testing/selftests/mm/ksft_rmap.sh
@@ -0,0 +1,4 @@
+#!/bin/sh -e
+# SPDX-License-Identifier: GPL-2.0
+
+./run_vmtests.sh -t rmap
diff --git a/tools/testing/selftests/mm/ksft_soft_dirty.sh b/tools/testing/selftests/mm/ksft_soft_dirty.sh
new file mode 100755
index 000000000000..d160d7fea0a9
--- /dev/null
+++ b/tools/testing/selftests/mm/ksft_soft_dirty.sh
@@ -0,0 +1,4 @@
+#!/bin/sh -e
+# SPDX-License-Identifier: GPL-2.0
+
+./run_vmtests.sh -t soft_dirty
diff --git a/tools/testing/selftests/mm/ksft_thp.sh b/tools/testing/selftests/mm/ksft_thp.sh
new file mode 100755
index 000000000000..95321aecabdb
--- /dev/null
+++ b/tools/testing/selftests/mm/ksft_thp.sh
@@ -0,0 +1,4 @@
+#!/bin/sh -e
+# SPDX-License-Identifier: GPL-2.0
+
+./run_vmtests.sh -t thp
diff --git a/tools/testing/selftests/mm/ksft_userfaultfd.sh b/tools/testing/selftests/mm/ksft_userfaultfd.sh
new file mode 100755
index 000000000000..92667abde6c6
--- /dev/null
+++ b/tools/testing/selftests/mm/ksft_userfaultfd.sh
@@ -0,0 +1,4 @@
+#!/bin/sh -e
+# SPDX-License-Identifier: GPL-2.0
+
+./run_vmtests.sh -t userfaultfd
diff --git a/tools/testing/selftests/mm/ksft_vma_merge.sh b/tools/testing/selftests/mm/ksft_vma_merge.sh
new file mode 100755
index 000000000000..68449d840680
--- /dev/null
+++ b/tools/testing/selftests/mm/ksft_vma_merge.sh
@@ -0,0 +1,4 @@
+#!/bin/sh -e
+# SPDX-License-Identifier: GPL-2.0
+
+./run_vmtests.sh -t vma_merge
diff --git a/tools/testing/selftests/mm/ksft_vmalloc.sh b/tools/testing/selftests/mm/ksft_vmalloc.sh
new file mode 100755
index 000000000000..0b5019a76612
--- /dev/null
+++ b/tools/testing/selftests/mm/ksft_vmalloc.sh
@@ -0,0 +1,4 @@
+#!/bin/sh -e
+# SPDX-License-Identifier: GPL-2.0
+
+./run_vmtests.sh -t vmalloc
diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh
index 452875db532c..29be9038bfb0 100755
--- a/tools/testing/selftests/mm/run_vmtests.sh
+++ b/tools/testing/selftests/mm/run_vmtests.sh
@@ -2,6 +2,10 @@
 # SPDX-License-Identifier: GPL-2.0
 # Please run as root
 
+# IMPORTANT: If you add a new test CATEGORY please add a simple wrapper
+# script so kunit knows to run it, and add it to the list below.
+# If you do not YOUR TESTS WILL NOT RUN IN THE CI.
+
 # Kselftest framework requirement - SKIP code is 4.
 ksft_skip=4
 
-- 
cgit v1.2.3


From 503efe850c7463a1e59df133b84461ef53c0361f Mon Sep 17 00:00:00 2001
From: Wang Yaxin <wang.yaxin@zte.com.cn>
Date: Mon, 19 Jan 2026 10:02:41 +0800
Subject: delayacct: add timestamp of delay max

Problem
=======
Commit 658eb5ab916d ("delayacct: add delay max to record delay peak")
introduced the delay max for getdelays, which records abnormal latency
peaks and helps us understand the magnitude of such delays.  However, the
peak latency value alone is insufficient for effective root cause
analysis.  Without the precise timestamp of when the peak occurred, we
still lack the critical context needed to correlate it with other system
events.

Solution
========
To address this, we need to additionally record a precise timestamp when
the maximum latency occurs.  By correlating this timestamp with system
logs and monitoring metrics, we can identify processes with abnormal
resource usage at the same moment, which can help us to pinpoint root
causes.

Use Case
========
bash-4.4# ./getdelays -d -t 227
print delayacct stats ON
TGID    227
CPU         count     real total  virtual total    delay total  delay average      delay max      delay min      delay max timestamp
               46      188000000      192348334        4098012          0.089ms     0.429260ms     0.051205ms    2026-01-15T15:06:58
IO          count    delay total  delay average      delay max      delay min      delay max timestamp
                0              0          0.000ms     0.000000ms     0.000000ms                    N/A
SWAP        count    delay total  delay average      delay max      delay min      delay max timestamp
                0              0          0.000ms     0.000000ms     0.000000ms                    N/A
RECLAIM     count    delay total  delay average      delay max      delay min      delay max timestamp
                0              0          0.000ms     0.000000ms     0.000000ms                    N/A
THRAS HING   count    delay total  delay average      delay max      delay min      delay max timestamp
                0              0          0.000ms     0.000000ms     0.000000ms                    N/A
COMPACT     count    delay total  delay average      delay max      delay min      delay max timestamp
                0              0          0.000ms     0.000000ms     0.000000ms                    N/A
WPCOPY      count    delay total  delay average      delay max      delay min      delay max timestamp
              182       19413338          0.107ms     0.547353ms     0.022462ms    2026-01-15T15:05:24
IRQ         count    delay total  delay average      delay max      delay min      delay max timestamp
                0              0          0.000ms     0.000000ms     0.000000ms                    N/A

Link: https://lkml.kernel.org/r/20260119100241520gWubW8-5QfhSf9gjqcc_E@zte.com.cn
Signed-off-by: Wang Yaxin <wang.yaxin@zte.com.cn>
Cc: Fan Yu <fan.yu9@zte.com.cn>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: xu xin <xu.xin16@zte.com.cn>
Cc: Yang Yang <yang.yang29@zte.com.cn>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/accounting/delay-accounting.rst |  32 ++---
 include/linux/delayacct.h                     |   8 ++
 include/linux/sched.h                         |   5 +
 include/uapi/linux/taskstats.h                |  22 +++-
 kernel/delayacct.c                            |  31 +++--
 kernel/sched/stats.h                          |   8 +-
 tools/accounting/getdelays.c                  | 172 ++++++++++++++++++++++----
 7 files changed, 223 insertions(+), 55 deletions(-)

(limited to 'tools')

diff --git a/Documentation/accounting/delay-accounting.rst b/Documentation/accounting/delay-accounting.rst
index 86d7902a657f..e209c46241b0 100644
--- a/Documentation/accounting/delay-accounting.rst
+++ b/Documentation/accounting/delay-accounting.rst
@@ -107,22 +107,22 @@ Get sum and peak of delays, since system boot, for all pids with tgid 242::
 	TGID    242
 
 
-	CPU         count     real total  virtual total    delay total  delay average      delay max      delay min
-	               39      156000000      156576579        2111069          0.054ms     0.212296ms     0.031307ms
-	IO          count    delay total  delay average      delay max      delay min
-	                0              0          0.000ms     0.000000ms     0.000000ms
-	SWAP        count    delay total  delay average      delay max      delay min
-	                0              0          0.000ms     0.000000ms     0.000000ms
-	RECLAIM     count    delay total  delay average      delay max      delay min
-	                0              0          0.000ms     0.000000ms     0.000000ms
-	THRASHING   count    delay total  delay average      delay max      delay min
-	                0              0          0.000ms     0.000000ms     0.000000ms
-	COMPACT     count    delay total  delay average      delay max      delay min
-	                0              0          0.000ms     0.000000ms     0.000000ms
-	WPCOPY      count    delay total  delay average      delay max      delay min
-	              156       11215873          0.072ms     0.207403ms     0.033913ms
-	IRQ         count    delay total  delay average      delay max      delay min
-	                0              0          0.000ms     0.000000ms     0.000000ms
+	CPU         count     real total  virtual total    delay total  delay average      delay max      delay min      delay max timestamp
+	               46      188000000      192348334        4098012          0.089ms     0.429260ms     0.051205ms    2026-01-15T15:06:58
+	IO          count    delay total  delay average      delay max      delay min      delay max timestamp
+	                0              0          0.000ms     0.000000ms     0.000000ms                    N/A
+	SWAP        count    delay total  delay average      delay max      delay min      delay max timestamp
+	                0              0          0.000ms     0.000000ms     0.000000ms                    N/A
+	RECLAIM     count    delay total  delay average      delay max      delay min      delay max timestamp
+	                0              0          0.000ms     0.000000ms     0.000000ms                    N/A
+	THRASHING   count    delay total  delay average      delay max      delay min      delay max timestamp
+	                0              0          0.000ms     0.000000ms     0.000000ms                    N/A
+	COMPACT     count    delay total  delay average      delay max      delay min      delay max timestamp
+	                0              0          0.000ms     0.000000ms     0.000000ms                    N/A
+	WPCOPY      count    delay total  delay average      delay max      delay min      delay max timestamp
+	              182       19413338          0.107ms     0.547353ms     0.022462ms    2026-01-15T15:05:24
+	IRQ         count    delay total  delay average      delay max      delay min      delay max timestamp
+	                0              0          0.000ms     0.000000ms     0.000000ms                    N/A
 
 Get IO accounting for pid 1, it works only with -p::
 
diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h
index 800dcc360db2..ecb06f16d22c 100644
--- a/include/linux/delayacct.h
+++ b/include/linux/delayacct.h
@@ -69,6 +69,14 @@ struct task_delay_info {
 	u32 compact_count;	/* total count of memory compact */
 	u32 wpcopy_count;	/* total count of write-protect copy */
 	u32 irq_count;	/* total count of IRQ/SOFTIRQ */
+
+	struct timespec64 blkio_delay_max_ts;
+	struct timespec64 swapin_delay_max_ts;
+	struct timespec64 freepages_delay_max_ts;
+	struct timespec64 thrashing_delay_max_ts;
+	struct timespec64 compact_delay_max_ts;
+	struct timespec64 wpcopy_delay_max_ts;
+	struct timespec64 irq_delay_max_ts;
 };
 #endif
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index da0133524d08..1d22b6229b95 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -49,6 +49,7 @@
 #include <linux/tracepoint-defs.h>
 #include <linux/unwind_deferred_types.h>
 #include <asm/kmap_size.h>
+#include <linux/time64.h>
 #ifndef COMPILE_OFFSETS
 #include <generated/rq-offsets.h>
 #endif
@@ -86,6 +87,7 @@ struct signal_struct;
 struct task_delay_info;
 struct task_group;
 struct task_struct;
+struct timespec64;
 struct user_event_mm;
 
 #include <linux/sched/ext.h>
@@ -435,6 +437,9 @@ struct sched_info {
 	/* When were we last queued to run? */
 	unsigned long long		last_queued;
 
+	/* Timestamp of max time spent waiting on a runqueue: */
+	struct timespec64		max_run_delay_ts;
+
 #endif /* CONFIG_SCHED_INFO */
 };
 
diff --git a/include/uapi/linux/taskstats.h b/include/uapi/linux/taskstats.h
index 5929030d4e8b..1b31e8e14d2f 100644
--- a/include/uapi/linux/taskstats.h
+++ b/include/uapi/linux/taskstats.h
@@ -18,6 +18,16 @@
 #define _LINUX_TASKSTATS_H
 
 #include <linux/types.h>
+#ifdef __KERNEL__
+#include <linux/time64.h>
+#else
+#ifndef _LINUX_TIME64_H
+struct timespec64 {
+	__s64   tv_sec;         /* seconds */
+	long    tv_nsec;        /* nanoseconds */
+};
+#endif
+#endif
 
 /* Format for per-task data returned to userland when
  *	- a task exits
@@ -34,7 +44,7 @@
  */
 
 
-#define TASKSTATS_VERSION	16
+#define TASKSTATS_VERSION	17
 #define TS_COMM_LEN		32	/* should be >= TASK_COMM_LEN
 					 * in linux/sched.h */
 
@@ -230,6 +240,16 @@ struct taskstats {
 
 	__u64	irq_delay_max;
 	__u64	irq_delay_min;
+
+	/*v17: delay max timestamp record*/
+	struct timespec64 cpu_delay_max_ts;
+	struct timespec64 blkio_delay_max_ts;
+	struct timespec64 swapin_delay_max_ts;
+	struct timespec64 freepages_delay_max_ts;
+	struct timespec64 thrashing_delay_max_ts;
+	struct timespec64 compact_delay_max_ts;
+	struct timespec64 wpcopy_delay_max_ts;
+	struct timespec64 irq_delay_max_ts;
 };
 
 
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 30e7912ebb0d..d58ffc63bcba 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -18,6 +18,7 @@
 do { \
 	d->type##_delay_max = tsk->delays->type##_delay_max; \
 	d->type##_delay_min = tsk->delays->type##_delay_min; \
+	d->type##_delay_max_ts = tsk->delays->type##_delay_max_ts; \
 	tmp = d->type##_delay_total + tsk->delays->type##_delay; \
 	d->type##_delay_total = (tmp < d->type##_delay_total) ? 0 : tmp; \
 	d->type##_count += tsk->delays->type##_count; \
@@ -104,7 +105,8 @@ void __delayacct_tsk_init(struct task_struct *tsk)
  * Finish delay accounting for a statistic using its timestamps (@start),
  * accumulator (@total) and @count
  */
-static void delayacct_end(raw_spinlock_t *lock, u64 *start, u64 *total, u32 *count, u64 *max, u64 *min)
+static void delayacct_end(raw_spinlock_t *lock, u64 *start, u64 *total, u32 *count,
+							 u64 *max, u64 *min, struct timespec64 *ts)
 {
 	s64 ns = local_clock() - *start;
 	unsigned long flags;
@@ -113,8 +115,10 @@ static void delayacct_end(raw_spinlock_t *lock, u64 *start, u64 *total, u32 *cou
 		raw_spin_lock_irqsave(lock, flags);
 		*total += ns;
 		(*count)++;
-		if (ns > *max)
+		if (ns > *max) {
 			*max = ns;
+			ktime_get_real_ts64(ts);
+		}
 		if (*min == 0 || ns < *min)
 			*min = ns;
 		raw_spin_unlock_irqrestore(lock, flags);
@@ -137,7 +141,8 @@ void __delayacct_blkio_end(struct task_struct *p)
 		      &p->delays->blkio_delay,
 		      &p->delays->blkio_count,
 		      &p->delays->blkio_delay_max,
-		      &p->delays->blkio_delay_min);
+		      &p->delays->blkio_delay_min,
+		      &p->delays->blkio_delay_max_ts);
 }
 
 int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
@@ -170,6 +175,7 @@ int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
 
 	d->cpu_delay_max = tsk->sched_info.max_run_delay;
 	d->cpu_delay_min = tsk->sched_info.min_run_delay;
+	d->cpu_delay_max_ts = tsk->sched_info.max_run_delay_ts;
 	tmp = (s64)d->cpu_delay_total + t2;
 	d->cpu_delay_total = (tmp < (s64)d->cpu_delay_total) ? 0 : tmp;
 	tmp = (s64)d->cpu_run_virtual_total + t3;
@@ -217,7 +223,8 @@ void __delayacct_freepages_end(void)
 		      &current->delays->freepages_delay,
 		      &current->delays->freepages_count,
 		      &current->delays->freepages_delay_max,
-		      &current->delays->freepages_delay_min);
+		      &current->delays->freepages_delay_min,
+		      &current->delays->freepages_delay_max_ts);
 }
 
 void __delayacct_thrashing_start(bool *in_thrashing)
@@ -241,7 +248,8 @@ void __delayacct_thrashing_end(bool *in_thrashing)
 		      &current->delays->thrashing_delay,
 		      &current->delays->thrashing_count,
 		      &current->delays->thrashing_delay_max,
-		      &current->delays->thrashing_delay_min);
+		      &current->delays->thrashing_delay_min,
+		      &current->delays->thrashing_delay_max_ts);
 }
 
 void __delayacct_swapin_start(void)
@@ -256,7 +264,8 @@ void __delayacct_swapin_end(void)
 		      &current->delays->swapin_delay,
 		      &current->delays->swapin_count,
 		      &current->delays->swapin_delay_max,
-		      &current->delays->swapin_delay_min);
+		      &current->delays->swapin_delay_min,
+		      &current->delays->swapin_delay_max_ts);
 }
 
 void __delayacct_compact_start(void)
@@ -271,7 +280,8 @@ void __delayacct_compact_end(void)
 		      &current->delays->compact_delay,
 		      &current->delays->compact_count,
 		      &current->delays->compact_delay_max,
-		      &current->delays->compact_delay_min);
+		      &current->delays->compact_delay_min,
+		      &current->delays->compact_delay_max_ts);
 }
 
 void __delayacct_wpcopy_start(void)
@@ -286,7 +296,8 @@ void __delayacct_wpcopy_end(void)
 		      &current->delays->wpcopy_delay,
 		      &current->delays->wpcopy_count,
 		      &current->delays->wpcopy_delay_max,
-		      &current->delays->wpcopy_delay_min);
+		      &current->delays->wpcopy_delay_min,
+		      &current->delays->wpcopy_delay_max_ts);
 }
 
 void __delayacct_irq(struct task_struct *task, u32 delta)
@@ -296,8 +307,10 @@ void __delayacct_irq(struct task_struct *task, u32 delta)
 	raw_spin_lock_irqsave(&task->delays->lock, flags);
 	task->delays->irq_delay += delta;
 	task->delays->irq_count++;
-	if (delta > task->delays->irq_delay_max)
+	if (delta > task->delays->irq_delay_max) {
 		task->delays->irq_delay_max = delta;
+		ktime_get_real_ts64(&task->delays->irq_delay_max_ts);
+	}
 	if (delta && (!task->delays->irq_delay_min || delta < task->delays->irq_delay_min))
 		task->delays->irq_delay_min = delta;
 	raw_spin_unlock_irqrestore(&task->delays->lock, flags);
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index c903f1a42891..a612cf253c87 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -253,8 +253,10 @@ static inline void sched_info_dequeue(struct rq *rq, struct task_struct *t)
 	delta = rq_clock(rq) - t->sched_info.last_queued;
 	t->sched_info.last_queued = 0;
 	t->sched_info.run_delay += delta;
-	if (delta > t->sched_info.max_run_delay)
+	if (delta > t->sched_info.max_run_delay) {
 		t->sched_info.max_run_delay = delta;
+		ktime_get_real_ts64(&t->sched_info.max_run_delay_ts);
+	}
 	if (delta && (!t->sched_info.min_run_delay || delta < t->sched_info.min_run_delay))
 		t->sched_info.min_run_delay = delta;
 	rq_sched_info_dequeue(rq, delta);
@@ -278,8 +280,10 @@ static void sched_info_arrive(struct rq *rq, struct task_struct *t)
 	t->sched_info.run_delay += delta;
 	t->sched_info.last_arrival = now;
 	t->sched_info.pcount++;
-	if (delta > t->sched_info.max_run_delay)
+	if (delta > t->sched_info.max_run_delay) {
 		t->sched_info.max_run_delay = delta;
+		ktime_get_real_ts64(&t->sched_info.max_run_delay_ts);
+	}
 	if (delta && (!t->sched_info.min_run_delay || delta < t->sched_info.min_run_delay))
 		t->sched_info.min_run_delay = delta;
 
diff --git a/tools/accounting/getdelays.c b/tools/accounting/getdelays.c
index 21cb3c3d1331..64796c0223be 100644
--- a/tools/accounting/getdelays.c
+++ b/tools/accounting/getdelays.c
@@ -24,6 +24,7 @@
 #include <sys/socket.h>
 #include <sys/wait.h>
 #include <signal.h>
+#include <time.h>
 
 #include <linux/genetlink.h>
 #include <linux/taskstats.h>
@@ -194,6 +195,37 @@ static int get_family_id(int sd)
 #define average_ms(t, c) (t / 1000000ULL / (c ? c : 1))
 #define delay_ms(t) (t / 1000000ULL)
 
+/*
+ * Format timespec64 to human readable string (YYYY-MM-DD HH:MM:SS)
+ * Returns formatted string or "N/A" if timestamp is zero
+ */
+static const char *format_timespec64(struct timespec64 *ts)
+{
+	static char buffer[32];
+	struct tm tm_info;
+	time_t time_sec;
+
+	/* Check if timestamp is zero (not set) */
+	if (ts->tv_sec == 0 && ts->tv_nsec == 0)
+		return "N/A";
+
+	time_sec = (time_t)ts->tv_sec;
+
+	/* Use thread-safe localtime_r */
+	if (localtime_r(&time_sec, &tm_info) == NULL)
+		return "N/A";
+
+	snprintf(buffer, sizeof(buffer), "%04d-%02d-%02dT%02d:%02d:%02d",
+		tm_info.tm_year + 1900,
+		tm_info.tm_mon + 1,
+		tm_info.tm_mday,
+		tm_info.tm_hour,
+		tm_info.tm_min,
+		tm_info.tm_sec);
+
+	return buffer;
+}
+
 /*
  * Version compatibility note:
  * Field availability depends on taskstats version (t->version),
@@ -205,13 +237,28 @@ static int get_family_id(int sd)
  * version >= 13  - supports WPCOPY statistics
  * version >= 14  - supports IRQ statistics
  * version >= 16  - supports *_max and *_min delay statistics
+ * version >= 17  - supports delay max timestamp statistics
  *
  * Always verify version before accessing version-dependent fields
  * to maintain backward compatibility.
  */
 #define PRINT_CPU_DELAY(version, t) \
 	do { \
-		if (version >= 16) { \
+		if (version >= 17) { \
+			printf("%-10s%15s%15s%15s%15s%15s%15s%15s%25s\n", \
+				"CPU", "count", "real total", "virtual total", \
+				"delay total", "delay average", "delay max", \
+				"delay min", "delay max timestamp"); \
+			printf("          %15llu%15llu%15llu%15llu%15.3fms%13.6fms%13.6fms%23s\n", \
+				(unsigned long long)(t)->cpu_count, \
+				(unsigned long long)(t)->cpu_run_real_total, \
+				(unsigned long long)(t)->cpu_run_virtual_total, \
+				(unsigned long long)(t)->cpu_delay_total, \
+				average_ms((double)(t)->cpu_delay_total, (t)->cpu_count), \
+				delay_ms((double)(t)->cpu_delay_max), \
+				delay_ms((double)(t)->cpu_delay_min), \
+				format_timespec64(&(t)->cpu_delay_max_ts)); \
+		} else if (version >= 16) { \
 			printf("%-10s%15s%15s%15s%15s%15s%15s%15s\n", \
 				"CPU", "count", "real total", "virtual total", \
 				"delay total", "delay average", "delay max", "delay min"); \
@@ -257,44 +304,115 @@ static int get_family_id(int sd)
 		} \
 	} while (0)
 
+#define PRINT_FILED_DELAY_WITH_TS(name, version, t, count, total, max, min, max_ts) \
+	do { \
+		if (version >= 17) { \
+			printf("%-10s%15s%15s%15s%15s%15s%25s\n", \
+				name, "count", "delay total", "delay average", \
+				"delay max", "delay min", "delay max timestamp"); \
+			printf("          %15llu%15llu%15.3fms%13.6fms%13.6fms%23s\n", \
+				(unsigned long long)(t)->count, \
+				(unsigned long long)(t)->total, \
+				average_ms((double)(t)->total, (t)->count), \
+				delay_ms((double)(t)->max), \
+				delay_ms((double)(t)->min), \
+				format_timespec64(&(t)->max_ts)); \
+		} else if (version >= 16) { \
+			printf("%-10s%15s%15s%15s%15s%15s\n", \
+				name, "count", "delay total", "delay average", \
+				"delay max", "delay min"); \
+			printf("          %15llu%15llu%15.3fms%13.6fms%13.6fms\n", \
+				(unsigned long long)(t)->count, \
+				(unsigned long long)(t)->total, \
+				average_ms((double)(t)->total, (t)->count), \
+				delay_ms((double)(t)->max), \
+				delay_ms((double)(t)->min)); \
+		} else { \
+			printf("%-10s%15s%15s%15s\n", \
+				name, "count", "delay total", "delay average"); \
+			printf("          %15llu%15llu%15.3fms\n", \
+				(unsigned long long)(t)->count, \
+				(unsigned long long)(t)->total, \
+				average_ms((double)(t)->total, (t)->count)); \
+		} \
+	} while (0)
+
 static void print_delayacct(struct taskstats *t)
 {
 	printf("\n\n");
 
 	PRINT_CPU_DELAY(t->version, t);
 
-	PRINT_FILED_DELAY("IO", t->version, t,
-		blkio_count, blkio_delay_total,
-		blkio_delay_max, blkio_delay_min);
+	/* Use new macro with timestamp support for version >= 17 */
+	if (t->version >= 17) {
+		PRINT_FILED_DELAY_WITH_TS("IO", t->version, t,
+			blkio_count, blkio_delay_total,
+			blkio_delay_max, blkio_delay_min, blkio_delay_max_ts);
 
-	PRINT_FILED_DELAY("SWAP", t->version, t,
-		swapin_count, swapin_delay_total,
-		swapin_delay_max, swapin_delay_min);
+		PRINT_FILED_DELAY_WITH_TS("SWAP", t->version, t,
+			swapin_count, swapin_delay_total,
+			swapin_delay_max, swapin_delay_min, swapin_delay_max_ts);
 
-	PRINT_FILED_DELAY("RECLAIM", t->version, t,
-		freepages_count, freepages_delay_total,
-		freepages_delay_max, freepages_delay_min);
+		PRINT_FILED_DELAY_WITH_TS("RECLAIM", t->version, t,
+			freepages_count, freepages_delay_total,
+			freepages_delay_max, freepages_delay_min, freepages_delay_max_ts);
 
-	PRINT_FILED_DELAY("THRASHING", t->version, t,
-		thrashing_count, thrashing_delay_total,
-		thrashing_delay_max, thrashing_delay_min);
+		PRINT_FILED_DELAY_WITH_TS("THRASHING", t->version, t,
+			thrashing_count, thrashing_delay_total,
+			thrashing_delay_max, thrashing_delay_min, thrashing_delay_max_ts);
 
-	if (t->version >= 11) {
-		PRINT_FILED_DELAY("COMPACT", t->version, t,
-			compact_count, compact_delay_total,
-			compact_delay_max, compact_delay_min);
-	}
+		if (t->version >= 11) {
+			PRINT_FILED_DELAY_WITH_TS("COMPACT", t->version, t,
+				compact_count, compact_delay_total,
+				compact_delay_max, compact_delay_min, compact_delay_max_ts);
+		}
 
-	if (t->version >= 13) {
-		PRINT_FILED_DELAY("WPCOPY", t->version, t,
-			wpcopy_count, wpcopy_delay_total,
-			wpcopy_delay_max, wpcopy_delay_min);
-	}
+		if (t->version >= 13) {
+			PRINT_FILED_DELAY_WITH_TS("WPCOPY", t->version, t,
+				wpcopy_count, wpcopy_delay_total,
+				wpcopy_delay_max, wpcopy_delay_min, wpcopy_delay_max_ts);
+		}
 
-	if (t->version >= 14) {
-		PRINT_FILED_DELAY("IRQ", t->version, t,
-			irq_count, irq_delay_total,
-			irq_delay_max, irq_delay_min);
+		if (t->version >= 14) {
+			PRINT_FILED_DELAY_WITH_TS("IRQ", t->version, t,
+				irq_count, irq_delay_total,
+				irq_delay_max, irq_delay_min, irq_delay_max_ts);
+		}
+	} else {
+		/* Use original macro for older versions */
+		PRINT_FILED_DELAY("IO", t->version, t,
+			blkio_count, blkio_delay_total,
+			blkio_delay_max, blkio_delay_min);
+
+		PRINT_FILED_DELAY("SWAP", t->version, t,
+			swapin_count, swapin_delay_total,
+			swapin_delay_max, swapin_delay_min);
+
+		PRINT_FILED_DELAY("RECLAIM", t->version, t,
+			freepages_count, freepages_delay_total,
+			freepages_delay_max, freepages_delay_min);
+
+		PRINT_FILED_DELAY("THRASHING", t->version, t,
+			thrashing_count, thrashing_delay_total,
+			thrashing_delay_max, thrashing_delay_min);
+
+		if (t->version >= 11) {
+			PRINT_FILED_DELAY("COMPACT", t->version, t,
+				compact_count, compact_delay_total,
+				compact_delay_max, compact_delay_min);
+		}
+
+		if (t->version >= 13) {
+			PRINT_FILED_DELAY("WPCOPY", t->version, t,
+				wpcopy_count, wpcopy_delay_total,
+				wpcopy_delay_max, wpcopy_delay_min);
+		}
+
+		if (t->version >= 14) {
+			PRINT_FILED_DELAY("IRQ", t->version, t,
+				irq_count, irq_delay_total,
+				irq_delay_max, irq_delay_min);
+		}
 	}
 }
 
-- 
cgit v1.2.3


From bd4f0822f4ecd98285208e3322ad23c2ead0c878 Mon Sep 17 00:00:00 2001
From: zhidao su <soolaugust@gmail.com>
Date: Fri, 30 Jan 2026 18:55:11 +0800
Subject: tools/sched_ext: Add error logging for dsq creation failures in
 remaining schedulers

Add scx_bpf_error() calls when scx_bpf_create_dsq() fails in the remaining
schedulers to improve debuggability:

- scx_simple.bpf.c: simple_init()
- scx_sdt.bpf.c: sdt_init()
- scx_cpu0.bpf.c: cpu0_init()
- scx_flatcg.bpf.c: fcg_init()

This follows the same pattern established in commit 2f8d489897ae
("sched_ext: Add error logging for dsq creation failures") for other
schedulers and ensures consistent error reporting across all schedulers.

Signed-off-by: zhidao su <suzhidao@xiaomi.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 tools/sched_ext/scx_cpu0.bpf.c   | 10 +++++++++-
 tools/sched_ext/scx_flatcg.bpf.c | 10 +++++++++-
 tools/sched_ext/scx_sdt.bpf.c    |  8 +++++++-
 tools/sched_ext/scx_simple.bpf.c | 10 +++++++++-
 4 files changed, 34 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/sched_ext/scx_cpu0.bpf.c b/tools/sched_ext/scx_cpu0.bpf.c
index 6326ce598c8e..9b67ab11b04c 100644
--- a/tools/sched_ext/scx_cpu0.bpf.c
+++ b/tools/sched_ext/scx_cpu0.bpf.c
@@ -71,7 +71,15 @@ void BPF_STRUCT_OPS(cpu0_dispatch, s32 cpu, struct task_struct *prev)
 
 s32 BPF_STRUCT_OPS_SLEEPABLE(cpu0_init)
 {
-	return scx_bpf_create_dsq(DSQ_CPU0, -1);
+	int ret;
+
+	ret = scx_bpf_create_dsq(DSQ_CPU0, -1);
+	if (ret) {
+		scx_bpf_error("failed to create DSQ %d (%d)", DSQ_CPU0, ret);
+		return ret;
+	}
+
+	return 0;
 }
 
 void BPF_STRUCT_OPS(cpu0_exit, struct scx_exit_info *ei)
diff --git a/tools/sched_ext/scx_flatcg.bpf.c b/tools/sched_ext/scx_flatcg.bpf.c
index c216480c3ee0..0e785cff0f24 100644
--- a/tools/sched_ext/scx_flatcg.bpf.c
+++ b/tools/sched_ext/scx_flatcg.bpf.c
@@ -929,7 +929,15 @@ void BPF_STRUCT_OPS(fcg_cgroup_move, struct task_struct *p,
 
 s32 BPF_STRUCT_OPS_SLEEPABLE(fcg_init)
 {
-	return scx_bpf_create_dsq(FALLBACK_DSQ, -1);
+	int ret;
+
+	ret = scx_bpf_create_dsq(FALLBACK_DSQ, -1);
+	if (ret) {
+		scx_bpf_error("failed to create DSQ %d (%d)", FALLBACK_DSQ, ret);
+		return ret;
+	}
+
+	return 0;
 }
 
 void BPF_STRUCT_OPS(fcg_exit, struct scx_exit_info *ei)
diff --git a/tools/sched_ext/scx_sdt.bpf.c b/tools/sched_ext/scx_sdt.bpf.c
index 48ea18614e28..d965f7d209de 100644
--- a/tools/sched_ext/scx_sdt.bpf.c
+++ b/tools/sched_ext/scx_sdt.bpf.c
@@ -691,7 +691,13 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(sdt_init)
 		return ret;
 	}
 
-	return scx_bpf_create_dsq(SHARED_DSQ, -1);
+	ret = scx_bpf_create_dsq(SHARED_DSQ, -1);
+	if (ret) {
+		scx_bpf_error("failed to create DSQ %d (%d)", SHARED_DSQ, ret);
+		return ret;
+	}
+
+	return 0;
 }
 
 void BPF_STRUCT_OPS(sdt_exit, struct scx_exit_info *ei)
diff --git a/tools/sched_ext/scx_simple.bpf.c b/tools/sched_ext/scx_simple.bpf.c
index e6de99dba7db..b456bd7cae77 100644
--- a/tools/sched_ext/scx_simple.bpf.c
+++ b/tools/sched_ext/scx_simple.bpf.c
@@ -131,7 +131,15 @@ void BPF_STRUCT_OPS(simple_enable, struct task_struct *p)
 
 s32 BPF_STRUCT_OPS_SLEEPABLE(simple_init)
 {
-	return scx_bpf_create_dsq(SHARED_DSQ, -1);
+	int ret;
+
+	ret = scx_bpf_create_dsq(SHARED_DSQ, -1);
+	if (ret) {
+		scx_bpf_error("failed to create DSQ %d (%d)", SHARED_DSQ, ret);
+		return ret;
+	}
+
+	return 0;
 }
 
 void BPF_STRUCT_OPS(simple_exit, struct scx_exit_info *ei)
-- 
cgit v1.2.3


From 3864cb60dad5a6c1bd9f444740cf541a1d8cda99 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Fri, 30 Jan 2026 16:03:59 -0800
Subject: cxl/port: Move dport probe operations to a driver event

In preparation for adding more register setup to the cxl_port_add_dport()
path (for RAS register mapping), move the dport creation event to a driver
callback. This achieves two goals, it puts driver operations logically
where they belong, in a driver, and it obviates the gymnastics of
DECLARE_TESTABLE() which just makes a mess of grepping for CXL symbols.

In other words, a driver callback is less of an ongoing maintenance burden
than this DECLARE_TESTABLE arrangement that does not scale and diminishes
the grep-ability of the codebase.

cxl_port_add_dport() moves mostly unmodified from drivers/cxl/core/port.c.
The only deliberate change is that it now assumes that the device_lock is
held on entry and the driver is attached (just like cxl_port_probe()).

Reviewed-by: Terry Bowman <terry.bowman@amd.com>
Tested-by: Terry Bowman <terry.bowman@amd.com>
Reviewed-by: Dave Jiang <dave.jiang@intel.com>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Link: https://patch.msgid.link/20260131000403.2135324-6-dan.j.williams@intel.com
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
---
 drivers/cxl/core/hdm.c               |  6 ++--
 drivers/cxl/core/pci.c               |  8 ++---
 drivers/cxl/core/port.c              | 68 +++++++++++-------------------------
 drivers/cxl/cxl.h                    | 31 +++++++---------
 drivers/cxl/port.c                   | 50 ++++++++++++++++++++++++++
 tools/testing/cxl/Kbuild             |  2 ++
 tools/testing/cxl/cxl_core_exports.c | 22 ------------
 tools/testing/cxl/exports.h          | 13 -------
 tools/testing/cxl/test/mock.c        | 24 ++++---------
 9 files changed, 98 insertions(+), 126 deletions(-)
 delete mode 100644 tools/testing/cxl/exports.h

(limited to 'tools')

diff --git a/drivers/cxl/core/hdm.c b/drivers/cxl/core/hdm.c
index 1c5d2022c87a..365b02b7a241 100644
--- a/drivers/cxl/core/hdm.c
+++ b/drivers/cxl/core/hdm.c
@@ -1219,12 +1219,12 @@ static int devm_cxl_enumerate_decoders(struct cxl_hdm *cxlhdm,
 }
 
 /**
- * __devm_cxl_switch_port_decoders_setup - allocate and setup switch decoders
+ * devm_cxl_switch_port_decoders_setup - allocate and setup switch decoders
  * @port: CXL port context
  *
  * Return 0 or -errno on error
  */
-int __devm_cxl_switch_port_decoders_setup(struct cxl_port *port)
+int devm_cxl_switch_port_decoders_setup(struct cxl_port *port)
 {
 	struct cxl_hdm *cxlhdm;
 
@@ -1248,7 +1248,7 @@ int __devm_cxl_switch_port_decoders_setup(struct cxl_port *port)
 	dev_err(&port->dev, "HDM decoder capability not found\n");
 	return -ENXIO;
 }
-EXPORT_SYMBOL_NS_GPL(__devm_cxl_switch_port_decoders_setup, "CXL");
+EXPORT_SYMBOL_NS_GPL(devm_cxl_switch_port_decoders_setup, "CXL");
 
 /**
  * devm_cxl_endpoint_decoders_setup - allocate and setup endpoint decoders
diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
index b838c59d7a3c..f96ce884a213 100644
--- a/drivers/cxl/core/pci.c
+++ b/drivers/cxl/core/pci.c
@@ -41,14 +41,14 @@ static int pci_get_port_num(struct pci_dev *pdev)
 }
 
 /**
- * __devm_cxl_add_dport_by_dev - allocate a dport by dport device
+ * devm_cxl_add_dport_by_dev - allocate a dport by dport device
  * @port: cxl_port that hosts the dport
  * @dport_dev: 'struct device' of the dport
  *
  * Returns the allocated dport on success or ERR_PTR() of -errno on error
  */
-struct cxl_dport *__devm_cxl_add_dport_by_dev(struct cxl_port *port,
-					      struct device *dport_dev)
+struct cxl_dport *devm_cxl_add_dport_by_dev(struct cxl_port *port,
+					    struct device *dport_dev)
 {
 	struct cxl_register_map map;
 	struct pci_dev *pdev;
@@ -69,7 +69,7 @@ struct cxl_dport *__devm_cxl_add_dport_by_dev(struct cxl_port *port,
 	device_lock_assert(&port->dev);
 	return devm_cxl_add_dport(port, dport_dev, port_num, map.resource);
 }
-EXPORT_SYMBOL_NS_GPL(__devm_cxl_add_dport_by_dev, "CXL");
+EXPORT_SYMBOL_NS_GPL(devm_cxl_add_dport_by_dev, "CXL");
 
 static int cxl_dvsec_mem_range_valid(struct cxl_dev_state *cxlds, int id)
 {
diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c
index 6a554d0466a1..7356e1725db8 100644
--- a/drivers/cxl/core/port.c
+++ b/drivers/cxl/core/port.c
@@ -778,7 +778,7 @@ static int cxl_setup_comp_regs(struct device *host, struct cxl_register_map *map
 	return cxl_setup_regs(map);
 }
 
-static int cxl_port_setup_regs(struct cxl_port *port,
+int cxl_port_setup_regs(struct cxl_port *port,
 			resource_size_t component_reg_phys)
 {
 	if (dev_is_platform(port->uport_dev))
@@ -786,6 +786,7 @@ static int cxl_port_setup_regs(struct cxl_port *port,
 	return cxl_setup_comp_regs(&port->dev, &port->reg_map,
 				   component_reg_phys);
 }
+EXPORT_SYMBOL_NS_GPL(cxl_port_setup_regs, "CXL");
 
 static int cxl_dport_setup_regs(struct device *host, struct cxl_dport *dport,
 				resource_size_t component_reg_phys)
@@ -1638,6 +1639,13 @@ static int update_decoder_targets(struct device *dev, void *data)
 	return 0;
 }
 
+void cxl_port_update_decoder_targets(struct cxl_port *port,
+				     struct cxl_dport *dport)
+{
+	device_for_each_child(&port->dev, dport, update_decoder_targets);
+}
+EXPORT_SYMBOL_NS_GPL(cxl_port_update_decoder_targets, "CXL");
+
 static bool dport_exists(struct cxl_port *port, struct device *dport_dev)
 {
 	struct cxl_dport *dport = cxl_find_dport_by_dev(port, dport_dev);
@@ -1651,15 +1659,10 @@ static bool dport_exists(struct cxl_port *port, struct device *dport_dev)
 	return false;
 }
 
-/* note this implicitly casts the group back to its @port */
-DEFINE_FREE(cxl_port_release_dr_group, struct cxl_port *,
-	    if (_T) devres_release_group(&_T->dev, _T))
-
-static struct cxl_dport *cxl_port_add_dport(struct cxl_port *port,
-					    struct device *dport_dev)
+static struct cxl_dport *probe_dport(struct cxl_port *port,
+				     struct device *dport_dev)
 {
-	struct cxl_dport *dport;
-	int rc;
+	struct cxl_driver *drv;
 
 	device_lock_assert(&port->dev);
 	if (!port->dev.driver)
@@ -1668,43 +1671,12 @@ static struct cxl_dport *cxl_port_add_dport(struct cxl_port *port,
 	if (dport_exists(port, dport_dev))
 		return ERR_PTR(-EBUSY);
 
-	/* Temp group for all "first dport" and "per dport" setup actions */
-	void *port_dr_group __free(cxl_port_release_dr_group) =
-		devres_open_group(&port->dev, port, GFP_KERNEL);
-	if (!port_dr_group)
-		return ERR_PTR(-ENOMEM);
-
-	if (port->nr_dports == 0) {
-		/*
-		 * Some host bridges are known to not have component regsisters
-		 * available until a root port has trained CXL. Perform that
-		 * setup now.
-		 */
-		rc = cxl_port_setup_regs(port, port->component_reg_phys);
-		if (rc)
-			return ERR_PTR(rc);
-
-		rc = devm_cxl_switch_port_decoders_setup(port);
-		if (rc)
-			return ERR_PTR(rc);
-	}
-
-	dport = devm_cxl_add_dport_by_dev(port, dport_dev);
-	if (IS_ERR(dport))
-		return dport;
-
-	/* This group was only needed for early exit above */
-	devres_remove_group(&port->dev, no_free_ptr(port_dr_group));
-
-	cxl_switch_parse_cdat(dport);
-
-	/* New dport added, update the decoder targets */
-	device_for_each_child(&port->dev, dport, update_decoder_targets);
-
-	dev_dbg(&port->dev, "dport%d:%s added\n", dport->port_id,
-		dev_name(dport_dev));
+	drv = container_of(port->dev.driver, struct cxl_driver, drv);
+	if (!drv->add_dport)
+		return ERR_PTR(-ENXIO);
 
-	return dport;
+	/* see cxl_port_add_dport() */
+	return drv->add_dport(port, dport_dev);
 }
 
 static struct cxl_dport *devm_cxl_create_port(struct device *ep_dev,
@@ -1751,7 +1723,7 @@ static struct cxl_dport *devm_cxl_create_port(struct device *ep_dev,
 	}
 
 	guard(device)(&port->dev);
-	return cxl_port_add_dport(port, dport_dev);
+	return probe_dport(port, dport_dev);
 }
 
 static int add_port_attach_ep(struct cxl_memdev *cxlmd,
@@ -1783,7 +1755,7 @@ static int add_port_attach_ep(struct cxl_memdev *cxlmd,
 	scoped_guard(device, &parent_port->dev) {
 		parent_dport = cxl_find_dport_by_dev(parent_port, dparent);
 		if (!parent_dport) {
-			parent_dport = cxl_port_add_dport(parent_port, dparent);
+			parent_dport = probe_dport(parent_port, dparent);
 			if (IS_ERR(parent_dport))
 				return PTR_ERR(parent_dport);
 		}
@@ -1819,7 +1791,7 @@ static struct cxl_dport *find_or_add_dport(struct cxl_port *port,
 	device_lock_assert(&port->dev);
 	dport = cxl_find_dport_by_dev(port, dport_dev);
 	if (!dport) {
-		dport = cxl_port_add_dport(port, dport_dev);
+		dport = probe_dport(port, dport_dev);
 		if (IS_ERR(dport))
 			return dport;
 
diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
index 6f3741a57932..4479d632a687 100644
--- a/drivers/cxl/cxl.h
+++ b/drivers/cxl/cxl.h
@@ -840,8 +840,11 @@ struct cxl_endpoint_dvsec_info {
 };
 
 int devm_cxl_switch_port_decoders_setup(struct cxl_port *port);
-int __devm_cxl_switch_port_decoders_setup(struct cxl_port *port);
 int devm_cxl_endpoint_decoders_setup(struct cxl_port *port);
+void cxl_port_update_decoder_targets(struct cxl_port *port,
+				     struct cxl_dport *dport);
+int cxl_port_setup_regs(struct cxl_port *port,
+			resource_size_t component_reg_phys);
 
 struct cxl_dev_state;
 int cxl_dvsec_rr_decode(struct cxl_dev_state *cxlds,
@@ -851,10 +854,18 @@ bool is_cxl_region(struct device *dev);
 
 extern const struct bus_type cxl_bus_type;
 
+/*
+ * Note, add_dport() is expressly for the cxl_port driver. TODO: investigate a
+ * type-safe driver model where probe()/remove() take the type of object implied
+ * by @id and the add_dport() op only defined for the CXL_DEVICE_PORT driver
+ * template.
+ */
 struct cxl_driver {
 	const char *name;
 	int (*probe)(struct device *dev);
 	void (*remove)(struct device *dev);
+	struct cxl_dport *(*add_dport)(struct cxl_port *port,
+				       struct device *dport_dev);
 	struct device_driver drv;
 	int id;
 };
@@ -939,8 +950,6 @@ void cxl_coordinates_combine(struct access_coordinate *out,
 bool cxl_endpoint_decoder_reset_detected(struct cxl_port *port);
 struct cxl_dport *devm_cxl_add_dport_by_dev(struct cxl_port *port,
 					    struct device *dport_dev);
-struct cxl_dport *__devm_cxl_add_dport_by_dev(struct cxl_port *port,
-					      struct device *dport_dev);
 
 /*
  * Unit test builds overrides this to __weak, find the 'strong' version
@@ -952,20 +961,4 @@ struct cxl_dport *__devm_cxl_add_dport_by_dev(struct cxl_port *port,
 
 u16 cxl_gpf_get_dvsec(struct device *dev);
 
-/*
- * Declaration for functions that are mocked by cxl_test that are called by
- * cxl_core. The respective functions are defined as __foo() and called by
- * cxl_core as foo(). The macros below ensures that those functions would
- * exist as foo(). See tools/testing/cxl/cxl_core_exports.c and
- * tools/testing/cxl/exports.h for setting up the mock functions. The dance
- * is done to avoid a circular dependency where cxl_core calls a function that
- * ends up being a mock function and goes to * cxl_test where it calls a
- * cxl_core function.
- */
-#ifndef CXL_TEST_ENABLE
-#define DECLARE_TESTABLE(x) __##x
-#define devm_cxl_add_dport_by_dev DECLARE_TESTABLE(devm_cxl_add_dport_by_dev)
-#define devm_cxl_switch_port_decoders_setup DECLARE_TESTABLE(devm_cxl_switch_port_decoders_setup)
-#endif
-
 #endif /* __CXL_H__ */
diff --git a/drivers/cxl/port.c b/drivers/cxl/port.c
index 51c8f2f84717..913c469e067a 100644
--- a/drivers/cxl/port.c
+++ b/drivers/cxl/port.c
@@ -151,9 +151,59 @@ static const struct attribute_group *cxl_port_attribute_groups[] = {
 	NULL,
 };
 
+/* note this implicitly casts the group back to its @port */
+DEFINE_FREE(cxl_port_release_dr_group, struct cxl_port *,
+	    if (_T) devres_release_group(&_T->dev, _T))
+
+static struct cxl_dport *cxl_port_add_dport(struct cxl_port *port,
+					    struct device *dport_dev)
+{
+	struct cxl_dport *dport;
+	int rc;
+
+	/* Temp group for all "first dport" and "per dport" setup actions */
+	void *port_dr_group __free(cxl_port_release_dr_group) =
+		devres_open_group(&port->dev, port, GFP_KERNEL);
+	if (!port_dr_group)
+		return ERR_PTR(-ENOMEM);
+
+	if (port->nr_dports == 0) {
+		/*
+		 * Some host bridges are known to not have component regsisters
+		 * available until a root port has trained CXL. Perform that
+		 * setup now.
+		 */
+		rc = cxl_port_setup_regs(port, port->component_reg_phys);
+		if (rc)
+			return ERR_PTR(rc);
+
+		rc = devm_cxl_switch_port_decoders_setup(port);
+		if (rc)
+			return ERR_PTR(rc);
+	}
+
+	dport = devm_cxl_add_dport_by_dev(port, dport_dev);
+	if (IS_ERR(dport))
+		return dport;
+
+	/* This group was only needed for early exit above */
+	devres_remove_group(&port->dev, no_free_ptr(port_dr_group));
+
+	cxl_switch_parse_cdat(dport);
+
+	/* New dport added, update the decoder targets */
+	cxl_port_update_decoder_targets(port, dport);
+
+	dev_dbg(&port->dev, "dport%d:%s added\n", dport->port_id,
+		dev_name(dport_dev));
+
+	return dport;
+}
+
 static struct cxl_driver cxl_port_driver = {
 	.name = "cxl_port",
 	.probe = cxl_port_probe,
+	.add_dport = cxl_port_add_dport,
 	.id = CXL_DEVICE_PORT,
 	.drv = {
 		.dev_groups = cxl_port_attribute_groups,
diff --git a/tools/testing/cxl/Kbuild b/tools/testing/cxl/Kbuild
index 6eceefefb0e0..9b2d514a867e 100644
--- a/tools/testing/cxl/Kbuild
+++ b/tools/testing/cxl/Kbuild
@@ -10,6 +10,8 @@ ldflags-y += --wrap=cxl_endpoint_parse_cdat
 ldflags-y += --wrap=cxl_dport_init_ras_reporting
 ldflags-y += --wrap=devm_cxl_endpoint_decoders_setup
 ldflags-y += --wrap=hmat_get_extended_linear_cache_size
+ldflags-y += --wrap=devm_cxl_add_dport_by_dev
+ldflags-y += --wrap=devm_cxl_switch_port_decoders_setup
 
 DRIVERS := ../../../drivers
 CXL_SRC := $(DRIVERS)/cxl
diff --git a/tools/testing/cxl/cxl_core_exports.c b/tools/testing/cxl/cxl_core_exports.c
index 6754de35598d..f088792a8925 100644
--- a/tools/testing/cxl/cxl_core_exports.c
+++ b/tools/testing/cxl/cxl_core_exports.c
@@ -2,28 +2,6 @@
 /* Copyright(c) 2022 Intel Corporation. All rights reserved. */
 
 #include "cxl.h"
-#include "exports.h"
 
 /* Exporting of cxl_core symbols that are only used by cxl_test */
 EXPORT_SYMBOL_NS_GPL(cxl_num_decoders_committed, "CXL");
-
-cxl_add_dport_by_dev_fn _devm_cxl_add_dport_by_dev =
-	__devm_cxl_add_dport_by_dev;
-EXPORT_SYMBOL_NS_GPL(_devm_cxl_add_dport_by_dev, "CXL");
-
-struct cxl_dport *devm_cxl_add_dport_by_dev(struct cxl_port *port,
-					    struct device *dport_dev)
-{
-	return _devm_cxl_add_dport_by_dev(port, dport_dev);
-}
-EXPORT_SYMBOL_NS_GPL(devm_cxl_add_dport_by_dev, "CXL");
-
-cxl_switch_decoders_setup_fn _devm_cxl_switch_port_decoders_setup =
-	__devm_cxl_switch_port_decoders_setup;
-EXPORT_SYMBOL_NS_GPL(_devm_cxl_switch_port_decoders_setup, "CXL");
-
-int devm_cxl_switch_port_decoders_setup(struct cxl_port *port)
-{
-	return _devm_cxl_switch_port_decoders_setup(port);
-}
-EXPORT_SYMBOL_NS_GPL(devm_cxl_switch_port_decoders_setup, "CXL");
diff --git a/tools/testing/cxl/exports.h b/tools/testing/cxl/exports.h
deleted file mode 100644
index 7ebee7c0bd67..000000000000
--- a/tools/testing/cxl/exports.h
+++ /dev/null
@@ -1,13 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright(c) 2025 Intel Corporation */
-#ifndef __MOCK_CXL_EXPORTS_H_
-#define __MOCK_CXL_EXPORTS_H_
-
-typedef struct cxl_dport *(*cxl_add_dport_by_dev_fn)(struct cxl_port *port,
-							  struct device *dport_dev);
-extern cxl_add_dport_by_dev_fn _devm_cxl_add_dport_by_dev;
-
-typedef int(*cxl_switch_decoders_setup_fn)(struct cxl_port *port);
-extern cxl_switch_decoders_setup_fn _devm_cxl_switch_port_decoders_setup;
-
-#endif
diff --git a/tools/testing/cxl/test/mock.c b/tools/testing/cxl/test/mock.c
index 44bce80ef3ff..f307c5b39184 100644
--- a/tools/testing/cxl/test/mock.c
+++ b/tools/testing/cxl/test/mock.c
@@ -10,21 +10,12 @@
 #include <cxlmem.h>
 #include <cxlpci.h>
 #include "mock.h"
-#include "../exports.h"
 
 static LIST_HEAD(mock);
 
-static struct cxl_dport *
-redirect_devm_cxl_add_dport_by_dev(struct cxl_port *port,
-				   struct device *dport_dev);
-static int redirect_devm_cxl_switch_port_decoders_setup(struct cxl_port *port);
-
 void register_cxl_mock_ops(struct cxl_mock_ops *ops)
 {
 	list_add_rcu(&ops->list, &mock);
-	_devm_cxl_add_dport_by_dev = redirect_devm_cxl_add_dport_by_dev;
-	_devm_cxl_switch_port_decoders_setup =
-		redirect_devm_cxl_switch_port_decoders_setup;
 }
 EXPORT_SYMBOL_GPL(register_cxl_mock_ops);
 
@@ -32,9 +23,6 @@ DEFINE_STATIC_SRCU(cxl_mock_srcu);
 
 void unregister_cxl_mock_ops(struct cxl_mock_ops *ops)
 {
-	_devm_cxl_switch_port_decoders_setup =
-		__devm_cxl_switch_port_decoders_setup;
-	_devm_cxl_add_dport_by_dev = __devm_cxl_add_dport_by_dev;
 	list_del_rcu(&ops->list);
 	synchronize_srcu(&cxl_mock_srcu);
 }
@@ -163,7 +151,7 @@ __wrap_nvdimm_bus_register(struct device *dev,
 }
 EXPORT_SYMBOL_GPL(__wrap_nvdimm_bus_register);
 
-int redirect_devm_cxl_switch_port_decoders_setup(struct cxl_port *port)
+int __wrap_devm_cxl_switch_port_decoders_setup(struct cxl_port *port)
 {
 	int rc, index;
 	struct cxl_mock_ops *ops = get_cxl_mock_ops(&index);
@@ -171,11 +159,12 @@ int redirect_devm_cxl_switch_port_decoders_setup(struct cxl_port *port)
 	if (ops && ops->is_mock_port(port->uport_dev))
 		rc = ops->devm_cxl_switch_port_decoders_setup(port);
 	else
-		rc = __devm_cxl_switch_port_decoders_setup(port);
+		rc = devm_cxl_switch_port_decoders_setup(port);
 	put_cxl_mock_ops(index);
 
 	return rc;
 }
+EXPORT_SYMBOL_NS_GPL(__wrap_devm_cxl_switch_port_decoders_setup, "CXL");
 
 int __wrap_devm_cxl_endpoint_decoders_setup(struct cxl_port *port)
 {
@@ -257,8 +246,8 @@ void __wrap_cxl_dport_init_ras_reporting(struct cxl_dport *dport, struct device
 }
 EXPORT_SYMBOL_NS_GPL(__wrap_cxl_dport_init_ras_reporting, "CXL");
 
-struct cxl_dport *redirect_devm_cxl_add_dport_by_dev(struct cxl_port *port,
-						     struct device *dport_dev)
+struct cxl_dport *__wrap_devm_cxl_add_dport_by_dev(struct cxl_port *port,
+						   struct device *dport_dev)
 {
 	int index;
 	struct cxl_mock_ops *ops = get_cxl_mock_ops(&index);
@@ -267,11 +256,12 @@ struct cxl_dport *redirect_devm_cxl_add_dport_by_dev(struct cxl_port *port,
 	if (ops && ops->is_mock_port(port->uport_dev))
 		dport = ops->devm_cxl_add_dport_by_dev(port, dport_dev);
 	else
-		dport = __devm_cxl_add_dport_by_dev(port, dport_dev);
+		dport = devm_cxl_add_dport_by_dev(port, dport_dev);
 	put_cxl_mock_ops(index);
 
 	return dport;
 }
+EXPORT_SYMBOL_NS_GPL(__wrap_devm_cxl_add_dport_by_dev, "CXL");
 
 MODULE_LICENSE("GPL v2");
 MODULE_DESCRIPTION("cxl_test: emulation module");
-- 
cgit v1.2.3


From 7f5ff740ce0bcde242dafcc3f9bb3cbe6b5b8f3a Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Fri, 30 Jan 2026 16:04:00 -0800
Subject: cxl/port: Move dport RAS setup to dport add time

Towards the end goal of making all CXL RAS capability handling uniform
across host bridge ports, upstream switch ports, and endpoint ports, move
dport RAS setup. Move it to cxl_switch_port_probe() context for switch / VH
dports (via cxl_port_add_dport()) and cxl_endpoint_port_probe() context for
an RCH dport. Rename the RAS setup helper to devm_cxl_dport_ras_setup() for
symmetry with devm_cxl_switch_port_decoders_setup().

Only the RCH version needs to be exported and the cxl_test mocking can be
deleted with a dev_is_pci() check on the dport_dev.

Reviewed-by: Dave Jiang <dave.jiang@intel.com>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Tested-by: Terry Bowman <terry.bowman@amd.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Link: https://patch.msgid.link/20260131000403.2135324-7-dan.j.williams@intel.com
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
---
 drivers/cxl/core/core.h       | 10 ++++++++++
 drivers/cxl/core/port.c       | 12 +++---------
 drivers/cxl/core/ras.c        | 30 ++++++++++++++++++------------
 drivers/cxl/cxlpci.h          |  7 ++++---
 drivers/cxl/mem.c             |  2 --
 drivers/cxl/port.c            | 12 ++++++++++++
 tools/testing/cxl/Kbuild      |  1 -
 tools/testing/cxl/test/mock.c | 12 ------------
 8 files changed, 47 insertions(+), 39 deletions(-)

(limited to 'tools')

diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h
index 422531799af2..be3c7b137115 100644
--- a/drivers/cxl/core/core.h
+++ b/drivers/cxl/core/core.h
@@ -144,6 +144,14 @@ int cxl_pci_get_bandwidth(struct pci_dev *pdev, struct access_coordinate *c);
 int cxl_port_get_switch_dport_bandwidth(struct cxl_port *port,
 					struct access_coordinate *c);
 
+static inline struct device *dport_to_host(struct cxl_dport *dport)
+{
+	struct cxl_port *port = dport->port;
+
+	if (is_cxl_root(port))
+		return port->uport_dev;
+	return &port->dev;
+}
 #ifdef CONFIG_CXL_RAS
 int cxl_ras_init(void);
 void cxl_ras_exit(void);
@@ -152,6 +160,7 @@ void cxl_handle_cor_ras(struct device *dev, void __iomem *ras_base);
 void cxl_dport_map_rch_aer(struct cxl_dport *dport);
 void cxl_disable_rch_root_ints(struct cxl_dport *dport);
 void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds);
+void devm_cxl_dport_ras_setup(struct cxl_dport *dport);
 #else
 static inline int cxl_ras_init(void)
 {
@@ -166,6 +175,7 @@ static inline void cxl_handle_cor_ras(struct device *dev, void __iomem *ras_base
 static inline void cxl_dport_map_rch_aer(struct cxl_dport *dport) { }
 static inline void cxl_disable_rch_root_ints(struct cxl_dport *dport) { }
 static inline void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds) { }
+static inline void devm_cxl_dport_ras_setup(struct cxl_dport *dport) { }
 #endif /* CONFIG_CXL_RAS */
 
 int cxl_gpf_port_setup(struct cxl_dport *dport);
diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c
index 7356e1725db8..9f56f7e75e81 100644
--- a/drivers/cxl/core/port.c
+++ b/drivers/cxl/core/port.c
@@ -1119,15 +1119,6 @@ static void cxl_dport_unlink(void *data)
 	sysfs_remove_link(&port->dev.kobj, link_name);
 }
 
-static struct device *dport_to_host(struct cxl_dport *dport)
-{
-	struct cxl_port *port = dport->port;
-
-	if (is_cxl_root(port))
-		return port->uport_dev;
-	return &port->dev;
-}
-
 static void free_dport(void *dport)
 {
 	kfree(dport);
@@ -1261,6 +1252,9 @@ __devm_cxl_add_dport(struct cxl_port *port, struct device *dport_dev,
 
 	cxl_debugfs_create_dport_dir(dport);
 
+	if (!dport->rch)
+		devm_cxl_dport_ras_setup(dport);
+
 	/* keep the group, and mark the end of devm actions */
 	cxl_dport_close_dr_group(dport, no_free_ptr(dport_dr_group));
 
diff --git a/drivers/cxl/core/ras.c b/drivers/cxl/core/ras.c
index 72908f3ced77..e90b7a91bf5d 100644
--- a/drivers/cxl/core/ras.c
+++ b/drivers/cxl/core/ras.c
@@ -139,26 +139,32 @@ static void cxl_dport_map_ras(struct cxl_dport *dport)
 }
 
 /**
- * cxl_dport_init_ras_reporting - Setup CXL RAS report on this dport
+ * devm_cxl_dport_ras_setup - Setup CXL RAS report on this dport
  * @dport: the cxl_dport that needs to be initialized
- * @host: host device for devm operations
  */
-void cxl_dport_init_ras_reporting(struct cxl_dport *dport, struct device *host)
+void devm_cxl_dport_ras_setup(struct cxl_dport *dport)
 {
-	dport->reg_map.host = host;
+	dport->reg_map.host = dport_to_host(dport);
 	cxl_dport_map_ras(dport);
+}
 
-	if (dport->rch) {
-		struct pci_host_bridge *host_bridge = to_pci_host_bridge(dport->dport_dev);
+void devm_cxl_dport_rch_ras_setup(struct cxl_dport *dport)
+{
+	struct pci_host_bridge *host_bridge;
 
-		if (!host_bridge->native_aer)
-			return;
+	if (!dev_is_pci(dport->dport_dev))
+		return;
 
-		cxl_dport_map_rch_aer(dport);
-		cxl_disable_rch_root_ints(dport);
-	}
+	devm_cxl_dport_ras_setup(dport);
+
+	host_bridge = to_pci_host_bridge(dport->dport_dev);
+	if (!host_bridge->native_aer)
+		return;
+
+	cxl_dport_map_rch_aer(dport);
+	cxl_disable_rch_root_ints(dport);
 }
-EXPORT_SYMBOL_NS_GPL(cxl_dport_init_ras_reporting, "CXL");
+EXPORT_SYMBOL_NS_GPL(devm_cxl_dport_rch_ras_setup, "CXL");
 
 void cxl_handle_cor_ras(struct device *dev, void __iomem *ras_base)
 {
diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h
index 6f9c78886fd9..65575371a35c 100644
--- a/drivers/cxl/cxlpci.h
+++ b/drivers/cxl/cxlpci.h
@@ -81,7 +81,7 @@ void read_cdat_data(struct cxl_port *port);
 void cxl_cor_error_detected(struct pci_dev *pdev);
 pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
 				    pci_channel_state_t state);
-void cxl_dport_init_ras_reporting(struct cxl_dport *dport, struct device *host);
+void devm_cxl_dport_rch_ras_setup(struct cxl_dport *dport);
 #else
 static inline void cxl_cor_error_detected(struct pci_dev *pdev) { }
 
@@ -91,8 +91,9 @@ static inline pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
 	return PCI_ERS_RESULT_NONE;
 }
 
-static inline void cxl_dport_init_ras_reporting(struct cxl_dport *dport,
-						struct device *host) { }
+static inline void devm_cxl_dport_rch_ras_setup(struct cxl_dport *dport)
+{
+}
 #endif
 
 #endif /* __CXL_PCI_H__ */
diff --git a/drivers/cxl/mem.c b/drivers/cxl/mem.c
index c2ee7f7f6320..e25c33f8c6cf 100644
--- a/drivers/cxl/mem.c
+++ b/drivers/cxl/mem.c
@@ -166,8 +166,6 @@ static int cxl_mem_probe(struct device *dev)
 	else
 		endpoint_parent = &parent_port->dev;
 
-	cxl_dport_init_ras_reporting(dport, dev);
-
 	scoped_guard(device, endpoint_parent) {
 		if (!endpoint_parent->driver) {
 			dev_err(dev, "CXL port topology %s not enabled\n",
diff --git a/drivers/cxl/port.c b/drivers/cxl/port.c
index 913c469e067a..929f7e259f0d 100644
--- a/drivers/cxl/port.c
+++ b/drivers/cxl/port.c
@@ -71,6 +71,7 @@ static int cxl_switch_port_probe(struct cxl_port *port)
 static int cxl_endpoint_port_probe(struct cxl_port *port)
 {
 	struct cxl_memdev *cxlmd = to_cxl_memdev(port->uport_dev);
+	struct cxl_dport *dport = port->parent_dport;
 	int rc;
 
 	/* Cache the data early to ensure is_visible() works */
@@ -86,6 +87,17 @@ static int cxl_endpoint_port_probe(struct cxl_port *port)
 	if (rc)
 		return rc;
 
+	/*
+	 * With VH (CXL Virtual Host) topology the cxl_port::add_dport() method
+	 * handles RAS setup for downstream ports. With RCH (CXL Restricted CXL
+	 * Host) topologies the downstream port is enumerated early by platform
+	 * firmware, but the RCRB (root complex register block) is not mapped
+	 * until after the cxl_pci driver attaches to the RCIeP (root complex
+	 * integrated endpoint).
+	 */
+	if (dport->rch)
+		devm_cxl_dport_rch_ras_setup(dport);
+
 	/*
 	 * Now that all endpoint decoders are successfully enumerated, try to
 	 * assemble regions from committed decoders
diff --git a/tools/testing/cxl/Kbuild b/tools/testing/cxl/Kbuild
index 9b2d514a867e..982e8ea28b92 100644
--- a/tools/testing/cxl/Kbuild
+++ b/tools/testing/cxl/Kbuild
@@ -7,7 +7,6 @@ ldflags-y += --wrap=nvdimm_bus_register
 ldflags-y += --wrap=cxl_await_media_ready
 ldflags-y += --wrap=devm_cxl_add_rch_dport
 ldflags-y += --wrap=cxl_endpoint_parse_cdat
-ldflags-y += --wrap=cxl_dport_init_ras_reporting
 ldflags-y += --wrap=devm_cxl_endpoint_decoders_setup
 ldflags-y += --wrap=hmat_get_extended_linear_cache_size
 ldflags-y += --wrap=devm_cxl_add_dport_by_dev
diff --git a/tools/testing/cxl/test/mock.c b/tools/testing/cxl/test/mock.c
index f307c5b39184..b8fcb50c1027 100644
--- a/tools/testing/cxl/test/mock.c
+++ b/tools/testing/cxl/test/mock.c
@@ -234,18 +234,6 @@ void __wrap_cxl_endpoint_parse_cdat(struct cxl_port *port)
 }
 EXPORT_SYMBOL_NS_GPL(__wrap_cxl_endpoint_parse_cdat, "CXL");
 
-void __wrap_cxl_dport_init_ras_reporting(struct cxl_dport *dport, struct device *host)
-{
-	int index;
-	struct cxl_mock_ops *ops = get_cxl_mock_ops(&index);
-
-	if (!ops || !ops->is_mock_port(dport->dport_dev))
-		cxl_dport_init_ras_reporting(dport, host);
-
-	put_cxl_mock_ops(index);
-}
-EXPORT_SYMBOL_NS_GPL(__wrap_cxl_dport_init_ras_reporting, "CXL");
-
 struct cxl_dport *__wrap_devm_cxl_add_dport_by_dev(struct cxl_port *port,
 						   struct device *dport_dev)
 {
-- 
cgit v1.2.3


From 2e06d54ea9a25e2925a31eb5410af0f16baa8f19 Mon Sep 17 00:00:00 2001
From: Emil Tsalapatis <emil@etsalapatis.com>
Date: Sun, 1 Feb 2026 19:42:08 -0500
Subject: tools/sched_ext: Fix data header access during free in scx_sdt

Fix a pointer arithmetic error in scx_sdt during freeing that
causes the allocator to use the wrong memory address for the
allocation's data header.

Fixes: 36929ebd17ae ("tools/sched_ext: add arena based scheduler")
Signed-off-by: Emil Tsalapatis <emil@etsalapatis.com>
Acked-by: Andrea Righi <arighi@nvidia.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 tools/sched_ext/scx_sdt.bpf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/sched_ext/scx_sdt.bpf.c b/tools/sched_ext/scx_sdt.bpf.c
index d965f7d209de..31b09958e8d5 100644
--- a/tools/sched_ext/scx_sdt.bpf.c
+++ b/tools/sched_ext/scx_sdt.bpf.c
@@ -312,7 +312,7 @@ int scx_alloc_free_idx(struct scx_allocator *alloc, __u64 idx)
 	pos = idx & mask;
 	data = chunk->data[pos];
 	if (likely(data)) {
-		data[pos] = (struct sdt_data) {
+		*data = (struct sdt_data) {
 			.tid.genn = data->tid.genn + 1,
 		};
 
-- 
cgit v1.2.3


From 98f51c466aebdb5afa7cb7d54aa7eb9f04b468ee Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Date: Tue, 27 Jan 2026 09:03:32 +0100
Subject: docs: kdoc: Fix pdfdocs build for tools

the "\1" inside a docstring requires proper scaping to not be
considered a hex character and break the build.

Reported-by: Akira Yokosawa <akiyks@gmail.com>
Closes: https://lore.kernel.org/linux-doc/63e99049-cc72-4156-83af-414fdde34312@gmail.com/
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
Message-ID: <2fff8ef1d0d64e8b68f15f5c07613f302d773855.1769500383.git.mchehab+huawei@kernel.org>
---
 tools/lib/python/kdoc/kdoc_re.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/python/kdoc/kdoc_re.py b/tools/lib/python/kdoc/kdoc_re.py
index 2816bd9f90f8..0bf9e01cdc57 100644
--- a/tools/lib/python/kdoc/kdoc_re.py
+++ b/tools/lib/python/kdoc/kdoc_re.py
@@ -228,14 +228,18 @@ class NestedMatch:
             yield line[t[0]:t[2]]
 
     def sub(self, regex, sub, line, count=0):
-        """
+        r"""
         This is similar to re.sub:
 
         It matches a regex that it is followed by a delimiter,
         replacing occurrences only if all delimiters are paired.
 
-        if r'\1' is used, it works just like re: it places there the
-        matched paired data with the delimiter stripped.
+        if the sub argument contains::
+
+            r'\1'
+
+        it will work just like re: it places there the matched paired data
+        with the delimiter stripped.
 
         If count is different than zero, it will replace at most count
         items.
-- 
cgit v1.2.3


From b09cc1ddde9707ef62d2dd1070a1c99556ed7d76 Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Date: Tue, 27 Jan 2026 09:03:33 +0100
Subject: docs: sphinx-build-wrapper: allow -v override -q

Documentation builds were using "-q" for a long time, but sometimes
it is nice to see the Sphinx progress, without increasing build
verbosity - which would also turn on kernel-doc verbosity.

Instead of doing that, let's parse the sphinx-build already-existing
-v: each time it is used, it increases the verbosity level.

With that, if the default is to use -q, a single -v will disable
quiet mode. Passing more -v will keep increasing its verbosity.

Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
Message-ID: <38b24e97a3cbd2def418359a8e69b1b087a945ad.1769500383.git.mchehab+huawei@kernel.org>
---
 tools/docs/sphinx-build-wrapper | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/docs/sphinx-build-wrapper b/tools/docs/sphinx-build-wrapper
index 78ff7ac202ef..8080ace60680 100755
--- a/tools/docs/sphinx-build-wrapper
+++ b/tools/docs/sphinx-build-wrapper
@@ -168,6 +168,7 @@ class SphinxBuilder:
         parser = argparse.ArgumentParser()
         parser.add_argument('-j', '--jobs', type=int)
         parser.add_argument('-q', '--quiet', action='store_true')
+        parser.add_argument('-v', '--verbose', default=0, action='count')
 
         #
         # Other sphinx-build arguments go as-is, so place them
@@ -179,10 +180,14 @@ class SphinxBuilder:
         # Build a list of sphinx args, honoring verbosity here if specified
         #
 
-        verbose = self.verbose
         sphinx_args, self.sphinxopts = parser.parse_known_args(sphinxopts)
+
+        verbose = sphinx_args.verbose
+        if self.verbose:
+            verbose += 1
+
         if sphinx_args.quiet is True:
-            verbose = False
+            verbose = 0
 
         #
         # If the user explicitly sets "-j" at command line, use it.
@@ -195,8 +200,11 @@ class SphinxBuilder:
         else:
             self.n_jobs = None
 
-        if not verbose:
+        if verbose < 1:
             self.sphinxopts += ["-q"]
+        else:
+            for i in range(1, sphinx_args.verbose):
+                self.sphinxopts += ["-v"]
 
     def __init__(self, builddir, venv=None, verbose=False, n_jobs=None,
                  interactive=None):
-- 
cgit v1.2.3


From 64e4882c8228b07c57e75f510b8c5d7ff46e4edc Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Date: Tue, 27 Jan 2026 09:03:34 +0100
Subject: tools: sphinx-build-wrapper: improve its help message

Besides the parameters that are passed via command line arguments,
the wrapper's behavior is affected by several environment variables.

Document that. While here, use __doc__ for its description.

Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
Message-ID: <e0ccee75f7e7fb499e0f59d5b84469f4b6a21627.1769500383.git.mchehab+huawei@kernel.org>
---
 tools/docs/sphinx-build-wrapper | 28 +++++++++++++++++++++++++---
 1 file changed, 25 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/docs/sphinx-build-wrapper b/tools/docs/sphinx-build-wrapper
index 8080ace60680..b7c149dff06b 100755
--- a/tools/docs/sphinx-build-wrapper
+++ b/tools/docs/sphinx-build-wrapper
@@ -814,20 +814,42 @@ def jobs_type(value):
     except ValueError:
         raise argparse.ArgumentTypeError(f"Must be 'auto' or positive integer, got {value}")  # pylint: disable=W0707
 
+EPILOG="""
+Besides the command line arguments, several environment variables affect its
+default behavior, meant to be used when called via Kernel Makefile:
+
+- KERNELVERSION:  Kernel major version
+- KERNELRELEASE:  Kernel release
+- KBUILD_VERBOSE: Contains the value of "make V=[0|1] variable.
+                  When V=0 (KBUILD_VERBOSE=0), sets verbose level to "-q".
+- SPHINXBUILD:    Documentation build tool (default: "sphinx-build").
+- SPHINXOPTS:     Extra options pased to SPHINXBUILD
+                  (default: "-j auto" and "-q" if KBUILD_VERBOSE=0).
+                  The "-v" flag can be used to increase verbosity.
+                  If V=0, the first "-v" will drop "-q".
+- PYTHON3:        Python command to run SPHINXBUILD
+- PDFLATEX:       LaTeX PDF engine. (default: "xelatex")
+- LATEXOPTS:      Optional set of command line arguments to the LaTeX engine
+- srctree:        Location of the Kernel root directory (default: ".").
+
+"""
+
 def main():
     """
     Main function. The only mandatory argument is the target. If not
     specified, the other arguments will use default values if not
     specified at os.environ.
     """
-    parser = argparse.ArgumentParser(description="Kernel documentation builder")
+    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter,
+                                     description=__doc__,
+                                     epilog=EPILOG)
 
     parser.add_argument("target", choices=list(TARGETS.keys()),
                         help="Documentation target to build")
     parser.add_argument("--sphinxdirs", nargs="+",
                         help="Specific directories to build")
     parser.add_argument("--builddir", default="output",
-                        help="Sphinx configuration file")
+                        help="Sphinx configuration file (default: %(default)s)")
 
     parser.add_argument("--theme", help="Sphinx theme to use")
 
@@ -843,7 +865,7 @@ def main():
                         help="place build in verbose mode")
 
     parser.add_argument('-j', '--jobs', type=jobs_type,
-                        help="Sets number of jobs to use with sphinx-build")
+                        help="Sets number of jobs to use with sphinx-build(default: auto)")
 
     parser.add_argument('-i', '--interactive', action='store_true',
                         help="Change latex default to run in interactive mode")
-- 
cgit v1.2.3


From 4544e9c4ec9a5955a37fdd8204a3d98106f97ab7 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 2 Feb 2026 09:40:22 -1000
Subject: selftests/sched_ext: Fix init_enable_count flakiness

The init_enable_count test is flaky. The test forks 1024 children before
attaching the scheduler to verify that existing tasks get ops.init_task()
called. The children were using sleep(1) before exiting.

7900aa699c34 ("sched_ext: Fix cgroup exit ordering by moving sched_ext_free()
to finish_task_switch()") changed when tasks are removed from scx_tasks -
previously when the task_struct was freed, now immediately in
finish_task_switch() when the task dies.

Before the commit, pre-forked children would linger on scx_tasks until freed
regardless of when they exited, so the scheduler would always see them during
iteration. The sleep(1) was unnecessary. After the commit, children are
removed as soon as they die. The sleep(1) masks the problem in most cases but
the test becomes flaky depending on timing.

Fix by synchronizing properly using a pipe. All children block on read() and
the parent signals them to exit by closing the write end after attaching the
scheduler. The children are auto-reaped so there's no need to wait on them.

Reported-by: Ihor Solodrai <ihor.solodrai@linux.dev>
Cc: David Vernet <void@manifault.com>
Cc: Andrea Righi <arighi@nvidia.com>
Cc: Changwoo Min <changwoo@igalia.com>
Cc: Emil Tsalapatis <emil@etsalapatis.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 .../selftests/sched_ext/init_enable_count.c        | 34 +++++++++++++++-------
 1 file changed, 23 insertions(+), 11 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/sched_ext/init_enable_count.c b/tools/testing/selftests/sched_ext/init_enable_count.c
index eddf9e0e26e7..82c71653977b 100644
--- a/tools/testing/selftests/sched_ext/init_enable_count.c
+++ b/tools/testing/selftests/sched_ext/init_enable_count.c
@@ -4,6 +4,7 @@
  * Copyright (c) 2023 David Vernet <dvernet@meta.com>
  * Copyright (c) 2023 Tejun Heo <tj@kernel.org>
  */
+#include <signal.h>
 #include <stdio.h>
 #include <unistd.h>
 #include <sched.h>
@@ -23,6 +24,9 @@ static enum scx_test_status run_test(bool global)
 	int ret, i, status;
 	struct sched_param param = {};
 	pid_t pids[num_pre_forks];
+	int pipe_fds[2];
+
+	SCX_FAIL_IF(pipe(pipe_fds) < 0, "Failed to create pipe");
 
 	skel = init_enable_count__open();
 	SCX_FAIL_IF(!skel, "Failed to open");
@@ -38,26 +42,34 @@ static enum scx_test_status run_test(bool global)
 	 * ensure (at least in practical terms) that there are more tasks that
 	 * transition from SCHED_OTHER -> SCHED_EXT than there are tasks that
 	 * take the fork() path either below or in other processes.
+	 *
+	 * All children will block on read() on the pipe until the parent closes
+	 * the write end after attaching the scheduler, which signals all of
+	 * them to exit simultaneously. Auto-reap so we don't have to wait on
+	 * them.
 	 */
+	signal(SIGCHLD, SIG_IGN);
 	for (i = 0; i < num_pre_forks; i++) {
-		pids[i] = fork();
-		SCX_FAIL_IF(pids[i] < 0, "Failed to fork child");
-		if (pids[i] == 0) {
-			sleep(1);
+		pid_t pid = fork();
+
+		SCX_FAIL_IF(pid < 0, "Failed to fork child");
+		if (pid == 0) {
+			char buf;
+
+			close(pipe_fds[1]);
+			read(pipe_fds[0], &buf, 1);
+			close(pipe_fds[0]);
 			exit(0);
 		}
 	}
+	close(pipe_fds[0]);
 
 	link = bpf_map__attach_struct_ops(skel->maps.init_enable_count_ops);
 	SCX_FAIL_IF(!link, "Failed to attach struct_ops");
 
-	for (i = 0; i < num_pre_forks; i++) {
-		SCX_FAIL_IF(waitpid(pids[i], &status, 0) != pids[i],
-			    "Failed to wait for pre-forked child\n");
-
-		SCX_FAIL_IF(status != 0, "Pre-forked child %d exited with status %d\n", i,
-			    status);
-	}
+	/* Signal all pre-forked children to exit. */
+	close(pipe_fds[1]);
+	signal(SIGCHLD, SIG_DFL);
 
 	bpf_link__destroy(link);
 	SCX_GE(skel->bss->init_task_cnt, num_pre_forks);
-- 
cgit v1.2.3


From 9e3d4dae98325928f842192359521ca0a2e5408e Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Sat, 31 Jan 2026 14:54:53 -0800
Subject: selftests: drv-net: rss: validate min RSS table size

Add a test which checks that the RSS table is at least 4x the max
queue count supported by the device. The original RSS spec from
Microsoft stated that the RSS indirection table should be 2 to 8
times the CPU count, presumably assuming queue per CPU. If the
CPU count is not a power of two, however, a power-of-2 table
2x larger than queue count results in a 33% traffic imbalance.
Validate that the indirection table is at least 4x the queue
count. This lowers the imbalance to 16% which empirically
appears to be more acceptable to memcache-like workloads.

Reviewed-by: Willem de Bruijn <willemb@google.com>
Link: https://patch.msgid.link/20260131225454.1225151-1-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/drivers/net/hw/Makefile   |  1 +
 tools/testing/selftests/drivers/net/hw/rss_drv.py | 88 +++++++++++++++++++++++
 2 files changed, 89 insertions(+)
 create mode 100755 tools/testing/selftests/drivers/net/hw/rss_drv.py

(limited to 'tools')

diff --git a/tools/testing/selftests/drivers/net/hw/Makefile b/tools/testing/selftests/drivers/net/hw/Makefile
index 9c163ba6feee..a64140333a46 100644
--- a/tools/testing/selftests/drivers/net/hw/Makefile
+++ b/tools/testing/selftests/drivers/net/hw/Makefile
@@ -35,6 +35,7 @@ TEST_PROGS = \
 	pp_alloc_fail.py \
 	rss_api.py \
 	rss_ctx.py \
+	rss_drv.py \
 	rss_flow_label.py \
 	rss_input_xfrm.py \
 	toeplitz.py \
diff --git a/tools/testing/selftests/drivers/net/hw/rss_drv.py b/tools/testing/selftests/drivers/net/hw/rss_drv.py
new file mode 100755
index 000000000000..2d1a33189076
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/hw/rss_drv.py
@@ -0,0 +1,88 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: GPL-2.0
+
+"""
+Driver-related behavior tests for RSS.
+"""
+
+from lib.py import ksft_run, ksft_exit, ksft_ge
+from lib.py import ksft_variants, KsftNamedVariant, KsftSkipEx
+from lib.py import defer, ethtool
+from lib.py import EthtoolFamily, NlError
+from lib.py import NetDrvEnv
+
+
+def _is_power_of_two(n):
+    return n > 0 and (n & (n - 1)) == 0
+
+
+def _get_rss(cfg, context=0):
+    return ethtool(f"-x {cfg.ifname} context {context}", json=True)[0]
+
+
+def _test_rss_indir_size(cfg, qcnt, context=0):
+    """Test that indirection table size is at least 4x queue count."""
+    ethtool(f"-L {cfg.ifname} combined {qcnt}")
+
+    rss = _get_rss(cfg, context=context)
+    indir = rss['rss-indirection-table']
+    ksft_ge(len(indir), 4 * qcnt, "Table smaller than 4x")
+    return len(indir)
+
+
+def _maybe_create_context(cfg, create_context):
+    """ Either create a context and return its ID or return 0 for main ctx """
+    if not create_context:
+        return 0
+    try:
+        ctx = cfg.ethnl.rss_create_act({'header': {'dev-index': cfg.ifindex}})
+        ctx_id = ctx['context']
+        defer(cfg.ethnl.rss_delete_act,
+              {'header': {'dev-index': cfg.ifindex}, 'context': ctx_id})
+    except NlError:
+        raise KsftSkipEx("Device does not support additional RSS contexts")
+
+    return ctx_id
+
+
+@ksft_variants([
+    KsftNamedVariant("main", False),
+    KsftNamedVariant("ctx", True),
+])
+def indir_size_4x(cfg, create_context):
+    """
+    Test that the indirection table has at least 4 entries per queue.
+    Empirically network-heavy workloads like memcache suffer with the 33%
+    imbalance of a 2x indirection table size.
+    4x table translates to a 16% imbalance.
+    """
+    channels = cfg.ethnl.channels_get({'header': {'dev-index': cfg.ifindex}})
+    ch_max = channels.get('combined-max', 0)
+    qcnt = channels['combined-count']
+
+    if ch_max < 3:
+        raise KsftSkipEx(f"Not enough queues for the test: max={ch_max}")
+
+    defer(ethtool, f"-L {cfg.ifname} combined {qcnt}")
+    ethtool(f"-L {cfg.ifname} combined 3")
+
+    ctx_id = _maybe_create_context(cfg, create_context)
+
+    indir_sz = _test_rss_indir_size(cfg, 3, context=ctx_id)
+
+    # Test with max queue count (max - 1 if max is a power of two)
+    test_max = ch_max - 1 if _is_power_of_two(ch_max) else ch_max
+    if test_max > 3 and indir_sz < test_max * 4:
+        _test_rss_indir_size(cfg, test_max, context=ctx_id)
+
+
+def main() -> None:
+    """ Ksft boiler plate main """
+    with NetDrvEnv(__file__) as cfg:
+        cfg.ethnl = EthtoolFamily()
+        ksft_run([indir_size_4x], args=(cfg, ))
+    ksft_exit()
+
+
+if __name__ == "__main__":
+    main()
-- 
cgit v1.2.3


From 71a58ec6672fbb7ae9f1b4a8ee1b5c352af93c0d Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Sat, 31 Jan 2026 12:30:29 -0800
Subject: tools: ynl: cli: make the output compact

Make the default (non-JSON) output more compact. Looking at RSS
context dumps is pretty much impossible without this, because
default print shows the indirection table with line per entry:

  'indir': [0,
            1,
            2,
	    ...

And indirection tables have 100-200 entries each.

The compact output is far more readable:

    'indir': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
              16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,

Reviewed-by: Donald Hunter <donald.hunter@gmail.com>
Link: https://patch.msgid.link/20260131203029.1173492-1-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/net/ynl/pyynl/cli.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/net/ynl/pyynl/cli.py b/tools/net/ynl/pyynl/cli.py
index fdac1ab10a40..94a5ba348b69 100755
--- a/tools/net/ynl/pyynl/cli.py
+++ b/tools/net/ynl/pyynl/cli.py
@@ -44,6 +44,10 @@ def color(text, modifiers):
         return f"{modifiers}{text}{Colors.RESET}"
     return text
 
+def term_width():
+    """ Get terminal width in columns (80 if stdout is not a terminal) """
+    return shutil.get_terminal_size().columns
+
 def schema_dir():
     """
     Return the effective schema directory, preferring in-tree before
@@ -103,8 +107,7 @@ def print_attr_list(ynl, attr_names, attr_set, indent=2):
 
             if attr.yaml.get('doc'):
                 doc_prefix = prefix + ' ' * 4
-                term_width = shutil.get_terminal_size().columns
-                doc_text = textwrap.fill(attr.yaml['doc'], width=term_width,
+                doc_text = textwrap.fill(attr.yaml['doc'], width=term_width(),
                                          initial_indent=doc_prefix,
                                          subsequent_indent=doc_prefix)
                 attr_info += f"\n{doc_text}"
@@ -264,7 +267,7 @@ def main():
         if args.output_json:
             print(json.dumps(msg, cls=YnlEncoder))
         else:
-            pprint.PrettyPrinter().pprint(msg)
+            pprint.pprint(msg, width=term_width(), compact=True)
 
     if args.list_families:
         for filename in sorted(os.listdir(spec_dir())):
-- 
cgit v1.2.3


From 6a059c6bfb557a8c72634d761d78463a6e224547 Mon Sep 17 00:00:00 2001
From: Geliang Tang <tanggeliang@kylinos.cn>
Date: Fri, 30 Jan 2026 20:24:28 +0100
Subject: selftests: mptcp: add splice io mode

This patch adds a new 'splice' io mode for mptcp_connect to test
the newly added read_sock() and splice_read() functions of MPTCP.

do_splice() efficiently transfers data directly between two file
descriptors (infd and outfd) without copying to userspace, using
Linux's splice() system call.

Usage:
	./mptcp_connect.sh -m splice

Signed-off-by: Geliang Tang <tanggeliang@kylinos.cn>
Reviewed-by: Mat Martineau <martineau@kernel.org>
Co-developed-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Link: https://patch.msgid.link/20260130-net-next-mptcp-splice-v2-5-31332ba70d7f@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/mptcp/mptcp_connect.c | 79 ++++++++++++++++++++++-
 1 file changed, 78 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.c b/tools/testing/selftests/net/mptcp/mptcp_connect.c
index 10f6f99cfd4e..a74b13e42ecd 100644
--- a/tools/testing/selftests/net/mptcp/mptcp_connect.c
+++ b/tools/testing/selftests/net/mptcp/mptcp_connect.c
@@ -52,6 +52,7 @@ enum cfg_mode {
 	CFG_MODE_POLL,
 	CFG_MODE_MMAP,
 	CFG_MODE_SENDFILE,
+	CFG_MODE_SPLICE,
 };
 
 enum cfg_peek {
@@ -124,7 +125,7 @@ static void die_usage(void)
 	fprintf(stderr, "\t-j     -- add additional sleep at connection start and tear down "
 		"-- for MPJ tests\n");
 	fprintf(stderr, "\t-l     -- listens mode, accepts incoming connection\n");
-	fprintf(stderr, "\t-m [poll|mmap|sendfile] -- use poll(default)/mmap+write/sendfile\n");
+	fprintf(stderr, "\t-m [poll|mmap|sendfile|splice] -- use poll(default)/mmap+write/sendfile/splice\n");
 	fprintf(stderr, "\t-M mark -- set socket packet mark\n");
 	fprintf(stderr, "\t-o option -- test sockopt <option>\n");
 	fprintf(stderr, "\t-p num -- use port num\n");
@@ -935,6 +936,71 @@ static int copyfd_io_sendfile(int infd, int peerfd, int outfd,
 	return err;
 }
 
+static int do_splice(const int infd, const int outfd, const size_t len,
+		     struct wstate *winfo)
+{
+	ssize_t in_bytes, out_bytes;
+	int pipefd[2];
+	int err;
+
+	err = pipe(pipefd);
+	if (err) {
+		perror("pipe");
+		return 2;
+	}
+
+again:
+	in_bytes = splice(infd, NULL, pipefd[1], NULL, len - winfo->total_len,
+			  SPLICE_F_MOVE | SPLICE_F_MORE);
+	if (in_bytes < 0) {
+		perror("splice in");
+		err = 3;
+	} else if (in_bytes > 0) {
+		out_bytes = splice(pipefd[0], NULL, outfd, NULL, in_bytes,
+				   SPLICE_F_MOVE | SPLICE_F_MORE);
+		if (out_bytes < 0) {
+			perror("splice out");
+			err = 4;
+		} else if (in_bytes != out_bytes) {
+			fprintf(stderr, "Unexpected transfer: %zu vs %zu\n",
+				in_bytes, out_bytes);
+			err = 5;
+		} else {
+			goto again;
+		}
+	}
+
+	close(pipefd[0]);
+	close(pipefd[1]);
+
+	return err;
+}
+
+static int copyfd_io_splice(int infd, int peerfd, int outfd, unsigned int size,
+			    bool *in_closed_after_out, struct wstate *winfo)
+{
+	int err;
+
+	if (listen_mode) {
+		err = do_splice(peerfd, outfd, size, winfo);
+		if (err)
+			return err;
+
+		err = do_splice(infd, peerfd, size, winfo);
+	} else {
+		err = do_splice(infd, peerfd, size, winfo);
+		if (err)
+			return err;
+
+		shut_wr(peerfd);
+
+		err = do_splice(peerfd, outfd, size, winfo);
+		*in_closed_after_out = true;
+	}
+
+	return err;
+}
+
 static int copyfd_io(int infd, int peerfd, int outfd, bool close_peerfd, struct wstate *winfo)
 {
 	bool in_closed_after_out = false;
@@ -967,6 +1033,14 @@ static int copyfd_io(int infd, int peerfd, int outfd, bool close_peerfd, struct
 					 &in_closed_after_out, winfo);
 		break;
 
+	case CFG_MODE_SPLICE:
+		file_size = get_infd_size(infd);
+		if (file_size < 0)
+			return file_size;
+		ret = copyfd_io_splice(infd, peerfd, outfd, file_size,
+				       &in_closed_after_out, winfo);
+		break;
+
 	default:
 		fprintf(stderr, "Invalid mode %d\n", cfg_mode);
 
@@ -1380,12 +1454,15 @@ int parse_mode(const char *mode)
 		return CFG_MODE_MMAP;
 	if (!strcasecmp(mode, "sendfile"))
 		return CFG_MODE_SENDFILE;
+	if (!strcasecmp(mode, "splice"))
+		return CFG_MODE_SPLICE;
 
 	fprintf(stderr, "Unknown test mode: %s\n", mode);
 	fprintf(stderr, "Supported modes are:\n");
 	fprintf(stderr, "\t\t\"poll\" - interleaved read/write using poll()\n");
 	fprintf(stderr, "\t\t\"mmap\" - send entire input file (mmap+write), then read response (-l will read input first)\n");
 	fprintf(stderr, "\t\t\"sendfile\" - send entire input file (sendfile), then read response (-l will read input first)\n");
+	fprintf(stderr, "\t\t\"splice\" - send entire input file (splice), then read response (-l will read input first)\n");
 
 	die_usage();
 
-- 
cgit v1.2.3


From 2f2dc84645fb25960a0f52aff4d754fce43edea4 Mon Sep 17 00:00:00 2001
From: Geliang Tang <tanggeliang@kylinos.cn>
Date: Fri, 30 Jan 2026 20:24:29 +0100
Subject: selftests: mptcp: connect: cover splice mode

The "splice" alternate mode for mptcp_connect.sh/.c is available now,
this patch adds mptcp_connect_splice.sh to test it in the MPTCP CI by
default.

Note that this mode is also supported by stable kernel versions, but
optimised in this patch series.

Suggested-by: Matthieu Baerts <matttbe@kernel.org>
Signed-off-by: Geliang Tang <tanggeliang@kylinos.cn>
Reviewed-by: Mat Martineau <martineau@kernel.org>
Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Link: https://patch.msgid.link/20260130-net-next-mptcp-splice-v2-6-31332ba70d7f@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/mptcp/Makefile                | 1 +
 tools/testing/selftests/net/mptcp/mptcp_connect_splice.sh | 5 +++++
 2 files changed, 6 insertions(+)
 create mode 100755 tools/testing/selftests/net/mptcp/mptcp_connect_splice.sh

(limited to 'tools')

diff --git a/tools/testing/selftests/net/mptcp/Makefile b/tools/testing/selftests/net/mptcp/Makefile
index 4dd6278cd3dd..22ba0da2adb8 100644
--- a/tools/testing/selftests/net/mptcp/Makefile
+++ b/tools/testing/selftests/net/mptcp/Makefile
@@ -11,6 +11,7 @@ TEST_PROGS := \
 	mptcp_connect_checksum.sh \
 	mptcp_connect_mmap.sh \
 	mptcp_connect_sendfile.sh \
+	mptcp_connect_splice.sh \
 	mptcp_join.sh \
 	mptcp_sockopt.sh \
 	pm_netlink.sh \
diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect_splice.sh b/tools/testing/selftests/net/mptcp/mptcp_connect_splice.sh
new file mode 100755
index 000000000000..241254a966c9
--- /dev/null
+++ b/tools/testing/selftests/net/mptcp/mptcp_connect_splice.sh
@@ -0,0 +1,5 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+MPTCP_LIB_KSFT_TEST="$(basename "${0}" .sh)" \
+	"$(dirname "${0}")/mptcp_connect.sh" -m splice "${@}"
-- 
cgit v1.2.3


From be621a76341caa911ff98175114ff072618d7d4a Mon Sep 17 00:00:00 2001
From: Andrea Righi <arighi@nvidia.com>
Date: Mon, 26 Jan 2026 10:59:04 +0100
Subject: selftests/sched_ext: Add test for sched_ext dl_server

Add a selftest to validate the correct behavior of the deadline server
for the ext_sched_class.

Co-developed-by: Joel Fernandes <joelagnelf@nvidia.com>
Signed-off-by: Joel Fernandes <joelagnelf@nvidia.com>
Signed-off-by: Andrea Righi <arighi@nvidia.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Emil Tsalapatis <emil@etsalapatis.com>
Tested-by: Christian Loehle <christian.loehle@arm.com>
Link: https://patch.msgid.link/20260126100050.3854740-7-arighi@nvidia.com
---
 tools/testing/selftests/sched_ext/Makefile       |   1 +
 tools/testing/selftests/sched_ext/rt_stall.bpf.c |  23 +++
 tools/testing/selftests/sched_ext/rt_stall.c     | 240 +++++++++++++++++++++++
 3 files changed, 264 insertions(+)
 create mode 100644 tools/testing/selftests/sched_ext/rt_stall.bpf.c
 create mode 100644 tools/testing/selftests/sched_ext/rt_stall.c

(limited to 'tools')

diff --git a/tools/testing/selftests/sched_ext/Makefile b/tools/testing/selftests/sched_ext/Makefile
index 5fe45f9c5f8f..c9255d1499b6 100644
--- a/tools/testing/selftests/sched_ext/Makefile
+++ b/tools/testing/selftests/sched_ext/Makefile
@@ -183,6 +183,7 @@ auto-test-targets :=			\
 	select_cpu_dispatch_bad_dsq	\
 	select_cpu_dispatch_dbl_dsp	\
 	select_cpu_vtime		\
+	rt_stall			\
 	test_example			\
 
 testcase-targets := $(addsuffix .o,$(addprefix $(SCXOBJ_DIR)/,$(auto-test-targets)))
diff --git a/tools/testing/selftests/sched_ext/rt_stall.bpf.c b/tools/testing/selftests/sched_ext/rt_stall.bpf.c
new file mode 100644
index 000000000000..80086779dd1e
--- /dev/null
+++ b/tools/testing/selftests/sched_ext/rt_stall.bpf.c
@@ -0,0 +1,23 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * A scheduler that verified if RT tasks can stall SCHED_EXT tasks.
+ *
+ * Copyright (c) 2025 NVIDIA Corporation.
+ */
+
+#include <scx/common.bpf.h>
+
+char _license[] SEC("license") = "GPL";
+
+UEI_DEFINE(uei);
+
+void BPF_STRUCT_OPS(rt_stall_exit, struct scx_exit_info *ei)
+{
+	UEI_RECORD(uei, ei);
+}
+
+SEC(".struct_ops.link")
+struct sched_ext_ops rt_stall_ops = {
+	.exit			= (void *)rt_stall_exit,
+	.name			= "rt_stall",
+};
diff --git a/tools/testing/selftests/sched_ext/rt_stall.c b/tools/testing/selftests/sched_ext/rt_stall.c
new file mode 100644
index 000000000000..015200f80f6e
--- /dev/null
+++ b/tools/testing/selftests/sched_ext/rt_stall.c
@@ -0,0 +1,240 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2025 NVIDIA Corporation.
+ */
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sched.h>
+#include <sys/prctl.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <time.h>
+#include <linux/sched.h>
+#include <signal.h>
+#include <bpf/bpf.h>
+#include <scx/common.h>
+#include <unistd.h>
+#include "rt_stall.bpf.skel.h"
+#include "scx_test.h"
+#include "../kselftest.h"
+
+#define CORE_ID		0	/* CPU to pin tasks to */
+#define RUN_TIME        5	/* How long to run the test in seconds */
+
+/* Simple busy-wait function for test tasks */
+static void process_func(void)
+{
+	while (1) {
+		/* Busy wait */
+		for (volatile unsigned long i = 0; i < 10000000UL; i++)
+			;
+	}
+}
+
+/* Set CPU affinity to a specific core */
+static void set_affinity(int cpu)
+{
+	cpu_set_t mask;
+
+	CPU_ZERO(&mask);
+	CPU_SET(cpu, &mask);
+	if (sched_setaffinity(0, sizeof(mask), &mask) != 0) {
+		perror("sched_setaffinity");
+		exit(EXIT_FAILURE);
+	}
+}
+
+/* Set task scheduling policy and priority */
+static void set_sched(int policy, int priority)
+{
+	struct sched_param param;
+
+	param.sched_priority = priority;
+	if (sched_setscheduler(0, policy, &param) != 0) {
+		perror("sched_setscheduler");
+		exit(EXIT_FAILURE);
+	}
+}
+
+/* Get process runtime from /proc/<pid>/stat */
+static float get_process_runtime(int pid)
+{
+	char path[256];
+	FILE *file;
+	long utime, stime;
+	int fields;
+
+	snprintf(path, sizeof(path), "/proc/%d/stat", pid);
+	file = fopen(path, "r");
+	if (file == NULL) {
+		perror("Failed to open stat file");
+		return -1;
+	}
+
+	/* Skip the first 13 fields and read the 14th and 15th */
+	fields = fscanf(file,
+			"%*d %*s %*c %*d %*d %*d %*d %*d %*u %*u %*u %*u %*u %lu %lu",
+			&utime, &stime);
+	fclose(file);
+
+	if (fields != 2) {
+		fprintf(stderr, "Failed to read stat file\n");
+		return -1;
+	}
+
+	/* Calculate the total time spent in the process */
+	long total_time = utime + stime;
+	long ticks_per_second = sysconf(_SC_CLK_TCK);
+	float runtime_seconds = total_time * 1.0 / ticks_per_second;
+
+	return runtime_seconds;
+}
+
+static enum scx_test_status setup(void **ctx)
+{
+	struct rt_stall *skel;
+
+	skel = rt_stall__open();
+	SCX_FAIL_IF(!skel, "Failed to open");
+	SCX_ENUM_INIT(skel);
+	SCX_FAIL_IF(rt_stall__load(skel), "Failed to load skel");
+
+	*ctx = skel;
+
+	return SCX_TEST_PASS;
+}
+
+static bool sched_stress_test(bool is_ext)
+{
+	/*
+	 * We're expecting the EXT task to get around 5% of CPU time when
+	 * competing with the RT task (small 1% fluctuations are expected).
+	 *
+	 * However, the EXT task should get at least 4% of the CPU to prove
+	 * that the EXT deadline server is working correctly. A percentage
+	 * less than 4% indicates a bug where RT tasks can potentially
+	 * stall SCHED_EXT tasks, causing the test to fail.
+	 */
+	const float expected_min_ratio = 0.04; /* 4% */
+	const char *class_str = is_ext ? "EXT" : "FAIR";
+
+	float ext_runtime, rt_runtime, actual_ratio;
+	int ext_pid, rt_pid;
+
+	ksft_print_header();
+	ksft_set_plan(1);
+
+	/* Create and set up a EXT task */
+	ext_pid = fork();
+	if (ext_pid == 0) {
+		set_affinity(CORE_ID);
+		process_func();
+		exit(0);
+	} else if (ext_pid < 0) {
+		perror("fork task");
+		ksft_exit_fail();
+	}
+
+	/* Create an RT task */
+	rt_pid = fork();
+	if (rt_pid == 0) {
+		set_affinity(CORE_ID);
+		set_sched(SCHED_FIFO, 50);
+		process_func();
+		exit(0);
+	} else if (rt_pid < 0) {
+		perror("fork for RT task");
+		ksft_exit_fail();
+	}
+
+	/* Let the processes run for the specified time */
+	sleep(RUN_TIME);
+
+	/* Get runtime for the EXT task */
+	ext_runtime = get_process_runtime(ext_pid);
+	if (ext_runtime == -1)
+		ksft_exit_fail_msg("Error getting runtime for %s task (PID %d)\n",
+				   class_str, ext_pid);
+	ksft_print_msg("Runtime of %s task (PID %d) is %f seconds\n",
+		       class_str, ext_pid, ext_runtime);
+
+	/* Get runtime for the RT task */
+	rt_runtime = get_process_runtime(rt_pid);
+	if (rt_runtime == -1)
+		ksft_exit_fail_msg("Error getting runtime for RT task (PID %d)\n", rt_pid);
+	ksft_print_msg("Runtime of RT task (PID %d) is %f seconds\n", rt_pid, rt_runtime);
+
+	/* Kill the processes */
+	kill(ext_pid, SIGKILL);
+	kill(rt_pid, SIGKILL);
+	waitpid(ext_pid, NULL, 0);
+	waitpid(rt_pid, NULL, 0);
+
+	/* Verify that the scx task got enough runtime */
+	actual_ratio = ext_runtime / (ext_runtime + rt_runtime);
+	ksft_print_msg("%s task got %.2f%% of total runtime\n",
+		       class_str, actual_ratio * 100);
+
+	if (actual_ratio >= expected_min_ratio) {
+		ksft_test_result_pass("PASS: %s task got more than %.2f%% of runtime\n",
+				      class_str, expected_min_ratio * 100);
+		return true;
+	}
+	ksft_test_result_fail("FAIL: %s task got less than %.2f%% of runtime\n",
+			      class_str, expected_min_ratio * 100);
+	return false;
+}
+
+static enum scx_test_status run(void *ctx)
+{
+	struct rt_stall *skel = ctx;
+	struct bpf_link *link = NULL;
+	bool res;
+	int i;
+
+	/*
+	 * Test if the dl_server is working both with and without the
+	 * sched_ext scheduler attached.
+	 *
+	 * This ensures all the scenarios are covered:
+	 *   - fair_server stop -> ext_server start
+	 *   - ext_server stop -> fair_server stop
+	 */
+	for (i = 0; i < 4; i++) {
+		bool is_ext = i % 2;
+
+		if (is_ext) {
+			memset(&skel->data->uei, 0, sizeof(skel->data->uei));
+			link = bpf_map__attach_struct_ops(skel->maps.rt_stall_ops);
+			SCX_FAIL_IF(!link, "Failed to attach scheduler");
+		}
+		res = sched_stress_test(is_ext);
+		if (is_ext) {
+			SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_NONE));
+			bpf_link__destroy(link);
+		}
+
+		if (!res)
+			ksft_exit_fail();
+	}
+
+	return SCX_TEST_PASS;
+}
+
+static void cleanup(void *ctx)
+{
+	struct rt_stall *skel = ctx;
+
+	rt_stall__destroy(skel);
+}
+
+struct scx_test rt_stall = {
+	.name = "rt_stall",
+	.description = "Verify that RT tasks cannot stall SCHED_EXT tasks",
+	.setup = setup,
+	.run = run,
+	.cleanup = cleanup,
+};
+REGISTER_SCX_TEST(&rt_stall)
-- 
cgit v1.2.3


From dd6a37e8faa723c680cb8615efa5b042691b927f Mon Sep 17 00:00:00 2001
From: Joel Fernandes <joelagnelf@nvidia.com>
Date: Mon, 26 Jan 2026 10:59:05 +0100
Subject: selftests/sched_ext: Add test for DL server total_bw consistency

Add a new kselftest to verify that the total_bw value in
/sys/kernel/debug/sched/debug remains consistent across all CPUs
under different sched_ext BPF program states:

1. Before a BPF scheduler is loaded
2. While a BPF scheduler is loaded and active
3. After a BPF scheduler is unloaded

The test runs CPU stress threads to ensure DL server bandwidth
values stabilize before checking consistency. This helps catch
potential issues with DL server bandwidth accounting during
sched_ext transitions.

Co-developed-by: Andrea Righi <arighi@nvidia.com>
Signed-off-by: Andrea Righi <arighi@nvidia.com>
Signed-off-by: Joel Fernandes <joelagnelf@nvidia.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Christian Loehle <christian.loehle@arm.com>
Link: https://patch.msgid.link/20260126100050.3854740-8-arighi@nvidia.com
---
 tools/testing/selftests/sched_ext/Makefile   |   1 +
 tools/testing/selftests/sched_ext/total_bw.c | 281 +++++++++++++++++++++++++++
 2 files changed, 282 insertions(+)
 create mode 100644 tools/testing/selftests/sched_ext/total_bw.c

(limited to 'tools')

diff --git a/tools/testing/selftests/sched_ext/Makefile b/tools/testing/selftests/sched_ext/Makefile
index c9255d1499b6..2c601a7eaff5 100644
--- a/tools/testing/selftests/sched_ext/Makefile
+++ b/tools/testing/selftests/sched_ext/Makefile
@@ -185,6 +185,7 @@ auto-test-targets :=			\
 	select_cpu_vtime		\
 	rt_stall			\
 	test_example			\
+	total_bw			\
 
 testcase-targets := $(addsuffix .o,$(addprefix $(SCXOBJ_DIR)/,$(auto-test-targets)))
 
diff --git a/tools/testing/selftests/sched_ext/total_bw.c b/tools/testing/selftests/sched_ext/total_bw.c
new file mode 100644
index 000000000000..5b0a619bab86
--- /dev/null
+++ b/tools/testing/selftests/sched_ext/total_bw.c
@@ -0,0 +1,281 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Test to verify that total_bw value remains consistent across all CPUs
+ * in different BPF program states.
+ *
+ * Copyright (C) 2025 NVIDIA Corporation.
+ */
+#include <bpf/bpf.h>
+#include <errno.h>
+#include <pthread.h>
+#include <scx/common.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include "minimal.bpf.skel.h"
+#include "scx_test.h"
+
+#define MAX_CPUS 512
+#define STRESS_DURATION_SEC 5
+
+struct total_bw_ctx {
+	struct minimal *skel;
+	long baseline_bw[MAX_CPUS];
+	int nr_cpus;
+};
+
+static void *cpu_stress_thread(void *arg)
+{
+	volatile int i;
+	time_t end_time = time(NULL) + STRESS_DURATION_SEC;
+
+	while (time(NULL) < end_time)
+		for (i = 0; i < 1000000; i++)
+			;
+
+	return NULL;
+}
+
+/*
+ * The first enqueue on a CPU causes the DL server to start, for that
+ * reason run stressor threads in the hopes it schedules on all CPUs.
+ */
+static int run_cpu_stress(int nr_cpus)
+{
+	pthread_t *threads;
+	int i, ret = 0;
+
+	threads = calloc(nr_cpus, sizeof(pthread_t));
+	if (!threads)
+		return -ENOMEM;
+
+	/* Create threads to run on each CPU */
+	for (i = 0; i < nr_cpus; i++) {
+		if (pthread_create(&threads[i], NULL, cpu_stress_thread, NULL)) {
+			ret = -errno;
+			fprintf(stderr, "Failed to create thread %d: %s\n", i, strerror(-ret));
+			break;
+		}
+	}
+
+	/* Wait for all threads to complete */
+	for (i = 0; i < nr_cpus; i++) {
+		if (threads[i])
+			pthread_join(threads[i], NULL);
+	}
+
+	free(threads);
+	return ret;
+}
+
+static int read_total_bw_values(long *bw_values, int max_cpus)
+{
+	FILE *fp;
+	char line[256];
+	int cpu_count = 0;
+
+	fp = fopen("/sys/kernel/debug/sched/debug", "r");
+	if (!fp) {
+		SCX_ERR("Failed to open debug file");
+		return -1;
+	}
+
+	while (fgets(line, sizeof(line), fp)) {
+		char *bw_str = strstr(line, "total_bw");
+
+		if (bw_str) {
+			bw_str = strchr(bw_str, ':');
+			if (bw_str) {
+				/* Only store up to max_cpus values */
+				if (cpu_count < max_cpus)
+					bw_values[cpu_count] = atol(bw_str + 1);
+				cpu_count++;
+			}
+		}
+	}
+
+	fclose(fp);
+	return cpu_count;
+}
+
+static bool verify_total_bw_consistency(long *bw_values, int count)
+{
+	int i;
+	long first_value;
+
+	if (count <= 0)
+		return false;
+
+	first_value = bw_values[0];
+
+	for (i = 1; i < count; i++) {
+		if (bw_values[i] != first_value) {
+			SCX_ERR("Inconsistent total_bw: CPU0=%ld, CPU%d=%ld",
+				first_value, i, bw_values[i]);
+			return false;
+		}
+	}
+
+	return true;
+}
+
+static int fetch_verify_total_bw(long *bw_values, int nr_cpus)
+{
+	int attempts = 0;
+	int max_attempts = 10;
+	int count;
+
+	/*
+	 * The first enqueue on a CPU causes the DL server to start, for that
+	 * reason run stressor threads in the hopes it schedules on all CPUs.
+	 */
+	if (run_cpu_stress(nr_cpus) < 0) {
+		SCX_ERR("Failed to run CPU stress");
+		return -1;
+	}
+
+	/* Try multiple times to get stable values */
+	while (attempts < max_attempts) {
+		count = read_total_bw_values(bw_values, nr_cpus);
+		fprintf(stderr, "Read %d total_bw values (testing %d CPUs)\n", count, nr_cpus);
+		/* If system has more CPUs than we're testing, that's OK */
+		if (count < nr_cpus) {
+			SCX_ERR("Expected at least %d CPUs, got %d", nr_cpus, count);
+			attempts++;
+			sleep(1);
+			continue;
+		}
+
+		/* Only verify the CPUs we're testing */
+		if (verify_total_bw_consistency(bw_values, nr_cpus)) {
+			fprintf(stderr, "Values are consistent: %ld\n", bw_values[0]);
+			return 0;
+		}
+
+		attempts++;
+		sleep(1);
+	}
+
+	return -1;
+}
+
+static enum scx_test_status setup(void **ctx)
+{
+	struct total_bw_ctx *test_ctx;
+
+	if (access("/sys/kernel/debug/sched/debug", R_OK) != 0) {
+		fprintf(stderr, "Skipping test: debugfs sched/debug not accessible\n");
+		return SCX_TEST_SKIP;
+	}
+
+	test_ctx = calloc(1, sizeof(*test_ctx));
+	if (!test_ctx)
+		return SCX_TEST_FAIL;
+
+	test_ctx->nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
+	if (test_ctx->nr_cpus <= 0) {
+		free(test_ctx);
+		return SCX_TEST_FAIL;
+	}
+
+	/* If system has more CPUs than MAX_CPUS, just test the first MAX_CPUS */
+	if (test_ctx->nr_cpus > MAX_CPUS)
+		test_ctx->nr_cpus = MAX_CPUS;
+
+	/* Test scenario 1: BPF program not loaded */
+	/* Read and verify baseline total_bw before loading BPF program */
+	fprintf(stderr, "BPF prog initially not loaded, reading total_bw values\n");
+	if (fetch_verify_total_bw(test_ctx->baseline_bw, test_ctx->nr_cpus) < 0) {
+		SCX_ERR("Failed to get stable baseline values");
+		free(test_ctx);
+		return SCX_TEST_FAIL;
+	}
+
+	/* Load the BPF skeleton */
+	test_ctx->skel = minimal__open();
+	if (!test_ctx->skel) {
+		free(test_ctx);
+		return SCX_TEST_FAIL;
+	}
+
+	SCX_ENUM_INIT(test_ctx->skel);
+	if (minimal__load(test_ctx->skel)) {
+		minimal__destroy(test_ctx->skel);
+		free(test_ctx);
+		return SCX_TEST_FAIL;
+	}
+
+	*ctx = test_ctx;
+	return SCX_TEST_PASS;
+}
+
+static enum scx_test_status run(void *ctx)
+{
+	struct total_bw_ctx *test_ctx = ctx;
+	struct bpf_link *link;
+	long loaded_bw[MAX_CPUS];
+	long unloaded_bw[MAX_CPUS];
+	int i;
+
+	/* Test scenario 2: BPF program loaded */
+	link = bpf_map__attach_struct_ops(test_ctx->skel->maps.minimal_ops);
+	if (!link) {
+		SCX_ERR("Failed to attach scheduler");
+		return SCX_TEST_FAIL;
+	}
+
+	fprintf(stderr, "BPF program loaded, reading total_bw values\n");
+	if (fetch_verify_total_bw(loaded_bw, test_ctx->nr_cpus) < 0) {
+		SCX_ERR("Failed to get stable values with BPF loaded");
+		bpf_link__destroy(link);
+		return SCX_TEST_FAIL;
+	}
+	bpf_link__destroy(link);
+
+	/* Test scenario 3: BPF program unloaded */
+	fprintf(stderr, "BPF program unloaded, reading total_bw values\n");
+	if (fetch_verify_total_bw(unloaded_bw, test_ctx->nr_cpus) < 0) {
+		SCX_ERR("Failed to get stable values after BPF unload");
+		return SCX_TEST_FAIL;
+	}
+
+	/* Verify all three scenarios have the same total_bw values */
+	for (i = 0; i < test_ctx->nr_cpus; i++) {
+		if (test_ctx->baseline_bw[i] != loaded_bw[i]) {
+			SCX_ERR("CPU%d: baseline_bw=%ld != loaded_bw=%ld",
+				i, test_ctx->baseline_bw[i], loaded_bw[i]);
+			return SCX_TEST_FAIL;
+		}
+
+		if (test_ctx->baseline_bw[i] != unloaded_bw[i]) {
+			SCX_ERR("CPU%d: baseline_bw=%ld != unloaded_bw=%ld",
+				i, test_ctx->baseline_bw[i], unloaded_bw[i]);
+			return SCX_TEST_FAIL;
+		}
+	}
+
+	fprintf(stderr, "All total_bw values are consistent across all scenarios\n");
+	return SCX_TEST_PASS;
+}
+
+static void cleanup(void *ctx)
+{
+	struct total_bw_ctx *test_ctx = ctx;
+
+	if (test_ctx) {
+		if (test_ctx->skel)
+			minimal__destroy(test_ctx->skel);
+		free(test_ctx);
+	}
+}
+
+struct scx_test total_bw = {
+	.name = "total_bw",
+	.description = "Verify total_bw consistency across BPF program states",
+	.setup = setup,
+	.run = run,
+	.cleanup = cleanup,
+};
+REGISTER_SCX_TEST(&total_bw)
-- 
cgit v1.2.3


From 6f74bc8b6e8d0e8218c1342682dadb156603d13e Mon Sep 17 00:00:00 2001
From: Chia-Yu Chang <chia-yu.chang@nokia-bell-labs.com>
Date: Sat, 31 Jan 2026 23:25:03 +0100
Subject: selftests/net: gro: add self-test for TCP CWR flag

Currently, GRO does not flush packets when the CWR bit is set.
A corresponding self-test is being added, in which the CWR flag
is set for two consecutive packets, but the first packet with the
CWR flag set will not be flushed immediately.

+===================+==========+===============+===========+
|     Packet id     | CWR flag |    Payload    | Flushing? |
+===================+==========+===============+===========+
|         0         |     0    |  PAYLOAD_LEN  |     0     |
|        ...        |     0    |  PAYLOAD_LEN  |     1     |
+-------------------+----------+---------------+-----------+
| NUM_PACKETS/2 - 1 |     1    |  payload_len  |     0     |
|   NUM_PACKETS/2   |     1    |  payload_len  |     1     |
+-------------------+----------+---------------+-----------+
|        ...        |     0    |  PAYLOAD_LEN  |     0     |
|   NUM_PACKETS     |     0    |  PAYLOAD_LEN  |     1     |
+===================+==========+===============+===========+

Signed-off-by: Chia-Yu Chang <chia-yu.chang@nokia-bell-labs.com>
Acked-by: Paolo Abeni <pabeni@redhat.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20260131222515.8485-4-chia-yu.chang@nokia-bell-labs.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 tools/testing/selftests/drivers/net/gro.c  | 81 +++++++++++++++++++++---------
 tools/testing/selftests/drivers/net/gro.py |  3 +-
 2 files changed, 60 insertions(+), 24 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/drivers/net/gro.c b/tools/testing/selftests/drivers/net/gro.c
index e76c618704cf..3c0745b68bfa 100644
--- a/tools/testing/selftests/drivers/net/gro.c
+++ b/tools/testing/selftests/drivers/net/gro.c
@@ -17,8 +17,8 @@
  *  Pure ACK does not coalesce.
  *
  * flags_*:
- *  No packets with PSH, SYN, URG, RST set will be coalesced.
- *   - flags_psh, flags_syn, flags_rst, flags_urg
+ *  No packets with PSH, SYN, URG, RST, CWR set will be coalesced.
+ *   - flags_psh, flags_syn, flags_rst, flags_urg, flags_cwr
  *
  * tcp_*:
  *  Packets with incorrect checksum, non-consecutive seqno and
@@ -360,32 +360,58 @@ static void create_packet(void *buf, int seq_offset, int ack_offset,
 	fill_datalinklayer(buf);
 }
 
-/* send one extra flag, not first and not last pkt */
-static void send_flags(int fd, struct sockaddr_ll *daddr, int psh, int syn,
-		       int rst, int urg)
+#ifndef TH_CWR
+#define TH_CWR 0x80
+#endif
+static void set_flags(struct tcphdr *tcph, int payload_len, int psh, int syn,
+		      int rst, int urg, int cwr)
 {
-	static char flag_buf[MAX_HDR_LEN + PAYLOAD_LEN];
-	static char buf[MAX_HDR_LEN + PAYLOAD_LEN];
-	int payload_len, pkt_size, flag, i;
-	struct tcphdr *tcph;
-
-	payload_len = PAYLOAD_LEN * psh;
-	pkt_size = total_hdr_len + payload_len;
-	flag = NUM_PACKETS / 2;
-
-	create_packet(flag_buf, flag * payload_len, 0, payload_len, 0);
-
-	tcph = (struct tcphdr *)(flag_buf + tcp_offset);
 	tcph->psh = psh;
 	tcph->syn = syn;
 	tcph->rst = rst;
 	tcph->urg = urg;
+	if (cwr)
+		tcph->th_flags |= TH_CWR;
+	else
+		tcph->th_flags &= ~TH_CWR;
 	tcph->check = 0;
 	tcph->check = tcp_checksum(tcph, payload_len);
+}
+
+/* send extra flags of the (NUM_PACKETS / 2) and (NUM_PACKETS / 2 - 1)
+ * pkts, not first and not last pkt
+ */
+static void send_flags(int fd, struct sockaddr_ll *daddr, int psh, int syn,
+		       int rst, int urg, int cwr)
+{
+	static char flag_buf[2][MAX_HDR_LEN + PAYLOAD_LEN];
+	static char buf[MAX_HDR_LEN + PAYLOAD_LEN];
+	int payload_len, pkt_size, i;
+	struct tcphdr *tcph;
+	int flag[2];
+
+	payload_len = PAYLOAD_LEN * (psh || cwr);
+	pkt_size = total_hdr_len + payload_len;
+	flag[0] = NUM_PACKETS / 2;
+	flag[1] = NUM_PACKETS / 2 - 1;
+
+	/* Create and configure packets with flags
+	 */
+	for (i = 0; i < 2; i++) {
+		if (flag[i] > 0) {
+			create_packet(flag_buf[i], flag[i] * payload_len, 0,
+				      payload_len, 0);
+			tcph = (struct tcphdr *)(flag_buf[i] + tcp_offset);
+			set_flags(tcph, payload_len, psh, syn, rst, urg, cwr);
+		}
+	}
 
 	for (i = 0; i < NUM_PACKETS + 1; i++) {
-		if (i == flag) {
-			write_packet(fd, flag_buf, pkt_size, daddr);
+		if (i == flag[0]) {
+			write_packet(fd, flag_buf[0], pkt_size, daddr);
+			continue;
+		} else if (i == flag[1] && cwr) {
+			write_packet(fd, flag_buf[1], pkt_size, daddr);
 			continue;
 		}
 		create_packet(buf, i * PAYLOAD_LEN, 0, PAYLOAD_LEN, 0);
@@ -1068,16 +1094,19 @@ static void gro_sender(void)
 
 	/* flags sub-tests */
 	} else if (strcmp(testname, "flags_psh") == 0) {
-		send_flags(txfd, &daddr, 1, 0, 0, 0);
+		send_flags(txfd, &daddr, 1, 0, 0, 0, 0);
 		write_packet(txfd, fin_pkt, total_hdr_len, &daddr);
 	} else if (strcmp(testname, "flags_syn") == 0) {
-		send_flags(txfd, &daddr, 0, 1, 0, 0);
+		send_flags(txfd, &daddr, 0, 1, 0, 0, 0);
 		write_packet(txfd, fin_pkt, total_hdr_len, &daddr);
 	} else if (strcmp(testname, "flags_rst") == 0) {
-		send_flags(txfd, &daddr, 0, 0, 1, 0);
+		send_flags(txfd, &daddr, 0, 0, 1, 0, 0);
 		write_packet(txfd, fin_pkt, total_hdr_len, &daddr);
 	} else if (strcmp(testname, "flags_urg") == 0) {
-		send_flags(txfd, &daddr, 0, 0, 0, 1);
+		send_flags(txfd, &daddr, 0, 0, 0, 1, 0);
+		write_packet(txfd, fin_pkt, total_hdr_len, &daddr);
+	} else if (strcmp(testname, "flags_cwr") == 0) {
+		send_flags(txfd, &daddr, 0, 0, 0, 0, 1);
 		write_packet(txfd, fin_pkt, total_hdr_len, &daddr);
 
 	/* tcp sub-tests */
@@ -1239,6 +1268,12 @@ static void gro_receiver(void)
 		correct_payload[2] = PAYLOAD_LEN * 2;
 		printf("urg flag ends coalescing: ");
 		check_recv_pkts(rxfd, correct_payload, 3);
+	} else if (strcmp(testname, "flags_cwr") == 0) {
+		correct_payload[0] = PAYLOAD_LEN;
+		correct_payload[1] = PAYLOAD_LEN * 2;
+		correct_payload[2] = PAYLOAD_LEN * 2;
+		printf("cwr flag ends coalescing: ");
+		check_recv_pkts(rxfd, correct_payload, 3);
 
 	/* tcp sub-tests */
 	} else if (strcmp(testname, "tcp_csum") == 0) {
diff --git a/tools/testing/selftests/drivers/net/gro.py b/tools/testing/selftests/drivers/net/gro.py
index 1bb8af571456..cbc1b19dbc91 100755
--- a/tools/testing/selftests/drivers/net/gro.py
+++ b/tools/testing/selftests/drivers/net/gro.py
@@ -17,6 +17,7 @@ Test cases:
   - flags_syn: Packets with SYN flag don't coalesce
   - flags_rst: Packets with RST flag don't coalesce
   - flags_urg: Packets with URG flag don't coalesce
+  - flags_cwr: Packets with CWR flag don't coalesce
   - tcp_csum: Packets with incorrect checksum don't coalesce
   - tcp_seq: Packets with non-consecutive seqno don't coalesce
   - tcp_ts: Packets with different timestamp options don't coalesce
@@ -191,7 +192,7 @@ def _gro_variants():
     common_tests = [
         "data_same", "data_lrg_sml", "data_sml_lrg",
         "ack",
-        "flags_psh", "flags_syn", "flags_rst", "flags_urg",
+        "flags_psh", "flags_syn", "flags_rst", "flags_urg", "flags_cwr",
         "tcp_csum", "tcp_seq", "tcp_ts", "tcp_opt",
         "ip_ecn", "ip_tos",
         "large_max", "large_rem",
-- 
cgit v1.2.3


From f85d9c45f1d48a146f37cfd3d244aac4157ea390 Mon Sep 17 00:00:00 2001
From: Chia-Yu Chang <chia-yu.chang@nokia-bell-labs.com>
Date: Sat, 31 Jan 2026 23:25:15 +0100
Subject: selftests/net: packetdrill: add TCP Accurate ECN cases
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Linux Accurate ECN test sets using ACE counters and AccECN options to
cover several scenarios: Connection teardown, different ACK conditions,
counter wrapping, SACK space grabbing, fallback schemes, negotiation
retransmission/reorder/loss, AccECN option drop/loss, different
handshake reflectors, data with marking, and different sysctl values.

The packetdrill used is commit cbe405666c9c8698ac1e72f5e8ffc551216dfa56
of repo: https://github.com/minuscat/packetdrill/tree/upstream_accecn.
And corresponding patches are sent to google/packetdrill email list.

Signed-off-by: Chia-Yu Chang <chia-yu.chang@nokia-bell-labs.com>
Co-developed-by: Ilpo Järvinen <ij@kernel.org>
Signed-off-by: Ilpo Järvinen <ij@kernel.org>
Co-developed-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20260131222515.8485-16-chia-yu.chang@nokia-bell-labs.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 .../packetdrill/tcp_accecn_2nd_data_as_first.pkt   | 24 ++++++++
 .../tcp_accecn_2nd_data_as_first_connect.pkt       | 30 ++++++++++
 .../tcp_accecn_3rd_ack_after_synack_rxmt.pkt       | 19 ++++++
 .../tcp_accecn_3rd_ack_ce_updates_received_ce.pkt  | 18 ++++++
 .../tcp_accecn_3rd_ack_lost_data_ce.pkt            | 22 +++++++
 .../net/packetdrill/tcp_accecn_3rd_dups.pkt        | 26 ++++++++
 .../packetdrill/tcp_accecn_acc_ecn_disabled.pkt    | 13 ++++
 .../tcp_accecn_accecn_then_notecn_syn.pkt          | 28 +++++++++
 .../packetdrill/tcp_accecn_accecn_to_rfc3168.pkt   | 18 ++++++
 .../tcp_accecn_client_accecn_options_drop.pkt      | 34 +++++++++++
 .../tcp_accecn_client_accecn_options_lost.pkt      | 38 ++++++++++++
 .../packetdrill/tcp_accecn_clientside_disabled.pkt | 12 ++++
 ...cp_accecn_close_local_close_then_remote_fin.pkt | 25 ++++++++
 .../tcp_accecn_delivered_2ndlargeack.pkt           | 25 ++++++++
 .../tcp_accecn_delivered_falseoverflow_detect.pkt  | 31 ++++++++++
 .../packetdrill/tcp_accecn_delivered_largeack.pkt  | 24 ++++++++
 .../packetdrill/tcp_accecn_delivered_largeack2.pkt | 25 ++++++++
 .../packetdrill/tcp_accecn_delivered_maxack.pkt    | 25 ++++++++
 .../packetdrill/tcp_accecn_delivered_updates.pkt   | 70 ++++++++++++++++++++++
 .../selftests/net/packetdrill/tcp_accecn_ecn3.pkt  | 12 ++++
 .../tcp_accecn_ecn_field_updates_opt.pkt           | 35 +++++++++++
 .../net/packetdrill/tcp_accecn_ipflags_drop.pkt    | 14 +++++
 .../net/packetdrill/tcp_accecn_listen_opt_drop.pkt | 16 +++++
 .../tcp_accecn_multiple_syn_ack_drop.pkt           | 28 +++++++++
 .../packetdrill/tcp_accecn_multiple_syn_drop.pkt   | 18 ++++++
 .../packetdrill/tcp_accecn_negotiation_bleach.pkt  | 23 +++++++
 .../packetdrill/tcp_accecn_negotiation_connect.pkt | 23 +++++++
 .../packetdrill/tcp_accecn_negotiation_listen.pkt  | 26 ++++++++
 .../tcp_accecn_negotiation_noopt_connect.pkt       | 23 +++++++
 .../tcp_accecn_negotiation_optenable.pkt           | 23 +++++++
 .../packetdrill/tcp_accecn_no_ecn_after_accecn.pkt | 20 +++++++
 .../selftests/net/packetdrill/tcp_accecn_noopt.pkt | 27 +++++++++
 .../net/packetdrill/tcp_accecn_noprogress.pkt      | 27 +++++++++
 .../tcp_accecn_notecn_then_accecn_syn.pkt          | 28 +++++++++
 .../packetdrill/tcp_accecn_rfc3168_to_fallback.pkt | 18 ++++++
 .../packetdrill/tcp_accecn_rfc3168_to_rfc3168.pkt  | 18 ++++++
 .../net/packetdrill/tcp_accecn_sack_space_grab.pkt | 28 +++++++++
 .../tcp_accecn_sack_space_grab_with_ts.pkt         | 39 ++++++++++++
 .../tcp_accecn_serverside_accecn_disabled1.pkt     | 20 +++++++
 .../tcp_accecn_serverside_accecn_disabled2.pkt     | 20 +++++++
 .../packetdrill/tcp_accecn_serverside_broken.pkt   | 19 ++++++
 .../tcp_accecn_serverside_ecn_disabled.pkt         | 19 ++++++
 .../net/packetdrill/tcp_accecn_serverside_only.pkt | 18 ++++++
 ...accecn_syn_ace_flags_acked_after_retransmit.pkt | 18 ++++++
 .../packetdrill/tcp_accecn_syn_ace_flags_drop.pkt  | 16 +++++
 ...cn_syn_ack_ace_flags_acked_after_retransmit.pkt | 27 +++++++++
 .../tcp_accecn_syn_ack_ace_flags_drop.pkt          | 26 ++++++++
 .../net/packetdrill/tcp_accecn_syn_ce.pkt          | 13 ++++
 .../net/packetdrill/tcp_accecn_syn_ect0.pkt        | 13 ++++
 .../net/packetdrill/tcp_accecn_syn_ect1.pkt        | 13 ++++
 .../net/packetdrill/tcp_accecn_synack_ce.pkt       | 27 +++++++++
 .../tcp_accecn_synack_ce_updates_delivered_ce.pkt  | 22 +++++++
 .../net/packetdrill/tcp_accecn_synack_ect0.pkt     | 24 ++++++++
 .../net/packetdrill/tcp_accecn_synack_ect1.pkt     | 24 ++++++++
 .../net/packetdrill/tcp_accecn_synack_rexmit.pkt   | 15 +++++
 .../net/packetdrill/tcp_accecn_synack_rxmt.pkt     | 25 ++++++++
 .../net/packetdrill/tcp_accecn_tsnoprogress.pkt    | 26 ++++++++
 .../net/packetdrill/tcp_accecn_tsprogress.pkt      | 25 ++++++++
 58 files changed, 1363 insertions(+)
 create mode 100644 tools/testing/selftests/net/packetdrill/tcp_accecn_2nd_data_as_first.pkt
 create mode 100644 tools/testing/selftests/net/packetdrill/tcp_accecn_2nd_data_as_first_connect.pkt
 create mode 100644 tools/testing/selftests/net/packetdrill/tcp_accecn_3rd_ack_after_synack_rxmt.pkt
 create mode 100644 tools/testing/selftests/net/packetdrill/tcp_accecn_3rd_ack_ce_updates_received_ce.pkt
 create mode 100644 tools/testing/selftests/net/packetdrill/tcp_accecn_3rd_ack_lost_data_ce.pkt
 create mode 100644 tools/testing/selftests/net/packetdrill/tcp_accecn_3rd_dups.pkt
 create mode 100644 tools/testing/selftests/net/packetdrill/tcp_accecn_acc_ecn_disabled.pkt
 create mode 100644 tools/testing/selftests/net/packetdrill/tcp_accecn_accecn_then_notecn_syn.pkt
 create mode 100644 tools/testing/selftests/net/packetdrill/tcp_accecn_accecn_to_rfc3168.pkt
 create mode 100644 tools/testing/selftests/net/packetdrill/tcp_accecn_client_accecn_options_drop.pkt
 create mode 100644 tools/testing/selftests/net/packetdrill/tcp_accecn_client_accecn_options_lost.pkt
 create mode 100644 tools/testing/selftests/net/packetdrill/tcp_accecn_clientside_disabled.pkt
 create mode 100644 tools/testing/selftests/net/packetdrill/tcp_accecn_close_local_close_then_remote_fin.pkt
 create mode 100644 tools/testing/selftests/net/packetdrill/tcp_accecn_delivered_2ndlargeack.pkt
 create mode 100644 tools/testing/selftests/net/packetdrill/tcp_accecn_delivered_falseoverflow_detect.pkt
 create mode 100644 tools/testing/selftests/net/packetdrill/tcp_accecn_delivered_largeack.pkt
 create mode 100644 tools/testing/selftests/net/packetdrill/tcp_accecn_delivered_largeack2.pkt
 create mode 100644 tools/testing/selftests/net/packetdrill/tcp_accecn_delivered_maxack.pkt
 create mode 100644 tools/testing/selftests/net/packetdrill/tcp_accecn_delivered_updates.pkt
 create mode 100644 tools/testing/selftests/net/packetdrill/tcp_accecn_ecn3.pkt
 create mode 100644 tools/testing/selftests/net/packetdrill/tcp_accecn_ecn_field_updates_opt.pkt
 create mode 100644 tools/testing/selftests/net/packetdrill/tcp_accecn_ipflags_drop.pkt
 create mode 100644 tools/testing/selftests/net/packetdrill/tcp_accecn_listen_opt_drop.pkt
 create mode 100644 tools/testing/selftests/net/packetdrill/tcp_accecn_multiple_syn_ack_drop.pkt
 create mode 100644 tools/testing/selftests/net/packetdrill/tcp_accecn_multiple_syn_drop.pkt
 create mode 100644 tools/testing/selftests/net/packetdrill/tcp_accecn_negotiation_bleach.pkt
 create mode 100644 tools/testing/selftests/net/packetdrill/tcp_accecn_negotiation_connect.pkt
 create mode 100644 tools/testing/selftests/net/packetdrill/tcp_accecn_negotiation_listen.pkt
 create mode 100644 tools/testing/selftests/net/packetdrill/tcp_accecn_negotiation_noopt_connect.pkt
 create mode 100644 tools/testing/selftests/net/packetdrill/tcp_accecn_negotiation_optenable.pkt
 create mode 100644 tools/testing/selftests/net/packetdrill/tcp_accecn_no_ecn_after_accecn.pkt
 create mode 100644 tools/testing/selftests/net/packetdrill/tcp_accecn_noopt.pkt
 create mode 100644 tools/testing/selftests/net/packetdrill/tcp_accecn_noprogress.pkt
 create mode 100644 tools/testing/selftests/net/packetdrill/tcp_accecn_notecn_then_accecn_syn.pkt
 create mode 100644 tools/testing/selftests/net/packetdrill/tcp_accecn_rfc3168_to_fallback.pkt
 create mode 100644 tools/testing/selftests/net/packetdrill/tcp_accecn_rfc3168_to_rfc3168.pkt
 create mode 100644 tools/testing/selftests/net/packetdrill/tcp_accecn_sack_space_grab.pkt
 create mode 100644 tools/testing/selftests/net/packetdrill/tcp_accecn_sack_space_grab_with_ts.pkt
 create mode 100644 tools/testing/selftests/net/packetdrill/tcp_accecn_serverside_accecn_disabled1.pkt
 create mode 100644 tools/testing/selftests/net/packetdrill/tcp_accecn_serverside_accecn_disabled2.pkt
 create mode 100644 tools/testing/selftests/net/packetdrill/tcp_accecn_serverside_broken.pkt
 create mode 100644 tools/testing/selftests/net/packetdrill/tcp_accecn_serverside_ecn_disabled.pkt
 create mode 100644 tools/testing/selftests/net/packetdrill/tcp_accecn_serverside_only.pkt
 create mode 100644 tools/testing/selftests/net/packetdrill/tcp_accecn_syn_ace_flags_acked_after_retransmit.pkt
 create mode 100644 tools/testing/selftests/net/packetdrill/tcp_accecn_syn_ace_flags_drop.pkt
 create mode 100644 tools/testing/selftests/net/packetdrill/tcp_accecn_syn_ack_ace_flags_acked_after_retransmit.pkt
 create mode 100644 tools/testing/selftests/net/packetdrill/tcp_accecn_syn_ack_ace_flags_drop.pkt
 create mode 100644 tools/testing/selftests/net/packetdrill/tcp_accecn_syn_ce.pkt
 create mode 100644 tools/testing/selftests/net/packetdrill/tcp_accecn_syn_ect0.pkt
 create mode 100644 tools/testing/selftests/net/packetdrill/tcp_accecn_syn_ect1.pkt
 create mode 100644 tools/testing/selftests/net/packetdrill/tcp_accecn_synack_ce.pkt
 create mode 100644 tools/testing/selftests/net/packetdrill/tcp_accecn_synack_ce_updates_delivered_ce.pkt
 create mode 100644 tools/testing/selftests/net/packetdrill/tcp_accecn_synack_ect0.pkt
 create mode 100644 tools/testing/selftests/net/packetdrill/tcp_accecn_synack_ect1.pkt
 create mode 100644 tools/testing/selftests/net/packetdrill/tcp_accecn_synack_rexmit.pkt
 create mode 100644 tools/testing/selftests/net/packetdrill/tcp_accecn_synack_rxmt.pkt
 create mode 100644 tools/testing/selftests/net/packetdrill/tcp_accecn_tsnoprogress.pkt
 create mode 100644 tools/testing/selftests/net/packetdrill/tcp_accecn_tsprogress.pkt

(limited to 'tools')

diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_2nd_data_as_first.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_2nd_data_as_first.pkt
new file mode 100644
index 000000000000..07e9936e70e6
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_2nd_data_as_first.pkt
@@ -0,0 +1,24 @@
+// 3rd ACK + 1st data segment lost, data segments with ce
+
+`./defaults.sh
+sysctl -q net.ipv4.tcp_ecn=3
+`
+
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+   +0 bind(3, ..., ...) = 0
+   +0 listen(3, 1) = 0
+
++0.05 < SEWA 0:0(0) win 32767 <mss 1460,nop,nop,sackOK,nop,wscale 8>
++.002 > SW. 0:0(0) ack 1 <mss 1460,ECN e1b 1 ceb 0 e0b 1,nop,nop,nop,sackOK,nop,wscale 8>
+// 3rd ACK lost
+// 1st data segment lost
++0.05 < [ce] EAP. 1001:2001(1000) ack 1 win 264 <ECN e0b 1 ceb 0 e1b 1,nop>
++.002 > [ect0] WA. 1:1(0) ack 1 <ECN e1b 1 ceb 1000 e0b 1,nop,nop,nop,sack 1001:2001>
++.002 accept(3, ..., ...) = 4
+
++0.2 < [ce] EAP. 1:1001(1000) ack 1 win 264 <ECN e0b 1 ceb 0 e1b 1,nop>
++.001 > [ect0] EWA. 1:1(0) ack 2001 <ECN e1b 1 ceb 2000 e0b 1,nop>
+
++0.05 < [ce] EAP. 2001:3001(1000) ack 1 win 264
++.001 > [ect0] . 1:1(0) ack 3001 <ECN e1b 1 ceb 3000 e0b 1,nop>
diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_2nd_data_as_first_connect.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_2nd_data_as_first_connect.pkt
new file mode 100644
index 000000000000..76b8422b34dc
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_2nd_data_as_first_connect.pkt
@@ -0,0 +1,30 @@
+// 3rd ACK + 1st data segment lost, 2nd data segments with ce
+
+`./defaults.sh
+sysctl -q net.ipv4.tcp_ecn=3
+`
+
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 4
++.002 ... 0.052 connect(4, ..., ...) = 0
+
++.002 > [noecn] SEWA 0:0(0) <mss 1460,sackOK,TS val 100 ecr 0,nop,wscale 8>
++0.05 < [noecn] SW. 0:0(0) ack 1 win 32767 <mss 1016,ECN e0b 1 ceb 0 e1b 1,nop,nop,nop,sackOK,nop,wscale 8>
+// 3rd ACK lost
++.002 > [ect0] W. 1:1(0) ack 1 <ECN e1b 1 ceb 0 e0b 1,nop>
+
++0.01 write(4, ..., 2000) = 2000
+// 1st data segment lost + 2nd gets CE
++.002 > [ect0] .5 1:1005(1004) ack 1 <ECN e1b 1 ceb 0 e0b 1,nop>
++.000 > [ect0] P.5 1005:2001(996) ack 1 <ECN e1b 1 ceb 0 e0b 1, nop>
++0.05 < [ect0] .6 1:1(0) ack 1 win 264 <ECN e0b 1 ceb 996 e1b 1,nop,nop,nop,sack 1005:2001>
+
++0.01 %{ assert tcpi_delivered_ce == 1, tcpi_delivered_ce }%
+
++0.002~+0.1 > [ect0] .5 1:1005(1004) ack 1 <ECN e1b 1 ceb 0 e0b 1,nop>
++.05 < [ect0] .6 1:1(0) ack 2001 win 264 <ECN e0b 1005 ceb 996 e1b 1,nop>
+
++0.01 write(4, ..., 1000) = 1000
++0~+0.002 > [ect0] P.5 2001:3001(1000) ack 1 <ECN e1b 1 ceb 0 e0b 1,nop>
+
++0.1 < [ect0] .5 1:1001(1000) ack 3001 win 264 <ECN e0b 1 ceb 0 e1b 1,nop>
++0~+0.01 > [ect0] .5 3001:3001(0) ack 1001 <ECN e1b 1 ceb 0 e0b 1001,nop>
diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_3rd_ack_after_synack_rxmt.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_3rd_ack_after_synack_rxmt.pkt
new file mode 100644
index 000000000000..84060e490589
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_3rd_ack_after_synack_rxmt.pkt
@@ -0,0 +1,19 @@
+// Test 3rd ACK flags when SYN-ACK is rexmitted
+
+`./defaults.sh
+sysctl -q net.ipv4.tcp_ecn=3
+`
+
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 4
++.002 ... 0.052 connect(4, ..., ...) = 0
+
++.002 > [noecn] SEWA 0:0(0) <mss 1460,sackOK,TS val 100 ecr 0,nop,wscale 8>
++0.05 < [noecn] SW. 0:0(0) ack 1 win 32767 <mss 1460,ECN e0b 1 ceb 0 e1b 1,nop,nop,nop,sackOK,nop,wscale 8>
++.002 > [ect0] W. 1:1(0) ack 1 <ECN e1b 1 ceb 0 e0b 1,nop>
+
++0.1 < [ect0] S. 0:0(0) ack 1 win 32767 <mss 1460,nop,nop,sackOK,nop,wscale 8>
+
+// Our code currently sends a challenge ACK
+// when it receives a SYN in ESTABLISHED state
+// based on the latest SYN
++.002 > [ect0] A. 1:1(0) ack 1 <ECN e1b 1 ceb 0 e0b 1,nop>
diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_3rd_ack_ce_updates_received_ce.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_3rd_ack_ce_updates_received_ce.pkt
new file mode 100644
index 000000000000..d3fe09d0606f
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_3rd_ack_ce_updates_received_ce.pkt
@@ -0,0 +1,18 @@
+// Third ACK CE increases r.cep
+
+`./defaults.sh
+sysctl -q net.ipv4.tcp_ecn=3
+`
+
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+   +0 bind(3, ..., ...) = 0
+   +0 listen(3, 1) = 0
+
++0.05 < SEWA 0:0(0) win 32767 <mss 1050,nop,nop,sackOK,nop,wscale 8>
++.002 > SW. 0:0(0) ack 1 <mss 1460,ECN e1b 1 ceb 0 e0b 1,nop,nop,nop,sackOK,nop,wscale 8>
++0.05 < [ce] W. 1:1(0) ack 1 win 264 <ECN e0b 1 ceb 0 e1b 1,nop>
++.002 accept(3, ..., ...) = 4
+
++0.01 write(4, ..., 1000) = 1000
++.002 > [ect0] WAP. 1:1001(1000) ack 1 <ECN e1b 1 ceb 0 e0b 1,nop>
diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_3rd_ack_lost_data_ce.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_3rd_ack_lost_data_ce.pkt
new file mode 100644
index 000000000000..d28722db42b1
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_3rd_ack_lost_data_ce.pkt
@@ -0,0 +1,22 @@
+// 3rd ACK lost, CE for the first data segment
+
+`./defaults.sh
+sysctl -q net.ipv4.tcp_ecn=3
+`
+
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+   +0 bind(3, ..., ...) = 0
+   +0 listen(3, 1) = 0
+
++0.05 < SEWA 0:0(0) win 32767 <mss 1050,nop,nop,sackOK,nop,wscale 8>
++.002 > SW. 0:0(0) ack 1 <mss 1460,ECN e1b 1 ceb 0 e0b 1,nop,nop,nop,sackOK,nop,wscale 8>
+// 3rd ACK lost
++0.05 < [ce] EAP. 1:1001(1000) ack 1 win 264 <ECN e0b 1 ceb 0 e1b 1,nop>
++.002 > [ect0] WA. 1:1(0) ack 1001 <ECN e1b 1 ceb 1000 e0b 1,nop>
++.002 accept(3, ..., ...) = 4
+
++0.01 %{ assert tcpi_delivered_ce == 0, tcpi_delivered_ce }%
+
++0.05 < [ce] EAP. 1001:2001(1000) ack 1 win 264 <ECN e0b 1 ceb 0 e1b 1,nop>
++.001 > [ect0] EWA. 1:1(0) ack 2001 <ECN e1b 1 ceb 2000 e0b 1 ,nop>
diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_3rd_dups.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_3rd_dups.pkt
new file mode 100644
index 000000000000..a4d808116e34
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_3rd_dups.pkt
@@ -0,0 +1,26 @@
+// Test SYN/ACK rexmit triggered 3rd ACK duplicate + CE on first data seg
+
+`./defaults.sh
+sysctl -q net.ipv4.tcp_ecn=3
+`
+
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+   +0 bind(3, ..., ...) = 0
+   +0 listen(3, 1) = 0
+
+   +0 < SEWA 0:0(0) win 32792 <mss 1050,nop,nop,sackOK,nop,wscale 8>
++.002 > SW. 0:0(0) ack 1 <mss 1460,ECN e1b 1 ceb 0 e0b 1,nop,nop,nop,sackOK,nop,wscale 8>
+
+// SYN/ACK rexmitted => two 3rd ACKs in-flight
++1.0~+1.1 > SW. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8>
+// Delivered 1st 3rd ACK
++0.05 < [ect0] W. 1:1(0) ack 1 win 257 <ECN e0b 1 ceb 0 e1b 1,nop>
++.002 accept(3, ..., ...) = 4
+
+// Duplicate 3rd ACK delivered
++1.05 < [ect0] W. 1:1(0) ack 1 win 257 <ECN e0b 1 ceb 0 e1b 1,nop>
+
++0.05 < [ce] EAP. 1:1001(1000) ack 1 win 257 <ECN e0b 1 ceb 0 e1b 1,nop>
++.002 > [ect0] WA. 1:1(0) ack 1001 <ECN e1b 1 ceb 1000 e0b 1,nop>
+   +0 read(4, ..., 1000) = 1000
diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_acc_ecn_disabled.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_acc_ecn_disabled.pkt
new file mode 100644
index 000000000000..410a303c6d49
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_acc_ecn_disabled.pkt
@@ -0,0 +1,13 @@
+// Test that when accurate ECN is disabled,
+// client uses RFC3168 ECN for SYN
+
+`./defaults.sh
+sysctl -q net.ipv4.tcp_ecn=1
+`
+
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 4
++.002 ... 0.052 connect(4, ..., ...) = 0
+
++.002 > [noecn] SEW 0:0(0) <mss 1460,sackOK,TS val 100 ecr 0,nop,wscale 8>
++0.05 < [noecn] S. 0:0(0) ack 1 win 32767 <mss 1460,sackOK,nop,nop,nop,wscale 8>
++.002 > [noecn] . 1:1(0) ack 1
diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_accecn_then_notecn_syn.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_accecn_then_notecn_syn.pkt
new file mode 100644
index 000000000000..10728114b11b
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_accecn_then_notecn_syn.pkt
@@ -0,0 +1,28 @@
+// Test that SYN-ACK with ACE flags and without
+// ACE flags got dropped. Although we disable ECN,
+// we shouldn't consider this as blackholed as
+// these are dropped due to congestion
+
+`./defaults.sh
+sysctl -q net.ipv4.tcp_ecn=3
+sysctl -q net.ipv4.tcp_ecn_option=2
+`
+
++0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
++0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
++0 bind(3, ..., ...) = 0
++0 listen(3, 1) = 0
+
++0 < [ect0] SEWA 0:0(0) win 32792 <mss 1460,nop,nop,sackOK,nop,wscale 8>
++.002 > [noecn] SA. 0:0(0) ack 1 <mss 1460,ECN e1b 1 ceb 0 e0b 1,nop,nop,nop,sackOK,nop,wscale 8>
+
+// Retransmit SYN
++0.1 < [noecn] S 0:0(0) win 32792 <mss 1460,nop,nop,sackOK,nop,wscale 8>
++.002 > [noecn] SW. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8>
+
++0.1 < [noecn] W. 1:1(0) ack 1 win 320 <ECN e0b 1 ceb 0 e1b 1,nop>
++.002 accept(3, ..., ...) = 4
+
+// Write with AccECN option but with ip-noecn since we received one SYN with ACE=0
++0.01 write(4, ..., 100) = 100
++.002 > [noecn] P5. 1:101(100) ack 1 <ECN e1b 1 ceb 0 e0b 1,nop>
diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_accecn_to_rfc3168.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_accecn_to_rfc3168.pkt
new file mode 100644
index 000000000000..04d928f0d44d
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_accecn_to_rfc3168.pkt
@@ -0,0 +1,18 @@
+// Test AccECN -> RFC3168 fallback when sysctl asks for RFC3168 ECN
+
+`./defaults.sh
+sysctl -q net.ipv4.tcp_ecn=1
+`
+
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+   +0 bind(3, ..., ...) = 0
+   +0 listen(3, 1) = 0
+
+   +0 < SEWA 0:0(0) win 32792 <mss 1050,nop,nop,sackOK,nop,wscale 8>
++.002 > SE. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8>
++0.05 < . 1:1(0) ack 1 win 320
++.002 accept(3, ..., ...) = 4
+
++0.01 write(4, ..., 1000) = 1000
++.002 > [ect0] P. 1:1001(1000) ack 1
diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_client_accecn_options_drop.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_client_accecn_options_drop.pkt
new file mode 100644
index 000000000000..788af6bea69c
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_client_accecn_options_drop.pkt
@@ -0,0 +1,34 @@
+// Client negotiates AccECN and starts sending
+// AccECN option in last ACK and data segments
+// Middlebox drops AccECN option and client
+// reverts to ACE flags only
+
+`./defaults.sh
+sysctl -q net.ipv4.tcp_ecn=3
+sysctl -q net.ipv4.tcp_ecn_option=2
+sysctl -q net.ipv4.tcp_ecn_option_beacon=1
+`
+
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+   +0 bind(3, ..., ...) = 0
+   +0 listen(3, 1) = 0
+
+   +0 < SEWA 0:0(0) win 32792 <mss 1050,nop,nop,sackOK,nop,wscale 8>
++.002 > SW. 0:0(0) ack 1 <mss 1460,ECN e1b 1 ceb 0 e0b 1,nop,nop,nop,sackOK,nop,wscale 8>
++0.05 < [ect0] W. 1:1(0) ack 1 win 257 <ECN e0b 1 ceb 0 e1b 1,nop>
++.002 accept(3, ..., ...) = 4
+
++0.05 < [ect0] EAP. 1:1001(1000) ack 1 win 257 <ECN e0b 1 ceb 0 e1b 1,nop>
++.002 > [ect0] EA. 1:1(0) ack 1001 <ECN e1b 1 ceb 0 e0b 1001,nop>
+   +0 read(4, ..., 1000) = 1000
+
++0.05 < [ect0] EAP. 1:1001(1000) ack 1 win 257 <ECN e0b 1 ceb 0 e1b 1,nop>
++.002 > [ect0] EA. 1:1(0) ack 1001 <ECN e1b 1 ceb 0 e0b 2001,nop,nop,nop,sack 1:1001>
+
++0.05 < [ect0] EAP. 1:1001(1000) ack 1 win 257 <ECN e0b 1 ceb 0 e1b 1,nop>
++.002 > [ect0] EA. 1:1(0) ack 1001 <nop,nop,sack 1:1001>
+
++0.05 < [ect0] EAP. 1001:2001(1000) ack 1 win 257 <ECN e0b 1 ceb 0 e1b 1,nop>
++.002 > [ect0] EA. 1:1(0) ack 2001
+   +0 read(4, ..., 1000) = 1000
diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_client_accecn_options_lost.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_client_accecn_options_lost.pkt
new file mode 100644
index 000000000000..f5839c2e682d
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_client_accecn_options_lost.pkt
@@ -0,0 +1,38 @@
+// Client negotiates AccECN and starts sending
+// AccECN option in last ACK and data segments
+// Middlebox accepts AccECN option but some packets
+// are lost due to congestion. Client should
+// continue to send AccECN option
+
+`./defaults.sh
+sysctl -q net.ipv4.tcp_ecn=3
+sysctl -q net.ipv4.tcp_ecn_option=2
+`
+
++0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 4
++.002 ... 0.102 connect(4, ..., ...) = 0
+
++.002 > [noecn] SEWA 0:0(0) <mss 1460,sackOK,TS val 100 ecr 0,nop,wscale 8>
++0.1  < [ect0] SW. 0:0(0) ack 1 win 32767 <mss 1024,ECN e0b 1 ceb 0 e1b 1,nop,nop,nop,sackOK,nop,wscale 8>
++.002 > [ect0] A. 1:1(0) ack 1 <ECN e1b 1 ceb 0 e0b 1,nop>
+
+// Send
++0.01 write(4, ..., 3000) = 3000
++.002 > [ect0] .5 1:1013(1012) ack 1 <ECN e1b 1 ceb 0 e0b 1,nop>
++.002 > [ect0] P.5 1013:2025(1012) ack 1 <ECN e1b 1 ceb 0 e0b 1,nop>
++.002 > [ect0] P.5 2025:3001(976) ack 1 <ECN e1b 1 ceb 0 e0b 1,nop>
+
+// First two segments were lost due to congestion as SACK was
+// received acknowledging 3rd segment
++0.1 < [ect0] .5 1:1(0) ack 1 win 264 <ECN e1b 1 ceb 0 e0b 977,nop,nop,nop,sack 2025:3001>
+
+// Since data with option was SACKed, we can
+// continue to use AccECN option for the rest of
+// the connection. This one is a rexmt
++.02~+0.5 > [ect0] .5 1:1013(1012) ack 1 <ECN e1b 1 ceb 0 e0b 1,nop>
++0.1 < [ect0] .5 1:1(0) ack 3001 win 264 <ECN e1b 1 ceb 0 e0b 3000,nop>
+
+// Send new data, it should contain AccECN option
++0.01 write(4, ..., 2000) = 2000
++.002 > [ect0] .5 3001:4013(1012) ack 1 <ECN e1b 1 ceb 0 e0b 1,nop>
++.002 > [ect0] P.5 4013:5001(988) ack 1 <ECN e1b 1 ceb 0 e0b 1,nop>
diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_clientside_disabled.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_clientside_disabled.pkt
new file mode 100644
index 000000000000..c00b36d6a833
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_clientside_disabled.pkt
@@ -0,0 +1,12 @@
+// AccECN sysctl server-side only, no ECN/AccECN
+
+`./defaults.sh
+sysctl -q net.ipv4.tcp_ecn=5
+`
+
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 4
++.002 ... 0.052 connect(4, ..., ...) = 0
+
++.002 > S 0:0(0) <mss 1460,sackOK,TS val 100 ecr 0,nop,wscale 8>
++0.05 < S. 0:0(0) ack 1 win 32767 <mss 1460,sackOK,nop,nop,nop,wscale 8>
++.002 > . 1:1(0) ack 1
diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_close_local_close_then_remote_fin.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_close_local_close_then_remote_fin.pkt
new file mode 100644
index 000000000000..f9c27f39f354
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_close_local_close_then_remote_fin.pkt
@@ -0,0 +1,25 @@
+// Test basic connection teardown where local process closes first:
+// the local process calls close() first, so we send a FIN, and receive an ACK.
+// Then we receive a FIN and ACK it.
+
+`./defaults.sh
+sysctl -q net.ipv4.tcp_ecn=3
+sysctl -q net.ipv4.tcp_ecn_option=0
+`
+
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+ +.01...0.011 connect(3, ..., ...) = 0
+   +0 > [noecn] SEWA 0:0(0) <...>
+   +0 < [ect1] SW. 0:0(0) ack 1 win 32768 <mss 1000,nop,wscale 6,nop,nop,sackOK>
+   +0 > [ect0] EW. 1:1(0) ack 1
+
+   +0 write(3, ..., 1000) = 1000
+   +0 > [ect0] P5. 1:1001(1000) ack 1
+   +0 < [ect0] .5 1:1(0) ack 1001 win 257
+
+   +0 close(3) = 0
+   +0 > [ect0] F5. 1001:1001(0) ack 1
+   +0 < [ect0] .5 1:1(0) ack 1002 win 257
+
+   +0 < [ect0] F5. 1:1(0) ack 1002 win 257
+   +0 > [ect0] . 1002:1002(0) ack 2
diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_delivered_2ndlargeack.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_delivered_2ndlargeack.pkt
new file mode 100644
index 000000000000..6d771234124a
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_delivered_2ndlargeack.pkt
@@ -0,0 +1,25 @@
+// Test a large ACK (> ACE field max)
+
+`./defaults.sh
+sysctl -q net.ipv4.tcp_ecn=3
+sysctl -q net.ipv4.tcp_ecn_option=0
+`
+
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+   +0 bind(3, ..., ...) = 0
+   +0 listen(3, 1) = 0
+
+   +0 < SEWA 0:0(0) win 32792 <mss 1460,nop,nop,sackOK,nop,wscale 8>
++.002 > SW. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8>
++0.05 < [ect0] W. 1:1(0) ack 1 win 264
++.002 accept(3, ..., ...) = 4
+
++0.01 %{ assert tcpi_delivered_ce == 0, tcpi_delivered_ce }%
+
++0.01 write(4, ..., 14600) = 14600
++.002 > [ect0] P.5 1:14601(14600) ack 1
++0.05 < [ect0] .5 1:1(0) ack 1461 win 264
++0.05 < [ect0] .5 1:1(0) ack 14601 win 264
+
++0.01 %{ assert tcpi_delivered_ce == 8, tcpi_delivered_ce }%
diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_delivered_falseoverflow_detect.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_delivered_falseoverflow_detect.pkt
new file mode 100644
index 000000000000..76384f52b021
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_delivered_falseoverflow_detect.pkt
@@ -0,0 +1,31 @@
+// Test false overflow detection with option used to rule out overflow
+
+`./defaults.sh
+sysctl -q net.ipv4.tcp_ecn=3
+`
+
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+   +0 bind(3, ..., ...) = 0
+   +0 listen(3, 1) = 0
+
+   +0 < SEWA 0:0(0) win 32792 <mss 1460,nop,nop,sackOK,nop,wscale 8>
++.002 > SW. 0:0(0) ack 1 <mss 1460,ECN e1b 1 ceb 0 e0b 1,nop,nop,nop,sackOK,nop,wscale 8>
++0.05 < [ect0] W. 1:1(0) ack 1 win 264 <ECN e0b 1 ceb 0 e1b 1,nop>
++.002 accept(3, ..., ...) = 4
+
++0.01 %{ assert tcpi_delivered_ce == 0, tcpi_delivered_ce }%
+
+// Stop sending option to allow easier testing
++0 `sysctl -q net.ipv4.tcp_ecn_option=0`
+
++0.002 write(4, ..., 14600) = 14600
++.002 > [ect0] P.5 1:14601(14600) ack 1
+
++0.05 < [ect0] .5 1:1(0) ack 1460 win 264 <ECN e0b 1461 ceb 0 e1b 1,nop>
++0.05 < [ect0] .5 1:1(0) ack 14601 win 264 <ECN e0b 14601 ceb 0 e1b 1,nop>
+
++0.01 %{
+assert tcpi_delivered_ce == 0, tcpi_delivered_ce
+assert tcpi_delivered_e0_bytes == 14600, tcpi_delivered_e0_bytes
+}%
diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_delivered_largeack.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_delivered_largeack.pkt
new file mode 100644
index 000000000000..8bce5dce35a2
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_delivered_largeack.pkt
@@ -0,0 +1,24 @@
+// Test a large ACK (> ACE field max)
+
+`./defaults.sh
+sysctl -q net.ipv4.tcp_ecn=3
+sysctl -q net.ipv4.tcp_ecn_option=0
+`
+
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+   +0 bind(3, ..., ...) = 0
+   +0 listen(3, 1) = 0
+
+   +0 < SEWA 0:0(0) win 32792 <mss 1460,nop,nop,sackOK,nop,wscale 8>
++.002 > SW. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8>
++0.05 < [ect0] W. 1:1(0) ack 1 win 264
++.002 accept(3, ..., ...) = 4
+
++0.01 %{ assert tcpi_delivered_ce == 0, tcpi_delivered_ce }%
+
++0.01 write(4, ..., 14600) = 14600
++.002 > [ect0] P.5 1:14601(14600) ack 1
++0.05 < [ect0] .5 1:1(0) ack 14601 win 264
+
++0.01 %{ assert tcpi_delivered_ce == 0, tcpi_delivered_ce }%
diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_delivered_largeack2.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_delivered_largeack2.pkt
new file mode 100644
index 000000000000..5f2b147214f4
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_delivered_largeack2.pkt
@@ -0,0 +1,25 @@
+// Test a large ACK (> ACE field max)
+
+`./defaults.sh
+sysctl -q net.ipv4.tcp_ecn=3
+sysctl -q net.ipv4.tcp_ecn_option=0
+`
+
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+   +0 bind(3, ..., ...) = 0
+   +0 listen(3, 1) = 0
+
+   +0 < SEWA 0:0(0) win 32792 <mss 1460,nop,nop,sackOK,nop,wscale 8>
++.002 > SW. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8>
++0.05 < [ect0] W. 1:1(0) ack 1 win 264
++.002 accept(3, ..., ...) = 4
+
++0.01 %{ assert tcpi_delivered_ce == 0, tcpi_delivered_ce }%
+
++0.01 write(4, ..., 14600) = 14600
++.002 > [ect0] P.5 1:14601(14600) ack 1
+  // Fake CE
++0.05 < [ect0] .6 1:1(0) ack 14601 win 264
+
++0.01 %{ assert tcpi_delivered_ce == 1, tcpi_delivered_ce }%
diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_delivered_maxack.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_delivered_maxack.pkt
new file mode 100644
index 000000000000..fd07bdc14f37
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_delivered_maxack.pkt
@@ -0,0 +1,25 @@
+// Test a large ACK (at ACE field max delta)
+
+`./defaults.sh
+sysctl -q net.ipv4.tcp_ecn=3
+sysctl -q net.ipv4.tcp_ecn_option=0
+`
+
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+   +0 bind(3, ..., ...) = 0
+   +0 listen(3, 1) = 0
+
+   +0 < SEWA 0:0(0) win 32792 <mss 1460,nop,nop,sackOK,nop,wscale 8>
++.002 > SW. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8>
++0.05 < [ect0] W. 1:1(0) ack 1 win 264
++.002 accept(3, ..., ...) = 4
+
++0.01 %{ assert tcpi_delivered_ce == 0, tcpi_delivered_ce }%
+
++0.01 write(4, ..., 14600) = 14600
++.002 > [ect0] P.5 1:14601(14600) ack 1
+  // Fake CE
++0.05 < [ect0] .4 1:1(0) ack 14601 win 264
+
++0.01 %{ assert tcpi_delivered_ce == 7, tcpi_delivered_ce }%
diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_delivered_updates.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_delivered_updates.pkt
new file mode 100644
index 000000000000..cb1e70ff2d26
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_delivered_updates.pkt
@@ -0,0 +1,70 @@
+// Test basic AccECN CEP/CEB/E0B/E1B functionality & CEP wrapping
+
+`./defaults.sh
+sysctl -q net.ipv4.tcp_ecn=3
+`
+
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+   +0 bind(3, ..., ...) = 0
+   +0 listen(3, 1) = 0
+
+   +0 < SEWA 0:0(0) win 32792 <mss 1050,nop,nop,sackOK,nop,wscale 8>
++.002 > SW. 0:0(0) ack 1 <mss 1460,ECN e1b 1 ceb 0 e0b 1,nop,nop,nop,sackOK,nop,wscale 8>
++0.05 < [ect0] W. 1:1(0) ack 1 win 264 <ECN e0b 1 ceb 0 e1b 1,nop>
++.002 accept(3, ..., ...) = 4
+
++0.01 %{
+assert tcpi_delivered_ce == 0, tcpi_delivered_ce
+assert tcpi_delivered_ce_bytes == 0, tcpi_delivered_ce_bytes
+}%
+
++0.01 write(4, ..., 1000) = 1000
++.002 > [ect0] EAP. 1:1001(1000) ack 1 <ECN e1b 1 ceb 0 e0b 1,nop>
+  // Fake CE
++0.05 < [ect0] WA. 1:1(0) ack 1001 win 264 <ECN e0b 1 ceb 1000 e1b 1,nop>
+
++0.01 %{
+assert tcpi_delivered_ce == 1, tcpi_delivered_ce
+assert tcpi_delivered_ce_bytes == 1000, tcpi_delivered_ce_bytes
+}%
+
++0.01 write(4, ..., 1000) = 1000
++.002 > [ect0] EAP. 1001:2001(1000) ack 1 <ECN e1b 1 ceb 0 e0b 1,nop>
+  // Fake ect0
++0.05 < [ect0] WA. 1:1(0) ack 2001 win 264 <ECN e0b 1001 ceb 1000 e1b 1,nop>
+
++0.01 %{
+assert tcpi_delivered_ce == 1, tcpi_delivered_ce
+assert tcpi_delivered_e0_bytes == 1000, tcpi_delivered_e0_bytes
+}%
+
++0.01 write(4, ..., 1000) = 1000
++.002 > [ect0] EAP. 2001:3001(1000) ack 1 <ECN e1b 1 ceb 0 e0b 1,nop>
+  // Fake ce
++0.05 < [ect0] EWA. 1:1(0) ack 3001 win 264 <ECN e0b 1001 ceb 2000 e1b 1,nop>
+
++0.01 %{
+assert tcpi_delivered_ce == 2, tcpi_delivered_ce
+assert tcpi_delivered_ce_bytes == 2000, tcpi_delivered_ce_bytes
+}%
+
++0.01 write(4, ..., 1000) = 1000
++.002 > [ect0] EAP. 3001:4001(1000) ack 1 <ECN e1b 1 ceb 0 e0b 1,nop>
+  // Fake ect1
++0.05 < [ect0] EWA. 1:1(0) ack 4001 win 264 <ECN e0b 1001 ceb 2000 e1b 1001,nop>
+
++0.01 %{
+assert tcpi_delivered_ce == 2, tcpi_delivered_ce
+assert tcpi_delivered_e1_bytes == 1000, tcpi_delivered_e1_bytes
+}%
+
++0.01 write(4, ..., 1000) = 1000
++.002 > [ect0] EAP. 4001:5001(1000) ack 1 <ECN e1b 1 ceb 0 e0b 1,nop>
+  // Fake ce
++0.05 < [ect0] . 1:1(0) ack 5001 win 264 <ECN e0b 1001 ceb 3000 e1b 1001,nop>
+
++0.01 %{
+assert tcpi_delivered_ce == 3, tcpi_delivered_ce
+assert tcpi_delivered_ce_bytes == 3000, tcpi_delivered_ce_bytes
+}%
diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_ecn3.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_ecn3.pkt
new file mode 100644
index 000000000000..6627c7bb2d26
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_ecn3.pkt
@@ -0,0 +1,12 @@
+// Test that tcp_ecn=4 uses RFC3168 ECN for SYN
+
+`./defaults.sh
+sysctl -q net.ipv4.tcp_ecn=4
+`
+
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 4
++.002 ... 0.05 connect(4, ..., ...) = 0
+
++.002 > SEW 0:0(0) <mss 1460,sackOK,TS val 100 ecr 0,nop,wscale 8>
++0.05 < S. 0:0(0) ack 1 win 32767 <mss 1460,nop,nop,sackOK,nop,wscale 8>
++.002 > . 1:1(0) ack 1
diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_ecn_field_updates_opt.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_ecn_field_updates_opt.pkt
new file mode 100644
index 000000000000..51879477bb50
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_ecn_field_updates_opt.pkt
@@ -0,0 +1,35 @@
+// Test basic AccECN CEP/CEB/E0B/E1B functionality & CEP wrapping
+
+`./defaults.sh
+sysctl -q net.ipv4.tcp_ecn=3
+`
+
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+   +0 bind(3, ..., ...) = 0
+   +0 listen(3, 1) = 0
+
+   +0 < SEWA 0:0(0) win 32792 <mss 1050,nop,nop,sackOK,nop,wscale 8>
++.002 > SW. 0:0(0) ack 1 <mss 1460,ECN e1b 1 ceb 0 e0b 1,nop,nop,nop,sackOK,nop,wscale 8>
++0.05 < [ect0] W. 1:1(0) ack 1 win 257 <ECN e0b 1 ceb 0 e1b 1,nop>
++.002 accept(3, ..., ...) = 4
+
++0.05 < [ce] EAP. 1:1001(1000) ack 1 win 257 <ECN e0b 1 ceb 0 e1b 1,nop>
++.002 > [ect0] WA. 1:1(0) ack 1001 <ECN e1b 1 ceb 1000 e0b 1,nop>
+   +0 read(4, ..., 1000) = 1000
+
++0.05 < [ect0] EAP. 1001:2001(1000) ack 1 win 257 <ECN e0b 1 ceb 0 e1b 1,nop>
++.002 > [ect0] WA. 1:1(0) ack 2001 <ECN e1b 1 ceb 1000 e0b 1001,nop>
+   +0 read(4, ..., 1000) = 1000
+
++0.05 < [ce] EAP. 2001:3001(1000) ack 1 win 257 <ECN e0b 1 ceb 0 e1b 1,nop>
++.002 > [ect0] EWA. 1:1(0) ack 3001 <ECN e1b 1 ceb 2000 e0b 1001,nop>
+   +0 read(4, ..., 1000) = 1000
+
++0.05 < [ect1] EAP. 3001:4001(1000) ack 1 win 257 <ECN e0b 1 ceb 0 e1b 1,nop>
++.002 > [ect0] EWA. 1:1(0) ack 4001 <ECN e1b 1001 ceb 2000 e0b 1001,nop>
+   +0 read(4, ..., 1000) = 1000
+
++0.05 < [ce] EAP. 4001:5001(1000) ack 1 win 257 <ECN e0b 1 ceb 0 e1b 1,nop>
++.002 > [ect0] . 1:1(0) ack 5001 <ECN e1b 1001 ceb 3000 e0b 1001,nop>
+   +0 read(4, ..., 1000) = 1000
diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_ipflags_drop.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_ipflags_drop.pkt
new file mode 100644
index 000000000000..0c72fa4a1251
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_ipflags_drop.pkt
@@ -0,0 +1,14 @@
+// Test IP flags drop
+--tolerance_usecs=50000
+
+`./defaults.sh
+sysctl -q net.ipv4.tcp_ecn=3
+`
+
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 4
++.002 ... 1.1 connect(4, ..., ...) = 0
+
++.002 > SEWA 0:0(0) <mss 1460,sackOK,TS val 100 ecr 0,nop,wscale 8>
++.02 ~ +1.1 > SEWA 0:0(0) <mss 1460,sackOK,TS val 100 ecr 0,nop,wscale 8>
++0.05 < S. 0:0(0) ack 1 win 32767 <mss 1460,nop,nop,sackOK,nop,wscale 8>
++.002 > [noecn] . 1:1(0) ack 1
diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_listen_opt_drop.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_listen_opt_drop.pkt
new file mode 100644
index 000000000000..171f9433e55f
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_listen_opt_drop.pkt
@@ -0,0 +1,16 @@
+// SYN/ACK option drop test
+
+`./defaults.sh
+sysctl -q net.ipv4.tcp_ecn=3
+`
+
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+   +0 bind(3, ..., ...) = 0
+   +0 listen(3, 1) = 0
+
+   +0 < SEWA 0:0(0) win 32792 <mss 1050,nop,nop,sackOK,nop,wscale 8>
++.002 > SW. 0:0(0) ack 1 <mss 1460,ECN e1b 1 ceb 0 e0b 1,nop,nop,nop,sackOK,nop,wscale 8>
++.02 ~+2 > SW. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8>
++.02 ~+5 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8>
++.02 ~+8 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8>
diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_multiple_syn_ack_drop.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_multiple_syn_ack_drop.pkt
new file mode 100644
index 000000000000..0f65cf56cd2b
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_multiple_syn_ack_drop.pkt
@@ -0,0 +1,28 @@
+// Test that SYN-ACK with ACE flags and without
+// ACE flags got dropped. Although we disable ECN,
+// we shouldn't consider this as blackholed as
+// these are dropped due to congestion
+
+`./defaults.sh
+sysctl -q net.ipv4.tcp_ecn=3
+sysctl -q net.ipv4.tcp_ecn_option=2
+`
+
++0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
++0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
++0 bind(3, ..., ...) = 0
++0 listen(3, 1) = 0
+
++0 < [noecn] SEWA 0:0(0) win 32792 <mss 1460,nop,nop,sackOK,nop,wscale 8>
++.002 > [noecn] SW. 0:0(0) ack 1 <mss 1460,ECN e1b 1 ceb 0 e0b 1,nop,nop,nop,sackOK,nop,wscale 8>
+
+// Retransmit SYN-ACK without option
++1~+1.1 > [noecn] SW. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8>
+
+// SYN-ACK maybe getting blackholed, disable ECN
++2~+2.2 > [noecn] S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8>
++4~+4.4 > [noecn] S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8>
+
+// Received an ACK after sending 3rd retransmission, not a blackhole
++0.1 < [noecn] . 1:1(0) ack 1 win 320
++.002 accept(3, ..., ...) = 4
diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_multiple_syn_drop.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_multiple_syn_drop.pkt
new file mode 100644
index 000000000000..343181633980
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_multiple_syn_drop.pkt
@@ -0,0 +1,18 @@
+// Test that SYN with ACE flags and without
+// ACE flags got dropped. Although we disable
+// ECN, we shouldn't consider this as blackholed
+// as these are dropped due to congestion
+
+`./defaults.sh
+sysctl -q net.ipv4.tcp_ecn=3
+`
+
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 4
++.002 ... 3.1 connect(4, ..., ...) = 0
+
++.002 > [noecn] SEWA 0:0(0) <mss 1460,sackOK,TS val 100 ecr 0,nop,wscale 8>
++.02~+1.1 > [noecn] SEWA 0:0(0) <mss 1460,sackOK,TS val 100 ecr 0,nop,wscale 8>
++.02~+1.1 > [noecn] S 0:0(0) <mss 1460,sackOK,TS val 100 ecr 0,nop,wscale 8>
++.02~+1.1 > [noecn] S 0:0(0) <mss 1460,sackOK,TS val 100 ecr 0,nop,wscale 8>
++0.1 < [noecn] S. 0:0(0) ack 1 win 32767 <mss 1460,nop,nop,sackOK,nop,wscale 8>
++0~+0.01 > [noecn] . 1:1(0) ack 1
diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_negotiation_bleach.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_negotiation_bleach.pkt
new file mode 100644
index 000000000000..37dabc4603c8
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_negotiation_bleach.pkt
@@ -0,0 +1,23 @@
+// Test AccECN flags bleach
+
+`./defaults.sh
+sysctl -q net.ipv4.tcp_ecn=3
+`
+
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+   +0 bind(3, ..., ...) = 0
+   +0 listen(3, 1) = 0
+
+   +0 < SEWA 0:0(0) win 32792 <mss 1050,nop,nop,sackOK,nop,wscale 8>
++.002 > SW. 0:0(0) ack 1 <mss 1460,ECN e1b 1 ceb 0 e0b 1,nop,nop,nop,sackOK,nop,wscale 8>
++0.05 < [ect0] . 1:1(0) ack 1 win 320 <ECN e0b 1 ceb 0 e1b 1,nop>
++.002 accept(3, ..., ...) = 4
+
++0.01 %{ assert tcpi_delivered_ce == 0, tcpi_delivered_ce }%
+
++0.01 write(4, ..., 1000) = 1000
++.002 > [noecn] EAP. 1:1001(1000) ack 1 <ECN e1b 1 ceb 0 e0b 1,nop>
++0.05 < [ect0] EAP. 1:1(0) ack 1001 win 320
+
++0.01 %{ assert tcpi_delivered_ce == 0, tcpi_delivered_ce }%
diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_negotiation_connect.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_negotiation_connect.pkt
new file mode 100644
index 000000000000..5b14892fda51
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_negotiation_connect.pkt
@@ -0,0 +1,23 @@
+// Test basic AccECN negotiation
+
+`./defaults.sh
+sysctl -q net.ipv4.tcp_ecn=3
+`
+
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 4
++.002 ... 0.052 connect(4, ..., ...) = 0
+
++.002 > SEWA 0:0(0) <mss 1460,sackOK,TS val 100 ecr 0,nop,wscale 8>
++0.05 < SW. 0:0(0) ack 1 win 32767 <mss 1460,ECN e0b 1 ceb 0 e1b 1,nop,nop,nop,sackOK,nop,wscale 8>
++.002 > [ect0] W. 1:1(0) ack 1 <ECN e1b 1 ceb 0 e0b 1,nop>
+
++0.01 write(4, ..., 1000) = 1000
++.002 > [ect0] EAP. 1:1001(1000) ack 1 <ECN e1b 1 ceb 0 e0b 1,nop>
++.05 < [ect0] EAP. 1:1(0) ack 1001 win 256 <ECN e0b 1001 ceb 0 e1b 0,nop>
+
++0.01 %{ assert tcpi_delivered_ce == 0, tcpi_delivered_ce }%
+
++0.01 write(4, ..., 1000) = 1000
++.002 > [ect0] EAP. 1001:2001(1000) ack 1 <ECN e1b 1 ceb 0 e0b 1,nop>
+
++0.01 %{ assert tcpi_delivered_ce == 0, tcpi_delivered_ce }%
diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_negotiation_listen.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_negotiation_listen.pkt
new file mode 100644
index 000000000000..25f7cb2feb25
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_negotiation_listen.pkt
@@ -0,0 +1,26 @@
+// Test basic AccECN negotiation
+
+`./defaults.sh
+sysctl -q net.ipv4.tcp_ecn=3
+`
+
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+   +0 bind(3, ..., ...) = 0
+   +0 listen(3, 1) = 0
+
+   +0 < SEWA 0:0(0) win 32792 <mss 1050,nop,nop,sackOK,nop,wscale 8>
++.002 > SW. 0:0(0) ack 1 <mss 1460,ECN e1b 1 ceb 0 e0b 1,nop,nop,nop,sackOK,nop,wscale 8>
++0.05 < [ect0] W. 1:1(0) ack 1 win 320 <ECN e0b 1 ceb 0 e1b 1,nop>
++.002 accept(3, ..., ...) = 4
+
++0.01 %{ assert tcpi_delivered_ce == 0, tcpi_delivered_ce }%
+
++0.01 write(4, ..., 1000) = 1000
++.002 > [ect0] EAP. 1:1001(1000) ack 1 <ECN e1b 1 ceb 0 e0b 1,nop>
++0.05 < [ect0] EAP. 1:1(0) ack 1001 win 320
+
++0.01 %{ assert tcpi_delivered_ce == 0, tcpi_delivered_ce }%
+
++0.01 write(4, ..., 1000) = 1000
++.002 > [ect0] EAP. 1001:2001(1000) ack 1 <ECN e1b 1 ceb 0 e0b 1,nop>
diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_negotiation_noopt_connect.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_negotiation_noopt_connect.pkt
new file mode 100644
index 000000000000..50e08c492a69
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_negotiation_noopt_connect.pkt
@@ -0,0 +1,23 @@
+// Test basic AccECN negotiation without option
+
+`./defaults.sh
+sysctl -q net.ipv4.tcp_ecn=3
+`
+
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 4
++.002 ... 0.052 connect(4, ..., ...) = 0
+
++.002 > SEWA 0:0(0) <mss 1460,sackOK,TS val 100 ecr 0,nop,wscale 8>
++0.05 < SW. 0:0(0) ack 1 win 32767 <mss 1460,nop,nop,sackOK,nop,wscale 8>
++.002 > [ect0] W. 1:1(0) ack 1
+
++0.01 write(4, ..., 1000) = 1000
++.002 > [ect0] EAP. 1:1001(1000) ack 1
++.05 < [ect0] EAP. 1:1(0) ack 1001 win 256
+
++0.01 %{ assert tcpi_delivered_ce == 0, tcpi_delivered_ce }%
+
++0.01 write(4, ..., 1000) = 1000
++.002 > [ect0] EAP. 1001:2001(1000) ack 1
+
++0.01 %{ assert tcpi_delivered_ce == 0, tcpi_delivered_ce }%
diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_negotiation_optenable.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_negotiation_optenable.pkt
new file mode 100644
index 000000000000..2904f1ba9975
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_negotiation_optenable.pkt
@@ -0,0 +1,23 @@
+// Test basic AccECN negotiation, late option enable
+
+`./defaults.sh
+sysctl -q net.ipv4.tcp_ecn=3
+`
+
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 4
++.002 ... 0.052 connect(4, ..., ...) = 0
+
++.002 > SEWA 0:0(0) <mss 1460,sackOK,TS val 100 ecr 0,nop,wscale 8>
++0.05 < SW. 0:0(0) ack 1 win 32767 <mss 1460,nop,nop,sackOK,nop,wscale 8>
++.002 > [ect0] W. 1:1(0) ack 1
+
++0.01 write(4, ..., 1000) = 1000
++.002 > [ect0] EAP. 1:1001(1000) ack 1
++.05 < [ect0] EAP. 1:1(0) ack 1001 win 256 <ECN e0b 1001 ceb 0 e1b 1,nop>
+
++0.01 %{ assert tcpi_delivered_ce == 0, tcpi_delivered_ce }%
+
++0.01 write(4, ..., 1000) = 1000
++.002 > [ect0] EAP. 1001:2001(1000) ack 1 <ECN e1b 1 ceb 0 e0b 1,nop>
+
++0.01 %{ assert tcpi_delivered_ce == 0, tcpi_delivered_ce }%
diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_no_ecn_after_accecn.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_no_ecn_after_accecn.pkt
new file mode 100644
index 000000000000..64e0fc1c1f14
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_no_ecn_after_accecn.pkt
@@ -0,0 +1,20 @@
+// Test client behavior on receiving a non ECN SYN-ACK
+// after receiving an AccECN SYN-ACK and moving to
+// ESTABLISHED state
+
+`./defaults.sh
+sysctl -q net.ipv4.tcp_ecn=3
+sysctl -q net.ipv4.tcp_ecn_option=2
+`
+
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 4
++.002 ... 0.052 connect(4, ..., ...) = 0
+
++.002 > [noecn] SEWA 0:0(0) <mss 1460,sackOK,TS val 100 ecr 0,nop,wscale 8>
+// Receive an AccECN SYN-ACK and move to ESTABLISHED
++0.05 < [noecn] SW. 0:0(0) ack 1 win 32767 <mss 1460,ECN e0b 1 ceb 0 e1b 1,nop,nop,nop,sackOK,nop,wscale 8>
++.002 > [ect0] W. 1:1(0) ack 1 <ECN e1b 1 ceb 0 e0b 1,nop>
+
+// Receive a non ECN SYN-ACK and send a challenge ACK with ACE feedback
++0.1 < [noecn] S. 0:0(0) ack 1 win 32767 <mss 1460,nop,nop,sackOK,nop,wscale 8>
++.002 > [ect0] W. 1:1(0) ack 1 <ECN e1b 1 ceb 0 e0b 1,nop>
diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_noopt.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_noopt.pkt
new file mode 100644
index 000000000000..f407c629a3f7
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_noopt.pkt
@@ -0,0 +1,27 @@
+// Test basic AccECN negotiation with option off using sysctl
+
+`./defaults.sh
+sysctl -q net.ipv4.tcp_ecn=3
+sysctl -q net.ipv4.tcp_ecn_option=0
+`
+
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+   +0 bind(3, ..., ...) = 0
+   +0 listen(3, 1) = 0
+
+   +0 < SEWA 0:0(0) win 32792 <mss 1050,nop,nop,sackOK,nop,wscale 8>
++.002 > SW. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8>
++0.05 < [ect0] W. 1:1(0) ack 1 win 320 <ECN e0b 1 ceb 0 e1b 1,nop>
++.002 accept(3, ..., ...) = 4
+
++0.01 %{ assert tcpi_delivered_ce == 0, tcpi_delivered_ce }%
+
++0.01 write(4, ..., 1000) = 1000
++.002 > [ect0] EAP. 1:1001(1000) ack 1
++0.05 < [ect0] EAP. 1:1(0) ack 1001 win 320
+
++0.01 %{ assert tcpi_delivered_ce == 0, tcpi_delivered_ce }%
+
++0.01 write(4, ..., 1000) = 1000
++.002 > [ect0] EAP. 1001:2001(1000) ack 1
diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_noprogress.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_noprogress.pkt
new file mode 100644
index 000000000000..32454e7187f9
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_noprogress.pkt
@@ -0,0 +1,27 @@
+// Test no progress filtering
+
+`./defaults.sh
+sysctl -q net.ipv4.tcp_ecn=3
+`
+
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+   +0 bind(3, ..., ...) = 0
+   +0 listen(3, 1) = 0
+
+   +0 < SEWA 0:0(0) win 32792 <mss 1050,nop,nop,sackOK,nop,wscale 8>
++.002 > SW. 0:0(0) ack 1 <mss 1460,ECN e1b 1 ceb 0 e0b 1,nop,nop,nop,sackOK,nop,wscale 8>
++0.05 < [ect0] W. 1:1(0) ack 1 win 264 <ECN e0b 1 ceb 0 e1b 1,nop>
++.002 accept(3, ..., ...) = 4
+
++0.01 %{ assert tcpi_delivered_ce == 0, tcpi_delivered_ce }%
+
++0.01 write(4, ..., 1000) = 1000
++.002 > [ect0] EAP. 1:1001(1000) ack 1 <ECN e1b 1 ceb 0 e0b 1,nop>
+  // Fake CE and claim no progress
++0.05 < [ect0] WA. 1:1(0) ack 1 win 264 <ECN e0b 1 ceb 1000 e1b 1,nop>
+
++0.01 %{
+assert tcpi_delivered_ce == 0, tcpi_delivered_ce
+assert tcpi_delivered_ce_bytes == 0, tcpi_delivered_ce_bytes
+}%
diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_notecn_then_accecn_syn.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_notecn_then_accecn_syn.pkt
new file mode 100644
index 000000000000..6597d5f2d778
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_notecn_then_accecn_syn.pkt
@@ -0,0 +1,28 @@
+// Test that SYN-ACK with ACE flags and without
+// ACE flags got dropped. Although we disable ECN,
+// we shouldn't consider this as blackholed as
+// these are dropped due to congestion
+
+`./defaults.sh
+sysctl -q net.ipv4.tcp_ecn=3
+sysctl -q net.ipv4.tcp_ecn_option=2
+`
+
++0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
++0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
++0 bind(3, ..., ...) = 0
++0 listen(3, 1) = 0
+
++0 < [noecn] S 0:0(0) win 32792 <mss 1460,nop,nop,sackOK,nop,wscale 8>
++.002 > [noecn] S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8>
+
+// Retransmit SYN
++0.1 < [ect0] SEWA 0:0(0) win 32792 <mss 1460,nop,nop,sackOK,nop,wscale 8>
++.002 > [noecn] S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8>
+
++0.1 < [noecn] . 1:1(0) ack 1 win 320
++.002 accept(3, ..., ...) = 4
+
+// Write with AccECN option but with ip-noecn since we received one SYN with ACE=0
++0.01 write(4, ..., 100) = 100
++.002 > [noecn] P. 1:101(100) ack 1
diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_rfc3168_to_fallback.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_rfc3168_to_fallback.pkt
new file mode 100644
index 000000000000..0f97dfcfa82d
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_rfc3168_to_fallback.pkt
@@ -0,0 +1,18 @@
+// Test RFC3168 fallback when sysctl asks for AccECN
+
+`./defaults.sh
+sysctl -q net.ipv4.tcp_ecn=3
+`
+
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+   +0 bind(3, ..., ...) = 0
+   +0 listen(3, 1) = 0
+
+   +0 < SEW 0:0(0) win 32792 <mss 1050,nop,nop,sackOK,nop,wscale 8>
++.002 > SE. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8>
++0.05 < . 1:1(0) ack 1 win 320
++.002 accept(3, ..., ...) = 4
+
++0.01 write(4, ..., 1000) = 1000
++.002 > [ect0] P. 1:1001(1000) ack 1
diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_rfc3168_to_rfc3168.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_rfc3168_to_rfc3168.pkt
new file mode 100644
index 000000000000..9baffdd66fe5
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_rfc3168_to_rfc3168.pkt
@@ -0,0 +1,18 @@
+// Test RFC3168 ECN when sysctl asks for RFC3168 ECN
+
+`./defaults.sh
+sysctl -q net.ipv4.tcp_ecn=1
+`
+
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+   +0 bind(3, ..., ...) = 0
+   +0 listen(3, 1) = 0
+
+   +0 < SEW 0:0(0) win 32792 <mss 1050,nop,nop,sackOK,nop,wscale 8>
++.002 > SE. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8>
++0.05 < . 1:1(0) ack 1 win 320
++.002 accept(3, ..., ...) = 4
+
++0.01 write(4, ..., 1000) = 1000
++.002 > [ect0] P. 1:1001(1000) ack 1
diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_sack_space_grab.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_sack_space_grab.pkt
new file mode 100644
index 000000000000..3fc56f9c6a6f
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_sack_space_grab.pkt
@@ -0,0 +1,28 @@
+// Test SACK space grab to fit AccECN option
+--tcp_ts_tick_usecs=1000
+
+`./defaults.sh
+sysctl -q net.ipv4.tcp_ecn=3
+`
+
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+   +0 bind(3, ..., ...) = 0
+   +0 listen(3, 1) = 0
+
+   +0 < SEWA 0:0(0) win 32792 <mss 1050,nop,nop,sackOK,nop,wscale 8>
++.002 > SW. 0:0(0) ack 1 <mss 1460,ECN e1b 1 ceb 0 e0b 1,nop,nop,nop,sackOK,nop,wscale 8>
++0.05 < [ect0] W. 1:1(0) ack 1 win 264 <ECN e0b 1 ceb 0 e1b 1,nop>
++.002 accept(3, ..., ...) = 4
+
++.01 < [ect1] EAP. 1001:2001(1000) ack 1 win 264
++0.002 > [ect0] EA. 1:1(0) ack 1 <ECN e1b 1001 ceb 0 e0b 1,nop,nop,nop,sack 1001:2001>
++.01 < [ect0] EAP. 3001:4001(1000) ack 1 win 264
++0.002 > [ect0] EA. 1:1(0) ack 1 <ECN e1b 1001 ceb 0 e0b 1001,nop,nop,nop,sack 3001:4001 1001:2001>
++.01 < [ce] EAP. 5001:6001(1000) ack 1 win 264
++0.002 > [ect0] WA. 1:1(0) ack 1 <ECN e1b 1001 ceb 1000 e0b 1001,nop,nop,nop,sack 5001:6001 3001:4001 1001:2001>
+// DSACK works?
++.01 < [ect0] EAP. 5001:6001(1000) ack 1 win 264
++0.002 > [ect0] WA. 1:1(0) ack 1 <ECN e1b 1001 ceb 1000 e0b 2001,nop,nop,nop,sack 5001:6001 5001:6001 3001:4001>
++.01 < [ect1] EAP. 6001:7001(1000) ack 1 win 264
++0.002 > [ect0] WA. 1:1(0) ack 1 <ECN e1b 2001 ceb 1000 e0b 2001,nop,nop,nop,sack 5001:7001 3001:4001 1001:2001>
diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_sack_space_grab_with_ts.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_sack_space_grab_with_ts.pkt
new file mode 100644
index 000000000000..1c075b5d81ae
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_sack_space_grab_with_ts.pkt
@@ -0,0 +1,39 @@
+// Test SACK space grab to fit AccECN option
+--tcp_ts_tick_usecs=1000
+
+`./defaults.sh
+sysctl -q net.ipv4.tcp_ecn=3
+`
+
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+   +0 bind(3, ..., ...) = 0
+   +0 listen(3, 1) = 0
+
+   +0 < SEWA 0:0(0) win 32792 <mss 1050,sackOK,TS val 1 ecr 0,nop,wscale 8>
++.002 > SW. 0:0(0) ack 1 <mss 1460,sackOK,TS val 100 ecr 1,ECN e1b 1 ceb 0 e0b 1,nop,nop,wscale 8>
++0.05 < [ect0] W. 1:1(0) ack 1 win 264 <nop,nop,TS val 2 ecr 100,ECN e0b 1 ceb 0 e1b 1,nop>
++.002 accept(3, ..., ...) = 4
+
+// One SACK block should allow all 3 AccECN fields:
++.01 < [ect1] EAP. 1001:2001(1000) ack 1 win 264 <nop,nop,TS val 3 ecr 100>
++0.002 > [ect0] EA. 1:1(0) ack 1 <nop,nop,TS val 160 ecr 2,ECN e1b 1001 ceb 0 e0b 1,nop,nop,nop,sack 1001:2001>
+
+// Two SACK blocks should fit w/ AccECN if we only need to use 2 AccECN fields: check ect1 arriving.
++.01 < [ect1] EAP. 3001:4001(1000) ack 1 win 264 <nop,nop,TS val 4 ecr 100>
++0.002 > [ect0] EA. 1:1(0) ack 1 <nop,nop,TS val 172 ecr 2,ECN e1b 2001 ceb 0,nop,nop,sack 3001:4001 1001:2001>
+
+// Two SACK blocks should fit w/ AccECN if we only need to use 2 AccECN fields: check CE arriving.
++.01 < [ce] EAP. 5001:6001(1000) ack 1 win 264 <nop,nop,TS val 5 ecr 100>
++0.002 > [ect0] WA. 1:1(0) ack 1 <nop,nop,TS val 184 ecr 2,ECN e1b 2001 ceb 1000,nop,nop,sack 5001:6001 3001:4001>
+
+// Check that DSACK works, using 2 SACK blocks in total, if we only need to use 2 AccECN fields: check ect1 arriving.
++.01 < [ect1] EAP. 5001:6001(1000) ack 1 win 264 <nop,nop,TS val 5 ecr 100>
++0.002 > [ect0] WA. 1:1(0) ack 1 <nop,nop,TS val 196 ecr 2,ECN e1b 3001 ceb 1000,nop,nop,sack 5001:6001 5001:6001>
+
+// Check the case where the AccECN option doesn't fit, because sending ect0
+// with order 1 would rquire 3 AccECN fields,
+// and TS (12 bytes) + 2 SACK blocks (20 bytes) + 3 AccECN fields (2 + 3*3 bytes) > 40 bytes.
+// That's OK; Linux TCP AccECN is optimized for the ECT1 case, not ECT0.
++.01 < [ect0] EAP. 6001:7001(1000) ack 1 win 264 <nop,nop,TS val 5 ecr 100>
++0.002 > [ect0] WA. 1:1(0) ack 1 <nop,nop,TS val 204 ecr 2,nop,nop,sack 5001:7001 3001:4001 1001:2001>
diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_serverside_accecn_disabled1.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_serverside_accecn_disabled1.pkt
new file mode 100644
index 000000000000..6b88ab78bfce
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_serverside_accecn_disabled1.pkt
@@ -0,0 +1,20 @@
+// Test against classic ECN server
+// Not-ECT on SYN and server sets 1|0|1 (AE is unused for classic ECN)
+
+`./defaults.sh
+sysctl -q net.ipv4.tcp_ecn=3
+`
+
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 4
++.002 ... 0.052 connect(4, ..., ...) = 0
+
++.002 > [noecn] SEWA 0:0(0) <mss 1460,sackOK,TS val 100 ecr 0,nop,wscale 8>
++0.05 < [noecn] SEA. 0:0(0) ack 1 win 32767 <mss 1460,sackOK,TS val 700 ecr 100,nop,wscale 8>
++.002 > [ect0] W. 1:1(0) ack 1 <nop, nop, TS val 200 ecr 700>
+
++0 write(4, ..., 100) = 100
++.002 > [ect0] P.5 1:101(100) ack 1 <nop,nop,TS val 300 ecr 700>
++0 close(4) = 0
+
++.002 > [ect0] F.5 101:101(0) ack 1 <nop,nop,TS val 400 ecr 700>
++0.1 < [noecn] R. 1:1(0) ack 102 win 4242
diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_serverside_accecn_disabled2.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_serverside_accecn_disabled2.pkt
new file mode 100644
index 000000000000..d24ada008ece
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_serverside_accecn_disabled2.pkt
@@ -0,0 +1,20 @@
+// Test against classic ECN server
+// Not-ECT on SYN and server sets 0|0|1
+
+`./defaults.sh
+sysctl -q net.ipv4.tcp_ecn=3
+`
+
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 4
++.002 ... 0.052 connect(4, ..., ...) = 0
+
++.002 > [noecn] SEWA 0:0(0) <mss 1460,sackOK,TS val 100 ecr 0,nop,wscale 8>
++0.05 < [noecn] SE. 0:0(0) ack 1 win 32767 <mss 1460,sackOK,TS val 700 ecr 100,nop,wscale 8>
++.002 > [noecn] . 1:1(0) ack 1 <nop, nop, TS val 200 ecr 700>
+
++0 write(4, ..., 100) = 100
++.002 > [ect0] P. 1:101(100) ack 1 <nop,nop,TS val 300 ecr 700>
++0 close(4) = 0
+
++0 > [noecn] F. 101:101(0) ack 1 <...>
++0.1 < R. 1:1(0) ack 102 win 4242
diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_serverside_broken.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_serverside_broken.pkt
new file mode 100644
index 000000000000..a20d7e890ee1
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_serverside_broken.pkt
@@ -0,0 +1,19 @@
+// Test against broken server (1|1|1)
+
+`./defaults.sh
+sysctl -q net.ipv4.tcp_ecn=3
+`
+
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 4
++.002 ... 0.052 connect(4, ..., ...) = 0
+
++.002 > [noecn] SEWA 0:0(0) <mss 1460,sackOK,TS val 100 ecr 0,nop,wscale 8>
++0.05 < [noecn] SEWA. 0:0(0) ack 1 win 32767 <mss 1460,sackOK,TS val 700 ecr 100,nop,wscale 8>
++.002 > [noecn] . 1:1(0) ack 1 <nop, nop, TS val 200 ecr 700>
+
++0 write(4, ..., 100) = 100
++.002 > [noecn] P. 1:101(100) ack 1 <nop,nop,TS val 300 ecr 700>
++0 close(4) = 0
+
++.002 > [noecn] F. 101:101(0) ack 1 <...>
++0.1 < [noecn] R. 1:1(0) ack 102 win 4242
diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_serverside_ecn_disabled.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_serverside_ecn_disabled.pkt
new file mode 100644
index 000000000000..428255bedab7
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_serverside_ecn_disabled.pkt
@@ -0,0 +1,19 @@
+// Test against Non ECN server (0|0|0)
+
+`./defaults.sh
+sysctl -q net.ipv4.tcp_ecn=3
+`
+
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 4
++.002 ... 0.052 connect(4, ..., ...) = 0
+
++.002 > [noecn] SEWA 0:0(0) <mss 1460,sackOK,TS val 100 ecr 0,nop,wscale 8>
++0.05 < [noecn] S. 0:0(0) ack 1 win 32767 <mss 1460,sackOK,TS val 700 ecr 100,nop,wscale 8>
++.002 > [noecn] . 1:1(0) ack 1 <nop, nop, TS val 200 ecr 700>
+
++0 write(4, ..., 100) = 100
++.002 > [noecn] P. 1:101(100) ack 1 <nop,nop,TS val 300 ecr 700>
++0 close(4) = 0
+
++.002 > [noecn] F. 101:101(0) ack 1 <nop,nop,TS val 400 ecr 700>
++0.1 < [noecn] R. 1:1(0) ack 102 win 4242
diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_serverside_only.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_serverside_only.pkt
new file mode 100644
index 000000000000..e9a5a0d3677c
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_serverside_only.pkt
@@ -0,0 +1,18 @@
+// Test AccECN with sysctl set to server-side only
+
+`./defaults.sh
+sysctl -q net.ipv4.tcp_ecn=5
+`
+
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+   +0 bind(3, ..., ...) = 0
+   +0 listen(3, 1) = 0
+
+   +0 < SEWA 0:0(0) win 32792 <mss 1050,nop,nop,sackOK,nop,wscale 8>
++.002 > SW. 0:0(0) ack 1 <mss 1460,ECN e1b 1 ceb 0 e0b 1,nop,nop,nop,sackOK,nop,wscale 8>
++0.05 < [ect0] W. 1:1(0) ack 1 win 320 <ECN e0b 1 ceb 0 e1b 1,nop>
++.002 accept(3, ..., ...) = 4
+
++0.01 write(4, ..., 1000) = 1000
++.002 > [ect0] EAP. 1:1001(1000) ack 1 <ECN e1b 1 ceb 0 e0b 1,nop>
diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_syn_ace_flags_acked_after_retransmit.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_syn_ace_flags_acked_after_retransmit.pkt
new file mode 100644
index 000000000000..412fa903105c
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_syn_ace_flags_acked_after_retransmit.pkt
@@ -0,0 +1,18 @@
+// Test that SYN with ACE flags was Acked
+// after 2nd retransmission. In this case,
+// since we got SYN-ACK that supports Accurate
+// ECN, we consider this as successful negotiation
+
+`./defaults.sh
+sysctl -q net.ipv4.tcp_ecn=3
+`
+
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 4
++.002 ... 2.1 connect(4, ..., ...) = 0
+
++.002 > [noecn] SEWA 0:0(0) <mss 1460,sackOK,TS val 100 ecr 0,nop,wscale 8>
++1~+1.1 > [noecn] SEWA 0:0(0) <mss 1460,sackOK,TS val 100 ecr 0,nop,wscale 8>
++1~+1.1 > [noecn] S 0:0(0) <mss 1460,sackOK,TS val 100 ecr 0,nop,wscale 8>
+
++0.1 < [noecn] SW. 0:0(0) ack 1 win 32767 <mss 1016,ECN e0b 1 ceb 0 e1b 1,nop,nop,nop,sackOK,nop,wscale 8>
++0~+0.01 > [ect0] W. 1:1(0) ack 1 <ECN e1b 1 ceb 0 e0b 1,nop>
diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_syn_ace_flags_drop.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_syn_ace_flags_drop.pkt
new file mode 100644
index 000000000000..4622754a2270
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_syn_ace_flags_drop.pkt
@@ -0,0 +1,16 @@
+// Test that SYN with ACE flags got dropped
+// We retry one more time with ACE and then
+// fallback to disabled ECN
+
+`./defaults.sh
+sysctl -q net.ipv4.tcp_ecn=3
+`
+
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 4
++.002 ... 2.1 connect(4, ..., ...) = 0
+
++.002 > [noecn] SEWA 0:0(0) <mss 1460,sackOK,TS val 100 ecr 0,nop,wscale 8>
++1~+1.1 > [noecn] SEWA 0:0(0) <mss 1460,sackOK,TS val 100 ecr 0,nop,wscale 8>
++1~+1.1 > [noecn] S 0:0(0) <mss 1460,sackOK,TS val 100 ecr 0,nop,wscale 8>
++0.1 < [noecn] S. 0:0(0) ack 1 win 32767 <mss 1460,nop,nop,sackOK,nop,wscale 8>
++0~+0.01 > [noecn] . 1:1(0) ack 1
diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_syn_ack_ace_flags_acked_after_retransmit.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_syn_ack_ace_flags_acked_after_retransmit.pkt
new file mode 100644
index 000000000000..ee15f108cafe
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_syn_ack_ace_flags_acked_after_retransmit.pkt
@@ -0,0 +1,27 @@
+// Test that SYN-ACK with ACE flags was Acked
+// after 2nd retransmission. In this case,
+// since we got the last ACK that supports Accurate
+// ECN, we consider this as successful negotiation
+
+`./defaults.sh
+sysctl -q net.ipv4.tcp_ecn=3
+sysctl -q net.ipv4.tcp_ecn_option=2
+`
+
++0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
++0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
++0 bind(3, ..., ...) = 0
++0 listen(3, 1) = 0
+
++0 < [noecn] SEWA 0:0(0) win 32792 <mss 1460,nop,nop,sackOK,nop,wscale 8>
++.002 > [noecn] SW. 0:0(0) ack 1 <mss 1460,ECN e1b 1 ceb 0 e0b 1,nop,nop,nop,sackOK,nop,wscale 8>
+
+// Retransmit SYN-ACK without option
++1~+1.1 > [noecn] SW. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8>
+
+// SYN-ACK maybe getting blackholed, disable ECN
++2~+2.2 > [noecn] S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8>
+
+// Received an ACK with ACE flags, state should be set to negotiation succeeded
++0.1 < [noecn] W. 1:1(0) ack 1 win 320 <ECN e0b 1 ceb 0 e1b 1,nop>
++.002 accept(3, ..., ...) = 4
diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_syn_ack_ace_flags_drop.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_syn_ack_ace_flags_drop.pkt
new file mode 100644
index 000000000000..ccfe353a8ee4
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_syn_ack_ace_flags_drop.pkt
@@ -0,0 +1,26 @@
+// Test that SYN-ACK with ACE flags got dropped
+// We retry one more time with ACE and then
+// fallback to disabled ECN
+
+`./defaults.sh
+sysctl -q net.ipv4.tcp_ecn=3
+sysctl -q net.ipv4.tcp_ecn_option=2
+`
+
++0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
++0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
++0 bind(3, ..., ...) = 0
++0 listen(3, 1) = 0
+
++0 < [noecn] SEWA 0:0(0) win 32792 <mss 1460,nop,nop,sackOK,nop,wscale 8>
++.002 > [noecn] SW. 0:0(0) ack 1 <mss 1460,ECN e1b 1 ceb 0 e0b 1,nop,nop,nop,sackOK,nop,wscale 8>
+
+// Retransmit SYN-ACK without option
++1~+1.1 > [noecn] SW. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8>
+
+// SYN-ACK maybe getting blackholed, disable ECN
++2~+2.2 > [noecn] S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8>
+
+// Received an ACK with no ACE flags, state should be set to blackholed
++0.1 < [noecn] . 1:1(0) ack 1 win 320
++0 accept(3, ..., ...) = 4
diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_syn_ce.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_syn_ce.pkt
new file mode 100644
index 000000000000..dc83f7a18180
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_syn_ce.pkt
@@ -0,0 +1,13 @@
+// Test AccECN ECN field reflector in SYNACK
+
+`./defaults.sh
+sysctl -q net.ipv4.tcp_ecn=3
+`
+
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+   +0 bind(3, ..., ...) = 0
+   +0 listen(3, 1) = 0
+
+   +0 < [ce] SEWA 0:0(0) win 32792 <mss 1050,nop,nop,sackOK,nop,wscale 8>
++.002 > SWA. 0:0(0) ack 1 <mss 1460,ECN e1b 1 ceb 0 e0b 1,nop,nop,nop,sackOK,nop,wscale 8>
diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_syn_ect0.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_syn_ect0.pkt
new file mode 100644
index 000000000000..e63a8d018c37
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_syn_ect0.pkt
@@ -0,0 +1,13 @@
+// Test AccECN ECN field reflector in SYNACK
+
+`./defaults.sh
+sysctl -q net.ipv4.tcp_ecn=3
+`
+
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+   +0 bind(3, ..., ...) = 0
+   +0 listen(3, 1) = 0
+
+   +0 < [ect0] SEWA 0:0(0) win 32792 <mss 1050,nop,nop,sackOK,nop,wscale 8>
++.002 > SA. 0:0(0) ack 1 <mss 1460,ECN e1b 1 ceb 0 e0b 1,nop,nop,nop,sackOK,nop,wscale 8>
diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_syn_ect1.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_syn_ect1.pkt
new file mode 100644
index 000000000000..23c0e43b3dbe
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_syn_ect1.pkt
@@ -0,0 +1,13 @@
+// Test AccECN ECN field reflector in SYNACK
+
+`./defaults.sh
+sysctl -q net.ipv4.tcp_ecn=3
+`
+
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+   +0 bind(3, ..., ...) = 0
+   +0 listen(3, 1) = 0
+
+   +0 < [ect1] SEWA 0:0(0) win 32792 <mss 1050,nop,nop,sackOK,nop,wscale 8>
++.002 > SEW. 0:0(0) ack 1 <mss 1460,ECN e1b 1 ceb 0 e0b 1,nop,nop,nop,sackOK,nop,wscale 8>
diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_synack_ce.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_synack_ce.pkt
new file mode 100644
index 000000000000..c3497738f680
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_synack_ce.pkt
@@ -0,0 +1,27 @@
+// Test SYNACK CE & received_ce update
+
+`./defaults.sh
+sysctl -q net.ipv4.tcp_ecn=3
+sysctl -q net.ipv4.tcp_ecn_option=2
+`
+
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 4
++.002 ... 0.052 connect(4, ..., ...) = 0
+
++.002 > [noecn] SEWA 0:0(0) <mss 1460,sackOK,TS val 100 ecr 0,nop,wscale 8>
++0.05 < [ce] SW. 0:0(0) ack 1 win 32767 <mss 1460,ECN e0b 1 ceb 0 e1b 1,nop,nop,nop,sackOK,nop,wscale 8>
++.002 > [ect0] WA. 1:1(0) ack 1 <ECN e1b 1 ceb 0 e0b 1,nop>
+
++0.01 write(4, ..., 100) = 100
++.002 > [ect0] P.6 1:101(100) ack 1 <ECN e1b 1 ceb 0 e0b 1,nop>
++0.05 < [ect0] P.5 1:101(100) ack 101 win 256 <ECN e0b 101 ceb 0 e1b 1,nop>
++.002 > [ect0] .6  101:101(0) ack 101 <ECN e1b 1 ceb 0 e0b 101,nop>
+
++0.01 write(4, ..., 100) = 100
++.002 > [ect0] P.6 101:201(100) ack 101 <ECN e1b 1 ceb 0 e0b 101,nop>
+
++0.1  < [ect1] P.5 201:301(100) ack 201 win 256 <ECN e0b 101 ceb 0 e1b 1,nop>
++.002 > [ect0] .6 201:201(0) ack 101 <ECN e1b 101 ceb 0 e0b 101,nop,nop,nop,sack 201:301>
+
++0.01 < [ce] .6 401:501(100) ack 201 win 256 <ECN e0b 101 ceb 0 e1b 1,nop>
++.002 > [ect0] .7 201:201(0) ack 101 <ECN e1b 101 ceb 100 e0b 101,nop,nop,nop,sack 401:501 201:301>
diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_synack_ce_updates_delivered_ce.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_synack_ce_updates_delivered_ce.pkt
new file mode 100644
index 000000000000..5fd77f466572
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_synack_ce_updates_delivered_ce.pkt
@@ -0,0 +1,22 @@
+// Reflected SYNACK CE mark increases delivered_ce
+
+`./defaults.sh
+sysctl -q net.ipv4.tcp_ecn=3
+sysctl -q net.ipv4.tcp_ecn_fallback=0
+`
+
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+   +0 bind(3, ..., ...) = 0
+   +0 listen(3, 1) = 0
+
++0.05 < SEWA 0:0(0) win 32767 <mss 1050,nop,nop,sackOK,nop,wscale 8>
++.002 > SW. 0:0(0) ack 1 <mss 1460,ECN e1b 1 ceb 0 e0b 1,nop,nop,nop,sackOK,nop,wscale 8>
+// Fake ce for prev, ECT validator must be disabled for this to work
++0.05 < [ect0] WA. 1:1(0) ack 1 win 264 <ECN e0b 1 ceb 0 e1b 1,nop>
++.002 accept(3, ..., ...) = 4
+
++0.01 %{ assert tcpi_delivered_ce == 1, tcpi_delivered_ce }%
+
++0.01 write(4, ..., 1000) = 1000
++.002 > [ect0] EAP. 1:1001(1000) ack 1 <ECN e1b 1 ceb 0 e0b 1,nop>
diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_synack_ect0.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_synack_ect0.pkt
new file mode 100644
index 000000000000..f6ad1ea5c0c4
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_synack_ect0.pkt
@@ -0,0 +1,24 @@
+// Test SYN=0 reflector
+
+`./defaults.sh
+sysctl -q net.ipv4.tcp_ecn=3
+`
+
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 4
++.002 ... 0.052 connect(4, ..., ...) = 0
+
++.002 > SEWA 0:0(0) <mss 1460,sackOK,TS val 100 ecr 0,nop,wscale 8>
++0.05 < [ect0] SW. 0:0(0) ack 1 win 32767 <mss 1460,ECN e0b 1 ceb 0 e1b 1,nop,nop,nop,sackOK,nop,wscale 8>
++.002 > [ect0] A. 1:1(0) ack 1 <ECN e1b 1 ceb 0 e0b 1,nop>
+
++0.01 write(4, ..., 100) = 100
++.002 > [ect0] P.5 1:101(100) ack 1 <ECN e1b 1 ceb 0 e0b 1,nop>
++0.05 < [ect0] P.5 1:1(0) ack 101 win 256 <ECN e0b 101 ceb 0 e1b 1,nop>
+
++0.01 < [ect0] P.5 1:101(100) ack 101 win 256 <ECN e0b 1 ceb 0 e1b 1,nop>
++.002 > [ect0] .5 101:101(0) ack 101 <ECN e1b 1 ceb 0 e0b 101,nop>
++0 read(4, ..., 100) = 100
+
++0 close(4) = 0
++0 > F.5 101:101(0) ack 101 <...>
++0.1 < R. 101:101(0) ack 102 win 4242
diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_synack_ect1.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_synack_ect1.pkt
new file mode 100644
index 000000000000..7ecfc5fb9dbb
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_synack_ect1.pkt
@@ -0,0 +1,24 @@
+// Test SYN=0 reflector
+
+`./defaults.sh
+sysctl -q net.ipv4.tcp_ecn=3
+`
+
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 4
++.002 ... 0.052 connect(4, ..., ...) = 0
+
++.002 > SEWA 0:0(0) <mss 1460,sackOK,TS val 100 ecr 0,nop,wscale 8>
++0.05 < [ect1] SW. 0:0(0) ack 1 win 32767 <mss 1460,ECN e0b 1 ceb 0 e1b 1,nop,nop,nop,sackOK,nop,wscale 8>
++.002 > [ect0] EW. 1:1(0) ack 1 <ECN e1b 1 ceb 0 e0b 1,nop>
+
++0.01 write(4, ..., 100) = 100
++.002 > [ect0] P.5 1:101(100) ack 1 <ECN e1b 1 ceb 0 e0b 1,nop>
++0.05 < [ect1] P.5 1:1(0) ack 101 win 256 <ECN e0b 101 ceb 0 e1b 1,nop>
+
++0.01 < [ect1] P.5 1:101(100) ack 101 win 256 <ECN e0b 1 ceb 0 e1b 1,nop>
++.002 > [ect0] .5 101:101(0) ack 101 <ECN e1b 101 ceb 0 e0b 1,nop>
++0 read(4, ..., 100) = 100
+
++0 close(4) = 0
++0 > F5. 101:101(0) ack 101 <...>
++0.1 < R. 101:101(0) ack 102 win 4242
diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_synack_rexmit.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_synack_rexmit.pkt
new file mode 100644
index 000000000000..9e0959782ef5
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_synack_rexmit.pkt
@@ -0,0 +1,15 @@
+// Test 3rd ACK flags when SYN-ACK is rexmitted
+
+`./defaults.sh
+sysctl -q net.ipv4.tcp_ecn=3
+`
+
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 4
++.002 ... 0.052 connect(4, ..., ...) = 0
+
++.002 > SEWA 0:0(0) <mss 1460,sackOK,TS val 100 ecr 0,nop,wscale 8>
++0.05 < SW. 0:0(0) ack 1 win 32767 <mss 1460,ECN e0b 1 ceb 0 e1b 1,nop,nop,nop,sackOK,nop,wscale 8>
++.002 > [ect0] W. 1:1(0) ack 1 <ECN e1b 1 ceb 0 e0b 1,nop>
+
++0.05 < SW. 0:0(0) ack 1 win 32767 <mss 1460,ECN e0b 1 ceb 0 e1b 1,nop,nop,nop,sackOK,nop,wscale 8>
++.002 > [ect0] W. 1:1(0) ack 1 <ECN e1b 1 ceb 0 e0b 1,nop>
diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_synack_rxmt.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_synack_rxmt.pkt
new file mode 100644
index 000000000000..a5a41633af07
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_synack_rxmt.pkt
@@ -0,0 +1,25 @@
+// Test that we retransmit SYN-ACK with ACE and without
+// AccECN options after
+// SYN-ACK was lost and TCP moved to TCPS_SYN_RECEIVED
+
+`./defaults.sh
+sysctl -q net.ipv4.tcp_ecn=3
+sysctl -q net.ipv4.tcp_ecn_option=2
+`
+
++0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
++0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
++0 bind(3, ..., ...) = 0
++0 listen(3, 1) = 0
+
++0 < [noecn] SEWA 0:0(0) win 32792 <mss 1460,nop,nop,sackOK,nop,wscale 8>
++.002 > [noecn] SW. 0:0(0) ack 1 <mss 1460,ECN e1b 1 ceb 0 e0b 1,nop,nop,nop,sackOK,nop,wscale 8>
+
+// Retransmit SYN-ACK without option
++1~+1.1 > [noecn] SW. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8>
++0.1 < [noecn] W. 1:1(0) ack 1 win 320 <ECN e0b 1 ceb 0 e1b 1,nop>
++.002 accept(3, ..., ...) = 4
+
+// We try to write with AccECN option
++0.01 write(4, ..., 100) = 100
++.002 > [ect0] P5. 1:101(100) ack 1 <ECN e1b 1 ceb 0 e0b 1,nop>
diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_tsnoprogress.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_tsnoprogress.pkt
new file mode 100644
index 000000000000..f3fe2f098966
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_tsnoprogress.pkt
@@ -0,0 +1,26 @@
+// Test TS progress filtering
+--tcp_ts_tick_usecs=1000
+--tolerance_usecs=7000
+
+`./defaults.sh
+sysctl -q net.ipv4.tcp_ecn=3
+`
+
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+   +0 bind(3, ..., ...) = 0
+   +0 listen(3, 1) = 0
+
+   +0 < SEWA 0:0(0) win 32792 <mss 1050,sackOK,TS val 1 ecr 0,nop,wscale 8>
++.002 > SW. 0:0(0) ack 1 <mss 1460,sackOK,TS val 10 ecr 1,ECN e1b 1 ceb 0 e0b 1,nop,nop,wscale 8>
++0.05 < [ect0] W. 1:1(0) ack 1 win 264 <nop,nop,TS val 2 ecr 10>
++.002 accept(3, ..., ...) = 4
+
++0.01 %{ assert tcpi_delivered_ce == 0, tcpi_delivered_ce }%
+
++0.01 write(4, ..., 1000) = 1000
++.002 > [ect0] EAP. 1:1001(1000) ack 1 <nop,nop,TS val 83 ecr 2>
+  // Fake CE and claim no progress
++0.05 < [ect0] WA. 1:1(0) ack 1 win 264 <nop,nop,TS val 2 ecr 83>
+
++0.01 %{ assert tcpi_delivered_ce == 0, tcpi_delivered_ce }%
diff --git a/tools/testing/selftests/net/packetdrill/tcp_accecn_tsprogress.pkt b/tools/testing/selftests/net/packetdrill/tcp_accecn_tsprogress.pkt
new file mode 100644
index 000000000000..1446799d2481
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_accecn_tsprogress.pkt
@@ -0,0 +1,25 @@
+// Test TS progress filtering
+--tcp_ts_tick_usecs=1000
+
+`./defaults.sh
+sysctl -q net.ipv4.tcp_ecn=3
+`
+
+    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+   +0 bind(3, ..., ...) = 0
+   +0 listen(3, 1) = 0
+
+   +0 < SEWA 0:0(0) win 32792 <mss 1050,sackOK,TS val 1 ecr 0,nop,wscale 8>
++.002 > SW. 0:0(0) ack 1 <mss 1460,sackOK,TS val 10 ecr 1,ECN e1b 1 ceb 0 e0b 1,nop,nop,wscale 8>
++0.05 < [ect0] W. 1:1(0) ack 1 win 264 <nop,nop,TS val 2 ecr 10>
++.002 accept(3, ..., ...) = 4
+
++0.01 %{ assert tcpi_delivered_ce == 0, tcpi_delivered_ce }%
+
++0.01 write(4, ..., 1000) = 1000
++.002 > [ect0] EAP. 1:1001(1000) ack 1 <nop,nop,TS val 83 ecr 2>
+  // Fake CE and claim no progress
++0.05 < [ect0] WA. 1:1(0) ack 1 win 264 <nop,nop,TS val 3 ecr 83>
+
++0.01 %{ assert tcpi_delivered_ce == 1, tcpi_delivered_ce }%
-- 
cgit v1.2.3


From 169343cc8ff2bd59758760d867bd26adae866a2b Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Fri, 30 Jan 2026 15:35:39 -0800
Subject: perf build: Remove NO_LIBCAP that controls nothing

Using libcap was removed in commit e25ebda78e230283 ("perf cap: Tidy up
and improve capability testing") and improve capability testing"),
however, some build documentation and a use of the NO_LIBCAP=1 were
lingering.

Remove these left over bits.

Fixes: e25ebda78e230283 ("perf cap: Tidy up and improve capability testing")
Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/Makefile.perf | 2 --
 tools/perf/tests/make    | 2 +-
 2 files changed, 1 insertion(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf
index b6edc8100c8e..2a7e5814b159 100644
--- a/tools/perf/Makefile.perf
+++ b/tools/perf/Makefile.perf
@@ -86,8 +86,6 @@ include ../scripts/utilities.mak
 #
 # Define NO_LIBBPF if you do not want BPF support
 #
-# Define NO_LIBCAP if you do not want process capabilities considered by perf
-#
 # Define NO_SDT if you do not want to define SDT event in perf tools,
 # note that it doesn't disable SDT scanning support.
 #
diff --git a/tools/perf/tests/make b/tools/perf/tests/make
index 767ad9e147a8..0b16c9c81c7f 100644
--- a/tools/perf/tests/make
+++ b/tools/perf/tests/make
@@ -121,7 +121,7 @@ make_minimal        += NO_DEMANGLE=1 NO_LIBELF=1 NO_BACKTRACE=1
 make_minimal        += NO_LIBNUMA=1 NO_LIBBIONIC=1 NO_LIBDW=1
 make_minimal        += NO_LIBBPF=1
 make_minimal        += NO_SDT=1 NO_JVMTI=1 NO_LIBZSTD=1
-make_minimal        += NO_LIBCAP=1 NO_CAPSTONE=1
+make_minimal        += NO_CAPSTONE=1
 
 # binutils 2_42 and newer have bfd_thread_init()
 new_libbfd := $(shell echo '#include <bfd.h>' | $(CC) -E -x c - | grep bfd_thread_init)
-- 
cgit v1.2.3


From b5c9bcde61b8cabf3b4194902374e62b8c8a4d41 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Fri, 30 Jan 2026 15:34:37 -0800
Subject: perf capstone: Support for dlopen-ing libcapstone.so

If perf is built with LIBCAPSTONE_DLOPEN=1, support dlopen-ing
libcapstone.so and then calling the necessary functions by looking them
up using dlsym.

The types come from capstone.h which means the libcapstone feature check
needs to pass, and NO_CAPSTONE=1 hasn't been defined. This will cause
the definition of HAVE_LIBCAPSTONE_SUPPORT.

Earlier versions of this code tried to declare the necessary
capstone.h constants and structs, but they weren't stable and caused
breakages across libcapstone releases.

Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Bill Wendling <morbo@google.com>
Cc: Charlie Jenkins <charlie@rivosinc.com>
Cc: Collin Funk <collin.funk1@gmail.com>
Cc: Dmitriy Vyukov <dvyukov@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Justin Stitt <justinstitt@google.com>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Nathan Chancellor <nathan@kernel.org>
Cc: Nick Desaulniers <nick.desaulniers+lkml@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/Makefile.config |   8 ++-
 tools/perf/tests/make      |   2 +
 tools/perf/util/Build      |   2 +-
 tools/perf/util/capstone.c | 176 +++++++++++++++++++++++++++++++++++----------
 tools/perf/util/capstone.h |  33 +++++++++
 5 files changed, 179 insertions(+), 42 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config
index 63ca9b2be663..e085d27f698a 100644
--- a/tools/perf/Makefile.config
+++ b/tools/perf/Makefile.config
@@ -1078,8 +1078,12 @@ ifndef NO_CAPSTONE
   $(call feature_check,libcapstone)
   ifeq ($(feature-libcapstone), 1)
     CFLAGS += -DHAVE_LIBCAPSTONE_SUPPORT $(LIBCAPSTONE_CFLAGS)
-    LDFLAGS += $(LICAPSTONE_LDFLAGS)
-    EXTLIBS += -lcapstone
+    ifdef LIBCAPSTONE_DLOPEN
+      CFLAGS += -DLIBCAPSTONE_DLOPEN
+    else
+      LDFLAGS += $(LIBCAPSTONE_LDFLAGS)
+      EXTLIBS += -lcapstone
+    endif
     $(call detected,CONFIG_LIBCAPSTONE)
   else
     msg := $(warning No libcapstone found, disables disasm engine support for 'perf script', please install libcapstone-dev/capstone-devel);
diff --git a/tools/perf/tests/make b/tools/perf/tests/make
index 0b16c9c81c7f..eb41516c0562 100644
--- a/tools/perf/tests/make
+++ b/tools/perf/tests/make
@@ -85,6 +85,7 @@ make_no_libdw       := NO_LIBDW=1
 make_libunwind      := LIBUNWIND=1
 make_no_backtrace   := NO_BACKTRACE=1
 make_no_libcapstone := NO_CAPSTONE=1
+make_libcapstone_dlopen := LIBCAPSTONE_DLOPEN=1
 make_no_libnuma     := NO_LIBNUMA=1
 make_no_libbionic   := NO_LIBBIONIC=1
 make_no_libbpf	    := NO_LIBBPF=1
@@ -159,6 +160,7 @@ run += make_libunwind
 run += make_no_libdw_dwarf_unwind
 run += make_no_backtrace
 run += make_no_libcapstone
+run += make_libcapstone_dlopen
 run += make_no_libnuma
 run += make_no_libbionic
 run += make_no_libbpf
diff --git a/tools/perf/util/Build b/tools/perf/util/Build
index b9925c6902ca..c037b1e99d28 100644
--- a/tools/perf/util/Build
+++ b/tools/perf/util/Build
@@ -11,7 +11,7 @@ perf-util-y += block-info.o
 perf-util-y += block-range.o
 perf-util-y += build-id.o
 perf-util-y += cacheline.o
-perf-util-y += capstone.o
+perf-util-$(CONFIG_LIBCAPSTONE) += capstone.o
 perf-util-y += config.o
 perf-util-y += copyfile.o
 perf-util-y += ctype.o
diff --git a/tools/perf/util/capstone.c b/tools/perf/util/capstone.c
index 9216916f848f..25cf6e15ec27 100644
--- a/tools/perf/util/capstone.c
+++ b/tools/perf/util/capstone.c
@@ -11,20 +11,137 @@
 #include "print_insn.h"
 #include "symbol.h"
 #include "thread.h"
+#include <dlfcn.h>
 #include <errno.h>
 #include <fcntl.h>
+#include <inttypes.h>
 #include <string.h>
 
-#ifdef HAVE_LIBCAPSTONE_SUPPORT
 #include <capstone/capstone.h>
+
+#ifdef LIBCAPSTONE_DLOPEN
+static void *perf_cs_dll_handle(void)
+{
+	static bool dll_handle_init;
+	static void *dll_handle;
+
+	if (!dll_handle_init) {
+		dll_handle_init = true;
+		dll_handle = dlopen("libcapstone.so", RTLD_LAZY);
+		if (!dll_handle)
+			pr_debug("dlopen failed for libcapstone.so\n");
+	}
+	return dll_handle;
+}
+#endif
+
+static enum cs_err perf_cs_open(enum cs_arch arch, enum cs_mode mode, csh *handle)
+{
+#ifndef LIBCAPSTONE_DLOPEN
+	return cs_open(arch, mode, handle);
+#else
+	static bool fn_init;
+	static enum cs_err (*fn)(enum cs_arch arch, enum cs_mode mode, csh *handle);
+
+	if (!fn_init) {
+		fn = dlsym(perf_cs_dll_handle(), "cs_open");
+		if (!fn)
+			pr_debug("dlsym failed for cs_open\n");
+		fn_init = true;
+	}
+	if (!fn)
+		return CS_ERR_HANDLE;
+	return fn(arch, mode, handle);
 #endif
+}
+
+static enum cs_err perf_cs_option(csh handle, enum cs_opt_type type, size_t value)
+{
+#ifndef LIBCAPSTONE_DLOPEN
+	return cs_option(handle, type, value);
+#else
+	static bool fn_init;
+	static enum cs_err (*fn)(csh handle, enum cs_opt_type type, size_t value);
+
+	if (!fn_init) {
+		fn = dlsym(perf_cs_dll_handle(), "cs_option");
+		if (!fn)
+			pr_debug("dlsym failed for cs_option\n");
+		fn_init = true;
+	}
+	if (!fn)
+		return CS_ERR_HANDLE;
+	return fn(handle, type, value);
+#endif
+}
+
+static size_t perf_cs_disasm(csh handle, const uint8_t *code, size_t code_size,
+			uint64_t address, size_t count, struct cs_insn **insn)
+{
+#ifndef LIBCAPSTONE_DLOPEN
+	return cs_disasm(handle, code, code_size, address, count, insn);
+#else
+	static bool fn_init;
+	static enum cs_err (*fn)(csh handle, const uint8_t *code, size_t code_size,
+				 uint64_t address, size_t count, struct cs_insn **insn);
+
+	if (!fn_init) {
+		fn = dlsym(perf_cs_dll_handle(), "cs_disasm");
+		if (!fn)
+			pr_debug("dlsym failed for cs_disasm\n");
+		fn_init = true;
+	}
+	if (!fn)
+		return CS_ERR_HANDLE;
+	return fn(handle, code, code_size, address, count, insn);
+#endif
+}
+
+static void perf_cs_free(struct cs_insn *insn, size_t count)
+{
+#ifndef LIBCAPSTONE_DLOPEN
+	cs_free(insn, count);
+#else
+	static bool fn_init;
+	static void (*fn)(struct cs_insn *insn, size_t count);
+
+	if (!fn_init) {
+		fn = dlsym(perf_cs_dll_handle(), "cs_free");
+		if (!fn)
+			pr_debug("dlsym failed for cs_free\n");
+		fn_init = true;
+	}
+	if (!fn)
+		return;
+	fn(insn, count);
+#endif
+}
+
+static enum cs_err perf_cs_close(csh *handle)
+{
+#ifndef LIBCAPSTONE_DLOPEN
+	return cs_close(handle);
+#else
+	static bool fn_init;
+	static enum cs_err (*fn)(csh *handle);
+
+	if (!fn_init) {
+		fn = dlsym(perf_cs_dll_handle(), "cs_close");
+		if (!fn)
+			pr_debug("dlsym failed for cs_close\n");
+		fn_init = true;
+	}
+	if (!fn)
+		return CS_ERR_HANDLE;
+	return fn(handle);
+#endif
+}
 
-#ifdef HAVE_LIBCAPSTONE_SUPPORT
 static int capstone_init(struct machine *machine, csh *cs_handle, bool is64,
 			 bool disassembler_style)
 {
-	cs_arch arch;
-	cs_mode mode;
+	enum cs_arch arch;
+	enum cs_mode mode;
 
 	if (machine__is(machine, "x86_64") && is64) {
 		arch = CS_ARCH_X86;
@@ -45,7 +162,7 @@ static int capstone_init(struct machine *machine, csh *cs_handle, bool is64,
 		return -1;
 	}
 
-	if (cs_open(arch, mode, cs_handle) != CS_ERR_OK) {
+	if (perf_cs_open(arch, mode, cs_handle) != CS_ERR_OK) {
 		pr_warning_once("cs_open failed\n");
 		return -1;
 	}
@@ -57,27 +174,25 @@ static int capstone_init(struct machine *machine, csh *cs_handle, bool is64,
 		 * is set via annotation args
 		 */
 		if (disassembler_style)
-			cs_option(*cs_handle, CS_OPT_SYNTAX, CS_OPT_SYNTAX_ATT);
+			perf_cs_option(*cs_handle, CS_OPT_SYNTAX, CS_OPT_SYNTAX_ATT);
 		/*
 		 * Resolving address operands to symbols is implemented
 		 * on x86 by investigating instruction details.
 		 */
-		cs_option(*cs_handle, CS_OPT_DETAIL, CS_OPT_ON);
+		perf_cs_option(*cs_handle, CS_OPT_DETAIL, CS_OPT_ON);
 	}
 
 	return 0;
 }
-#endif
 
-#ifdef HAVE_LIBCAPSTONE_SUPPORT
-static size_t print_insn_x86(struct thread *thread, u8 cpumode, cs_insn *insn,
+static size_t print_insn_x86(struct thread *thread, u8 cpumode, struct cs_insn *insn,
 			     int print_opts, FILE *fp)
 {
 	struct addr_location al;
 	size_t printed = 0;
 
 	if (insn->detail && insn->detail->x86.op_count == 1) {
-		cs_x86_op *op = &insn->detail->x86.operands[0];
+		struct cs_x86_op *op = &insn->detail->x86.operands[0];
 
 		addr_location__init(&al);
 		if (op->type == X86_OP_IMM &&
@@ -95,7 +210,6 @@ static size_t print_insn_x86(struct thread *thread, u8 cpumode, cs_insn *insn,
 	printed += fprintf(fp, "%s %s", insn[0].mnemonic, insn[0].op_str);
 	return printed;
 }
-#endif
 
 
 ssize_t capstone__fprintf_insn_asm(struct machine *machine __maybe_unused,
@@ -106,9 +220,8 @@ ssize_t capstone__fprintf_insn_asm(struct machine *machine __maybe_unused,
 				   uint64_t ip __maybe_unused, int *lenp __maybe_unused,
 				   int print_opts __maybe_unused, FILE *fp __maybe_unused)
 {
-#ifdef HAVE_LIBCAPSTONE_SUPPORT
 	size_t printed;
-	cs_insn *insn;
+	struct cs_insn *insn;
 	csh cs_handle;
 	size_t count;
 	int ret;
@@ -118,7 +231,7 @@ ssize_t capstone__fprintf_insn_asm(struct machine *machine __maybe_unused,
 	if (ret < 0)
 		return ret;
 
-	count = cs_disasm(cs_handle, code, code_size, ip, 1, &insn);
+	count = perf_cs_disasm(cs_handle, code, code_size, ip, 1, &insn);
 	if (count > 0) {
 		if (machine__normalized_is(machine, "x86"))
 			printed = print_insn_x86(thread, cpumode, &insn[0], print_opts, fp);
@@ -126,20 +239,16 @@ ssize_t capstone__fprintf_insn_asm(struct machine *machine __maybe_unused,
 			printed = fprintf(fp, "%s %s", insn[0].mnemonic, insn[0].op_str);
 		if (lenp)
 			*lenp = insn->size;
-		cs_free(insn, count);
+		perf_cs_free(insn, count);
 	} else {
 		printed = -1;
 	}
 
-	cs_close(&cs_handle);
+	perf_cs_close(&cs_handle);
 	return printed;
-#else
-	return -1;
-#endif
 }
 
-#ifdef HAVE_LIBCAPSTONE_SUPPORT
-static void print_capstone_detail(cs_insn *insn, char *buf, size_t len,
+static void print_capstone_detail(struct cs_insn *insn, char *buf, size_t len,
 				  struct annotate_args *args, u64 addr)
 {
 	int i;
@@ -154,7 +263,7 @@ static void print_capstone_detail(cs_insn *insn, char *buf, size_t len,
 		return;
 
 	for (i = 0; i < insn->detail->x86.op_count; i++) {
-		cs_x86_op *op = &insn->detail->x86.operands[i];
+		struct cs_x86_op *op = &insn->detail->x86.operands[i];
 		u64 orig_addr;
 
 		if (op->type != X86_OP_MEM)
@@ -195,9 +304,7 @@ static void print_capstone_detail(cs_insn *insn, char *buf, size_t len,
 		break;
 	}
 }
-#endif
 
-#ifdef HAVE_LIBCAPSTONE_SUPPORT
 struct find_file_offset_data {
 	u64 ip;
 	u64 offset;
@@ -214,13 +321,11 @@ static int find_file_offset(u64 start, u64 len, u64 pgoff, void *arg)
 	}
 	return 0;
 }
-#endif
 
 int symbol__disassemble_capstone(const char *filename __maybe_unused,
 				 struct symbol *sym __maybe_unused,
 				 struct annotate_args *args __maybe_unused)
 {
-#ifdef HAVE_LIBCAPSTONE_SUPPORT
 	struct annotation *notes = symbol__annotation(sym);
 	struct map *map = args->ms->map;
 	struct dso *dso = map__dso(map);
@@ -235,7 +340,7 @@ int symbol__disassemble_capstone(const char *filename __maybe_unused,
 	const u8 *buf;
 	u64 buf_len;
 	csh handle;
-	cs_insn *insn = NULL;
+	struct cs_insn *insn = NULL;
 	char disasm_buf[512];
 	struct disasm_line *dl;
 	bool disassembler_style = false;
@@ -274,7 +379,7 @@ int symbol__disassemble_capstone(const char *filename __maybe_unused,
 
 	needs_cs_close = true;
 
-	free_count = count = cs_disasm(handle, buf, buf_len, start, buf_len, &insn);
+	free_count = count = perf_cs_disasm(handle, buf, buf_len, start, buf_len, &insn);
 	for (i = 0, offset = 0; i < count; i++) {
 		int printed;
 
@@ -313,9 +418,9 @@ int symbol__disassemble_capstone(const char *filename __maybe_unused,
 
 out:
 	if (needs_cs_close) {
-		cs_close(&handle);
+		perf_cs_close(&handle);
 		if (free_count > 0)
-			cs_free(insn, free_count);
+			perf_cs_free(insn, free_count);
 	}
 	free(code_buf);
 	return count < 0 ? count : 0;
@@ -335,16 +440,12 @@ err:
 	}
 	count = -1;
 	goto out;
-#else
-	return -1;
-#endif
 }
 
 int symbol__disassemble_capstone_powerpc(const char *filename __maybe_unused,
 					 struct symbol *sym __maybe_unused,
 					 struct annotate_args *args __maybe_unused)
 {
-#ifdef HAVE_LIBCAPSTONE_SUPPORT
 	struct annotation *notes = symbol__annotation(sym);
 	struct map *map = args->ms->map;
 	struct dso *dso = map__dso(map);
@@ -458,7 +559,7 @@ int symbol__disassemble_capstone_powerpc(const char *filename __maybe_unused,
 
 out:
 	if (needs_cs_close)
-		cs_close(&handle);
+		perf_cs_close(&handle);
 	free(buf);
 	return count < 0 ? count : 0;
 
@@ -467,7 +568,4 @@ err:
 		close(fd);
 	count = -1;
 	goto out;
-#else
-	return -1;
-#endif
 }
diff --git a/tools/perf/util/capstone.h b/tools/perf/util/capstone.h
index 0f030ea034b6..7c0baaa01a73 100644
--- a/tools/perf/util/capstone.h
+++ b/tools/perf/util/capstone.h
@@ -6,6 +6,7 @@
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
+#include <linux/compiler.h>
 #include <linux/types.h>
 
 struct annotate_args;
@@ -13,6 +14,7 @@ struct machine;
 struct symbol;
 struct thread;
 
+#ifdef HAVE_LIBCAPSTONE_SUPPORT
 ssize_t capstone__fprintf_insn_asm(struct machine *machine, struct thread *thread, u8 cpumode,
 				   bool is64bit, const uint8_t *code, size_t code_size,
 				   uint64_t ip, int *lenp, int print_opts, FILE *fp);
@@ -21,4 +23,35 @@ int symbol__disassemble_capstone(const char *filename, struct symbol *sym,
 int symbol__disassemble_capstone_powerpc(const char *filename, struct symbol *sym,
 					 struct annotate_args *args);
 
+#else /* !HAVE_LIBCAPSTONE_SUPPORT */
+static inline ssize_t capstone__fprintf_insn_asm(struct machine *machine __maybe_unused,
+						 struct thread *thread __maybe_unused,
+						 u8 cpumode __maybe_unused,
+						 bool is64bit __maybe_unused,
+						 const uint8_t *code __maybe_unused,
+						 size_t code_size __maybe_unused,
+						 uint64_t ip __maybe_unused,
+						 int *lenp __maybe_unused,
+						 int print_opts __maybe_unused,
+						 FILE *fp __maybe_unused)
+{
+	return -1;
+}
+
+static inline int symbol__disassemble_capstone(const char *filename __maybe_unused,
+					       struct symbol *sym __maybe_unused,
+					       struct annotate_args *args __maybe_unused)
+{
+	return -1;
+}
+
+static inline int symbol__disassemble_capstone_powerpc(const char *filename __maybe_unused,
+						       struct symbol *sym __maybe_unused,
+						       struct annotate_args *args __maybe_unused)
+{
+	return -1;
+}
+
+#endif /* HAVE_LIBCAPSTONE_SUPPORT */
+
 #endif /* __PERF_CAPSTONE_H */
-- 
cgit v1.2.3


From b0388bafa4949bd30af7b3be5ee415f2a25ac014 Mon Sep 17 00:00:00 2001
From: Puranjay Mohan <puranjay@kernel.org>
Date: Tue, 3 Feb 2026 08:51:00 -0800
Subject: bpf: Relax scalar id equivalence for state pruning

Scalar register IDs are used by the verifier to track relationships
between registers and enable bounds propagation across those
relationships. Once an ID becomes singular (i.e. only a single
register/stack slot carries it), it can no longer contribute to bounds
propagation and effectively becomes stale. The previous commit makes the
verifier clear such ids before caching the state.

When comparing the current and cached states for pruning, these stale
IDs can cause technically equivalent states to be considered different
and thus prevent pruning.

For example, in the selftest added in the next commit, two registers -
r6 and r7 are not linked to any other registers and get cached with
id=0, in the current state, they are both linked to each other with
id=A.  Before this commit, check_scalar_ids would give temporary ids to
r6 and r7 (say tid1 and tid2) and then check_ids() would map tid1->A,
and when it would see tid2->A, it would not consider these state
equivalent.

Relax scalar ID equivalence by treating rold->id == 0 as "independent":
if the old state did not rely on any ID relationships for a register,
then any ID/linking present in the current state only adds constraints
and is always safe to accept for pruning. Implement this by returning
true immediately in check_scalar_ids() when old_id == 0.

Maintain correctness for the opposite direction (old_id != 0 && cur_id
== 0) by still allocating a temporary ID for cur_id == 0. This avoids
incorrectly allowing multiple independent current registers (id==0) to
satisfy a single linked old ID during mapping.

Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
Link: https://lore.kernel.org/r/20260203165102.2302462-5-puranjay@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c                              | 63 +++++++++++++++++-----
 .../selftests/bpf/progs/verifier_scalar_ids.c      |  8 +--
 2 files changed, 56 insertions(+), 15 deletions(-)

(limited to 'tools')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 80dc01350c77..da03bbbc1620 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -19387,13 +19387,29 @@ static bool check_ids(u32 old_id, u32 cur_id, struct bpf_idmap *idmap)
 	return false;
 }
 
-/* Similar to check_ids(), but allocate a unique temporary ID
- * for 'old_id' or 'cur_id' of zero.
- * This makes pairs like '0 vs unique ID', 'unique ID vs 0' valid.
+/*
+ * Compare scalar register IDs for state equivalence.
+ *
+ * When old_id == 0, the old register is independent - not linked to any
+ * other register. Any linking in the current state only adds constraints,
+ * making it more restrictive. Since the old state didn't rely on any ID
+ * relationships for this register, it's always safe to accept cur regardless
+ * of its ID. Hence, return true immediately.
+ *
+ * When old_id != 0 but cur_id == 0, we need to ensure that different
+ * independent registers in cur don't incorrectly satisfy the ID matching
+ * requirements of linked registers in old.
+ *
+ * Example: if old has r6.id=X and r7.id=X (linked), but cur has r6.id=0
+ * and r7.id=0 (both independent), without temp IDs both would map old_id=X
+ * to cur_id=0 and pass. With temp IDs: r6 maps X->temp1, r7 tries to map
+ * X->temp2, but X is already mapped to temp1, so the check fails correctly.
  */
 static bool check_scalar_ids(u32 old_id, u32 cur_id, struct bpf_idmap *idmap)
 {
-	old_id = old_id ? old_id : ++idmap->tmp_id_gen;
+	if (!old_id)
+		return true;
+
 	cur_id = cur_id ? cur_id : ++idmap->tmp_id_gen;
 
 	return check_ids(old_id, cur_id, idmap);
@@ -19618,11 +19634,21 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
 		}
 		if (!rold->precise && exact == NOT_EXACT)
 			return true;
-		if ((rold->id & BPF_ADD_CONST) != (rcur->id & BPF_ADD_CONST))
-			return false;
-		if ((rold->id & BPF_ADD_CONST) && (rold->off != rcur->off))
-			return false;
-		/* Why check_ids() for scalar registers?
+		/*
+		 * Linked register tracking uses rold->id to detect relationships.
+		 * When rold->id == 0, the register is independent and any linking
+		 * in rcur only adds constraints. When rold->id != 0, we must verify
+		 * id mapping and (for BPF_ADD_CONST) offset consistency.
+		 *
+		 * +------------------+-----------+------------------+---------------+
+		 * |                  | rold->id  | rold + ADD_CONST | rold->id == 0 |
+		 * |------------------+-----------+------------------+---------------|
+		 * | rcur->id         | range,ids | false            | range         |
+		 * | rcur + ADD_CONST | false     | range,ids,off    | range         |
+		 * | rcur->id == 0    | range,ids | false            | range         |
+		 * +------------------+-----------+------------------+---------------+
+		 *
+		 * Why check_ids() for scalar registers?
 		 *
 		 * Consider the following BPF code:
 		 *   1: r6 = ... unbound scalar, ID=a ...
@@ -19646,9 +19672,22 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
 		 * ---
 		 * Also verify that new value satisfies old value range knowledge.
 		 */
-		return range_within(rold, rcur) &&
-		       tnum_in(rold->var_off, rcur->var_off) &&
-		       check_scalar_ids(rold->id, rcur->id, idmap);
+
+		/* ADD_CONST mismatch: different linking semantics */
+		if ((rold->id & BPF_ADD_CONST) && !(rcur->id & BPF_ADD_CONST))
+			return false;
+
+		if (rold->id && !(rold->id & BPF_ADD_CONST) && (rcur->id & BPF_ADD_CONST))
+			return false;
+
+		/* Both have offset linkage: offsets must match */
+		if ((rold->id & BPF_ADD_CONST) && rold->off != rcur->off)
+			return false;
+
+		if (!check_scalar_ids(rold->id, rcur->id, idmap))
+			return false;
+
+		return range_within(rold, rcur) && tnum_in(rold->var_off, rcur->var_off);
 	case PTR_TO_MAP_KEY:
 	case PTR_TO_MAP_VALUE:
 	case PTR_TO_MEM:
diff --git a/tools/testing/selftests/bpf/progs/verifier_scalar_ids.c b/tools/testing/selftests/bpf/progs/verifier_scalar_ids.c
index c0ce690ddb68..c8f8820336b7 100644
--- a/tools/testing/selftests/bpf/progs/verifier_scalar_ids.c
+++ b/tools/testing/selftests/bpf/progs/verifier_scalar_ids.c
@@ -723,9 +723,9 @@ __success __log_level(2)
 /* The exit instruction should be reachable from two states,
  * use two matches and "processed .. insns" to ensure this.
  */
-__msg("13: (95) exit")
-__msg("13: (95) exit")
-__msg("processed 18 insns")
+__msg("15: (95) exit")
+__msg("15: (95) exit")
+__msg("processed 20 insns")
 __flag(BPF_F_TEST_STATE_FREQ)
 __naked void two_old_ids_one_cur_id(void)
 {
@@ -734,9 +734,11 @@ __naked void two_old_ids_one_cur_id(void)
 	"call %[bpf_ktime_get_ns];"
 	"r0 &= 0xff;"
 	"r6 = r0;"
+	"r8 = r0;"
 	"call %[bpf_ktime_get_ns];"
 	"r0 &= 0xff;"
 	"r7 = r0;"
+	"r9 = r0;"
 	"r0 = 0;"
 	/* Maybe make r{6,7} IDs identical */
 	"if r6 > r7 goto l0_%=;"
-- 
cgit v1.2.3


From f6ef5584ccb5683542abb38461970e969b580fba Mon Sep 17 00:00:00 2001
From: Puranjay Mohan <puranjay@kernel.org>
Date: Tue, 3 Feb 2026 08:51:01 -0800
Subject: selftests/bpf: Add a test for ids=0 to verifier_scalar_ids test

Test that two registers with their id=0 (unlinked) in the cached state
can be mapped to a single id (linked) in the current state.

Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
Link: https://lore.kernel.org/r/20260203165102.2302462-6-puranjay@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 .../selftests/bpf/progs/verifier_scalar_ids.c      | 45 ++++++++++++++++++++++
 1 file changed, 45 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/progs/verifier_scalar_ids.c b/tools/testing/selftests/bpf/progs/verifier_scalar_ids.c
index c8f8820336b7..3072fee9a448 100644
--- a/tools/testing/selftests/bpf/progs/verifier_scalar_ids.c
+++ b/tools/testing/selftests/bpf/progs/verifier_scalar_ids.c
@@ -715,6 +715,51 @@ __naked void ignore_unique_scalar_ids_old(void)
 	: __clobber_all);
 }
 
+/* Check that two registers with 0 scalar IDs in a verified state can be mapped
+ * to the same scalar ID in current state.
+ */
+SEC("socket")
+__success __log_level(2)
+/* The states should be equivalent on reaching insn 12.
+ */
+__msg("12: safe")
+__msg("processed 17 insns")
+__flag(BPF_F_TEST_STATE_FREQ)
+__naked void two_nil_old_ids_one_cur_id(void)
+{
+	asm volatile (
+	/* Give unique scalar IDs to r{6,7} */
+	"call %[bpf_ktime_get_ns];"
+	"r0 &= 0xff;"
+	"r6 = r0;"
+	"r6 *= 1;"
+	"call %[bpf_ktime_get_ns];"
+	"r0 &= 0xff;"
+	"r7 = r0;"
+	"r7 *= 1;"
+	"r0 = 0;"
+	/* Maybe make r{6,7} IDs identical */
+	"if r6 > r7 goto l0_%=;"
+	"goto l1_%=;"
+"l0_%=:"
+	"r6 = r7;"
+"l1_%=:"
+	/* Mark r{6,7} precise.
+	 * Get here in two states:
+	 * - first:  r6{.id=0}, r7{.id=0} (cached state)
+	 * - second: r6{.id=A}, r7{.id=A}
+	 * Verifier considers such states equivalent.
+	 * Thus "exit;" would be verified only once.
+	 */
+	"r2 = r10;"
+	"r2 += r6;"
+	"r2 += r7;"
+	"exit;"
+	:
+	: __imm(bpf_ktime_get_ns)
+	: __clobber_all);
+}
+
 /* Check that two different scalar IDs in a verified state can't be
  * mapped to the same scalar ID in current state.
  */
-- 
cgit v1.2.3


From 954fa97e215ea8fb1fe70d117d25875f3d3938ea Mon Sep 17 00:00:00 2001
From: Emil Tsalapatis <emil@etsalapatis.com>
Date: Tue, 3 Feb 2026 13:04:22 -0500
Subject: selftests/bpf: Add selftests for bpf_stream_print_stack

Add selftests for the new bpf_stream_print_stack kfunc.

Signed-off-by: Emil Tsalapatis <emil@etsalapatis.com>
Acked-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20260203180424.14057-3-emil@etsalapatis.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/progs/stream.c | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/progs/stream.c b/tools/testing/selftests/bpf/progs/stream.c
index 4a5bd852f10c..f63b378de090 100644
--- a/tools/testing/selftests/bpf/progs/stream.c
+++ b/tools/testing/selftests/bpf/progs/stream.c
@@ -234,4 +234,25 @@ int stream_arena_callback_fault(void *ctx)
 	return 0;
 }
 
+SEC("syscall")
+__arch_x86_64
+__arch_arm64
+__success __retval(0)
+__stderr("CPU: {{[0-9]+}} UID: 0 PID: {{[0-9]+}} Comm: {{.*}}")
+__stderr("Call trace:\n"
+"{{([a-zA-Z_][a-zA-Z0-9_]*\\+0x[0-9a-fA-F]+/0x[0-9a-fA-F]+\n"
+"|[ \t]+[^\n]+\n)*}}")
+int stream_print_stack_kfunc(void *ctx)
+{
+	return bpf_stream_print_stack(BPF_STDERR);
+}
+
+SEC("syscall")
+__success __retval(-2)
+int stream_print_stack_invalid_id(void *ctx)
+{
+	/* Try to pass an invalid stream ID. */
+	return bpf_stream_print_stack((enum bpf_stream_id)0xbadcafe);
+}
+
 char _license[] SEC("license") = "GPL";
-- 
cgit v1.2.3


From 4d99137eea48b18387d8d17443e28d124177ab7b Mon Sep 17 00:00:00 2001
From: Emil Tsalapatis <emil@etsalapatis.com>
Date: Tue, 3 Feb 2026 13:04:24 -0500
Subject: selftests/bpf: Add selftests for stream functions under lock

Add a selftest to ensure BPF stream functions can now be called
while holding a lock.

Signed-off-by: Emil Tsalapatis <emil@etsalapatis.com>
Acked-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20260203180424.14057-5-emil@etsalapatis.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/progs/stream.c | 32 ++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/progs/stream.c b/tools/testing/selftests/bpf/progs/stream.c
index f63b378de090..6f999ba951a3 100644
--- a/tools/testing/selftests/bpf/progs/stream.c
+++ b/tools/testing/selftests/bpf/progs/stream.c
@@ -42,6 +42,10 @@ int size;
 u64 fault_addr;
 void *arena_ptr;
 
+#define private(name) SEC(".bss." #name) __hidden __attribute__((aligned(8)))
+
+private(STREAM) struct bpf_spin_lock block;
+
 SEC("syscall")
 __success __retval(0)
 int stream_exhaust(void *ctx)
@@ -255,4 +259,32 @@ int stream_print_stack_invalid_id(void *ctx)
 	return bpf_stream_print_stack((enum bpf_stream_id)0xbadcafe);
 }
 
+SEC("syscall")
+__arch_x86_64
+__arch_arm64
+__success __retval(0)
+__stdout(_STR)
+__stderr("CPU: {{[0-9]+}} UID: 0 PID: {{[0-9]+}} Comm: {{.*}}")
+__stderr("Call trace:\n"
+"{{([a-zA-Z_][a-zA-Z0-9_]*\\+0x[0-9a-fA-F]+/0x[0-9a-fA-F]+\n"
+"|[ \t]+[^\n]+\n)*}}")
+int stream_print_kfuncs_locked(void *ctx)
+{
+	int ret;
+
+	bpf_spin_lock(&block);
+
+	ret = bpf_stream_printk(BPF_STDOUT, _STR);
+	if (ret)
+		goto out;
+
+	ret = bpf_stream_print_stack(BPF_STDERR);
+
+out:
+	bpf_spin_unlock(&block);
+
+	return ret;
+}
+
+
 char _license[] SEC("license") = "GPL";
-- 
cgit v1.2.3


From d2ac7e4418dd2db7c512a658eb5b3d93650baacd Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Sat, 31 Jan 2026 12:02:19 -0800
Subject: perf test kvm: Add stat live testing

Ensure the `perf kvm stat live -p ..` has some basic functionality.

Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Aditya Bodkhe <aditya.b1@linux.ibm.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Alexandre Ghiti <alex@ghiti.fr>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Andrew Jones <ajones@ventanamicro.com>
Cc: Anubhav Shelat <ashelat@redhat.com>
Cc: Anup Patel <anup@brainfault.org>
Cc: Athira Rajeev <atrajeev@linux.ibm.com>
Cc: Blake Jones <blakejones@google.com>
Cc: Chun-Tse Shao <ctshao@google.com>
Cc: Dapeng Mi <dapeng1.mi@linux.intel.com>
Cc: Dmitriy Vyukov <dvyukov@google.com>
Cc: Howard Chu <howardchu95@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Leo Yan <leo.yan@linux.dev>
Cc: Mike Leach <mike.leach@linaro.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <pjw@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Quan Zhou <zhouquan@iscas.ac.cn>
Cc: Shimin Guo <shimin.guo@skydio.com>
Cc: Swapnil Sapkal <swapnil.sapkal@amd.com>
Cc: Thomas Falcon <thomas.falcon@intel.com>
Cc: Will Deacon <will@kernel.org>
Cc: Yunseong Kim <ysk@kzalloc.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/tests/shell/kvm.sh | 30 +++++++++++++++++++++++++++++-
 1 file changed, 29 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/perf/tests/shell/kvm.sh b/tools/perf/tests/shell/kvm.sh
index 2a399b83fe80..f88e859025c4 100755
--- a/tools/perf/tests/shell/kvm.sh
+++ b/tools/perf/tests/shell/kvm.sh
@@ -7,9 +7,10 @@ set -e
 err=0
 perfdata=$(mktemp /tmp/__perf_kvm_test.perf.data.XXXXX)
 qemu_pid_file=$(mktemp /tmp/__perf_kvm_test.qemu.pid.XXXXX)
+log_file=$(mktemp /tmp/__perf_kvm_test.live_log.XXXXX)
 
 cleanup() {
-	rm -f "${perfdata}"
+	rm -f "${perfdata}" "${log_file}"
 	if [ -f "${qemu_pid_file}" ]; then
 		if [ -s "${qemu_pid_file}" ]; then
 			qemu_pid=$(cat "${qemu_pid_file}")
@@ -96,6 +97,32 @@ test_kvm_buildid_list() {
 	echo "perf kvm buildid-list test [Success]"
 }
 
+test_kvm_stat_live() {
+	echo "Testing perf kvm stat live"
+
+        # Run perf kvm live for 5 seconds, monitoring that PID
+	# Use sleep to keep stdin open but silent, preventing EOF loop or interactive spam
+	if ! sleep 10 | timeout 5s perf kvm stat live -p "${qemu_pid}" > "${log_file}" 2>&1; then
+		retval=$?
+		if [ $retval -ne 124 ] && [ $retval -ne 0 ]; then
+			echo "perf kvm stat live [Failed: perf kvm stat live failed to start or run (ret=$retval)]"
+			head -n 50 "${log_file}"
+			err=1
+			return
+		fi
+	fi
+
+	# Check for some sample data (percentage)
+	if ! grep -E -q "[0-9]+\.[0-9]+%" "${log_file}"; then
+		echo "perf kvm stat live [Failed: no sample percentage found]"
+		head -n 50 "${log_file}"
+		err=1
+		return
+	fi
+
+	echo "perf kvm stat live test [Success]"
+}
+
 setup_qemu() {
 	# Find qemu
 	if [ "$(uname -m)" = "x86_64" ]; then
@@ -148,6 +175,7 @@ if [ $err -eq 0 ]; then
 	test_kvm_stat
 	test_kvm_record_report
 	test_kvm_buildid_list
+	test_kvm_stat_live
 fi
 
 cleanup
-- 
cgit v1.2.3


From 8c5b40678c63be6b85f1c2dc8c8b89d632faf988 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Mon, 2 Feb 2026 22:09:18 -0800
Subject: libperf build: Always place libperf includes first

When building tools/perf the CFLAGS can contain a directory for the
installed headers.

As the headers may be being installed while building libperf.a this can
cause headers to be partially installed and found in the include path
while building an object file for libperf.a.

The installed header may reference other installed headers that are
missing given the partial nature of the install and then the build fails
with a missing header file.

Avoid this by ensuring the libperf source headers are always first in
the CFLAGS.

Fixes: 3143504918105156 ("libperf: Make libperf.a part of the perf build")
Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/lib/perf/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/lib/perf/Makefile b/tools/lib/perf/Makefile
index 9692d0742ed0..32301a1d8f0c 100644
--- a/tools/lib/perf/Makefile
+++ b/tools/lib/perf/Makefile
@@ -50,9 +50,9 @@ INCLUDES = \
 -I$(srctree)/tools/include/uapi
 
 # Append required CFLAGS
+override CFLAGS := $(INCLUDES) $(CFLAGS)
 override CFLAGS += -g -Werror -Wall
 override CFLAGS += -fPIC
-override CFLAGS += $(INCLUDES)
 override CFLAGS += -fvisibility=hidden
 override CFLAGS += $(EXTRA_WARNINGS)
 override CFLAGS += $(EXTRA_CFLAGS)
-- 
cgit v1.2.3


From ceea279f93760767c0e654341829334a1c881a08 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 3 Feb 2026 10:26:36 -0800
Subject: perf kvm stat: Remove use of the arch directory

`perf kvm stat` supports record and report options.

By using the arch directory a report for a different machine type cannot
be supported.

Move the kvm-stat code out of the arch directory and into
util/kvm-stat-arch following the pattern of perf-regs and dwarf-regs.

Avoid duplicate symbols by renaming functions to have the architecture
name within them.

For global variables, wrap them in an architecture specific function.
Selecting the architecture to use with `perf kvm stat` is selected by
EM_HOST, ie no different than before the change.

Later the ELF machine can be determined from the session or a header
feature (ie EM_HOST at the time of the record).

The build and #define HAVE_KVM_STAT_SUPPORT is now redundant so remove
across Makefiles and in the build.

Opportunistically constify architectural structs and arrays.

Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Aditya Bodkhe <aditya.b1@linux.ibm.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Alexandre Ghiti <alex@ghiti.fr>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Andrew Jones <ajones@ventanamicro.com>
Cc: Anubhav Shelat <ashelat@redhat.com>
Cc: Anup Patel <anup@brainfault.org>
Cc: Athira Rajeev <atrajeev@linux.ibm.com>
Cc: Blake Jones <blakejones@google.com>
Cc: Chun-Tse Shao <ctshao@google.com>
Cc: Dapeng Mi <dapeng1.mi@linux.intel.com>
Cc: Dmitriy Vyukov <dvyukov@google.com>
Cc: Howard Chu <howardchu95@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Leo Yan <leo.yan@linux.dev>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <pjw@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Quan Zhou <zhouquan@iscas.ac.cn>
Cc: Shimin Guo <shimin.guo@skydio.com>
Cc: Swapnil Sapkal <swapnil.sapkal@amd.com>
Cc: Thomas Falcon <thomas.falcon@intel.com>
Cc: Will Deacon <will@kernel.org>
Cc: Yunseong Kim <ysk@kzalloc.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/Makefile.config                         |   4 -
 tools/perf/arch/arm64/Makefile                     |   1 -
 tools/perf/arch/arm64/util/Build                   |   1 -
 tools/perf/arch/arm64/util/arm64_exception_types.h |  97 --------
 tools/perf/arch/arm64/util/kvm-stat.c              |  84 -------
 tools/perf/arch/loongarch/Makefile                 |   1 -
 tools/perf/arch/loongarch/util/Build               |   1 -
 tools/perf/arch/loongarch/util/kvm-stat.c          | 139 -----------
 tools/perf/arch/powerpc/Makefile                   |   1 -
 tools/perf/arch/powerpc/util/Build                 |   1 -
 tools/perf/arch/powerpc/util/book3s_hcalls.h       | 124 ----------
 tools/perf/arch/powerpc/util/book3s_hv_exits.h     |  33 ---
 tools/perf/arch/powerpc/util/kvm-stat.c            | 219 ----------------
 tools/perf/arch/riscv/Makefile                     |   1 -
 tools/perf/arch/riscv/util/Build                   |   2 -
 tools/perf/arch/riscv/util/kvm-stat.c              |  78 ------
 tools/perf/arch/riscv/util/riscv_trap_types.h      |  57 -----
 tools/perf/arch/s390/Makefile                      |   1 -
 tools/perf/arch/s390/util/Build                    |   1 -
 tools/perf/arch/s390/util/kvm-stat.c               | 110 ---------
 tools/perf/arch/x86/Makefile                       |   1 -
 tools/perf/arch/x86/util/Build                     |   1 -
 tools/perf/arch/x86/util/kvm-stat.c                | 264 --------------------
 tools/perf/builtin-kvm.c                           |  37 +--
 tools/perf/util/Build                              |   3 +-
 tools/perf/util/kvm-stat-arch/Build                |   6 +
 .../util/kvm-stat-arch/arm64_exception_types.h     |  97 ++++++++
 tools/perf/util/kvm-stat-arch/book3s_hcalls.h      | 124 ++++++++++
 tools/perf/util/kvm-stat-arch/book3s_hv_exits.h    |  33 +++
 tools/perf/util/kvm-stat-arch/kvm-stat-arm64.c     |  95 +++++++
 tools/perf/util/kvm-stat-arch/kvm-stat-loongarch.c | 151 ++++++++++++
 tools/perf/util/kvm-stat-arch/kvm-stat-powerpc.c   | 228 +++++++++++++++++
 tools/perf/util/kvm-stat-arch/kvm-stat-riscv.c     |  90 +++++++
 tools/perf/util/kvm-stat-arch/kvm-stat-s390.c      | 120 +++++++++
 tools/perf/util/kvm-stat-arch/kvm-stat-x86.c       | 274 +++++++++++++++++++++
 tools/perf/util/kvm-stat-arch/riscv_trap_types.h   |  57 +++++
 tools/perf/util/kvm-stat.c                         | 209 +++++++++++++++-
 tools/perf/util/kvm-stat.h                         |  76 ++++--
 38 files changed, 1548 insertions(+), 1274 deletions(-)
 delete mode 100644 tools/perf/arch/arm64/util/arm64_exception_types.h
 delete mode 100644 tools/perf/arch/arm64/util/kvm-stat.c
 delete mode 100644 tools/perf/arch/loongarch/util/kvm-stat.c
 delete mode 100644 tools/perf/arch/powerpc/util/book3s_hcalls.h
 delete mode 100644 tools/perf/arch/powerpc/util/book3s_hv_exits.h
 delete mode 100644 tools/perf/arch/powerpc/util/kvm-stat.c
 delete mode 100644 tools/perf/arch/riscv/util/kvm-stat.c
 delete mode 100644 tools/perf/arch/riscv/util/riscv_trap_types.h
 delete mode 100644 tools/perf/arch/s390/util/kvm-stat.c
 delete mode 100644 tools/perf/arch/x86/util/kvm-stat.c
 create mode 100644 tools/perf/util/kvm-stat-arch/Build
 create mode 100644 tools/perf/util/kvm-stat-arch/arm64_exception_types.h
 create mode 100644 tools/perf/util/kvm-stat-arch/book3s_hcalls.h
 create mode 100644 tools/perf/util/kvm-stat-arch/book3s_hv_exits.h
 create mode 100644 tools/perf/util/kvm-stat-arch/kvm-stat-arm64.c
 create mode 100644 tools/perf/util/kvm-stat-arch/kvm-stat-loongarch.c
 create mode 100644 tools/perf/util/kvm-stat-arch/kvm-stat-powerpc.c
 create mode 100644 tools/perf/util/kvm-stat-arch/kvm-stat-riscv.c
 create mode 100644 tools/perf/util/kvm-stat-arch/kvm-stat-s390.c
 create mode 100644 tools/perf/util/kvm-stat-arch/kvm-stat-x86.c
 create mode 100644 tools/perf/util/kvm-stat-arch/riscv_trap_types.h

(limited to 'tools')

diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config
index e085d27f698a..b683aab3ab97 100644
--- a/tools/perf/Makefile.config
+++ b/tools/perf/Makefile.config
@@ -1033,10 +1033,6 @@ ifndef NO_LIBNUMA
   endif
 endif
 
-ifdef HAVE_KVM_STAT_SUPPORT
-    CFLAGS += -DHAVE_KVM_STAT_SUPPORT
-endif
-
 ifeq (${IS_64_BIT}, 1)
   ifndef NO_PERF_READ_VDSO32
     $(call feature_check,compile-32)
diff --git a/tools/perf/arch/arm64/Makefile b/tools/perf/arch/arm64/Makefile
index 087e099fb453..44cc3f023318 100644
--- a/tools/perf/arch/arm64/Makefile
+++ b/tools/perf/arch/arm64/Makefile
@@ -1,3 +1,2 @@
 # SPDX-License-Identifier: GPL-2.0
 PERF_HAVE_JITDUMP := 1
-HAVE_KVM_STAT_SUPPORT := 1
diff --git a/tools/perf/arch/arm64/util/Build b/tools/perf/arch/arm64/util/Build
index 0177af19cc00..d25edd9e1883 100644
--- a/tools/perf/arch/arm64/util/Build
+++ b/tools/perf/arch/arm64/util/Build
@@ -1,4 +1,3 @@
-perf-util-$(CONFIG_LIBTRACEEVENT) += kvm-stat.o
 perf-util-$(CONFIG_LOCAL_LIBUNWIND) += unwind-libunwind.o
 perf-util-y += ../../arm/util/auxtrace.o
 perf-util-y += ../../arm/util/cs-etm.o
diff --git a/tools/perf/arch/arm64/util/arm64_exception_types.h b/tools/perf/arch/arm64/util/arm64_exception_types.h
deleted file mode 100644
index bf827f19ace0..000000000000
--- a/tools/perf/arch/arm64/util/arm64_exception_types.h
+++ /dev/null
@@ -1,97 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#ifndef ARCH_PERF_ARM64_EXCEPTION_TYPES_H
-#define ARCH_PERF_ARM64_EXCEPTION_TYPES_H
-
-/* Per asm/virt.h */
-#define HVC_STUB_ERR		  0xbadca11
-
-/* Per asm/kvm_asm.h */
-#define ARM_EXCEPTION_IRQ		0
-#define ARM_EXCEPTION_EL1_SERROR	1
-#define ARM_EXCEPTION_TRAP		2
-#define ARM_EXCEPTION_IL		3
-/* The hyp-stub will return this for any kvm_call_hyp() call */
-#define ARM_EXCEPTION_HYP_GONE		HVC_STUB_ERR
-
-#define kvm_arm_exception_type					\
-	{ARM_EXCEPTION_IRQ,		"IRQ"		},	\
-	{ARM_EXCEPTION_EL1_SERROR,	"SERROR"	},	\
-	{ARM_EXCEPTION_TRAP,		"TRAP"		},	\
-	{ARM_EXCEPTION_IL,		"ILLEGAL"	},	\
-	{ARM_EXCEPTION_HYP_GONE,	"HYP_GONE"	}
-
-/* Per asm/esr.h */
-#define ESR_ELx_EC_UNKNOWN	(0x00)
-#define ESR_ELx_EC_WFx		(0x01)
-/* Unallocated EC: 0x02 */
-#define ESR_ELx_EC_CP15_32	(0x03)
-#define ESR_ELx_EC_CP15_64	(0x04)
-#define ESR_ELx_EC_CP14_MR	(0x05)
-#define ESR_ELx_EC_CP14_LS	(0x06)
-#define ESR_ELx_EC_FP_ASIMD	(0x07)
-#define ESR_ELx_EC_CP10_ID	(0x08)	/* EL2 only */
-#define ESR_ELx_EC_PAC		(0x09)	/* EL2 and above */
-#define ESR_ELx_EC_OTHER	(0x0A)
-/* Unallocated EC: 0x0B */
-#define ESR_ELx_EC_CP14_64	(0x0C)
-#define ESR_ELx_EC_BTI		(0x0D)
-#define ESR_ELx_EC_ILL		(0x0E)
-/* Unallocated EC: 0x0F - 0x10 */
-#define ESR_ELx_EC_SVC32	(0x11)
-#define ESR_ELx_EC_HVC32	(0x12)	/* EL2 only */
-#define ESR_ELx_EC_SMC32	(0x13)	/* EL2 and above */
-/* Unallocated EC: 0x14 */
-#define ESR_ELx_EC_SVC64	(0x15)
-#define ESR_ELx_EC_HVC64	(0x16)	/* EL2 and above */
-#define ESR_ELx_EC_SMC64	(0x17)	/* EL2 and above */
-#define ESR_ELx_EC_SYS64	(0x18)
-#define ESR_ELx_EC_SVE		(0x19)
-#define ESR_ELx_EC_ERET		(0x1a)	/* EL2 only */
-/* Unallocated EC: 0x1B */
-#define ESR_ELx_EC_FPAC		(0x1C)	/* EL1 and above */
-#define ESR_ELx_EC_SME		(0x1D)
-/* Unallocated EC: 0x1E */
-#define ESR_ELx_EC_IMP_DEF	(0x1f)	/* EL3 only */
-#define ESR_ELx_EC_IABT_LOW	(0x20)
-#define ESR_ELx_EC_IABT_CUR	(0x21)
-#define ESR_ELx_EC_PC_ALIGN	(0x22)
-/* Unallocated EC: 0x23 */
-#define ESR_ELx_EC_DABT_LOW	(0x24)
-#define ESR_ELx_EC_DABT_CUR	(0x25)
-#define ESR_ELx_EC_SP_ALIGN	(0x26)
-#define ESR_ELx_EC_MOPS		(0x27)
-#define ESR_ELx_EC_FP_EXC32	(0x28)
-/* Unallocated EC: 0x29 - 0x2B */
-#define ESR_ELx_EC_FP_EXC64	(0x2C)
-#define ESR_ELx_EC_GCS		(0x2D)
-/* Unallocated EC: 0x2E */
-#define ESR_ELx_EC_SERROR	(0x2F)
-#define ESR_ELx_EC_BREAKPT_LOW	(0x30)
-#define ESR_ELx_EC_BREAKPT_CUR	(0x31)
-#define ESR_ELx_EC_SOFTSTP_LOW	(0x32)
-#define ESR_ELx_EC_SOFTSTP_CUR	(0x33)
-#define ESR_ELx_EC_WATCHPT_LOW	(0x34)
-#define ESR_ELx_EC_WATCHPT_CUR	(0x35)
-/* Unallocated EC: 0x36 - 0x37 */
-#define ESR_ELx_EC_BKPT32	(0x38)
-/* Unallocated EC: 0x39 */
-#define ESR_ELx_EC_VECTOR32	(0x3A)	/* EL2 only */
-/* Unallocated EC: 0x3B */
-#define ESR_ELx_EC_BRK64	(0x3C)
-/* Unallocated EC: 0x3D - 0x3F */
-#define ESR_ELx_EC_MAX		(0x3F)
-
-#define ECN(x) { ESR_ELx_EC_##x, #x }
-
-#define kvm_arm_exception_class \
-	ECN(UNKNOWN), ECN(WFx), ECN(CP15_32), ECN(CP15_64), ECN(CP14_MR), \
-	ECN(CP14_LS), ECN(FP_ASIMD), ECN(CP10_ID), ECN(PAC), ECN(CP14_64), \
-	ECN(SVC64), ECN(HVC64), ECN(SMC64), ECN(SYS64), ECN(SVE), \
-	ECN(IMP_DEF), ECN(IABT_LOW), ECN(IABT_CUR), \
-	ECN(PC_ALIGN), ECN(DABT_LOW), ECN(DABT_CUR), \
-	ECN(SP_ALIGN), ECN(FP_EXC32), ECN(FP_EXC64), ECN(SERROR), \
-	ECN(BREAKPT_LOW), ECN(BREAKPT_CUR), ECN(SOFTSTP_LOW), \
-	ECN(SOFTSTP_CUR), ECN(WATCHPT_LOW), ECN(WATCHPT_CUR), \
-	ECN(BKPT32), ECN(VECTOR32), ECN(BRK64)
-
-#endif /* ARCH_PERF_ARM64_EXCEPTION_TYPES_H */
diff --git a/tools/perf/arch/arm64/util/kvm-stat.c b/tools/perf/arch/arm64/util/kvm-stat.c
deleted file mode 100644
index 6611aa21cba9..000000000000
--- a/tools/perf/arch/arm64/util/kvm-stat.c
+++ /dev/null
@@ -1,84 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <errno.h>
-#include <memory.h>
-#include "../../../util/evsel.h"
-#include "../../../util/kvm-stat.h"
-#include "arm64_exception_types.h"
-#include "debug.h"
-
-define_exit_reasons_table(arm64_exit_reasons, kvm_arm_exception_type);
-define_exit_reasons_table(arm64_trap_exit_reasons, kvm_arm_exception_class);
-
-const char *kvm_trap_exit_reason = "esr_ec";
-const char *vcpu_id_str = "id";
-const char *kvm_exit_reason = "ret";
-const char *kvm_entry_trace = "kvm:kvm_entry";
-const char *kvm_exit_trace = "kvm:kvm_exit";
-
-const char *kvm_events_tp[] = {
-	"kvm:kvm_entry",
-	"kvm:kvm_exit",
-	NULL,
-};
-
-static void event_get_key(struct evsel *evsel,
-			  struct perf_sample *sample,
-			  struct event_key *key)
-{
-	key->info = 0;
-	key->key = evsel__intval(evsel, sample, kvm_exit_reason);
-	key->exit_reasons = arm64_exit_reasons;
-
-	/*
-	 * TRAP exceptions carry exception class info in esr_ec field
-	 * and, hence, we need to use a different exit_reasons table to
-	 * properly decode event's est_ec.
-	 */
-	if (key->key == ARM_EXCEPTION_TRAP) {
-		key->key = evsel__intval(evsel, sample, kvm_trap_exit_reason);
-		key->exit_reasons = arm64_trap_exit_reasons;
-	}
-}
-
-static bool event_begin(struct evsel *evsel,
-			struct perf_sample *sample __maybe_unused,
-			struct event_key *key __maybe_unused)
-{
-	return evsel__name_is(evsel, kvm_entry_trace);
-}
-
-static bool event_end(struct evsel *evsel,
-		      struct perf_sample *sample,
-		      struct event_key *key)
-{
-	if (evsel__name_is(evsel, kvm_exit_trace)) {
-		event_get_key(evsel, sample, key);
-		return true;
-	}
-	return false;
-}
-
-static struct kvm_events_ops exit_events = {
-	.is_begin_event = event_begin,
-	.is_end_event	= event_end,
-	.decode_key	= exit_event_decode_key,
-	.name		= "VM-EXIT"
-};
-
-struct kvm_reg_events_ops kvm_reg_events_ops[] = {
-	{
-		.name	= "vmexit",
-		.ops	= &exit_events,
-	},
-	{ NULL, NULL },
-};
-
-const char * const kvm_skip_events[] = {
-	NULL,
-};
-
-int cpu_isa_init(struct perf_kvm_stat *kvm, const char *cpuid __maybe_unused)
-{
-	kvm->exit_reasons_isa = "arm64";
-	return 0;
-}
diff --git a/tools/perf/arch/loongarch/Makefile b/tools/perf/arch/loongarch/Makefile
index 087e099fb453..44cc3f023318 100644
--- a/tools/perf/arch/loongarch/Makefile
+++ b/tools/perf/arch/loongarch/Makefile
@@ -1,3 +1,2 @@
 # SPDX-License-Identifier: GPL-2.0
 PERF_HAVE_JITDUMP := 1
-HAVE_KVM_STAT_SUPPORT := 1
diff --git a/tools/perf/arch/loongarch/util/Build b/tools/perf/arch/loongarch/util/Build
index 0aa31986ecb5..1cb06a5f8935 100644
--- a/tools/perf/arch/loongarch/util/Build
+++ b/tools/perf/arch/loongarch/util/Build
@@ -3,4 +3,3 @@ perf-util-y += perf_regs.o
 
 perf-util-$(CONFIG_LOCAL_LIBUNWIND) += unwind-libunwind.o
 perf-util-$(CONFIG_LIBDW_DWARF_UNWIND) += unwind-libdw.o
-perf-util-$(CONFIG_LIBTRACEEVENT) += kvm-stat.o
diff --git a/tools/perf/arch/loongarch/util/kvm-stat.c b/tools/perf/arch/loongarch/util/kvm-stat.c
deleted file mode 100644
index a7859a3a9a51..000000000000
--- a/tools/perf/arch/loongarch/util/kvm-stat.c
+++ /dev/null
@@ -1,139 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <errno.h>
-#include <memory.h>
-#include "util/kvm-stat.h"
-#include "util/parse-events.h"
-#include "util/debug.h"
-#include "util/evsel.h"
-#include "util/evlist.h"
-#include "util/pmus.h"
-
-#define LOONGARCH_EXCEPTION_INT		0
-#define LOONGARCH_EXCEPTION_PIL		1
-#define LOONGARCH_EXCEPTION_PIS		2
-#define LOONGARCH_EXCEPTION_PIF		3
-#define LOONGARCH_EXCEPTION_PME		4
-#define LOONGARCH_EXCEPTION_FPD		15
-#define LOONGARCH_EXCEPTION_SXD		16
-#define LOONGARCH_EXCEPTION_ASXD	17
-#define LOONGARCH_EXCEPTION_GSPR	22
-#define  LOONGARCH_EXCEPTION_CPUCFG	100
-#define  LOONGARCH_EXCEPTION_CSR	101
-#define  LOONGARCH_EXCEPTION_IOCSR	102
-#define  LOONGARCH_EXCEPTION_IDLE	103
-#define  LOONGARCH_EXCEPTION_OTHERS	104
-#define LOONGARCH_EXCEPTION_HVC		23
-
-#define loongarch_exception_type				\
-	{LOONGARCH_EXCEPTION_INT,  "Interrupt" },		\
-	{LOONGARCH_EXCEPTION_PIL,  "Mem Read" },		\
-	{LOONGARCH_EXCEPTION_PIS,  "Mem Store" },		\
-	{LOONGARCH_EXCEPTION_PIF,  "Inst Fetch" },		\
-	{LOONGARCH_EXCEPTION_PME,  "Mem Modify" },		\
-	{LOONGARCH_EXCEPTION_FPD,  "FPU" },			\
-	{LOONGARCH_EXCEPTION_SXD,  "LSX" },			\
-	{LOONGARCH_EXCEPTION_ASXD, "LASX" },			\
-	{LOONGARCH_EXCEPTION_GSPR, "Privilege Error" },		\
-	{LOONGARCH_EXCEPTION_HVC,  "Hypercall" },		\
-	{LOONGARCH_EXCEPTION_CPUCFG, "CPUCFG" },		\
-	{LOONGARCH_EXCEPTION_CSR,    "CSR" },			\
-	{LOONGARCH_EXCEPTION_IOCSR,  "IOCSR" },			\
-	{LOONGARCH_EXCEPTION_IDLE,   "Idle" },			\
-	{LOONGARCH_EXCEPTION_OTHERS, "Others" }
-
-define_exit_reasons_table(loongarch_exit_reasons, loongarch_exception_type);
-
-const char *vcpu_id_str = "vcpu_id";
-const char *kvm_exit_reason = "reason";
-const char *kvm_entry_trace = "kvm:kvm_enter";
-const char *kvm_reenter_trace = "kvm:kvm_reenter";
-const char *kvm_exit_trace = "kvm:kvm_exit";
-const char *kvm_events_tp[] = {
-	"kvm:kvm_enter",
-	"kvm:kvm_reenter",
-	"kvm:kvm_exit",
-	"kvm:kvm_exit_gspr",
-	NULL,
-};
-
-static bool event_begin(struct evsel *evsel,
-			struct perf_sample *sample, struct event_key *key)
-{
-	return exit_event_begin(evsel, sample, key);
-}
-
-static bool event_end(struct evsel *evsel,
-		      struct perf_sample *sample __maybe_unused,
-		      struct event_key *key __maybe_unused)
-{
-	/*
-	 * LoongArch kvm is different with other architectures
-	 *
-	 * There is kvm:kvm_reenter or kvm:kvm_enter event adjacent with
-	 * kvm:kvm_exit event.
-	 *   kvm:kvm_enter   means returning to vmm and then to guest
-	 *   kvm:kvm_reenter means returning to guest immediately
-	 */
-	return evsel__name_is(evsel, kvm_entry_trace) || evsel__name_is(evsel, kvm_reenter_trace);
-}
-
-static void event_gspr_get_key(struct evsel *evsel,
-			       struct perf_sample *sample, struct event_key *key)
-{
-	unsigned int insn;
-
-	key->key = LOONGARCH_EXCEPTION_OTHERS;
-	insn = evsel__intval(evsel, sample, "inst_word");
-
-	switch (insn >> 24) {
-	case 0:
-		/* CPUCFG inst trap */
-		if ((insn >> 10) == 0x1b)
-			key->key = LOONGARCH_EXCEPTION_CPUCFG;
-		break;
-	case 4:
-		/* CSR inst trap */
-		key->key = LOONGARCH_EXCEPTION_CSR;
-		break;
-	case 6:
-		/* IOCSR inst trap */
-		if ((insn >> 15) == 0xc90)
-			key->key = LOONGARCH_EXCEPTION_IOCSR;
-		else if ((insn >> 15) == 0xc91)
-			/* Idle inst trap */
-			key->key = LOONGARCH_EXCEPTION_IDLE;
-		break;
-	default:
-		key->key = LOONGARCH_EXCEPTION_OTHERS;
-		break;
-	}
-}
-
-static struct child_event_ops child_events[] = {
-	{ .name = "kvm:kvm_exit_gspr", .get_key = event_gspr_get_key },
-	{ NULL, NULL },
-};
-
-static struct kvm_events_ops exit_events = {
-	.is_begin_event = event_begin,
-	.is_end_event = event_end,
-	.child_ops = child_events,
-	.decode_key = exit_event_decode_key,
-	.name = "VM-EXIT"
-};
-
-struct kvm_reg_events_ops kvm_reg_events_ops[] = {
-	{ .name	= "vmexit", .ops = &exit_events, },
-	{ NULL, NULL },
-};
-
-const char * const kvm_skip_events[] = {
-	NULL,
-};
-
-int cpu_isa_init(struct perf_kvm_stat *kvm, const char *cpuid __maybe_unused)
-{
-	kvm->exit_reasons_isa = "loongarch64";
-	kvm->exit_reasons = loongarch_exit_reasons;
-	return 0;
-}
diff --git a/tools/perf/arch/powerpc/Makefile b/tools/perf/arch/powerpc/Makefile
index a295a80ea078..44cc3f023318 100644
--- a/tools/perf/arch/powerpc/Makefile
+++ b/tools/perf/arch/powerpc/Makefile
@@ -1,3 +1,2 @@
 # SPDX-License-Identifier: GPL-2.0
-HAVE_KVM_STAT_SUPPORT := 1
 PERF_HAVE_JITDUMP := 1
diff --git a/tools/perf/arch/powerpc/util/Build b/tools/perf/arch/powerpc/util/Build
index 5fd28ec713a4..e091b6785674 100644
--- a/tools/perf/arch/powerpc/util/Build
+++ b/tools/perf/arch/powerpc/util/Build
@@ -1,5 +1,4 @@
 perf-util-y += header.o
-perf-util-$(CONFIG_LIBTRACEEVENT) += kvm-stat.o
 perf-util-y += perf_regs.o
 perf-util-y += mem-events.o
 perf-util-y += pmu.o
diff --git a/tools/perf/arch/powerpc/util/book3s_hcalls.h b/tools/perf/arch/powerpc/util/book3s_hcalls.h
deleted file mode 100644
index 488f4339b83c..000000000000
--- a/tools/perf/arch/powerpc/util/book3s_hcalls.h
+++ /dev/null
@@ -1,124 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef ARCH_PERF_BOOK3S_HV_HCALLS_H
-#define ARCH_PERF_BOOK3S_HV_HCALLS_H
-
-/*
- * PowerPC HCALL codes : hcall code to name mapping
- */
-#define kvm_trace_symbol_hcall \
-	{0x4, "H_REMOVE"},					\
-	{0x8, "H_ENTER"},					\
-	{0xc, "H_READ"},					\
-	{0x10, "H_CLEAR_MOD"},					\
-	{0x14, "H_CLEAR_REF"},					\
-	{0x18, "H_PROTECT"},					\
-	{0x1c, "H_GET_TCE"},					\
-	{0x20, "H_PUT_TCE"},					\
-	{0x24, "H_SET_SPRG0"},					\
-	{0x28, "H_SET_DABR"},					\
-	{0x2c, "H_PAGE_INIT"},					\
-	{0x30, "H_SET_ASR"},					\
-	{0x34, "H_ASR_ON"},					\
-	{0x38, "H_ASR_OFF"},					\
-	{0x3c, "H_LOGICAL_CI_LOAD"},				\
-	{0x40, "H_LOGICAL_CI_STORE"},				\
-	{0x44, "H_LOGICAL_CACHE_LOAD"},				\
-	{0x48, "H_LOGICAL_CACHE_STORE"},			\
-	{0x4c, "H_LOGICAL_ICBI"},				\
-	{0x50, "H_LOGICAL_DCBF"},				\
-	{0x54, "H_GET_TERM_CHAR"},				\
-	{0x58, "H_PUT_TERM_CHAR"},				\
-	{0x5c, "H_REAL_TO_LOGICAL"},				\
-	{0x60, "H_HYPERVISOR_DATA"},				\
-	{0x64, "H_EOI"},					\
-	{0x68, "H_CPPR"},					\
-	{0x6c, "H_IPI"},					\
-	{0x70, "H_IPOLL"},					\
-	{0x74, "H_XIRR"},					\
-	{0x78, "H_MIGRATE_DMA"},				\
-	{0x7c, "H_PERFMON"},					\
-	{0xdc, "H_REGISTER_VPA"},				\
-	{0xe0, "H_CEDE"},					\
-	{0xe4, "H_CONFER"},					\
-	{0xe8, "H_PROD"},					\
-	{0xec, "H_GET_PPP"},					\
-	{0xf0, "H_SET_PPP"},					\
-	{0xf4, "H_PURR"},					\
-	{0xf8, "H_PIC"},					\
-	{0xfc, "H_REG_CRQ"},					\
-	{0x100, "H_FREE_CRQ"},					\
-	{0x104, "H_VIO_SIGNAL"},				\
-	{0x108, "H_SEND_CRQ"},					\
-	{0x110, "H_COPY_RDMA"},					\
-	{0x114, "H_REGISTER_LOGICAL_LAN"},			\
-	{0x118, "H_FREE_LOGICAL_LAN"},				\
-	{0x11c, "H_ADD_LOGICAL_LAN_BUFFER"},			\
-	{0x120, "H_SEND_LOGICAL_LAN"},				\
-	{0x124, "H_BULK_REMOVE"},				\
-	{0x130, "H_MULTICAST_CTRL"},				\
-	{0x134, "H_SET_XDABR"},					\
-	{0x138, "H_STUFF_TCE"},					\
-	{0x13c, "H_PUT_TCE_INDIRECT"},				\
-	{0x14c, "H_CHANGE_LOGICAL_LAN_MAC"},			\
-	{0x150, "H_VTERM_PARTNER_INFO"},			\
-	{0x154, "H_REGISTER_VTERM"},				\
-	{0x158, "H_FREE_VTERM"},				\
-	{0x15c, "H_RESET_EVENTS"},				\
-	{0x160, "H_ALLOC_RESOURCE"},				\
-	{0x164, "H_FREE_RESOURCE"},				\
-	{0x168, "H_MODIFY_QP"},					\
-	{0x16c, "H_QUERY_QP"},					\
-	{0x170, "H_REREGISTER_PMR"},				\
-	{0x174, "H_REGISTER_SMR"},				\
-	{0x178, "H_QUERY_MR"},					\
-	{0x17c, "H_QUERY_MW"},					\
-	{0x180, "H_QUERY_HCA"},					\
-	{0x184, "H_QUERY_PORT"},				\
-	{0x188, "H_MODIFY_PORT"},				\
-	{0x18c, "H_DEFINE_AQP1"},				\
-	{0x190, "H_GET_TRACE_BUFFER"},				\
-	{0x194, "H_DEFINE_AQP0"},				\
-	{0x198, "H_RESIZE_MR"},					\
-	{0x19c, "H_ATTACH_MCQP"},				\
-	{0x1a0, "H_DETACH_MCQP"},				\
-	{0x1a4, "H_CREATE_RPT"},				\
-	{0x1a8, "H_REMOVE_RPT"},				\
-	{0x1ac, "H_REGISTER_RPAGES"},				\
-	{0x1b0, "H_DISABLE_AND_GET"},				\
-	{0x1b4, "H_ERROR_DATA"},				\
-	{0x1b8, "H_GET_HCA_INFO"},				\
-	{0x1bc, "H_GET_PERF_COUNT"},				\
-	{0x1c0, "H_MANAGE_TRACE"},				\
-	{0x1d4, "H_FREE_LOGICAL_LAN_BUFFER"},			\
-	{0x1d8, "H_POLL_PENDING"},				\
-	{0x1e4, "H_QUERY_INT_STATE"},				\
-	{0x244, "H_ILLAN_ATTRIBUTES"},				\
-	{0x250, "H_MODIFY_HEA_QP"},				\
-	{0x254, "H_QUERY_HEA_QP"},				\
-	{0x258, "H_QUERY_HEA"},					\
-	{0x25c, "H_QUERY_HEA_PORT"},				\
-	{0x260, "H_MODIFY_HEA_PORT"},				\
-	{0x264, "H_REG_BCMC"},					\
-	{0x268, "H_DEREG_BCMC"},				\
-	{0x26c, "H_REGISTER_HEA_RPAGES"},			\
-	{0x270, "H_DISABLE_AND_GET_HEA"},			\
-	{0x274, "H_GET_HEA_INFO"},				\
-	{0x278, "H_ALLOC_HEA_RESOURCE"},			\
-	{0x284, "H_ADD_CONN"},					\
-	{0x288, "H_DEL_CONN"},					\
-	{0x298, "H_JOIN"},					\
-	{0x2a4, "H_VASI_STATE"},				\
-	{0x2b0, "H_ENABLE_CRQ"},				\
-	{0x2b8, "H_GET_EM_PARMS"},				\
-	{0x2d0, "H_SET_MPP"},					\
-	{0x2d4, "H_GET_MPP"},					\
-	{0x2ec, "H_HOME_NODE_ASSOCIATIVITY"},			\
-	{0x2f4, "H_BEST_ENERGY"},				\
-	{0x2fc, "H_XIRR_X"},					\
-	{0x300, "H_RANDOM"},					\
-	{0x304, "H_COP"},					\
-	{0x314, "H_GET_MPP_X"},					\
-	{0x31c, "H_SET_MODE"},					\
-	{0xf000, "H_RTAS"}					\
-
-#endif
diff --git a/tools/perf/arch/powerpc/util/book3s_hv_exits.h b/tools/perf/arch/powerpc/util/book3s_hv_exits.h
deleted file mode 100644
index 2011376c7ab5..000000000000
--- a/tools/perf/arch/powerpc/util/book3s_hv_exits.h
+++ /dev/null
@@ -1,33 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef ARCH_PERF_BOOK3S_HV_EXITS_H
-#define ARCH_PERF_BOOK3S_HV_EXITS_H
-
-/*
- * PowerPC Interrupt vectors : exit code to name mapping
- */
-
-#define kvm_trace_symbol_exit \
-	{0x0,	"RETURN_TO_HOST"}, \
-	{0x100, "SYSTEM_RESET"}, \
-	{0x200, "MACHINE_CHECK"}, \
-	{0x300, "DATA_STORAGE"}, \
-	{0x380, "DATA_SEGMENT"}, \
-	{0x400, "INST_STORAGE"}, \
-	{0x480, "INST_SEGMENT"}, \
-	{0x500, "EXTERNAL"}, \
-	{0x502, "EXTERNAL_HV"}, \
-	{0x600, "ALIGNMENT"}, \
-	{0x700, "PROGRAM"}, \
-	{0x800, "FP_UNAVAIL"}, \
-	{0x900, "DECREMENTER"}, \
-	{0x980, "HV_DECREMENTER"}, \
-	{0xc00, "SYSCALL"}, \
-	{0xd00, "TRACE"}, \
-	{0xe00, "H_DATA_STORAGE"}, \
-	{0xe20, "H_INST_STORAGE"}, \
-	{0xe40, "H_EMUL_ASSIST"}, \
-	{0xf00, "PERFMON"}, \
-	{0xf20, "ALTIVEC"}, \
-	{0xf40, "VSX"}
-
-#endif
diff --git a/tools/perf/arch/powerpc/util/kvm-stat.c b/tools/perf/arch/powerpc/util/kvm-stat.c
deleted file mode 100644
index c8357b571ccf..000000000000
--- a/tools/perf/arch/powerpc/util/kvm-stat.c
+++ /dev/null
@@ -1,219 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <errno.h>
-#include "util/kvm-stat.h"
-#include "util/parse-events.h"
-#include "util/debug.h"
-#include "util/evsel.h"
-#include "util/evlist.h"
-#include "util/pmus.h"
-
-#include "book3s_hv_exits.h"
-#include "book3s_hcalls.h"
-#include <subcmd/parse-options.h>
-
-#define NR_TPS 4
-
-const char *vcpu_id_str = "vcpu_id";
-const char *kvm_entry_trace = "kvm_hv:kvm_guest_enter";
-const char *kvm_exit_trace = "kvm_hv:kvm_guest_exit";
-
-define_exit_reasons_table(hv_exit_reasons, kvm_trace_symbol_exit);
-define_exit_reasons_table(hcall_reasons, kvm_trace_symbol_hcall);
-
-/* Tracepoints specific to ppc_book3s_hv */
-const char *ppc_book3s_hv_kvm_tp[] = {
-	"kvm_hv:kvm_guest_enter",
-	"kvm_hv:kvm_guest_exit",
-	"kvm_hv:kvm_hcall_enter",
-	"kvm_hv:kvm_hcall_exit",
-	NULL,
-};
-
-/* 1 extra placeholder for NULL */
-const char *kvm_events_tp[NR_TPS + 1];
-const char *kvm_exit_reason;
-
-static void hcall_event_get_key(struct evsel *evsel,
-				struct perf_sample *sample,
-				struct event_key *key)
-{
-	key->info = 0;
-	key->key = evsel__intval(evsel, sample, "req");
-}
-
-static const char *get_hcall_exit_reason(u64 exit_code)
-{
-	struct exit_reasons_table *tbl = hcall_reasons;
-
-	while (tbl->reason != NULL) {
-		if (tbl->exit_code == exit_code)
-			return tbl->reason;
-		tbl++;
-	}
-
-	pr_debug("Unknown hcall code: %lld\n",
-	       (unsigned long long)exit_code);
-	return "UNKNOWN";
-}
-
-static bool hcall_event_end(struct evsel *evsel,
-			    struct perf_sample *sample __maybe_unused,
-			    struct event_key *key __maybe_unused)
-{
-	return (evsel__name_is(evsel, kvm_events_tp[3]));
-}
-
-static bool hcall_event_begin(struct evsel *evsel,
-			      struct perf_sample *sample, struct event_key *key)
-{
-	if (evsel__name_is(evsel, kvm_events_tp[2])) {
-		hcall_event_get_key(evsel, sample, key);
-		return true;
-	}
-
-	return false;
-}
-static void hcall_event_decode_key(struct perf_kvm_stat *kvm __maybe_unused,
-				   struct event_key *key,
-				   char *decode)
-{
-	const char *hcall_reason = get_hcall_exit_reason(key->key);
-
-	scnprintf(decode, KVM_EVENT_NAME_LEN, "%s", hcall_reason);
-}
-
-static struct kvm_events_ops hcall_events = {
-	.is_begin_event = hcall_event_begin,
-	.is_end_event = hcall_event_end,
-	.decode_key = hcall_event_decode_key,
-	.name = "HCALL-EVENT",
-};
-
-static struct kvm_events_ops exit_events = {
-	.is_begin_event = exit_event_begin,
-	.is_end_event = exit_event_end,
-	.decode_key = exit_event_decode_key,
-	.name = "VM-EXIT"
-};
-
-struct kvm_reg_events_ops kvm_reg_events_ops[] = {
-	{ .name = "vmexit", .ops = &exit_events },
-	{ .name = "hcall", .ops = &hcall_events },
-	{ NULL, NULL },
-};
-
-const char * const kvm_skip_events[] = {
-	NULL,
-};
-
-
-static int is_tracepoint_available(const char *str, struct evlist *evlist)
-{
-	struct parse_events_error err;
-	int ret;
-
-	parse_events_error__init(&err);
-	ret = parse_events(evlist, str, &err);
-	if (ret)
-		parse_events_error__print(&err, "tracepoint");
-	parse_events_error__exit(&err);
-	return ret;
-}
-
-static int ppc__setup_book3s_hv(struct perf_kvm_stat *kvm,
-				struct evlist *evlist)
-{
-	const char **events_ptr;
-	int i, nr_tp = 0, err = -1;
-
-	/* Check for book3s_hv tracepoints */
-	for (events_ptr = ppc_book3s_hv_kvm_tp; *events_ptr; events_ptr++) {
-		err = is_tracepoint_available(*events_ptr, evlist);
-		if (err)
-			return -1;
-		nr_tp++;
-	}
-
-	for (i = 0; i < nr_tp; i++)
-		kvm_events_tp[i] = ppc_book3s_hv_kvm_tp[i];
-
-	kvm_events_tp[i] = NULL;
-	kvm_exit_reason = "trap";
-	kvm->exit_reasons = hv_exit_reasons;
-	kvm->exit_reasons_isa = "HV";
-
-	return 0;
-}
-
-/* Wrapper to setup kvm tracepoints */
-static int ppc__setup_kvm_tp(struct perf_kvm_stat *kvm)
-{
-	struct evlist *evlist = evlist__new();
-
-	if (evlist == NULL)
-		return -ENOMEM;
-
-	/* Right now, only supported on book3s_hv */
-	return ppc__setup_book3s_hv(kvm, evlist);
-}
-
-int setup_kvm_events_tp(struct perf_kvm_stat *kvm)
-{
-	return ppc__setup_kvm_tp(kvm);
-}
-
-int cpu_isa_init(struct perf_kvm_stat *kvm, const char *cpuid __maybe_unused)
-{
-	int ret;
-
-	ret = ppc__setup_kvm_tp(kvm);
-	if (ret) {
-		kvm->exit_reasons = NULL;
-		kvm->exit_reasons_isa = NULL;
-	}
-
-	return ret;
-}
-
-/*
- * In case of powerpc architecture, pmu registers are programmable
- * by guest kernel. So monitoring guest via host may not provide
- * valid samples with default 'cycles' event. It is better to use
- * 'trace_imc/trace_cycles' event for guest profiling, since it
- * can track the guest instruction pointer in the trace-record.
- *
- * Function to parse the arguments and return appropriate values.
- */
-int kvm_add_default_arch_event(int *argc, const char **argv)
-{
-	const char **tmp;
-	bool event = false;
-	int i, j = *argc;
-
-	const struct option event_options[] = {
-		OPT_BOOLEAN('e', "event", &event, NULL),
-		OPT_END()
-	};
-
-	tmp = calloc(j + 1, sizeof(char *));
-	if (!tmp)
-		return -EINVAL;
-
-	for (i = 0; i < j; i++)
-		tmp[i] = argv[i];
-
-	parse_options(j, tmp, event_options, NULL, PARSE_OPT_KEEP_UNKNOWN);
-	if (!event) {
-		if (perf_pmus__have_event("trace_imc", "trace_cycles")) {
-			argv[j++] = strdup("-e");
-			argv[j++] = strdup("trace_imc/trace_cycles/");
-			*argc += 2;
-		} else {
-			free(tmp);
-			return -EINVAL;
-		}
-	}
-
-	free(tmp);
-	return 0;
-}
diff --git a/tools/perf/arch/riscv/Makefile b/tools/perf/arch/riscv/Makefile
index 087e099fb453..44cc3f023318 100644
--- a/tools/perf/arch/riscv/Makefile
+++ b/tools/perf/arch/riscv/Makefile
@@ -1,3 +1,2 @@
 # SPDX-License-Identifier: GPL-2.0
 PERF_HAVE_JITDUMP := 1
-HAVE_KVM_STAT_SUPPORT := 1
diff --git a/tools/perf/arch/riscv/util/Build b/tools/perf/arch/riscv/util/Build
index 628b9ebd418b..c01231bcf9c3 100644
--- a/tools/perf/arch/riscv/util/Build
+++ b/tools/perf/arch/riscv/util/Build
@@ -1,4 +1,2 @@
 perf-util-y += perf_regs.o
 perf-util-y += header.o
-
-perf-util-$(CONFIG_LIBTRACEEVENT) += kvm-stat.o
diff --git a/tools/perf/arch/riscv/util/kvm-stat.c b/tools/perf/arch/riscv/util/kvm-stat.c
deleted file mode 100644
index 3ea7acb5e159..000000000000
--- a/tools/perf/arch/riscv/util/kvm-stat.c
+++ /dev/null
@@ -1,78 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Arch specific functions for perf kvm stat.
- *
- * Copyright 2024 Beijing ESWIN Computing Technology Co., Ltd.
- *
- */
-#include <errno.h>
-#include <memory.h>
-#include "../../../util/evsel.h"
-#include "../../../util/kvm-stat.h"
-#include "riscv_trap_types.h"
-#include "debug.h"
-
-define_exit_reasons_table(riscv_exit_reasons, kvm_riscv_trap_class);
-
-const char *vcpu_id_str = "id";
-const char *kvm_exit_reason = "scause";
-const char *kvm_entry_trace = "kvm:kvm_entry";
-const char *kvm_exit_trace = "kvm:kvm_exit";
-
-const char *kvm_events_tp[] = {
-	"kvm:kvm_entry",
-	"kvm:kvm_exit",
-	NULL,
-};
-
-static void event_get_key(struct evsel *evsel,
-			  struct perf_sample *sample,
-			  struct event_key *key)
-{
-	key->info = 0;
-	key->key = evsel__intval(evsel, sample, kvm_exit_reason) & ~CAUSE_IRQ_FLAG;
-	key->exit_reasons = riscv_exit_reasons;
-}
-
-static bool event_begin(struct evsel *evsel,
-			struct perf_sample *sample __maybe_unused,
-			struct event_key *key __maybe_unused)
-{
-	return evsel__name_is(evsel, kvm_entry_trace);
-}
-
-static bool event_end(struct evsel *evsel,
-		      struct perf_sample *sample,
-		      struct event_key *key)
-{
-	if (evsel__name_is(evsel, kvm_exit_trace)) {
-		event_get_key(evsel, sample, key);
-		return true;
-	}
-	return false;
-}
-
-static struct kvm_events_ops exit_events = {
-	.is_begin_event = event_begin,
-	.is_end_event	= event_end,
-	.decode_key	= exit_event_decode_key,
-	.name		= "VM-EXIT"
-};
-
-struct kvm_reg_events_ops kvm_reg_events_ops[] = {
-	{
-		.name	= "vmexit",
-		.ops	= &exit_events,
-	},
-	{ NULL, NULL },
-};
-
-const char * const kvm_skip_events[] = {
-	NULL,
-};
-
-int cpu_isa_init(struct perf_kvm_stat *kvm, const char *cpuid __maybe_unused)
-{
-	kvm->exit_reasons_isa = "riscv64";
-	return 0;
-}
diff --git a/tools/perf/arch/riscv/util/riscv_trap_types.h b/tools/perf/arch/riscv/util/riscv_trap_types.h
deleted file mode 100644
index 6cc71eb01fca..000000000000
--- a/tools/perf/arch/riscv/util/riscv_trap_types.h
+++ /dev/null
@@ -1,57 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#ifndef ARCH_PERF_RISCV_TRAP_TYPES_H
-#define ARCH_PERF_RISCV_TRAP_TYPES_H
-
-/* Exception cause high bit - is an interrupt if set */
-#define CAUSE_IRQ_FLAG		(_AC(1, UL) << (__riscv_xlen - 1))
-
-/* Interrupt causes (minus the high bit) */
-#define IRQ_S_SOFT 1
-#define IRQ_VS_SOFT 2
-#define IRQ_M_SOFT 3
-#define IRQ_S_TIMER 5
-#define IRQ_VS_TIMER 6
-#define IRQ_M_TIMER 7
-#define IRQ_S_EXT 9
-#define IRQ_VS_EXT 10
-#define IRQ_M_EXT 11
-#define IRQ_S_GEXT 12
-#define IRQ_PMU_OVF 13
-
-/* Exception causes */
-#define EXC_INST_MISALIGNED 0
-#define EXC_INST_ACCESS 1
-#define EXC_INST_ILLEGAL 2
-#define EXC_BREAKPOINT 3
-#define EXC_LOAD_MISALIGNED 4
-#define EXC_LOAD_ACCESS 5
-#define EXC_STORE_MISALIGNED 6
-#define EXC_STORE_ACCESS 7
-#define EXC_SYSCALL 8
-#define EXC_HYPERVISOR_SYSCALL 9
-#define EXC_SUPERVISOR_SYSCALL 10
-#define EXC_INST_PAGE_FAULT 12
-#define EXC_LOAD_PAGE_FAULT 13
-#define EXC_STORE_PAGE_FAULT 15
-#define EXC_INST_GUEST_PAGE_FAULT 20
-#define EXC_LOAD_GUEST_PAGE_FAULT 21
-#define EXC_VIRTUAL_INST_FAULT 22
-#define EXC_STORE_GUEST_PAGE_FAULT 23
-
-#define TRAP(x) { x, #x }
-
-#define kvm_riscv_trap_class \
-	TRAP(IRQ_S_SOFT), TRAP(IRQ_VS_SOFT), TRAP(IRQ_M_SOFT), \
-	TRAP(IRQ_S_TIMER), TRAP(IRQ_VS_TIMER), TRAP(IRQ_M_TIMER), \
-	TRAP(IRQ_S_EXT), TRAP(IRQ_VS_EXT), TRAP(IRQ_M_EXT), \
-	TRAP(IRQ_S_GEXT), TRAP(IRQ_PMU_OVF), \
-	TRAP(EXC_INST_MISALIGNED), TRAP(EXC_INST_ACCESS), TRAP(EXC_INST_ILLEGAL), \
-	TRAP(EXC_BREAKPOINT), TRAP(EXC_LOAD_MISALIGNED), TRAP(EXC_LOAD_ACCESS), \
-	TRAP(EXC_STORE_MISALIGNED), TRAP(EXC_STORE_ACCESS), TRAP(EXC_SYSCALL), \
-	TRAP(EXC_HYPERVISOR_SYSCALL), TRAP(EXC_SUPERVISOR_SYSCALL), \
-	TRAP(EXC_INST_PAGE_FAULT), TRAP(EXC_LOAD_PAGE_FAULT), \
-	TRAP(EXC_STORE_PAGE_FAULT), TRAP(EXC_INST_GUEST_PAGE_FAULT), \
-	TRAP(EXC_LOAD_GUEST_PAGE_FAULT), TRAP(EXC_VIRTUAL_INST_FAULT), \
-	TRAP(EXC_STORE_GUEST_PAGE_FAULT)
-
-#endif /* ARCH_PERF_RISCV_TRAP_TYPES_H */
diff --git a/tools/perf/arch/s390/Makefile b/tools/perf/arch/s390/Makefile
index 0033698a65ce..8b59ce8efb89 100644
--- a/tools/perf/arch/s390/Makefile
+++ b/tools/perf/arch/s390/Makefile
@@ -1,3 +1,2 @@
 # SPDX-License-Identifier: GPL-2.0-only
-HAVE_KVM_STAT_SUPPORT := 1
 PERF_HAVE_JITDUMP := 1
diff --git a/tools/perf/arch/s390/util/Build b/tools/perf/arch/s390/util/Build
index 5391d26fedd4..87229f2c4397 100644
--- a/tools/perf/arch/s390/util/Build
+++ b/tools/perf/arch/s390/util/Build
@@ -1,5 +1,4 @@
 perf-util-y += header.o
-perf-util-$(CONFIG_LIBTRACEEVENT) += kvm-stat.o
 perf-util-y += perf_regs.o
 
 perf-util-y += machine.o
diff --git a/tools/perf/arch/s390/util/kvm-stat.c b/tools/perf/arch/s390/util/kvm-stat.c
deleted file mode 100644
index 0aed92df51ba..000000000000
--- a/tools/perf/arch/s390/util/kvm-stat.c
+++ /dev/null
@@ -1,110 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Arch specific functions for perf kvm stat.
- *
- * Copyright 2014 IBM Corp.
- * Author(s): Alexander Yarygin <yarygin@linux.vnet.ibm.com>
- */
-
-#include <errno.h>
-#include <string.h>
-#include "../../util/kvm-stat.h"
-#include "../../util/evsel.h"
-#include <asm/sie.h>
-
-define_exit_reasons_table(sie_exit_reasons, sie_intercept_code);
-define_exit_reasons_table(sie_icpt_insn_codes, icpt_insn_codes);
-define_exit_reasons_table(sie_sigp_order_codes, sigp_order_codes);
-define_exit_reasons_table(sie_diagnose_codes, diagnose_codes);
-define_exit_reasons_table(sie_icpt_prog_codes, icpt_prog_codes);
-
-const char *vcpu_id_str = "id";
-const char *kvm_exit_reason = "icptcode";
-const char *kvm_entry_trace = "kvm:kvm_s390_sie_enter";
-const char *kvm_exit_trace = "kvm:kvm_s390_sie_exit";
-
-static void event_icpt_insn_get_key(struct evsel *evsel,
-				    struct perf_sample *sample,
-				    struct event_key *key)
-{
-	unsigned long insn;
-
-	insn = evsel__intval(evsel, sample, "instruction");
-	key->key = icpt_insn_decoder(insn);
-	key->exit_reasons = sie_icpt_insn_codes;
-}
-
-static void event_sigp_get_key(struct evsel *evsel,
-			       struct perf_sample *sample,
-			       struct event_key *key)
-{
-	key->key = evsel__intval(evsel, sample, "order_code");
-	key->exit_reasons = sie_sigp_order_codes;
-}
-
-static void event_diag_get_key(struct evsel *evsel,
-			       struct perf_sample *sample,
-			       struct event_key *key)
-{
-	key->key = evsel__intval(evsel, sample, "code");
-	key->exit_reasons = sie_diagnose_codes;
-}
-
-static void event_icpt_prog_get_key(struct evsel *evsel,
-				    struct perf_sample *sample,
-				    struct event_key *key)
-{
-	key->key = evsel__intval(evsel, sample, "code");
-	key->exit_reasons = sie_icpt_prog_codes;
-}
-
-static struct child_event_ops child_events[] = {
-	{ .name = "kvm:kvm_s390_intercept_instruction",
-	  .get_key = event_icpt_insn_get_key },
-	{ .name = "kvm:kvm_s390_handle_sigp",
-	  .get_key = event_sigp_get_key },
-	{ .name = "kvm:kvm_s390_handle_diag",
-	  .get_key = event_diag_get_key },
-	{ .name = "kvm:kvm_s390_intercept_prog",
-	  .get_key = event_icpt_prog_get_key },
-	{ NULL, NULL },
-};
-
-static struct kvm_events_ops exit_events = {
-	.is_begin_event = exit_event_begin,
-	.is_end_event = exit_event_end,
-	.child_ops = child_events,
-	.decode_key = exit_event_decode_key,
-	.name = "VM-EXIT"
-};
-
-const char *kvm_events_tp[] = {
-	"kvm:kvm_s390_sie_enter",
-	"kvm:kvm_s390_sie_exit",
-	"kvm:kvm_s390_intercept_instruction",
-	"kvm:kvm_s390_handle_sigp",
-	"kvm:kvm_s390_handle_diag",
-	"kvm:kvm_s390_intercept_prog",
-	NULL,
-};
-
-struct kvm_reg_events_ops kvm_reg_events_ops[] = {
-	{ .name = "vmexit", .ops = &exit_events },
-	{ NULL, NULL },
-};
-
-const char * const kvm_skip_events[] = {
-	"Wait state",
-	NULL,
-};
-
-int cpu_isa_init(struct perf_kvm_stat *kvm, const char *cpuid)
-{
-	if (strstr(cpuid, "IBM")) {
-		kvm->exit_reasons = sie_exit_reasons;
-		kvm->exit_reasons_isa = "SIE";
-	} else
-		return -ENOTSUP;
-
-	return 0;
-}
diff --git a/tools/perf/arch/x86/Makefile b/tools/perf/arch/x86/Makefile
index a295a80ea078..44cc3f023318 100644
--- a/tools/perf/arch/x86/Makefile
+++ b/tools/perf/arch/x86/Makefile
@@ -1,3 +1,2 @@
 # SPDX-License-Identifier: GPL-2.0
-HAVE_KVM_STAT_SUPPORT := 1
 PERF_HAVE_JITDUMP := 1
diff --git a/tools/perf/arch/x86/util/Build b/tools/perf/arch/x86/util/Build
index 76127eefde8b..0c4cf1dd07bf 100644
--- a/tools/perf/arch/x86/util/Build
+++ b/tools/perf/arch/x86/util/Build
@@ -1,7 +1,6 @@
 perf-util-y += header.o
 perf-util-y += tsc.o
 perf-util-y += pmu.o
-perf-util-$(CONFIG_LIBTRACEEVENT) += kvm-stat.o
 perf-util-y += perf_regs.o
 perf-util-y += topdown.o
 perf-util-y += machine.o
diff --git a/tools/perf/arch/x86/util/kvm-stat.c b/tools/perf/arch/x86/util/kvm-stat.c
deleted file mode 100644
index bff36f9345ea..000000000000
--- a/tools/perf/arch/x86/util/kvm-stat.c
+++ /dev/null
@@ -1,264 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <errno.h>
-#include <string.h>
-#include "../../../util/kvm-stat.h"
-#include "../../../util/evsel.h"
-#include "../../../util/env.h"
-#include <asm/svm.h>
-#include <asm/vmx.h>
-#include <asm/kvm.h>
-#include <subcmd/parse-options.h>
-
-define_exit_reasons_table(vmx_exit_reasons, VMX_EXIT_REASONS);
-define_exit_reasons_table(svm_exit_reasons, SVM_EXIT_REASONS);
-
-static struct kvm_events_ops exit_events = {
-	.is_begin_event = exit_event_begin,
-	.is_end_event = exit_event_end,
-	.decode_key = exit_event_decode_key,
-	.name = "VM-EXIT"
-};
-
-const char *vcpu_id_str = "vcpu_id";
-const char *kvm_exit_reason = "exit_reason";
-const char *kvm_entry_trace = "kvm:kvm_entry";
-const char *kvm_exit_trace = "kvm:kvm_exit";
-
-/*
- * For the mmio events, we treat:
- * the time of MMIO write: kvm_mmio(KVM_TRACE_MMIO_WRITE...) -> kvm_entry
- * the time of MMIO read: kvm_exit -> kvm_mmio(KVM_TRACE_MMIO_READ...).
- */
-static void mmio_event_get_key(struct evsel *evsel, struct perf_sample *sample,
-			       struct event_key *key)
-{
-	key->key  = evsel__intval(evsel, sample, "gpa");
-	key->info = evsel__intval(evsel, sample, "type");
-}
-
-#define KVM_TRACE_MMIO_READ_UNSATISFIED 0
-#define KVM_TRACE_MMIO_READ 1
-#define KVM_TRACE_MMIO_WRITE 2
-
-static bool mmio_event_begin(struct evsel *evsel,
-			     struct perf_sample *sample, struct event_key *key)
-{
-	/* MMIO read begin event in kernel. */
-	if (kvm_exit_event(evsel))
-		return true;
-
-	/* MMIO write begin event in kernel. */
-	if (evsel__name_is(evsel, "kvm:kvm_mmio") &&
-	    evsel__intval(evsel, sample, "type") == KVM_TRACE_MMIO_WRITE) {
-		mmio_event_get_key(evsel, sample, key);
-		return true;
-	}
-
-	return false;
-}
-
-static bool mmio_event_end(struct evsel *evsel, struct perf_sample *sample,
-			   struct event_key *key)
-{
-	/* MMIO write end event in kernel. */
-	if (kvm_entry_event(evsel))
-		return true;
-
-	/* MMIO read end event in kernel.*/
-	if (evsel__name_is(evsel, "kvm:kvm_mmio") &&
-	    evsel__intval(evsel, sample, "type") == KVM_TRACE_MMIO_READ) {
-		mmio_event_get_key(evsel, sample, key);
-		return true;
-	}
-
-	return false;
-}
-
-static void mmio_event_decode_key(struct perf_kvm_stat *kvm __maybe_unused,
-				  struct event_key *key,
-				  char *decode)
-{
-	scnprintf(decode, KVM_EVENT_NAME_LEN, "%#lx:%s",
-		  (unsigned long)key->key,
-		  key->info == KVM_TRACE_MMIO_WRITE ? "W" : "R");
-}
-
-static struct kvm_events_ops mmio_events = {
-	.is_begin_event = mmio_event_begin,
-	.is_end_event = mmio_event_end,
-	.decode_key = mmio_event_decode_key,
-	.name = "MMIO Access"
-};
-
- /* The time of emulation pio access is from kvm_pio to kvm_entry. */
-static void ioport_event_get_key(struct evsel *evsel,
-				 struct perf_sample *sample,
-				 struct event_key *key)
-{
-	key->key  = evsel__intval(evsel, sample, "port");
-	key->info = evsel__intval(evsel, sample, "rw");
-}
-
-static bool ioport_event_begin(struct evsel *evsel,
-			       struct perf_sample *sample,
-			       struct event_key *key)
-{
-	if (evsel__name_is(evsel, "kvm:kvm_pio")) {
-		ioport_event_get_key(evsel, sample, key);
-		return true;
-	}
-
-	return false;
-}
-
-static bool ioport_event_end(struct evsel *evsel,
-			     struct perf_sample *sample __maybe_unused,
-			     struct event_key *key __maybe_unused)
-{
-	return kvm_entry_event(evsel);
-}
-
-static void ioport_event_decode_key(struct perf_kvm_stat *kvm __maybe_unused,
-				    struct event_key *key,
-				    char *decode)
-{
-	scnprintf(decode, KVM_EVENT_NAME_LEN, "%#llx:%s",
-		  (unsigned long long)key->key,
-		  key->info ? "POUT" : "PIN");
-}
-
-static struct kvm_events_ops ioport_events = {
-	.is_begin_event = ioport_event_begin,
-	.is_end_event = ioport_event_end,
-	.decode_key = ioport_event_decode_key,
-	.name = "IO Port Access"
-};
-
- /* The time of emulation msr is from kvm_msr to kvm_entry. */
-static void msr_event_get_key(struct evsel *evsel,
-				 struct perf_sample *sample,
-				 struct event_key *key)
-{
-	key->key  = evsel__intval(evsel, sample, "ecx");
-	key->info = evsel__intval(evsel, sample, "write");
-}
-
-static bool msr_event_begin(struct evsel *evsel,
-			       struct perf_sample *sample,
-			       struct event_key *key)
-{
-	if (evsel__name_is(evsel, "kvm:kvm_msr")) {
-		msr_event_get_key(evsel, sample, key);
-		return true;
-	}
-
-	return false;
-}
-
-static bool msr_event_end(struct evsel *evsel,
-			     struct perf_sample *sample __maybe_unused,
-			     struct event_key *key __maybe_unused)
-{
-	return kvm_entry_event(evsel);
-}
-
-static void msr_event_decode_key(struct perf_kvm_stat *kvm __maybe_unused,
-				    struct event_key *key,
-				    char *decode)
-{
-	scnprintf(decode, KVM_EVENT_NAME_LEN, "%#llx:%s",
-		  (unsigned long long)key->key,
-		  key->info ? "W" : "R");
-}
-
-static struct kvm_events_ops msr_events = {
-	.is_begin_event = msr_event_begin,
-	.is_end_event = msr_event_end,
-	.decode_key = msr_event_decode_key,
-	.name = "MSR Access"
-};
-
-const char *kvm_events_tp[] = {
-	"kvm:kvm_entry",
-	"kvm:kvm_exit",
-	"kvm:kvm_mmio",
-	"kvm:kvm_pio",
-	"kvm:kvm_msr",
-	NULL,
-};
-
-struct kvm_reg_events_ops kvm_reg_events_ops[] = {
-	{ .name = "vmexit", .ops = &exit_events },
-	{ .name = "mmio", .ops = &mmio_events },
-	{ .name = "ioport", .ops = &ioport_events },
-	{ .name = "msr", .ops = &msr_events },
-	{ NULL, NULL },
-};
-
-const char * const kvm_skip_events[] = {
-	"HLT",
-	NULL,
-};
-
-int cpu_isa_init(struct perf_kvm_stat *kvm, const char *cpuid)
-{
-	if (strstr(cpuid, "Intel")) {
-		kvm->exit_reasons = vmx_exit_reasons;
-		kvm->exit_reasons_isa = "VMX";
-	} else if (strstr(cpuid, "AMD") || strstr(cpuid, "Hygon")) {
-		kvm->exit_reasons = svm_exit_reasons;
-		kvm->exit_reasons_isa = "SVM";
-	} else
-		return -ENOTSUP;
-
-	return 0;
-}
-
-/*
- * After KVM supports PEBS for guest on Intel platforms
- * (https://lore.kernel.org/all/20220411101946.20262-1-likexu@tencent.com/),
- * host loses the capability to sample guest with PEBS since all PEBS related
- * MSRs are switched to guest value after vm-entry, like IA32_DS_AREA MSR is
- * switched to guest GVA at vm-entry. This would lead to "perf kvm record"
- * fails to sample guest on Intel platforms since "cycles:P" event is used to
- * sample guest by default.
- *
- * So, to avoid this issue explicitly use "cycles" instead of "cycles:P" event
- * by default to sample guest on Intel platforms.
- */
-int kvm_add_default_arch_event(int *argc, const char **argv)
-{
-	const char **tmp;
-	bool event = false;
-	int ret = 0, i, j = *argc;
-
-	const struct option event_options[] = {
-		OPT_BOOLEAN('e', "event", &event, NULL),
-		OPT_BOOLEAN(0, "pfm-events", &event, NULL),
-		OPT_END()
-	};
-
-	if (!x86__is_intel_cpu())
-		return 0;
-
-	tmp = calloc(j + 1, sizeof(char *));
-	if (!tmp)
-		return -ENOMEM;
-
-	for (i = 0; i < j; i++)
-		tmp[i] = argv[i];
-
-	parse_options(j, tmp, event_options, NULL, PARSE_OPT_KEEP_UNKNOWN);
-	if (!event) {
-		argv[j++] = STRDUP_FAIL_EXIT("-e");
-		argv[j++] = STRDUP_FAIL_EXIT("cycles");
-		*argc += 2;
-	}
-
-	free(tmp);
-	return 0;
-
-EXIT:
-	free(tmp);
-	return ret;
-}
diff --git a/tools/perf/builtin-kvm.c b/tools/perf/builtin-kvm.c
index c61369d54dd9..bd9bda32157f 100644
--- a/tools/perf/builtin-kvm.c
+++ b/tools/perf/builtin-kvm.c
@@ -52,7 +52,7 @@
 #include <math.h>
 #include <perf/mmap.h>
 
-#if defined(HAVE_KVM_STAT_SUPPORT) && defined(HAVE_LIBTRACEEVENT)
+#if defined(HAVE_LIBTRACEEVENT)
 #define GET_EVENT_KEY(func, field)					\
 static u64 get_event_ ##func(struct kvm_event *event, int vcpu)		\
 {									\
@@ -597,7 +597,7 @@ static void kvm_display(struct perf_kvm_stat *kvm)
 
 #endif /* HAVE_SLANG_SUPPORT */
 
-#endif // defined(HAVE_KVM_STAT_SUPPORT) && defined(HAVE_LIBTRACEEVENT)
+#endif // defined(HAVE_LIBTRACEEVENT)
 
 static const char *get_filename_for_perf_kvm(void)
 {
@@ -613,13 +613,13 @@ static const char *get_filename_for_perf_kvm(void)
 	return filename;
 }
 
-#if defined(HAVE_KVM_STAT_SUPPORT) && defined(HAVE_LIBTRACEEVENT)
+#if defined(HAVE_LIBTRACEEVENT)
 
 static bool register_kvm_events_ops(struct perf_kvm_stat *kvm)
 {
-	struct kvm_reg_events_ops *events_ops = kvm_reg_events_ops;
+	const struct kvm_reg_events_ops *events_ops;
 
-	for (events_ops = kvm_reg_events_ops; events_ops->name; events_ops++) {
+	for (events_ops = kvm_reg_events_ops(); events_ops->name; events_ops++) {
 		if (!strcmp(events_ops->name, kvm->report_event)) {
 			kvm->events_ops = events_ops->ops;
 			return true;
@@ -809,7 +809,7 @@ static bool is_child_event(struct perf_kvm_stat *kvm,
 			   struct perf_sample *sample,
 			   struct event_key *key)
 {
-	struct child_event_ops *child_ops;
+	const struct child_event_ops *child_ops;
 
 	child_ops = kvm->events_ops->child_ops;
 
@@ -845,7 +845,7 @@ static bool skip_event(const char *event)
 {
 	const char * const *skip_events;
 
-	for (skip_events = kvm_skip_events; *skip_events; skip_events++)
+	for (skip_events = kvm_skip_events(); *skip_events; skip_events++)
 		if (!strcmp(event, *skip_events))
 			return true;
 
@@ -928,7 +928,7 @@ struct vcpu_event_record *per_vcpu_record(struct thread *thread,
 			return NULL;
 		}
 
-		vcpu_record->vcpu_id = evsel__intval(evsel, sample, vcpu_id_str);
+		vcpu_record->vcpu_id = evsel__intval(evsel, sample, vcpu_id_str());
 		thread__set_priv(thread, vcpu_record);
 	}
 
@@ -1636,11 +1636,6 @@ exit:
 	return ret;
 }
 
-int __weak setup_kvm_events_tp(struct perf_kvm_stat *kvm __maybe_unused)
-{
-	return 0;
-}
-
 static int
 kvm_events_record(struct perf_kvm_stat *kvm, int argc, const char **argv)
 {
@@ -1666,7 +1661,7 @@ kvm_events_record(struct perf_kvm_stat *kvm, int argc, const char **argv)
 		return ret;
 	}
 
-	for (events_tp = kvm_events_tp; *events_tp; events_tp++)
+	for (events_tp = kvm_events_tp(); *events_tp; events_tp++)
 		events_tp_size++;
 
 	rec_argc = ARRAY_SIZE(record_args) + argc + 2 +
@@ -1681,7 +1676,7 @@ kvm_events_record(struct perf_kvm_stat *kvm, int argc, const char **argv)
 
 	for (j = 0; j < events_tp_size; j++) {
 		rec_argv[i++] = STRDUP_FAIL_EXIT("-e");
-		rec_argv[i++] = STRDUP_FAIL_EXIT(kvm_events_tp[j]);
+		rec_argv[i++] = STRDUP_FAIL_EXIT(kvm_events_tp()[j]);
 	}
 
 	rec_argv[i++] = STRDUP_FAIL_EXIT("-o");
@@ -1775,7 +1770,7 @@ static struct evlist *kvm_live_event_list(void)
 	if (evlist == NULL)
 		return NULL;
 
-	for (events_tp = kvm_events_tp; *events_tp; events_tp++) {
+	for (events_tp = kvm_events_tp(); *events_tp; events_tp++) {
 
 		tp = strdup(*events_tp);
 		if (tp == NULL)
@@ -1985,13 +1980,7 @@ static int kvm_cmd_stat(const char *file_name, int argc, const char **argv)
 perf_stat:
 	return cmd_stat(argc, argv);
 }
-#endif /* HAVE_KVM_STAT_SUPPORT */
-
-int __weak kvm_add_default_arch_event(int *argc __maybe_unused,
-					const char **argv __maybe_unused)
-{
-	return 0;
-}
+#endif /* HAVE_LIBTRACEEVENT */
 
 static int __cmd_record(const char *file_name, int argc, const char **argv)
 {
@@ -2179,7 +2168,7 @@ int cmd_kvm(int argc, const char **argv)
 		return __cmd_top(argc, argv);
 	else if (strlen(argv[0]) > 2 && strstarts("buildid-list", argv[0]))
 		return __cmd_buildid_list(file_name, argc, argv);
-#if defined(HAVE_KVM_STAT_SUPPORT) && defined(HAVE_LIBTRACEEVENT)
+#if defined(HAVE_LIBTRACEEVENT)
 	else if (strlen(argv[0]) > 2 && strstarts("stat", argv[0]))
 		return kvm_cmd_stat(file_name, argc, argv);
 #endif
diff --git a/tools/perf/util/Build b/tools/perf/util/Build
index c037b1e99d28..bcccad7487a9 100644
--- a/tools/perf/util/Build
+++ b/tools/perf/util/Build
@@ -128,7 +128,8 @@ perf-util-y += spark.o
 perf-util-y += topdown.o
 perf-util-y += iostat.o
 perf-util-y += stream.o
-perf-util-y += kvm-stat.o
+perf-util-$(CONFIG_LIBTRACEEVENT) += kvm-stat.o
+perf-util-y += kvm-stat-arch/
 perf-util-y += lock-contention.o
 perf-util-y += auxtrace.o
 perf-util-y += intel-pt-decoder/
diff --git a/tools/perf/util/kvm-stat-arch/Build b/tools/perf/util/kvm-stat-arch/Build
new file mode 100644
index 000000000000..d84e55656e7a
--- /dev/null
+++ b/tools/perf/util/kvm-stat-arch/Build
@@ -0,0 +1,6 @@
+perf-util-$(CONFIG_LIBTRACEEVENT) += kvm-stat-arm64.o
+perf-util-$(CONFIG_LIBTRACEEVENT) += kvm-stat-loongarch.o
+perf-util-$(CONFIG_LIBTRACEEVENT) += kvm-stat-powerpc.o
+perf-util-$(CONFIG_LIBTRACEEVENT) += kvm-stat-riscv.o
+perf-util-$(CONFIG_LIBTRACEEVENT) += kvm-stat-s390.o
+perf-util-$(CONFIG_LIBTRACEEVENT) += kvm-stat-x86.o
diff --git a/tools/perf/util/kvm-stat-arch/arm64_exception_types.h b/tools/perf/util/kvm-stat-arch/arm64_exception_types.h
new file mode 100644
index 000000000000..bf827f19ace0
--- /dev/null
+++ b/tools/perf/util/kvm-stat-arch/arm64_exception_types.h
@@ -0,0 +1,97 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef ARCH_PERF_ARM64_EXCEPTION_TYPES_H
+#define ARCH_PERF_ARM64_EXCEPTION_TYPES_H
+
+/* Per asm/virt.h */
+#define HVC_STUB_ERR		  0xbadca11
+
+/* Per asm/kvm_asm.h */
+#define ARM_EXCEPTION_IRQ		0
+#define ARM_EXCEPTION_EL1_SERROR	1
+#define ARM_EXCEPTION_TRAP		2
+#define ARM_EXCEPTION_IL		3
+/* The hyp-stub will return this for any kvm_call_hyp() call */
+#define ARM_EXCEPTION_HYP_GONE		HVC_STUB_ERR
+
+#define kvm_arm_exception_type					\
+	{ARM_EXCEPTION_IRQ,		"IRQ"		},	\
+	{ARM_EXCEPTION_EL1_SERROR,	"SERROR"	},	\
+	{ARM_EXCEPTION_TRAP,		"TRAP"		},	\
+	{ARM_EXCEPTION_IL,		"ILLEGAL"	},	\
+	{ARM_EXCEPTION_HYP_GONE,	"HYP_GONE"	}
+
+/* Per asm/esr.h */
+#define ESR_ELx_EC_UNKNOWN	(0x00)
+#define ESR_ELx_EC_WFx		(0x01)
+/* Unallocated EC: 0x02 */
+#define ESR_ELx_EC_CP15_32	(0x03)
+#define ESR_ELx_EC_CP15_64	(0x04)
+#define ESR_ELx_EC_CP14_MR	(0x05)
+#define ESR_ELx_EC_CP14_LS	(0x06)
+#define ESR_ELx_EC_FP_ASIMD	(0x07)
+#define ESR_ELx_EC_CP10_ID	(0x08)	/* EL2 only */
+#define ESR_ELx_EC_PAC		(0x09)	/* EL2 and above */
+#define ESR_ELx_EC_OTHER	(0x0A)
+/* Unallocated EC: 0x0B */
+#define ESR_ELx_EC_CP14_64	(0x0C)
+#define ESR_ELx_EC_BTI		(0x0D)
+#define ESR_ELx_EC_ILL		(0x0E)
+/* Unallocated EC: 0x0F - 0x10 */
+#define ESR_ELx_EC_SVC32	(0x11)
+#define ESR_ELx_EC_HVC32	(0x12)	/* EL2 only */
+#define ESR_ELx_EC_SMC32	(0x13)	/* EL2 and above */
+/* Unallocated EC: 0x14 */
+#define ESR_ELx_EC_SVC64	(0x15)
+#define ESR_ELx_EC_HVC64	(0x16)	/* EL2 and above */
+#define ESR_ELx_EC_SMC64	(0x17)	/* EL2 and above */
+#define ESR_ELx_EC_SYS64	(0x18)
+#define ESR_ELx_EC_SVE		(0x19)
+#define ESR_ELx_EC_ERET		(0x1a)	/* EL2 only */
+/* Unallocated EC: 0x1B */
+#define ESR_ELx_EC_FPAC		(0x1C)	/* EL1 and above */
+#define ESR_ELx_EC_SME		(0x1D)
+/* Unallocated EC: 0x1E */
+#define ESR_ELx_EC_IMP_DEF	(0x1f)	/* EL3 only */
+#define ESR_ELx_EC_IABT_LOW	(0x20)
+#define ESR_ELx_EC_IABT_CUR	(0x21)
+#define ESR_ELx_EC_PC_ALIGN	(0x22)
+/* Unallocated EC: 0x23 */
+#define ESR_ELx_EC_DABT_LOW	(0x24)
+#define ESR_ELx_EC_DABT_CUR	(0x25)
+#define ESR_ELx_EC_SP_ALIGN	(0x26)
+#define ESR_ELx_EC_MOPS		(0x27)
+#define ESR_ELx_EC_FP_EXC32	(0x28)
+/* Unallocated EC: 0x29 - 0x2B */
+#define ESR_ELx_EC_FP_EXC64	(0x2C)
+#define ESR_ELx_EC_GCS		(0x2D)
+/* Unallocated EC: 0x2E */
+#define ESR_ELx_EC_SERROR	(0x2F)
+#define ESR_ELx_EC_BREAKPT_LOW	(0x30)
+#define ESR_ELx_EC_BREAKPT_CUR	(0x31)
+#define ESR_ELx_EC_SOFTSTP_LOW	(0x32)
+#define ESR_ELx_EC_SOFTSTP_CUR	(0x33)
+#define ESR_ELx_EC_WATCHPT_LOW	(0x34)
+#define ESR_ELx_EC_WATCHPT_CUR	(0x35)
+/* Unallocated EC: 0x36 - 0x37 */
+#define ESR_ELx_EC_BKPT32	(0x38)
+/* Unallocated EC: 0x39 */
+#define ESR_ELx_EC_VECTOR32	(0x3A)	/* EL2 only */
+/* Unallocated EC: 0x3B */
+#define ESR_ELx_EC_BRK64	(0x3C)
+/* Unallocated EC: 0x3D - 0x3F */
+#define ESR_ELx_EC_MAX		(0x3F)
+
+#define ECN(x) { ESR_ELx_EC_##x, #x }
+
+#define kvm_arm_exception_class \
+	ECN(UNKNOWN), ECN(WFx), ECN(CP15_32), ECN(CP15_64), ECN(CP14_MR), \
+	ECN(CP14_LS), ECN(FP_ASIMD), ECN(CP10_ID), ECN(PAC), ECN(CP14_64), \
+	ECN(SVC64), ECN(HVC64), ECN(SMC64), ECN(SYS64), ECN(SVE), \
+	ECN(IMP_DEF), ECN(IABT_LOW), ECN(IABT_CUR), \
+	ECN(PC_ALIGN), ECN(DABT_LOW), ECN(DABT_CUR), \
+	ECN(SP_ALIGN), ECN(FP_EXC32), ECN(FP_EXC64), ECN(SERROR), \
+	ECN(BREAKPT_LOW), ECN(BREAKPT_CUR), ECN(SOFTSTP_LOW), \
+	ECN(SOFTSTP_CUR), ECN(WATCHPT_LOW), ECN(WATCHPT_CUR), \
+	ECN(BKPT32), ECN(VECTOR32), ECN(BRK64)
+
+#endif /* ARCH_PERF_ARM64_EXCEPTION_TYPES_H */
diff --git a/tools/perf/util/kvm-stat-arch/book3s_hcalls.h b/tools/perf/util/kvm-stat-arch/book3s_hcalls.h
new file mode 100644
index 000000000000..488f4339b83c
--- /dev/null
+++ b/tools/perf/util/kvm-stat-arch/book3s_hcalls.h
@@ -0,0 +1,124 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef ARCH_PERF_BOOK3S_HV_HCALLS_H
+#define ARCH_PERF_BOOK3S_HV_HCALLS_H
+
+/*
+ * PowerPC HCALL codes : hcall code to name mapping
+ */
+#define kvm_trace_symbol_hcall \
+	{0x4, "H_REMOVE"},					\
+	{0x8, "H_ENTER"},					\
+	{0xc, "H_READ"},					\
+	{0x10, "H_CLEAR_MOD"},					\
+	{0x14, "H_CLEAR_REF"},					\
+	{0x18, "H_PROTECT"},					\
+	{0x1c, "H_GET_TCE"},					\
+	{0x20, "H_PUT_TCE"},					\
+	{0x24, "H_SET_SPRG0"},					\
+	{0x28, "H_SET_DABR"},					\
+	{0x2c, "H_PAGE_INIT"},					\
+	{0x30, "H_SET_ASR"},					\
+	{0x34, "H_ASR_ON"},					\
+	{0x38, "H_ASR_OFF"},					\
+	{0x3c, "H_LOGICAL_CI_LOAD"},				\
+	{0x40, "H_LOGICAL_CI_STORE"},				\
+	{0x44, "H_LOGICAL_CACHE_LOAD"},				\
+	{0x48, "H_LOGICAL_CACHE_STORE"},			\
+	{0x4c, "H_LOGICAL_ICBI"},				\
+	{0x50, "H_LOGICAL_DCBF"},				\
+	{0x54, "H_GET_TERM_CHAR"},				\
+	{0x58, "H_PUT_TERM_CHAR"},				\
+	{0x5c, "H_REAL_TO_LOGICAL"},				\
+	{0x60, "H_HYPERVISOR_DATA"},				\
+	{0x64, "H_EOI"},					\
+	{0x68, "H_CPPR"},					\
+	{0x6c, "H_IPI"},					\
+	{0x70, "H_IPOLL"},					\
+	{0x74, "H_XIRR"},					\
+	{0x78, "H_MIGRATE_DMA"},				\
+	{0x7c, "H_PERFMON"},					\
+	{0xdc, "H_REGISTER_VPA"},				\
+	{0xe0, "H_CEDE"},					\
+	{0xe4, "H_CONFER"},					\
+	{0xe8, "H_PROD"},					\
+	{0xec, "H_GET_PPP"},					\
+	{0xf0, "H_SET_PPP"},					\
+	{0xf4, "H_PURR"},					\
+	{0xf8, "H_PIC"},					\
+	{0xfc, "H_REG_CRQ"},					\
+	{0x100, "H_FREE_CRQ"},					\
+	{0x104, "H_VIO_SIGNAL"},				\
+	{0x108, "H_SEND_CRQ"},					\
+	{0x110, "H_COPY_RDMA"},					\
+	{0x114, "H_REGISTER_LOGICAL_LAN"},			\
+	{0x118, "H_FREE_LOGICAL_LAN"},				\
+	{0x11c, "H_ADD_LOGICAL_LAN_BUFFER"},			\
+	{0x120, "H_SEND_LOGICAL_LAN"},				\
+	{0x124, "H_BULK_REMOVE"},				\
+	{0x130, "H_MULTICAST_CTRL"},				\
+	{0x134, "H_SET_XDABR"},					\
+	{0x138, "H_STUFF_TCE"},					\
+	{0x13c, "H_PUT_TCE_INDIRECT"},				\
+	{0x14c, "H_CHANGE_LOGICAL_LAN_MAC"},			\
+	{0x150, "H_VTERM_PARTNER_INFO"},			\
+	{0x154, "H_REGISTER_VTERM"},				\
+	{0x158, "H_FREE_VTERM"},				\
+	{0x15c, "H_RESET_EVENTS"},				\
+	{0x160, "H_ALLOC_RESOURCE"},				\
+	{0x164, "H_FREE_RESOURCE"},				\
+	{0x168, "H_MODIFY_QP"},					\
+	{0x16c, "H_QUERY_QP"},					\
+	{0x170, "H_REREGISTER_PMR"},				\
+	{0x174, "H_REGISTER_SMR"},				\
+	{0x178, "H_QUERY_MR"},					\
+	{0x17c, "H_QUERY_MW"},					\
+	{0x180, "H_QUERY_HCA"},					\
+	{0x184, "H_QUERY_PORT"},				\
+	{0x188, "H_MODIFY_PORT"},				\
+	{0x18c, "H_DEFINE_AQP1"},				\
+	{0x190, "H_GET_TRACE_BUFFER"},				\
+	{0x194, "H_DEFINE_AQP0"},				\
+	{0x198, "H_RESIZE_MR"},					\
+	{0x19c, "H_ATTACH_MCQP"},				\
+	{0x1a0, "H_DETACH_MCQP"},				\
+	{0x1a4, "H_CREATE_RPT"},				\
+	{0x1a8, "H_REMOVE_RPT"},				\
+	{0x1ac, "H_REGISTER_RPAGES"},				\
+	{0x1b0, "H_DISABLE_AND_GET"},				\
+	{0x1b4, "H_ERROR_DATA"},				\
+	{0x1b8, "H_GET_HCA_INFO"},				\
+	{0x1bc, "H_GET_PERF_COUNT"},				\
+	{0x1c0, "H_MANAGE_TRACE"},				\
+	{0x1d4, "H_FREE_LOGICAL_LAN_BUFFER"},			\
+	{0x1d8, "H_POLL_PENDING"},				\
+	{0x1e4, "H_QUERY_INT_STATE"},				\
+	{0x244, "H_ILLAN_ATTRIBUTES"},				\
+	{0x250, "H_MODIFY_HEA_QP"},				\
+	{0x254, "H_QUERY_HEA_QP"},				\
+	{0x258, "H_QUERY_HEA"},					\
+	{0x25c, "H_QUERY_HEA_PORT"},				\
+	{0x260, "H_MODIFY_HEA_PORT"},				\
+	{0x264, "H_REG_BCMC"},					\
+	{0x268, "H_DEREG_BCMC"},				\
+	{0x26c, "H_REGISTER_HEA_RPAGES"},			\
+	{0x270, "H_DISABLE_AND_GET_HEA"},			\
+	{0x274, "H_GET_HEA_INFO"},				\
+	{0x278, "H_ALLOC_HEA_RESOURCE"},			\
+	{0x284, "H_ADD_CONN"},					\
+	{0x288, "H_DEL_CONN"},					\
+	{0x298, "H_JOIN"},					\
+	{0x2a4, "H_VASI_STATE"},				\
+	{0x2b0, "H_ENABLE_CRQ"},				\
+	{0x2b8, "H_GET_EM_PARMS"},				\
+	{0x2d0, "H_SET_MPP"},					\
+	{0x2d4, "H_GET_MPP"},					\
+	{0x2ec, "H_HOME_NODE_ASSOCIATIVITY"},			\
+	{0x2f4, "H_BEST_ENERGY"},				\
+	{0x2fc, "H_XIRR_X"},					\
+	{0x300, "H_RANDOM"},					\
+	{0x304, "H_COP"},					\
+	{0x314, "H_GET_MPP_X"},					\
+	{0x31c, "H_SET_MODE"},					\
+	{0xf000, "H_RTAS"}					\
+
+#endif
diff --git a/tools/perf/util/kvm-stat-arch/book3s_hv_exits.h b/tools/perf/util/kvm-stat-arch/book3s_hv_exits.h
new file mode 100644
index 000000000000..2011376c7ab5
--- /dev/null
+++ b/tools/perf/util/kvm-stat-arch/book3s_hv_exits.h
@@ -0,0 +1,33 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef ARCH_PERF_BOOK3S_HV_EXITS_H
+#define ARCH_PERF_BOOK3S_HV_EXITS_H
+
+/*
+ * PowerPC Interrupt vectors : exit code to name mapping
+ */
+
+#define kvm_trace_symbol_exit \
+	{0x0,	"RETURN_TO_HOST"}, \
+	{0x100, "SYSTEM_RESET"}, \
+	{0x200, "MACHINE_CHECK"}, \
+	{0x300, "DATA_STORAGE"}, \
+	{0x380, "DATA_SEGMENT"}, \
+	{0x400, "INST_STORAGE"}, \
+	{0x480, "INST_SEGMENT"}, \
+	{0x500, "EXTERNAL"}, \
+	{0x502, "EXTERNAL_HV"}, \
+	{0x600, "ALIGNMENT"}, \
+	{0x700, "PROGRAM"}, \
+	{0x800, "FP_UNAVAIL"}, \
+	{0x900, "DECREMENTER"}, \
+	{0x980, "HV_DECREMENTER"}, \
+	{0xc00, "SYSCALL"}, \
+	{0xd00, "TRACE"}, \
+	{0xe00, "H_DATA_STORAGE"}, \
+	{0xe20, "H_INST_STORAGE"}, \
+	{0xe40, "H_EMUL_ASSIST"}, \
+	{0xf00, "PERFMON"}, \
+	{0xf20, "ALTIVEC"}, \
+	{0xf40, "VSX"}
+
+#endif
diff --git a/tools/perf/util/kvm-stat-arch/kvm-stat-arm64.c b/tools/perf/util/kvm-stat-arch/kvm-stat-arm64.c
new file mode 100644
index 000000000000..8003ff415b1a
--- /dev/null
+++ b/tools/perf/util/kvm-stat-arch/kvm-stat-arm64.c
@@ -0,0 +1,95 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <errno.h>
+#include <memory.h>
+#include "../debug.h"
+#include "../evsel.h"
+#include "../kvm-stat.h"
+#include "arm64_exception_types.h"
+
+define_exit_reasons_table(arm64_exit_reasons, kvm_arm_exception_type);
+define_exit_reasons_table(arm64_trap_exit_reasons, kvm_arm_exception_class);
+
+static const char *kvm_trap_exit_reason = "esr_ec";
+
+static const char * const __kvm_events_tp[] = {
+	"kvm:kvm_entry",
+	"kvm:kvm_exit",
+	NULL,
+};
+
+static void event_get_key(struct evsel *evsel,
+			  struct perf_sample *sample,
+			  struct event_key *key)
+{
+	key->info = 0;
+	key->key = evsel__intval(evsel, sample, kvm_exit_reason());
+	key->exit_reasons = arm64_exit_reasons;
+
+	/*
+	 * TRAP exceptions carry exception class info in esr_ec field
+	 * and, hence, we need to use a different exit_reasons table to
+	 * properly decode event's est_ec.
+	 */
+	if (key->key == ARM_EXCEPTION_TRAP) {
+		key->key = evsel__intval(evsel, sample, kvm_trap_exit_reason);
+		key->exit_reasons = arm64_trap_exit_reasons;
+	}
+}
+
+static bool event_begin(struct evsel *evsel,
+			struct perf_sample *sample __maybe_unused,
+			struct event_key *key __maybe_unused)
+{
+	return evsel__name_is(evsel, kvm_entry_trace());
+}
+
+static bool event_end(struct evsel *evsel,
+		      struct perf_sample *sample,
+		      struct event_key *key)
+{
+	if (evsel__name_is(evsel, kvm_exit_trace())) {
+		event_get_key(evsel, sample, key);
+		return true;
+	}
+	return false;
+}
+
+static const struct kvm_events_ops exit_events = {
+	.is_begin_event = event_begin,
+	.is_end_event	= event_end,
+	.decode_key	= exit_event_decode_key,
+	.name		= "VM-EXIT"
+};
+
+static const struct kvm_reg_events_ops __kvm_reg_events_ops[] = {
+	{
+		.name	= "vmexit",
+		.ops	= &exit_events,
+	},
+	{ NULL, NULL },
+};
+
+static const char * const __kvm_skip_events[] = {
+	NULL,
+};
+
+int __cpu_isa_init_arm64(struct perf_kvm_stat *kvm)
+{
+	kvm->exit_reasons_isa = "arm64";
+	return 0;
+}
+
+const char * const *__kvm_events_tp_arm64(void)
+{
+	return __kvm_events_tp;
+}
+
+const struct kvm_reg_events_ops *__kvm_reg_events_ops_arm64(void)
+{
+	return __kvm_reg_events_ops;
+}
+
+const char * const *__kvm_skip_events_arm64(void)
+{
+	return __kvm_skip_events;
+}
diff --git a/tools/perf/util/kvm-stat-arch/kvm-stat-loongarch.c b/tools/perf/util/kvm-stat-arch/kvm-stat-loongarch.c
new file mode 100644
index 000000000000..a15ce072ac34
--- /dev/null
+++ b/tools/perf/util/kvm-stat-arch/kvm-stat-loongarch.c
@@ -0,0 +1,151 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <errno.h>
+#include <memory.h>
+#include "../kvm-stat.h"
+#include "../parse-events.h"
+#include "../debug.h"
+#include "../evsel.h"
+#include "../evlist.h"
+#include "../pmus.h"
+
+#define LOONGARCH_EXCEPTION_INT		0
+#define LOONGARCH_EXCEPTION_PIL		1
+#define LOONGARCH_EXCEPTION_PIS		2
+#define LOONGARCH_EXCEPTION_PIF		3
+#define LOONGARCH_EXCEPTION_PME		4
+#define LOONGARCH_EXCEPTION_FPD		15
+#define LOONGARCH_EXCEPTION_SXD		16
+#define LOONGARCH_EXCEPTION_ASXD	17
+#define LOONGARCH_EXCEPTION_GSPR	22
+#define  LOONGARCH_EXCEPTION_CPUCFG	100
+#define  LOONGARCH_EXCEPTION_CSR	101
+#define  LOONGARCH_EXCEPTION_IOCSR	102
+#define  LOONGARCH_EXCEPTION_IDLE	103
+#define  LOONGARCH_EXCEPTION_OTHERS	104
+#define LOONGARCH_EXCEPTION_HVC		23
+
+#define loongarch_exception_type				\
+	{LOONGARCH_EXCEPTION_INT,  "Interrupt" },		\
+	{LOONGARCH_EXCEPTION_PIL,  "Mem Read" },		\
+	{LOONGARCH_EXCEPTION_PIS,  "Mem Store" },		\
+	{LOONGARCH_EXCEPTION_PIF,  "Inst Fetch" },		\
+	{LOONGARCH_EXCEPTION_PME,  "Mem Modify" },		\
+	{LOONGARCH_EXCEPTION_FPD,  "FPU" },			\
+	{LOONGARCH_EXCEPTION_SXD,  "LSX" },			\
+	{LOONGARCH_EXCEPTION_ASXD, "LASX" },			\
+	{LOONGARCH_EXCEPTION_GSPR, "Privilege Error" },		\
+	{LOONGARCH_EXCEPTION_HVC,  "Hypercall" },		\
+	{LOONGARCH_EXCEPTION_CPUCFG, "CPUCFG" },		\
+	{LOONGARCH_EXCEPTION_CSR,    "CSR" },			\
+	{LOONGARCH_EXCEPTION_IOCSR,  "IOCSR" },			\
+	{LOONGARCH_EXCEPTION_IDLE,   "Idle" },			\
+	{LOONGARCH_EXCEPTION_OTHERS, "Others" }
+
+define_exit_reasons_table(loongarch_exit_reasons, loongarch_exception_type);
+
+static const char *kvm_reenter_trace = "kvm:kvm_reenter";
+static const char * const __kvm_events_tp[] = {
+	"kvm:kvm_enter",
+	"kvm:kvm_reenter",
+	"kvm:kvm_exit",
+	"kvm:kvm_exit_gspr",
+	NULL,
+};
+
+static bool event_begin(struct evsel *evsel,
+			struct perf_sample *sample, struct event_key *key)
+{
+	return exit_event_begin(evsel, sample, key);
+}
+
+static bool event_end(struct evsel *evsel,
+		      struct perf_sample *sample __maybe_unused,
+		      struct event_key *key __maybe_unused)
+{
+	/*
+	 * LoongArch kvm is different with other architectures
+	 *
+	 * There is kvm:kvm_reenter or kvm:kvm_enter event adjacent with
+	 * kvm:kvm_exit event.
+	 *   kvm:kvm_enter   means returning to vmm and then to guest
+	 *   kvm:kvm_reenter means returning to guest immediately
+	 */
+	return evsel__name_is(evsel, kvm_entry_trace()) ||
+	       evsel__name_is(evsel, kvm_reenter_trace);
+}
+
+static void event_gspr_get_key(struct evsel *evsel,
+			       struct perf_sample *sample, struct event_key *key)
+{
+	unsigned int insn;
+
+	key->key = LOONGARCH_EXCEPTION_OTHERS;
+	insn = evsel__intval(evsel, sample, "inst_word");
+
+	switch (insn >> 24) {
+	case 0:
+		/* CPUCFG inst trap */
+		if ((insn >> 10) == 0x1b)
+			key->key = LOONGARCH_EXCEPTION_CPUCFG;
+		break;
+	case 4:
+		/* CSR inst trap */
+		key->key = LOONGARCH_EXCEPTION_CSR;
+		break;
+	case 6:
+		/* IOCSR inst trap */
+		if ((insn >> 15) == 0xc90)
+			key->key = LOONGARCH_EXCEPTION_IOCSR;
+		else if ((insn >> 15) == 0xc91)
+			/* Idle inst trap */
+			key->key = LOONGARCH_EXCEPTION_IDLE;
+		break;
+	default:
+		key->key = LOONGARCH_EXCEPTION_OTHERS;
+		break;
+	}
+}
+
+static const struct child_event_ops child_events[] = {
+	{ .name = "kvm:kvm_exit_gspr", .get_key = event_gspr_get_key },
+	{ NULL, NULL },
+};
+
+static const struct kvm_events_ops exit_events = {
+	.is_begin_event = event_begin,
+	.is_end_event = event_end,
+	.child_ops = child_events,
+	.decode_key = exit_event_decode_key,
+	.name = "VM-EXIT"
+};
+
+static const struct kvm_reg_events_ops __kvm_reg_events_ops[] = {
+	{ .name	= "vmexit", .ops = &exit_events, },
+	{ NULL, NULL },
+};
+
+static const char * const __kvm_skip_events[] = {
+	NULL,
+};
+
+int __cpu_isa_init_loongarch(struct perf_kvm_stat *kvm)
+{
+	kvm->exit_reasons_isa = "loongarch64";
+	kvm->exit_reasons = loongarch_exit_reasons;
+	return 0;
+}
+
+const char * const *__kvm_events_tp_loongarch(void)
+{
+	return __kvm_events_tp;
+}
+
+const struct kvm_reg_events_ops *__kvm_reg_events_ops_loongarch(void)
+{
+	return __kvm_reg_events_ops;
+}
+
+const char * const *__kvm_skip_events_loongarch(void)
+{
+	return __kvm_skip_events;
+}
diff --git a/tools/perf/util/kvm-stat-arch/kvm-stat-powerpc.c b/tools/perf/util/kvm-stat-arch/kvm-stat-powerpc.c
new file mode 100644
index 000000000000..42182d70beb6
--- /dev/null
+++ b/tools/perf/util/kvm-stat-arch/kvm-stat-powerpc.c
@@ -0,0 +1,228 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <errno.h>
+#include "../kvm-stat.h"
+#include "../parse-events.h"
+#include "../debug.h"
+#include "../evsel.h"
+#include "../evlist.h"
+#include "../pmus.h"
+
+#include "book3s_hv_exits.h"
+#include "book3s_hcalls.h"
+#include <subcmd/parse-options.h>
+
+#define NR_TPS 4
+
+define_exit_reasons_table(hv_exit_reasons, kvm_trace_symbol_exit);
+define_exit_reasons_table(hcall_reasons, kvm_trace_symbol_hcall);
+
+/* Tracepoints specific to ppc_book3s_hv */
+static const char * const ppc_book3s_hv_kvm_tp[] = {
+	"kvm_hv:kvm_guest_enter",
+	"kvm_hv:kvm_guest_exit",
+	"kvm_hv:kvm_hcall_enter",
+	"kvm_hv:kvm_hcall_exit",
+	NULL,
+};
+
+/* 1 extra placeholder for NULL */
+static const char *__kvm_events_tp[NR_TPS + 1];
+
+static void hcall_event_get_key(struct evsel *evsel,
+				struct perf_sample *sample,
+				struct event_key *key)
+{
+	key->info = 0;
+	key->key = evsel__intval(evsel, sample, "req");
+}
+
+static const char *get_hcall_exit_reason(u64 exit_code)
+{
+	struct exit_reasons_table *tbl = hcall_reasons;
+
+	while (tbl->reason != NULL) {
+		if (tbl->exit_code == exit_code)
+			return tbl->reason;
+		tbl++;
+	}
+
+	pr_debug("Unknown hcall code: %lld\n",
+	       (unsigned long long)exit_code);
+	return "UNKNOWN";
+}
+
+static bool hcall_event_end(struct evsel *evsel,
+			    struct perf_sample *sample __maybe_unused,
+			    struct event_key *key __maybe_unused)
+{
+	return evsel__name_is(evsel, __kvm_events_tp[3]);
+}
+
+static bool hcall_event_begin(struct evsel *evsel,
+			      struct perf_sample *sample, struct event_key *key)
+{
+	if (evsel__name_is(evsel, __kvm_events_tp[2])) {
+		hcall_event_get_key(evsel, sample, key);
+		return true;
+	}
+
+	return false;
+}
+static void hcall_event_decode_key(struct perf_kvm_stat *kvm __maybe_unused,
+				   struct event_key *key,
+				   char *decode)
+{
+	const char *hcall_reason = get_hcall_exit_reason(key->key);
+
+	scnprintf(decode, KVM_EVENT_NAME_LEN, "%s", hcall_reason);
+}
+
+static const struct kvm_events_ops hcall_events = {
+	.is_begin_event = hcall_event_begin,
+	.is_end_event = hcall_event_end,
+	.decode_key = hcall_event_decode_key,
+	.name = "HCALL-EVENT",
+};
+
+static const struct kvm_events_ops exit_events = {
+	.is_begin_event = exit_event_begin,
+	.is_end_event = exit_event_end,
+	.decode_key = exit_event_decode_key,
+	.name = "VM-EXIT"
+};
+
+static const struct kvm_reg_events_ops __kvm_reg_events_ops[] = {
+	{ .name = "vmexit", .ops = &exit_events },
+	{ .name = "hcall", .ops = &hcall_events },
+	{ NULL, NULL },
+};
+
+static const char * const __kvm_skip_events[] = {
+	NULL,
+};
+
+
+static int is_tracepoint_available(const char *str, struct evlist *evlist)
+{
+	struct parse_events_error err;
+	int ret;
+
+	parse_events_error__init(&err);
+	ret = parse_events(evlist, str, &err);
+	if (ret)
+		parse_events_error__print(&err, "tracepoint");
+	parse_events_error__exit(&err);
+	return ret;
+}
+
+static int ppc__setup_book3s_hv(struct perf_kvm_stat *kvm,
+				struct evlist *evlist)
+{
+	const char * const *events_ptr;
+	int i, nr_tp = 0, err = -1;
+
+	/* Check for book3s_hv tracepoints */
+	for (events_ptr = ppc_book3s_hv_kvm_tp; *events_ptr; events_ptr++) {
+		err = is_tracepoint_available(*events_ptr, evlist);
+		if (err)
+			return -1;
+		nr_tp++;
+	}
+
+	for (i = 0; i < nr_tp; i++)
+		__kvm_events_tp[i] = ppc_book3s_hv_kvm_tp[i];
+
+	__kvm_events_tp[i] = NULL;
+	kvm->exit_reasons = hv_exit_reasons;
+	kvm->exit_reasons_isa = "HV";
+
+	return 0;
+}
+
+/* Wrapper to setup kvm tracepoints */
+static int ppc__setup_kvm_tp(struct perf_kvm_stat *kvm)
+{
+	struct evlist *evlist = evlist__new();
+
+	if (evlist == NULL)
+		return -ENOMEM;
+
+	/* Right now, only supported on book3s_hv */
+	return ppc__setup_book3s_hv(kvm, evlist);
+}
+
+int __setup_kvm_events_tp_powerpc(struct perf_kvm_stat *kvm)
+{
+	return ppc__setup_kvm_tp(kvm);
+}
+
+int __cpu_isa_init_powerpc(struct perf_kvm_stat *kvm)
+{
+	int ret;
+
+	ret = ppc__setup_kvm_tp(kvm);
+	if (ret) {
+		kvm->exit_reasons = NULL;
+		kvm->exit_reasons_isa = NULL;
+	}
+
+	return ret;
+}
+
+/*
+ * In case of powerpc architecture, pmu registers are programmable
+ * by guest kernel. So monitoring guest via host may not provide
+ * valid samples with default 'cycles' event. It is better to use
+ * 'trace_imc/trace_cycles' event for guest profiling, since it
+ * can track the guest instruction pointer in the trace-record.
+ *
+ * Function to parse the arguments and return appropriate values.
+ */
+int __kvm_add_default_arch_event_powerpc(int *argc, const char **argv)
+{
+	const char **tmp;
+	bool event = false;
+	int i, j = *argc;
+
+	const struct option event_options[] = {
+		OPT_BOOLEAN('e', "event", &event, NULL),
+		OPT_END()
+	};
+
+	tmp = calloc(j + 1, sizeof(char *));
+	if (!tmp)
+		return -EINVAL;
+
+	for (i = 0; i < j; i++)
+		tmp[i] = argv[i];
+
+	parse_options(j, tmp, event_options, NULL, PARSE_OPT_KEEP_UNKNOWN);
+	if (!event) {
+		if (perf_pmus__have_event("trace_imc", "trace_cycles")) {
+			argv[j++] = strdup("-e");
+			argv[j++] = strdup("trace_imc/trace_cycles/");
+			*argc += 2;
+		} else {
+			free(tmp);
+			return -EINVAL;
+		}
+	}
+
+	free(tmp);
+	return 0;
+}
+
+const char * const *__kvm_events_tp_powerpc(void)
+{
+	return __kvm_events_tp;
+}
+
+const struct kvm_reg_events_ops *__kvm_reg_events_ops_powerpc(void)
+{
+	return __kvm_reg_events_ops;
+}
+
+const char * const *__kvm_skip_events_powerpc(void)
+{
+	return __kvm_skip_events;
+}
diff --git a/tools/perf/util/kvm-stat-arch/kvm-stat-riscv.c b/tools/perf/util/kvm-stat-arch/kvm-stat-riscv.c
new file mode 100644
index 000000000000..b2c5d3220795
--- /dev/null
+++ b/tools/perf/util/kvm-stat-arch/kvm-stat-riscv.c
@@ -0,0 +1,90 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Arch specific functions for perf kvm stat.
+ *
+ * Copyright 2024 Beijing ESWIN Computing Technology Co., Ltd.
+ *
+ */
+#include <errno.h>
+#include <memory.h>
+#include "../evsel.h"
+#include "../kvm-stat.h"
+#include "riscv_trap_types.h"
+#include "debug.h"
+
+define_exit_reasons_table(riscv_exit_reasons, kvm_riscv_trap_class);
+
+static const char * const __kvm_events_tp[] = {
+	"kvm:kvm_entry",
+	"kvm:kvm_exit",
+	NULL,
+};
+
+static void event_get_key(struct evsel *evsel,
+			  struct perf_sample *sample,
+			  struct event_key *key)
+{
+	int xlen = 64; // TODO: 32-bit support.
+
+	key->info = 0;
+	key->key = evsel__intval(evsel, sample, kvm_exit_reason()) & ~CAUSE_IRQ_FLAG(xlen);
+	key->exit_reasons = riscv_exit_reasons;
+}
+
+static bool event_begin(struct evsel *evsel,
+			struct perf_sample *sample __maybe_unused,
+			struct event_key *key __maybe_unused)
+{
+	return evsel__name_is(evsel, kvm_entry_trace());
+}
+
+static bool event_end(struct evsel *evsel,
+		      struct perf_sample *sample,
+		      struct event_key *key)
+{
+	if (evsel__name_is(evsel, kvm_exit_trace())) {
+		event_get_key(evsel, sample, key);
+		return true;
+	}
+	return false;
+}
+
+static const struct kvm_events_ops exit_events = {
+	.is_begin_event = event_begin,
+	.is_end_event	= event_end,
+	.decode_key	= exit_event_decode_key,
+	.name		= "VM-EXIT"
+};
+
+static const struct kvm_reg_events_ops __kvm_reg_events_ops[] = {
+	{
+		.name	= "vmexit",
+		.ops	= &exit_events,
+	},
+	{ NULL, NULL },
+};
+
+static const char * const __kvm_skip_events[] = {
+	NULL,
+};
+
+int __cpu_isa_init_riscv(struct perf_kvm_stat *kvm)
+{
+	kvm->exit_reasons_isa = "riscv64";
+	return 0;
+}
+
+const char * const *__kvm_events_tp_riscv(void)
+{
+	return __kvm_events_tp;
+}
+
+const struct kvm_reg_events_ops *__kvm_reg_events_ops_riscv(void)
+{
+	return __kvm_reg_events_ops;
+}
+
+const char * const *__kvm_skip_events_riscv(void)
+{
+	return __kvm_skip_events;
+}
diff --git a/tools/perf/util/kvm-stat-arch/kvm-stat-s390.c b/tools/perf/util/kvm-stat-arch/kvm-stat-s390.c
new file mode 100644
index 000000000000..7e29169f5bb0
--- /dev/null
+++ b/tools/perf/util/kvm-stat-arch/kvm-stat-s390.c
@@ -0,0 +1,120 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Arch specific functions for perf kvm stat.
+ *
+ * Copyright 2014 IBM Corp.
+ * Author(s): Alexander Yarygin <yarygin@linux.vnet.ibm.com>
+ */
+
+#include <errno.h>
+#include <string.h>
+#include "../kvm-stat.h"
+#include "../evsel.h"
+#include "../../../arch/s390/include/uapi/asm/sie.h"
+
+define_exit_reasons_table(sie_exit_reasons, sie_intercept_code);
+define_exit_reasons_table(sie_icpt_insn_codes, icpt_insn_codes);
+define_exit_reasons_table(sie_sigp_order_codes, sigp_order_codes);
+define_exit_reasons_table(sie_diagnose_codes, diagnose_codes);
+define_exit_reasons_table(sie_icpt_prog_codes, icpt_prog_codes);
+
+static void event_icpt_insn_get_key(struct evsel *evsel,
+				    struct perf_sample *sample,
+				    struct event_key *key)
+{
+	u64 insn;
+
+	insn = evsel__intval(evsel, sample, "instruction");
+	key->key = icpt_insn_decoder(insn);
+	key->exit_reasons = sie_icpt_insn_codes;
+}
+
+static void event_sigp_get_key(struct evsel *evsel,
+			       struct perf_sample *sample,
+			       struct event_key *key)
+{
+	key->key = evsel__intval(evsel, sample, "order_code");
+	key->exit_reasons = sie_sigp_order_codes;
+}
+
+static void event_diag_get_key(struct evsel *evsel,
+			       struct perf_sample *sample,
+			       struct event_key *key)
+{
+	key->key = evsel__intval(evsel, sample, "code");
+	key->exit_reasons = sie_diagnose_codes;
+}
+
+static void event_icpt_prog_get_key(struct evsel *evsel,
+				    struct perf_sample *sample,
+				    struct event_key *key)
+{
+	key->key = evsel__intval(evsel, sample, "code");
+	key->exit_reasons = sie_icpt_prog_codes;
+}
+
+static const struct child_event_ops child_events[] = {
+	{ .name = "kvm:kvm_s390_intercept_instruction",
+	  .get_key = event_icpt_insn_get_key },
+	{ .name = "kvm:kvm_s390_handle_sigp",
+	  .get_key = event_sigp_get_key },
+	{ .name = "kvm:kvm_s390_handle_diag",
+	  .get_key = event_diag_get_key },
+	{ .name = "kvm:kvm_s390_intercept_prog",
+	  .get_key = event_icpt_prog_get_key },
+	{ NULL, NULL },
+};
+
+static const struct kvm_events_ops exit_events = {
+	.is_begin_event = exit_event_begin,
+	.is_end_event = exit_event_end,
+	.child_ops = child_events,
+	.decode_key = exit_event_decode_key,
+	.name = "VM-EXIT"
+};
+
+static const char * const __kvm_events_tp[] = {
+	"kvm:kvm_s390_sie_enter",
+	"kvm:kvm_s390_sie_exit",
+	"kvm:kvm_s390_intercept_instruction",
+	"kvm:kvm_s390_handle_sigp",
+	"kvm:kvm_s390_handle_diag",
+	"kvm:kvm_s390_intercept_prog",
+	NULL,
+};
+
+static const struct kvm_reg_events_ops __kvm_reg_events_ops[] = {
+	{ .name = "vmexit", .ops = &exit_events },
+	{ NULL, NULL },
+};
+
+static const char * const __kvm_skip_events[] = {
+	"Wait state",
+	NULL,
+};
+
+int __cpu_isa_init_s390(struct perf_kvm_stat *kvm, const char *cpuid)
+{
+	if (strstr(cpuid, "IBM")) {
+		kvm->exit_reasons = sie_exit_reasons;
+		kvm->exit_reasons_isa = "SIE";
+	} else
+		return -ENOTSUP;
+
+	return 0;
+}
+
+const char * const *__kvm_events_tp_s390(void)
+{
+	return __kvm_events_tp;
+}
+
+const struct kvm_reg_events_ops *__kvm_reg_events_ops_s390(void)
+{
+	return __kvm_reg_events_ops;
+}
+
+const char * const *__kvm_skip_events_s390(void)
+{
+	return __kvm_skip_events;
+}
diff --git a/tools/perf/util/kvm-stat-arch/kvm-stat-x86.c b/tools/perf/util/kvm-stat-arch/kvm-stat-x86.c
new file mode 100644
index 000000000000..1cf541385a4b
--- /dev/null
+++ b/tools/perf/util/kvm-stat-arch/kvm-stat-x86.c
@@ -0,0 +1,274 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <errno.h>
+#include <string.h>
+#include "../kvm-stat.h"
+#include "../evsel.h"
+#include "../env.h"
+#include <asm/svm.h>
+#include <asm/vmx.h>
+#include <asm/kvm.h>
+#include <subcmd/parse-options.h>
+
+define_exit_reasons_table(vmx_exit_reasons, VMX_EXIT_REASONS);
+define_exit_reasons_table(svm_exit_reasons, SVM_EXIT_REASONS);
+
+static const struct kvm_events_ops exit_events = {
+	.is_begin_event = exit_event_begin,
+	.is_end_event = exit_event_end,
+	.decode_key = exit_event_decode_key,
+	.name = "VM-EXIT"
+};
+
+/*
+ * For the mmio events, we treat:
+ * the time of MMIO write: kvm_mmio(KVM_TRACE_MMIO_WRITE...) -> kvm_entry
+ * the time of MMIO read: kvm_exit -> kvm_mmio(KVM_TRACE_MMIO_READ...).
+ */
+static void mmio_event_get_key(struct evsel *evsel, struct perf_sample *sample,
+			       struct event_key *key)
+{
+	key->key  = evsel__intval(evsel, sample, "gpa");
+	key->info = evsel__intval(evsel, sample, "type");
+}
+
+#define KVM_TRACE_MMIO_READ_UNSATISFIED 0
+#define KVM_TRACE_MMIO_READ 1
+#define KVM_TRACE_MMIO_WRITE 2
+
+static bool mmio_event_begin(struct evsel *evsel,
+			     struct perf_sample *sample, struct event_key *key)
+{
+	/* MMIO read begin event in kernel. */
+	if (kvm_exit_event(evsel))
+		return true;
+
+	/* MMIO write begin event in kernel. */
+	if (evsel__name_is(evsel, "kvm:kvm_mmio") &&
+	    evsel__intval(evsel, sample, "type") == KVM_TRACE_MMIO_WRITE) {
+		mmio_event_get_key(evsel, sample, key);
+		return true;
+	}
+
+	return false;
+}
+
+static bool mmio_event_end(struct evsel *evsel, struct perf_sample *sample,
+			   struct event_key *key)
+{
+	/* MMIO write end event in kernel. */
+	if (kvm_entry_event(evsel))
+		return true;
+
+	/* MMIO read end event in kernel.*/
+	if (evsel__name_is(evsel, "kvm:kvm_mmio") &&
+	    evsel__intval(evsel, sample, "type") == KVM_TRACE_MMIO_READ) {
+		mmio_event_get_key(evsel, sample, key);
+		return true;
+	}
+
+	return false;
+}
+
+static void mmio_event_decode_key(struct perf_kvm_stat *kvm __maybe_unused,
+				  struct event_key *key,
+				  char *decode)
+{
+	scnprintf(decode, KVM_EVENT_NAME_LEN, "%#lx:%s",
+		  (unsigned long)key->key,
+		  key->info == KVM_TRACE_MMIO_WRITE ? "W" : "R");
+}
+
+static const struct kvm_events_ops mmio_events = {
+	.is_begin_event = mmio_event_begin,
+	.is_end_event = mmio_event_end,
+	.decode_key = mmio_event_decode_key,
+	.name = "MMIO Access"
+};
+
+ /* The time of emulation pio access is from kvm_pio to kvm_entry. */
+static void ioport_event_get_key(struct evsel *evsel,
+				 struct perf_sample *sample,
+				 struct event_key *key)
+{
+	key->key  = evsel__intval(evsel, sample, "port");
+	key->info = evsel__intval(evsel, sample, "rw");
+}
+
+static bool ioport_event_begin(struct evsel *evsel,
+			       struct perf_sample *sample,
+			       struct event_key *key)
+{
+	if (evsel__name_is(evsel, "kvm:kvm_pio")) {
+		ioport_event_get_key(evsel, sample, key);
+		return true;
+	}
+
+	return false;
+}
+
+static bool ioport_event_end(struct evsel *evsel,
+			     struct perf_sample *sample __maybe_unused,
+			     struct event_key *key __maybe_unused)
+{
+	return kvm_entry_event(evsel);
+}
+
+static void ioport_event_decode_key(struct perf_kvm_stat *kvm __maybe_unused,
+				    struct event_key *key,
+				    char *decode)
+{
+	scnprintf(decode, KVM_EVENT_NAME_LEN, "%#llx:%s",
+		  (unsigned long long)key->key,
+		  key->info ? "POUT" : "PIN");
+}
+
+static const struct kvm_events_ops ioport_events = {
+	.is_begin_event = ioport_event_begin,
+	.is_end_event = ioport_event_end,
+	.decode_key = ioport_event_decode_key,
+	.name = "IO Port Access"
+};
+
+ /* The time of emulation msr is from kvm_msr to kvm_entry. */
+static void msr_event_get_key(struct evsel *evsel,
+				 struct perf_sample *sample,
+				 struct event_key *key)
+{
+	key->key  = evsel__intval(evsel, sample, "ecx");
+	key->info = evsel__intval(evsel, sample, "write");
+}
+
+static bool msr_event_begin(struct evsel *evsel,
+			       struct perf_sample *sample,
+			       struct event_key *key)
+{
+	if (evsel__name_is(evsel, "kvm:kvm_msr")) {
+		msr_event_get_key(evsel, sample, key);
+		return true;
+	}
+
+	return false;
+}
+
+static bool msr_event_end(struct evsel *evsel,
+			     struct perf_sample *sample __maybe_unused,
+			     struct event_key *key __maybe_unused)
+{
+	return kvm_entry_event(evsel);
+}
+
+static void msr_event_decode_key(struct perf_kvm_stat *kvm __maybe_unused,
+				    struct event_key *key,
+				    char *decode)
+{
+	scnprintf(decode, KVM_EVENT_NAME_LEN, "%#llx:%s",
+		  (unsigned long long)key->key,
+		  key->info ? "W" : "R");
+}
+
+static const struct kvm_events_ops msr_events = {
+	.is_begin_event = msr_event_begin,
+	.is_end_event = msr_event_end,
+	.decode_key = msr_event_decode_key,
+	.name = "MSR Access"
+};
+
+static const char * const __kvm_events_tp[] = {
+	"kvm:kvm_entry",
+	"kvm:kvm_exit",
+	"kvm:kvm_mmio",
+	"kvm:kvm_pio",
+	"kvm:kvm_msr",
+	NULL,
+};
+
+static const struct kvm_reg_events_ops __kvm_reg_events_ops[] = {
+	{ .name = "vmexit", .ops = &exit_events },
+	{ .name = "mmio", .ops = &mmio_events },
+	{ .name = "ioport", .ops = &ioport_events },
+	{ .name = "msr", .ops = &msr_events },
+	{ NULL, NULL },
+};
+
+static const char * const __kvm_skip_events[] = {
+	"HLT",
+	NULL,
+};
+
+int __cpu_isa_init_x86(struct perf_kvm_stat *kvm, const char *cpuid)
+{
+	if (strstr(cpuid, "Intel")) {
+		kvm->exit_reasons = vmx_exit_reasons;
+		kvm->exit_reasons_isa = "VMX";
+	} else if (strstr(cpuid, "AMD") || strstr(cpuid, "Hygon")) {
+		kvm->exit_reasons = svm_exit_reasons;
+		kvm->exit_reasons_isa = "SVM";
+	} else
+		return -ENOTSUP;
+
+	return 0;
+}
+
+/*
+ * After KVM supports PEBS for guest on Intel platforms
+ * (https://lore.kernel.org/all/20220411101946.20262-1-likexu@tencent.com/),
+ * host loses the capability to sample guest with PEBS since all PEBS related
+ * MSRs are switched to guest value after vm-entry, like IA32_DS_AREA MSR is
+ * switched to guest GVA at vm-entry. This would lead to "perf kvm record"
+ * fails to sample guest on Intel platforms since "cycles:P" event is used to
+ * sample guest by default.
+ *
+ * So, to avoid this issue explicitly use "cycles" instead of "cycles:P" event
+ * by default to sample guest on Intel platforms.
+ */
+int __kvm_add_default_arch_event_x86(int *argc, const char **argv)
+{
+	const char **tmp;
+	bool event = false;
+	int ret = 0, i, j = *argc;
+
+	const struct option event_options[] = {
+		OPT_BOOLEAN('e', "event", &event, NULL),
+		OPT_BOOLEAN(0, "pfm-events", &event, NULL),
+		OPT_END()
+	};
+
+	if (!x86__is_intel_cpu())
+		return 0;
+
+	tmp = calloc(j + 1, sizeof(char *));
+	if (!tmp)
+		return -ENOMEM;
+
+	for (i = 0; i < j; i++)
+		tmp[i] = argv[i];
+
+	parse_options(j, tmp, event_options, NULL, PARSE_OPT_KEEP_UNKNOWN);
+	if (!event) {
+		argv[j++] = STRDUP_FAIL_EXIT("-e");
+		argv[j++] = STRDUP_FAIL_EXIT("cycles");
+		*argc += 2;
+	}
+
+	free(tmp);
+	return 0;
+
+EXIT:
+	free(tmp);
+	return ret;
+}
+
+const char * const *__kvm_events_tp_x86(void)
+{
+	return __kvm_events_tp;
+}
+
+const struct kvm_reg_events_ops *__kvm_reg_events_ops_x86(void)
+{
+	return __kvm_reg_events_ops;
+}
+
+const char * const *__kvm_skip_events_x86(void)
+{
+	return __kvm_skip_events;
+}
diff --git a/tools/perf/util/kvm-stat-arch/riscv_trap_types.h b/tools/perf/util/kvm-stat-arch/riscv_trap_types.h
new file mode 100644
index 000000000000..aa5d24fab4ee
--- /dev/null
+++ b/tools/perf/util/kvm-stat-arch/riscv_trap_types.h
@@ -0,0 +1,57 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef ARCH_PERF_RISCV_TRAP_TYPES_H
+#define ARCH_PERF_RISCV_TRAP_TYPES_H
+
+/* Exception cause high bit - is an interrupt if set */
+#define CAUSE_IRQ_FLAG(xlen)		(_AC(1, UL) << (xlen - 1))
+
+/* Interrupt causes (minus the high bit) */
+#define IRQ_S_SOFT 1
+#define IRQ_VS_SOFT 2
+#define IRQ_M_SOFT 3
+#define IRQ_S_TIMER 5
+#define IRQ_VS_TIMER 6
+#define IRQ_M_TIMER 7
+#define IRQ_S_EXT 9
+#define IRQ_VS_EXT 10
+#define IRQ_M_EXT 11
+#define IRQ_S_GEXT 12
+#define IRQ_PMU_OVF 13
+
+/* Exception causes */
+#define EXC_INST_MISALIGNED 0
+#define EXC_INST_ACCESS 1
+#define EXC_INST_ILLEGAL 2
+#define EXC_BREAKPOINT 3
+#define EXC_LOAD_MISALIGNED 4
+#define EXC_LOAD_ACCESS 5
+#define EXC_STORE_MISALIGNED 6
+#define EXC_STORE_ACCESS 7
+#define EXC_SYSCALL 8
+#define EXC_HYPERVISOR_SYSCALL 9
+#define EXC_SUPERVISOR_SYSCALL 10
+#define EXC_INST_PAGE_FAULT 12
+#define EXC_LOAD_PAGE_FAULT 13
+#define EXC_STORE_PAGE_FAULT 15
+#define EXC_INST_GUEST_PAGE_FAULT 20
+#define EXC_LOAD_GUEST_PAGE_FAULT 21
+#define EXC_VIRTUAL_INST_FAULT 22
+#define EXC_STORE_GUEST_PAGE_FAULT 23
+
+#define TRAP(x) { x, #x }
+
+#define kvm_riscv_trap_class \
+	TRAP(IRQ_S_SOFT), TRAP(IRQ_VS_SOFT), TRAP(IRQ_M_SOFT), \
+	TRAP(IRQ_S_TIMER), TRAP(IRQ_VS_TIMER), TRAP(IRQ_M_TIMER), \
+	TRAP(IRQ_S_EXT), TRAP(IRQ_VS_EXT), TRAP(IRQ_M_EXT), \
+	TRAP(IRQ_S_GEXT), TRAP(IRQ_PMU_OVF), \
+	TRAP(EXC_INST_MISALIGNED), TRAP(EXC_INST_ACCESS), TRAP(EXC_INST_ILLEGAL), \
+	TRAP(EXC_BREAKPOINT), TRAP(EXC_LOAD_MISALIGNED), TRAP(EXC_LOAD_ACCESS), \
+	TRAP(EXC_STORE_MISALIGNED), TRAP(EXC_STORE_ACCESS), TRAP(EXC_SYSCALL), \
+	TRAP(EXC_HYPERVISOR_SYSCALL), TRAP(EXC_SUPERVISOR_SYSCALL), \
+	TRAP(EXC_INST_PAGE_FAULT), TRAP(EXC_LOAD_PAGE_FAULT), \
+	TRAP(EXC_STORE_PAGE_FAULT), TRAP(EXC_INST_GUEST_PAGE_FAULT), \
+	TRAP(EXC_LOAD_GUEST_PAGE_FAULT), TRAP(EXC_VIRTUAL_INST_FAULT), \
+	TRAP(EXC_STORE_GUEST_PAGE_FAULT)
+
+#endif /* ARCH_PERF_RISCV_TRAP_TYPES_H */
diff --git a/tools/perf/util/kvm-stat.c b/tools/perf/util/kvm-stat.c
index 38ace736db5c..b1affd97917b 100644
--- a/tools/perf/util/kvm-stat.c
+++ b/tools/perf/util/kvm-stat.c
@@ -2,12 +2,11 @@
 #include "debug.h"
 #include "evsel.h"
 #include "kvm-stat.h"
-
-#if defined(HAVE_KVM_STAT_SUPPORT) && defined(HAVE_LIBTRACEEVENT)
+#include <dwarf-regs.h>
 
 bool kvm_exit_event(struct evsel *evsel)
 {
-	return evsel__name_is(evsel, kvm_exit_trace);
+	return evsel__name_is(evsel, kvm_exit_trace());
 }
 
 void exit_event_get_key(struct evsel *evsel,
@@ -15,7 +14,7 @@ void exit_event_get_key(struct evsel *evsel,
 			struct event_key *key)
 {
 	key->info = 0;
-	key->key  = evsel__intval(evsel, sample, kvm_exit_reason);
+	key->key  = evsel__intval(evsel, sample, kvm_exit_reason());
 }
 
 
@@ -32,7 +31,7 @@ bool exit_event_begin(struct evsel *evsel,
 
 bool kvm_entry_event(struct evsel *evsel)
 {
-	return evsel__name_is(evsel, kvm_entry_trace);
+	return evsel__name_is(evsel, kvm_entry_trace());
 }
 
 bool exit_event_end(struct evsel *evsel,
@@ -67,4 +66,202 @@ void exit_event_decode_key(struct perf_kvm_stat *kvm,
 	scnprintf(decode, KVM_EVENT_NAME_LEN, "%s", exit_reason);
 }
 
-#endif
+int setup_kvm_events_tp(struct perf_kvm_stat *kvm)
+{
+	switch (EM_HOST) {
+	case EM_PPC:
+	case EM_PPC64:
+		return __setup_kvm_events_tp_powerpc(kvm);
+	default:
+		return 0;
+	}
+}
+
+int cpu_isa_init(struct perf_kvm_stat *kvm, const char *cpuid)
+{
+	switch (EM_HOST) {
+	case EM_AARCH64:
+		return __cpu_isa_init_arm64(kvm);
+	case EM_LOONGARCH:
+		return __cpu_isa_init_loongarch(kvm);
+	case EM_PPC:
+	case EM_PPC64:
+		return __cpu_isa_init_powerpc(kvm);
+	case EM_RISCV:
+		return __cpu_isa_init_riscv(kvm);
+	case EM_S390:
+		return __cpu_isa_init_s390(kvm, cpuid);
+	case EM_X86_64:
+	case EM_386:
+		return __cpu_isa_init_x86(kvm, cpuid);
+	default:
+		pr_err("Unsupported kvm-stat host %d\n", EM_HOST);
+		return -1;
+	}
+}
+
+const char *vcpu_id_str(void)
+{
+	switch (EM_HOST) {
+	case EM_AARCH64:
+	case EM_RISCV:
+	case EM_S390:
+		return "id";
+	case EM_LOONGARCH:
+	case EM_PPC:
+	case EM_PPC64:
+	case EM_X86_64:
+	case EM_386:
+		return "vcpu_id";
+	default:
+		pr_err("Unsupported kvm-stat host %d\n", EM_HOST);
+		return NULL;
+	}
+}
+
+const char *kvm_exit_reason(void)
+{
+	switch (EM_HOST) {
+	case EM_AARCH64:
+		return "ret";
+	case EM_LOONGARCH:
+		return "reason";
+	case EM_PPC:
+	case EM_PPC64:
+		return "trap";
+	case EM_RISCV:
+		return "scause";
+	case EM_S390:
+		return "icptcode";
+	case EM_X86_64:
+	case EM_386:
+		return "exit_reason";
+	default:
+		pr_err("Unsupported kvm-stat host %d\n", EM_HOST);
+		return NULL;
+	}
+}
+
+const char *kvm_entry_trace(void)
+{
+	switch (EM_HOST) {
+	case EM_AARCH64:
+	case EM_RISCV:
+	case EM_X86_64:
+	case EM_386:
+		return "kvm:kvm_entry";
+	case EM_LOONGARCH:
+		return "kvm:kvm_enter";
+	case EM_PPC:
+	case EM_PPC64:
+		return "kvm_hv:kvm_guest_enter";
+	case EM_S390:
+		return "kvm:kvm_s390_sie_enter";
+	default:
+		pr_err("Unsupported kvm-stat host %d\n", EM_HOST);
+		return NULL;
+	}
+}
+
+const char *kvm_exit_trace(void)
+{
+	switch (EM_HOST) {
+	case EM_AARCH64:
+	case EM_LOONGARCH:
+	case EM_RISCV:
+	case EM_X86_64:
+	case EM_386:
+		return "kvm:kvm_exit";
+	case EM_PPC:
+	case EM_PPC64:
+		return "kvm_hv:kvm_guest_exit";
+	case EM_S390:
+		return "kvm:kvm_s390_sie_exit";
+	default:
+		pr_err("Unsupported kvm-stat host %d\n", EM_HOST);
+		return NULL;
+	}
+}
+
+const char * const *kvm_events_tp(void)
+{
+	switch (EM_HOST) {
+	case EM_AARCH64:
+		return __kvm_events_tp_arm64();
+	case EM_LOONGARCH:
+		return __kvm_events_tp_loongarch();
+	case EM_PPC:
+	case EM_PPC64:
+		return __kvm_events_tp_powerpc();
+	case EM_RISCV:
+		return __kvm_events_tp_riscv();
+	case EM_S390:
+		return __kvm_events_tp_s390();
+	case EM_X86_64:
+	case EM_386:
+		return __kvm_events_tp_x86();
+	default:
+		pr_err("Unsupported kvm-stat host %d\n", EM_HOST);
+		return NULL;
+	}
+}
+
+const struct kvm_reg_events_ops *kvm_reg_events_ops(void)
+{
+	switch (EM_HOST) {
+	case EM_AARCH64:
+		return __kvm_reg_events_ops_arm64();
+	case EM_LOONGARCH:
+		return __kvm_reg_events_ops_loongarch();
+	case EM_PPC:
+	case EM_PPC64:
+		return __kvm_reg_events_ops_powerpc();
+	case EM_RISCV:
+		return __kvm_reg_events_ops_riscv();
+	case EM_S390:
+		return __kvm_reg_events_ops_s390();
+	case EM_X86_64:
+	case EM_386:
+		return __kvm_reg_events_ops_x86();
+	default:
+		pr_err("Unsupported kvm-stat host %d\n", EM_HOST);
+		return NULL;
+	}
+}
+
+const char * const *kvm_skip_events(void)
+{
+	switch (EM_HOST) {
+	case EM_AARCH64:
+		return __kvm_skip_events_arm64();
+	case EM_LOONGARCH:
+		return __kvm_skip_events_loongarch();
+	case EM_PPC:
+	case EM_PPC64:
+		return __kvm_skip_events_powerpc();
+	case EM_RISCV:
+		return __kvm_skip_events_riscv();
+	case EM_S390:
+		return __kvm_skip_events_s390();
+	case EM_X86_64:
+	case EM_386:
+		return __kvm_skip_events_x86();
+	default:
+		pr_err("Unsupported kvm-stat host %d\n", EM_HOST);
+		return NULL;
+	}
+}
+
+int kvm_add_default_arch_event(int *argc, const char **argv)
+{
+	switch (EM_HOST) {
+	case EM_PPC:
+	case EM_PPC64:
+		return __kvm_add_default_arch_event_powerpc(argc, argv);
+	case EM_X86_64:
+	case EM_386:
+		return __kvm_add_default_arch_event_x86(argc, argv);
+	default:
+		return 0;
+	}
+}
diff --git a/tools/perf/util/kvm-stat.h b/tools/perf/util/kvm-stat.h
index a356b839c2ee..759079b4294c 100644
--- a/tools/perf/util/kvm-stat.h
+++ b/tools/perf/util/kvm-stat.h
@@ -2,8 +2,6 @@
 #ifndef __PERF_KVM_STAT_H
 #define __PERF_KVM_STAT_H
 
-#ifdef HAVE_KVM_STAT_SUPPORT
-
 #include "tool.h"
 #include "sort.h"
 #include "stat.h"
@@ -67,7 +65,7 @@ struct kvm_events_ops {
 			       struct event_key *key);
 	bool (*is_end_event)(struct evsel *evsel,
 			     struct perf_sample *sample, struct event_key *key);
-	struct child_event_ops *child_ops;
+	const struct child_event_ops *child_ops;
 	void (*decode_key)(struct perf_kvm_stat *kvm, struct event_key *key,
 			   char *decode);
 	const char *name;
@@ -95,7 +93,7 @@ struct perf_kvm_stat {
 	struct exit_reasons_table *exit_reasons;
 	const char *exit_reasons_isa;
 
-	struct kvm_events_ops *events_ops;
+	const struct kvm_events_ops *events_ops;
 
 	u64 total_time;
 	u64 total_count;
@@ -113,10 +111,10 @@ struct perf_kvm_stat {
 
 struct kvm_reg_events_ops {
 	const char *name;
-	struct kvm_events_ops *ops;
+	const struct kvm_events_ops *ops;
 };
 
-#if defined(HAVE_KVM_STAT_SUPPORT) && defined(HAVE_LIBTRACEEVENT)
+#ifdef HAVE_LIBTRACEEVENT
 
 void exit_event_get_key(struct evsel *evsel,
 			struct perf_sample *sample,
@@ -130,11 +128,9 @@ bool exit_event_end(struct evsel *evsel,
 void exit_event_decode_key(struct perf_kvm_stat *kvm,
 			   struct event_key *key,
 			   char *decode);
-#endif
 
 bool kvm_exit_event(struct evsel *evsel);
 bool kvm_entry_event(struct evsel *evsel);
-int setup_kvm_events_tp(struct perf_kvm_stat *kvm);
 
 #define define_exit_reasons_table(name, symbols)	\
 	static struct exit_reasons_table name[] = {	\
@@ -144,15 +140,59 @@ int setup_kvm_events_tp(struct perf_kvm_stat *kvm);
 /*
  * arch specific callbacks and data structures
  */
+int setup_kvm_events_tp(struct perf_kvm_stat *kvm);
+int __setup_kvm_events_tp_powerpc(struct perf_kvm_stat *kvm);
+
 int cpu_isa_init(struct perf_kvm_stat *kvm, const char *cpuid);
+int __cpu_isa_init_arm64(struct perf_kvm_stat *kvm);
+int __cpu_isa_init_loongarch(struct perf_kvm_stat *kvm);
+int __cpu_isa_init_powerpc(struct perf_kvm_stat *kvm);
+int __cpu_isa_init_riscv(struct perf_kvm_stat *kvm);
+int __cpu_isa_init_s390(struct perf_kvm_stat *kvm, const char *cpuid);
+int __cpu_isa_init_x86(struct perf_kvm_stat *kvm, const char *cpuid);
+
+const char *vcpu_id_str(void);
+const char *kvm_exit_reason(void);
+const char *kvm_entry_trace(void);
+const char *kvm_exit_trace(void);
+
+const char * const *kvm_events_tp(void);
+const char * const *__kvm_events_tp_arm64(void);
+const char * const *__kvm_events_tp_loongarch(void);
+const char * const *__kvm_events_tp_powerpc(void);
+const char * const *__kvm_events_tp_riscv(void);
+const char * const *__kvm_events_tp_s390(void);
+const char * const *__kvm_events_tp_x86(void);
+
+const struct kvm_reg_events_ops *kvm_reg_events_ops(void);
+const struct kvm_reg_events_ops *__kvm_reg_events_ops_arm64(void);
+const struct kvm_reg_events_ops *__kvm_reg_events_ops_loongarch(void);
+const struct kvm_reg_events_ops *__kvm_reg_events_ops_powerpc(void);
+const struct kvm_reg_events_ops *__kvm_reg_events_ops_riscv(void);
+const struct kvm_reg_events_ops *__kvm_reg_events_ops_s390(void);
+const struct kvm_reg_events_ops *__kvm_reg_events_ops_x86(void);
+
+const char * const *kvm_skip_events(void);
+const char * const *__kvm_skip_events_arm64(void);
+const char * const *__kvm_skip_events_loongarch(void);
+const char * const *__kvm_skip_events_powerpc(void);
+const char * const *__kvm_skip_events_riscv(void);
+const char * const *__kvm_skip_events_s390(void);
+const char * const *__kvm_skip_events_x86(void);
+
+int kvm_add_default_arch_event(int *argc, const char **argv);
+int __kvm_add_default_arch_event_powerpc(int *argc, const char **argv);
+int __kvm_add_default_arch_event_x86(int *argc, const char **argv);
+
+#else /* !HAVE_LIBTRACEEVENT */
+
+static inline int kvm_add_default_arch_event(int *argc __maybe_unused,
+					     const char **argv __maybe_unused)
+{
+	return 0;
+}
 
-extern const char *kvm_events_tp[];
-extern struct kvm_reg_events_ops kvm_reg_events_ops[];
-extern const char * const kvm_skip_events[];
-extern const char *vcpu_id_str;
-extern const char *kvm_exit_reason;
-extern const char *kvm_entry_trace;
-extern const char *kvm_exit_trace;
+#endif /* HAVE_LIBTRACEEVENT */
 
 static inline struct kvm_info *kvm_info__get(struct kvm_info *ki)
 {
@@ -186,11 +226,6 @@ static inline struct kvm_info *kvm_info__new(void)
 	return ki;
 }
 
-#else /* HAVE_KVM_STAT_SUPPORT */
-// We use this unconditionally in hists__findnew_entry() and hist_entry__delete()
-#define kvm_info__zput(ki) do { } while (0)
-#endif /* HAVE_KVM_STAT_SUPPORT */
-
 #define STRDUP_FAIL_EXIT(s)		\
 	({	char *_p;		\
 		_p = strdup(s);		\
@@ -201,5 +236,4 @@ static inline struct kvm_info *kvm_info__new(void)
 		_p;			\
 	})
 
-extern int kvm_add_default_arch_event(int *argc, const char **argv);
 #endif /* __PERF_KVM_STAT_H */
-- 
cgit v1.2.3


From 43af548436775557b79aee32b30e4f020d51fbe6 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 3 Feb 2026 10:26:37 -0800
Subject: perf kvm: Wire up e_machine

Pass the e_machine to the kvm functions so that they aren't just wired
to EM_HOST.

In the case of a session move some setup until the session
is created.

As the session isn't fully running the default EM_HOST is returned as no
e_machine can be found in a running machine.

This is, however, some marginal progress to cross platform support.

Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Aditya Bodkhe <aditya.b1@linux.ibm.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Alexandre Ghiti <alex@ghiti.fr>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Andrew Jones <ajones@ventanamicro.com>
Cc: Anubhav Shelat <ashelat@redhat.com>
Cc: Anup Patel <anup@brainfault.org>
Cc: Athira Rajeev <atrajeev@linux.ibm.com>
Cc: Blake Jones <blakejones@google.com>
Cc: Chun-Tse Shao <ctshao@google.com>
Cc: Dapeng Mi <dapeng1.mi@linux.intel.com>
Cc: Dmitriy Vyukov <dvyukov@google.com>
Cc: Howard Chu <howardchu95@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Leo Yan <leo.yan@linux.dev>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <pjw@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Quan Zhou <zhouquan@iscas.ac.cn>
Cc: Shimin Guo <shimin.guo@skydio.com>
Cc: Swapnil Sapkal <swapnil.sapkal@amd.com>
Cc: Thomas Falcon <thomas.falcon@intel.com>
Cc: Will Deacon <will@kernel.org>
Cc: Yunseong Kim <ysk@kzalloc.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-kvm.c                           | 45 +++++++++-------
 tools/perf/util/evsel.c                            |  2 +-
 tools/perf/util/evsel.h                            |  1 +
 tools/perf/util/kvm-stat-arch/kvm-stat-arm64.c     |  6 +--
 tools/perf/util/kvm-stat-arch/kvm-stat-loongarch.c |  3 +-
 tools/perf/util/kvm-stat-arch/kvm-stat-riscv.c     |  6 +--
 tools/perf/util/kvm-stat.c                         | 62 +++++++++++-----------
 tools/perf/util/kvm-stat.h                         | 23 ++++----
 8 files changed, 80 insertions(+), 68 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/builtin-kvm.c b/tools/perf/builtin-kvm.c
index bd9bda32157f..93ba07c58290 100644
--- a/tools/perf/builtin-kvm.c
+++ b/tools/perf/builtin-kvm.c
@@ -2,6 +2,7 @@
 #include "builtin.h"
 #include "perf.h"
 
+#include <dwarf-regs.h>
 #include "util/build-id.h"
 #include "util/evsel.h"
 #include "util/evlist.h"
@@ -615,11 +616,11 @@ static const char *get_filename_for_perf_kvm(void)
 
 #if defined(HAVE_LIBTRACEEVENT)
 
-static bool register_kvm_events_ops(struct perf_kvm_stat *kvm)
+static bool register_kvm_events_ops(struct perf_kvm_stat *kvm, uint16_t e_machine)
 {
 	const struct kvm_reg_events_ops *events_ops;
 
-	for (events_ops = kvm_reg_events_ops(); events_ops->name; events_ops++) {
+	for (events_ops = kvm_reg_events_ops(e_machine); events_ops->name; events_ops++) {
 		if (!strcmp(events_ops->name, kvm->report_event)) {
 			kvm->events_ops = events_ops->ops;
 			return true;
@@ -841,11 +842,11 @@ static bool handle_child_event(struct perf_kvm_stat *kvm,
 	return true;
 }
 
-static bool skip_event(const char *event)
+static bool skip_event(uint16_t e_machine, const char *event)
 {
 	const char * const *skip_events;
 
-	for (skip_events = kvm_skip_events(); *skip_events; skip_events++)
+	for (skip_events = kvm_skip_events(e_machine); *skip_events; skip_events++)
 		if (!strcmp(event, *skip_events))
 			return true;
 
@@ -901,9 +902,10 @@ static bool handle_end_event(struct perf_kvm_stat *kvm,
 
 	if (kvm->duration && time_diff > kvm->duration) {
 		char decode[KVM_EVENT_NAME_LEN];
+		uint16_t e_machine = perf_session__e_machine(kvm->session);
 
 		kvm->events_ops->decode_key(kvm, &event->key, decode);
-		if (!skip_event(decode)) {
+		if (!skip_event(e_machine, decode)) {
 			pr_info("%" PRIu64 " VM %d, vcpu %d: %s event took %" PRIu64 "usec\n",
 				 sample->time, sample->pid, vcpu_record->vcpu_id,
 				 decode, time_diff / NSEC_PER_USEC);
@@ -921,6 +923,8 @@ struct vcpu_event_record *per_vcpu_record(struct thread *thread,
 	/* Only kvm_entry records vcpu id. */
 	if (!thread__priv(thread) && kvm_entry_event(evsel)) {
 		struct vcpu_event_record *vcpu_record;
+		struct machine *machine = maps__machine(thread__maps(thread));
+		uint16_t e_machine = thread__e_machine(thread, machine, /*e_flags=*/NULL);
 
 		vcpu_record = zalloc(sizeof(*vcpu_record));
 		if (!vcpu_record) {
@@ -928,7 +932,7 @@ struct vcpu_event_record *per_vcpu_record(struct thread *thread,
 			return NULL;
 		}
 
-		vcpu_record->vcpu_id = evsel__intval(evsel, sample, vcpu_id_str());
+		vcpu_record->vcpu_id = evsel__intval(evsel, sample, vcpu_id_str(e_machine));
 		thread__set_priv(thread, vcpu_record);
 	}
 
@@ -1163,6 +1167,7 @@ static int cpu_isa_config(struct perf_kvm_stat *kvm)
 {
 	char buf[128], *cpuid;
 	int err;
+	uint16_t e_machine;
 
 	if (kvm->live) {
 		struct perf_cpu cpu = {-1};
@@ -1182,7 +1187,8 @@ static int cpu_isa_config(struct perf_kvm_stat *kvm)
 		return -EINVAL;
 	}
 
-	err = cpu_isa_init(kvm, cpuid);
+	e_machine = perf_session__e_machine(kvm->session);
+	err = cpu_isa_init(kvm, e_machine, cpuid);
 	if (err == -ENOTSUP)
 		pr_err("CPU %s is not supported.\n", cpuid);
 
@@ -1413,7 +1419,7 @@ static int kvm_events_live_report(struct perf_kvm_stat *kvm)
 
 	if (!verify_vcpu(kvm->trace_vcpu) ||
 	    !is_valid_key(kvm) ||
-	    !register_kvm_events_ops(kvm)) {
+		!register_kvm_events_ops(kvm, EM_HOST)) {
 		goto out;
 	}
 
@@ -1568,6 +1574,11 @@ static int read_events(struct perf_kvm_stat *kvm)
 		goto out_delete;
 	}
 
+	if (!register_kvm_events_ops(kvm, perf_session__e_machine(kvm->session))) {
+		ret = -EINVAL;
+		goto out_delete;
+	}
+
 	/*
 	 * Do not use 'isa' recorded in kvm_exit tracepoint since it is not
 	 * traced in the old kernel.
@@ -1610,9 +1621,6 @@ static int kvm_events_report_vcpu(struct perf_kvm_stat *kvm)
 	if (!is_valid_key(kvm))
 		goto exit;
 
-	if (!register_kvm_events_ops(kvm))
-		goto exit;
-
 	if (kvm->use_stdio) {
 		use_browser = 0;
 		setup_pager();
@@ -1653,15 +1661,16 @@ kvm_events_record(struct perf_kvm_stat *kvm, int argc, const char **argv)
 	};
 	const char * const *events_tp;
 	int ret;
+	uint16_t e_machine = EM_HOST;
 
 	events_tp_size = 0;
-	ret = setup_kvm_events_tp(kvm);
+	ret = setup_kvm_events_tp(kvm, e_machine);
 	if (ret < 0) {
 		pr_err("Unable to setup the kvm tracepoints\n");
 		return ret;
 	}
 
-	for (events_tp = kvm_events_tp(); *events_tp; events_tp++)
+	for (events_tp = kvm_events_tp(e_machine); *events_tp; events_tp++)
 		events_tp_size++;
 
 	rec_argc = ARRAY_SIZE(record_args) + argc + 2 +
@@ -1676,7 +1685,7 @@ kvm_events_record(struct perf_kvm_stat *kvm, int argc, const char **argv)
 
 	for (j = 0; j < events_tp_size; j++) {
 		rec_argv[i++] = STRDUP_FAIL_EXIT("-e");
-		rec_argv[i++] = STRDUP_FAIL_EXIT(kvm_events_tp()[j]);
+		rec_argv[i++] = STRDUP_FAIL_EXIT(kvm_events_tp(e_machine)[j]);
 	}
 
 	rec_argv[i++] = STRDUP_FAIL_EXIT("-o");
@@ -1770,7 +1779,7 @@ static struct evlist *kvm_live_event_list(void)
 	if (evlist == NULL)
 		return NULL;
 
-	for (events_tp = kvm_events_tp(); *events_tp; events_tp++) {
+	for (events_tp = kvm_events_tp(EM_HOST); *events_tp; events_tp++) {
 
 		tp = strdup(*events_tp);
 		if (tp == NULL)
@@ -1895,7 +1904,7 @@ static int kvm_events_live(struct perf_kvm_stat *kvm,
 	/*
 	 * generate the event list
 	 */
-	err = setup_kvm_events_tp(kvm);
+	err = setup_kvm_events_tp(kvm, EM_HOST);
 	if (err < 0) {
 		pr_err("Unable to setup the kvm tracepoints\n");
 		return err;
@@ -2005,7 +2014,7 @@ static int __cmd_record(const char *file_name, int argc, const char **argv)
 
 	BUG_ON(i + 2 != rec_argc);
 
-	ret = kvm_add_default_arch_event(&i, rec_argv);
+	ret = kvm_add_default_arch_event(EM_HOST, &i, rec_argv);
 	if (ret)
 		goto EXIT;
 
@@ -2092,7 +2101,7 @@ static int __cmd_top(int argc, const char **argv)
 
 	BUG_ON(i != argc);
 
-	ret = kvm_add_default_arch_event(&i, rec_argv);
+	ret = kvm_add_default_arch_event(EM_HOST, &i, rec_argv);
 	if (ret)
 		goto EXIT;
 
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index 5ac1a05601b1..848d0faf6698 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -1008,7 +1008,7 @@ int evsel__group_desc(struct evsel *evsel, char *buf, size_t size)
 	return ret;
 }
 
-static uint16_t evsel__e_machine(struct evsel *evsel)
+uint16_t evsel__e_machine(struct evsel *evsel)
 {
 	struct perf_session *session = evsel__session(evsel);
 
diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
index 95c4bd0f0f2e..eefb5d569971 100644
--- a/tools/perf/util/evsel.h
+++ b/tools/perf/util/evsel.h
@@ -546,6 +546,7 @@ static inline bool evsel__is_dummy_event(struct evsel *evsel)
 
 struct perf_session *evsel__session(struct evsel *evsel);
 struct perf_env *evsel__env(struct evsel *evsel);
+uint16_t evsel__e_machine(struct evsel *evsel);
 
 int evsel__store_ids(struct evsel *evsel, struct evlist *evlist);
 
diff --git a/tools/perf/util/kvm-stat-arch/kvm-stat-arm64.c b/tools/perf/util/kvm-stat-arch/kvm-stat-arm64.c
index 8003ff415b1a..c640dcd8af7c 100644
--- a/tools/perf/util/kvm-stat-arch/kvm-stat-arm64.c
+++ b/tools/perf/util/kvm-stat-arch/kvm-stat-arm64.c
@@ -22,7 +22,7 @@ static void event_get_key(struct evsel *evsel,
 			  struct event_key *key)
 {
 	key->info = 0;
-	key->key = evsel__intval(evsel, sample, kvm_exit_reason());
+	key->key = evsel__intval(evsel, sample, kvm_exit_reason(EM_AARCH64));
 	key->exit_reasons = arm64_exit_reasons;
 
 	/*
@@ -40,14 +40,14 @@ static bool event_begin(struct evsel *evsel,
 			struct perf_sample *sample __maybe_unused,
 			struct event_key *key __maybe_unused)
 {
-	return evsel__name_is(evsel, kvm_entry_trace());
+	return evsel__name_is(evsel, kvm_entry_trace(EM_AARCH64));
 }
 
 static bool event_end(struct evsel *evsel,
 		      struct perf_sample *sample,
 		      struct event_key *key)
 {
-	if (evsel__name_is(evsel, kvm_exit_trace())) {
+	if (evsel__name_is(evsel, kvm_exit_trace(EM_AARCH64))) {
 		event_get_key(evsel, sample, key);
 		return true;
 	}
diff --git a/tools/perf/util/kvm-stat-arch/kvm-stat-loongarch.c b/tools/perf/util/kvm-stat-arch/kvm-stat-loongarch.c
index a15ce072ac34..b802e516b138 100644
--- a/tools/perf/util/kvm-stat-arch/kvm-stat-loongarch.c
+++ b/tools/perf/util/kvm-stat-arch/kvm-stat-loongarch.c
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 #include <errno.h>
 #include <memory.h>
+#include <dwarf-regs.h>
 #include "../kvm-stat.h"
 #include "../parse-events.h"
 #include "../debug.h"
@@ -70,7 +71,7 @@ static bool event_end(struct evsel *evsel,
 	 *   kvm:kvm_enter   means returning to vmm and then to guest
 	 *   kvm:kvm_reenter means returning to guest immediately
 	 */
-	return evsel__name_is(evsel, kvm_entry_trace()) ||
+	return evsel__name_is(evsel, kvm_entry_trace(EM_LOONGARCH)) ||
 	       evsel__name_is(evsel, kvm_reenter_trace);
 }
 
diff --git a/tools/perf/util/kvm-stat-arch/kvm-stat-riscv.c b/tools/perf/util/kvm-stat-arch/kvm-stat-riscv.c
index b2c5d3220795..8d4d5d6ce720 100644
--- a/tools/perf/util/kvm-stat-arch/kvm-stat-riscv.c
+++ b/tools/perf/util/kvm-stat-arch/kvm-stat-riscv.c
@@ -27,7 +27,7 @@ static void event_get_key(struct evsel *evsel,
 	int xlen = 64; // TODO: 32-bit support.
 
 	key->info = 0;
-	key->key = evsel__intval(evsel, sample, kvm_exit_reason()) & ~CAUSE_IRQ_FLAG(xlen);
+	key->key = evsel__intval(evsel, sample, kvm_exit_reason(EM_RISCV)) & ~CAUSE_IRQ_FLAG(xlen);
 	key->exit_reasons = riscv_exit_reasons;
 }
 
@@ -35,14 +35,14 @@ static bool event_begin(struct evsel *evsel,
 			struct perf_sample *sample __maybe_unused,
 			struct event_key *key __maybe_unused)
 {
-	return evsel__name_is(evsel, kvm_entry_trace());
+	return evsel__name_is(evsel, kvm_entry_trace(EM_RISCV));
 }
 
 static bool event_end(struct evsel *evsel,
 		      struct perf_sample *sample,
 		      struct event_key *key)
 {
-	if (evsel__name_is(evsel, kvm_exit_trace())) {
+	if (evsel__name_is(evsel, kvm_exit_trace(EM_RISCV))) {
 		event_get_key(evsel, sample, key);
 		return true;
 	}
diff --git a/tools/perf/util/kvm-stat.c b/tools/perf/util/kvm-stat.c
index b1affd97917b..858b5dbd39f6 100644
--- a/tools/perf/util/kvm-stat.c
+++ b/tools/perf/util/kvm-stat.c
@@ -6,7 +6,7 @@
 
 bool kvm_exit_event(struct evsel *evsel)
 {
-	return evsel__name_is(evsel, kvm_exit_trace());
+	return evsel__name_is(evsel, kvm_exit_trace(evsel__e_machine(evsel)));
 }
 
 void exit_event_get_key(struct evsel *evsel,
@@ -14,7 +14,7 @@ void exit_event_get_key(struct evsel *evsel,
 			struct event_key *key)
 {
 	key->info = 0;
-	key->key  = evsel__intval(evsel, sample, kvm_exit_reason());
+	key->key  = evsel__intval(evsel, sample, kvm_exit_reason(evsel__e_machine(evsel)));
 }
 
 
@@ -31,7 +31,7 @@ bool exit_event_begin(struct evsel *evsel,
 
 bool kvm_entry_event(struct evsel *evsel)
 {
-	return evsel__name_is(evsel, kvm_entry_trace());
+	return evsel__name_is(evsel, kvm_entry_trace(evsel__e_machine(evsel)));
 }
 
 bool exit_event_end(struct evsel *evsel,
@@ -66,9 +66,9 @@ void exit_event_decode_key(struct perf_kvm_stat *kvm,
 	scnprintf(decode, KVM_EVENT_NAME_LEN, "%s", exit_reason);
 }
 
-int setup_kvm_events_tp(struct perf_kvm_stat *kvm)
+int setup_kvm_events_tp(struct perf_kvm_stat *kvm, uint16_t e_machine)
 {
-	switch (EM_HOST) {
+	switch (e_machine) {
 	case EM_PPC:
 	case EM_PPC64:
 		return __setup_kvm_events_tp_powerpc(kvm);
@@ -77,9 +77,9 @@ int setup_kvm_events_tp(struct perf_kvm_stat *kvm)
 	}
 }
 
-int cpu_isa_init(struct perf_kvm_stat *kvm, const char *cpuid)
+int cpu_isa_init(struct perf_kvm_stat *kvm, uint16_t e_machine, const char *cpuid)
 {
-	switch (EM_HOST) {
+	switch (e_machine) {
 	case EM_AARCH64:
 		return __cpu_isa_init_arm64(kvm);
 	case EM_LOONGARCH:
@@ -95,14 +95,14 @@ int cpu_isa_init(struct perf_kvm_stat *kvm, const char *cpuid)
 	case EM_386:
 		return __cpu_isa_init_x86(kvm, cpuid);
 	default:
-		pr_err("Unsupported kvm-stat host %d\n", EM_HOST);
+		pr_err("Unsupported kvm-stat host %d\n", e_machine);
 		return -1;
 	}
 }
 
-const char *vcpu_id_str(void)
+const char *vcpu_id_str(uint16_t e_machine)
 {
-	switch (EM_HOST) {
+	switch (e_machine) {
 	case EM_AARCH64:
 	case EM_RISCV:
 	case EM_S390:
@@ -114,14 +114,14 @@ const char *vcpu_id_str(void)
 	case EM_386:
 		return "vcpu_id";
 	default:
-		pr_err("Unsupported kvm-stat host %d\n", EM_HOST);
+		pr_err("Unsupported kvm-stat host %d\n", e_machine);
 		return NULL;
 	}
 }
 
-const char *kvm_exit_reason(void)
+const char *kvm_exit_reason(uint16_t e_machine)
 {
-	switch (EM_HOST) {
+	switch (e_machine) {
 	case EM_AARCH64:
 		return "ret";
 	case EM_LOONGARCH:
@@ -137,14 +137,14 @@ const char *kvm_exit_reason(void)
 	case EM_386:
 		return "exit_reason";
 	default:
-		pr_err("Unsupported kvm-stat host %d\n", EM_HOST);
+		pr_err("Unsupported kvm-stat host %d\n", e_machine);
 		return NULL;
 	}
 }
 
-const char *kvm_entry_trace(void)
+const char *kvm_entry_trace(uint16_t e_machine)
 {
-	switch (EM_HOST) {
+	switch (e_machine) {
 	case EM_AARCH64:
 	case EM_RISCV:
 	case EM_X86_64:
@@ -158,14 +158,14 @@ const char *kvm_entry_trace(void)
 	case EM_S390:
 		return "kvm:kvm_s390_sie_enter";
 	default:
-		pr_err("Unsupported kvm-stat host %d\n", EM_HOST);
+		pr_err("Unsupported kvm-stat host %d\n", e_machine);
 		return NULL;
 	}
 }
 
-const char *kvm_exit_trace(void)
+const char *kvm_exit_trace(uint16_t e_machine)
 {
-	switch (EM_HOST) {
+	switch (e_machine) {
 	case EM_AARCH64:
 	case EM_LOONGARCH:
 	case EM_RISCV:
@@ -178,14 +178,14 @@ const char *kvm_exit_trace(void)
 	case EM_S390:
 		return "kvm:kvm_s390_sie_exit";
 	default:
-		pr_err("Unsupported kvm-stat host %d\n", EM_HOST);
+		pr_err("Unsupported kvm-stat host %d\n", e_machine);
 		return NULL;
 	}
 }
 
-const char * const *kvm_events_tp(void)
+const char * const *kvm_events_tp(uint16_t e_machine)
 {
-	switch (EM_HOST) {
+	switch (e_machine) {
 	case EM_AARCH64:
 		return __kvm_events_tp_arm64();
 	case EM_LOONGARCH:
@@ -201,14 +201,14 @@ const char * const *kvm_events_tp(void)
 	case EM_386:
 		return __kvm_events_tp_x86();
 	default:
-		pr_err("Unsupported kvm-stat host %d\n", EM_HOST);
+		pr_err("Unsupported kvm-stat host %d\n", e_machine);
 		return NULL;
 	}
 }
 
-const struct kvm_reg_events_ops *kvm_reg_events_ops(void)
+const struct kvm_reg_events_ops *kvm_reg_events_ops(uint16_t e_machine)
 {
-	switch (EM_HOST) {
+	switch (e_machine) {
 	case EM_AARCH64:
 		return __kvm_reg_events_ops_arm64();
 	case EM_LOONGARCH:
@@ -224,14 +224,14 @@ const struct kvm_reg_events_ops *kvm_reg_events_ops(void)
 	case EM_386:
 		return __kvm_reg_events_ops_x86();
 	default:
-		pr_err("Unsupported kvm-stat host %d\n", EM_HOST);
+		pr_err("Unsupported kvm-stat host %d\n", e_machine);
 		return NULL;
 	}
 }
 
-const char * const *kvm_skip_events(void)
+const char * const *kvm_skip_events(uint16_t e_machine)
 {
-	switch (EM_HOST) {
+	switch (e_machine) {
 	case EM_AARCH64:
 		return __kvm_skip_events_arm64();
 	case EM_LOONGARCH:
@@ -247,14 +247,14 @@ const char * const *kvm_skip_events(void)
 	case EM_386:
 		return __kvm_skip_events_x86();
 	default:
-		pr_err("Unsupported kvm-stat host %d\n", EM_HOST);
+		pr_err("Unsupported kvm-stat host %d\n", e_machine);
 		return NULL;
 	}
 }
 
-int kvm_add_default_arch_event(int *argc, const char **argv)
+int kvm_add_default_arch_event(uint16_t e_machine, int *argc, const char **argv)
 {
-	switch (EM_HOST) {
+	switch (e_machine) {
 	case EM_PPC:
 	case EM_PPC64:
 		return __kvm_add_default_arch_event_powerpc(argc, argv);
diff --git a/tools/perf/util/kvm-stat.h b/tools/perf/util/kvm-stat.h
index 759079b4294c..4a998aaece5d 100644
--- a/tools/perf/util/kvm-stat.h
+++ b/tools/perf/util/kvm-stat.h
@@ -140,10 +140,10 @@ bool kvm_entry_event(struct evsel *evsel);
 /*
  * arch specific callbacks and data structures
  */
-int setup_kvm_events_tp(struct perf_kvm_stat *kvm);
+int setup_kvm_events_tp(struct perf_kvm_stat *kvm, uint16_t e_machine);
 int __setup_kvm_events_tp_powerpc(struct perf_kvm_stat *kvm);
 
-int cpu_isa_init(struct perf_kvm_stat *kvm, const char *cpuid);
+int cpu_isa_init(struct perf_kvm_stat *kvm, uint16_t e_machine, const char *cpuid);
 int __cpu_isa_init_arm64(struct perf_kvm_stat *kvm);
 int __cpu_isa_init_loongarch(struct perf_kvm_stat *kvm);
 int __cpu_isa_init_powerpc(struct perf_kvm_stat *kvm);
@@ -151,12 +151,12 @@ int __cpu_isa_init_riscv(struct perf_kvm_stat *kvm);
 int __cpu_isa_init_s390(struct perf_kvm_stat *kvm, const char *cpuid);
 int __cpu_isa_init_x86(struct perf_kvm_stat *kvm, const char *cpuid);
 
-const char *vcpu_id_str(void);
-const char *kvm_exit_reason(void);
-const char *kvm_entry_trace(void);
-const char *kvm_exit_trace(void);
+const char *vcpu_id_str(uint16_t e_machine);
+const char *kvm_exit_reason(uint16_t e_machine);
+const char *kvm_entry_trace(uint16_t e_machine);
+const char *kvm_exit_trace(uint16_t e_machine);
 
-const char * const *kvm_events_tp(void);
+const char * const *kvm_events_tp(uint16_t e_machine);
 const char * const *__kvm_events_tp_arm64(void);
 const char * const *__kvm_events_tp_loongarch(void);
 const char * const *__kvm_events_tp_powerpc(void);
@@ -164,7 +164,7 @@ const char * const *__kvm_events_tp_riscv(void);
 const char * const *__kvm_events_tp_s390(void);
 const char * const *__kvm_events_tp_x86(void);
 
-const struct kvm_reg_events_ops *kvm_reg_events_ops(void);
+const struct kvm_reg_events_ops *kvm_reg_events_ops(uint16_t e_machine);
 const struct kvm_reg_events_ops *__kvm_reg_events_ops_arm64(void);
 const struct kvm_reg_events_ops *__kvm_reg_events_ops_loongarch(void);
 const struct kvm_reg_events_ops *__kvm_reg_events_ops_powerpc(void);
@@ -172,7 +172,7 @@ const struct kvm_reg_events_ops *__kvm_reg_events_ops_riscv(void);
 const struct kvm_reg_events_ops *__kvm_reg_events_ops_s390(void);
 const struct kvm_reg_events_ops *__kvm_reg_events_ops_x86(void);
 
-const char * const *kvm_skip_events(void);
+const char * const *kvm_skip_events(uint16_t e_machine);
 const char * const *__kvm_skip_events_arm64(void);
 const char * const *__kvm_skip_events_loongarch(void);
 const char * const *__kvm_skip_events_powerpc(void);
@@ -180,13 +180,14 @@ const char * const *__kvm_skip_events_riscv(void);
 const char * const *__kvm_skip_events_s390(void);
 const char * const *__kvm_skip_events_x86(void);
 
-int kvm_add_default_arch_event(int *argc, const char **argv);
+int kvm_add_default_arch_event(uint16_t e_machine, int *argc, const char **argv);
 int __kvm_add_default_arch_event_powerpc(int *argc, const char **argv);
 int __kvm_add_default_arch_event_x86(int *argc, const char **argv);
 
 #else /* !HAVE_LIBTRACEEVENT */
 
-static inline int kvm_add_default_arch_event(int *argc __maybe_unused,
+static inline int kvm_add_default_arch_event(uint16_t e_machine __maybe_unused,
+					     int *argc __maybe_unused,
 					     const char **argv __maybe_unused)
 {
 	return 0;
-- 
cgit v1.2.3


From 07ad6f31b6745caab701ebd5d914217cd10f5b7a Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 3 Feb 2026 10:26:38 -0800
Subject: perf session: Add e_flags to the e_machine helper

Allow e_flags as well as e_machine to be computed using the e_machine
helper.

This isn't currently used, the argument is always NULL, but it will be
used for a new header feature.

Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Aditya Bodkhe <aditya.b1@linux.ibm.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Alexandre Ghiti <alex@ghiti.fr>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Andrew Jones <ajones@ventanamicro.com>
Cc: Anubhav Shelat <ashelat@redhat.com>
Cc: Anup Patel <anup@brainfault.org>
Cc: Athira Rajeev <atrajeev@linux.ibm.com>
Cc: Blake Jones <blakejones@google.com>
Cc: Chun-Tse Shao <ctshao@google.com>
Cc: Dapeng Mi <dapeng1.mi@linux.intel.com>
Cc: Dmitriy Vyukov <dvyukov@google.com>
Cc: Howard Chu <howardchu95@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Leo Yan <leo.yan@linux.dev>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <pjw@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Quan Zhou <zhouquan@iscas.ac.cn>
Cc: Shimin Guo <shimin.guo@skydio.com>
Cc: Swapnil Sapkal <swapnil.sapkal@amd.com>
Cc: Thomas Falcon <thomas.falcon@intel.com>
Cc: Will Deacon <will@kernel.org>
Cc: Yunseong Kim <ysk@kzalloc.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-kvm.c    |  9 +++++----
 tools/perf/builtin-report.c |  4 ++--
 tools/perf/builtin-script.c |  6 ++++--
 tools/perf/util/evsel.c     |  6 +++---
 tools/perf/util/evsel.h     |  2 +-
 tools/perf/util/kvm-stat.c  | 12 +++++++++---
 tools/perf/util/session.c   | 40 ++++++++++++++++++++++++++++++----------
 tools/perf/util/session.h   |  2 +-
 8 files changed, 55 insertions(+), 26 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/builtin-kvm.c b/tools/perf/builtin-kvm.c
index 93ba07c58290..0c5e6b3aac74 100644
--- a/tools/perf/builtin-kvm.c
+++ b/tools/perf/builtin-kvm.c
@@ -902,7 +902,7 @@ static bool handle_end_event(struct perf_kvm_stat *kvm,
 
 	if (kvm->duration && time_diff > kvm->duration) {
 		char decode[KVM_EVENT_NAME_LEN];
-		uint16_t e_machine = perf_session__e_machine(kvm->session);
+		uint16_t e_machine = perf_session__e_machine(kvm->session, /*e_flags=*/NULL);
 
 		kvm->events_ops->decode_key(kvm, &event->key, decode);
 		if (!skip_event(e_machine, decode)) {
@@ -1187,7 +1187,7 @@ static int cpu_isa_config(struct perf_kvm_stat *kvm)
 		return -EINVAL;
 	}
 
-	e_machine = perf_session__e_machine(kvm->session);
+	e_machine = perf_session__e_machine(kvm->session, /*e_flags=*/NULL);
 	err = cpu_isa_init(kvm, e_machine, cpuid);
 	if (err == -ENOTSUP)
 		pr_err("CPU %s is not supported.\n", cpuid);
@@ -1549,7 +1549,7 @@ out:
 static int read_events(struct perf_kvm_stat *kvm)
 {
 	int ret;
-
+	uint16_t e_machine;
 	struct perf_data file = {
 		.path  = kvm->file_name,
 		.mode  = PERF_DATA_MODE_READ,
@@ -1574,7 +1574,8 @@ static int read_events(struct perf_kvm_stat *kvm)
 		goto out_delete;
 	}
 
-	if (!register_kvm_events_ops(kvm, perf_session__e_machine(kvm->session))) {
+	e_machine = perf_session__e_machine(kvm->session, /*e_flags=*/NULL);
+	if (!register_kvm_events_ops(kvm, e_machine)) {
 		ret = -EINVAL;
 		goto out_delete;
 	}
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 810ffd66b11c..3b81f4b3dc49 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -448,7 +448,7 @@ static int report__setup_sample_type(struct report *rep)
 		}
 	}
 
-	callchain_param_setup(sample_type, perf_session__e_machine(session));
+	callchain_param_setup(sample_type, perf_session__e_machine(session, /*e_flags=*/NULL));
 
 	if (rep->stitch_lbr && (callchain_param.record_mode != CALLCHAIN_LBR)) {
 		ui__warning("Can't find LBR callchain. Switch off --stitch-lbr.\n"
@@ -1296,7 +1296,7 @@ static int process_attr(const struct perf_tool *tool __maybe_unused,
 	 */
 	sample_type = evlist__combined_sample_type(*pevlist);
 	session = (*pevlist)->session;
-	callchain_param_setup(sample_type, perf_session__e_machine(session));
+	callchain_param_setup(sample_type, perf_session__e_machine(session, /*e_flags=*/NULL));
 	return 0;
 }
 
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index c7d5a325b5cb..14c6f6c3c4f2 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -2818,6 +2818,7 @@ static int process_attr(const struct perf_tool *tool, union perf_event *event,
 	struct perf_script *scr = container_of(tool, struct perf_script, tool);
 	struct evlist *evlist;
 	struct evsel *evsel, *pos;
+	uint16_t e_machine;
 	u64 sample_type;
 	int err;
 
@@ -2859,7 +2860,8 @@ static int process_attr(const struct perf_tool *tool, union perf_event *event,
 	 * on events sample_type.
 	 */
 	sample_type = evlist__combined_sample_type(evlist);
-	callchain_param_setup(sample_type, perf_session__e_machine(evsel__session(evsel)));
+	e_machine = perf_session__e_machine(evsel__session(evsel), /*e_flags=*/NULL);
+	callchain_param_setup(sample_type, e_machine);
 
 	/* Enable fields for callchain entries */
 	if (symbol_conf.use_callchain &&
@@ -3834,7 +3836,7 @@ static void script__setup_sample_type(struct perf_script *script)
 	struct perf_session *session = script->session;
 	u64 sample_type = evlist__combined_sample_type(session->evlist);
 
-	callchain_param_setup(sample_type, perf_session__e_machine(session));
+	callchain_param_setup(sample_type, perf_session__e_machine(session, /*e_flags=*/NULL));
 
 	if (script->stitch_lbr && (callchain_param.record_mode != CALLCHAIN_LBR)) {
 		pr_warning("Can't find LBR callchain. Switch off --stitch-lbr.\n"
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index 848d0faf6698..aff44ffd3ff1 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -1008,11 +1008,11 @@ int evsel__group_desc(struct evsel *evsel, char *buf, size_t size)
 	return ret;
 }
 
-uint16_t evsel__e_machine(struct evsel *evsel)
+uint16_t evsel__e_machine(struct evsel *evsel, uint32_t *e_flags)
 {
 	struct perf_session *session = evsel__session(evsel);
 
-	return session ? perf_session__e_machine(session) : EM_HOST;
+	return perf_session__e_machine(session, e_flags);
 }
 
 static void __evsel__config_callchain(struct evsel *evsel, struct record_opts *opts,
@@ -1050,7 +1050,7 @@ static void __evsel__config_callchain(struct evsel *evsel, struct record_opts *o
 
 	if (param->record_mode == CALLCHAIN_DWARF) {
 		if (!function) {
-			uint16_t e_machine = evsel__e_machine(evsel);
+			uint16_t e_machine = evsel__e_machine(evsel, /*e_flags=*/NULL);
 
 			evsel__set_sample_bit(evsel, REGS_USER);
 			evsel__set_sample_bit(evsel, STACK_USER);
diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
index eefb5d569971..a3d754c029a0 100644
--- a/tools/perf/util/evsel.h
+++ b/tools/perf/util/evsel.h
@@ -546,7 +546,7 @@ static inline bool evsel__is_dummy_event(struct evsel *evsel)
 
 struct perf_session *evsel__session(struct evsel *evsel);
 struct perf_env *evsel__env(struct evsel *evsel);
-uint16_t evsel__e_machine(struct evsel *evsel);
+uint16_t evsel__e_machine(struct evsel *evsel, uint32_t *e_flags);
 
 int evsel__store_ids(struct evsel *evsel, struct evlist *evlist);
 
diff --git a/tools/perf/util/kvm-stat.c b/tools/perf/util/kvm-stat.c
index 858b5dbd39f6..27f16810498c 100644
--- a/tools/perf/util/kvm-stat.c
+++ b/tools/perf/util/kvm-stat.c
@@ -6,15 +6,19 @@
 
 bool kvm_exit_event(struct evsel *evsel)
 {
-	return evsel__name_is(evsel, kvm_exit_trace(evsel__e_machine(evsel)));
+	uint16_t e_machine = evsel__e_machine(evsel, /*e_flags=*/NULL);
+
+	return evsel__name_is(evsel, kvm_exit_trace(e_machine));
 }
 
 void exit_event_get_key(struct evsel *evsel,
 			struct perf_sample *sample,
 			struct event_key *key)
 {
+	uint16_t e_machine = evsel__e_machine(evsel, /*e_flags=*/NULL);
+
 	key->info = 0;
-	key->key  = evsel__intval(evsel, sample, kvm_exit_reason(evsel__e_machine(evsel)));
+	key->key  = evsel__intval(evsel, sample, kvm_exit_reason(e_machine));
 }
 
 
@@ -31,7 +35,9 @@ bool exit_event_begin(struct evsel *evsel,
 
 bool kvm_entry_event(struct evsel *evsel)
 {
-	return evsel__name_is(evsel, kvm_entry_trace(evsel__e_machine(evsel)));
+	uint16_t e_machine = evsel__e_machine(evsel, /*e_flags=*/NULL);
+
+	return evsel__name_is(evsel, kvm_entry_trace(e_machine));
 }
 
 bool exit_event_end(struct evsel *evsel,
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index d0053618f540..72e8bb67d740 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -2964,27 +2964,47 @@ struct perf_env *perf_session__env(struct perf_session *session)
 	return &session->header.env;
 }
 
-static int perf_session__e_machine_cb(struct thread *thread,
-				      void *arg __maybe_unused)
+struct perf_session__e_machine_cb_args {
+	uint32_t e_flags;
+	uint16_t e_machine;
+	bool need_e_flags;
+};
+
+static int perf_session__e_machine_cb(struct thread *thread, void *_args)
 {
-	uint16_t *result = arg;
+	struct perf_session__e_machine_cb_args *args = _args;
 	struct machine *machine = maps__machine(thread__maps(thread));
 
-	*result = thread__e_machine(thread, machine, /*e_flags=*/NULL);
-	return *result != EM_NONE ? 1 : 0;
+	args->e_machine = thread__e_machine(thread, machine,
+					    args->need_e_flags ? &args->e_flags : NULL);
+	return args->e_machine != EM_NONE ? 1 : 0;
 }
 
 /*
  * Note, a machine may have mixed 32-bit and 64-bit processes and so mixed
  * e_machines. Use thread__e_machine when this matters.
  */
-uint16_t perf_session__e_machine(struct perf_session *session)
+uint16_t perf_session__e_machine(struct perf_session *session, uint32_t *e_flags)
 {
-	uint16_t e_machine = EM_NONE;
+	struct perf_session__e_machine_cb_args args = {
+		.e_machine = EM_NONE,
+		.need_e_flags = e_flags != NULL,
+	};
+
+	if (!session) {
+		/* Default to assuming a host machine. */
+		if (e_flags)
+			*e_flags = EF_HOST;
+
+		return EM_HOST;
+	}
 
 	machines__for_each_thread(&session->machines,
-					 perf_session__e_machine_cb,
-					 &e_machine);
+				  perf_session__e_machine_cb,
+				  &args);
+
+	if (e_flags)
+		*e_flags = args.e_flags;
 
-	return e_machine == EM_NONE ? EM_HOST : e_machine;
+	return args.e_machine == EM_NONE ? EM_HOST : args.e_machine;
 }
diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h
index eddc4c630b33..f05f0d4a6c23 100644
--- a/tools/perf/util/session.h
+++ b/tools/perf/util/session.h
@@ -211,6 +211,6 @@ int perf_event__process_finished_round(const struct perf_tool *tool,
 				       struct ordered_events *oe);
 
 struct perf_env *perf_session__env(struct perf_session *session);
-uint16_t perf_session__e_machine(struct perf_session *session);
+uint16_t perf_session__e_machine(struct perf_session *session, uint32_t *e_flags);
 
 #endif /* __PERF_SESSION_H */
-- 
cgit v1.2.3


From c4f4392264b45d53ec6e4d21b6f7d947953ddf45 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 3 Feb 2026 10:26:39 -0800
Subject: perf header: Add e_machine/e_flags to the header

Add 64-bits of feature data to record the ELF machine and flags.

This allows readers to initialize based on the data.

For example, `perf kvm stat` wants to initialize based on the kind of
data to be read, but at initialization time there are no threads to base
this data upon and using the host means cross platform support won't
work.

The values in the perf_env also act as a cache for these within the
session.

Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Aditya Bodkhe <aditya.b1@linux.ibm.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Alexandre Ghiti <alex@ghiti.fr>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Andrew Jones <ajones@ventanamicro.com>
Cc: Anubhav Shelat <ashelat@redhat.com>
Cc: Anup Patel <anup@brainfault.org>
Cc: Athira Rajeev <atrajeev@linux.ibm.com>
Cc: Blake Jones <blakejones@google.com>
Cc: Chun-Tse Shao <ctshao@google.com>
Cc: Dapeng Mi <dapeng1.mi@linux.intel.com>
Cc: Dmitriy Vyukov <dvyukov@google.com>
Cc: Howard Chu <howardchu95@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Leo Yan <leo.yan@linux.dev>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <pjw@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Quan Zhou <zhouquan@iscas.ac.cn>
Cc: Shimin Guo <shimin.guo@skydio.com>
Cc: Swapnil Sapkal <swapnil.sapkal@amd.com>
Cc: Thomas Falcon <thomas.falcon@intel.com>
Cc: Will Deacon <will@kernel.org>
Cc: Yunseong Kim <ysk@kzalloc.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/env.h     |  3 +++
 tools/perf/util/header.c  | 33 +++++++++++++++++++++++++++++++++
 tools/perf/util/header.h  |  1 +
 tools/perf/util/session.c | 33 +++++++++++++++++++++++++++------
 4 files changed, 64 insertions(+), 6 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/env.h b/tools/perf/util/env.h
index 76ba1a36e9ff..a4501cbca375 100644
--- a/tools/perf/util/env.h
+++ b/tools/perf/util/env.h
@@ -74,6 +74,9 @@ struct perf_env {
 	char			*os_release;
 	char			*version;
 	char			*arch;
+	/* e_machine expanded from 16 to 32-bits for alignment. */
+	u32			e_machine;
+	u32			e_flags;
 	int			nr_cpus_online;
 	int			nr_cpus_avail;
 	char			*cpu_desc;
diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index 31c3bab1b10a..9142a8ba4019 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -379,6 +379,21 @@ static int write_arch(struct feat_fd *ff,
 	return do_write_string(ff, uts.machine);
 }
 
+static int write_e_machine(struct feat_fd *ff,
+			   struct evlist *evlist __maybe_unused)
+{
+	/* e_machine expanded from 16 to 32-bits for alignment. */
+	uint32_t e_flags;
+	uint32_t e_machine = perf_session__e_machine(evlist->session, &e_flags);
+	int ret;
+
+	ret = do_write(ff, &e_machine, sizeof(e_machine));
+	if (ret)
+		return ret;
+
+	return do_write(ff, &e_flags, sizeof(e_flags));
+}
+
 static int write_version(struct feat_fd *ff,
 			 struct evlist *evlist __maybe_unused)
 {
@@ -1785,6 +1800,12 @@ static void print_arch(struct feat_fd *ff, FILE *fp)
 	fprintf(fp, "# arch : %s\n", ff->ph->env.arch);
 }
 
+static void print_e_machine(struct feat_fd *ff, FILE *fp)
+{
+	fprintf(fp, "# e_machine : %u\n", ff->ph->env.e_machine);
+	fprintf(fp, "#   e_flags : %u\n", ff->ph->env.e_flags);
+}
+
 static void print_cpudesc(struct feat_fd *ff, FILE *fp)
 {
 	fprintf(fp, "# cpudesc : %s\n", ff->ph->env.cpu_desc);
@@ -2612,6 +2633,17 @@ FEAT_PROCESS_STR_FUN(arch, arch);
 FEAT_PROCESS_STR_FUN(cpudesc, cpu_desc);
 FEAT_PROCESS_STR_FUN(cpuid, cpuid);
 
+static int process_e_machine(struct feat_fd *ff, void *data __maybe_unused)
+{
+	int ret;
+
+	ret = do_read_u32(ff, &ff->ph->env.e_machine);
+	if (ret)
+		return ret;
+
+	return do_read_u32(ff, &ff->ph->env.e_flags);
+}
+
 #ifdef HAVE_LIBTRACEEVENT
 static int process_tracing_data(struct feat_fd *ff, void *data)
 {
@@ -3730,6 +3762,7 @@ const struct perf_header_feature_ops feat_ops[HEADER_LAST_FEATURE] = {
 	FEAT_OPN(HYBRID_TOPOLOGY,	hybrid_topology,	true),
 	FEAT_OPR(PMU_CAPS,	pmu_caps,	false),
 	FEAT_OPR(CPU_DOMAIN_INFO,	cpu_domain_info,	true),
+	FEAT_OPR(E_MACHINE,	e_machine,	false),
 };
 
 struct header_print_data {
diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h
index 36cc74e2d14d..cc40ac796f52 100644
--- a/tools/perf/util/header.h
+++ b/tools/perf/util/header.h
@@ -54,6 +54,7 @@ enum {
 	HEADER_HYBRID_TOPOLOGY,
 	HEADER_PMU_CAPS,
 	HEADER_CPU_DOMAIN_INFO,
+	HEADER_E_MACHINE,
 	HEADER_LAST_FEATURE,
 	HEADER_FEAT_BITS	= 256,
 };
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index 72e8bb67d740..53f51c3f9603 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -2967,7 +2967,6 @@ struct perf_env *perf_session__env(struct perf_session *session)
 struct perf_session__e_machine_cb_args {
 	uint32_t e_flags;
 	uint16_t e_machine;
-	bool need_e_flags;
 };
 
 static int perf_session__e_machine_cb(struct thread *thread, void *_args)
@@ -2975,8 +2974,7 @@ static int perf_session__e_machine_cb(struct thread *thread, void *_args)
 	struct perf_session__e_machine_cb_args *args = _args;
 	struct machine *machine = maps__machine(thread__maps(thread));
 
-	args->e_machine = thread__e_machine(thread, machine,
-					    args->need_e_flags ? &args->e_flags : NULL);
+	args->e_machine = thread__e_machine(thread, machine, &args->e_flags);
 	return args->e_machine != EM_NONE ? 1 : 0;
 }
 
@@ -2988,8 +2986,8 @@ uint16_t perf_session__e_machine(struct perf_session *session, uint32_t *e_flags
 {
 	struct perf_session__e_machine_cb_args args = {
 		.e_machine = EM_NONE,
-		.need_e_flags = e_flags != NULL,
 	};
+	struct perf_env *env;
 
 	if (!session) {
 		/* Default to assuming a host machine. */
@@ -2999,12 +2997,35 @@ uint16_t perf_session__e_machine(struct perf_session *session, uint32_t *e_flags
 		return EM_HOST;
 	}
 
+	env = perf_session__env(session);
+	if (env && env->e_machine != EM_NONE) {
+		if (e_flags)
+			*e_flags = env->e_flags;
+
+		return env->e_machine;
+	}
+
 	machines__for_each_thread(&session->machines,
 				  perf_session__e_machine_cb,
 				  &args);
 
+	if (args.e_machine != EM_NONE) {
+		if (env) {
+			env->e_machine = args.e_machine;
+			env->e_flags = args.e_flags;
+		}
+		if (e_flags)
+			*e_flags = args.e_flags;
+
+		return args.e_machine;
+	}
+
+	/*
+	 * Couldn't determine from the perf_env or current set of
+	 * threads. Default to the host.
+	 */
 	if (e_flags)
-		*e_flags = args.e_flags;
+		*e_flags = EF_HOST;
 
-	return args.e_machine == EM_NONE ? EM_HOST : args.e_machine;
+	return EM_HOST;
 }
-- 
cgit v1.2.3


From 84cb36da81413c2dff805150b9f4db1524460269 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 3 Feb 2026 10:26:40 -0800
Subject: perf thread: Don't require machine to compute the e_machine

The machine can be calculated from a thread via its maps.

Don't require the machine argument to simplify callers and also to delay
computing the machine until a little later.

Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Aditya Bodkhe <aditya.b1@linux.ibm.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Alexandre Ghiti <alex@ghiti.fr>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Andrew Jones <ajones@ventanamicro.com>
Cc: Anubhav Shelat <ashelat@redhat.com>
Cc: Anup Patel <anup@brainfault.org>
Cc: Athira Rajeev <atrajeev@linux.ibm.com>
Cc: Blake Jones <blakejones@google.com>
Cc: Chun-Tse Shao <ctshao@google.com>
Cc: Dapeng Mi <dapeng1.mi@linux.intel.com>
Cc: Dmitriy Vyukov <dvyukov@google.com>
Cc: Howard Chu <howardchu95@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Leo Yan <leo.yan@linux.dev>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <pjw@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Quan Zhou <zhouquan@iscas.ac.cn>
Cc: Shimin Guo <shimin.guo@skydio.com>
Cc: Swapnil Sapkal <swapnil.sapkal@amd.com>
Cc: Thomas Falcon <thomas.falcon@intel.com>
Cc: Will Deacon <will@kernel.org>
Cc: Yunseong Kim <ysk@kzalloc.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/scripting-engines/trace-event-python.c | 8 +++-----
 tools/perf/util/session.c                              | 3 +--
 tools/perf/util/thread.c                               | 5 +++++
 3 files changed, 9 insertions(+), 7 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c
index 62c9c73daef5..2b0df7bd9a46 100644
--- a/tools/perf/util/scripting-engines/trace-event-python.c
+++ b/tools/perf/util/scripting-engines/trace-event-python.c
@@ -837,7 +837,6 @@ static PyObject *get_perf_sample_dict(struct perf_sample *sample,
 					 PyObject *callchain)
 {
 	PyObject *dict, *dict_sample, *brstack, *brstacksym;
-	struct machine *machine;
 	uint16_t e_machine = EM_HOST;
 	uint32_t e_flags = EF_HOST;
 
@@ -926,10 +925,9 @@ static PyObject *get_perf_sample_dict(struct perf_sample *sample,
 			PyLong_FromUnsignedLongLong(sample->cyc_cnt));
 	}
 
-	if (al->thread) {
-		machine = maps__machine(thread__maps(al->thread));
-		e_machine = thread__e_machine(al->thread, machine, &e_flags);
-	}
+	if (al->thread)
+		e_machine = thread__e_machine(al->thread, /*machine=*/NULL, &e_flags);
+
 	if (set_regs_in_dict(dict, sample, evsel, e_machine, e_flags))
 		Py_FatalError("Failed to setting regs in dict");
 
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index 53f51c3f9603..4b465abfa36c 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -2972,9 +2972,8 @@ struct perf_session__e_machine_cb_args {
 static int perf_session__e_machine_cb(struct thread *thread, void *_args)
 {
 	struct perf_session__e_machine_cb_args *args = _args;
-	struct machine *machine = maps__machine(thread__maps(thread));
 
-	args->e_machine = thread__e_machine(thread, machine, &args->e_flags);
+	args->e_machine = thread__e_machine(thread, /*machine=*/NULL, &args->e_flags);
 	return args->e_machine != EM_NONE ? 1 : 0;
 }
 
diff --git a/tools/perf/util/thread.c b/tools/perf/util/thread.c
index 618f29afb160..22be77225bb0 100644
--- a/tools/perf/util/thread.c
+++ b/tools/perf/util/thread.c
@@ -499,6 +499,11 @@ uint16_t thread__e_machine(struct thread *thread, struct machine *machine, uint3
 		return e_machine;
 	}
 
+	if (machine == NULL) {
+		struct maps *maps = thread__maps(thread);
+
+		machine = maps__machine(maps);
+	}
 	tid = thread__tid(thread);
 	pid = thread__pid(thread);
 	if (pid != tid) {
-- 
cgit v1.2.3


From 10653c0dd86812981a9cf7112f0711f9f1f153a8 Mon Sep 17 00:00:00 2001
From: Mykyta Yatsenko <yatsenko@meta.com>
Date: Sat, 31 Jan 2026 18:53:58 -0800
Subject: selftests/bpf: Refactor timer selftests

Refactor timer selftests, extracting stress test into a separate test.
This makes it easier to debug test failures and allows to extend.

Signed-off-by: Mykyta Yatsenko <yatsenko@meta.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20260201025403.66625-5-alexei.starovoitov@gmail.com
---
 tools/testing/selftests/bpf/prog_tests/timer.c | 55 +++++++++++++++++---------
 1 file changed, 36 insertions(+), 19 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/timer.c b/tools/testing/selftests/bpf/prog_tests/timer.c
index 34f9ccce2602..4d853d1bd2a7 100644
--- a/tools/testing/selftests/bpf/prog_tests/timer.c
+++ b/tools/testing/selftests/bpf/prog_tests/timer.c
@@ -22,13 +22,35 @@ static void *spin_lock_thread(void *arg)
 	pthread_exit(arg);
 }
 
-static int timer(struct timer *timer_skel)
+
+static int timer_stress(struct timer *timer_skel)
 {
-	int i, err, prog_fd;
+	int i, err = 1, prog_fd;
 	LIBBPF_OPTS(bpf_test_run_opts, topts);
 	pthread_t thread_id[NUM_THR];
 	void *ret;
 
+	prog_fd = bpf_program__fd(timer_skel->progs.race);
+	for (i = 0; i < NUM_THR; i++) {
+		err = pthread_create(&thread_id[i], NULL,
+				     &spin_lock_thread, &prog_fd);
+		if (!ASSERT_OK(err, "pthread_create"))
+			break;
+	}
+
+	while (i) {
+		err = pthread_join(thread_id[--i], &ret);
+		if (ASSERT_OK(err, "pthread_join"))
+			ASSERT_EQ(ret, (void *)&prog_fd, "pthread_join");
+	}
+	return err;
+}
+
+static int timer(struct timer *timer_skel)
+{
+	int err, prog_fd;
+	LIBBPF_OPTS(bpf_test_run_opts, topts);
+
 	err = timer__attach(timer_skel);
 	if (!ASSERT_OK(err, "timer_attach"))
 		return err;
@@ -63,25 +85,10 @@ static int timer(struct timer *timer_skel)
 	/* check that code paths completed */
 	ASSERT_EQ(timer_skel->bss->ok, 1 | 2 | 4, "ok");
 
-	prog_fd = bpf_program__fd(timer_skel->progs.race);
-	for (i = 0; i < NUM_THR; i++) {
-		err = pthread_create(&thread_id[i], NULL,
-				     &spin_lock_thread, &prog_fd);
-		if (!ASSERT_OK(err, "pthread_create"))
-			break;
-	}
-
-	while (i) {
-		err = pthread_join(thread_id[--i], &ret);
-		if (ASSERT_OK(err, "pthread_join"))
-			ASSERT_EQ(ret, (void *)&prog_fd, "pthread_join");
-	}
-
 	return 0;
 }
 
-/* TODO: use pid filtering */
-void serial_test_timer(void)
+static void test_timer(int (*timer_test_fn)(struct timer *timer_skel))
 {
 	struct timer *timer_skel = NULL;
 	int err;
@@ -94,13 +101,23 @@ void serial_test_timer(void)
 	if (!ASSERT_OK_PTR(timer_skel, "timer_skel_load"))
 		return;
 
-	err = timer(timer_skel);
+	err = timer_test_fn(timer_skel);
 	ASSERT_OK(err, "timer");
 	timer__destroy(timer_skel);
+}
+
+void serial_test_timer(void)
+{
+	test_timer(timer);
 
 	RUN_TESTS(timer_failure);
 }
 
+void serial_test_timer_stress(void)
+{
+	test_timer(timer_stress);
+}
+
 void test_timer_interrupt(void)
 {
 	struct timer_interrupt *skel = NULL;
-- 
cgit v1.2.3


From d02fdd7195ca24005e36a6f7efed41f9c0ca7f72 Mon Sep 17 00:00:00 2001
From: Mykyta Yatsenko <yatsenko@meta.com>
Date: Sat, 31 Jan 2026 18:53:59 -0800
Subject: selftests/bpf: Add stress test for timer async cancel

Extend BPF timer selftest to run stress test for async cancel.

Signed-off-by: Mykyta Yatsenko <yatsenko@meta.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20260201025403.66625-6-alexei.starovoitov@gmail.com
---
 tools/testing/selftests/bpf/prog_tests/timer.c | 18 +++++++++++++++++-
 tools/testing/selftests/bpf/progs/timer.c      | 14 +++++++++++---
 2 files changed, 28 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/timer.c b/tools/testing/selftests/bpf/prog_tests/timer.c
index 4d853d1bd2a7..a157a2a699e6 100644
--- a/tools/testing/selftests/bpf/prog_tests/timer.c
+++ b/tools/testing/selftests/bpf/prog_tests/timer.c
@@ -23,13 +23,14 @@ static void *spin_lock_thread(void *arg)
 }
 
 
-static int timer_stress(struct timer *timer_skel)
+static int timer_stress_runner(struct timer *timer_skel, bool async_cancel)
 {
 	int i, err = 1, prog_fd;
 	LIBBPF_OPTS(bpf_test_run_opts, topts);
 	pthread_t thread_id[NUM_THR];
 	void *ret;
 
+	timer_skel->bss->async_cancel = async_cancel;
 	prog_fd = bpf_program__fd(timer_skel->progs.race);
 	for (i = 0; i < NUM_THR; i++) {
 		err = pthread_create(&thread_id[i], NULL,
@@ -46,6 +47,16 @@ static int timer_stress(struct timer *timer_skel)
 	return err;
 }
 
+static int timer_stress(struct timer *timer_skel)
+{
+	return timer_stress_runner(timer_skel, false);
+}
+
+static int timer_stress_async_cancel(struct timer *timer_skel)
+{
+	return timer_stress_runner(timer_skel, true);
+}
+
 static int timer(struct timer *timer_skel)
 {
 	int err, prog_fd;
@@ -118,6 +129,11 @@ void serial_test_timer_stress(void)
 	test_timer(timer_stress);
 }
 
+void serial_test_timer_stress_async_cancel(void)
+{
+	test_timer(timer_stress_async_cancel);
+}
+
 void test_timer_interrupt(void)
 {
 	struct timer_interrupt *skel = NULL;
diff --git a/tools/testing/selftests/bpf/progs/timer.c b/tools/testing/selftests/bpf/progs/timer.c
index 4c677c001258..a81413514e4b 100644
--- a/tools/testing/selftests/bpf/progs/timer.c
+++ b/tools/testing/selftests/bpf/progs/timer.c
@@ -1,13 +1,17 @@
 // SPDX-License-Identifier: GPL-2.0
 /* Copyright (c) 2021 Facebook */
-#include <linux/bpf.h>
-#include <time.h>
+
+#include <vmlinux.h>
 #include <stdbool.h>
 #include <errno.h>
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_tracing.h>
 
+#define CLOCK_MONOTONIC 1
+#define CLOCK_BOOTTIME 7
+
 char _license[] SEC("license") = "GPL";
+
 struct hmap_elem {
 	int counter;
 	struct bpf_timer timer;
@@ -63,6 +67,7 @@ __u64 callback_check = 52;
 __u64 callback2_check = 52;
 __u64 pinned_callback_check;
 __s32 pinned_cpu;
+bool async_cancel = 0;
 
 #define ARRAY 1
 #define HTAB 2
@@ -419,7 +424,10 @@ int race(void *ctx)
 
 	bpf_timer_set_callback(timer, race_timer_callback);
 	bpf_timer_start(timer, 0, 0);
-	bpf_timer_cancel(timer);
+	if (async_cancel)
+		bpf_timer_cancel_async(timer);
+	else
+		bpf_timer_cancel(timer);
 
 	return 0;
 }
-- 
cgit v1.2.3


From fe9d205cec8ccdd13171a056257101374306e802 Mon Sep 17 00:00:00 2001
From: Mykyta Yatsenko <yatsenko@meta.com>
Date: Sat, 31 Jan 2026 18:54:00 -0800
Subject: selftests/bpf: Verify bpf_timer_cancel_async works

Add test that verifies that bpf_timer_cancel_async works: can cancel
callback successfully.

Signed-off-by: Mykyta Yatsenko <yatsenko@meta.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20260201025403.66625-7-alexei.starovoitov@gmail.com
---
 tools/testing/selftests/bpf/prog_tests/timer.c | 25 +++++++++++++++++++++++++
 tools/testing/selftests/bpf/progs/timer.c      | 23 +++++++++++++++++++++++
 2 files changed, 48 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/timer.c b/tools/testing/selftests/bpf/prog_tests/timer.c
index a157a2a699e6..2b932d4dfd43 100644
--- a/tools/testing/selftests/bpf/prog_tests/timer.c
+++ b/tools/testing/selftests/bpf/prog_tests/timer.c
@@ -99,6 +99,26 @@ static int timer(struct timer *timer_skel)
 	return 0;
 }
 
+static int timer_cancel_async(struct timer *timer_skel)
+{
+	int err, prog_fd;
+	LIBBPF_OPTS(bpf_test_run_opts, topts);
+
+	prog_fd = bpf_program__fd(timer_skel->progs.test_async_cancel_succeed);
+	err = bpf_prog_test_run_opts(prog_fd, &topts);
+	ASSERT_OK(err, "test_run");
+	ASSERT_EQ(topts.retval, 0, "test_run");
+
+	usleep(500);
+	/* check that there were no errors in timer execution */
+	ASSERT_EQ(timer_skel->bss->err, 0, "err");
+
+	/* check that code paths completed */
+	ASSERT_EQ(timer_skel->bss->ok, 1 | 2 | 4, "ok");
+
+	return 0;
+}
+
 static void test_timer(int (*timer_test_fn)(struct timer *timer_skel))
 {
 	struct timer *timer_skel = NULL;
@@ -134,6 +154,11 @@ void serial_test_timer_stress_async_cancel(void)
 	test_timer(timer_stress_async_cancel);
 }
 
+void serial_test_timer_async_cancel(void)
+{
+	test_timer(timer_cancel_async);
+}
+
 void test_timer_interrupt(void)
 {
 	struct timer_interrupt *skel = NULL;
diff --git a/tools/testing/selftests/bpf/progs/timer.c b/tools/testing/selftests/bpf/progs/timer.c
index a81413514e4b..4b4ca781e7cd 100644
--- a/tools/testing/selftests/bpf/progs/timer.c
+++ b/tools/testing/selftests/bpf/progs/timer.c
@@ -169,6 +169,29 @@ int BPF_PROG2(test1, int, a)
 	return 0;
 }
 
+static int timer_error(void *map, int *key, struct bpf_timer *timer)
+{
+	err = 42;
+	return 0;
+}
+
+SEC("syscall")
+int test_async_cancel_succeed(void *ctx)
+{
+	struct bpf_timer *arr_timer;
+	int array_key = ARRAY;
+
+	arr_timer = bpf_map_lookup_elem(&array, &array_key);
+	if (!arr_timer)
+		return 0;
+	bpf_timer_init(arr_timer, &array, CLOCK_MONOTONIC);
+	bpf_timer_set_callback(arr_timer, timer_error);
+	bpf_timer_start(arr_timer, 100000 /* 100us */, 0);
+	bpf_timer_cancel_async(arr_timer);
+	ok = 7;
+	return 0;
+}
+
 /* callback for prealloc and non-prealloca hashtab timers */
 static int timer_cb2(void *map, int *key, struct hmap_elem *val)
 {
-- 
cgit v1.2.3


From 083c5a4babad4af89dd07e2cc5f7004343d4c1f0 Mon Sep 17 00:00:00 2001
From: Mykyta Yatsenko <yatsenko@meta.com>
Date: Sat, 31 Jan 2026 18:54:01 -0800
Subject: selftests/bpf: Add timer stress test in NMI context

Add stress tests for BPF timers that run in NMI context using perf_event
programs attached to PERF_COUNT_HW_CPU_CYCLES.

The tests cover three scenarios:
- nmi_race: Tests concurrent timer start and async cancel operations
- nmi_update: Tests updating a map element (effectively deleting and
  inserting new for array map) from within a timer callback
- nmi_cancel: Tests timer self-cancellation attempt.

A common test_common() helper is used to share timer setup logic across
all test modes.

The tests spawn multiple threads in a child process to generate
perf events, which trigger the BPF programs in NMI context. Hit counters
verify that the NMI code paths were actually exercised.

Signed-off-by: Mykyta Yatsenko <yatsenko@meta.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20260201025403.66625-8-alexei.starovoitov@gmail.com
---
 tools/testing/selftests/bpf/prog_tests/timer.c | 158 +++++++++++++++++++++++++
 tools/testing/selftests/bpf/progs/timer.c      |  85 +++++++++++--
 2 files changed, 231 insertions(+), 12 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/timer.c b/tools/testing/selftests/bpf/prog_tests/timer.c
index 2b932d4dfd43..09ff21e1ad2f 100644
--- a/tools/testing/selftests/bpf/prog_tests/timer.c
+++ b/tools/testing/selftests/bpf/prog_tests/timer.c
@@ -1,12 +1,27 @@
 // SPDX-License-Identifier: GPL-2.0
 /* Copyright (c) 2021 Facebook */
+#include <sched.h>
 #include <test_progs.h>
+#include <linux/perf_event.h>
+#include <sys/syscall.h>
 #include "timer.skel.h"
 #include "timer_failure.skel.h"
 #include "timer_interrupt.skel.h"
 
 #define NUM_THR 8
 
+static int perf_event_open(__u32 type, __u64 config, int pid, int cpu)
+{
+	struct perf_event_attr attr = {
+		.type = type,
+		.config = config,
+		.size = sizeof(struct perf_event_attr),
+		.sample_period = 10000,
+	};
+
+	return syscall(__NR_perf_event_open, &attr, pid, cpu, -1, 0);
+}
+
 static void *spin_lock_thread(void *arg)
 {
 	int i, err, prog_fd = *(int *)arg;
@@ -57,6 +72,134 @@ static int timer_stress_async_cancel(struct timer *timer_skel)
 	return timer_stress_runner(timer_skel, true);
 }
 
+static void *nmi_cpu_worker(void *arg)
+{
+	volatile __u64 num = 1;
+	int i;
+
+	for (i = 0; i < 500000000; ++i)
+		num *= (i % 7) + 1;
+	(void)num;
+
+	return NULL;
+}
+
+static int run_nmi_test(struct timer *timer_skel, struct bpf_program *prog)
+{
+	struct bpf_link *link = NULL;
+	int pe_fd = -1, pipefd[2] = {-1, -1}, pid = 0, status;
+	char buf = 0;
+	int ret = -1;
+
+	if (!ASSERT_OK(pipe(pipefd), "pipe"))
+		goto cleanup;
+
+	pid = fork();
+	if (pid == 0) {
+		/* Child: spawn multiple threads to consume multiple CPUs */
+		pthread_t threads[NUM_THR];
+		int i;
+
+		close(pipefd[1]);
+		read(pipefd[0], &buf, 1);
+		close(pipefd[0]);
+
+		for (i = 0; i < NUM_THR; i++)
+			pthread_create(&threads[i], NULL, nmi_cpu_worker, NULL);
+		for (i = 0; i < NUM_THR; i++)
+			pthread_join(threads[i], NULL);
+		exit(0);
+	}
+
+	if (!ASSERT_GE(pid, 0, "fork"))
+		goto cleanup;
+
+	/* Open perf event for child process across all CPUs */
+	pe_fd = perf_event_open(PERF_TYPE_HARDWARE,
+				PERF_COUNT_HW_CPU_CYCLES,
+				pid,  /* measure child process */
+				-1);  /* on any CPU */
+	if (pe_fd < 0) {
+		if (errno == ENOENT || errno == EOPNOTSUPP) {
+			printf("SKIP:no PERF_COUNT_HW_CPU_CYCLES\n");
+			test__skip();
+			ret = EOPNOTSUPP;
+			goto cleanup;
+		}
+		ASSERT_GE(pe_fd, 0, "perf_event_open");
+		goto cleanup;
+	}
+
+	link = bpf_program__attach_perf_event(prog, pe_fd);
+	if (!ASSERT_OK_PTR(link, "attach_perf_event"))
+		goto cleanup;
+	pe_fd = -1;  /* Ownership transferred to link */
+
+	/* Signal child to start CPU work */
+	close(pipefd[0]);
+	pipefd[0] = -1;
+	write(pipefd[1], &buf, 1);
+	close(pipefd[1]);
+	pipefd[1] = -1;
+
+	waitpid(pid, &status, 0);
+	pid = 0;
+
+	/* Verify NMI context was hit */
+	ASSERT_GT(timer_skel->bss->test_hits, 0, "test_hits");
+	ret = 0;
+
+cleanup:
+	bpf_link__destroy(link);
+	if (pe_fd >= 0)
+		close(pe_fd);
+	if (pid > 0) {
+		write(pipefd[1], &buf, 1);
+		waitpid(pid, &status, 0);
+	}
+	if (pipefd[0] >= 0)
+		close(pipefd[0]);
+	if (pipefd[1] >= 0)
+		close(pipefd[1]);
+	return ret;
+}
+
+static int timer_stress_nmi_race(struct timer *timer_skel)
+{
+	int err;
+
+	err = run_nmi_test(timer_skel, timer_skel->progs.nmi_race);
+	if (err == EOPNOTSUPP)
+		return 0;
+	return err;
+}
+
+static int timer_stress_nmi_update(struct timer *timer_skel)
+{
+	int err;
+
+	err = run_nmi_test(timer_skel, timer_skel->progs.nmi_update);
+	if (err == EOPNOTSUPP)
+		return 0;
+	if (err)
+		return err;
+	ASSERT_GT(timer_skel->bss->update_hits, 0, "update_hits");
+	return 0;
+}
+
+static int timer_stress_nmi_cancel(struct timer *timer_skel)
+{
+	int err;
+
+	err = run_nmi_test(timer_skel, timer_skel->progs.nmi_cancel);
+	if (err == EOPNOTSUPP)
+		return 0;
+	if (err)
+		return err;
+	ASSERT_GT(timer_skel->bss->cancel_hits, 0, "cancel_hits");
+	return 0;
+}
+
 static int timer(struct timer *timer_skel)
 {
 	int err, prog_fd;
@@ -159,6 +302,21 @@ void serial_test_timer_async_cancel(void)
 	test_timer(timer_cancel_async);
 }
 
+void serial_test_timer_stress_nmi_race(void)
+{
+	test_timer(timer_stress_nmi_race);
+}
+
+void serial_test_timer_stress_nmi_update(void)
+{
+	test_timer(timer_stress_nmi_update);
+}
+
+void serial_test_timer_stress_nmi_cancel(void)
+{
+	test_timer(timer_stress_nmi_cancel);
+}
+
 void test_timer_interrupt(void)
 {
 	struct timer_interrupt *skel = NULL;
diff --git a/tools/testing/selftests/bpf/progs/timer.c b/tools/testing/selftests/bpf/progs/timer.c
index 4b4ca781e7cd..d6d5fefcd9b1 100644
--- a/tools/testing/selftests/bpf/progs/timer.c
+++ b/tools/testing/selftests/bpf/progs/timer.c
@@ -63,6 +63,9 @@ __u64 bss_data;
 __u64 abs_data;
 __u64 err;
 __u64 ok;
+__u64 test_hits;
+__u64 update_hits;
+__u64 cancel_hits;
 __u64 callback_check = 52;
 __u64 callback2_check = 52;
 __u64 pinned_callback_check;
@@ -427,30 +430,88 @@ static int race_timer_callback(void *race_array, int *race_key, struct bpf_timer
 	return 0;
 }
 
-SEC("syscall")
-int race(void *ctx)
+/* Callback that updates its own map element */
+static int update_self_callback(void *map, int *key, struct bpf_timer *timer)
+{
+	struct elem init = {};
+
+	bpf_map_update_elem(map, key, &init, BPF_ANY);
+	__sync_fetch_and_add(&update_hits, 1);
+	return 0;
+}
+
+/* Callback that cancels itself using async cancel */
+static int cancel_self_callback(void *map, int *key, struct bpf_timer *timer)
+{
+	bpf_timer_cancel_async(timer);
+	__sync_fetch_and_add(&cancel_hits, 1);
+	return 0;
+}
+
+enum test_mode {
+	TEST_RACE_SYNC,
+	TEST_RACE_ASYNC,
+	TEST_UPDATE,
+	TEST_CANCEL,
+};
+
+static __always_inline int test_common(enum test_mode mode)
 {
 	struct bpf_timer *timer;
-	int err, race_key = 0;
 	struct elem init;
+	int ret, key = 0;
 
 	__builtin_memset(&init, 0, sizeof(struct elem));
-	bpf_map_update_elem(&race_array, &race_key, &init, BPF_ANY);
 
-	timer = bpf_map_lookup_elem(&race_array, &race_key);
+	bpf_map_update_elem(&race_array, &key, &init, BPF_ANY);
+	timer = bpf_map_lookup_elem(&race_array, &key);
 	if (!timer)
-		return 1;
+		return 0;
+
+	ret = bpf_timer_init(timer, &race_array, CLOCK_MONOTONIC);
+	if (ret && ret != -EBUSY)
+		return 0;
 
-	err = bpf_timer_init(timer, &race_array, CLOCK_MONOTONIC);
-	if (err && err != -EBUSY)
-		return 1;
+	if (mode == TEST_RACE_SYNC || mode == TEST_RACE_ASYNC)
+		bpf_timer_set_callback(timer, race_timer_callback);
+	else if (mode == TEST_UPDATE)
+		bpf_timer_set_callback(timer, update_self_callback);
+	else
+		bpf_timer_set_callback(timer, cancel_self_callback);
 
-	bpf_timer_set_callback(timer, race_timer_callback);
 	bpf_timer_start(timer, 0, 0);
-	if (async_cancel)
+
+	if (mode == TEST_RACE_ASYNC)
 		bpf_timer_cancel_async(timer);
-	else
+	else if (mode == TEST_RACE_SYNC)
 		bpf_timer_cancel(timer);
 
 	return 0;
 }
+
+SEC("syscall")
+int race(void *ctx)
+{
+	return test_common(async_cancel ? TEST_RACE_ASYNC : TEST_RACE_SYNC);
+}
+
+SEC("perf_event")
+int nmi_race(void *ctx)
+{
+	__sync_fetch_and_add(&test_hits, 1);
+	return test_common(TEST_RACE_ASYNC);
+}
+
+SEC("perf_event")
+int nmi_update(void *ctx)
+{
+	__sync_fetch_and_add(&test_hits, 1);
+	return test_common(TEST_UPDATE);
+}
+
+SEC("perf_event")
+int nmi_cancel(void *ctx)
+{
+	__sync_fetch_and_add(&test_hits, 1);
+	return test_common(TEST_CANCEL);
+}
-- 
cgit v1.2.3


From 3f7a8415209eeca4b1fda2a57f9d7ec49413e2eb Mon Sep 17 00:00:00 2001
From: Mykyta Yatsenko <yatsenko@meta.com>
Date: Sat, 31 Jan 2026 18:54:02 -0800
Subject: selftests/bpf: Removed obsolete tests

Now bpf_timer can be used in tracepoints, so these tests are no longer
relevant.

Signed-off-by: Mykyta Yatsenko <yatsenko@meta.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20260201025403.66625-9-alexei.starovoitov@gmail.com
---
 .../bpf/progs/verifier_helper_restricted.c         | 111 ---------------------
 1 file changed, 111 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/progs/verifier_helper_restricted.c b/tools/testing/selftests/bpf/progs/verifier_helper_restricted.c
index 059aa716e3d0..889c9b78b912 100644
--- a/tools/testing/selftests/bpf/progs/verifier_helper_restricted.c
+++ b/tools/testing/selftests/bpf/progs/verifier_helper_restricted.c
@@ -17,17 +17,6 @@ struct {
 	__type(value, struct val);
 } map_spin_lock SEC(".maps");
 
-struct timer {
-	struct bpf_timer t;
-};
-
-struct {
-	__uint(type, BPF_MAP_TYPE_ARRAY);
-	__uint(max_entries, 1);
-	__type(key, int);
-	__type(value, struct timer);
-} map_timer SEC(".maps");
-
 SEC("kprobe")
 __description("bpf_ktime_get_coarse_ns is forbidden in BPF_PROG_TYPE_KPROBE")
 __failure __msg("program of this type cannot use helper bpf_ktime_get_coarse_ns")
@@ -84,106 +73,6 @@ __naked void bpf_prog_type_raw_tracepoint_1(void)
 	: __clobber_all);
 }
 
-SEC("kprobe")
-__description("bpf_timer_init isn restricted in BPF_PROG_TYPE_KPROBE")
-__failure __msg("tracing progs cannot use bpf_timer yet")
-__naked void in_bpf_prog_type_kprobe_2(void)
-{
-	asm volatile ("					\
-	r2 = r10;					\
-	r2 += -8;					\
-	r1 = 0;						\
-	*(u64*)(r2 + 0) = r1;				\
-	r1 = %[map_timer] ll;				\
-	call %[bpf_map_lookup_elem];			\
-	if r0 == 0 goto l0_%=;				\
-	r1 = r0;					\
-	r2 = %[map_timer] ll;				\
-	r3 = 1;						\
-l0_%=:	call %[bpf_timer_init];				\
-	exit;						\
-"	:
-	: __imm(bpf_map_lookup_elem),
-	  __imm(bpf_timer_init),
-	  __imm_addr(map_timer)
-	: __clobber_all);
-}
-
-SEC("perf_event")
-__description("bpf_timer_init is forbidden in BPF_PROG_TYPE_PERF_EVENT")
-__failure __msg("tracing progs cannot use bpf_timer yet")
-__naked void bpf_prog_type_perf_event_2(void)
-{
-	asm volatile ("					\
-	r2 = r10;					\
-	r2 += -8;					\
-	r1 = 0;						\
-	*(u64*)(r2 + 0) = r1;				\
-	r1 = %[map_timer] ll;				\
-	call %[bpf_map_lookup_elem];			\
-	if r0 == 0 goto l0_%=;				\
-	r1 = r0;					\
-	r2 = %[map_timer] ll;				\
-	r3 = 1;						\
-l0_%=:	call %[bpf_timer_init];				\
-	exit;						\
-"	:
-	: __imm(bpf_map_lookup_elem),
-	  __imm(bpf_timer_init),
-	  __imm_addr(map_timer)
-	: __clobber_all);
-}
-
-SEC("tracepoint")
-__description("bpf_timer_init is forbidden in BPF_PROG_TYPE_TRACEPOINT")
-__failure __msg("tracing progs cannot use bpf_timer yet")
-__naked void in_bpf_prog_type_tracepoint_2(void)
-{
-	asm volatile ("					\
-	r2 = r10;					\
-	r2 += -8;					\
-	r1 = 0;						\
-	*(u64*)(r2 + 0) = r1;				\
-	r1 = %[map_timer] ll;				\
-	call %[bpf_map_lookup_elem];			\
-	if r0 == 0 goto l0_%=;				\
-	r1 = r0;					\
-	r2 = %[map_timer] ll;				\
-	r3 = 1;						\
-l0_%=:	call %[bpf_timer_init];				\
-	exit;						\
-"	:
-	: __imm(bpf_map_lookup_elem),
-	  __imm(bpf_timer_init),
-	  __imm_addr(map_timer)
-	: __clobber_all);
-}
-
-SEC("raw_tracepoint")
-__description("bpf_timer_init is forbidden in BPF_PROG_TYPE_RAW_TRACEPOINT")
-__failure __msg("tracing progs cannot use bpf_timer yet")
-__naked void bpf_prog_type_raw_tracepoint_2(void)
-{
-	asm volatile ("					\
-	r2 = r10;					\
-	r2 += -8;					\
-	r1 = 0;						\
-	*(u64*)(r2 + 0) = r1;				\
-	r1 = %[map_timer] ll;				\
-	call %[bpf_map_lookup_elem];			\
-	if r0 == 0 goto l0_%=;				\
-	r1 = r0;					\
-	r2 = %[map_timer] ll;				\
-	r3 = 1;						\
-l0_%=:	call %[bpf_timer_init];				\
-	exit;						\
-"	:
-	: __imm(bpf_map_lookup_elem),
-	  __imm(bpf_timer_init),
-	  __imm_addr(map_timer)
-	: __clobber_all);
-}
-
 SEC("kprobe")
 __description("bpf_spin_lock is forbidden in BPF_PROG_TYPE_KPROBE")
 __failure __msg("tracing progs cannot use bpf_spin_lock yet")
-- 
cgit v1.2.3


From b135beb07758a854160e5421b6f4d5bde72e0da6 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Sat, 31 Jan 2026 18:54:03 -0800
Subject: selftests/bpf: Add a test to stress bpf_timer_start and map_delete
 race

Add a test to stress bpf_timer_start and map_delete race

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20260201025403.66625-10-alexei.starovoitov@gmail.com
---
 .../bpf/prog_tests/timer_start_delete_race.c       | 137 +++++++++++++++++++++
 .../selftests/bpf/progs/timer_start_delete_race.c  |  66 ++++++++++
 2 files changed, 203 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/timer_start_delete_race.c
 create mode 100644 tools/testing/selftests/bpf/progs/timer_start_delete_race.c

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/timer_start_delete_race.c b/tools/testing/selftests/bpf/prog_tests/timer_start_delete_race.c
new file mode 100644
index 000000000000..29a46e96f660
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/timer_start_delete_race.c
@@ -0,0 +1,137 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */
+#define _GNU_SOURCE
+#include <sched.h>
+#include <pthread.h>
+#include <test_progs.h>
+#include "timer_start_delete_race.skel.h"
+
+/*
+ * Test for race between bpf_timer_start() and map element deletion.
+ *
+ * The race scenario:
+ * - CPU 1: bpf_timer_start() proceeds to bpf_async_process() and is about
+ *          to call hrtimer_start() but hasn't yet
+ * - CPU 2: map_delete_elem() calls __bpf_async_cancel_and_free(), since
+ *          timer is not scheduled yet hrtimer_try_to_cancel() is a nop,
+ *          then calls bpf_async_refcount_put() dropping refcnt to zero
+ *          and scheduling call_rcu_tasks_trace()
+ * - CPU 1: continues and calls hrtimer_start()
+ * - After RCU tasks trace grace period: memory is freed
+ * - Timer callback fires on freed memory: UAF!
+ *
+ * This test stresses this race by having two threads:
+ * - Thread 1: repeatedly starts timers
+ * - Thread 2: repeatedly deletes map elements
+ *
+ * KASAN should detect use-after-free.
+ */
+
+#define ITERATIONS 1000
+
+struct ctx {
+	struct timer_start_delete_race *skel;
+	volatile bool start;
+	volatile bool stop;
+	int errors;
+};
+
+static void *start_timer_thread(void *arg)
+{
+	struct ctx *ctx = arg;
+	cpu_set_t cpuset;
+	int fd, i;
+
+	CPU_ZERO(&cpuset);
+	CPU_SET(0, &cpuset);
+	pthread_setaffinity_np(pthread_self(), sizeof(cpuset), &cpuset);
+
+	while (!ctx->start && !ctx->stop)
+		usleep(1);
+	if (ctx->stop)
+		return NULL;
+
+	fd = bpf_program__fd(ctx->skel->progs.start_timer);
+
+	for (i = 0; i < ITERATIONS && !ctx->stop; i++) {
+		LIBBPF_OPTS(bpf_test_run_opts, opts);
+		int err;
+
+		err = bpf_prog_test_run_opts(fd, &opts);
+		if (err || opts.retval) {
+			ctx->errors++;
+			break;
+		}
+	}
+
+	return NULL;
+}
+
+static void *delete_elem_thread(void *arg)
+{
+	struct ctx *ctx = arg;
+	cpu_set_t cpuset;
+	int fd, i;
+
+	CPU_ZERO(&cpuset);
+	CPU_SET(1, &cpuset);
+	pthread_setaffinity_np(pthread_self(), sizeof(cpuset), &cpuset);
+
+	while (!ctx->start && !ctx->stop)
+		usleep(1);
+	if (ctx->stop)
+		return NULL;
+
+	fd = bpf_program__fd(ctx->skel->progs.delete_elem);
+
+	for (i = 0; i < ITERATIONS && !ctx->stop; i++) {
+		LIBBPF_OPTS(bpf_test_run_opts, opts);
+		int err;
+
+		err = bpf_prog_test_run_opts(fd, &opts);
+		if (err || opts.retval) {
+			ctx->errors++;
+			break;
+		}
+	}
+
+	return NULL;
+}
+
+void test_timer_start_delete_race(void)
+{
+	struct timer_start_delete_race *skel;
+	pthread_t threads[2];
+	struct ctx ctx = {};
+	int err;
+
+	skel = timer_start_delete_race__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "skel_open_and_load"))
+		return;
+
+	ctx.skel = skel;
+
+	err = pthread_create(&threads[0], NULL, start_timer_thread, &ctx);
+	if (!ASSERT_OK(err, "create start_timer_thread")) {
+		ctx.stop = true;
+		goto cleanup;
+	}
+
+	err = pthread_create(&threads[1], NULL, delete_elem_thread, &ctx);
+	if (!ASSERT_OK(err, "create delete_elem_thread")) {
+		ctx.stop = true;
+		pthread_join(threads[0], NULL);
+		goto cleanup;
+	}
+
+	ctx.start = true;
+
+	pthread_join(threads[0], NULL);
+	pthread_join(threads[1], NULL);
+
+	ASSERT_EQ(ctx.errors, 0, "thread_errors");
+
+	/* Either KASAN will catch UAF or kernel will crash or nothing happens */
+cleanup:
+	timer_start_delete_race__destroy(skel);
+}
diff --git a/tools/testing/selftests/bpf/progs/timer_start_delete_race.c b/tools/testing/selftests/bpf/progs/timer_start_delete_race.c
new file mode 100644
index 000000000000..2ff6357f33f9
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/timer_start_delete_race.c
@@ -0,0 +1,66 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */
+#include <linux/bpf.h>
+#include <time.h>
+#include <bpf/bpf_helpers.h>
+
+#define ITER_CNT 2000
+
+struct map_value {
+	struct bpf_timer timer;
+};
+
+struct {
+	__uint(type, BPF_MAP_TYPE_ARRAY);
+	__type(key, int);
+	__type(value, struct map_value);
+	__uint(max_entries, 1);
+} timer_map SEC(".maps");
+
+long cb_cnt;
+
+/*
+ * Timer callback that accesses the map value.
+ * If the race bug exists and this runs on freed memory,
+ * KASAN should detect it.
+ */
+static int timer_cb(void *map, int *key, struct map_value *value)
+{
+	__sync_fetch_and_add(&cb_cnt, 1);
+	return 0;
+}
+
+SEC("syscall")
+int start_timer(void *ctx)
+{
+	struct map_value *value;
+	int i;
+
+	for (i = 0; i < ITER_CNT; i++) {
+		int key = 0;
+
+		value = bpf_map_lookup_elem(&timer_map, &key);
+		/* hey claude, no if (!value) here. The verifier got smarter */
+
+		bpf_timer_init(&value->timer, &timer_map, CLOCK_MONOTONIC);
+		bpf_timer_set_callback(&value->timer, timer_cb);
+		bpf_timer_start(&value->timer, 100000000, 0);
+	}
+	return 0;
+}
+
+SEC("syscall")
+int delete_elem(void *ctx)
+{
+	int i;
+
+	for (i = 0; i < ITER_CNT; i++) {
+		int key = 0;
+
+		bpf_map_delete_elem(&timer_map, &key);
+	}
+
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
-- 
cgit v1.2.3


From bee60ce21b751275b3a7766f614373ef02dde512 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Mon, 2 Feb 2026 12:43:15 +0100
Subject: selftest: net: add a test-case for encap segmentation after GRO

We had a few patches in this area and no explicit coverage so far.
The test case covers the scenario addressed by the previous fix;
reusing the existing udpgro_fwd.sh script to leverage part of the
of the virtual network setup, even if such script is possibly not
a perfect fit.

Note that the mentioned script already contains several shellcheck
violation; this patch does not fix the existing code, just avoids
adding more issues in the new one.

Reviewed-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Link: https://patch.msgid.link/768ca132af81e83856e34d3105b86c37e566a7ad.1770032084.git.pabeni@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/udpgro_fwd.sh | 64 +++++++++++++++++++++++++++++++
 1 file changed, 64 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/udpgro_fwd.sh b/tools/testing/selftests/net/udpgro_fwd.sh
index a39fdc4aa2ff..9b722c1e4b0f 100755
--- a/tools/testing/selftests/net/udpgro_fwd.sh
+++ b/tools/testing/selftests/net/udpgro_fwd.sh
@@ -162,6 +162,39 @@ run_test() {
 	echo " ok"
 }
 
+run_test_csum() {
+	local -r msg="$1"
+	local -r dst="$2"
+	local csum_error_filter=UdpInCsumErrors
+	local csum_errors
+
+	printf "%-40s" "$msg"
+
+	is_ipv6 "$dst" && csum_error_filter=Udp6InCsumErrors
+
+	ip netns exec "$NS_DST" iperf3 -s -1 >/dev/null &
+	wait_local_port_listen "$NS_DST" 5201 tcp
+	local spid="$!"
+	ip netns exec "$NS_SRC" iperf3 -c "$dst" -t 2 >/dev/null
+	local retc="$?"
+	wait "$spid"
+	local rets="$?"
+	if [ "$rets" -ne 0 ] || [ "$retc" -ne 0 ]; then
+		echo " fail client exit code $retc, server $rets"
+		ret=1
+		return
+	fi
+
+	csum_errors=$(ip netns exec "$NS_DST" nstat -as "$csum_error_filter" |
+		      grep "$csum_error_filter" | awk '{print $2}')
+	if [ -n "$csum_errors" ] && [ "$csum_errors" -gt 0 ]; then
+		echo " fail - csum error on receive $csum_errors, expected 0"
+		ret=1
+		return
+	fi
+	echo " ok"
+}
+
 run_bench() {
 	local -r msg=$1
 	local -r dst=$2
@@ -260,6 +293,37 @@ for family in 4 6; do
 	ip netns exec $NS_SRC $PING -q -c 1 $OL_NET$DST_NAT >/dev/null
 	run_test "GRO fwd over UDP tunnel" $OL_NET$DST_NAT 10 10 $OL_NET$DST
 	cleanup
+
+	# force segmentation and re-aggregation
+	create_vxlan_pair
+	ip netns exec "$NS_DST" ethtool -K veth"$DST" generic-receive-offload on
+	ip netns exec "$NS_SRC" ethtool -K veth"$SRC" tso off
+	ip -n "$NS_SRC" link set dev veth"$SRC" mtu 1430
+
+	# forward to a 2nd veth pair
+	ip -n "$NS_DST" link add br0 type bridge
+	ip -n "$NS_DST" link set dev veth"$DST" master br0
+
+	# segment the aggregated TSO packet, without csum offload
+	ip -n "$NS_DST" link add veth_segment type veth peer veth_rx
+	for FEATURE in tso tx-udp-segmentation tx-checksumming; do
+		ip netns exec "$NS_DST" ethtool -K veth_segment "$FEATURE" off
+	done
+	ip -n "$NS_DST" link set dev veth_segment master br0 up
+	ip -n "$NS_DST" link set dev br0 up
+	ip -n "$NS_DST" link set dev veth_rx up
+
+	# move the lower layer IP in the last added veth
+	for ADDR in "$BM_NET_V4$DST/24" "$BM_NET_V6$DST/64"; do
+		# the dad argument will let iproute emit a unharmful warning
+		# with ipv4 addresses
+		ip -n "$NS_DST" addr del dev veth"$DST" "$ADDR"
+		ip -n "$NS_DST" addr add dev veth_rx "$ADDR" \
+			nodad 2>/dev/null
+	done
+
+	run_test_csum "GSO after GRO" "$OL_NET$DST"
+	cleanup
 done
 
 exit $ret
-- 
cgit v1.2.3


From 52940a34a85bc8a17a095f6fae80c33a18c1f7ec Mon Sep 17 00:00:00 2001
From: Claudio Imbrenda <imbrenda@linux.ibm.com>
Date: Wed, 4 Feb 2026 16:02:58 +0100
Subject: KVM: s390: selftests: Add selftest for the KVM_S390_KEYOP ioctl

This test allows to test the various storage key handling functions.

Acked-by: Heiko Carstens <hca@linux.ibm.com>
Signed-off-by: Claudio Imbrenda <imbrenda@linux.ibm.com>
---
 tools/testing/selftests/kvm/Makefile.kvm |   1 +
 tools/testing/selftests/kvm/s390/keyop.c | 299 +++++++++++++++++++++++++++++++
 2 files changed, 300 insertions(+)
 create mode 100644 tools/testing/selftests/kvm/s390/keyop.c

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm
index ba5c2b643efa..2e4774666723 100644
--- a/tools/testing/selftests/kvm/Makefile.kvm
+++ b/tools/testing/selftests/kvm/Makefile.kvm
@@ -199,6 +199,7 @@ TEST_GEN_PROGS_s390 += s390/cpumodel_subfuncs_test
 TEST_GEN_PROGS_s390 += s390/shared_zeropage_test
 TEST_GEN_PROGS_s390 += s390/ucontrol_test
 TEST_GEN_PROGS_s390 += s390/user_operexec
+TEST_GEN_PROGS_s390 += s390/keyop
 TEST_GEN_PROGS_s390 += rseq_test
 
 TEST_GEN_PROGS_riscv = $(TEST_GEN_PROGS_COMMON)
diff --git a/tools/testing/selftests/kvm/s390/keyop.c b/tools/testing/selftests/kvm/s390/keyop.c
new file mode 100644
index 000000000000..c7805e87d12c
--- /dev/null
+++ b/tools/testing/selftests/kvm/s390/keyop.c
@@ -0,0 +1,299 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Test for s390x KVM_S390_KEYOP
+ *
+ * Copyright IBM Corp. 2026
+ *
+ * Authors:
+ *  Claudio Imbrenda <imbrenda@linux.ibm.com>
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+
+#include <linux/bits.h>
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "kselftest.h"
+#include "processor.h"
+
+#define BUF_PAGES 128UL
+#define GUEST_PAGES 256UL
+
+#define BUF_START_GFN	(GUEST_PAGES - BUF_PAGES)
+#define BUF_START_ADDR	(BUF_START_GFN << PAGE_SHIFT)
+
+#define KEY_BITS_ACC	0xf0
+#define KEY_BIT_F	0x08
+#define KEY_BIT_R	0x04
+#define KEY_BIT_C	0x02
+
+#define KEY_BITS_RC	(KEY_BIT_R | KEY_BIT_C)
+#define KEY_BITS_ALL	(KEY_BITS_ACC | KEY_BIT_F | KEY_BITS_RC)
+
+static unsigned char tmp[BUF_PAGES];
+static unsigned char old[BUF_PAGES];
+static unsigned char expected[BUF_PAGES];
+
+static int _get_skeys(struct kvm_vcpu *vcpu, unsigned char skeys[])
+{
+	struct kvm_s390_skeys skeys_ioctl = {
+		.start_gfn = BUF_START_GFN,
+		.count = BUF_PAGES,
+		.skeydata_addr = (unsigned long)skeys,
+	};
+
+	return __vm_ioctl(vcpu->vm, KVM_S390_GET_SKEYS, &skeys_ioctl);
+}
+
+static void get_skeys(struct kvm_vcpu *vcpu, unsigned char skeys[])
+{
+	int r = _get_skeys(vcpu, skeys);
+
+	TEST_ASSERT(!r, "Failed to get storage keys, r=%d", r);
+}
+
+static void set_skeys(struct kvm_vcpu *vcpu, unsigned char skeys[])
+{
+	struct kvm_s390_skeys skeys_ioctl = {
+		.start_gfn = BUF_START_GFN,
+		.count = BUF_PAGES,
+		.skeydata_addr = (unsigned long)skeys,
+	};
+	int r;
+
+	r = __vm_ioctl(vcpu->vm, KVM_S390_SET_SKEYS, &skeys_ioctl);
+	TEST_ASSERT(!r, "Failed to set storage keys, r=%d", r);
+}
+
+static int do_keyop(struct kvm_vcpu *vcpu, int op, unsigned long page_idx, unsigned char skey)
+{
+	struct kvm_s390_keyop keyop = {
+		.guest_addr = BUF_START_ADDR + page_idx * PAGE_SIZE,
+		.key = skey,
+		.operation = op,
+	};
+	int r;
+
+	r = __vm_ioctl(vcpu->vm, KVM_S390_KEYOP, &keyop);
+	TEST_ASSERT(!r, "Failed to perform keyop, r=%d", r);
+	TEST_ASSERT((keyop.key & 1) == 0,
+		    "Last bit of key is 1, should be 0! page %lu, new key=%#x, old key=%#x",
+		    page_idx, skey, keyop.key);
+
+	return keyop.key;
+}
+
+static void fault_in_buffer(struct kvm_vcpu *vcpu, int where, int cur_loc)
+{
+	unsigned long i;
+	int r;
+
+	if (where != cur_loc)
+		return;
+
+	for (i = 0; i < BUF_PAGES; i++) {
+		r = ioctl(vcpu->fd, KVM_S390_VCPU_FAULT, BUF_START_ADDR + i * PAGE_SIZE);
+		TEST_ASSERT(!r, "Faulting in buffer page %lu, r=%d", i, r);
+	}
+}
+
+static inline void set_pattern(unsigned char skeys[])
+{
+	int i;
+
+	for (i = 0; i < BUF_PAGES; i++)
+		skeys[i] = i << 1;
+}
+
+static void dump_sk(const unsigned char skeys[], const char *descr)
+{
+	int i, j;
+
+	fprintf(stderr, "# %s:\n", descr);
+	for (i = 0; i < BUF_PAGES; i += 32) {
+		fprintf(stderr, "# %3d: ", i);
+		for (j = 0; j < 32; j++)
+			fprintf(stderr, "%02x ", skeys[i + j]);
+		fprintf(stderr, "\n");
+	}
+}
+
+static inline void compare(const unsigned char what[], const unsigned char expected[],
+			   const char *descr, int fault_in_loc)
+{
+	int i;
+
+	for (i = 0; i < BUF_PAGES; i++) {
+		if (expected[i] != what[i]) {
+			dump_sk(expected, "Expected");
+			dump_sk(what, "Got");
+		}
+		TEST_ASSERT(expected[i] == what[i],
+			    "%s! fault-in location %d, page %d, expected %#x, got %#x",
+			    descr, fault_in_loc, i, expected[i], what[i]);
+	}
+}
+
+static inline void clear_all(void)
+{
+	memset(tmp, 0, BUF_PAGES);
+	memset(old, 0, BUF_PAGES);
+	memset(expected, 0, BUF_PAGES);
+}
+
+static void test_init(struct kvm_vcpu *vcpu, int fault_in)
+{
+	/* Set all storage keys to zero */
+	fault_in_buffer(vcpu, fault_in, 1);
+	set_skeys(vcpu, expected);
+
+	fault_in_buffer(vcpu, fault_in, 2);
+	get_skeys(vcpu, tmp);
+	compare(tmp, expected, "Setting keys not zero", fault_in);
+
+	/* Set storage keys to a sequential pattern */
+	fault_in_buffer(vcpu, fault_in, 3);
+	set_pattern(expected);
+	set_skeys(vcpu, expected);
+
+	fault_in_buffer(vcpu, fault_in, 4);
+	get_skeys(vcpu, tmp);
+	compare(tmp, expected, "Setting storage keys failed", fault_in);
+}
+
+static void test_rrbe(struct kvm_vcpu *vcpu, int fault_in)
+{
+	unsigned char k;
+	int i;
+
+	/* Set storage keys to a sequential pattern */
+	fault_in_buffer(vcpu, fault_in, 1);
+	set_pattern(expected);
+	set_skeys(vcpu, expected);
+
+	/* Call the RRBE KEYOP ioctl on each page and verify the result */
+	fault_in_buffer(vcpu, fault_in, 2);
+	for (i = 0; i < BUF_PAGES; i++) {
+		k = do_keyop(vcpu, KVM_S390_KEYOP_RRBE, i, 0xff);
+		TEST_ASSERT((expected[i] & KEY_BITS_RC) == k,
+			    "Old R or C value mismatch! expected: %#x, got %#x",
+			    expected[i] & KEY_BITS_RC, k);
+		if (i == BUF_PAGES / 2)
+			fault_in_buffer(vcpu, fault_in, 3);
+	}
+
+	for (i = 0; i < BUF_PAGES; i++)
+		expected[i] &= ~KEY_BIT_R;
+
+	/* Verify that only the R bit has been cleared */
+	fault_in_buffer(vcpu, fault_in, 4);
+	get_skeys(vcpu, tmp);
+	compare(tmp, expected, "New value mismatch", fault_in);
+}
+
+static void test_iske(struct kvm_vcpu *vcpu, int fault_in)
+{
+	int i;
+
+	/* Set storage keys to a sequential pattern */
+	fault_in_buffer(vcpu, fault_in, 1);
+	set_pattern(expected);
+	set_skeys(vcpu, expected);
+
+	/* Call the ISKE KEYOP ioctl on each page and verify the result */
+	fault_in_buffer(vcpu, fault_in, 2);
+	for (i = 0; i < BUF_PAGES; i++) {
+		tmp[i] = do_keyop(vcpu, KVM_S390_KEYOP_ISKE, i, 0xff);
+		if (i == BUF_PAGES / 2)
+			fault_in_buffer(vcpu, fault_in, 3);
+	}
+	compare(tmp, expected, "Old value mismatch", fault_in);
+
+	/* Check storage keys have not changed */
+	fault_in_buffer(vcpu, fault_in, 4);
+	get_skeys(vcpu, tmp);
+	compare(tmp, expected, "Storage keys values changed", fault_in);
+}
+
+static void test_sske(struct kvm_vcpu *vcpu, int fault_in)
+{
+	int i;
+
+	/* Set storage keys to a sequential pattern */
+	fault_in_buffer(vcpu, fault_in, 1);
+	set_pattern(tmp);
+	set_skeys(vcpu, tmp);
+
+	/* Call the SSKE KEYOP ioctl on each page and verify the result */
+	fault_in_buffer(vcpu, fault_in, 2);
+	for (i = 0; i < BUF_PAGES; i++) {
+		expected[i] = ~tmp[i] & KEY_BITS_ALL;
+		/* Set the new storage keys to be the bit-inversion of the previous ones */
+		old[i] = do_keyop(vcpu, KVM_S390_KEYOP_SSKE, i, expected[i] | 1);
+		if (i == BUF_PAGES / 2)
+			fault_in_buffer(vcpu, fault_in, 3);
+	}
+	compare(old, tmp, "Old value mismatch", fault_in);
+
+	/* Verify that the storage keys have been set correctly */
+	fault_in_buffer(vcpu, fault_in, 4);
+	get_skeys(vcpu, tmp);
+	compare(tmp, expected, "New value mismatch", fault_in);
+}
+
+static struct testdef {
+	const char *name;
+	void (*test)(struct kvm_vcpu *vcpu, int fault_in_location);
+	int n_fault_in_locations;
+} testplan[] = {
+	{ "Initialization", test_init, 5 },
+	{ "RRBE", test_rrbe, 5 },
+	{ "ISKE", test_iske, 5 },
+	{ "SSKE", test_sske, 5 },
+};
+
+static void run_test(void (*the_test)(struct kvm_vcpu *, int), int fault_in_location)
+{
+	struct kvm_vcpu *vcpu;
+	struct kvm_vm *vm;
+	int r;
+
+	vm = vm_create_barebones();
+	vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, 0, 0, GUEST_PAGES, 0);
+	vcpu = __vm_vcpu_add(vm, 0);
+
+	r = _get_skeys(vcpu, tmp);
+	TEST_ASSERT(r == KVM_S390_GET_SKEYS_NONE,
+		    "Storage keys are not disabled initially, r=%d", r);
+
+	clear_all();
+
+	the_test(vcpu, fault_in_location);
+
+	kvm_vm_free(vm);
+}
+
+int main(int argc, char *argv[])
+{
+	int i, f;
+
+	TEST_REQUIRE(kvm_has_cap(KVM_CAP_S390_KEYOP));
+	TEST_REQUIRE(kvm_has_cap(KVM_CAP_S390_UCONTROL));
+
+	ksft_print_header();
+	for (i = 0, f = 0; i < ARRAY_SIZE(testplan); i++)
+		f += testplan[i].n_fault_in_locations;
+	ksft_set_plan(f);
+
+	for (i = 0; i < ARRAY_SIZE(testplan); i++) {
+		for (f = 0; f < testplan[i].n_fault_in_locations; f++) {
+			run_test(testplan[i].test, f);
+			ksft_test_result_pass("%s (fault-in location %d)\n", testplan[i].name, f);
+		}
+	}
+
+	ksft_finished();	/* Print results and exit() accordingly */
+}
-- 
cgit v1.2.3


From af74daf91652f15b82560bb93850d2ec8bbfa976 Mon Sep 17 00:00:00 2001
From: Robert Richter <rrichter@amd.com>
Date: Tue, 27 Jan 2026 11:12:31 -0700
Subject: cxl: Enable AMD Zen5 address translation using ACPI PRMT
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add AMD Zen5 support for address translation.

Zen5 systems may be configured to use 'Normalized addresses'. Then,
host physical addresses (HPA) are different from their system physical
addresses (SPA). The endpoint has its own physical address space and
an incoming HPA is already converted to the device's physical address
(DPA). Thus it has interleaving disabled and CXL endpoints are
programmed passthrough (DPA == HPA).

Host Physical Addresses (HPAs) need to be translated from the endpoint
to its CXL host bridge, esp. to identify the endpoint's root decoder
and region's address range. ACPI Platform Runtime Mechanism (PRM)
provides a handler to translate the DPA to its SPA. This is documented
in:

 AMD Family 1Ah Models 00h–0Fh and Models 10h–1Fh
 ACPI v6.5 Porting Guide, Publication # 58088
 https://www.amd.com/en/search/documentation/hub.html

With Normalized Addressing this PRM handler must be used to translate
an HPA of an endpoint to its SPA.

Do the following to implement AMD Zen5 address translation:

Introduce a new file core/atl.c to handle ACPI PRM specific address
translation code. Naming is loosely related to the kernel's AMD
Address Translation Library (CONFIG_AMD_ATL) but implementation does
not depend on it, nor it is vendor specific. Use Kbuild and Kconfig
options respectively to enable the code depending on architecture and
platform options.

AMD Zen5 systems support the ACPI PRM CXL Address Translation firmware
call (see ACPI v6.5 Porting Guide, Address Translation - CXL DPA to
System Physical Address). Firmware enables the PRM handler if the
platform has address translation implemented. Check firmware and
kernel support of ACPI PRM using the specific GUID. On success enable
address translation by setting up the earlier introduced root port
callback, see function cxl_prm_setup_translation(). Setup is done in
cxl_setup_prm_address_translation(), it is the only function that
needs to be exported. For low level PRM firmware calls, use the ACPI
framework.

Identify the region's interleaving ways by inspecting the address
ranges. Also determine the interleaving granularity using the address
translation callback. Note that the position of the chunk from one
interleaving block to the next may vary and thus cannot be considered
constant. Address offsets larger than the interleaving block size
cannot be used to calculate the granularity. Thus, probe the
granularity using address translation for various HPAs in the same
interleaving block.

[ dj: Add atl.o build to cxl_test ]

Reviewed-by: Dave Jiang <dave.jiang@intel.com>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Tested-by: Gregory Price <gourry@gourry.net>
Signed-off-by: Robert Richter <rrichter@amd.com>
Link: https://patch.msgid.link/20260114164837.1076338-11-rrichter@amd.com
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
---
 drivers/cxl/Kconfig       |   5 ++
 drivers/cxl/acpi.c        |   2 +
 drivers/cxl/core/Makefile |   1 +
 drivers/cxl/core/atl.c    | 190 ++++++++++++++++++++++++++++++++++++++++++++++
 drivers/cxl/cxl.h         |   7 ++
 tools/testing/cxl/Kbuild  |   1 +
 6 files changed, 206 insertions(+)
 create mode 100644 drivers/cxl/core/atl.c

(limited to 'tools')

diff --git a/drivers/cxl/Kconfig b/drivers/cxl/Kconfig
index 48b7314afdb8..103950a9b73e 100644
--- a/drivers/cxl/Kconfig
+++ b/drivers/cxl/Kconfig
@@ -233,4 +233,9 @@ config CXL_MCE
 	def_bool y
 	depends on X86_MCE && MEMORY_FAILURE
 
+config CXL_ATL
+	def_bool y
+	depends on CXL_REGION
+	depends on ACPI_PRMT && AMD_NB
+
 endif
diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c
index a31d0f97f916..50c2987e0459 100644
--- a/drivers/cxl/acpi.c
+++ b/drivers/cxl/acpi.c
@@ -925,6 +925,8 @@ static int cxl_acpi_probe(struct platform_device *pdev)
 	cxl_root->ops.qos_class = cxl_acpi_qos_class;
 	root_port = &cxl_root->port;
 
+	cxl_setup_prm_address_translation(cxl_root);
+
 	rc = bus_for_each_dev(adev->dev.bus, NULL, root_port,
 			      add_host_bridge_dport);
 	if (rc < 0)
diff --git a/drivers/cxl/core/Makefile b/drivers/cxl/core/Makefile
index 5ad8fef210b5..11fe272a6e29 100644
--- a/drivers/cxl/core/Makefile
+++ b/drivers/cxl/core/Makefile
@@ -20,3 +20,4 @@ cxl_core-$(CONFIG_CXL_REGION) += region.o
 cxl_core-$(CONFIG_CXL_MCE) += mce.o
 cxl_core-$(CONFIG_CXL_FEATURES) += features.o
 cxl_core-$(CONFIG_CXL_EDAC_MEM_FEATURES) += edac.o
+cxl_core-$(CONFIG_CXL_ATL) += atl.o
diff --git a/drivers/cxl/core/atl.c b/drivers/cxl/core/atl.c
new file mode 100644
index 000000000000..c36984686fb0
--- /dev/null
+++ b/drivers/cxl/core/atl.c
@@ -0,0 +1,190 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2025 Advanced Micro Devices, Inc.
+ */
+
+#include <linux/prmt.h>
+#include <linux/pci.h>
+#include <linux/acpi.h>
+
+#include <cxlmem.h>
+#include "core.h"
+
+/*
+ * PRM Address Translation - CXL DPA to System Physical Address
+ *
+ * Reference:
+ *
+ * AMD Family 1Ah Models 00h–0Fh and Models 10h–1Fh
+ * ACPI v6.5 Porting Guide, Publication # 58088
+ */
+
+static const guid_t prm_cxl_dpa_spa_guid =
+	GUID_INIT(0xee41b397, 0x25d4, 0x452c, 0xad, 0x54, 0x48, 0xc6, 0xe3,
+		  0x48, 0x0b, 0x94);
+
+struct prm_cxl_dpa_spa_data {
+	u64 dpa;
+	u8 reserved;
+	u8 devfn;
+	u8 bus;
+	u8 segment;
+	u64 *spa;
+} __packed;
+
+static u64 prm_cxl_dpa_spa(struct pci_dev *pci_dev, u64 dpa)
+{
+	struct prm_cxl_dpa_spa_data data;
+	u64 spa;
+	int rc;
+
+	data = (struct prm_cxl_dpa_spa_data) {
+		.dpa     = dpa,
+		.devfn   = pci_dev->devfn,
+		.bus     = pci_dev->bus->number,
+		.segment = pci_domain_nr(pci_dev->bus),
+		.spa     = &spa,
+	};
+
+	rc = acpi_call_prm_handler(prm_cxl_dpa_spa_guid, &data);
+	if (rc) {
+		pci_dbg(pci_dev, "failed to get SPA for %#llx: %d\n", dpa, rc);
+		return ULLONG_MAX;
+	}
+
+	pci_dbg(pci_dev, "PRM address translation: DPA -> SPA: %#llx -> %#llx\n", dpa, spa);
+
+	return spa;
+}
+
+static int cxl_prm_setup_root(struct cxl_root *cxl_root, void *data)
+{
+	struct cxl_region_context *ctx = data;
+	struct cxl_endpoint_decoder *cxled = ctx->cxled;
+	struct cxl_decoder *cxld = &cxled->cxld;
+	struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
+	struct range hpa_range = ctx->hpa_range;
+	struct pci_dev *pci_dev;
+	u64 spa_len, len;
+	u64 addr, base_spa, base;
+	int ways, gran;
+
+	/*
+	 * When Normalized Addressing is enabled, the endpoint maintains a 1:1
+	 * mapping between HPA and DPA. If disabled, skip address translation
+	 * and perform only a range check.
+	 */
+	if (hpa_range.start != cxled->dpa_res->start)
+		return 0;
+
+	/*
+	 * Endpoints are programmed passthrough in Normalized Addressing mode.
+	 */
+	if (ctx->interleave_ways != 1) {
+		dev_dbg(&cxld->dev, "unexpected interleaving config: ways: %d granularity: %d\n",
+			ctx->interleave_ways, ctx->interleave_granularity);
+		return -ENXIO;
+	}
+
+	if (!cxlmd || !dev_is_pci(cxlmd->dev.parent)) {
+		dev_dbg(&cxld->dev, "No endpoint found: %s, range %#llx-%#llx\n",
+			dev_name(cxld->dev.parent), hpa_range.start,
+			hpa_range.end);
+		return -ENXIO;
+	}
+
+	pci_dev = to_pci_dev(cxlmd->dev.parent);
+
+	/* Translate HPA range to SPA. */
+	base = hpa_range.start;
+	hpa_range.start = prm_cxl_dpa_spa(pci_dev, hpa_range.start);
+	hpa_range.end = prm_cxl_dpa_spa(pci_dev, hpa_range.end);
+	base_spa = hpa_range.start;
+
+	if (hpa_range.start == ULLONG_MAX || hpa_range.end == ULLONG_MAX) {
+		dev_dbg(cxld->dev.parent,
+			"CXL address translation: Failed to translate HPA range: %#llx-%#llx:%#llx-%#llx(%s)\n",
+			hpa_range.start, hpa_range.end, ctx->hpa_range.start,
+			ctx->hpa_range.end, dev_name(&cxld->dev));
+		return -ENXIO;
+	}
+
+	/*
+	 * Since translated addresses include the interleaving offsets, align
+	 * the range to 256 MB.
+	 */
+	hpa_range.start = ALIGN_DOWN(hpa_range.start, SZ_256M);
+	hpa_range.end = ALIGN(hpa_range.end, SZ_256M) - 1;
+
+	len = range_len(&ctx->hpa_range);
+	spa_len = range_len(&hpa_range);
+	if (!len || !spa_len || spa_len % len) {
+		dev_dbg(cxld->dev.parent,
+			"CXL address translation: HPA range not contiguous: %#llx-%#llx:%#llx-%#llx(%s)\n",
+			hpa_range.start, hpa_range.end, ctx->hpa_range.start,
+			ctx->hpa_range.end, dev_name(&cxld->dev));
+		return -ENXIO;
+	}
+
+	ways = spa_len / len;
+	gran = SZ_256;
+
+	/*
+	 * Determine interleave granularity
+	 *
+	 * Note: The position of the chunk from one interleaving block to the
+	 * next may vary and thus cannot be considered constant. Address offsets
+	 * larger than the interleaving block size cannot be used to calculate
+	 * the granularity.
+	 */
+	if (ways > 1) {
+		while (gran <= SZ_16M) {
+			addr = prm_cxl_dpa_spa(pci_dev, base + gran);
+			if (addr != base_spa + gran)
+				break;
+			gran <<= 1;
+		}
+	}
+
+	if (gran > SZ_16M) {
+		dev_dbg(cxld->dev.parent,
+			"CXL address translation: Cannot determine granularity: %#llx-%#llx:%#llx-%#llx(%s)\n",
+			hpa_range.start, hpa_range.end, ctx->hpa_range.start,
+			ctx->hpa_range.end, dev_name(&cxld->dev));
+		return -ENXIO;
+	}
+
+	ctx->hpa_range = hpa_range;
+	ctx->interleave_ways = ways;
+	ctx->interleave_granularity = gran;
+
+	dev_dbg(&cxld->dev,
+		"address mapping found for %s (hpa -> spa): %#llx+%#llx -> %#llx+%#llx ways:%d granularity:%d\n",
+		dev_name(cxlmd->dev.parent), base, len, hpa_range.start,
+		spa_len, ways, gran);
+
+	return 0;
+}
+
+void cxl_setup_prm_address_translation(struct cxl_root *cxl_root)
+{
+	struct device *host = cxl_root->port.uport_dev;
+	u64 spa;
+	struct prm_cxl_dpa_spa_data data = { .spa = &spa };
+	int rc;
+
+	/*
+	 * Applies only to PCIe Host Bridges which are children of the CXL Root
+	 * Device (HID=“ACPI0017”). Check this and drop cxl_test instances.
+	 */
+	if (!acpi_match_device(host->driver->acpi_match_table, host))
+		return;
+
+	/* Check kernel (-EOPNOTSUPP) and firmware support (-ENODEV) */
+	rc = acpi_call_prm_handler(prm_cxl_dpa_spa_guid, &data);
+	if (rc == -EOPNOTSUPP || rc == -ENODEV)
+		return;
+
+	cxl_root->ops.translation_setup_root = cxl_prm_setup_root;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_setup_prm_address_translation, "CXL");
diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
index 8ea334d81edf..20b0fd43fa7b 100644
--- a/drivers/cxl/cxl.h
+++ b/drivers/cxl/cxl.h
@@ -817,6 +817,13 @@ static inline void cxl_dport_init_ras_reporting(struct cxl_dport *dport,
 						struct device *host) { }
 #endif
 
+#ifdef CONFIG_CXL_ATL
+void cxl_setup_prm_address_translation(struct cxl_root *cxl_root);
+#else
+static inline
+void cxl_setup_prm_address_translation(struct cxl_root *cxl_root) {}
+#endif
+
 struct cxl_decoder *to_cxl_decoder(struct device *dev);
 struct cxl_root_decoder *to_cxl_root_decoder(struct device *dev);
 struct cxl_switch_decoder *to_cxl_switch_decoder(struct device *dev);
diff --git a/tools/testing/cxl/Kbuild b/tools/testing/cxl/Kbuild
index 0e151d0572d1..612d8edbfc6f 100644
--- a/tools/testing/cxl/Kbuild
+++ b/tools/testing/cxl/Kbuild
@@ -63,6 +63,7 @@ cxl_core-$(CONFIG_CXL_REGION) += $(CXL_CORE_SRC)/region.o
 cxl_core-$(CONFIG_CXL_MCE) += $(CXL_CORE_SRC)/mce.o
 cxl_core-$(CONFIG_CXL_FEATURES) += $(CXL_CORE_SRC)/features.o
 cxl_core-$(CONFIG_CXL_EDAC_MEM_FEATURES) += $(CXL_CORE_SRC)/edac.o
+cxl_core-$(CONFIG_CXL_ATL) += $(CXL_CORE_SRC)/atl.o
 cxl_core-y += config_check.o
 cxl_core-y += cxl_core_test.o
 cxl_core-y += cxl_core_exports.o
-- 
cgit v1.2.3


From 5e6e1dc43a217624087ce45bafd20ac2cfb3c190 Mon Sep 17 00:00:00 2001
From: Donglin Peng <pengdonglin@xiaomi.com>
Date: Mon, 2 Feb 2026 20:01:14 +0800
Subject: resolve_btfids: Refactor the sort_btf_by_name function

Preserve original relative order of anonymous or same-named
types to improve the consistency.

No functional changes.

Signed-off-by: Donglin Peng <pengdonglin@xiaomi.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20260202120114.3707141-1-dolinux.peng@gmail.com
---
 tools/bpf/resolve_btfids/main.c | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

(limited to 'tools')

diff --git a/tools/bpf/resolve_btfids/main.c b/tools/bpf/resolve_btfids/main.c
index db8d1554bdcc..ca7fcd03efb6 100644
--- a/tools/bpf/resolve_btfids/main.c
+++ b/tools/bpf/resolve_btfids/main.c
@@ -1226,22 +1226,26 @@ static int cmp_type_names(const void *a, const void *b, void *priv)
 	const struct btf_type *ta = btf__type_by_id(btf, *(__u32 *)a);
 	const struct btf_type *tb = btf__type_by_id(btf, *(__u32 *)b);
 	const char *na, *nb;
+	int r;
 
 	na = btf__str_by_offset(btf, ta->name_off);
 	nb = btf__str_by_offset(btf, tb->name_off);
-	return strcmp(na, nb);
+	r = strcmp(na, nb);
+	if (r != 0)
+		return r;
+
+	/* preserve original relative order of anonymous or same-named types */
+	return *(__u32 *)a < *(__u32 *)b ? -1 : 1;
 }
 
 static int sort_btf_by_name(struct btf *btf)
 {
 	__u32 *permute_ids = NULL, *id_map = NULL;
 	int nr_types, i, err = 0;
-	__u32 start_id = 0, start_offs = 1, id;
+	__u32 start_id = 0, id;
 
-	if (btf__base_btf(btf)) {
+	if (btf__base_btf(btf))
 		start_id = btf__type_cnt(btf__base_btf(btf));
-		start_offs = 0;
-	}
 	nr_types = btf__type_cnt(btf) - start_id;
 
 	permute_ids = calloc(nr_types, sizeof(*permute_ids));
@@ -1259,8 +1263,8 @@ static int sort_btf_by_name(struct btf *btf)
 	for (i = 0, id = start_id; i < nr_types; i++, id++)
 		permute_ids[i] = id;
 
-	qsort_r(permute_ids + start_offs, nr_types - start_offs,
-		sizeof(*permute_ids), cmp_type_names, btf);
+	qsort_r(permute_ids, nr_types, sizeof(*permute_ids), cmp_type_names,
+		btf);
 
 	for (i = 0; i < nr_types; i++) {
 		id = permute_ids[i] - start_id;
-- 
cgit v1.2.3


From 16cc8f249c702b7cbb4c2c2be7cd8f4fdd5d1d0c Mon Sep 17 00:00:00 2001
From: Len Brown <len.brown@intel.com>
Date: Wed, 4 Feb 2026 14:41:53 -0600
Subject: tools/power turbostat: AMD: msr offset 0x611 read failed:
 Input/output error

Turbostat exits during RAPL probe with:

turbostat: cpu0: msr offset 0x611 read failed: Input/output error

A binary with this bug can be used successfully with
the option "--no-msr"

Fix this regression by trusting the static AMD RAPL MSR offset.

Fixes: 19476a592bf2 ("tools/power turbostat: Validate RAPL MSRs for AWS Nitro Hypervisor")
Signed-off-by: Len Brown <len.brown@intel.com>
---
 tools/power/x86/turbostat/turbostat.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c
index 5ad45c2ac5bd..c4c8b6315fd2 100644
--- a/tools/power/x86/turbostat/turbostat.c
+++ b/tools/power/x86/turbostat/turbostat.c
@@ -2135,7 +2135,7 @@ off_t idx_to_offset(int idx)
 
 	switch (idx) {
 	case IDX_PKG_ENERGY:
-		if (valid_rapl_msrs & RAPL_AMD_F17H)
+		if (platform->plat_rapl_msrs & RAPL_AMD_F17H)
 			offset = MSR_PKG_ENERGY_STAT;
 		else
 			offset = MSR_PKG_ENERGY_STATUS;
-- 
cgit v1.2.3


From bd1b6608d7640aa8b1f1afa831331df7c13184ad Mon Sep 17 00:00:00 2001
From: Len Brown <len.brown@intel.com>
Date: Fri, 5 Dec 2025 11:26:40 -0500
Subject: tools/power turbostat: Dump CPUID(1) consistently with CPUID(6)

We dumped selected CPUID(1) features using a format that showed '-'
for a missing feature.  Not so friendly to parse a bunch of dashes
when features are missing...

For CPUID(1) adopt the format we used for CPUID(6): 'No-FEATURE'
means that 'FEATURE' is not present.

Signed-off-by: Len Brown <len.brown@intel.com>
---
 tools/power/x86/turbostat/turbostat.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

(limited to 'tools')

diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c
index c4c8b6315fd2..a209c5d87ff7 100644
--- a/tools/power/x86/turbostat/turbostat.c
+++ b/tools/power/x86/turbostat/turbostat.c
@@ -8826,15 +8826,15 @@ void process_cpuid()
 		fputc('\n', outf);
 
 		fprintf(outf, "CPUID(0x80000000): max_extended_levels: 0x%x\n", max_extended_level);
-		fprintf(outf, "CPUID(1): %s %s %s %s %s %s %s %s %s %s\n",
-			ecx_flags & (1 << 0) ? "SSE3" : "-",
-			ecx_flags & (1 << 3) ? "MONITOR" : "-",
-			ecx_flags & (1 << 6) ? "SMX" : "-",
-			ecx_flags & (1 << 7) ? "EIST" : "-",
-			ecx_flags & (1 << 8) ? "TM2" : "-",
-			edx_flags & (1 << 4) ? "TSC" : "-",
-			edx_flags & (1 << 5) ? "MSR" : "-",
-			edx_flags & (1 << 22) ? "ACPI-TM" : "-", edx_flags & (1 << 28) ? "HT" : "-", edx_flags & (1 << 29) ? "TM" : "-");
+		fprintf(outf, "CPUID(1): %sSSE3 %sMONITOR %sSMX %sEIST %sTM2 %sTSC %sMSR %sACPI-TM %sHT %sTM\n",
+			ecx_flags & (1 << 0) ? "" : "No-",
+			ecx_flags & (1 << 3) ? "" : "No-",
+			ecx_flags & (1 << 6) ? "" : "No-",
+			ecx_flags & (1 << 7) ? "" : "No-",
+			ecx_flags & (1 << 8) ? "" : "No-",
+			edx_flags & (1 << 4) ? "" : "No-",
+			edx_flags & (1 << 5) ? "" : "No-",
+			edx_flags & (1 << 22) ? "" : "No-", edx_flags & (1 << 28) ? "" : "No-", edx_flags & (1 << 29) ? "" : "No-");
 	}
 
 	probe_platform_features(family, model);
-- 
cgit v1.2.3


From 00b9e2a7fa671e960f8d708b06fb6b4b87ffec15 Mon Sep 17 00:00:00 2001
From: Len Brown <len.brown@intel.com>
Date: Fri, 5 Dec 2025 11:38:27 -0500
Subject: tools/power turbostat: Dump CPUID.1.ECX[31] (Hypervisor)

Both Intel and AMD use CPUID.1.ECX[31] to distinguish
between hypervisor and real hardware.
Indicate "HV" or "No-HV" accordingly.

Signed-off-by: Len Brown <len.brown@intel.com>
---
 tools/power/x86/turbostat/turbostat.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c
index a209c5d87ff7..0ef36f0330a9 100644
--- a/tools/power/x86/turbostat/turbostat.c
+++ b/tools/power/x86/turbostat/turbostat.c
@@ -498,6 +498,7 @@ unsigned int list_header_only;
 unsigned int dump_only;
 unsigned int force_load;
 unsigned int cpuid_has_aperf_mperf;
+unsigned int cpuid_has_hv;
 unsigned int has_aperf_access;
 unsigned int has_epb;
 unsigned int has_turbo;
@@ -8803,6 +8804,7 @@ void process_cpuid()
 		model += ((fms >> 16) & 0xf) << 4;
 	ecx_flags = ecx;
 	edx_flags = edx;
+	cpuid_has_hv = ecx_flags & (1 << 31);
 
 	if (!no_msr) {
 		if (get_msr(sched_getcpu(), MSR_IA32_UCODE_REV, &ucode_patch))
@@ -8826,12 +8828,13 @@ void process_cpuid()
 		fputc('\n', outf);
 
 		fprintf(outf, "CPUID(0x80000000): max_extended_levels: 0x%x\n", max_extended_level);
-		fprintf(outf, "CPUID(1): %sSSE3 %sMONITOR %sSMX %sEIST %sTM2 %sTSC %sMSR %sACPI-TM %sHT %sTM\n",
+		fprintf(outf, "CPUID(1): %sSSE3 %sMONITOR %sSMX %sEIST %sTM2 %sHV %sTSC %sMSR %sACPI-TM %sHT %sTM\n",
 			ecx_flags & (1 << 0) ? "" : "No-",
 			ecx_flags & (1 << 3) ? "" : "No-",
 			ecx_flags & (1 << 6) ? "" : "No-",
 			ecx_flags & (1 << 7) ? "" : "No-",
 			ecx_flags & (1 << 8) ? "" : "No-",
+			cpuid_has_hv ? "" : "No-",
 			edx_flags & (1 << 4) ? "" : "No-",
 			edx_flags & (1 << 5) ? "" : "No-",
 			edx_flags & (1 << 22) ? "" : "No-", edx_flags & (1 << 28) ? "" : "No-", edx_flags & (1 << 29) ? "" : "No-");
@@ -10145,7 +10148,7 @@ int get_and_dump_counters(void)
 
 void print_version()
 {
-	fprintf(outf, "turbostat version 2025.12.02 - Len Brown <lenb@kernel.org>\n");
+	fprintf(outf, "turbostat version 2025.12.05 - Len Brown <lenb@kernel.org>\n");
 }
 
 #define COMMAND_LINE_SIZE 2048
-- 
cgit v1.2.3


From 61764831d5b148a9f3afe7ac35089e5908c92a81 Mon Sep 17 00:00:00 2001
From: Len Brown <len.brown@intel.com>
Date: Fri, 5 Dec 2025 12:24:20 -0500
Subject: tools/power turbostat: Dump hypervisor name

Sometimes useful to know which hypervisor is running beneath us...

Signed-off-by: Len Brown <len.brown@intel.com>
---
 tools/power/x86/turbostat/turbostat.c | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

(limited to 'tools')

diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c
index 0ef36f0330a9..97cd9c5a0092 100644
--- a/tools/power/x86/turbostat/turbostat.c
+++ b/tools/power/x86/turbostat/turbostat.c
@@ -8772,6 +8772,27 @@ void probe_pstates(void)
 	for_all_cpus(print_epb, ODD_COUNTERS);
 	for_all_cpus(print_perf_limit, ODD_COUNTERS);
 }
+void dump_word_chars(unsigned int word)
+{
+	int i;
+
+	for (i = 0; i < 4; ++i)
+		fprintf(outf, "%c", (word >> (i * 8)) & 0xFF);
+}
+void dump_cpuid_hypervisor(void)
+{
+	unsigned int ebx = 0;
+	unsigned int ecx = 0;
+	unsigned int edx = 0;
+
+	__cpuid(0x40000000, max_extended_level, ebx, ecx, edx);
+
+	fprintf(outf, "Hypervisor: ");
+	dump_word_chars(ebx);
+	dump_word_chars(ecx);
+	dump_word_chars(edx);
+	fprintf(outf, "\n");
+}
 
 void process_cpuid()
 {
@@ -8839,6 +8860,8 @@ void process_cpuid()
 			edx_flags & (1 << 5) ? "" : "No-",
 			edx_flags & (1 << 22) ? "" : "No-", edx_flags & (1 << 28) ? "" : "No-", edx_flags & (1 << 29) ? "" : "No-");
 	}
+	if (!quiet && cpuid_has_hv)
+		dump_cpuid_hypervisor();
 
 	probe_platform_features(family, model);
 
-- 
cgit v1.2.3


From d0f7093ad5e4aa37405da2669bca1a62d22b7025 Mon Sep 17 00:00:00 2001
From: Len Brown <len.brown@intel.com>
Date: Mon, 8 Dec 2025 10:01:04 -0500
Subject: tools/power turbostat: Harden against unexpected values

Divide-by-zero resulted if LLC references == 0

Pull the percentage division into pct() to centralize sanity checks there.

Fixes: 8808292799b0 ("tools/power turbostat: Print "nan" for out of range percentages")

Signed-off-by: Len Brown <len.brown@intel.com>
---
 tools/power/x86/turbostat/turbostat.c | 94 +++++++++++++++++++----------------
 1 file changed, 51 insertions(+), 43 deletions(-)

(limited to 'tools')

diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c
index 97cd9c5a0092..4dd4b0f3e6d4 100644
--- a/tools/power/x86/turbostat/turbostat.c
+++ b/tools/power/x86/turbostat/turbostat.c
@@ -3002,22 +3002,30 @@ void print_header(char *delim)
 }
 
 /*
- * pct()
+ * pct(numerator, denominator)
  *
- * If absolute value is < 1.1, return percentage
- * otherwise, return nan
+ * Return sanity checked percentage (100.0 * numerator/denominotor)
  *
- * return value is appropriate for printing percentages with %f
- * while flagging some obvious erroneous values.
+ * n < 0: nan
+ * d <= 0: nan
+ * n/d > 1.1: nan
  */
-double pct(double d)
+double pct(double numerator, double denominator)
 {
+	double retval;
 
-	double abs = fabs(d);
+	if (numerator < 0)
+		return nan("");
 
-	if (abs < 1.10)
-		return (100.0 * d);
-	return nan("");
+	if (denominator <= 0)
+		return nan("");
+
+	retval = 100.0 * numerator / denominator;
+
+	if (retval > 110.0)
+		return nan("");
+
+	return retval;
 }
 
 int dump_counters(PER_THREAD_PARAMS)
@@ -3047,7 +3055,7 @@ int dump_counters(PER_THREAD_PARAMS)
 
 		outp += sprintf(outp, "LLC refs: %lld", t->llc.references);
 		outp += sprintf(outp, "LLC miss: %lld", t->llc.misses);
-		outp += sprintf(outp, "LLC Hit%%: %.2f", pct((t->llc.references - t->llc.misses) / t->llc.references));
+		outp += sprintf(outp, "LLC Hit%%: %.2f", pct((t->llc.references - t->llc.misses), t->llc.references));
 
 		for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) {
 			outp += sprintf(outp, "tADDED [%d] %8s msr0x%x: %08llX %s\n", i, mp->name, mp->msr_num, t->counter[i], mp->sp->path);
@@ -3262,7 +3270,7 @@ int format_counters(PER_THREAD_PARAMS)
 		outp += sprintf(outp, "%s%.0f", (printed++ ? delim : ""), 1.0 / units * t->aperf / interval_float);
 
 	if (DO_BIC(BIC_Busy))
-		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(t->mperf / tsc));
+		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(t->mperf, tsc));
 
 	if (DO_BIC(BIC_Bzy_MHz)) {
 		if (has_base_hz)
@@ -3303,7 +3311,7 @@ int format_counters(PER_THREAD_PARAMS)
 			outp += sprintf(outp, "%s%.0f", (printed++ ? delim : ""), t->llc.references / interval_float / 1000);
 
 		if (DO_BIC(BIC_LLC_HIT))
-			outp += sprintf(outp, fmt8, (printed++ ? delim : ""), pct((t->llc.references - t->llc.misses) / t->llc.references));
+			outp += sprintf(outp, fmt8, (printed++ ? delim : ""), pct((t->llc.references - t->llc.misses), t->llc.references));
 	}
 
 	/* Added Thread Counters */
@@ -3316,7 +3324,7 @@ int format_counters(PER_THREAD_PARAMS)
 			if (mp->type == COUNTER_USEC)
 				outp += print_float_value(&printed, delim, t->counter[i] / interval_float / 10000);
 			else
-				outp += print_float_value(&printed, delim, pct(t->counter[i] / tsc));
+				outp += print_float_value(&printed, delim, pct(t->counter[i], tsc));
 		}
 	}
 
@@ -3330,7 +3338,7 @@ int format_counters(PER_THREAD_PARAMS)
 			if (pp->type == COUNTER_USEC)
 				outp += print_float_value(&printed, delim, t->perf_counter[i] / interval_float / 10000);
 			else
-				outp += print_float_value(&printed, delim, pct(t->perf_counter[i] / tsc));
+				outp += print_float_value(&printed, delim, pct(t->perf_counter[i], tsc));
 		}
 	}
 
@@ -3344,34 +3352,34 @@ int format_counters(PER_THREAD_PARAMS)
 			break;
 
 		case PMT_TYPE_XTAL_TIME:
-			value_converted = pct(value_raw / crystal_hz / interval_float);
+			value_converted = pct(value_raw / crystal_hz, interval_float);
 			outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), value_converted);
 			break;
 
 		case PMT_TYPE_TCORE_CLOCK:
-			value_converted = pct(value_raw / tcore_clock_freq_hz / interval_float);
+			value_converted = pct(value_raw / tcore_clock_freq_hz, interval_float);
 			outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), value_converted);
 		}
 	}
 
 	/* C1 */
 	if (DO_BIC(BIC_CPU_c1))
-		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(t->c1 / tsc));
+		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(t->c1, tsc));
 
 	/* print per-core data only for 1st thread in core */
 	if (!is_cpu_first_thread_in_core(t, c))
 		goto done;
 
 	if (DO_BIC(BIC_CPU_c3))
-		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(c->c3 / tsc));
+		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(c->c3, tsc));
 	if (DO_BIC(BIC_CPU_c6))
-		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(c->c6 / tsc));
+		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(c->c6, tsc));
 	if (DO_BIC(BIC_CPU_c7))
-		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(c->c7 / tsc));
+		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(c->c7, tsc));
 
 	/* Mod%c6 */
 	if (DO_BIC(BIC_Mod_c6))
-		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(c->mc6_us / tsc));
+		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(c->mc6_us, tsc));
 
 	if (DO_BIC(BIC_CoreTmp))
 		outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), c->core_temp_c);
@@ -3387,7 +3395,7 @@ int format_counters(PER_THREAD_PARAMS)
 		else if (mp->format == FORMAT_DELTA || mp->format == FORMAT_AVERAGE)
 			outp += print_decimal_value(mp->width, &printed, delim, c->counter[i]);
 		else if (mp->format == FORMAT_PERCENT)
-			outp += print_float_value(&printed, delim, pct(c->counter[i] / tsc));
+			outp += print_float_value(&printed, delim, pct(c->counter[i], tsc));
 	}
 
 	/* Added perf Core counters */
@@ -3397,7 +3405,7 @@ int format_counters(PER_THREAD_PARAMS)
 		else if (pp->format == FORMAT_DELTA || mp->format == FORMAT_AVERAGE)
 			outp += print_decimal_value(pp->width, &printed, delim, c->perf_counter[i]);
 		else if (pp->format == FORMAT_PERCENT)
-			outp += print_float_value(&printed, delim, pct(c->perf_counter[i] / tsc));
+			outp += print_float_value(&printed, delim, pct(c->perf_counter[i], tsc));
 	}
 
 	/* Added PMT Core counters */
@@ -3410,12 +3418,12 @@ int format_counters(PER_THREAD_PARAMS)
 			break;
 
 		case PMT_TYPE_XTAL_TIME:
-			value_converted = pct(value_raw / crystal_hz / interval_float);
+			value_converted = pct(value_raw / crystal_hz, interval_float);
 			outp += print_float_value(&printed, delim, value_converted);
 			break;
 
 		case PMT_TYPE_TCORE_CLOCK:
-			value_converted = pct(value_raw / tcore_clock_freq_hz / interval_float);
+			value_converted = pct(value_raw / tcore_clock_freq_hz, interval_float);
 			outp += print_float_value(&printed, delim, value_converted);
 		}
 	}
@@ -3471,39 +3479,39 @@ int format_counters(PER_THREAD_PARAMS)
 	if (DO_BIC(BIC_Totl_c0))
 		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100 * p->pkg_wtd_core_c0 / tsc);	/* can exceed 100% */
 	if (DO_BIC(BIC_Any_c0))
-		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(p->pkg_any_core_c0 / tsc));
+		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(p->pkg_any_core_c0, tsc));
 	if (DO_BIC(BIC_GFX_c0))
-		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(p->pkg_any_gfxe_c0 / tsc));
+		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(p->pkg_any_gfxe_c0, tsc));
 	if (DO_BIC(BIC_CPUGFX))
-		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(p->pkg_both_core_gfxe_c0 / tsc));
+		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(p->pkg_both_core_gfxe_c0, tsc));
 
 	if (DO_BIC(BIC_Pkgpc2))
-		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(p->pc2 / tsc));
+		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(p->pc2, tsc));
 	if (DO_BIC(BIC_Pkgpc3))
-		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(p->pc3 / tsc));
+		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(p->pc3, tsc));
 	if (DO_BIC(BIC_Pkgpc6))
-		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(p->pc6 / tsc));
+		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(p->pc6, tsc));
 	if (DO_BIC(BIC_Pkgpc7))
-		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(p->pc7 / tsc));
+		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(p->pc7, tsc));
 	if (DO_BIC(BIC_Pkgpc8))
-		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(p->pc8 / tsc));
+		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(p->pc8, tsc));
 	if (DO_BIC(BIC_Pkgpc9))
-		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(p->pc9 / tsc));
+		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(p->pc9, tsc));
 	if (DO_BIC(BIC_Pkgpc10))
-		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(p->pc10 / tsc));
+		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(p->pc10, tsc));
 
 	if (DO_BIC(BIC_Diec6))
-		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(p->die_c6 / crystal_hz / interval_float));
+		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(p->die_c6 / crystal_hz, interval_float));
 
 	if (DO_BIC(BIC_CPU_LPI)) {
 		if (p->cpu_lpi >= 0)
-			outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(p->cpu_lpi / 1000000.0 / interval_float));
+			outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(p->cpu_lpi / 1000000.0, interval_float));
 		else
 			outp += sprintf(outp, "%s(neg)", (printed++ ? delim : ""));
 	}
 	if (DO_BIC(BIC_SYS_LPI)) {
 		if (p->sys_lpi >= 0)
-			outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(p->sys_lpi / 1000000.0 / interval_float));
+			outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), pct(p->sys_lpi / 1000000.0, interval_float));
 		else
 			outp += sprintf(outp, "%s(neg)", (printed++ ? delim : ""));
 	}
@@ -3543,7 +3551,7 @@ int format_counters(PER_THREAD_PARAMS)
 		else if (mp->format == FORMAT_DELTA || mp->format == FORMAT_AVERAGE)
 			outp += print_decimal_value(mp->width, &printed, delim, p->counter[i]);
 		else if (mp->format == FORMAT_PERCENT)
-			outp += print_float_value(&printed, delim, pct(p->counter[i] / tsc));
+			outp += print_float_value(&printed, delim, pct(p->counter[i], tsc));
 	}
 
 	/* Added perf Package Counters */
@@ -3555,7 +3563,7 @@ int format_counters(PER_THREAD_PARAMS)
 		else if (pp->format == FORMAT_DELTA || mp->format == FORMAT_AVERAGE)
 			outp += print_decimal_value(pp->width, &printed, delim, p->perf_counter[i]);
 		else if (pp->format == FORMAT_PERCENT)
-			outp += print_float_value(&printed, delim, pct(p->perf_counter[i] / tsc));
+			outp += print_float_value(&printed, delim, pct(p->perf_counter[i], tsc));
 	}
 
 	/* Added PMT Package Counters */
@@ -3568,12 +3576,12 @@ int format_counters(PER_THREAD_PARAMS)
 			break;
 
 		case PMT_TYPE_XTAL_TIME:
-			value_converted = pct(value_raw / crystal_hz / interval_float);
+			value_converted = pct(value_raw / crystal_hz, interval_float);
 			outp += print_float_value(&printed, delim, value_converted);
 			break;
 
 		case PMT_TYPE_TCORE_CLOCK:
-			value_converted = pct(value_raw / tcore_clock_freq_hz / interval_float);
+			value_converted = pct(value_raw / tcore_clock_freq_hz, interval_float);
 			outp += print_float_value(&printed, delim, value_converted);
 		}
 	}
-- 
cgit v1.2.3


From 785953cf6e63aa5a9fcdfa9577b1411e0281c4bc Mon Sep 17 00:00:00 2001
From: Len Brown <len.brown@intel.com>
Date: Wed, 10 Dec 2025 13:33:29 -0500
Subject: tools/power turbostat.8: Document the "--force" option

Starting in turbostat v2025.01.14, turbostat refused to run
on unsupported hardware, pointing to "RUN THE LATEST VERSION"
on turbostat(8).

At that time, turbostat supported and advertised the "--force"
parameter to run anyway (with unsupported results).

Also document "--force" on turbostat.8.

Signed-off-by: Len Brown <len.brown@intel.com>
---
 tools/power/x86/turbostat/turbostat.8 | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/power/x86/turbostat/turbostat.8 b/tools/power/x86/turbostat/turbostat.8
index 1551fcdbfd8a..cb3fd8576146 100644
--- a/tools/power/x86/turbostat/turbostat.8
+++ b/tools/power/x86/turbostat/turbostat.8
@@ -111,6 +111,8 @@ The column name "all" can be used to enable all disabled-by-default built-in cou
 .PP
 \fB--no-perf\fP Disable all the uses of the perf API.
 .PP
+\fB--force\fPForce turbostat to run on an unsupported platform (minimal defaults).
+.PP
 \fB--interval seconds\fP overrides the default 5.0 second measurement interval.
 .PP
 \fB--num_iterations num\fP number of the measurement iterations.
@@ -165,9 +167,9 @@ The system configuration dump (if --quiet is not used) is followed by statistics
 .PP
 \fBC1, C2, C3...\fP The number times Linux requested the C1, C2, C3 idle state during the measurement interval.  The system summary line shows the sum for all CPUs.  These are C-state names as exported in /sys/devices/system/cpu/cpu*/cpuidle/state*/name.  While their names are generic, their attributes are processor specific. They the system description section of output shows what MWAIT sub-states they are mapped to on each system.  These counters are in the "cpuidle" group, which is disabled, by default.
 .PP
-\fBC1+, C2+, C3+...\fP The idle governor idle state misprediction statistics. Inidcates the number times Linux requested the C1, C2, C3 idle state during the measurement interval, but should have requested a deeper idle state (if it exists and enabled). These statistics come from the /sys/devices/system/cpu/cpu*/cpuidle/state*/below file.  These counters are in the "cpuidle" group, which is disabled, by default.
+\fBC1+, C2+, C3+...\fP The idle governor idle state misprediction statistics. Indicates the number times Linux requested the C1, C2, C3 idle state during the measurement interval, but should have requested a deeper idle state (if it exists and enabled). These statistics come from the /sys/devices/system/cpu/cpu*/cpuidle/state*/below file.  These counters are in the "cpuidle" group, which is disabled, by default.
 .PP
-\fBC1-, C2-, C3-...\fP The idle governor idle state misprediction statistics. Inidcates the number times Linux requested the C1, C2, C3 idle state during the measurement interval, but should have requested a shallower idle state (if it exists and enabled). These statistics come from the /sys/devices/system/cpu/cpu*/cpuidle/state*/above file.  These counters are in the "cpuidle" group, which is disabled, by default.
+\fBC1-, C2-, C3-...\fP The idle governor idle state misprediction statistics. Indicates the number times Linux requested the C1, C2, C3 idle state during the measurement interval, but should have requested a shallower idle state (if it exists and enabled). These statistics come from the /sys/devices/system/cpu/cpu*/cpuidle/state*/above file.  These counters are in the "cpuidle" group, which is disabled, by default.
 .PP
 \fBC1%, C2%, C3%\fP The residency percentage that Linux requested C1, C2, C3....  The system summary is the average of all CPUs in the system.  Note that these are software, reflecting what was requested.  The hardware counters reflect what was actually achieved.  These counters are in the "pct_idle" group, which is enabled by default.
 .PP
@@ -197,7 +199,7 @@ The system configuration dump (if --quiet is not used) is followed by statistics
 .PP
 \fBGFX%C0\fP Percentage of time that at least one GFX compute engine is busy.
 .PP
-\fBCPUGFX%\fP Percentage of time that at least one CPU is busy at the same time as at least one Graphics compute enginer is busy.
+\fBCPUGFX%\fP Percentage of time that at least one CPU is busy at the same time as at least one Graphics compute engine is busy.
 .PP
 \fBPkg%pc2, Pkg%pc3, Pkg%pc6, Pkg%pc7\fP percentage residency in hardware package idle states.  These numbers are from hardware residency counters.
 .PP
@@ -559,6 +561,8 @@ If the upstream version isn't new enough, the development tree can be found here
 If the development tree doesn't work, please contact the author via chat,
 or via email with the word "turbostat" on the Subject line.
 
+An old turbostat binary may run on unknown hardware by using "--force",
+but results are unsupported.
 .SH FILES
 .ta
 .nf
-- 
cgit v1.2.3


From 041e975937123ee22a7925e468ab73b8a8991767 Mon Sep 17 00:00:00 2001
From: Len Brown <len.brown@intel.com>
Date: Sat, 24 Jan 2026 22:39:02 -0600
Subject: tools/power turbostat: Rename "LLCkRPS" column to "LLCMRPS"

The purpose of the LLC References per Second LLC column
is to qualify the significance of the LLC%hit column.

If RPS is high, then the hit rate matters.
If RPS is low, then the hit rate is not significant.

Remove unnecessary and distracting precision in the RPS column
by dividing my a million rather than by a thousand.

Signed-off-by: Len Brown <len.brown@intel.com>
---
 tools/power/x86/turbostat/turbostat.8 |  2 +-
 tools/power/x86/turbostat/turbostat.c | 30 +++++++++++++++---------------
 2 files changed, 16 insertions(+), 16 deletions(-)

(limited to 'tools')

diff --git a/tools/power/x86/turbostat/turbostat.8 b/tools/power/x86/turbostat/turbostat.8
index cb3fd8576146..44a416a728b3 100644
--- a/tools/power/x86/turbostat/turbostat.8
+++ b/tools/power/x86/turbostat/turbostat.8
@@ -161,7 +161,7 @@ The system configuration dump (if --quiet is not used) is followed by statistics
 .PP
 \fBSMI\fP The number of System Management Interrupts  serviced CPU during the measurement interval.  While this counter is actually per-CPU, SMI are triggered on all processors, so the number should be the same for all CPUs.
 .PP
-\fBLLCkRPS\fP Last Level Cache Thousands of References Per Second.  For CPUs with an L3 LLC, this is the number of references that CPU made to the L3 (and the number of misses that CPU made to it's L2).  For CPUs with an L2 LLC, this is the number of references to the L2 (and the number of misses to the CPU's L1).  The system summary row shows the sum for all CPUs.  In both cases, the value displayed is the actual value divided by 1000 in the interest of usually fitting into 8 columns.
+\fBLLCMRPS\fP Last Level Cache Millions of References Per Second.  For CPUs with an L3 LLC, this is the number of references that CPU made to the L3 (and the number of misses that CPU made to it's L2).  For CPUs with an L2 LLC, this is the number of references to the L2 (and the number of misses to the CPU's L1).  The system summary row shows the sum for all CPUs.  In both cases, the value displayed is the actual value divided by 1,000,000.  If this value is large, then the LLC%hit column is significant.  If this value is small, then the LLC%hit column is not significant.
 .PP
 \fBLLC%hit\fP Last Level Cache Hit Rate %.  Hit Rate Percent = 100.0 * (References - Misses)/References.  The system summary row shows the weighted average for all CPUs (100.0 * (Sum_References - Sum_Misses)/Sum_References).
 .PP
diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c
index 4dd4b0f3e6d4..2dfc110ae483 100644
--- a/tools/power/x86/turbostat/turbostat.c
+++ b/tools/power/x86/turbostat/turbostat.c
@@ -210,7 +210,7 @@ struct msr_counter bic[] = {
 	{ 0x0, "NMI", NULL, 0, 0, 0, NULL, 0 },
 	{ 0x0, "CPU%c1e", NULL, 0, 0, 0, NULL, 0 },
 	{ 0x0, "pct_idle", NULL, 0, 0, 0, NULL, 0 },
-	{ 0x0, "LLCkRPS", NULL, 0, 0, 0, NULL, 0 },
+	{ 0x0, "LLCMRPS", NULL, 0, 0, 0, NULL, 0 },
 	{ 0x0, "LLC%hit", NULL, 0, 0, 0, NULL, 0 },
 };
 
@@ -281,7 +281,7 @@ enum bic_names {
 	BIC_NMI,
 	BIC_CPU_c1e,
 	BIC_pct_idle,
-	BIC_LLC_RPS,
+	BIC_LLC_MRPS,
 	BIC_LLC_HIT,
 	MAX_BIC
 };
@@ -424,7 +424,7 @@ static void bic_groups_init(void)
 	SET_BIC(BIC_pct_idle, &bic_group_idle);
 
 	BIC_INIT(&bic_group_cache);
-	SET_BIC(BIC_LLC_RPS, &bic_group_cache);
+	SET_BIC(BIC_LLC_MRPS, &bic_group_cache);
 	SET_BIC(BIC_LLC_HIT, &bic_group_cache);
 
 	BIC_INIT(&bic_group_other);
@@ -2440,7 +2440,7 @@ static void bic_disable_msr_access(void)
 static void bic_disable_perf_access(void)
 {
 	CLR_BIC(BIC_IPC, &bic_enabled);
-	CLR_BIC(BIC_LLC_RPS, &bic_enabled);
+	CLR_BIC(BIC_LLC_MRPS, &bic_enabled);
 	CLR_BIC(BIC_LLC_HIT, &bic_enabled);
 }
 
@@ -2814,8 +2814,8 @@ void print_header(char *delim)
 	if (DO_BIC(BIC_SMI))
 		outp += sprintf(outp, "%sSMI", (printed++ ? delim : ""));
 
-	if (DO_BIC(BIC_LLC_RPS))
-		outp += sprintf(outp, "%sLLCkRPS", (printed++ ? delim : ""));
+	if (DO_BIC(BIC_LLC_MRPS))
+		outp += sprintf(outp, "%sLLCMRPS", (printed++ ? delim : ""));
 
 	if (DO_BIC(BIC_LLC_HIT))
 		outp += sprintf(outp, "%sLLC%%hit", (printed++ ? delim : ""));
@@ -3306,9 +3306,9 @@ int format_counters(PER_THREAD_PARAMS)
 		outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), t->smi_count);
 
 	/* LLC Stats */
-	if (DO_BIC(BIC_LLC_RPS) || DO_BIC(BIC_LLC_HIT)) {
-		if (DO_BIC(BIC_LLC_RPS))
-			outp += sprintf(outp, "%s%.0f", (printed++ ? delim : ""), t->llc.references / interval_float / 1000);
+	if (DO_BIC(BIC_LLC_MRPS) || DO_BIC(BIC_LLC_HIT)) {
+		if (DO_BIC(BIC_LLC_MRPS))
+			outp += sprintf(outp, "%s%.0f", (printed++ ? delim : ""), t->llc.references / interval_float / 1000000);
 
 		if (DO_BIC(BIC_LLC_HIT))
 			outp += sprintf(outp, fmt8, (printed++ ? delim : ""), pct((t->llc.references - t->llc.misses), t->llc.references));
@@ -3855,7 +3855,7 @@ int delta_thread(struct thread_data *new, struct thread_data *old, struct core_d
 	if (DO_BIC(BIC_SMI))
 		old->smi_count = new->smi_count - old->smi_count;
 
-	if (DO_BIC(BIC_LLC_RPS))
+	if (DO_BIC(BIC_LLC_MRPS))
 		old->llc.references = new->llc.references - old->llc.references;
 
 	if (DO_BIC(BIC_LLC_HIT))
@@ -5067,7 +5067,7 @@ int get_counters(PER_THREAD_PARAMS)
 
 	get_smi_aperf_mperf(cpu, t);
 
-	if (DO_BIC(BIC_LLC_RPS) || DO_BIC(BIC_LLC_HIT))
+	if (DO_BIC(BIC_LLC_MRPS) || DO_BIC(BIC_LLC_HIT))
 		get_perf_llc_stats(cpu, &t->llc);
 
 	if (DO_BIC(BIC_IPC))
@@ -8344,7 +8344,7 @@ void linux_perf_init(void)
 		if (fd_instr_count_percpu == NULL)
 			err(-1, "calloc fd_instr_count_percpu");
 	}
-	if (BIC_IS_ENABLED(BIC_LLC_RPS)) {
+	if (BIC_IS_ENABLED(BIC_LLC_MRPS)) {
 		fd_llc_percpu = calloc(topo.max_cpu_num + 1, sizeof(int));
 		if (fd_llc_percpu == NULL)
 			err(-1, "calloc fd_llc_percpu");
@@ -9066,7 +9066,7 @@ void perf_llc_init(void)
 
 	if (no_perf)
 		return;
-	if (!(BIC_IS_ENABLED(BIC_LLC_RPS) && BIC_IS_ENABLED(BIC_LLC_HIT)))
+	if (!(BIC_IS_ENABLED(BIC_LLC_MRPS) && BIC_IS_ENABLED(BIC_LLC_HIT)))
 		return;
 
 	for (cpu = 0; cpu <= topo.max_cpu_num; ++cpu) {
@@ -9089,7 +9089,7 @@ void perf_llc_init(void)
 			return;
 		}
 	}
-	BIC_PRESENT(BIC_LLC_RPS);
+	BIC_PRESENT(BIC_LLC_MRPS);
 	BIC_PRESENT(BIC_LLC_HIT);
 }
 
@@ -9518,7 +9518,7 @@ void check_perf_access(void)
 		if (!has_perf_instr_count_access())
 			no_perf = 1;
 
-	if (BIC_IS_ENABLED(BIC_LLC_RPS) || BIC_IS_ENABLED(BIC_LLC_HIT))
+	if (BIC_IS_ENABLED(BIC_LLC_MRPS) || BIC_IS_ENABLED(BIC_LLC_HIT))
 		if (!has_perf_llc_access())
 			no_perf = 1;
 
-- 
cgit v1.2.3


From b991074a5144d740ae3812a54e138aaaaf12dc7a Mon Sep 17 00:00:00 2001
From: Len Brown <len.brown@intel.com>
Date: Tue, 3 Feb 2026 19:30:49 -0600
Subject: tools/power turbostat: Allow more use of is_hybrid flag

The "is_hybrid" is set and used only in !quiet mode.

Make it valid in both quiet and !quiet mode to allow more uses.

Signed-off-by: Len Brown <len.brown@intel.com>
---
 tools/power/x86/turbostat/turbostat.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c
index 2dfc110ae483..75c865120656 100644
--- a/tools/power/x86/turbostat/turbostat.c
+++ b/tools/power/x86/turbostat/turbostat.c
@@ -8157,6 +8157,9 @@ void decode_feature_control_msr(void)
 	if (no_msr)
 		return;
 
+	if (quiet)
+		return;
+
 	if (!get_msr(base_cpu, MSR_IA32_FEAT_CTL, &msr))
 		fprintf(outf, "cpu%d: MSR_IA32_FEATURE_CONTROL: 0x%08llx (%sLocked %s)\n",
 			base_cpu, msr, msr & FEAT_CTL_LOCKED ? "" : "UN-", msr & (1 << 18) ? "SGX" : "");
@@ -8921,7 +8924,7 @@ void process_cpuid()
 	if (!quiet)
 		decode_misc_enable_msr();
 
-	if (max_level >= 0x7 && !quiet) {
+	if (max_level >= 0x7) {
 		int has_sgx;
 
 		ecx = 0;
@@ -8930,9 +8933,10 @@ void process_cpuid()
 
 		has_sgx = ebx & (1 << 2);
 
-		is_hybrid = edx & (1 << 15);
+		is_hybrid = !!(edx & (1 << 15));
 
-		fprintf(outf, "CPUID(7): %sSGX %sHybrid\n", has_sgx ? "" : "No-", is_hybrid ? "" : "No-");
+		if (!quiet)
+			fprintf(outf, "CPUID(7): %sSGX %sHybrid\n", has_sgx ? "" : "No-", is_hybrid ? "" : "No-");
 
 		if (has_sgx)
 			decode_feature_control_msr();
-- 
cgit v1.2.3


From a9c7a1a292794e15fefe20a664898d57f685ced7 Mon Sep 17 00:00:00 2001
From: Len Brown <len.brown@intel.com>
Date: Tue, 3 Feb 2026 19:34:45 -0600
Subject: tools/power turbostat: Remove redundant newlines from err(3) strings

err(3) supplies a newline at the end of the string.
No need to end err(3) strings with '\n'.

Signed-off-by: Len Brown <len.brown@intel.com>
---
 tools/power/x86/turbostat/turbostat.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'tools')

diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c
index 75c865120656..c622b55c330c 100644
--- a/tools/power/x86/turbostat/turbostat.c
+++ b/tools/power/x86/turbostat/turbostat.c
@@ -3150,7 +3150,7 @@ void get_perf_llc_stats(int cpu, struct llc_stats *llc)
 	actual_read_size = read(fd_llc_percpu[cpu], &r, expected_read_size);
 
 	if (actual_read_size == -1)
-		err(-1, "%s(cpu%d,) %d,,%ld\n", __func__, cpu, fd_llc_percpu[cpu], expected_read_size);
+		err(-1, "%s(cpu%d,) %d,,%ld", __func__, cpu, fd_llc_percpu[cpu], expected_read_size);
 
 	llc->references = r.llc.references;
 	llc->misses = r.llc.misses;
@@ -6134,7 +6134,7 @@ static int update_effective_str(bool startup)
 
 	pos = fgets(buf, 1024, fp);
 	if (!pos)
-		err(1, "%s: file read failed\n", PATH_EFFECTIVE_CPUS);
+		err(1, "%s: file read failed", PATH_EFFECTIVE_CPUS);
 
 	fclose(fp);
 
@@ -6151,7 +6151,7 @@ static void update_effective_set(bool startup)
 	update_effective_str(startup);
 
 	if (parse_cpu_str(cpu_effective_str, cpu_effective_set, cpu_effective_setsize))
-		err(1, "%s: cpu str malformat %s\n", PATH_EFFECTIVE_CPUS, cpu_effective_str);
+		err(1, "%s: cpu str malformat %s", PATH_EFFECTIVE_CPUS, cpu_effective_str);
 }
 
 void linux_perf_init(void);
@@ -6818,7 +6818,7 @@ int check_for_cap_sys_rawio(void)
 
 free_and_exit:
 	if (cap_free(caps) == -1)
-		err(-6, "cap_free\n");
+		err(-6, "cap_free");
 
 	return ret;
 }
@@ -7021,7 +7021,7 @@ static void probe_intel_uncore_frequency_cluster(void)
 		sprintf(path_base, "/sys/devices/system/cpu/intel_uncore_frequency/uncore%02d", i);
 
 		if (access(path_base, R_OK))
-			err(1, "%s: %s\n", __func__, path_base);
+			err(1, "%s: %s", __func__, path_base);
 
 		sprintf(path, "%s/package_id", path_base);
 		package_id = read_sysfs_int(path);
-- 
cgit v1.2.3


From 67ee5ad27d5101be4e9e8980c0734a0423bfd0a7 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Tue, 3 Feb 2026 21:51:45 -0800
Subject: selftests/bpf: Add a testcase for deadlock avoidance

Add a testcase that checks that deadlock avoidance is working
as expected.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20260204055147.54960-3-alexei.starovoitov@gmail.com
---
 .../bpf/prog_tests/timer_start_deadlock.c          | 33 ++++++++++
 .../selftests/bpf/progs/timer_start_deadlock.c     | 75 ++++++++++++++++++++++
 2 files changed, 108 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/timer_start_deadlock.c
 create mode 100644 tools/testing/selftests/bpf/progs/timer_start_deadlock.c

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/timer_start_deadlock.c b/tools/testing/selftests/bpf/prog_tests/timer_start_deadlock.c
new file mode 100644
index 000000000000..9f1f9aec8888
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/timer_start_deadlock.c
@@ -0,0 +1,33 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */
+#include <test_progs.h>
+#include "timer_start_deadlock.skel.h"
+
+void test_timer_start_deadlock(void)
+{
+	struct timer_start_deadlock *skel;
+	int err, prog_fd;
+	LIBBPF_OPTS(bpf_test_run_opts, opts);
+
+	skel = timer_start_deadlock__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "skel_open_and_load"))
+		return;
+
+	err = timer_start_deadlock__attach(skel);
+	if (!ASSERT_OK(err, "skel_attach"))
+		goto cleanup;
+
+	prog_fd = bpf_program__fd(skel->progs.start_timer);
+
+	/*
+	 * Run the syscall program that attempts to deadlock.
+	 * If the kernel deadlocks, this call will never return.
+	 */
+	err = bpf_prog_test_run_opts(prog_fd, &opts);
+	ASSERT_OK(err, "prog_test_run");
+	ASSERT_EQ(opts.retval, 0, "prog_retval");
+
+	ASSERT_EQ(skel->bss->tp_called, 1, "tp_called");
+cleanup:
+	timer_start_deadlock__destroy(skel);
+}
diff --git a/tools/testing/selftests/bpf/progs/timer_start_deadlock.c b/tools/testing/selftests/bpf/progs/timer_start_deadlock.c
new file mode 100644
index 000000000000..368563747a46
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/timer_start_deadlock.c
@@ -0,0 +1,75 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+#define CLOCK_MONOTONIC 1
+
+char _license[] SEC("license") = "GPL";
+
+struct elem {
+	struct bpf_timer timer;
+};
+
+struct {
+	__uint(type, BPF_MAP_TYPE_ARRAY);
+	__uint(max_entries, 1);
+	__type(key, int);
+	__type(value, struct elem);
+} timer_map SEC(".maps");
+
+volatile int in_timer_start;
+volatile int tp_called;
+
+static int timer_cb(void *map, int *key, struct elem *value)
+{
+	return 0;
+}
+
+SEC("tp_btf/hrtimer_cancel")
+int BPF_PROG(tp_hrtimer_cancel, struct hrtimer *hrtimer)
+{
+	struct bpf_timer *timer;
+	static bool called = false;
+	int key = 0;
+
+	if (!in_timer_start)
+		return 0;
+
+	tp_called = 1;
+	timer = bpf_map_lookup_elem(&timer_map, &key);
+
+	/*
+	 * Call bpf_timer_start() from the tracepoint within hrtimer logic
+	 * on the same timer to make sure it doesn't deadlock,
+	 * and do it once.
+	 */
+	if (!called) {
+		called = true;
+		bpf_timer_start(timer, 1000000000, 0);
+	}
+	return 0;
+}
+
+SEC("syscall")
+int start_timer(void *ctx)
+{
+	struct bpf_timer *timer;
+	int key = 0;
+
+	timer = bpf_map_lookup_elem(&timer_map, &key);
+	/* claude may complain here that there is no NULL check. Ignoring it. */
+	bpf_timer_init(timer, &timer_map, CLOCK_MONOTONIC);
+	bpf_timer_set_callback(timer, timer_cb);
+
+	/*
+	 * call hrtimer_start() twice, so that 2nd call does
+	 * remove_hrtimer() and trace_hrtimer_cancel() tracepoint.
+	 */
+	in_timer_start = 1;
+	bpf_timer_start(timer, 1000000000, 0);
+	bpf_timer_start(timer, 1000000000, 0);
+	in_timer_start = 0;
+	return 0;
+}
-- 
cgit v1.2.3


From 6e65cf81accf908d2480739b85dba4731048290d Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Tue, 3 Feb 2026 21:51:47 -0800
Subject: selftests/bpf: Strengthen timer_start_deadlock test

Strengthen timer_start_deadlock test and check for recursion now

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20260204055147.54960-5-alexei.starovoitov@gmail.com
---
 tools/testing/selftests/bpf/progs/timer_start_deadlock.c | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/progs/timer_start_deadlock.c b/tools/testing/selftests/bpf/progs/timer_start_deadlock.c
index 368563747a46..019518ee18cd 100644
--- a/tools/testing/selftests/bpf/progs/timer_start_deadlock.c
+++ b/tools/testing/selftests/bpf/progs/timer_start_deadlock.c
@@ -31,7 +31,6 @@ SEC("tp_btf/hrtimer_cancel")
 int BPF_PROG(tp_hrtimer_cancel, struct hrtimer *hrtimer)
 {
 	struct bpf_timer *timer;
-	static bool called = false;
 	int key = 0;
 
 	if (!in_timer_start)
@@ -42,13 +41,9 @@ int BPF_PROG(tp_hrtimer_cancel, struct hrtimer *hrtimer)
 
 	/*
 	 * Call bpf_timer_start() from the tracepoint within hrtimer logic
-	 * on the same timer to make sure it doesn't deadlock,
-	 * and do it once.
+	 * on the same timer to make sure it doesn't deadlock.
 	 */
-	if (!called) {
-		called = true;
-		bpf_timer_start(timer, 1000000000, 0);
-	}
+	bpf_timer_start(timer, 1000000000, 0);
 	return 0;
 }
 
-- 
cgit v1.2.3


From 56415363e02f0f561ecc5bda6a4318438f888b43 Mon Sep 17 00:00:00 2001
From: Tianci Cao <ziye@zju.edu.cn>
Date: Wed, 4 Feb 2026 19:15:03 +0800
Subject: selftests/bpf: Add tests for BPF_END bitwise tracking

Now BPF_END has bitwise tracking support. This patch adds selftests to
cover various cases of BPF_END (`bswap(16|32|64)`, `be(16|32|64)`,
`le(16|32|64)`) with bitwise propagation.

This patch is based on existing `verifier_bswap.c`, and add several
types of new tests:

1. Unconditional byte swap operations:
   - bswap16/bswap32/bswap64 with unknown bytes

2. Endian conversion operations (architecture-aware):
   - be16/be32/be64: convert to big-endian
     * on little-endian: do swap
     * on big-endian: truncation (16/32-bit) or no-op (64-bit)
   - le16/le32/le64: convert to little-endian
     * on big-endian: do swap
     * on little-endian: truncation (16/32-bit) or no-op (64-bit)

Each test simulates realistic networking scenarios where a value is
masked with unknown bits (e.g., var_off=(0x0; 0x3f00), range=[0,0x3f00]),
then byte-swapped, and the verifier must prove the result stays within
expected bounds.

Specifically, these selftests are based on dead code elimination:
If the BPF verifier can precisely track bitwise through byte swap
operations, it can prune the trap path (invalid memory access) that
should be unreachable, allowing the program to pass verification.
If bitwise tracking is incorrect, the verifier cannot prove the trap
is unreachable, causing verification failure.

The tests use preprocessor conditionals (#ifdef __BYTE_ORDER__) to
verify correct behavior on both little-endian and big-endian
architectures, and require Clang 18+ for bswap instruction support.

Co-developed-by: Shenghao Yuan <shenghaoyuan0928@163.com>
Signed-off-by: Shenghao Yuan <shenghaoyuan0928@163.com>
Co-developed-by: Yazhou Tang <tangyazhou518@outlook.com>
Signed-off-by: Yazhou Tang <tangyazhou518@outlook.com>
Signed-off-by: Tianci Cao <ziye@zju.edu.cn>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20260204111503.77871-3-ziye@zju.edu.cn
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/progs/verifier_bswap.c | 43 ++++++++++++++++++++++
 1 file changed, 43 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/progs/verifier_bswap.c b/tools/testing/selftests/bpf/progs/verifier_bswap.c
index e61755656e8d..4b779deee767 100644
--- a/tools/testing/selftests/bpf/progs/verifier_bswap.c
+++ b/tools/testing/selftests/bpf/progs/verifier_bswap.c
@@ -48,6 +48,49 @@ __naked void bswap_64(void)
 	: __clobber_all);
 }
 
+#define BSWAP_RANGE_TEST(name, op, in_value, out_value) \
+	SEC("socket") \
+	__success __log_level(2) \
+	__msg("r0 &= {{.*}}; R0=scalar({{.*}},var_off=(0x0; " #in_value "))") \
+	__msg("r0 = " op " r0 {{.*}}; R0=scalar({{.*}},var_off=(0x0; " #out_value "))") \
+	__naked void name(void) \
+	{ \
+		asm volatile (				\
+		"call %[bpf_get_prandom_u32];"		\
+		"r0 &= " #in_value ";"			\
+		"r0 =  " op " r0;"			\
+		"r2 =  " #out_value " ll;"		\
+		"if r0 > r2 goto trap_%=;"		\
+		"r0 = 0;"				\
+		"exit;"					\
+	"trap_%=:"					\
+		"r1 = 42;"				\
+		"r0 = *(u64 *)(r1 + 0);"		\
+		"exit;"					\
+	:						\
+	: __imm(bpf_get_prandom_u32)			\
+	: __clobber_all);				\
+	}
+
+BSWAP_RANGE_TEST(bswap16_range, "bswap16", 0x3f00, 0x3f)
+BSWAP_RANGE_TEST(bswap32_range, "bswap32", 0x3f00, 0x3f0000)
+BSWAP_RANGE_TEST(bswap64_range, "bswap64", 0x3f00, 0x3f000000000000)
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+BSWAP_RANGE_TEST(be16_range, "be16", 0x3f00, 0x3f)
+BSWAP_RANGE_TEST(be32_range, "be32", 0x3f00, 0x3f0000)
+BSWAP_RANGE_TEST(be64_range, "be64", 0x3f00, 0x3f000000000000)
+BSWAP_RANGE_TEST(le16_range, "le16", 0x3f00, 0x3f00)
+BSWAP_RANGE_TEST(le32_range, "le32", 0x3f00, 0x3f00)
+BSWAP_RANGE_TEST(le64_range, "le64", 0x3f00, 0x3f00)
+#else
+BSWAP_RANGE_TEST(be16_range, "be16", 0x3f00, 0x3f00)
+BSWAP_RANGE_TEST(be32_range, "be32", 0x3f00, 0x3f00)
+BSWAP_RANGE_TEST(be64_range, "be64", 0x3f00, 0x3f00)
+BSWAP_RANGE_TEST(le16_range, "le16", 0x3f00, 0x3f)
+BSWAP_RANGE_TEST(le32_range, "le32", 0x3f00, 0x3f0000)
+BSWAP_RANGE_TEST(le64_range, "le64", 0x3f00, 0x3f000000000000)
+#endif
+
 #else
 
 SEC("socket")
-- 
cgit v1.2.3


From 7a433e519364c3c19643e5c857f4fbfaebec441c Mon Sep 17 00:00:00 2001
From: Puranjay Mohan <puranjay@kernel.org>
Date: Wed, 4 Feb 2026 07:17:37 -0800
Subject: bpf: Support negative offsets, BPF_SUB, and alu32 for linked register
 tracking

Previously, the verifier only tracked positive constant deltas between
linked registers using BPF_ADD. This limitation meant patterns like:

  r1 = r0;
  r1 += -4;
  if r1 s>= 0 goto l0_%=;   // r1 >= 0 implies r0 >= 4
  // verifier couldn't propagate bounds back to r0
  if r0 != 0 goto l0_%=;
	r0 /= 0; // Verifier thinks this is reachable
  l0_%=:

Similar limitation exists for 32-bit registers.

With this change, the verifier can now track negative deltas in reg->off
enabling bound propagation for the above pattern.

For alu32, we make sure the destination register has the upper 32 bits
as 0s before creating the link. BPF_ADD_CONST is split into
BPF_ADD_CONST64 and BPF_ADD_CONST32, the latter is used in case of alu32
and sync_linked_regs uses this to zext the result if known_reg has this
flag.

Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20260204151741.2678118-2-puranjay@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h                       |  6 ++-
 kernel/bpf/verifier.c                              | 50 +++++++++++++++++-----
 .../testing/selftests/bpf/progs/verifier_bounds.c  |  2 +-
 3 files changed, 45 insertions(+), 13 deletions(-)

(limited to 'tools')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 746025df82c8..ef8e45a362d9 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -147,8 +147,12 @@ struct bpf_reg_state {
 	 * registers. Example:
 	 * r1 = r2;    both will have r1->id == r2->id == N
 	 * r1 += 10;   r1->id == N | BPF_ADD_CONST and r1->off == 10
+	 * r3 = r2;    both will have r3->id == r2->id == N
+	 * w3 += 10;   r3->id == N | BPF_ADD_CONST32 and r3->off == 10
 	 */
-#define BPF_ADD_CONST (1U << 31)
+#define BPF_ADD_CONST64 (1U << 31)
+#define BPF_ADD_CONST32 (1U << 30)
+#define BPF_ADD_CONST (BPF_ADD_CONST64 | BPF_ADD_CONST32)
 	u32 id;
 	/* PTR_TO_SOCKET and PTR_TO_TCP_SOCK could be a ptr returned
 	 * from a pointer-cast helper, bpf_sk_fullsock() and
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 92e03a5a50f5..edf5342b982f 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -16209,6 +16209,13 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
 		verbose(env, "verifier internal error: no src_reg\n");
 		return -EFAULT;
 	}
+	/*
+	 * For alu32 linked register tracking, we need to check dst_reg's
+	 * umax_value before the ALU operation. After adjust_scalar_min_max_vals(),
+	 * alu32 ops will have zero-extended the result, making umax_value <= U32_MAX.
+	 */
+	u64 dst_umax = dst_reg->umax_value;
+
 	err = adjust_scalar_min_max_vals(env, insn, dst_reg, *src_reg);
 	if (err)
 		return err;
@@ -16218,26 +16225,44 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
 	 * r1 += 0x1
 	 * if r2 < 1000 goto ...
 	 * use r1 in memory access
-	 * So for 64-bit alu remember constant delta between r2 and r1 and
-	 * update r1 after 'if' condition.
+	 * So remember constant delta between r2 and r1 and update r1 after
+	 * 'if' condition.
 	 */
 	if (env->bpf_capable &&
-	    BPF_OP(insn->code) == BPF_ADD && !alu32 &&
-	    dst_reg->id && is_reg_const(src_reg, false)) {
-		u64 val = reg_const_value(src_reg, false);
+	    (BPF_OP(insn->code) == BPF_ADD || BPF_OP(insn->code) == BPF_SUB) &&
+	    dst_reg->id && is_reg_const(src_reg, alu32)) {
+		u64 val = reg_const_value(src_reg, alu32);
+		s32 off;
+
+		if (!alu32 && ((s64)val < S32_MIN || (s64)val > S32_MAX))
+			goto clear_id;
+
+		if (alu32 && (dst_umax > U32_MAX))
+			goto clear_id;
 
-		if ((dst_reg->id & BPF_ADD_CONST) ||
-		    /* prevent overflow in sync_linked_regs() later */
-		    val > (u32)S32_MAX) {
+		off = (s32)val;
+
+		if (BPF_OP(insn->code) == BPF_SUB) {
+			/* Negating S32_MIN would overflow */
+			if (off == S32_MIN)
+				goto clear_id;
+			off = -off;
+		}
+
+		if (dst_reg->id & BPF_ADD_CONST) {
 			/*
 			 * If the register already went through rX += val
 			 * we cannot accumulate another val into rx->off.
 			 */
+clear_id:
 			dst_reg->off = 0;
 			dst_reg->id = 0;
 		} else {
-			dst_reg->id |= BPF_ADD_CONST;
-			dst_reg->off = val;
+			if (alu32)
+				dst_reg->id |= BPF_ADD_CONST32;
+			else
+				dst_reg->id |= BPF_ADD_CONST64;
+			dst_reg->off = off;
 		}
 	} else {
 		/*
@@ -17334,7 +17359,7 @@ static void sync_linked_regs(struct bpf_verifier_env *env, struct bpf_verifier_s
 			u32 saved_id = reg->id;
 
 			fake_reg.type = SCALAR_VALUE;
-			__mark_reg_known(&fake_reg, (s32)reg->off - (s32)known_reg->off);
+			__mark_reg_known(&fake_reg, (s64)reg->off - (s64)known_reg->off);
 
 			/* reg = known_reg; reg += delta */
 			copy_register_state(reg, known_reg);
@@ -17349,6 +17374,9 @@ static void sync_linked_regs(struct bpf_verifier_env *env, struct bpf_verifier_s
 			scalar32_min_max_add(reg, &fake_reg);
 			scalar_min_max_add(reg, &fake_reg);
 			reg->var_off = tnum_add(reg->var_off, fake_reg.var_off);
+			if (known_reg->id & BPF_ADD_CONST32)
+				zext_32_to_64(reg);
+			reg_bounds_sync(reg);
 		}
 		if (e->is_reg)
 			mark_reg_scratched(env, e->regno);
diff --git a/tools/testing/selftests/bpf/progs/verifier_bounds.c b/tools/testing/selftests/bpf/progs/verifier_bounds.c
index 411a18437d7e..560531404bce 100644
--- a/tools/testing/selftests/bpf/progs/verifier_bounds.c
+++ b/tools/testing/selftests/bpf/progs/verifier_bounds.c
@@ -1477,7 +1477,7 @@ __naked void sub64_full_overflow(void)
 SEC("socket")
 __description("64-bit subtraction, partial overflow, result in unbounded reg")
 __success __log_level(2)
-__msg("3: (1f) r3 -= r2 {{.*}} R3=scalar()")
+__msg("3: (1f) r3 -= r2 {{.*}} R3=scalar(id=1-1)")
 __retval(0)
 __naked void sub64_partial_overflow(void)
 {
-- 
cgit v1.2.3


From 47fcf4dc0a346dd0b873a679c547d6848bd85a37 Mon Sep 17 00:00:00 2001
From: Puranjay Mohan <puranjay@kernel.org>
Date: Wed, 4 Feb 2026 07:17:38 -0800
Subject: selftests/bpf: Add tests for improved linked register tracking

Add tests for linked register tracking with negative offsets, BPF_SUB,
and alu32. These test for all edge cases like overflows, etc.

Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20260204151741.2678118-3-puranjay@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 .../selftests/bpf/progs/verifier_linked_scalars.c  | 303 ++++++++++++++++++++-
 1 file changed, 301 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/progs/verifier_linked_scalars.c b/tools/testing/selftests/bpf/progs/verifier_linked_scalars.c
index 5f41bbb730a7..2ef346c827c2 100644
--- a/tools/testing/selftests/bpf/progs/verifier_linked_scalars.c
+++ b/tools/testing/selftests/bpf/progs/verifier_linked_scalars.c
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 
 #include <linux/bpf.h>
+#include <limits.h>
 #include <bpf/bpf_helpers.h>
 #include "bpf_misc.h"
 
@@ -18,9 +19,9 @@ __naked void scalars(void)
 	r4 = r1;				\
 	w2 += 0x7FFFFFFF;			\
 	w4 += 0;				\
-	if r2 == 0 goto l1;			\
+	if r2 == 0 goto l0_%=;			\
 	exit;					\
-l1:						\
+l0_%=:						\
 	r4 >>= 63;				\
 	r3 = 1;					\
 	r3 -= r4;				\
@@ -64,4 +65,302 @@ l0_%=:								\
 	: __clobber_all);
 }
 
+SEC("socket")
+__success
+__naked void scalars_neg(void)
+{
+	asm volatile ("					\
+	call %[bpf_get_prandom_u32];			\
+	r0 &= 0xff;					\
+	r1 = r0;					\
+	r1 += -4;					\
+	if r1 s< 0 goto l0_%=;				\
+	if r0 != 0 goto l0_%=;				\
+	r0 /= 0;					\
+l0_%=:							\
+	r0 = 0;						\
+	exit;						\
+"	:
+	: __imm(bpf_get_prandom_u32)
+	: __clobber_all);
+}
+
+/* Same test but using BPF_SUB instead of BPF_ADD with negative immediate */
+SEC("socket")
+__success
+__naked void scalars_neg_sub(void)
+{
+	asm volatile ("					\
+	call %[bpf_get_prandom_u32];			\
+	r0 &= 0xff;					\
+	r1 = r0;					\
+	r1 -= 4;					\
+	if r1 s< 0 goto l0_%=;				\
+	if r0 != 0 goto l0_%=;				\
+	r0 /= 0;					\
+l0_%=:							\
+	r0 = 0;						\
+	exit;						\
+"	:
+	: __imm(bpf_get_prandom_u32)
+	: __clobber_all);
+}
+
+/* alu32 with negative offset */
+SEC("socket")
+__success
+__naked void scalars_neg_alu32_add(void)
+{
+	asm volatile ("					\
+	call %[bpf_get_prandom_u32];			\
+	w0 &= 0xff;					\
+	w1 = w0;					\
+	w1 += -4;					\
+	if w1 s< 0 goto l0_%=;				\
+	if w0 != 0 goto l0_%=;				\
+	r0 /= 0;					\
+l0_%=:							\
+	r0 = 0;						\
+	exit;						\
+"	:
+	: __imm(bpf_get_prandom_u32)
+	: __clobber_all);
+}
+
+/* alu32 with negative offset using SUB */
+SEC("socket")
+__success
+__naked void scalars_neg_alu32_sub(void)
+{
+	asm volatile ("					\
+	call %[bpf_get_prandom_u32];			\
+	w0 &= 0xff;					\
+	w1 = w0;					\
+	w1 -= 4;					\
+	if w1 s< 0 goto l0_%=;				\
+	if w0 != 0 goto l0_%=;				\
+	r0 /= 0;					\
+l0_%=:							\
+	r0 = 0;						\
+	exit;						\
+"	:
+	: __imm(bpf_get_prandom_u32)
+	: __clobber_all);
+}
+
+/* Positive offset: r1 = r0 + 4, then if r1 >= 6, r0 >= 2, so r0 != 0 */
+SEC("socket")
+__success
+__naked void scalars_pos(void)
+{
+	asm volatile ("					\
+	call %[bpf_get_prandom_u32];			\
+	r0 &= 0xff;					\
+	r1 = r0;					\
+	r1 += 4;					\
+	if r1 < 6 goto l0_%=;				\
+	if r0 != 0 goto l0_%=;				\
+	r0 /= 0;					\
+l0_%=:							\
+	r0 = 0;						\
+	exit;						\
+"	:
+	: __imm(bpf_get_prandom_u32)
+	: __clobber_all);
+}
+
+/* SUB with negative immediate: r1 -= -4 is equivalent to r1 += 4 */
+SEC("socket")
+__success
+__naked void scalars_sub_neg_imm(void)
+{
+	asm volatile ("					\
+	call %[bpf_get_prandom_u32];			\
+	r0 &= 0xff;					\
+	r1 = r0;					\
+	r1 -= -4;					\
+	if r1 < 6 goto l0_%=;				\
+	if r0 != 0 goto l0_%=;				\
+	r0 /= 0;					\
+l0_%=:							\
+	r0 = 0;						\
+	exit;						\
+"	:
+	: __imm(bpf_get_prandom_u32)
+	: __clobber_all);
+}
+
+/* Double ADD clears the ID (can't accumulate offsets) */
+SEC("socket")
+__failure
+__msg("div by zero")
+__naked void scalars_double_add(void)
+{
+	asm volatile ("					\
+	call %[bpf_get_prandom_u32];			\
+	r0 &= 0xff;					\
+	r1 = r0;					\
+	r1 += 2;					\
+	r1 += 2;					\
+	if r1 < 6 goto l0_%=;				\
+	if r0 != 0 goto l0_%=;				\
+	r0 /= 0;					\
+l0_%=:							\
+	r0 = 0;						\
+	exit;						\
+"	:
+	: __imm(bpf_get_prandom_u32)
+	: __clobber_all);
+}
+
+/*
+ * Test that sync_linked_regs() correctly handles large offset differences.
+ * r1.off = S32_MIN, r2.off = 1, delta = S32_MIN - 1 requires 64-bit math.
+ */
+SEC("socket")
+__success
+__naked void scalars_sync_delta_overflow(void)
+{
+	asm volatile ("					\
+	call %[bpf_get_prandom_u32];			\
+	r0 &= 0xff;					\
+	r1 = r0;					\
+	r2 = r0;					\
+	r1 += %[s32_min];				\
+	r2 += 1;					\
+	if r2 s< 100 goto l0_%=;			\
+	if r1 s< 0 goto l0_%=;				\
+	r0 /= 0;					\
+l0_%=:							\
+	r0 = 0;						\
+	exit;						\
+"	:
+	: __imm(bpf_get_prandom_u32),
+	  [s32_min]"i"(INT_MIN)
+	: __clobber_all);
+}
+
+/*
+ * Another large delta case: r1.off = S32_MAX, r2.off = -1.
+ * delta = S32_MAX - (-1) = S32_MAX + 1 requires 64-bit math.
+ */
+SEC("socket")
+__success
+__naked void scalars_sync_delta_overflow_large_range(void)
+{
+	asm volatile ("					\
+	call %[bpf_get_prandom_u32];			\
+	r0 &= 0xff;					\
+	r1 = r0;					\
+	r2 = r0;					\
+	r1 += %[s32_max];				\
+	r2 += -1;					\
+	if r2 s< 0 goto l0_%=;				\
+	if r1 s>= 0 goto l0_%=;				\
+	r0 /= 0;					\
+l0_%=:							\
+	r0 = 0;						\
+	exit;						\
+"	:
+	: __imm(bpf_get_prandom_u32),
+	  [s32_max]"i"(INT_MAX)
+	: __clobber_all);
+}
+
+/*
+ * Test linked scalar tracking with alu32 and large positive offset (0x7FFFFFFF).
+ * After w1 += 0x7FFFFFFF, w1 wraps to negative for any r0 >= 1.
+ * If w1 is signed-negative, then r0 >= 1, so r0 != 0.
+ */
+SEC("socket")
+__success
+__naked void scalars_alu32_big_offset(void)
+{
+	asm volatile ("					\
+	call %[bpf_get_prandom_u32];			\
+	w0 &= 0xff;					\
+	w1 = w0;					\
+	w1 += 0x7FFFFFFF;				\
+	if w1 s>= 0 goto l0_%=;				\
+	if w0 != 0 goto l0_%=;				\
+	r0 /= 0;					\
+l0_%=:							\
+	r0 = 0;						\
+	exit;						\
+"	:
+	: __imm(bpf_get_prandom_u32)
+	: __clobber_all);
+}
+
+SEC("socket")
+__failure
+__msg("div by zero")
+__naked void scalars_alu32_basic(void)
+{
+	asm volatile ("					\
+	call %[bpf_get_prandom_u32];			\
+	r1 = r0;					\
+	w1 += 1;					\
+	if r1 > 10 goto 1f;				\
+	r0 >>= 32;					\
+	if r0 == 0 goto 1f;				\
+	r0 /= 0;					\
+1:							\
+	r0 = 0;						\
+	exit;						\
+"	:
+	: __imm(bpf_get_prandom_u32)
+	: __clobber_all);
+}
+
+/*
+ * Test alu32 linked register tracking with wrapping.
+ * R0 is bounded to [0xffffff00, 0xffffffff] (high 32-bit values)
+ * w1 += 0x100 causes R1 to wrap to [0, 0xff]
+ *
+ * After sync_linked_regs, if bounds are computed correctly:
+ *   R0 should be [0x00000000_ffffff00, 0x00000000_ffffff80]
+ *   R0 >> 32 == 0, so div by zero is unreachable
+ *
+ * If bounds are computed incorrectly (64-bit underflow):
+ *   R0 becomes [0xffffffff_ffffff00, 0xffffffff_ffffff80]
+ *   R0 >> 32 == 0xffffffff != 0, so div by zero is reachable
+ */
+SEC("socket")
+__success
+__naked void scalars_alu32_wrap(void)
+{
+	asm volatile ("					\
+	call %[bpf_get_prandom_u32];			\
+	w0 |= 0xffffff00;				\
+	r1 = r0;					\
+	w1 += 0x100;					\
+	if r1 > 0x80 goto l0_%=;			\
+	r2 = r0;					\
+	r2 >>= 32;					\
+	if r2 == 0 goto l0_%=;				\
+	r0 /= 0;					\
+l0_%=:							\
+	r0 = 0;						\
+	exit;						\
+"	:
+	: __imm(bpf_get_prandom_u32)
+	: __clobber_all);
+}
+
+SEC("socket")
+__success
+void alu32_negative_offset(void)
+{
+	volatile char path[5];
+	volatile int offset = bpf_get_prandom_u32();
+	int off = offset;
+
+	if (off >= 5 && off < 10)
+		path[off - 5] = '.';
+
+	/* So compiler doesn't say: error: variable 'path' set but not used */
+	__sink(path[0]);
+}
+
 char _license[] SEC("license") = "GPL";
-- 
cgit v1.2.3


From f7f4e8e9448c5c142a3f0b74cec961818a565878 Mon Sep 17 00:00:00 2001
From: "Matthieu Baerts (NGI0)" <matttbe@kernel.org>
Date: Tue, 3 Feb 2026 19:41:24 +0100
Subject: selftests: mptcp: diag: sort all #include
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This file is the only one from this directory not to have all these
header inclusions sorted by type and alphabetical order.

Adapt them, to ease the reading, prevent conflicts during potential
future backport modifying these lines, and also to avoid having UAPI
header inclusions before libc ones, see [1].

Link: https://lore.kernel.org/20260120-uapi-sockaddr-v2-1-63c319111cf6@linutronix.de
Reviewed-by: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
Reviewed-by: Geliang Tang <geliang@kernel.org>
Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Link: https://patch.msgid.link/20260203-net-next-mptcp-misc-feat-6-20-v1-8-31ec8bfc56d1@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/mptcp/mptcp_diag.c | 27 ++++++++++++++------------
 1 file changed, 15 insertions(+), 12 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/mptcp/mptcp_diag.c b/tools/testing/selftests/net/mptcp/mptcp_diag.c
index 8e0b1b8d84b6..5e222ba977e4 100644
--- a/tools/testing/selftests/net/mptcp/mptcp_diag.c
+++ b/tools/testing/selftests/net/mptcp/mptcp_diag.c
@@ -1,21 +1,24 @@
 // SPDX-License-Identifier: GPL-2.0
 /* Copyright (c) 2025, Kylin Software */
 
-#include <linux/sock_diag.h>
-#include <linux/rtnetlink.h>
-#include <linux/inet_diag.h>
-#include <linux/netlink.h>
-#include <linux/compiler.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
 #include <sys/socket.h>
-#include <netinet/in.h>
-#include <linux/tcp.h>
+
 #include <arpa/inet.h>
 
-#include <unistd.h>
-#include <stdlib.h>
-#include <string.h>
-#include <errno.h>
-#include <stdio.h>
+#include <netinet/in.h>
+
+#include <linux/compiler.h>
+#include <linux/inet_diag.h>
+#include <linux/netlink.h>
+#include <linux/rtnetlink.h>
+#include <linux/sock_diag.h>
+#include <linux/tcp.h>
 
 #ifndef IPPROTO_MPTCP
 #define IPPROTO_MPTCP 262
-- 
cgit v1.2.3


From 32207bed0547b0294302f2a4fe63571fff91a85e Mon Sep 17 00:00:00 2001
From: "Matthieu Baerts (NGI0)" <matttbe@kernel.org>
Date: Tue, 3 Feb 2026 19:41:25 +0100
Subject: selftests: mptcp: join: wait for estab event instead of MPJ

'wait_mpj' was used just after having created a background connection,
but before creating new subflows. So no MPJ were sent. The intention was
to wait for the connection to be established, which was the same as
doing a simple sleep with a "random" value.

Instead, wait for an "established" event. With this, the tests can
finish quicker.

Reviewed-by: Geliang Tang <geliang@kernel.org>
Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Link: https://patch.msgid.link/20260203-net-next-mptcp-misc-feat-6-20-v1-9-31ec8bfc56d1@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/mptcp/mptcp_join.sh | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh
index e70d3420954f..ff20d86ed399 100755
--- a/tools/testing/selftests/net/mptcp/mptcp_join.sh
+++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh
@@ -3999,7 +3999,7 @@ userspace_tests()
 		{ timeout_test=120 test_linkfail=128 speed=5 \
 			run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null
 		local tests_pid=$!
-		wait_mpj $ns1
+		wait_event ns1 MPTCP_LIB_EVENT_ESTABLISHED 1
 		userspace_pm_add_addr $ns1 10.0.2.1 10
 		userspace_pm_add_addr $ns1 10.0.3.1 20
 		chk_join_nr 2 2 2
@@ -4032,7 +4032,7 @@ userspace_tests()
 		{ timeout_test=120 test_linkfail=128 speed=5 \
 			run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null
 		local tests_pid=$!
-		wait_mpj $ns2
+		wait_event ns2 MPTCP_LIB_EVENT_ESTABLISHED 1
 		userspace_pm_add_sf $ns2 10.0.3.2 20
 		chk_join_nr 1 1 1
 		chk_mptcp_info subflows 1 subflows 1
@@ -4060,7 +4060,7 @@ userspace_tests()
 		{ timeout_test=120 test_linkfail=128 speed=5 \
 			run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null
 		local tests_pid=$!
-		wait_mpj $ns2
+		wait_event ns2 MPTCP_LIB_EVENT_ESTABLISHED 1
 		chk_mptcp_info subflows 0 subflows 0
 		chk_subflows_total 1 1
 		userspace_pm_add_sf $ns2 10.0.3.2 0
@@ -4081,7 +4081,7 @@ userspace_tests()
 		{ timeout_test=120 test_linkfail=128 speed=5 \
 			run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null
 		local tests_pid=$!
-		wait_mpj $ns2
+		wait_event ns2 MPTCP_LIB_EVENT_ESTABLISHED 1
 		userspace_pm_add_sf $ns2 10.0.3.2 20
 		chk_join_nr 1 1 1
 		chk_mptcp_info subflows 1 subflows 1
@@ -4105,7 +4105,7 @@ userspace_tests()
 		{ timeout_test=120 test_linkfail=128 speed=5 \
 			run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null
 		local tests_pid=$!
-		wait_mpj $ns1
+		wait_event ns1 MPTCP_LIB_EVENT_ESTABLISHED 1
 		userspace_pm_add_addr $ns1 10.0.2.1 10
 		chk_join_nr 1 1 1
 		chk_add_nr 1 1
@@ -4158,7 +4158,7 @@ endpoint_tests()
 {
 	# subflow_rebuild_header is needed to support the implicit flag
 	# userspace pm type prevents add_addr
-	if reset "implicit EP" &&
+	if reset_with_events "implicit EP" &&
 	   continue_if mptcp_lib_kallsyms_has "subflow_rebuild_header$"; then
 		pm_nl_set_limits $ns1 2 2
 		pm_nl_set_limits $ns2 2 2
@@ -4167,7 +4167,7 @@ endpoint_tests()
 			run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null
 		local tests_pid=$!
 
-		wait_mpj $ns1
+		wait_event ns2 MPTCP_LIB_EVENT_ESTABLISHED 1
 		pm_nl_check_endpoint "creation" \
 			$ns2 10.0.2.2 id 1 flags implicit
 		chk_mptcp_info subflows 1 subflows 1
@@ -4181,6 +4181,7 @@ endpoint_tests()
 		pm_nl_check_endpoint "modif is allowed" \
 			$ns2 10.0.2.2 id 1 flags signal
 		mptcp_lib_kill_group_wait $tests_pid
+		kill_events_pids
 	fi
 
 	if reset_with_tcp_filter "delete and re-add" ns2 10.0.3.2 REJECT OUTPUT &&
@@ -4194,7 +4195,7 @@ endpoint_tests()
 			run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null
 		local tests_pid=$!
 
-		wait_mpj $ns2
+		wait_event ns2 MPTCP_LIB_EVENT_ESTABLISHED 1
 		pm_nl_check_endpoint "creation" \
 			$ns2 10.0.2.2 id 2 flags subflow dev ns2eth2
 		chk_subflow_nr "before delete id 2" 2
@@ -4272,7 +4273,7 @@ endpoint_tests()
 			run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null
 		local tests_pid=$!
 
-		wait_mpj $ns2
+		wait_event ns2 MPTCP_LIB_EVENT_ESTABLISHED 1
 		pm_nl_check_endpoint "creation" \
 			$ns1 10.0.2.1 id 1 flags signal
 		chk_subflow_nr "before delete" 2
-- 
cgit v1.2.3


From ab8b64ca3af30658ed04293b00ada9b65f45cc59 Mon Sep 17 00:00:00 2001
From: "Matthieu Baerts (NGI0)" <matttbe@kernel.org>
Date: Tue, 3 Feb 2026 19:41:26 +0100
Subject: selftests: mptcp: join: fix wait_mpj helper

It looks like most of the time, this helper was simply waiting a bit
more than one second: the previous MPJoin counter was often already at
the expected value. So at the end, it was just checking 10 times for
the MPJoin counter to change, but it was not happening. For the tests,
that was time, it was just waiting longer for nothing.

Instead, use 'wait_mpj' with the expected counter: in the tests, the MPJ
counter can easily be predicted. While at it, stop passing the netns as
argument: here the received MPJoin ACK is checked, which happens on the
server side. If later on, this needs to be checked on the client side,
the helper can be adapted for this case, but better avoid confusions now
if it is not needed.

While at it, stop using 'i' for the variable if it is not used.

With this, the tests can finish quicker.

Reviewed-by: Geliang Tang <geliang@kernel.org>
Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Link: https://patch.msgid.link/20260203-net-next-mptcp-misc-feat-6-20-v1-10-31ec8bfc56d1@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/mptcp/mptcp_join.sh | 30 ++++++++++++-------------
 1 file changed, 14 insertions(+), 16 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh
index ff20d86ed399..6ab568d6b856 100755
--- a/tools/testing/selftests/net/mptcp/mptcp_join.sh
+++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh
@@ -631,17 +631,15 @@ wait_rm_sf()
 	done
 }
 
+# $1: expected MPJ ACK Rx counter in $ns1
 wait_mpj()
 {
-	local ns="${1}"
-	local cnt old_cnt
-
-	old_cnt=$(mptcp_lib_get_counter ${ns} "MPTcpExtMPJoinAckRx")
+	local exp_cnt="${1}"
+	local cnt
 
-	local i
-	for i in $(seq 10); do
-		cnt=$(mptcp_lib_get_counter ${ns} "MPTcpExtMPJoinAckRx")
-		[ "$cnt" = "${old_cnt}" ] || break
+	for _ in $(seq 10); do
+		cnt=$(mptcp_lib_get_counter ${ns1} "MPTcpExtMPJoinAckRx")
+		[ "${cnt}" = "${exp_cnt}" ] && break
 		sleep 0.1
 	done
 }
@@ -4207,7 +4205,7 @@ endpoint_tests()
 		chk_mptcp_info subflows 0 subflows 0
 
 		pm_nl_add_endpoint $ns2 10.0.2.2 id 2 dev ns2eth2 flags subflow
-		wait_mpj $ns2
+		wait_mpj 2
 		chk_subflow_nr "after re-add id 2" 2
 		chk_mptcp_info subflows 1 subflows 1
 
@@ -4219,7 +4217,7 @@ endpoint_tests()
 		ip netns exec "${ns2}" ${iptables} -D OUTPUT -s "10.0.3.2" -p tcp -j REJECT
 		pm_nl_del_endpoint $ns2 3 10.0.3.2
 		pm_nl_add_endpoint $ns2 10.0.3.2 id 3 flags subflow
-		wait_mpj $ns2
+		wait_mpj 3
 		chk_subflow_nr "after no reject" 3
 		chk_mptcp_info subflows 2 subflows 2
 
@@ -4231,7 +4229,7 @@ endpoint_tests()
 			chk_mptcp_info subflows 2 subflows 2 # only decr for additional sf
 
 			pm_nl_add_endpoint $ns2 10.0.1.2 id 1 dev ns2eth1 flags subflow
-			wait_mpj $ns2
+			wait_mpj $((3 + i))
 			chk_subflow_nr "after re-add id 0 ($i)" 3
 			chk_mptcp_info subflows 3 subflows 3
 		done
@@ -4289,7 +4287,7 @@ endpoint_tests()
 
 		pm_nl_add_endpoint $ns1 10.0.2.1 id 1 flags signal
 		pm_nl_add_endpoint $ns1 10.0.3.1 id 2 flags signal
-		wait_mpj $ns2
+		wait_mpj 3
 		chk_subflow_nr "after re-add" 3
 		chk_mptcp_info subflows 2 subflows 2
 		chk_mptcp_info add_addr_signal 2 add_addr_accepted 2
@@ -4301,7 +4299,7 @@ endpoint_tests()
 		chk_mptcp_info add_addr_signal 2 add_addr_accepted 2
 
 		pm_nl_add_endpoint $ns1 10.0.1.1 id 99 flags signal
-		wait_mpj $ns2
+		wait_mpj 4
 		chk_subflow_nr "after re-add ID 0" 3
 		chk_mptcp_info subflows 3 subflows 3
 		chk_mptcp_info add_addr_signal 3 add_addr_accepted 2
@@ -4313,7 +4311,7 @@ endpoint_tests()
 		chk_mptcp_info add_addr_signal 2 add_addr_accepted 2
 
 		pm_nl_add_endpoint $ns1 10.0.1.1 id 88 flags signal
-		wait_mpj $ns2
+		wait_mpj 5
 		chk_subflow_nr "after re-re-add ID 0" 3
 		chk_mptcp_info subflows 3 subflows 3
 		chk_mptcp_info add_addr_signal 3 add_addr_accepted 2
@@ -4362,9 +4360,9 @@ endpoint_tests()
 		wait_rm_addr $ns2 0
 		ip netns exec "${ns2}" ${iptables} -D OUTPUT -s "10.0.3.2" -p tcp -j REJECT
 		pm_nl_add_endpoint $ns2 10.0.3.2 id 3 flags subflow
-		wait_mpj $ns2
+		wait_mpj 1
 		pm_nl_add_endpoint $ns1 10.0.3.1 id 2 flags signal
-		wait_mpj $ns2
+		wait_mpj 2
 		mptcp_lib_kill_group_wait $tests_pid
 
 		join_syn_tx=3 join_connect_err=1 \
-- 
cgit v1.2.3


From 62c0774f0f1876a2321cbe89fa6068fc4057a3e6 Mon Sep 17 00:00:00 2001
From: "Matthieu Baerts (NGI0)" <matttbe@kernel.org>
Date: Tue, 3 Feb 2026 19:41:27 +0100
Subject: selftests: mptcp: join: userspace: wait for new events

Instead of waiting for a random amount of time (1 second), wait for an
event to be received on the other side.

To do that, when an address is announced (userspace_pm_add_addr), the
ANNOUNCED is expected. When a new subflow is created
(userspace_pm_add_sf), the SUB_ESTABLISHED event is expected.

With this, the tests can finish quicker.

Reviewed-by: Geliang Tang <geliang@kernel.org>
Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Link: https://patch.msgid.link/20260203-net-next-mptcp-misc-feat-6-20-v1-11-31ec8bfc56d1@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/mptcp/mptcp_join.sh | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh
index 6ab568d6b856..4977e6ff17b4 100755
--- a/tools/testing/selftests/net/mptcp/mptcp_join.sh
+++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh
@@ -3716,7 +3716,6 @@ userspace_pm_add_addr()
 	tk=$(mptcp_lib_evts_get_info token "$evts")
 
 	ip netns exec $1 ./pm_nl_ctl ann $2 token $tk id $3
-	sleep 1
 }
 
 # $1: ns ; $2: id
@@ -3747,7 +3746,6 @@ userspace_pm_add_sf()
 
 	ip netns exec $1 ./pm_nl_ctl csf lip $2 lid $3 \
 				rip $da rport $dp token $tk
-	sleep 1
 }
 
 # $1: ns ; $2: addr $3: event type
@@ -3999,7 +3997,9 @@ userspace_tests()
 		local tests_pid=$!
 		wait_event ns1 MPTCP_LIB_EVENT_ESTABLISHED 1
 		userspace_pm_add_addr $ns1 10.0.2.1 10
+		wait_event ns2 MPTCP_LIB_EVENT_ANNOUNCED 1
 		userspace_pm_add_addr $ns1 10.0.3.1 20
+		wait_event ns2 MPTCP_LIB_EVENT_ANNOUNCED 2
 		chk_join_nr 2 2 2
 		chk_add_nr 2 2
 		chk_mptcp_info subflows 2 subflows 2
@@ -4032,6 +4032,7 @@ userspace_tests()
 		local tests_pid=$!
 		wait_event ns2 MPTCP_LIB_EVENT_ESTABLISHED 1
 		userspace_pm_add_sf $ns2 10.0.3.2 20
+		wait_event ns2 MPTCP_LIB_EVENT_SUB_ESTABLISHED 1
 		chk_join_nr 1 1 1
 		chk_mptcp_info subflows 1 subflows 1
 		chk_subflows_total 2 2
@@ -4062,6 +4063,7 @@ userspace_tests()
 		chk_mptcp_info subflows 0 subflows 0
 		chk_subflows_total 1 1
 		userspace_pm_add_sf $ns2 10.0.3.2 0
+		wait_event ns2 MPTCP_LIB_EVENT_SUB_ESTABLISHED 1
 		userspace_pm_chk_dump_addr "${ns2}" \
 			"id 0 flags subflow 10.0.3.2" "id 0 subflow"
 		chk_join_nr 1 1 1
@@ -4081,6 +4083,7 @@ userspace_tests()
 		local tests_pid=$!
 		wait_event ns2 MPTCP_LIB_EVENT_ESTABLISHED 1
 		userspace_pm_add_sf $ns2 10.0.3.2 20
+		wait_event ns2 MPTCP_LIB_EVENT_SUB_ESTABLISHED 1
 		chk_join_nr 1 1 1
 		chk_mptcp_info subflows 1 subflows 1
 		chk_subflows_total 2 2
@@ -4105,6 +4108,7 @@ userspace_tests()
 		local tests_pid=$!
 		wait_event ns1 MPTCP_LIB_EVENT_ESTABLISHED 1
 		userspace_pm_add_addr $ns1 10.0.2.1 10
+		wait_event ns2 MPTCP_LIB_EVENT_ANNOUNCED 1
 		chk_join_nr 1 1 1
 		chk_add_nr 1 1
 		chk_mptcp_info subflows 1 subflows 1
@@ -4131,6 +4135,7 @@ userspace_tests()
 		local tests_pid=$!
 		wait_event ns2 MPTCP_LIB_EVENT_ESTABLISHED 1
 		userspace_pm_add_sf $ns2 10.0.3.2 20
+		wait_event ns2 MPTCP_LIB_EVENT_SUB_ESTABLISHED 1
 		chk_mptcp_info subflows 1 subflows 1
 		chk_subflows_total 2 2
 
-- 
cgit v1.2.3


From 91453a62e5ecb81614187e19dcb4bd2955d07e5e Mon Sep 17 00:00:00 2001
From: "Matthieu Baerts (NGI0)" <matttbe@kernel.org>
Date: Tue, 3 Feb 2026 19:41:28 +0100
Subject: selftests: mptcp: join chk_stale_nr: avoid dup stats

nstat outputs are already printed when calling 'fail_test', no need to
do it again.

While at it, no need to use the dump_stats variable, print the extra
stats directly. And use 'ip -n $ns' instead of 'ip netns exec $ns',
shorter and clearer.

Reviewed-by: Geliang Tang <geliang@kernel.org>
Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Link: https://patch.msgid.link/20260203-net-next-mptcp-misc-feat-6-20-v1-12-31ec8bfc56d1@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/mptcp/mptcp_join.sh | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh
index 4977e6ff17b4..a8b9782a85df 100755
--- a/tools/testing/selftests/net/mptcp/mptcp_join.sh
+++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh
@@ -1648,7 +1648,6 @@ chk_stale_nr()
 	local stale_min=$2
 	local stale_max=$3
 	local stale_delta=$4
-	local dump_stats
 	local stale_nr
 	local recover_nr
 
@@ -1664,16 +1663,11 @@ chk_stale_nr()
 		fail_test "got $stale_nr stale[s] $recover_nr recover[s], " \
 		     " expected stale in range [$stale_min..$stale_max]," \
 		     " stale-recover delta $stale_delta"
-		dump_stats=1
+		echo $ns stats
+		ip -n $ns -s link show
 	else
 		print_ok
 	fi
-
-	if [ "${dump_stats}" = 1 ]; then
-		echo $ns stats
-		ip netns exec $ns ip -s link show
-		ip netns exec $ns nstat -as | grep MPTcp
-	fi
 }
 
 chk_add_nr()
-- 
cgit v1.2.3


From 79d5069cfbec40bf79bd4bd15060aa200448e1ae Mon Sep 17 00:00:00 2001
From: "Matthieu Baerts (NGI0)" <matttbe@kernel.org>
Date: Tue, 3 Feb 2026 19:41:29 +0100
Subject: selftests: mptcp: join: avoid declaring i if not used

A few loops were declaring 'i', but this variable was not used.

To avoid confusions, use '_' instead: it is more explicit to mark that
this variable is not needed.

Reviewed-by: Geliang Tang <geliang@kernel.org>
Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Link: https://patch.msgid.link/20260203-net-next-mptcp-misc-feat-6-20-v1-13-31ec8bfc56d1@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/mptcp/mptcp_join.sh | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh
index a8b9782a85df..0f9253d607c3 100755
--- a/tools/testing/selftests/net/mptcp/mptcp_join.sh
+++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh
@@ -603,8 +603,7 @@ wait_rm_addr()
 	local old_cnt="${2}"
 	local cnt
 
-	local i
-	for i in $(seq 10); do
+	for _ in $(seq 10); do
 		cnt=$(rm_addr_count ${ns})
 		[ "$cnt" = "${old_cnt}" ] || break
 		sleep 0.1
@@ -623,8 +622,7 @@ wait_rm_sf()
 	local old_cnt="${2}"
 	local cnt
 
-	local i
-	for i in $(seq 10); do
+	for _ in $(seq 10); do
 		cnt=$(rm_sf_count ${ns})
 		[ "$cnt" = "${old_cnt}" ] || break
 		sleep 0.1
@@ -648,8 +646,7 @@ wait_ll_ready()
 {
 	local ns="${1}"
 
-	local i
-	for i in $(seq 50); do
+	for _ in $(seq 50); do
 		ip -n "${ns}" -6 addr show scope link | grep "inet6 fe80" |
 			grep -qw "tentative" || break
 		sleep 0.1
-- 
cgit v1.2.3


From ae68da495ae90314e14a09e4d390cced93d3fbd4 Mon Sep 17 00:00:00 2001
From: "Matthieu Baerts (NGI0)" <matttbe@kernel.org>
Date: Tue, 3 Feb 2026 19:41:30 +0100
Subject: selftests: mptcp: connect cleanup TFO setup

To the TFO, only the file descriptor is needed, the family is not.

Also, the error can be handled the same way when 'sendto()' or
'connect()' are used. Only the printed error message is different.

This avoids a bit of confusions.

Reviewed-by: Geliang Tang <geliang@kernel.org>
Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Link: https://patch.msgid.link/20260203-net-next-mptcp-misc-feat-6-20-v1-14-31ec8bfc56d1@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/mptcp/mptcp_connect.c | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.c b/tools/testing/selftests/net/mptcp/mptcp_connect.c
index a74b13e42ecd..1e87757a6894 100644
--- a/tools/testing/selftests/net/mptcp/mptcp_connect.c
+++ b/tools/testing/selftests/net/mptcp/mptcp_connect.c
@@ -259,7 +259,7 @@ static void set_transparent(int fd, int pf)
 	}
 }
 
-static void set_mptfo(int fd, int pf)
+static void set_mptfo(int fd)
 {
 	int qlen = 25;
 
@@ -336,7 +336,7 @@ static int sock_listen_mptcp(const char * const listenaddr,
 			set_transparent(sock, pf);
 
 		if (cfg_sockopt_types.mptfo)
-			set_mptfo(sock, pf);
+			set_mptfo(sock);
 
 		if (bind(sock, a->ai_addr, a->ai_addrlen) == 0)
 			break; /* success */
@@ -407,21 +407,18 @@ static int sock_connect_mptcp(const char * const remoteaddr,
 				*peer = a;
 				break; /* success */
 			}
+			perror("sendto()");
 		} else {
 			if (connect(sock, a->ai_addr, a->ai_addrlen) == 0) {
 				*peer = a;
 				break; /* success */
 			}
-		}
-		if (cfg_sockopt_types.mptfo) {
-			perror("sendto()");
-			close(sock);
-			sock = -1;
-		} else {
 			perror("connect()");
-			close(sock);
-			sock = -1;
 		}
+
+		/* error */
+		close(sock);
+		sock = -1;
 	}
 
 	freeaddrinfo(addr);
-- 
cgit v1.2.3


From 4dca8d0030c7060efc1a89c98c1f03acd483bb77 Mon Sep 17 00:00:00 2001
From: "Matthieu Baerts (NGI0)" <matttbe@kernel.org>
Date: Tue, 3 Feb 2026 19:41:31 +0100
Subject: selftests: mptcp: join: no SKIP mark for group checks

When executing the last MPTCP selftests on older kernels, this output is
printed:

  # 001 no JOIN
  #       join Rx                             [SKIP]
  #       join Tx                             [SKIP]
  #       fallback                            [SKIP]

In fact, behind each line, a few counters are checked, and likely not
all of them have been skipped because the they are not available on
these kernels. Instead, "new" and unsupported counters for these groups
are now ignored, and [ OK ] will be printed instead of [SKIP].

Note that on the MPTCP CI, when validating the dev versions, any
unsupported counter will cause the tests to fail. So this is safe not to
print 'SKIP' for these group checks.

Reviewed-by: Geliang Tang <geliang@kernel.org>
Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Link: https://patch.msgid.link/20260203-net-next-mptcp-misc-feat-6-20-v1-15-31ec8bfc56d1@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/mptcp/mptcp_join.sh | 30 ++++++++++++-------------
 1 file changed, 15 insertions(+), 15 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh
index 0f9253d607c3..dc1f200aaa81 100755
--- a/tools/testing/selftests/net/mptcp/mptcp_join.sh
+++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh
@@ -1402,7 +1402,7 @@ chk_join_tx_nr()
 
 	count=$(mptcp_lib_get_counter ${ns2} "MPTcpExtMPJoinSynTxCreatSkErr")
 	if [ -z "$count" ]; then
-		rc=${KSFT_SKIP}
+		: # ignore skip
 	elif [ "$count" != "$create" ]; then
 		rc=${KSFT_FAIL}
 		print_check "syn tx create socket error"
@@ -1411,7 +1411,7 @@ chk_join_tx_nr()
 
 	count=$(mptcp_lib_get_counter ${ns2} "MPTcpExtMPJoinSynTxBindErr")
 	if [ -z "$count" ]; then
-		rc=${KSFT_SKIP}
+		: # ignore skip
 	elif [ "$count" != "$bind" ]; then
 		rc=${KSFT_FAIL}
 		print_check "syn tx bind error"
@@ -1420,7 +1420,7 @@ chk_join_tx_nr()
 
 	count=$(mptcp_lib_get_counter ${ns2} "MPTcpExtMPJoinSynTxConnectErr")
 	if [ -z "$count" ]; then
-		rc=${KSFT_SKIP}
+		: # ignore skip
 	elif [ "$count" != "$connect" ]; then
 		rc=${KSFT_FAIL}
 		print_check "syn tx connect error"
@@ -1446,7 +1446,7 @@ chk_fallback_nr()
 
 	count=$(mptcp_lib_get_counter ${!ns} "MPTcpExtInfiniteMapTx")
 	if [ -z "$count" ]; then
-		rc=${KSFT_SKIP}
+		: # ignore skip
 	elif [ "$count" != "$infinite_map_tx" ]; then
 		rc=${KSFT_FAIL}
 		print_check "$ns infinite map tx fallback"
@@ -1455,7 +1455,7 @@ chk_fallback_nr()
 
 	count=$(mptcp_lib_get_counter ${!ns} "MPTcpExtDSSCorruptionFallback")
 	if [ -z "$count" ]; then
-		rc=${KSFT_SKIP}
+		: # ignore skip
 	elif [ "$count" != "$dss_corruption" ]; then
 		rc=${KSFT_FAIL}
 		print_check "$ns dss corruption fallback"
@@ -1464,7 +1464,7 @@ chk_fallback_nr()
 
 	count=$(mptcp_lib_get_counter ${!ns} "MPTcpExtSimultConnectFallback")
 	if [ -z "$count" ]; then
-		rc=${KSFT_SKIP}
+		: # ignore skip
 	elif [ "$count" != "$simult_conn" ]; then
 		rc=${KSFT_FAIL}
 		print_check "$ns simult conn fallback"
@@ -1473,7 +1473,7 @@ chk_fallback_nr()
 
 	count=$(mptcp_lib_get_counter ${!ns} "MPTcpExtMPCapableFallbackACK")
 	if [ -z "$count" ]; then
-		rc=${KSFT_SKIP}
+		: # ignore skip
 	elif [ "$count" != "$mpc_passive" ]; then
 		rc=${KSFT_FAIL}
 		print_check "$ns mpc passive fallback"
@@ -1482,7 +1482,7 @@ chk_fallback_nr()
 
 	count=$(mptcp_lib_get_counter ${!ns} "MPTcpExtMPCapableFallbackSYNACK")
 	if [ -z "$count" ]; then
-		rc=${KSFT_SKIP}
+		: # ignore skip
 	elif [ "$count" != "$mpc_active" ]; then
 		rc=${KSFT_FAIL}
 		print_check "$ns mpc active fallback"
@@ -1491,7 +1491,7 @@ chk_fallback_nr()
 
 	count=$(mptcp_lib_get_counter ${!ns} "MPTcpExtMPCapableDataFallback")
 	if [ -z "$count" ]; then
-		rc=${KSFT_SKIP}
+		: # ignore skip
 	elif [ "$count" != "$mpc_data" ]; then
 		rc=${KSFT_FAIL}
 		print_check "$ns mpc data fallback"
@@ -1500,7 +1500,7 @@ chk_fallback_nr()
 
 	count=$(mptcp_lib_get_counter ${!ns} "MPTcpExtMD5SigFallback")
 	if [ -z "$count" ]; then
-		rc=${KSFT_SKIP}
+		: # ignore skip
 	elif [ "$count" != "$md5_sig" ]; then
 		rc=${KSFT_FAIL}
 		print_check "$ns MD5 Sig fallback"
@@ -1509,7 +1509,7 @@ chk_fallback_nr()
 
 	count=$(mptcp_lib_get_counter ${!ns} "MPTcpExtDssFallback")
 	if [ -z "$count" ]; then
-		rc=${KSFT_SKIP}
+		: # ignore skip
 	elif [ "$count" != "$dss" ]; then
 		rc=${KSFT_FAIL}
 		print_check "$ns dss fallback"
@@ -1585,7 +1585,7 @@ chk_join_nr()
 
 	count=$(mptcp_lib_get_counter ${ns2} "MPTcpExtMPJoinSynAckHMacFailure")
 	if [ -z "$count" ]; then
-		rc=${KSFT_SKIP}
+		: # ignore skip
 	elif [ "$count" != "0" ]; then
 		rc=${KSFT_FAIL}
 		print_check "synack HMAC"
@@ -1594,7 +1594,7 @@ chk_join_nr()
 
 	count=$(mptcp_lib_get_counter ${ns1} "MPTcpExtMPJoinAckRx")
 	if [ -z "$count" ]; then
-		rc=${KSFT_SKIP}
+		: # ignore skip
 	elif [ "$count" != "$ack_nr" ]; then
 		rc=${KSFT_FAIL}
 		print_check "ack rx"
@@ -1603,7 +1603,7 @@ chk_join_nr()
 
 	count=$(mptcp_lib_get_counter ${ns1} "MPTcpExtMPJoinAckHMacFailure")
 	if [ -z "$count" ]; then
-		rc=${KSFT_SKIP}
+		: # ignore skip
 	elif [ "$count" != "0" ]; then
 		rc=${KSFT_FAIL}
 		print_check "ack HMAC"
@@ -1612,7 +1612,7 @@ chk_join_nr()
 
 	count=$(mptcp_lib_get_counter ${ns1} "MPTcpExtMPJoinRejected")
 	if [ -z "$count" ]; then
-		rc=${KSFT_SKIP}
+		: # ignore skip
 	elif [ "$count" != "$syn_rej" ]; then
 		rc=${KSFT_FAIL}
 		print_check "syn rejected"
-- 
cgit v1.2.3


From 5d3ae80b4dc43d1c49f5ab6e9835ae5fc9ac5d37 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Sun, 1 Feb 2026 11:10:44 +0800
Subject: selftests: ublk: organize test directories by test ID

Set UBLK_TEST_DIR to ${TMPDIR:-./ublktest-dir}/${TID}.XXXXXX to create
per-test subdirectories organized by test ID. This makes it easier to
identify and debug specific test runs.

Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 tools/testing/selftests/ublk/test_common.sh | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/ublk/test_common.sh b/tools/testing/selftests/ublk/test_common.sh
index c3afd00783a2..163a40007910 100755
--- a/tools/testing/selftests/ublk/test_common.sh
+++ b/tools/testing/selftests/ublk/test_common.sh
@@ -129,7 +129,9 @@ _prep_test() {
 	local type=$1
 	shift 1
 	modprobe ublk_drv > /dev/null 2>&1
-	UBLK_TEST_DIR=$(mktemp -d ${TMPDIR:-.}/ublktest-dir.XXXXXX)
+	local base_dir=${TMPDIR:-./ublktest-dir}
+	mkdir -p "$base_dir"
+	UBLK_TEST_DIR=$(mktemp -d ${base_dir}/${TID}.XXXXXX)
 	UBLK_TMP=$(mktemp ${UBLK_TEST_DIR}/ublk_test_XXXXX)
 	[ "$UBLK_TEST_QUIET" -eq 0 ] && echo "ublk $type: $*"
 	echo "ublk selftest: $TID starting at $(date '+%F %T')" | tee /dev/kmsg
-- 
cgit v1.2.3


From 0a35bd285f43c26ccec33872fc6bb679069eaea8 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Mon, 2 Feb 2026 18:43:10 +0000
Subject: arm64: Convert SCTLR_EL2 to sysreg infrastructure

Convert SCTLR_EL2 to the sysreg infrastructure, as per the 2025-12_rel
revision of the Registers.json file.

Note that we slightly deviate from the above, as we stick to the ARM
ARM M.a definition of SCTLR_EL2[9], which is RES0, in order to avoid
dragging the POE2 definitions...

Reviewed-by: Fuad Tabba <tabba@google.com>
Tested-by: Fuad Tabba <tabba@google.com>
Link: https://patch.msgid.link/20260202184329.2724080-2-maz@kernel.org
Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 arch/arm64/include/asm/sysreg.h       |  7 ----
 arch/arm64/tools/sysreg               | 69 +++++++++++++++++++++++++++++++++++
 tools/arch/arm64/include/asm/sysreg.h |  6 ---
 3 files changed, 69 insertions(+), 13 deletions(-)

(limited to 'tools')

diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
index 939f9c5bbae6..30f0409b1c80 100644
--- a/arch/arm64/include/asm/sysreg.h
+++ b/arch/arm64/include/asm/sysreg.h
@@ -504,7 +504,6 @@
 #define SYS_VPIDR_EL2			sys_reg(3, 4, 0, 0, 0)
 #define SYS_VMPIDR_EL2			sys_reg(3, 4, 0, 0, 5)
 
-#define SYS_SCTLR_EL2			sys_reg(3, 4, 1, 0, 0)
 #define SYS_ACTLR_EL2			sys_reg(3, 4, 1, 0, 1)
 #define SYS_SCTLR2_EL2			sys_reg(3, 4, 1, 0, 3)
 #define SYS_HCR_EL2			sys_reg(3, 4, 1, 1, 0)
@@ -837,12 +836,6 @@
 #define SCTLR_ELx_A	 (BIT(1))
 #define SCTLR_ELx_M	 (BIT(0))
 
-/* SCTLR_EL2 specific flags. */
-#define SCTLR_EL2_RES1	((BIT(4))  | (BIT(5))  | (BIT(11)) | (BIT(16)) | \
-			 (BIT(18)) | (BIT(22)) | (BIT(23)) | (BIT(28)) | \
-			 (BIT(29)))
-
-#define SCTLR_EL2_BT	(BIT(36))
 #ifdef CONFIG_CPU_BIG_ENDIAN
 #define ENDIAN_SET_EL2		SCTLR_ELx_EE
 #else
diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg
index a0f6249bd4f9..969a75615d61 100644
--- a/arch/arm64/tools/sysreg
+++ b/arch/arm64/tools/sysreg
@@ -3749,6 +3749,75 @@ UnsignedEnum	2:0	F8S1
 EndEnum
 EndSysreg
 
+Sysreg	SCTLR_EL2	3	4	1	0	0
+Field	63	TIDCP
+Field	62	SPINTMASK
+Field	61	NMI
+Field	60	EnTP2
+Field	59	TCSO
+Field	58	TCSO0
+Field	57	EPAN
+Field	56	EnALS
+Field	55	EnAS0
+Field	54	EnASR
+Res0	53:50
+Field	49:46	TWEDEL
+Field	45	TWEDEn
+Field	44	DSSBS
+Field	43	ATA
+Field	42	ATA0
+Enum	41:40	TCF
+	0b00	NONE
+	0b01	SYNC
+	0b10	ASYNC
+	0b11	ASYMM
+EndEnum
+Enum	39:38	TCF0
+	0b00	NONE
+	0b01	SYNC
+	0b10	ASYNC
+	0b11	ASYMM
+EndEnum
+Field	37	ITFSB
+Field	36	BT
+Field	35	BT0
+Field	34	EnFPM
+Field	33	MSCEn
+Field	32	CMOW
+Field	31	EnIA
+Field	30	EnIB
+Field	29	LSMAOE
+Field	28	nTLSMD
+Field	27	EnDA
+Field	26	UCI
+Field	25	EE
+Field	24	E0E
+Field	23	SPAN
+Field	22	EIS
+Field	21	IESB
+Field	20	TSCXT
+Field	19	WXN
+Field	18	nTWE
+Res0	17
+Field	16	nTWI
+Field	15	UCT
+Field	14	DZE
+Field	13	EnDB
+Field	12	I
+Field	11	EOS
+Field	10	EnRCTX
+Res0	9
+Field	8	SED
+Field	7	ITD
+Field	6	nAA
+Field	5	CP15BEN
+Field	4	SA0
+Field	3	SA
+Field	2	C
+Field	1	A
+Field	0	M
+EndSysreg
+
 Sysreg	HCR_EL2		3	4	1	1	0
 Field	63:60	TWEDEL
 Field	59	TWEDEn
diff --git a/tools/arch/arm64/include/asm/sysreg.h b/tools/arch/arm64/include/asm/sysreg.h
index 178b7322bf04..f75efe98e9df 100644
--- a/tools/arch/arm64/include/asm/sysreg.h
+++ b/tools/arch/arm64/include/asm/sysreg.h
@@ -847,12 +847,6 @@
 #define SCTLR_ELx_A	 (BIT(1))
 #define SCTLR_ELx_M	 (BIT(0))
 
-/* SCTLR_EL2 specific flags. */
-#define SCTLR_EL2_RES1	((BIT(4))  | (BIT(5))  | (BIT(11)) | (BIT(16)) | \
-			 (BIT(18)) | (BIT(22)) | (BIT(23)) | (BIT(28)) | \
-			 (BIT(29)))
-
-#define SCTLR_EL2_BT	(BIT(36))
 #ifdef CONFIG_CPU_BIG_ENDIAN
 #define ENDIAN_SET_EL2		SCTLR_ELx_EE
 #else
-- 
cgit v1.2.3


From d65bf6e317e7bb13612bd94e01c5a11b6fc67e9d Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Mon, 2 Feb 2026 18:43:26 +0000
Subject: KVM: arm64: Remove all traces of FEAT_TME

FEAT_TME has been dropped from the architecture. Retrospectively.
I'm sure someone is crying somewhere, but most of us won't.

Clean-up time.

Reviewed-by: Fuad Tabba <tabba@google.com>
Tested-by: Fuad Tabba <tabba@google.com>
Link: https://patch.msgid.link/20260202184329.2724080-18-maz@kernel.org
Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 arch/arm64/kvm/config.c                         |  7 -------
 arch/arm64/kvm/nested.c                         |  5 -----
 arch/arm64/tools/sysreg                         | 12 +++---------
 tools/perf/Documentation/perf-arm-spe.txt       |  1 -
 tools/testing/selftests/kvm/arm64/set_id_regs.c |  1 -
 5 files changed, 3 insertions(+), 23 deletions(-)

(limited to 'tools')

diff --git a/arch/arm64/kvm/config.c b/arch/arm64/kvm/config.c
index b37b40744db9..c1b76a76a5e4 100644
--- a/arch/arm64/kvm/config.c
+++ b/arch/arm64/kvm/config.c
@@ -187,7 +187,6 @@ struct reg_feat_map_desc {
 #define FEAT_RME		ID_AA64PFR0_EL1, RME, IMP
 #define FEAT_MPAM		ID_AA64PFR0_EL1, MPAM, 1
 #define FEAT_S2FWB		ID_AA64MMFR2_EL1, FWB, IMP
-#define FEAT_TME		ID_AA64ISAR0_EL1, TME, IMP
 #define FEAT_TWED		ID_AA64MMFR1_EL1, TWED, IMP
 #define FEAT_E2H0		ID_AA64MMFR4_EL1, E2H0, IMP
 #define FEAT_SRMASK		ID_AA64MMFR4_EL1, SRMASK, IMP
@@ -991,7 +990,6 @@ static const struct reg_bits_to_feat_map hcr_feat_map[] = {
 	NEEDS_FEAT(HCR_EL2_FIEN, feat_rasv1p1),
 	NEEDS_FEAT(HCR_EL2_GPF, FEAT_RME),
 	NEEDS_FEAT(HCR_EL2_FWB, FEAT_S2FWB),
-	NEEDS_FEAT(HCR_EL2_TME, FEAT_TME),
 	NEEDS_FEAT(HCR_EL2_TWEDEL	|
 		   HCR_EL2_TWEDEn,
 		   FEAT_TWED),
@@ -1102,11 +1100,6 @@ static const struct reg_bits_to_feat_map sctlr_el1_feat_map[] = {
 	NEEDS_FEAT(SCTLR_EL1_EnRCTX, FEAT_SPECRES),
 	NEEDS_FEAT(SCTLR_EL1_DSSBS, FEAT_SSBS),
 	NEEDS_FEAT(SCTLR_EL1_TIDCP, FEAT_TIDCP1),
-	NEEDS_FEAT(SCTLR_EL1_TME0	|
-		   SCTLR_EL1_TME	|
-		   SCTLR_EL1_TMT0	|
-		   SCTLR_EL1_TMT,
-		   FEAT_TME),
 	NEEDS_FEAT(SCTLR_EL1_TWEDEL	|
 		   SCTLR_EL1_TWEDEn,
 		   FEAT_TWED),
diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c
index 75a23f1c56d1..96e899dbd919 100644
--- a/arch/arm64/kvm/nested.c
+++ b/arch/arm64/kvm/nested.c
@@ -1505,11 +1505,6 @@ u64 limit_nv_id_reg(struct kvm *kvm, u32 reg, u64 val)
 	u64 orig_val = val;
 
 	switch (reg) {
-	case SYS_ID_AA64ISAR0_EL1:
-		/* Support everything but TME */
-		val &= ~ID_AA64ISAR0_EL1_TME;
-		break;
-
 	case SYS_ID_AA64ISAR1_EL1:
 		/* Support everything but LS64 and Spec Invalidation */
 		val &= ~(ID_AA64ISAR1_EL1_LS64	|
diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg
index 969a75615d61..650d7d477087 100644
--- a/arch/arm64/tools/sysreg
+++ b/arch/arm64/tools/sysreg
@@ -1856,10 +1856,7 @@ UnsignedEnum	31:28	RDM
 	0b0000	NI
 	0b0001	IMP
 EndEnum
-UnsignedEnum	27:24	TME
-	0b0000	NI
-	0b0001	IMP
-EndEnum
+Res0	27:24
 UnsignedEnum	23:20	ATOMIC
 	0b0000	NI
 	0b0010	IMP
@@ -2432,10 +2429,7 @@ Field	57	EPAN
 Field	56	EnALS
 Field	55	EnAS0
 Field	54	EnASR
-Field	53	TME
-Field	52	TME0
-Field	51	TMT
-Field	50	TMT0
+Res0	53:50
 Field	49:46	TWEDEL
 Field	45	TWEDEn
 Field	44	DSSBS
@@ -3840,7 +3834,7 @@ Field	43	NV1
 Field	42	NV
 Field	41	API
 Field	40	APK
-Field	39	TME
+Res0	39
 Field	38	MIOCNCE
 Field	37	TEA
 Field	36	TERR
diff --git a/tools/perf/Documentation/perf-arm-spe.txt b/tools/perf/Documentation/perf-arm-spe.txt
index 8b02e5b983fa..201a82bec0de 100644
--- a/tools/perf/Documentation/perf-arm-spe.txt
+++ b/tools/perf/Documentation/perf-arm-spe.txt
@@ -176,7 +176,6 @@ and inv_event_filter are:
   bit 10    - Remote access (FEAT_SPEv1p4)
   bit 11    - Misaligned access (FEAT_SPEv1p1)
   bit 12-15 - IMPLEMENTATION DEFINED events (when implemented)
-  bit 16    - Transaction (FEAT_TME)
   bit 17    - Partial or empty SME or SVE predicate (FEAT_SPEv1p1)
   bit 18    - Empty SME or SVE predicate (FEAT_SPEv1p1)
   bit 19    - L2D access (FEAT_SPEv1p4)
diff --git a/tools/testing/selftests/kvm/arm64/set_id_regs.c b/tools/testing/selftests/kvm/arm64/set_id_regs.c
index c4815d365816..73de5be58bab 100644
--- a/tools/testing/selftests/kvm/arm64/set_id_regs.c
+++ b/tools/testing/selftests/kvm/arm64/set_id_regs.c
@@ -91,7 +91,6 @@ static const struct reg_ftr_bits ftr_id_aa64isar0_el1[] = {
 	REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR0_EL1, SM3, 0),
 	REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR0_EL1, SHA3, 0),
 	REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR0_EL1, RDM, 0),
-	REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR0_EL1, TME, 0),
 	REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR0_EL1, ATOMIC, 0),
 	REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR0_EL1, CRC32, 0),
 	REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR0_EL1, SHA2, 0),
-- 
cgit v1.2.3


From 1cadf2819bc91ab5cb060ec3ce473bae30c9e52d Mon Sep 17 00:00:00 2001
From: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
Date: Thu, 5 Feb 2026 09:46:25 +0900
Subject: bootconfig: Terminate value search if it hits a newline

Terminate the value search for a key if it hits a newline and make
the value empty.

When we pass a bootconfig with an empty value terminated by the
newline, like below::

  foo =
  bar = value

Current bootconfig interprets it as a single entry::

  foo = "bar = value";

The Documentation/admin-guide/bootconfig.rst defines the value
itself is terminated by newline:

  The value has to be terminated by semi-colon (``;``) or newline (``\n``).

but it does not define when the value search is terminated.
This changes the behavior to be more line-oriented, so that it is
clearer in how it works.

- The value search of key-value pair will be terminated by a comment
  or newline.
- The value search of an array will continue beyond comments and
  newlines.

Thus, with this update, the above example is interpreted as::

  foo = "";
  bar = "value";

And the below example will cause a syntax error because "bar" is expected
as a key but it has ','.

  foo =
    bar, buz

According to this change, one wrong example config is updated.

Link: https://lore.kernel.org/all/177025238503.14982.17059549076175612447.stgit@devnote2/

Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Reviewed-by: Julius Werner <jwerner@chromium.org>
---
 Documentation/admin-guide/bootconfig.rst           | 24 ++++++++++++-------
 lib/bootconfig.c                                   | 27 +++++++++++++++-------
 .../samples/bad-array-after-comment.bconf          |  4 ++++
 .../samples/bad-array-in-next-line.bconf           |  4 ++++
 .../samples/good-array-space-comment.bconf         |  3 +--
 5 files changed, 44 insertions(+), 18 deletions(-)
 create mode 100644 tools/bootconfig/samples/bad-array-after-comment.bconf
 create mode 100644 tools/bootconfig/samples/bad-array-in-next-line.bconf

(limited to 'tools')

diff --git a/Documentation/admin-guide/bootconfig.rst b/Documentation/admin-guide/bootconfig.rst
index 7a86042c9b6d..f712758472d5 100644
--- a/Documentation/admin-guide/bootconfig.rst
+++ b/Documentation/admin-guide/bootconfig.rst
@@ -20,18 +20,26 @@ Config File Syntax
 
 The boot config syntax is a simple structured key-value. Each key consists
 of dot-connected-words, and key and value are connected by ``=``. The value
-has to be terminated by semi-colon (``;``) or newline (``\n``).
-For array value, array entries are separated by comma (``,``). ::
-
-  KEY[.WORD[...]] = VALUE[, VALUE2[...]][;]
-
-Unlike the kernel command line syntax, spaces are OK around the comma and ``=``.
+string has to be terminated by the following delimiters described below.
 
 Each key word must contain only alphabets, numbers, dash (``-``) or underscore
 (``_``). And each value only contains printable characters or spaces except
 for delimiters such as semi-colon (``;``), new-line (``\n``), comma (``,``),
 hash (``#``) and closing brace (``}``).
 
+If the ``=`` is followed by whitespace up to one of these delimiters, the
+key is assigned an empty value.
+
+For arrays, the array values are comma (``,``) separated, and comments and
+line breaks with newline (``\n``) are allowed between array values for
+readability. Thus the first entry of the array must be on the same line as
+the key.::
+
+  KEY[.WORD[...]] = VALUE[, VALUE2[...]][;]
+
+Unlike the kernel command line syntax, white spaces (including tabs) are
+ignored around the comma and ``=``.
+
 If you want to use those delimiters in a value, you can use either double-
 quotes (``"VALUE"``) or single-quotes (``'VALUE'``) to quote it. Note that
 you can not escape these quotes.
@@ -138,8 +146,8 @@ This is parsed as below::
  foo = value
  bar = 1, 2, 3
 
-Note that you can not put a comment between value and delimiter(``,`` or
-``;``). This means following config has a syntax error ::
+Note that you can NOT put a comment or a newline between value and delimiter
+(``,`` or ``;``). This means following config has a syntax error ::
 
  key = 1 # comment
        ,2
diff --git a/lib/bootconfig.c b/lib/bootconfig.c
index 81f29c29f47b..449369a60846 100644
--- a/lib/bootconfig.c
+++ b/lib/bootconfig.c
@@ -557,17 +557,13 @@ static int __init __xbc_close_brace(char *p)
 /*
  * Return delimiter or error, no node added. As same as lib/cmdline.c,
  * you can use " around spaces, but can't escape " for value.
+ * *@__v must point real value string. (not including spaces before value.)
  */
 static int __init __xbc_parse_value(char **__v, char **__n)
 {
 	char *p, *v = *__v;
 	int c, quotes = 0;
 
-	v = skip_spaces(v);
-	while (*v == '#') {
-		v = skip_comment(v);
-		v = skip_spaces(v);
-	}
 	if (*v == '"' || *v == '\'') {
 		quotes = *v;
 		v++;
@@ -617,6 +613,13 @@ static int __init xbc_parse_array(char **__v)
 		last_parent = xbc_node_get_child(last_parent);
 
 	do {
+		/* Search the next array value beyond comments and empty lines */
+		next = skip_spaces(*__v);
+		while (*next == '#') {
+			next = skip_comment(next);
+			next = skip_spaces(next);
+		}
+		*__v = next;
 		c = __xbc_parse_value(__v, &next);
 		if (c < 0)
 			return c;
@@ -701,9 +704,17 @@ static int __init xbc_parse_kv(char **k, char *v, int op)
 	if (ret)
 		return ret;
 
-	c = __xbc_parse_value(&v, &next);
-	if (c < 0)
-		return c;
+	v = skip_spaces_until_newline(v);
+	/* If there is a comment, this has an empty value. */
+	if (*v == '#') {
+		next = skip_comment(v);
+		*v = '\0';
+		c = '\n';
+	} else {
+		c = __xbc_parse_value(&v, &next);
+		if (c < 0)
+			return c;
+	}
 
 	child = xbc_node_get_child(last_parent);
 	if (child && xbc_node_is_value(child)) {
diff --git a/tools/bootconfig/samples/bad-array-after-comment.bconf b/tools/bootconfig/samples/bad-array-after-comment.bconf
new file mode 100644
index 000000000000..fdb6d4e04447
--- /dev/null
+++ b/tools/bootconfig/samples/bad-array-after-comment.bconf
@@ -0,0 +1,4 @@
+# the first array value must be on the same line as the key
+key = # comment
+ value1,
+ value2
diff --git a/tools/bootconfig/samples/bad-array-in-next-line.bconf b/tools/bootconfig/samples/bad-array-in-next-line.bconf
new file mode 100644
index 000000000000..95a99a3bde8c
--- /dev/null
+++ b/tools/bootconfig/samples/bad-array-in-next-line.bconf
@@ -0,0 +1,4 @@
+# the first array value must be on the same line as the key
+key =
+  value1,
+  value2
diff --git a/tools/bootconfig/samples/good-array-space-comment.bconf b/tools/bootconfig/samples/good-array-space-comment.bconf
index 45b938dc0695..416fa2ed4109 100644
--- a/tools/bootconfig/samples/good-array-space-comment.bconf
+++ b/tools/bootconfig/samples/good-array-space-comment.bconf
@@ -1,4 +1,3 @@
-key =	# comment
-	"value1",	  # comment1
+key = "value1",	  # comment1
 	"value2"	 , # comment2
 	"value3"
-- 
cgit v1.2.3


From 8c5d862fcb2116ebf5ce762a82db827a38a7d8ee Mon Sep 17 00:00:00 2001
From: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
Date: Thu, 5 Feb 2026 09:46:35 +0900
Subject: bootconfig: Check the parsed output of the good examples

Check whether the parsed output of the good example configs are
the same as expected.

Link: https://lore.kernel.org/all/177025239529.14982.12913754615993262263.stgit@devnote2/

Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Tested-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 tools/bootconfig/samples/exp-good-array-space-comment.bconf | 1 +
 tools/bootconfig/samples/exp-good-comment-after-value.bconf | 1 +
 tools/bootconfig/samples/exp-good-mixed-append.bconf        | 2 ++
 tools/bootconfig/samples/exp-good-mixed-kv1.bconf           | 2 ++
 tools/bootconfig/samples/exp-good-mixed-kv2.bconf           | 2 ++
 tools/bootconfig/samples/exp-good-mixed-kv3.bconf           | 5 +++++
 tools/bootconfig/samples/exp-good-mixed-override.bconf      | 2 ++
 tools/bootconfig/samples/exp-good-override.bconf            | 4 ++++
 tools/bootconfig/samples/exp-good-printables.bconf          | 2 ++
 tools/bootconfig/samples/exp-good-simple.bconf              | 8 ++++++++
 tools/bootconfig/samples/exp-good-single.bconf              | 3 +++
 tools/bootconfig/samples/exp-good-space-after-value.bconf   | 1 +
 tools/bootconfig/samples/exp-good-tree.bconf                | 8 ++++++++
 tools/bootconfig/test-bootconfig.sh                         | 3 +++
 14 files changed, 44 insertions(+)
 create mode 100644 tools/bootconfig/samples/exp-good-array-space-comment.bconf
 create mode 100644 tools/bootconfig/samples/exp-good-comment-after-value.bconf
 create mode 100644 tools/bootconfig/samples/exp-good-mixed-append.bconf
 create mode 100644 tools/bootconfig/samples/exp-good-mixed-kv1.bconf
 create mode 100644 tools/bootconfig/samples/exp-good-mixed-kv2.bconf
 create mode 100644 tools/bootconfig/samples/exp-good-mixed-kv3.bconf
 create mode 100644 tools/bootconfig/samples/exp-good-mixed-override.bconf
 create mode 100644 tools/bootconfig/samples/exp-good-override.bconf
 create mode 100644 tools/bootconfig/samples/exp-good-printables.bconf
 create mode 100644 tools/bootconfig/samples/exp-good-simple.bconf
 create mode 100644 tools/bootconfig/samples/exp-good-single.bconf
 create mode 100644 tools/bootconfig/samples/exp-good-space-after-value.bconf
 create mode 100644 tools/bootconfig/samples/exp-good-tree.bconf

(limited to 'tools')

diff --git a/tools/bootconfig/samples/exp-good-array-space-comment.bconf b/tools/bootconfig/samples/exp-good-array-space-comment.bconf
new file mode 100644
index 000000000000..8d3278fa6af5
--- /dev/null
+++ b/tools/bootconfig/samples/exp-good-array-space-comment.bconf
@@ -0,0 +1 @@
+key = "value1", "value2", "value3";
diff --git a/tools/bootconfig/samples/exp-good-comment-after-value.bconf b/tools/bootconfig/samples/exp-good-comment-after-value.bconf
new file mode 100644
index 000000000000..a8e8450db3c0
--- /dev/null
+++ b/tools/bootconfig/samples/exp-good-comment-after-value.bconf
@@ -0,0 +1 @@
+key = "value";
diff --git a/tools/bootconfig/samples/exp-good-mixed-append.bconf b/tools/bootconfig/samples/exp-good-mixed-append.bconf
new file mode 100644
index 000000000000..c2b407901ddd
--- /dev/null
+++ b/tools/bootconfig/samples/exp-good-mixed-append.bconf
@@ -0,0 +1,2 @@
+key = "foo", "bar";
+keyx.subkey = "value";
diff --git a/tools/bootconfig/samples/exp-good-mixed-kv1.bconf b/tools/bootconfig/samples/exp-good-mixed-kv1.bconf
new file mode 100644
index 000000000000..8346287d9251
--- /dev/null
+++ b/tools/bootconfig/samples/exp-good-mixed-kv1.bconf
@@ -0,0 +1,2 @@
+key = "value";
+key.subkey = "another-value";
diff --git a/tools/bootconfig/samples/exp-good-mixed-kv2.bconf b/tools/bootconfig/samples/exp-good-mixed-kv2.bconf
new file mode 100644
index 000000000000..40c6232c7cdd
--- /dev/null
+++ b/tools/bootconfig/samples/exp-good-mixed-kv2.bconf
@@ -0,0 +1,2 @@
+key = "another-value";
+key.subkey = "value";
diff --git a/tools/bootconfig/samples/exp-good-mixed-kv3.bconf b/tools/bootconfig/samples/exp-good-mixed-kv3.bconf
new file mode 100644
index 000000000000..8368a7bef60a
--- /dev/null
+++ b/tools/bootconfig/samples/exp-good-mixed-kv3.bconf
@@ -0,0 +1,5 @@
+key = "value";
+key {
+	subkey1;
+	subkey2 = "foo";
+}
diff --git a/tools/bootconfig/samples/exp-good-mixed-override.bconf b/tools/bootconfig/samples/exp-good-mixed-override.bconf
new file mode 100644
index 000000000000..58757712ca45
--- /dev/null
+++ b/tools/bootconfig/samples/exp-good-mixed-override.bconf
@@ -0,0 +1,2 @@
+key = "value2";
+key.foo = "bar";
diff --git a/tools/bootconfig/samples/exp-good-override.bconf b/tools/bootconfig/samples/exp-good-override.bconf
new file mode 100644
index 000000000000..00bbd30e99ae
--- /dev/null
+++ b/tools/bootconfig/samples/exp-good-override.bconf
@@ -0,0 +1,4 @@
+key {
+	word = "2", "3";
+	new.word = "new";
+}
diff --git a/tools/bootconfig/samples/exp-good-printables.bconf b/tools/bootconfig/samples/exp-good-printables.bconf
new file mode 100644
index 000000000000..5981d304eacb
--- /dev/null
+++ b/tools/bootconfig/samples/exp-good-printables.bconf
@@ -0,0 +1,2 @@
+key = "	
+ !#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~";
diff --git a/tools/bootconfig/samples/exp-good-simple.bconf b/tools/bootconfig/samples/exp-good-simple.bconf
new file mode 100644
index 000000000000..d17f39421c86
--- /dev/null
+++ b/tools/bootconfig/samples/exp-good-simple.bconf
@@ -0,0 +1,8 @@
+key {
+	word1 = "1";
+	word2 = "2";
+	word3 = "3";
+	word4 = "4";
+	word5 = "5";
+	word6 = "6";
+}
diff --git a/tools/bootconfig/samples/exp-good-single.bconf b/tools/bootconfig/samples/exp-good-single.bconf
new file mode 100644
index 000000000000..01196910d7f4
--- /dev/null
+++ b/tools/bootconfig/samples/exp-good-single.bconf
@@ -0,0 +1,3 @@
+key = "1";
+key2 = "2";
+key3 = "alpha", "beta";
diff --git a/tools/bootconfig/samples/exp-good-space-after-value.bconf b/tools/bootconfig/samples/exp-good-space-after-value.bconf
new file mode 100644
index 000000000000..a8e8450db3c0
--- /dev/null
+++ b/tools/bootconfig/samples/exp-good-space-after-value.bconf
@@ -0,0 +1 @@
+key = "value";
diff --git a/tools/bootconfig/samples/exp-good-tree.bconf b/tools/bootconfig/samples/exp-good-tree.bconf
new file mode 100644
index 000000000000..b711d38d86fd
--- /dev/null
+++ b/tools/bootconfig/samples/exp-good-tree.bconf
@@ -0,0 +1,8 @@
+key {
+	word.tree.value = "0";
+	word2.tree.value = "1", "2";
+}
+other.tree {
+	value = "2";
+	value2 = "3";
+}
diff --git a/tools/bootconfig/test-bootconfig.sh b/tools/bootconfig/test-bootconfig.sh
index 7594659af1e1..be9bd18b1d56 100755
--- a/tools/bootconfig/test-bootconfig.sh
+++ b/tools/bootconfig/test-bootconfig.sh
@@ -179,6 +179,9 @@ done
 echo "=== expected success cases ==="
 for i in samples/good-* ; do
   xpass $BOOTCONF -a $i $INITRD
+  x="samples/exp-"`basename $i`
+  $BOOTCONF $i > $TEMPCONF
+  xpass diff $x $TEMPCONF
 done
 
 
-- 
cgit v1.2.3


From b525fcaf0a76507f152d58c6f9e5ef67b3ff552c Mon Sep 17 00:00:00 2001
From: Petr Pavlu <petr.pavlu@suse.com>
Date: Fri, 23 Jan 2026 11:26:57 +0100
Subject: livepatch: Free klp_{object,func}_ext data after initialization

The klp_object_ext and klp_func_ext data, which are stored in the
__klp_objects and __klp_funcs sections, respectively, are not needed
after they are used to create the actual klp_object and klp_func
instances. This operation is implemented by the init function in
scripts/livepatch/init.c.

Prefix the two sections with ".init" so they are freed after the module
is initializated.

Signed-off-by: Petr Pavlu <petr.pavlu@suse.com>
Acked-by: Joe Lawrence <joe.lawrence@redhat.com>
Acked-by: Miroslav Benes <mbenes@suse.cz>
Reviewed-by: Aaron Tomlin <atomlin@atomlin.com>
Link: https://patch.msgid.link/20260123102825.3521961-3-petr.pavlu@suse.com
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
---
 scripts/livepatch/init.c            |  2 +-
 scripts/module.lds.S                |  4 ++--
 tools/objtool/check.c               |  2 +-
 tools/objtool/include/objtool/klp.h | 10 +++++-----
 tools/objtool/klp-diff.c            |  2 +-
 5 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'tools')

diff --git a/scripts/livepatch/init.c b/scripts/livepatch/init.c
index 9e315fc857bd..638c95cffe76 100644
--- a/scripts/livepatch/init.c
+++ b/scripts/livepatch/init.c
@@ -19,7 +19,7 @@ static int __init livepatch_mod_init(void)
 	unsigned int nr_objs;
 	int ret;
 
-	obj_exts = klp_find_section_by_name(THIS_MODULE, "__klp_objects",
+	obj_exts = klp_find_section_by_name(THIS_MODULE, ".init.klp_objects",
 					    &obj_exts_sec_size);
 	nr_objs = obj_exts_sec_size / sizeof(*obj_exts);
 	if (!nr_objs) {
diff --git a/scripts/module.lds.S b/scripts/module.lds.S
index 383d19beffb4..054ef99e8288 100644
--- a/scripts/module.lds.S
+++ b/scripts/module.lds.S
@@ -34,8 +34,8 @@ SECTIONS {
 
 	__patchable_function_entries : { *(__patchable_function_entries) }
 
-	__klp_funcs		0: ALIGN(8) { KEEP(*(__klp_funcs)) }
-	__klp_objects		0: ALIGN(8) { KEEP(*(__klp_objects)) }
+	.init.klp_funcs		0 : ALIGN(8) { KEEP(*(.init.klp_funcs)) }
+	.init.klp_objects	0 : ALIGN(8) { KEEP(*(.init.klp_objects)) }
 
 #ifdef CONFIG_ARCH_USES_CFI_TRAPS
 	__kcfi_traps		: { KEEP(*(.kcfi_traps)) }
diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index 3f7999317f4d..933868ee3beb 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -4761,7 +4761,7 @@ static int validate_ibt(struct objtool_file *file)
 		    !strcmp(sec->name, "__bug_table")			||
 		    !strcmp(sec->name, "__ex_table")			||
 		    !strcmp(sec->name, "__jump_table")			||
-		    !strcmp(sec->name, "__klp_funcs")			||
+		    !strcmp(sec->name, ".init.klp_funcs")		||
 		    !strcmp(sec->name, "__mcount_loc")			||
 		    !strcmp(sec->name, ".llvm.call-graph-profile")	||
 		    !strcmp(sec->name, ".llvm_bb_addr_map")		||
diff --git a/tools/objtool/include/objtool/klp.h b/tools/objtool/include/objtool/klp.h
index ad830a7ce55b..e32e5e8bc631 100644
--- a/tools/objtool/include/objtool/klp.h
+++ b/tools/objtool/include/objtool/klp.h
@@ -6,12 +6,12 @@
 #define SHN_LIVEPATCH		0xff20
 
 /*
- * __klp_objects and __klp_funcs are created by klp diff and used by the patch
- * module init code to build the klp_patch, klp_object and klp_func structs
- * needed by the livepatch API.
+ * .init.klp_objects and .init.klp_funcs are created by klp diff and used by the
+ * patch module init code to build the klp_patch, klp_object and klp_func
+ * structs needed by the livepatch API.
  */
-#define KLP_OBJECTS_SEC	"__klp_objects"
-#define KLP_FUNCS_SEC	"__klp_funcs"
+#define KLP_OBJECTS_SEC	".init.klp_objects"
+#define KLP_FUNCS_SEC	".init.klp_funcs"
 
 /*
  * __klp_relocs is an intermediate section which are created by klp diff and
diff --git a/tools/objtool/klp-diff.c b/tools/objtool/klp-diff.c
index d94531e3f64e..1e649a3eb4cd 100644
--- a/tools/objtool/klp-diff.c
+++ b/tools/objtool/klp-diff.c
@@ -1436,7 +1436,7 @@ static int clone_special_sections(struct elfs *e)
 }
 
 /*
- * Create __klp_objects and __klp_funcs sections which are intermediate
+ * Create .init.klp_objects and .init.klp_funcs sections which are intermediate
  * sections provided as input to the patch module's init code for building the
  * klp_patch, klp_object and klp_func structs for the livepatch API.
  */
-- 
cgit v1.2.3


From 18328546dd59b6adc111cf84a0ee4cdd3a867611 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@kernel.org>
Date: Mon, 2 Feb 2026 10:01:08 -0800
Subject: objtool/klp: Fix symbol correlation for orphaned local symbols

When compiling with CONFIG_LTO_CLANG_THIN, vmlinux.o has
__irf_[start|end] before the first FILE entry:

  $ readelf -sW vmlinux.o
  Symbol table '.symtab' contains 597706 entries:
     Num:    Value          Size Type    Bind   Vis      Ndx Name
       0: 0000000000000000     0 NOTYPE  LOCAL  DEFAULT  UND
       1: 0000000000000000     0 NOTYPE  LOCAL  DEFAULT   18 __irf_start
       2: 0000000000000200     0 NOTYPE  LOCAL  DEFAULT   18 __irf_end
       3: 0000000000000000     0 SECTION LOCAL  DEFAULT   17 .text
       4: 0000000000000000     0 SECTION LOCAL  DEFAULT   18 .init.ramfs

This causes klp-build warnings like:

  vmlinux.o: warning: objtool: no correlation: __irf_start
  vmlinux.o: warning: objtool: no correlation: __irf_end

The problem is that Clang LTO is stripping the initramfs_data.o FILE
symbol, causing those two symbols to be orphaned and not noticed by
klp-diff's correlation logic.  Add a loop to correlate any symbols found
before the first FILE symbol.

Fixes: dd590d4d57eb ("objtool/klp: Introduce klp diff subcommand for diffing object files")
Reported-by: Song Liu <song@kernel.org>
Acked-by: Song Liu <song@kernel.org>
Link: https://patch.msgid.link/e21ec1141fc749b5f538d7329b531c1ab63a6d1a.1770055235.git.jpoimboe@kernel.org
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
---
 tools/objtool/klp-diff.c | 39 ++++++++++++++++++++++++++++++++++-----
 1 file changed, 34 insertions(+), 5 deletions(-)

(limited to 'tools')

diff --git a/tools/objtool/klp-diff.c b/tools/objtool/klp-diff.c
index 1e649a3eb4cd..9f1f4011eb9c 100644
--- a/tools/objtool/klp-diff.c
+++ b/tools/objtool/klp-diff.c
@@ -364,11 +364,40 @@ static int correlate_symbols(struct elfs *e)
 	struct symbol *file1_sym, *file2_sym;
 	struct symbol *sym1, *sym2;
 
-	/* Correlate locals */
-	for (file1_sym = first_file_symbol(e->orig),
-	     file2_sym = first_file_symbol(e->patched); ;
-	     file1_sym = next_file_symbol(e->orig, file1_sym),
-	     file2_sym = next_file_symbol(e->patched, file2_sym)) {
+	file1_sym = first_file_symbol(e->orig);
+	file2_sym = first_file_symbol(e->patched);
+
+	/*
+	 * Correlate any locals before the first FILE symbol.  This has been
+	 * seen when LTO inexplicably strips the initramfs_data.o FILE symbol
+	 * due to the file only containing data and no code.
+	 */
+	for_each_sym(e->orig, sym1) {
+		if (sym1 == file1_sym || !is_local_sym(sym1))
+			break;
+
+		if (dont_correlate(sym1))
+			continue;
+
+		for_each_sym(e->patched, sym2) {
+			if (sym2 == file2_sym || !is_local_sym(sym2))
+				break;
+
+			if (sym2->twin || dont_correlate(sym2))
+				continue;
+
+			if (strcmp(sym1->demangled_name, sym2->demangled_name))
+				continue;
+
+			sym1->twin = sym2;
+			sym2->twin = sym1;
+			break;
+		}
+	}
+
+	/* Correlate locals after the first FILE symbol */
+	for (; ; file1_sym = next_file_symbol(e->orig, file1_sym),
+		 file2_sym = next_file_symbol(e->patched, file2_sym)) {
 
 		if (!file1_sym && file2_sym) {
 			ERROR("FILE symbol mismatch: NULL != %s", file2_sym->name);
-- 
cgit v1.2.3


From f495054bd12e2abe5068e243bdf344b704c303c6 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@kernel.org>
Date: Mon, 2 Feb 2026 11:00:17 -0800
Subject: objtool/klp: Fix unexported static call key access for manually built
 livepatch modules

Enabling CONFIG_MEM_ALLOC_PROFILING_DEBUG with CONFIG_SAMPLE_LIVEPATCH
results in the following error:

  samples/livepatch/livepatch-shadow-fix1.o: error: objtool: static_call: can't find static_call_key symbol: __SCK__WARN_trap

This is caused an extra file->klp sanity check which was added by commit
164c9201e1da ("objtool: Add base objtool support for livepatch
modules").  That check was intended to ensure that livepatch modules
built with klp-build always have full access to their static call keys.

However, it failed to account for the fact that manually built livepatch
modules (i.e., not built with klp-build) might need access to unexported
static call keys, for which read-only access is typically allowed for
modules.

While the livepatch-shadow-fix1 module doesn't explicitly use any static
calls, it does have a memory allocation, which can cause
CONFIG_MEM_ALLOC_PROFILING_DEBUG to insert a WARN() call.  And WARN() is
now an unexported static call as of commit 860238af7a33 ("x86_64/bug:
Inline the UD1").

Fix it by removing the overzealous file->klp check, restoring the
original behavior for manually built livepatch modules.

Fixes: 164c9201e1da ("objtool: Add base objtool support for livepatch modules")
Reported-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Song Liu <song@kernel.org>
Tested-by: Arnd Bergmann <arnd@arndb.de>
Link: https://patch.msgid.link/0bd3ae9a53c3d743417fe842b740a7720e2bcd1c.1770058775.git.jpoimboe@kernel.org
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
---
 tools/objtool/check.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index 933868ee3beb..ef451cd6277c 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -682,7 +682,7 @@ static int create_static_call_sections(struct objtool_file *file)
 
 		key_sym = find_symbol_by_name(file->elf, tmp);
 		if (!key_sym) {
-			if (!opts.module || file->klp) {
+			if (!opts.module) {
 				ERROR("static_call: can't find static_call_key symbol: %s", tmp);
 				return -1;
 			}
-- 
cgit v1.2.3


From 2d94a3f7088b69ae25e27fb98d7f1ef572c843f9 Mon Sep 17 00:00:00 2001
From: Bibo Mao <maobibo@loongson.cn>
Date: Fri, 6 Feb 2026 09:28:01 +0800
Subject: KVM: LoongArch: selftests: Add steal time test case

LoongArch KVM supports steal time accounting now, here add steal time
test case on LoongArch.

Signed-off-by: Bibo Mao <maobibo@loongson.cn>
Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
---
 tools/testing/selftests/kvm/Makefile.kvm |  1 +
 tools/testing/selftests/kvm/steal_time.c | 96 ++++++++++++++++++++++++++++++++
 2 files changed, 97 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm
index ba5c2b643efa..a18c00f1a4fa 100644
--- a/tools/testing/selftests/kvm/Makefile.kvm
+++ b/tools/testing/selftests/kvm/Makefile.kvm
@@ -228,6 +228,7 @@ TEST_GEN_PROGS_loongarch += kvm_page_table_test
 TEST_GEN_PROGS_loongarch += memslot_modification_stress_test
 TEST_GEN_PROGS_loongarch += memslot_perf_test
 TEST_GEN_PROGS_loongarch += set_memory_region_test
+TEST_GEN_PROGS_loongarch += steal_time
 
 SPLIT_TESTS += arch_timer
 SPLIT_TESTS += get-reg-list
diff --git a/tools/testing/selftests/kvm/steal_time.c b/tools/testing/selftests/kvm/steal_time.c
index 8edc1fca345b..7be8adfe5dd3 100644
--- a/tools/testing/selftests/kvm/steal_time.c
+++ b/tools/testing/selftests/kvm/steal_time.c
@@ -301,6 +301,102 @@ static void steal_time_dump(struct kvm_vm *vm, uint32_t vcpu_idx)
 	pr_info("\n");
 }
 
+#elif defined(__loongarch__)
+
+/* steal_time must have 64-byte alignment */
+#define STEAL_TIME_SIZE		((sizeof(struct kvm_steal_time) + 63) & ~63)
+#define KVM_STEAL_PHYS_VALID	BIT_ULL(0)
+
+struct kvm_steal_time {
+	__u64 steal;
+	__u32 version;
+	__u32 flags;
+	__u8  preempted;
+	__u8  pad[47];
+};
+
+static void check_status(struct kvm_steal_time *st)
+{
+	GUEST_ASSERT(!(READ_ONCE(st->version) & 1));
+	GUEST_ASSERT_EQ(READ_ONCE(st->flags), 0);
+	GUEST_ASSERT_EQ(READ_ONCE(st->preempted), 0);
+}
+
+static void guest_code(int cpu)
+{
+	uint32_t version;
+	struct kvm_steal_time *st = st_gva[cpu];
+
+	memset(st, 0, sizeof(*st));
+	GUEST_SYNC(0);
+
+	check_status(st);
+	WRITE_ONCE(guest_stolen_time[cpu], st->steal);
+	version = READ_ONCE(st->version);
+	check_status(st);
+	GUEST_SYNC(1);
+
+	check_status(st);
+	GUEST_ASSERT(version < READ_ONCE(st->version));
+	WRITE_ONCE(guest_stolen_time[cpu], st->steal);
+	check_status(st);
+	GUEST_DONE();
+}
+
+static bool is_steal_time_supported(struct kvm_vcpu *vcpu)
+{
+	int err;
+	uint64_t val;
+	struct kvm_device_attr attr = {
+		.group = KVM_LOONGARCH_VCPU_CPUCFG,
+		.attr = CPUCFG_KVM_FEATURE,
+		.addr = (uint64_t)&val,
+	};
+
+	err = __vcpu_ioctl(vcpu, KVM_HAS_DEVICE_ATTR, &attr);
+	if (err)
+		return false;
+
+	err = __vcpu_ioctl(vcpu, KVM_GET_DEVICE_ATTR, &attr);
+	if (err)
+		return false;
+
+	return val & BIT(KVM_FEATURE_STEAL_TIME);
+}
+
+static void steal_time_init(struct kvm_vcpu *vcpu, uint32_t i)
+{
+	int err;
+	uint64_t st_gpa;
+	struct kvm_vm *vm = vcpu->vm;
+	struct kvm_device_attr attr = {
+		.group = KVM_LOONGARCH_VCPU_PVTIME_CTRL,
+		.attr = KVM_LOONGARCH_VCPU_PVTIME_GPA,
+		.addr = (uint64_t)&st_gpa,
+	};
+
+	/* ST_GPA_BASE is identity mapped */
+	st_gva[i] = (void *)(ST_GPA_BASE + i * STEAL_TIME_SIZE);
+	sync_global_to_guest(vm, st_gva[i]);
+
+	err = __vcpu_ioctl(vcpu, KVM_HAS_DEVICE_ATTR, &attr);
+	TEST_ASSERT(err == 0, "No PV stealtime Feature");
+
+	st_gpa = (unsigned long)st_gva[i] | KVM_STEAL_PHYS_VALID;
+	err = __vcpu_ioctl(vcpu, KVM_SET_DEVICE_ATTR, &attr);
+	TEST_ASSERT(err == 0, "Fail to set PV stealtime GPA");
+}
+
+static void steal_time_dump(struct kvm_vm *vm, uint32_t vcpu_idx)
+{
+	struct kvm_steal_time *st = addr_gva2hva(vm, (ulong)st_gva[vcpu_idx]);
+
+	ksft_print_msg("VCPU%d:\n", vcpu_idx);
+	ksft_print_msg("    steal:     %lld\n", st->steal);
+	ksft_print_msg("    flags:     %d\n", st->flags);
+	ksft_print_msg("    version:   %d\n", st->version);
+	ksft_print_msg("    preempted: %d\n", st->preempted);
+}
 #endif
 
 static void *do_steal_time(void *arg)
-- 
cgit v1.2.3


From 59ecffa3995e70a675beeb870f0b3a28470428de Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 21 Jan 2026 22:52:59 +0100
Subject: selftests: netfilter: nft_queue.sh: add udp fraglist gro test case

Without the preceding patch, this fails with:

FAIL: test_udp_gro_ct: Expected udp conntrack entry
FAIL: test_udp_gro_ct: Expected software segmentation to occur, had 10 and 0

Signed-off-by: Florian Westphal <fw@strlen.de>
---
 tools/testing/selftests/net/netfilter/nft_queue.sh | 142 ++++++++++++++++++++-
 1 file changed, 136 insertions(+), 6 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/netfilter/nft_queue.sh b/tools/testing/selftests/net/netfilter/nft_queue.sh
index 6136ceec45e0..139bc1211878 100755
--- a/tools/testing/selftests/net/netfilter/nft_queue.sh
+++ b/tools/testing/selftests/net/netfilter/nft_queue.sh
@@ -510,7 +510,7 @@ EOF
 
 udp_listener_ready()
 {
-	ss -S -N "$1" -uln -o "sport = :12345" | grep -q 12345
+	ss -S -N "$1" -uln -o "sport = :$2" | grep -q "$2"
 }
 
 output_files_written()
@@ -518,7 +518,7 @@ output_files_written()
 	test -s "$1" && test -s "$2"
 }
 
-test_udp_ct_race()
+test_udp_nat_race()
 {
         ip netns exec "$nsrouter" nft -f /dev/stdin <<EOF
 flush ruleset
@@ -545,8 +545,8 @@ EOF
 	ip netns exec "$nsrouter" ./nf_queue -q 12 -d 1000 &
 	local nfqpid=$!
 
-	busywait "$BUSYWAIT_TIMEOUT" udp_listener_ready "$ns2"
-	busywait "$BUSYWAIT_TIMEOUT" udp_listener_ready "$ns3"
+	busywait "$BUSYWAIT_TIMEOUT" udp_listener_ready "$ns2" 12345
+	busywait "$BUSYWAIT_TIMEOUT" udp_listener_ready "$ns3" 12345
 	busywait "$BUSYWAIT_TIMEOUT" nf_queue_wait "$nsrouter" 12
 
 	# Send two packets, one should end up in ns1, other in ns2.
@@ -557,7 +557,7 @@ EOF
 
 	busywait 10000 output_files_written "$TMPFILE1" "$TMPFILE2"
 
-	kill "$nfqpid"
+	kill "$nfqpid" "$rpid1" "$rpid2"
 
 	if ! ip netns exec "$nsrouter" bash -c 'conntrack -L -p udp --dport 12345 2>/dev/null | wc -l | grep -q "^1"'; then
 		echo "FAIL: Expected One udp conntrack entry"
@@ -585,6 +585,135 @@ EOF
 	echo "PASS: both udp receivers got one packet each"
 }
 
+# Make sure UDPGRO aggregated packets don't lose
+# their skb->nfct entry when nfqueue passes the
+# skb to userspace with software gso segmentation on.
+test_udp_gro_ct()
+{
+	local errprefix="FAIL: test_udp_gro_ct:"
+
+	ip netns exec "$nsrouter" conntrack -F 2>/dev/null
+
+        ip netns exec "$nsrouter" nft -f /dev/stdin <<EOF
+flush ruleset
+table inet udpq {
+	# Number of packets/bytes queued to userspace
+	counter toqueue { }
+	# Number of packets/bytes reinjected from userspace with 'ct new' intact
+	counter fromqueue { }
+	# These two counters should be identical and not 0.
+
+	chain prerouting {
+		type filter hook prerouting priority -300; policy accept;
+
+		# userspace sends small packets, if < 1000, UDPGRO did
+		# not kick in, but test needs a 'new' conntrack with udpgro skb.
+		meta iifname veth0 meta l4proto udp meta length > 1000 accept
+
+		# don't pick up non-gso packets and don't queue them to
+		# userspace.
+		notrack
+	}
+
+        chain postrouting {
+		type filter hook postrouting priority 0; policy accept;
+
+		# Only queue unconfirmed fraglist gro skbs to userspace.
+		udp dport 12346 ct status ! confirmed counter name "toqueue" mark set 1 queue num 1
+        }
+
+	chain validate {
+		type filter hook postrouting priority 1; policy accept;
+		# ... and only count those that were reinjected with the
+		# skb->nfct intact.
+		mark 1 counter name "fromqueue"
+	}
+}
+EOF
+	timeout 10 ip netns exec "$ns2" socat UDP-LISTEN:12346,fork,pf=ipv4 OPEN:"$TMPFILE1",trunc &
+	local rpid=$!
+
+	ip netns exec "$nsrouter" ./nf_queue -G -c -q 1 -t 2 > "$TMPFILE2" &
+	local nfqpid=$!
+
+	ip netns exec "$nsrouter" ethtool -K "veth0" rx-udp-gro-forwarding on rx-gro-list on generic-receive-offload on
+
+	busywait "$BUSYWAIT_TIMEOUT" udp_listener_ready "$ns2" 12346
+	busywait "$BUSYWAIT_TIMEOUT" nf_queue_wait "$nsrouter" 1
+
+	local bs=512
+	local count=$(((32 * 1024 * 1024) / bs))
+	dd if=/dev/zero bs="$bs" count="$count" 2>/dev/null | for i in $(seq 1 16); do
+		timeout 5 ip netns exec "$ns1" \
+			socat -u -b 512 STDIN UDP-DATAGRAM:10.0.2.99:12346,reuseport,bind=0.0.0.0:55221 &
+	done
+
+	busywait 10000 test -s "$TMPFILE1"
+
+	kill "$rpid"
+
+	wait
+
+	local p
+	local b
+	local pqueued
+	local bqueued
+
+	c=$(ip netns exec "$nsrouter" nft list counter inet udpq "toqueue" | grep packets)
+	read p pqueued b bqueued <<EOF
+$c
+EOF
+	local preinject
+	local breinject
+	c=$(ip netns exec "$nsrouter" nft list counter inet udpq "fromqueue" | grep packets)
+	read p preinject b breinject <<EOF
+$c
+EOF
+	ip netns exec "$nsrouter" ethtool -K "veth0" rx-udp-gro-forwarding off
+	ip netns exec "$nsrouter" ethtool -K "veth1" rx-udp-gro-forwarding off
+
+	if [ "$pqueued" -eq 0 ];then
+		# happens when gro did not build at least on aggregate
+		echo "SKIP: No packets were queued"
+		return
+	fi
+
+	local saw_ct_entry=0
+	if ip netns exec "$nsrouter" bash -c 'conntrack -L -p udp --dport 12346 2>/dev/null | wc -l | grep -q "^1"'; then
+		saw_ct_entry=1
+	else
+		echo "$errprefix Expected udp conntrack entry"
+		ip netns exec "$nsrouter" conntrack -L
+		ret=1
+	fi
+
+	if [ "$pqueued" -ge "$preinject" ] ;then
+		echo "$errprefix Expected software segmentation to occur, had $pqueued and $preinject"
+		ret=1
+		return
+	fi
+
+	# sw segmentation adds extra udp and ip headers.
+	local breinject_expect=$((preinject * (512 + 20 + 8)))
+
+	if [ "$breinject" -eq "$breinject_expect" ]; then
+		if [ "$saw_ct_entry" -eq 1 ];then
+			echo "PASS: fraglist gro skb passed with conntrack entry"
+		else
+			echo "$errprefix fraglist gro skb passed without conntrack entry"
+			ret=1
+		fi
+	else
+		echo "$errprefix Counter mismatch, conntrack entry dropped by nfqueue? Queued: $pqueued, $bqueued. Post-queue: $preinject, $breinject. Expected $breinject_expect"
+		ret=1
+	fi
+
+	if ! ip netns exec "$nsrouter" nft delete table inet udpq; then
+		echo "$errprefix: Could not delete udpq table"
+		ret=1
+	fi
+}
+
 test_queue_removal()
 {
 	read tainted_then < /proc/sys/kernel/tainted
@@ -663,7 +792,8 @@ test_tcp_localhost_connectclose
 test_tcp_localhost_requeue
 test_sctp_forward
 test_sctp_output
-test_udp_ct_race
+test_udp_nat_race
+test_udp_gro_ct
 
 # should be last, adds vrf device in ns1 and changes routes
 test_icmp_vrf
-- 
cgit v1.2.3


From 1d79ae50e310092182a0a8450292ee1c2f99efcf Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 30 Jan 2026 19:21:51 +0100
Subject: selftests: netfilter: add IPV6_TUNNEL to config

The script now requires IPV6 tunnel support, enable this.
This should have caught by CI, but as the config option is missing,
the tunnel interface isn't added.  This results in an error cascade
that ends with "route change default" failure.

That in turn means the "ipv6 tunnel" test re-uses the previous
test setup so the "ip6ip6" test passes and script returns 0.

Make sure to catch such bugs, set ret=1 if device cannot be added
and delete the old default route before installing the new one.

After this change, IPV6_TUNNEL=n kernel builds fail with the expected
  FAIL: flow offload for ns1/ns2 with IP6IP6 tunnel

... while builds with IPV6_TUNNEL=m pass as before.

Fixes: 5e5180352193 ("selftests: netfilter: nft_flowtable.sh: Add IP6IP6 flowtable selftest")
Acked-by: Lorenzo Bianconi <lorenzo@kernel.org>
Signed-off-by: Florian Westphal <fw@strlen.de>
---
 tools/testing/selftests/net/netfilter/config          |  1 +
 .../testing/selftests/net/netfilter/nft_flowtable.sh  | 19 +++++++++++++------
 2 files changed, 14 insertions(+), 6 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/netfilter/config b/tools/testing/selftests/net/netfilter/config
index 12ce61fa15a8..979cff56e1f5 100644
--- a/tools/testing/selftests/net/netfilter/config
+++ b/tools/testing/selftests/net/netfilter/config
@@ -29,6 +29,7 @@ CONFIG_IP_NF_RAW=m
 CONFIG_IP_SCTP=m
 CONFIG_IPV6=y
 CONFIG_IPV6_MULTIPLE_TABLES=y
+CONFIG_IPV6_TUNNEL=m
 CONFIG_IP_VS=m
 CONFIG_IP_VS_PROTO_TCP=y
 CONFIG_IP_VS_RR=m
diff --git a/tools/testing/selftests/net/netfilter/nft_flowtable.sh b/tools/testing/selftests/net/netfilter/nft_flowtable.sh
index 14d7f67715ed..7a34ef468975 100755
--- a/tools/testing/selftests/net/netfilter/nft_flowtable.sh
+++ b/tools/testing/selftests/net/netfilter/nft_flowtable.sh
@@ -601,14 +601,19 @@ ip -net "$nsr2" link set tun0 up
 ip -net "$nsr2" addr add 192.168.100.2/24 dev tun0
 ip netns exec "$nsr2" sysctl net.ipv4.conf.tun0.forwarding=1 > /dev/null
 
-ip -net "$nsr2" link add name tun6 type ip6tnl local fee1:2::2 remote fee1:2::1
+ip -net "$nsr2" link add name tun6 type ip6tnl local fee1:2::2 remote fee1:2::1 || ret=1
 ip -net "$nsr2" link set tun6 up
 ip -net "$nsr2" addr add fee1:3::2/64 dev tun6 nodad
 
 ip -net "$nsr1" route change default via 192.168.100.2
 ip -net "$nsr2" route change default via 192.168.100.1
-ip -6 -net "$nsr1" route change default via fee1:3::2
-ip -6 -net "$nsr2" route change default via fee1:3::1
+
+# do not use "route change" and delete old default so
+# socat fails to connect in case new default can't be added.
+ip -6 -net "$nsr1" route delete default
+ip -6 -net "$nsr1" route add default via fee1:3::2
+ip -6 -net "$nsr2" route delete default
+ip -6 -net "$nsr2" route add default via fee1:3::1
 ip -net "$ns2" route add default via 10.0.2.1
 ip -6 -net "$ns2" route add default via dead:2::1
 
@@ -649,7 +654,8 @@ ip netns exec "$nsr1" nft -a insert rule inet filter forward 'meta oif tun0.10 a
 ip -net "$nsr1" link add name tun6.10 type ip6tnl local fee1:4::1 remote fee1:4::2
 ip -net "$nsr1" link set tun6.10 up
 ip -net "$nsr1" addr add fee1:5::1/64 dev tun6.10 nodad
-ip -6 -net "$nsr1" route change default via fee1:5::2
+ip -6 -net "$nsr1" route delete default
+ip -6 -net "$nsr1" route add default via fee1:5::2
 ip netns exec "$nsr1" nft -a insert rule inet filter forward 'meta oif tun6.10 accept'
 
 ip -net "$nsr2" link add link veth0 name veth0.10 type vlan id 10
@@ -664,10 +670,11 @@ ip -net "$nsr2" addr add 192.168.200.2/24 dev tun0.10
 ip -net "$nsr2" route change default via 192.168.200.1
 ip netns exec "$nsr2" sysctl net.ipv4.conf.tun0/10.forwarding=1 > /dev/null
 
-ip -net "$nsr2" link add name tun6.10 type ip6tnl local fee1:4::2 remote fee1:4::1
+ip -net "$nsr2" link add name tun6.10 type ip6tnl local fee1:4::2 remote fee1:4::1 || ret=1
 ip -net "$nsr2" link set tun6.10 up
 ip -net "$nsr2" addr add fee1:5::2/64 dev tun6.10 nodad
-ip -6 -net "$nsr2" route change default via fee1:5::1
+ip -6 -net "$nsr2" route delete default
+ip -6 -net "$nsr2" route add default via fee1:5::1
 
 if ! test_tcp_forwarding_nat "$ns1" "$ns2" 1 "IPIP tunnel over vlan"; then
 	echo "FAIL: flow offload for ns1/ns2 with IPIP tunnel over vlan" 1>&2
-- 
cgit v1.2.3


From ab2a7b7b6b8831348646688345c3209cdaee5d46 Mon Sep 17 00:00:00 2001
From: Pincheng Wang <pincheng.plct@isrc.iscas.ac.cn>
Date: Wed, 27 Aug 2025 00:29:39 +0800
Subject: KVM: riscv: selftests: add Zilsd and Zclsd extension to get-reg-list
 test

The KVM RISC-V allows Zilsd and Zclsd extensions for Guest/VM so add
this extension to get-reg-list test.

Signed-off-by: Pincheng Wang <pincheng.plct@isrc.iscas.ac.cn>
Reviewed-by: Nutty Liu <nutty.liu@hotmail.com>
Reviewed-by: Anup Patel <anup@brainfault.org>
Link: https://lore.kernel.org/r/20250826162939.1494021-6-pincheng.plct@isrc.iscas.ac.cn
Signed-off-by: Anup Patel <anup@brainfault.org>
---
 tools/testing/selftests/kvm/riscv/get-reg-list.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/riscv/get-reg-list.c b/tools/testing/selftests/kvm/riscv/get-reg-list.c
index cb54a56990a0..20cc7d9b65ed 100644
--- a/tools/testing/selftests/kvm/riscv/get-reg-list.c
+++ b/tools/testing/selftests/kvm/riscv/get-reg-list.c
@@ -78,6 +78,7 @@ bool filter_reg(__u64 reg)
 	case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZCB:
 	case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZCD:
 	case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZCF:
+	case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZCLSD:
 	case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZCMOP:
 	case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZFA:
 	case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZFBFMIN:
@@ -94,6 +95,7 @@ bool filter_reg(__u64 reg)
 	case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZIHINTNTL:
 	case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZIHINTPAUSE:
 	case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZIHPM:
+	case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZILSD:
 	case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZIMOP:
 	case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZKND:
 	case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZKNE:
@@ -538,6 +540,7 @@ static const char *isa_ext_single_id_to_str(__u64 reg_off)
 		KVM_ISA_EXT_ARR(ZCB),
 		KVM_ISA_EXT_ARR(ZCD),
 		KVM_ISA_EXT_ARR(ZCF),
+		KVM_ISA_EXT_ARR(ZCLSD),
 		KVM_ISA_EXT_ARR(ZCMOP),
 		KVM_ISA_EXT_ARR(ZFA),
 		KVM_ISA_EXT_ARR(ZFBFMIN),
@@ -554,6 +557,7 @@ static const char *isa_ext_single_id_to_str(__u64 reg_off)
 		KVM_ISA_EXT_ARR(ZIHINTNTL),
 		KVM_ISA_EXT_ARR(ZIHINTPAUSE),
 		KVM_ISA_EXT_ARR(ZIHPM),
+		KVM_ISA_EXT_ARR(ZILSD),
 		KVM_ISA_EXT_ARR(ZIMOP),
 		KVM_ISA_EXT_ARR(ZKND),
 		KVM_ISA_EXT_ARR(ZKNE),
@@ -1179,6 +1183,7 @@ KVM_ISA_EXT_SIMPLE_CONFIG(zca, ZCA);
 KVM_ISA_EXT_SIMPLE_CONFIG(zcb, ZCB);
 KVM_ISA_EXT_SIMPLE_CONFIG(zcd, ZCD);
 KVM_ISA_EXT_SIMPLE_CONFIG(zcf, ZCF);
+KVM_ISA_EXT_SIMPLE_CONFIG(zclsd, ZCLSD);
 KVM_ISA_EXT_SIMPLE_CONFIG(zcmop, ZCMOP);
 KVM_ISA_EXT_SIMPLE_CONFIG(zfa, ZFA);
 KVM_ISA_EXT_SIMPLE_CONFIG(zfbfmin, ZFBFMIN);
@@ -1195,6 +1200,7 @@ KVM_ISA_EXT_SIMPLE_CONFIG(zifencei, ZIFENCEI);
 KVM_ISA_EXT_SIMPLE_CONFIG(zihintntl, ZIHINTNTL);
 KVM_ISA_EXT_SIMPLE_CONFIG(zihintpause, ZIHINTPAUSE);
 KVM_ISA_EXT_SIMPLE_CONFIG(zihpm, ZIHPM);
+KVM_ISA_EXT_SIMPLE_CONFIG(zilsd, ZILSD);
 KVM_ISA_EXT_SIMPLE_CONFIG(zimop, ZIMOP);
 KVM_ISA_EXT_SIMPLE_CONFIG(zknd, ZKND);
 KVM_ISA_EXT_SIMPLE_CONFIG(zkne, ZKNE);
@@ -1259,6 +1265,7 @@ struct vcpu_reg_list *vcpu_configs[] = {
 	&config_zcb,
 	&config_zcd,
 	&config_zcf,
+	&config_zclsd,
 	&config_zcmop,
 	&config_zfa,
 	&config_zfbfmin,
@@ -1275,6 +1282,7 @@ struct vcpu_reg_list *vcpu_configs[] = {
 	&config_zihintntl,
 	&config_zihintpause,
 	&config_zihpm,
+	&config_zilsd,
 	&config_zimop,
 	&config_zknd,
 	&config_zkne,
-- 
cgit v1.2.3


From 39ad809dd2579d9b7400bbc50a5b95d84527b75e Mon Sep 17 00:00:00 2001
From: Wu Fei <wu.fei9@sanechips.com.cn>
Date: Wed, 5 Nov 2025 23:14:26 +0800
Subject: KVM: riscv: selftests: Add riscv vm satp modes

Current vm modes cannot represent riscv guest modes precisely, here add
all 9 combinations of P(56,40,41) x V(57,48,39). Also the default vm
mode is detected on runtime instead of hardcoded one, which might not be
supported on specific machine.

Signed-off-by: Wu Fei <wu.fei9@sanechips.com.cn>
Reviewed-by: Andrew Jones <ajones@ventanamicro.com>
Reviewed-by: Nutty Liu <nutty.liu@hotmail.com>
Reviewed-by: Anup Patel <anup@brainfault.org>
Link: https://lore.kernel.org/r/20251105151442.28767-1-wu.fei9@sanechips.com.cn
Signed-off-by: Anup Patel <anup@brainfault.org>
---
 tools/testing/selftests/kvm/include/kvm_util.h     | 17 ++++--
 .../selftests/kvm/include/riscv/processor.h        |  2 +
 tools/testing/selftests/kvm/lib/guest_modes.c      | 41 +++++++++++---
 tools/testing/selftests/kvm/lib/kvm_util.c         | 33 ++++++++++++
 tools/testing/selftests/kvm/lib/riscv/processor.c  | 63 ++++++++++++++++++++--
 5 files changed, 142 insertions(+), 14 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h
index 81f4355ff28a..fe1bd661b1c1 100644
--- a/tools/testing/selftests/kvm/include/kvm_util.h
+++ b/tools/testing/selftests/kvm/include/kvm_util.h
@@ -186,6 +186,17 @@ enum vm_guest_mode {
 	VM_MODE_P36V48_64K,
 	VM_MODE_P47V47_16K,
 	VM_MODE_P36V47_16K,
+
+	VM_MODE_P56V57_4K,	/* For riscv64 */
+	VM_MODE_P56V48_4K,
+	VM_MODE_P56V39_4K,
+	VM_MODE_P50V57_4K,
+	VM_MODE_P50V48_4K,
+	VM_MODE_P50V39_4K,
+	VM_MODE_P41V57_4K,
+	VM_MODE_P41V48_4K,
+	VM_MODE_P41V39_4K,
+
 	NUM_VM_MODES,
 };
 
@@ -210,10 +221,10 @@ kvm_static_assert(sizeof(struct vm_shape) == sizeof(uint64_t));
 	shape;					\
 })
 
-#if defined(__aarch64__)
-
 extern enum vm_guest_mode vm_mode_default;
 
+#if defined(__aarch64__)
+
 #define VM_MODE_DEFAULT			vm_mode_default
 #define MIN_PAGE_SHIFT			12U
 #define ptes_per_page(page_size)	((page_size) / 8)
@@ -236,7 +247,7 @@ extern enum vm_guest_mode vm_mode_default;
 #error "RISC-V 32-bit kvm selftests not supported"
 #endif
 
-#define VM_MODE_DEFAULT			VM_MODE_P40V48_4K
+#define VM_MODE_DEFAULT			vm_mode_default
 #define MIN_PAGE_SHIFT			12U
 #define ptes_per_page(page_size)	((page_size) / 8)
 
diff --git a/tools/testing/selftests/kvm/include/riscv/processor.h b/tools/testing/selftests/kvm/include/riscv/processor.h
index e58282488beb..4dade8c4d18e 100644
--- a/tools/testing/selftests/kvm/include/riscv/processor.h
+++ b/tools/testing/selftests/kvm/include/riscv/processor.h
@@ -192,4 +192,6 @@ static inline void local_irq_disable(void)
 	csr_clear(CSR_SSTATUS, SR_SIE);
 }
 
+unsigned long riscv64_get_satp_mode(void);
+
 #endif /* SELFTEST_KVM_PROCESSOR_H */
diff --git a/tools/testing/selftests/kvm/lib/guest_modes.c b/tools/testing/selftests/kvm/lib/guest_modes.c
index b04901e55138..ce3099630397 100644
--- a/tools/testing/selftests/kvm/lib/guest_modes.c
+++ b/tools/testing/selftests/kvm/lib/guest_modes.c
@@ -4,7 +4,7 @@
  */
 #include "guest_modes.h"
 
-#ifdef __aarch64__
+#if defined(__aarch64__) || defined(__riscv)
 #include "processor.h"
 enum vm_guest_mode vm_mode_default;
 #endif
@@ -13,9 +13,11 @@ struct guest_mode guest_modes[NUM_VM_MODES];
 
 void guest_modes_append_default(void)
 {
-#ifndef __aarch64__
+#if !defined(__aarch64__) && !defined(__riscv)
 	guest_mode_append(VM_MODE_DEFAULT, true);
-#else
+#endif
+
+#ifdef __aarch64__
 	{
 		unsigned int limit = kvm_check_cap(KVM_CAP_ARM_VM_IPA_SIZE);
 		uint32_t ipa4k, ipa16k, ipa64k;
@@ -74,11 +76,36 @@ void guest_modes_append_default(void)
 #ifdef __riscv
 	{
 		unsigned int sz = kvm_check_cap(KVM_CAP_VM_GPA_BITS);
+		unsigned long satp_mode = riscv64_get_satp_mode() << SATP_MODE_SHIFT;
+		int i;
 
-		if (sz >= 52)
-			guest_mode_append(VM_MODE_P52V48_4K, true);
-		if (sz >= 48)
-			guest_mode_append(VM_MODE_P48V48_4K, true);
+		switch (sz) {
+		case 59:
+			guest_mode_append(VM_MODE_P56V57_4K, satp_mode >= SATP_MODE_57);
+			guest_mode_append(VM_MODE_P56V48_4K, satp_mode >= SATP_MODE_48);
+			guest_mode_append(VM_MODE_P56V39_4K, satp_mode >= SATP_MODE_39);
+			break;
+		case 50:
+			guest_mode_append(VM_MODE_P50V57_4K, satp_mode >= SATP_MODE_57);
+			guest_mode_append(VM_MODE_P50V48_4K, satp_mode >= SATP_MODE_48);
+			guest_mode_append(VM_MODE_P50V39_4K, satp_mode >= SATP_MODE_39);
+			break;
+		case 41:
+			guest_mode_append(VM_MODE_P41V57_4K, satp_mode >= SATP_MODE_57);
+			guest_mode_append(VM_MODE_P41V48_4K, satp_mode >= SATP_MODE_48);
+			guest_mode_append(VM_MODE_P41V39_4K, satp_mode >= SATP_MODE_39);
+			break;
+		default:
+			break;
+		}
+
+		/* set the first supported mode as default */
+		vm_mode_default = NUM_VM_MODES;
+		for (i = 0; vm_mode_default == NUM_VM_MODES && i < NUM_VM_MODES; i++) {
+			if (guest_modes[i].supported && guest_modes[i].enabled)
+				vm_mode_default = i;
+		}
+		TEST_ASSERT(vm_mode_default != NUM_VM_MODES, "No supported mode!");
 	}
 #endif
 }
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index 8279b6ced8d2..174490e6cd10 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -209,6 +209,15 @@ const char *vm_guest_mode_string(uint32_t i)
 		[VM_MODE_P36V48_64K]	= "PA-bits:36,  VA-bits:48, 64K pages",
 		[VM_MODE_P47V47_16K]	= "PA-bits:47,  VA-bits:47, 16K pages",
 		[VM_MODE_P36V47_16K]	= "PA-bits:36,  VA-bits:47, 16K pages",
+		[VM_MODE_P56V57_4K]	= "PA-bits:56,  VA-bits:57,  4K pages",
+		[VM_MODE_P56V48_4K]	= "PA-bits:56,  VA-bits:48,  4K pages",
+		[VM_MODE_P56V39_4K]	= "PA-bits:56,  VA-bits:39,  4K pages",
+		[VM_MODE_P50V57_4K]	= "PA-bits:50,  VA-bits:57,  4K pages",
+		[VM_MODE_P50V48_4K]	= "PA-bits:50,  VA-bits:48,  4K pages",
+		[VM_MODE_P50V39_4K]	= "PA-bits:50,  VA-bits:39,  4K pages",
+		[VM_MODE_P41V57_4K]	= "PA-bits:41,  VA-bits:57,  4K pages",
+		[VM_MODE_P41V48_4K]	= "PA-bits:41,  VA-bits:48,  4K pages",
+		[VM_MODE_P41V39_4K]	= "PA-bits:41,  VA-bits:39,  4K pages",
 	};
 	_Static_assert(sizeof(strings)/sizeof(char *) == NUM_VM_MODES,
 		       "Missing new mode strings?");
@@ -236,6 +245,15 @@ const struct vm_guest_mode_params vm_guest_mode_params[] = {
 	[VM_MODE_P36V48_64K]	= { 36, 48, 0x10000, 16 },
 	[VM_MODE_P47V47_16K]	= { 47, 47,  0x4000, 14 },
 	[VM_MODE_P36V47_16K]	= { 36, 47,  0x4000, 14 },
+	[VM_MODE_P56V57_4K]	= { 56, 57,  0x1000, 12 },
+	[VM_MODE_P56V48_4K]	= { 56, 48,  0x1000, 12 },
+	[VM_MODE_P56V39_4K]	= { 56, 39,  0x1000, 12 },
+	[VM_MODE_P50V57_4K]	= { 50, 57,  0x1000, 12 },
+	[VM_MODE_P50V48_4K]	= { 50, 48,  0x1000, 12 },
+	[VM_MODE_P50V39_4K]	= { 50, 39,  0x1000, 12 },
+	[VM_MODE_P41V57_4K]	= { 41, 57,  0x1000, 12 },
+	[VM_MODE_P41V48_4K]	= { 41, 48,  0x1000, 12 },
+	[VM_MODE_P41V39_4K]	= { 41, 39,  0x1000, 12 },
 };
 _Static_assert(sizeof(vm_guest_mode_params)/sizeof(struct vm_guest_mode_params) == NUM_VM_MODES,
 	       "Missing new mode params?");
@@ -338,6 +356,21 @@ struct kvm_vm *____vm_create(struct vm_shape shape)
 	case VM_MODE_P44V64_4K:
 		vm->pgtable_levels = 5;
 		break;
+	case VM_MODE_P56V57_4K:
+	case VM_MODE_P50V57_4K:
+	case VM_MODE_P41V57_4K:
+		vm->pgtable_levels = 5;
+		break;
+	case VM_MODE_P56V48_4K:
+	case VM_MODE_P50V48_4K:
+	case VM_MODE_P41V48_4K:
+		vm->pgtable_levels = 4;
+		break;
+	case VM_MODE_P56V39_4K:
+	case VM_MODE_P50V39_4K:
+	case VM_MODE_P41V39_4K:
+		vm->pgtable_levels = 3;
+		break;
 	default:
 		TEST_FAIL("Unknown guest mode: 0x%x", vm->mode);
 	}
diff --git a/tools/testing/selftests/kvm/lib/riscv/processor.c b/tools/testing/selftests/kvm/lib/riscv/processor.c
index 2eac7d4b59e9..003693576225 100644
--- a/tools/testing/selftests/kvm/lib/riscv/processor.c
+++ b/tools/testing/selftests/kvm/lib/riscv/processor.c
@@ -8,6 +8,7 @@
 #include <linux/compiler.h>
 #include <assert.h>
 
+#include "guest_modes.h"
 #include "kvm_util.h"
 #include "processor.h"
 #include "ucall_common.h"
@@ -197,22 +198,41 @@ void riscv_vcpu_mmu_setup(struct kvm_vcpu *vcpu)
 {
 	struct kvm_vm *vm = vcpu->vm;
 	unsigned long satp;
+	unsigned long satp_mode;
+	unsigned long max_satp_mode;
 
 	/*
 	 * The RISC-V Sv48 MMU mode supports 56-bit physical address
 	 * for 48-bit virtual address with 4KB last level page size.
 	 */
 	switch (vm->mode) {
-	case VM_MODE_P52V48_4K:
-	case VM_MODE_P48V48_4K:
-	case VM_MODE_P40V48_4K:
+	case VM_MODE_P56V57_4K:
+	case VM_MODE_P50V57_4K:
+	case VM_MODE_P41V57_4K:
+		satp_mode = SATP_MODE_57;
+		break;
+	case VM_MODE_P56V48_4K:
+	case VM_MODE_P50V48_4K:
+	case VM_MODE_P41V48_4K:
+		satp_mode = SATP_MODE_48;
+		break;
+	case VM_MODE_P56V39_4K:
+	case VM_MODE_P50V39_4K:
+	case VM_MODE_P41V39_4K:
+		satp_mode = SATP_MODE_39;
 		break;
 	default:
 		TEST_FAIL("Unknown guest mode, mode: 0x%x", vm->mode);
 	}
 
+	max_satp_mode = vcpu_get_reg(vcpu, RISCV_CONFIG_REG(satp_mode));
+
+	if ((satp_mode >> SATP_MODE_SHIFT) > max_satp_mode)
+		TEST_FAIL("Unable to set satp mode 0x%lx, max mode 0x%lx\n",
+			  satp_mode >> SATP_MODE_SHIFT, max_satp_mode);
+
 	satp = (vm->pgd >> PGTBL_PAGE_SIZE_SHIFT) & SATP_PPN;
-	satp |= SATP_MODE_48;
+	satp |= satp_mode;
 
 	vcpu_set_reg(vcpu, RISCV_GENERAL_CSR_REG(satp), satp);
 }
@@ -515,3 +535,38 @@ unsigned long get_host_sbi_spec_version(void)
 
 	return ret.value;
 }
+
+void kvm_selftest_arch_init(void)
+{
+	/*
+	 * riscv64 doesn't have a true default mode, so start by detecting the
+	 * supported vm mode.
+	 */
+	guest_modes_append_default();
+}
+
+unsigned long riscv64_get_satp_mode(void)
+{
+	int kvm_fd, vm_fd, vcpu_fd, err;
+	uint64_t val;
+	struct kvm_one_reg reg = {
+		.id     = RISCV_CONFIG_REG(satp_mode),
+		.addr   = (uint64_t)&val,
+	};
+
+	kvm_fd = open_kvm_dev_path_or_exit();
+	vm_fd = __kvm_ioctl(kvm_fd, KVM_CREATE_VM, NULL);
+	TEST_ASSERT(vm_fd >= 0, KVM_IOCTL_ERROR(KVM_CREATE_VM, vm_fd));
+
+	vcpu_fd = ioctl(vm_fd, KVM_CREATE_VCPU, 0);
+	TEST_ASSERT(vcpu_fd >= 0, KVM_IOCTL_ERROR(KVM_CREATE_VCPU, vcpu_fd));
+
+	err = ioctl(vcpu_fd, KVM_GET_ONE_REG, &reg);
+	TEST_ASSERT(err == 0, KVM_IOCTL_ERROR(KVM_GET_ONE_REG, vcpu_fd));
+
+	close(vcpu_fd);
+	close(vm_fd);
+	close(kvm_fd);
+
+	return val;
+}
-- 
cgit v1.2.3


From 671995ff4c308fc1adf01727df670c83434ffb5a Mon Sep 17 00:00:00 2001
From: Xu Lu <luxu.kernel@bytedance.com>
Date: Mon, 20 Oct 2025 12:29:04 +0800
Subject: RISC-V: KVM: selftests: Add Zalasr extensions to get-reg-list test

The KVM RISC-V allows Zalasr extensions for Guest/VM so add this
extension to get-reg-list test.

Signed-off-by: Xu Lu <luxu.kernel@bytedance.com>
Reviewed-by: Anup Patel <anup@brainfault.org>
Link: https://lore.kernel.org/r/20251020042904.32096-1-luxu.kernel@bytedance.com
Signed-off-by: Anup Patel <anup@brainfault.org>
---
 tools/testing/selftests/kvm/riscv/get-reg-list.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/kvm/riscv/get-reg-list.c b/tools/testing/selftests/kvm/riscv/get-reg-list.c
index 20cc7d9b65ed..8d6b951434eb 100644
--- a/tools/testing/selftests/kvm/riscv/get-reg-list.c
+++ b/tools/testing/selftests/kvm/riscv/get-reg-list.c
@@ -65,6 +65,7 @@ bool filter_reg(__u64 reg)
 	case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZAAMO:
 	case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZABHA:
 	case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZACAS:
+	case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZALASR:
 	case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZALRSC:
 	case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZAWRS:
 	case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZBA:
@@ -527,6 +528,7 @@ static const char *isa_ext_single_id_to_str(__u64 reg_off)
 		KVM_ISA_EXT_ARR(ZAAMO),
 		KVM_ISA_EXT_ARR(ZABHA),
 		KVM_ISA_EXT_ARR(ZACAS),
+		KVM_ISA_EXT_ARR(ZALASR),
 		KVM_ISA_EXT_ARR(ZALRSC),
 		KVM_ISA_EXT_ARR(ZAWRS),
 		KVM_ISA_EXT_ARR(ZBA),
@@ -1170,6 +1172,7 @@ KVM_ISA_EXT_SIMPLE_CONFIG(svvptc, SVVPTC);
 KVM_ISA_EXT_SIMPLE_CONFIG(zaamo, ZAAMO);
 KVM_ISA_EXT_SIMPLE_CONFIG(zabha, ZABHA);
 KVM_ISA_EXT_SIMPLE_CONFIG(zacas, ZACAS);
+KVM_ISA_EXT_SIMPLE_CONFIG(zalasr, ZALASR);
 KVM_ISA_EXT_SIMPLE_CONFIG(zalrsc, ZALRSC);
 KVM_ISA_EXT_SIMPLE_CONFIG(zawrs, ZAWRS);
 KVM_ISA_EXT_SIMPLE_CONFIG(zba, ZBA);
@@ -1253,6 +1256,7 @@ struct vcpu_reg_list *vcpu_configs[] = {
 	&config_zabha,
 	&config_zacas,
 	&config_zalrsc,
+	&config_zalasr,
 	&config_zawrs,
 	&config_zba,
 	&config_zbb,
-- 
cgit v1.2.3


From a108a6a4b9e8d81f6be0c0f8b93d3fbd57d2359e Mon Sep 17 00:00:00 2001
From: Leo Yan <leo.yan@arm.com>
Date: Wed, 4 Feb 2026 14:26:15 +0000
Subject: perf record: Make logs more readable for event open failures

Since commit ee27476fa3004f83 ("perf record: Skip don't fail for events
that don't open"), if a user does not have permission to access a PMU
event, perf reports:

  perf record -e cs_etm// -C 3 -- ls
  Error:
  Failure to open event 'cs_etm//u' on PMU 'cs_etm' which will be removed.
  No fallback found for 'cs_etm//u' for error 13
  Error:
  Failure to open event 'dummy:u' on PMU 'software' which will be removed.
  No fallback found for 'dummy:u' for error 13
  Error:
  Failure to open any events for recording.

The log is not very helpful, as no clear indication of what "error 13"
means or how to address the issue.

This commit restores evsel__open_strerror() to generate a readable error
message and print it out:

  perf record -e cs_etm// -C 3 -- ls
  Error:
  Failure to open event 'cs_etm//' on PMU 'cs_etm' which will be removed.
  Access to performance monitoring and observability operations is limited.
  Consider adjusting /proc/sys/kernel/perf_event_paranoid setting to open
  access to performance monitoring and observability operations for processes
  without CAP_PERFMON, CAP_SYS_PTRACE or CAP_SYS_ADMIN Linux capability.
  More information can be found at 'Perf events and tool security' document:
  https://www.kernel.org/doc/html/latest/admin-guide/perf-security.html
  perf_event_paranoid setting is 1:
    -1: Allow use of (almost) all events by all users
        Ignore mlock limit after perf_event_mlock_kb without CAP_IPC_LOCK
  >= 0: Disallow raw and ftrace function tracepoint access
  >= 1: Disallow CPU event access
  >= 2: Disallow kernel profiling
  To make the adjusted perf_event_paranoid setting permanent preserve it
  in /etc/sysctl.conf (e.g. kernel.perf_event_paranoid = <setting>)
  Error:
  Failure to open event 'dummy:u' on PMU 'software' which will be removed.
  Access to performance monitoring and observability operations is limited.
  Consider adjusting /proc/sys/kernel/perf_event_paranoid setting to open
  access to performance monitoring and observability operations for processes
  without CAP_PERFMON, CAP_SYS_PTRACE or CAP_SYS_ADMIN Linux capability.
  More information can be found at 'Perf events and tool security' document:
  https://www.kernel.org/doc/html/latest/admin-guide/perf-security.html
  perf_event_paranoid setting is 1:
    -1: Allow use of (almost) all events by all users
        Ignore mlock limit after perf_event_mlock_kb without CAP_IPC_LOCK
  >= 0: Disallow raw and ftrace function tracepoint access
  >= 1: Disallow CPU event access
  >= 2: Disallow kernel profiling
  To make the adjusted perf_event_paranoid setting permanent preserve it
  in /etc/sysctl.conf (e.g. kernel.perf_event_paranoid = <setting>)
  Error:
  Failure to open any events for recording.

Reviewed-by: Ian Rogers <irogers@google.com>
Signed-off-by: Leo Yan <leo.yan@arm.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-record.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'tools')

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 663ca3a03396..60d764068302 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -1403,6 +1403,7 @@ try_again:
 			}
 #endif
 			if (report_error || verbose > 0) {
+				evsel__open_strerror(pos, &opts->target, errno, msg, sizeof(msg));
 				ui__error("Failure to open event '%s' on PMU '%s' which will be "
 					  "removed.\n%s\n",
 					  evsel__name(pos), evsel__pmu_name(pos), msg);
-- 
cgit v1.2.3


From 1d9622c3c1c12e317b0d3a16a26ea17090435d61 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 3 Feb 2026 20:26:29 -0800
Subject: perf tests: Additional 'perf stat' tests

Recently 'perf stat' regressed in per CPU mode [1].

Let's expand test coverage to catch the same breakage again as well as
to test the repeat, pid, detailed and no aggregation options.

[1] https://lore.kernel.org/linux-perf-users/cgja46br2smmznxs7kbeabs6zgv3b4olfqgh2fdp5mxk2yom4v@w6jjgov6hdi6/

Signed-off-by: Ian Rogers <irogers@google.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andres Freund <andres@anarazel.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Richter <tmricht@linux.ibm.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/tests/shell/stat.sh | 242 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 242 insertions(+)

(limited to 'tools')

diff --git a/tools/perf/tests/shell/stat.sh b/tools/perf/tests/shell/stat.sh
index 792a0b79f6b8..4edb04039036 100755
--- a/tools/perf/tests/shell/stat.sh
+++ b/tools/perf/tests/shell/stat.sh
@@ -5,6 +5,21 @@
 set -e
 
 err=0
+stat_output=$(mktemp /tmp/perf-stat-test-output.XXXXX)
+
+cleanup() {
+  rm -f "${stat_output}"
+  trap - EXIT TERM INT
+}
+
+trap_cleanup() {
+  echo "Unexpected signal in ${FUNCNAME[1]}"
+  cleanup
+  exit 1
+}
+
+trap trap_cleanup EXIT TERM INT
+
 test_default_stat() {
   echo "Basic stat command test"
   if ! perf stat true 2>&1 | grep -E -q "Performance counter stats for 'true':"
@@ -248,6 +263,226 @@ test_hybrid() {
   echo "hybrid test [Success]"
 }
 
+test_stat_cpu() {
+  echo "stat -C <cpu> test"
+  # Test the full online CPU list (ranges and lists)
+  online_cpus=$(cat /sys/devices/system/cpu/online)
+  if ! perf stat -C "$online_cpus" -a true > "${stat_output}" 2>&1
+  then
+    echo "stat -C <cpu> test [Failed - command failed for cpus $online_cpus]"
+    cat "${stat_output}"
+    err=1
+    return
+  fi
+
+  if ! grep -E -q "Performance counter stats for" "${stat_output}"
+  then
+    echo "stat -C <cpu> test [Failed - missing output for cpus $online_cpus]"
+    cat "${stat_output}"
+    err=1
+    return
+  fi
+
+  # Test each individual online CPU
+  for cpu_dir in /sys/devices/system/cpu/cpu[0-9]*; do
+    cpu=${cpu_dir##*/cpu}
+    # Check if online
+    if [ -f "$cpu_dir/online" ] && [ "$(cat "$cpu_dir/online")" -eq 0 ]
+    then
+      continue
+    fi
+
+    if ! perf stat -C "$cpu" -a true > "${stat_output}" 2>&1
+    then
+      echo "stat -C <cpu> test [Failed - command failed for cpu $cpu]"
+      cat "${stat_output}"
+      err=1
+      return
+    fi
+    if ! grep -E -q "Performance counter stats for" "${stat_output}"
+    then
+      echo "stat -C <cpu> test [Failed - missing output for cpu $cpu]"
+      cat "${stat_output}"
+      err=1
+      return
+    fi
+  done
+
+  # Test synthetic list and range if cpu0 and cpu1 are online
+  c0_online=0
+  c1_online=0
+  if [ -d "/sys/devices/system/cpu/cpu0" ]
+  then
+    if [ ! -f "/sys/devices/system/cpu/cpu0/online" ] || [ "$(cat /sys/devices/system/cpu/cpu0/online)" -eq 1 ]
+    then
+      c0_online=1
+    fi
+  fi
+  if [ -d "/sys/devices/system/cpu/cpu1" ]
+  then
+    if [ ! -f "/sys/devices/system/cpu/cpu1/online" ] || [ "$(cat /sys/devices/system/cpu/cpu1/online)" -eq 1 ]
+    then
+      c1_online=1
+    fi
+  fi
+
+  if [ $c0_online -eq 1 ] && [ $c1_online -eq 1 ]
+  then
+    # Test list "0,1"
+    if ! perf stat -C "0,1" -a true > "${stat_output}" 2>&1
+    then
+      echo "stat -C <cpu> test [Failed - command failed for cpus 0,1]"
+      cat "${stat_output}"
+      err=1
+      return
+    fi
+    if ! grep -E -q "Performance counter stats for" "${stat_output}"
+    then
+      echo "stat -C <cpu> test [Failed - missing output for cpus 0,1]"
+      cat "${stat_output}"
+      err=1
+      return
+    fi
+
+    # Test range "0-1"
+    if ! perf stat -C "0-1" -a true > "${stat_output}" 2>&1
+    then
+      echo "stat -C <cpu> test [Failed - command failed for cpus 0-1]"
+      cat "${stat_output}"
+      err=1
+      return
+    fi
+    if ! grep -E -q "Performance counter stats for" "${stat_output}"
+    then
+      echo "stat -C <cpu> test [Failed - missing output for cpus 0-1]"
+      cat "${stat_output}"
+      err=1
+      return
+    fi
+  fi
+
+  echo "stat -C <cpu> test [Success]"
+}
+
+test_stat_no_aggr() {
+  echo "stat -A test"
+  if ! perf stat -A -a true > "${stat_output}" 2>&1
+  then
+    echo "stat -A test [Failed - command failed]"
+    cat "${stat_output}"
+    err=1
+    return
+  fi
+
+  if ! grep -E -q "CPU" "${stat_output}"
+  then
+    echo "stat -A test [Failed - missing CPU column]"
+    cat "${stat_output}"
+    err=1
+    return
+  fi
+  echo "stat -A test [Success]"
+}
+
+test_stat_detailed() {
+  echo "stat -d test"
+  if ! perf stat -d true > "${stat_output}" 2>&1
+  then
+    echo "stat -d test [Failed - command failed]"
+    cat "${stat_output}"
+    err=1
+    return
+  fi
+
+  if ! grep -E -q "Performance counter stats" "${stat_output}"
+  then
+    echo "stat -d test [Failed - missing output]"
+    cat "${stat_output}"
+    err=1
+    return
+  fi
+
+  if ! perf stat -dd true > "${stat_output}" 2>&1
+  then
+    echo "stat -dd test [Failed - command failed]"
+    cat "${stat_output}"
+    err=1
+    return
+  fi
+
+  if ! grep -E -q "Performance counter stats" "${stat_output}"
+  then
+    echo "stat -dd test [Failed - missing output]"
+    cat "${stat_output}"
+    err=1
+    return
+  fi
+
+  if ! perf stat -ddd true > "${stat_output}" 2>&1
+  then
+    echo "stat -ddd test [Failed - command failed]"
+    cat "${stat_output}"
+    err=1
+    return
+  fi
+
+  if ! grep -E -q "Performance counter stats" "${stat_output}"
+  then
+    echo "stat -ddd test [Failed - missing output]"
+    cat "${stat_output}"
+    err=1
+    return
+  fi
+
+  echo "stat -d test [Success]"
+}
+
+test_stat_repeat() {
+  echo "stat -r test"
+  if ! perf stat -r 2 true > "${stat_output}" 2>&1
+  then
+    echo "stat -r test [Failed - command failed]"
+    cat "${stat_output}"
+    err=1
+    return
+  fi
+
+  if ! grep -E -q "\([[:space:]]*\+-.*%[[:space:]]*\)" "${stat_output}"
+  then
+    echo "stat -r test [Failed - missing variance]"
+    cat "${stat_output}"
+    err=1
+    return
+  fi
+  echo "stat -r test [Success]"
+}
+
+test_stat_pid() {
+  echo "stat -p test"
+  sleep 1 &
+  pid=$!
+  if ! perf stat -p $pid > "${stat_output}" 2>&1
+  then
+    echo "stat -p test [Failed - command failed]"
+    cat "${stat_output}"
+    err=1
+    kill $pid 2>/dev/null || true
+    wait $pid 2>/dev/null || true
+    return
+  fi
+
+  if ! grep -E -q "Performance counter stats" "${stat_output}"
+  then
+    echo "stat -p test [Failed - missing output]"
+    cat "${stat_output}"
+    err=1
+  else
+    echo "stat -p test [Success]"
+  fi
+  kill $pid 2>/dev/null || true
+  wait $pid 2>/dev/null || true
+}
+
 test_default_stat
 test_null_stat
 test_offline_cpu_stat
@@ -258,4 +493,11 @@ test_topdown_groups
 test_topdown_weak_groups
 test_cputype
 test_hybrid
+test_stat_cpu
+test_stat_no_aggr
+test_stat_detailed
+test_stat_repeat
+test_stat_pid
+
+cleanup
 exit $err
-- 
cgit v1.2.3


From f637bb2eedc01aa533f2b1e57b6abd8ca864fea8 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Thu, 5 Feb 2026 10:36:02 -0800
Subject: perf tests: build-test coverage for NO_JEVENTS=1

Leo reported 'perf stat' being broken and this highlighted that the
'make NO_JEVENTS=1' variant is missing from 'make -C tools/perf
build-test', add it.

Closes: https://lore.kernel.org/linux-perf-users/20260205175250.GC3529712@e132581.arm.com/
Reported-by: Leo Yan <leo.yan@arm.com>
Reviewed-by: Leo Yan <leo.yan@arm.com>
Signed-off-by: Ian Rogers <irogers@google.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/tests/make | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'tools')

diff --git a/tools/perf/tests/make b/tools/perf/tests/make
index eb41516c0562..6587dc326d1b 100644
--- a/tools/perf/tests/make
+++ b/tools/perf/tests/make
@@ -70,6 +70,7 @@ make_python_perf_so := $(python_perf_so)
 make_debug          := DEBUG=1
 make_nondistro      := BUILD_NONDISTRO=1
 make_extra_tests    := EXTRA_TESTS=1
+make_no_jevents     := NO_JEVENTS=1
 make_jevents_all    := JEVENTS_ARCH=all
 make_no_bpf_skel    := BUILD_BPF_SKEL=0
 make_gen_vmlinux_h  := GEN_VMLINUX_H=1
@@ -144,6 +145,7 @@ ifneq ($(new_libbfd),)
 run += make_nondistro
 endif
 run += make_extra_tests
+run += make_no_jevents
 run += make_jevents_all
 run += make_no_bpf_skel
 run += make_gen_vmlinux_h
-- 
cgit v1.2.3


From cee275edcdb1acfdc8270f80e96f30750b633220 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Thu, 5 Feb 2026 10:36:03 -0800
Subject: perf metricgroup: Don't early exit if no CPUID table exists

The failure to find a table of metrics with a CPUID shouldn't early
exit as the metric code will now also consider the default table.

When searching for a metric or metric group,
pmu_metrics_table__for_each_metric() considers all tables and so the
caller doesn't need to switch the table to do this.

Fixes: c7adeb0974f18da4 ("perf jevents: Add set of common metrics based on default ones")
Reviewed-by: Leo Yan <leo.yan@arm.com>
Signed-off-by: Ian Rogers <irogers@google.com>
Tested-by: Leo Yan <leo.yan@arm.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/metricgroup.c | 18 +++++-------------
 1 file changed, 5 insertions(+), 13 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/metricgroup.c b/tools/perf/util/metricgroup.c
index 40a1e14de418..46bf4dfeebc8 100644
--- a/tools/perf/util/metricgroup.c
+++ b/tools/perf/util/metricgroup.c
@@ -1562,8 +1562,6 @@ int metricgroup__parse_groups(struct evlist *perf_evlist,
 {
 	const struct pmu_metrics_table *table = pmu_metrics_table__find();
 
-	if (!table)
-		return -EINVAL;
 	if (hardware_aware_grouping)
 		pr_debug("Use hardware aware grouping instead of traditional metric grouping method\n");
 
@@ -1601,22 +1599,16 @@ static int metricgroup__has_metric_or_groups_callback(const struct pmu_metric *p
 
 bool metricgroup__has_metric_or_groups(const char *pmu, const char *metric_or_groups)
 {
-	const struct pmu_metrics_table *tables[2] = {
-		pmu_metrics_table__find(),
-		pmu_metrics_table__default(),
-	};
+	const struct pmu_metrics_table *table = pmu_metrics_table__find();
 	struct metricgroup__has_metric_data data = {
 		.pmu = pmu,
 		.metric_or_groups = metric_or_groups,
 	};
 
-	for (size_t i = 0; i < ARRAY_SIZE(tables); i++) {
-		if (pmu_metrics_table__for_each_metric(tables[i],
-							metricgroup__has_metric_or_groups_callback,
-							&data))
-			return true;
-	}
-	return false;
+	return pmu_metrics_table__for_each_metric(table,
+						  metricgroup__has_metric_or_groups_callback,
+						  &data)
+		? true : false;
 }
 
 static int metricgroup__topdown_max_level_callback(const struct pmu_metric *pm,
-- 
cgit v1.2.3


From c2e28ae2946f473d6c340ebbeac0cf87be46d582 Mon Sep 17 00:00:00 2001
From: Dapeng Mi <dapeng1.mi@linux.intel.com>
Date: Tue, 3 Feb 2026 10:43:53 +0800
Subject: perf regs: Fix abort for "-I" or "--user-regs" options

Fix an issue where the `perf` tool aborts unexpectedly when running the
following command:

```
perf record -e cycles -I -- true

 Usage: perf record [<options>] [<command>]
    or: perf record [<options>] -- <command> [<options>]

    -I, --intr-regs[=<any register>]
    sample selected machine registers on interrupt, use '-I?' to list register names
```

The usage of the `-I` or `--user-regs` options without specifying any
registers should default to sampling all general-purpose registers.

However, this currently causes an abnormal termination.

The issue was introduced by commit 3d06db9bad1a ("perf regs: Refactor
 use of arch__sample_reg_masks() to perf_reg_name()").

This patch resolves the problem, ensuring that the `-I` or `--user-regs`
options work as intended without causing an abort.

Fixes: 3d06db9bad1ad8e6 ("perf regs: Refactor use of arch__sample_reg_masks() to perf_reg_name()")
Reviewed-by: Ian Rogers <irogers@google.com>
Signed-off-by: Dapeng Mi <dapeng1.mi@linux.intel.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Alexandre Ghiti <alex@ghiti.fr>
Cc: Guo Ren <guoren@kernel.org>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: linux-arm-kernel@lists.infradead.org
Cc: linux-csky@vger.kernel.org
Cc: linux-riscv@lists.infradead.org
Cc: Mike Leach <mike.leach@linaro.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <pjw@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Falcon <thomas.falcon@intel.com>
Cc: Will Deacon <will@kernel.org>
Cc: Xudong Hao <xudong.hao@intel.com>
Cc: Zide Chen <zide.chen@intel.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/parse-regs-options.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/parse-regs-options.c b/tools/perf/util/parse-regs-options.c
index 8dd35f50f644..b44b47d9059f 100644
--- a/tools/perf/util/parse-regs-options.c
+++ b/tools/perf/util/parse-regs-options.c
@@ -66,12 +66,14 @@ __parse_regs(const struct option *opt, const char *str, int unset, bool intr)
 	if (*mode)
 		return -1;
 
-	/* str may be NULL in case no arg is passed to -I */
-	if (!str)
-		return -1;
-
 	mask = intr ? arch__intr_reg_mask() : arch__user_reg_mask();
 
+	/* str may be NULL in case no arg is passed to -I */
+	if (!str) {
+		*mode = mask;
+		return 0;
+	}
+
 	/* because str is read-only */
 	s = os = strdup(str);
 	if (!s)
@@ -104,9 +106,6 @@ __parse_regs(const struct option *opt, const char *str, int unset, bool intr)
 	}
 	ret = 0;
 
-	/* default to all possible regs */
-	if (*mode == 0)
-		*mode = mask;
 error:
 	free(os);
 	return ret;
-- 
cgit v1.2.3


From e716e69cf67bb45c49653b884f88d8e97f454f50 Mon Sep 17 00:00:00 2001
From: Dapeng Mi <dapeng1.mi@linux.intel.com>
Date: Tue, 3 Feb 2026 10:43:54 +0800
Subject: perf arch: Update arch headers to use relative UAPI paths

The architectural specific headers perf_regs.h currently rely on the
host architecture's 'asm/perf_regs.h'.

This can lead to compilation inconsistencies or failures when including
and building perf for a target architecture that differs from the host's
architecture.

Explicitly point to the UAPI headers within the tools source tree using
relative paths.

This ensures that perf is always built against the intended
architecture.

No functional changes are intended.

Reviewed-by: Ian Rogers <irogers@google.com>
Signed-off-by: Dapeng Mi <dapeng1.mi@linux.intel.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Alexandre Ghiti <alex@ghiti.fr>
Cc: Guo Ren <guoren@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Mike Leach <mike.leach@linaro.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <pjw@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Falcon <thomas.falcon@intel.com>
Cc: Will Deacon <will@kernel.org>
Cc: Xudong Hao <xudong.hao@intel.com>
Cc: Zide Chen <zide.chen@intel.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/arch/arm/include/perf_regs.h       | 2 +-
 tools/perf/arch/arm64/include/perf_regs.h     | 2 +-
 tools/perf/arch/csky/include/perf_regs.h      | 2 +-
 tools/perf/arch/loongarch/include/perf_regs.h | 2 +-
 tools/perf/arch/mips/include/perf_regs.h      | 2 +-
 tools/perf/arch/powerpc/include/perf_regs.h   | 2 +-
 tools/perf/arch/riscv/include/perf_regs.h     | 2 +-
 tools/perf/arch/s390/include/perf_regs.h      | 2 +-
 tools/perf/arch/x86/include/perf_regs.h       | 2 +-
 9 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/arch/arm/include/perf_regs.h b/tools/perf/arch/arm/include/perf_regs.h
index 75ce1c370114..20c54766e3a0 100644
--- a/tools/perf/arch/arm/include/perf_regs.h
+++ b/tools/perf/arch/arm/include/perf_regs.h
@@ -4,7 +4,7 @@
 
 #include <stdlib.h>
 #include <linux/types.h>
-#include <asm/perf_regs.h>
+#include "../../../../arch/arm/include/uapi/asm/perf_regs.h"
 
 void perf_regs_load(u64 *regs);
 
diff --git a/tools/perf/arch/arm64/include/perf_regs.h b/tools/perf/arch/arm64/include/perf_regs.h
index 58639ee9f7ea..372f2565a9dd 100644
--- a/tools/perf/arch/arm64/include/perf_regs.h
+++ b/tools/perf/arch/arm64/include/perf_regs.h
@@ -5,7 +5,7 @@
 #include <stdlib.h>
 #include <linux/types.h>
 #define perf_event_arm_regs perf_event_arm64_regs
-#include <asm/perf_regs.h>
+#include "../../../../arch/arm64/include/uapi/asm/perf_regs.h"
 #undef perf_event_arm_regs
 
 void perf_regs_load(u64 *regs);
diff --git a/tools/perf/arch/csky/include/perf_regs.h b/tools/perf/arch/csky/include/perf_regs.h
index 076c7746c8a2..0bf7b963909c 100644
--- a/tools/perf/arch/csky/include/perf_regs.h
+++ b/tools/perf/arch/csky/include/perf_regs.h
@@ -6,7 +6,7 @@
 
 #include <stdlib.h>
 #include <linux/types.h>
-#include <asm/perf_regs.h>
+#include "../../../../arch/csky/include/uapi/asm/perf_regs.h"
 
 #define PERF_REGS_MASK	((1ULL << PERF_REG_CSKY_MAX) - 1)
 #define PERF_REGS_MAX	PERF_REG_CSKY_MAX
diff --git a/tools/perf/arch/loongarch/include/perf_regs.h b/tools/perf/arch/loongarch/include/perf_regs.h
index 45c799fa5330..b86078a55e90 100644
--- a/tools/perf/arch/loongarch/include/perf_regs.h
+++ b/tools/perf/arch/loongarch/include/perf_regs.h
@@ -4,7 +4,7 @@
 
 #include <stdlib.h>
 #include <linux/types.h>
-#include <asm/perf_regs.h>
+#include "../../../../arch/loongarch/include/uapi/asm/perf_regs.h"
 
 #define PERF_REGS_MAX PERF_REG_LOONGARCH_MAX
 
diff --git a/tools/perf/arch/mips/include/perf_regs.h b/tools/perf/arch/mips/include/perf_regs.h
index 7082e91e0ed1..66655f0c4fea 100644
--- a/tools/perf/arch/mips/include/perf_regs.h
+++ b/tools/perf/arch/mips/include/perf_regs.h
@@ -4,7 +4,7 @@
 
 #include <stdlib.h>
 #include <linux/types.h>
-#include <asm/perf_regs.h>
+#include "../../../../arch/mips/include/uapi/asm/perf_regs.h"
 
 #define PERF_REGS_MAX PERF_REG_MIPS_MAX
 
diff --git a/tools/perf/arch/powerpc/include/perf_regs.h b/tools/perf/arch/powerpc/include/perf_regs.h
index 1c66f6ba6773..22b492a3dd58 100644
--- a/tools/perf/arch/powerpc/include/perf_regs.h
+++ b/tools/perf/arch/powerpc/include/perf_regs.h
@@ -4,7 +4,7 @@
 
 #include <stdlib.h>
 #include <linux/types.h>
-#include <asm/perf_regs.h>
+#include "../../../../arch/powerpc/include/uapi/asm/perf_regs.h"
 
 void perf_regs_load(u64 *regs);
 
diff --git a/tools/perf/arch/riscv/include/perf_regs.h b/tools/perf/arch/riscv/include/perf_regs.h
index d482edb413e5..89d5bbb8d2b8 100644
--- a/tools/perf/arch/riscv/include/perf_regs.h
+++ b/tools/perf/arch/riscv/include/perf_regs.h
@@ -6,7 +6,7 @@
 
 #include <stdlib.h>
 #include <linux/types.h>
-#include <asm/perf_regs.h>
+#include "../../../../arch/riscv/include/uapi/asm/perf_regs.h"
 
 #define PERF_REGS_MASK	((1ULL << PERF_REG_RISCV_MAX) - 1)
 #define PERF_REGS_MAX	PERF_REG_RISCV_MAX
diff --git a/tools/perf/arch/s390/include/perf_regs.h b/tools/perf/arch/s390/include/perf_regs.h
index 130dfad2b96a..9c95589965fe 100644
--- a/tools/perf/arch/s390/include/perf_regs.h
+++ b/tools/perf/arch/s390/include/perf_regs.h
@@ -3,7 +3,7 @@
 
 #include <stdlib.h>
 #include <linux/types.h>
-#include <asm/perf_regs.h>
+#include "../../../../arch/s390/include/uapi/asm/perf_regs.h"
 
 void perf_regs_load(u64 *regs);
 
diff --git a/tools/perf/arch/x86/include/perf_regs.h b/tools/perf/arch/x86/include/perf_regs.h
index f209ce2c1dd9..5495e5ca7cdc 100644
--- a/tools/perf/arch/x86/include/perf_regs.h
+++ b/tools/perf/arch/x86/include/perf_regs.h
@@ -4,7 +4,7 @@
 
 #include <stdlib.h>
 #include <linux/types.h>
-#include <asm/perf_regs.h>
+#include "../../../../arch/x86/include/uapi/asm/perf_regs.h"
 
 void perf_regs_load(u64 *regs);
 
-- 
cgit v1.2.3


From 16dccbb84203196dab2e578b27c3c8f549ebff66 Mon Sep 17 00:00:00 2001
From: Dapeng Mi <dapeng1.mi@linux.intel.com>
Date: Tue, 3 Feb 2026 10:43:55 +0800
Subject: perf regs: Remove __weak attributive arch__xxx_reg_mask() functions

Currently, some architecture-specific perf-regs functions, such as
arch__intr_reg_mask() and arch__user_reg_mask(), are defined with the
__weak attribute.

This approach ensures that only functions matching the architecture of
the build/run host are compiled and executed, reducing build time and
binary size.

However, this __weak attribute restricts these functions to be called
only on the same architecture, preventing cross-architecture
functionality.

For example, a perf.data file captured on x86 cannot be parsed on an ARM
platform.

To address this limitation, this patch removes the __weak attribute from
these perf-regs functions.

The architecture-specific code is moved from the arch/ directory to the
util/perf-regs-arch/ directory.

The appropriate architectural functions are then called based on the
EM_HOST.

No functional changes are intended.

Suggested-by: Ian Rogers <irogers@google.com>
Reviewed-by: Ian Rogers <irogers@google.com>
Signed-off-by: Dapeng Mi <dapeng1.mi@linux.intel.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Alexandre Ghiti <alex@ghiti.fr>
Cc: Guo Ren <guoren@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Mike Leach <mike.leach@linaro.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <pjw@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Falcon <thomas.falcon@intel.com>
Cc: Will Deacon <will@kernel.org>
Cc: Xudong Hao <xudong.hao@intel.com>
Cc: Zide Chen <zide.chen@intel.com>
[ Fixed up somme fuzz with s390 and riscv Build files wrt removing perf_regs.o ]
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/arch/arm/util/Build                     |  2 -
 tools/perf/arch/arm/util/perf_regs.c               | 13 ----
 tools/perf/arch/arm64/util/perf_regs.c             | 36 ----------
 tools/perf/arch/csky/Build                         |  1 -
 tools/perf/arch/csky/util/Build                    |  1 -
 tools/perf/arch/csky/util/perf_regs.c              | 13 ----
 tools/perf/arch/loongarch/util/Build               |  1 -
 tools/perf/arch/loongarch/util/perf_regs.c         | 13 ----
 tools/perf/arch/mips/util/Build                    |  1 -
 tools/perf/arch/mips/util/perf_regs.c              | 13 ----
 tools/perf/arch/powerpc/util/perf_regs.c           | 47 ------------
 tools/perf/arch/riscv/include/perf_regs.h          |  7 +-
 tools/perf/arch/riscv/util/Build                   |  1 -
 tools/perf/arch/riscv/util/perf_regs.c             | 13 ----
 tools/perf/arch/s390/util/Build                    |  1 -
 tools/perf/arch/s390/util/perf_regs.c              | 13 ----
 tools/perf/arch/x86/util/perf_regs.c               | 48 -------------
 tools/perf/util/evsel.c                            |  4 +-
 tools/perf/util/parse-regs-options.c               |  2 +-
 tools/perf/util/perf-regs-arch/perf_regs_aarch64.c | 53 +++++++++++++-
 tools/perf/util/perf-regs-arch/perf_regs_arm.c     |  7 +-
 tools/perf/util/perf-regs-arch/perf_regs_csky.c    |  7 +-
 .../perf/util/perf-regs-arch/perf_regs_loongarch.c |  7 +-
 tools/perf/util/perf-regs-arch/perf_regs_mips.c    |  7 +-
 tools/perf/util/perf-regs-arch/perf_regs_powerpc.c | 77 +++++++++++++++++++-
 tools/perf/util/perf-regs-arch/perf_regs_riscv.c   |  7 +-
 tools/perf/util/perf-regs-arch/perf_regs_s390.c    |  7 +-
 tools/perf/util/perf-regs-arch/perf_regs_x86.c     | 60 +++++++++++++++-
 tools/perf/util/perf_regs.c                        | 84 ++++++++++++++++++++--
 tools/perf/util/perf_regs.h                        | 22 +++++-
 30 files changed, 332 insertions(+), 236 deletions(-)
 delete mode 100644 tools/perf/arch/arm/util/perf_regs.c
 delete mode 100644 tools/perf/arch/csky/Build
 delete mode 100644 tools/perf/arch/csky/util/Build
 delete mode 100644 tools/perf/arch/csky/util/perf_regs.c
 delete mode 100644 tools/perf/arch/loongarch/util/perf_regs.c
 delete mode 100644 tools/perf/arch/mips/util/perf_regs.c
 delete mode 100644 tools/perf/arch/riscv/util/perf_regs.c
 delete mode 100644 tools/perf/arch/s390/util/perf_regs.c

(limited to 'tools')

diff --git a/tools/perf/arch/arm/util/Build b/tools/perf/arch/arm/util/Build
index 3291f893b943..b94bf3c5279a 100644
--- a/tools/perf/arch/arm/util/Build
+++ b/tools/perf/arch/arm/util/Build
@@ -1,5 +1,3 @@
-perf-util-y += perf_regs.o
-
 perf-util-$(CONFIG_LOCAL_LIBUNWIND)    += unwind-libunwind.o
 
 perf-util-y += pmu.o auxtrace.o cs-etm.o
diff --git a/tools/perf/arch/arm/util/perf_regs.c b/tools/perf/arch/arm/util/perf_regs.c
deleted file mode 100644
index 03a5bc0cf64c..000000000000
--- a/tools/perf/arch/arm/util/perf_regs.c
+++ /dev/null
@@ -1,13 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "perf_regs.h"
-#include "../../../util/perf_regs.h"
-
-uint64_t arch__intr_reg_mask(void)
-{
-	return PERF_REGS_MASK;
-}
-
-uint64_t arch__user_reg_mask(void)
-{
-	return PERF_REGS_MASK;
-}
diff --git a/tools/perf/arch/arm64/util/perf_regs.c b/tools/perf/arch/arm64/util/perf_regs.c
index 9bb768e1bea1..47f58eaba032 100644
--- a/tools/perf/arch/arm64/util/perf_regs.c
+++ b/tools/perf/arch/arm64/util/perf_regs.c
@@ -103,39 +103,3 @@ int arch_sdt_arg_parse_op(char *old_op, char **new_op)
 
 	return SDT_ARG_VALID;
 }
-
-uint64_t arch__intr_reg_mask(void)
-{
-	return PERF_REGS_MASK;
-}
-
-uint64_t arch__user_reg_mask(void)
-{
-	struct perf_event_attr attr = {
-		.type                   = PERF_TYPE_HARDWARE,
-		.config                 = PERF_COUNT_HW_CPU_CYCLES,
-		.sample_type            = PERF_SAMPLE_REGS_USER,
-		.disabled               = 1,
-		.exclude_kernel         = 1,
-		.sample_period		= 1,
-		.sample_regs_user	= PERF_REGS_MASK
-	};
-	int fd;
-
-	if (getauxval(AT_HWCAP) & HWCAP_SVE)
-		attr.sample_regs_user |= SMPL_REG_MASK(PERF_REG_ARM64_VG);
-
-	/*
-	 * Check if the pmu supports perf extended regs, before
-	 * returning the register mask to sample.
-	 */
-	if (attr.sample_regs_user != PERF_REGS_MASK) {
-		event_attr_init(&attr);
-		fd = sys_perf_event_open(&attr, 0, -1, -1, 0);
-		if (fd != -1) {
-			close(fd);
-			return attr.sample_regs_user;
-		}
-	}
-	return PERF_REGS_MASK;
-}
diff --git a/tools/perf/arch/csky/Build b/tools/perf/arch/csky/Build
deleted file mode 100644
index e63eabc2c8f4..000000000000
--- a/tools/perf/arch/csky/Build
+++ /dev/null
@@ -1 +0,0 @@
-perf-util-y += util/
diff --git a/tools/perf/arch/csky/util/Build b/tools/perf/arch/csky/util/Build
deleted file mode 100644
index 6b2d0e021b11..000000000000
--- a/tools/perf/arch/csky/util/Build
+++ /dev/null
@@ -1 +0,0 @@
-perf-util-y += perf_regs.o
diff --git a/tools/perf/arch/csky/util/perf_regs.c b/tools/perf/arch/csky/util/perf_regs.c
deleted file mode 100644
index 2cf7a54106e0..000000000000
--- a/tools/perf/arch/csky/util/perf_regs.c
+++ /dev/null
@@ -1,13 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "perf_regs.h"
-#include "../../util/perf_regs.h"
-
-uint64_t arch__intr_reg_mask(void)
-{
-	return PERF_REGS_MASK;
-}
-
-uint64_t arch__user_reg_mask(void)
-{
-	return PERF_REGS_MASK;
-}
diff --git a/tools/perf/arch/loongarch/util/Build b/tools/perf/arch/loongarch/util/Build
index 1cb06a5f8935..3ad73d0289f3 100644
--- a/tools/perf/arch/loongarch/util/Build
+++ b/tools/perf/arch/loongarch/util/Build
@@ -1,5 +1,4 @@
 perf-util-y += header.o
-perf-util-y += perf_regs.o
 
 perf-util-$(CONFIG_LOCAL_LIBUNWIND) += unwind-libunwind.o
 perf-util-$(CONFIG_LIBDW_DWARF_UNWIND) += unwind-libdw.o
diff --git a/tools/perf/arch/loongarch/util/perf_regs.c b/tools/perf/arch/loongarch/util/perf_regs.c
deleted file mode 100644
index 03a5bc0cf64c..000000000000
--- a/tools/perf/arch/loongarch/util/perf_regs.c
+++ /dev/null
@@ -1,13 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "perf_regs.h"
-#include "../../../util/perf_regs.h"
-
-uint64_t arch__intr_reg_mask(void)
-{
-	return PERF_REGS_MASK;
-}
-
-uint64_t arch__user_reg_mask(void)
-{
-	return PERF_REGS_MASK;
-}
diff --git a/tools/perf/arch/mips/util/Build b/tools/perf/arch/mips/util/Build
index 691fa2051958..818b808a8247 100644
--- a/tools/perf/arch/mips/util/Build
+++ b/tools/perf/arch/mips/util/Build
@@ -1,2 +1 @@
-perf-util-y += perf_regs.o
 perf-util-$(CONFIG_LOCAL_LIBUNWIND) += unwind-libunwind.o
diff --git a/tools/perf/arch/mips/util/perf_regs.c b/tools/perf/arch/mips/util/perf_regs.c
deleted file mode 100644
index 2cf7a54106e0..000000000000
--- a/tools/perf/arch/mips/util/perf_regs.c
+++ /dev/null
@@ -1,13 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "perf_regs.h"
-#include "../../util/perf_regs.h"
-
-uint64_t arch__intr_reg_mask(void)
-{
-	return PERF_REGS_MASK;
-}
-
-uint64_t arch__user_reg_mask(void)
-{
-	return PERF_REGS_MASK;
-}
diff --git a/tools/perf/arch/powerpc/util/perf_regs.c b/tools/perf/arch/powerpc/util/perf_regs.c
index 779073f7e992..93f929fc32e3 100644
--- a/tools/perf/arch/powerpc/util/perf_regs.c
+++ b/tools/perf/arch/powerpc/util/perf_regs.c
@@ -123,50 +123,3 @@ int arch_sdt_arg_parse_op(char *old_op, char **new_op)
 
 	return SDT_ARG_VALID;
 }
-
-uint64_t arch__intr_reg_mask(void)
-{
-	struct perf_event_attr attr = {
-		.type                   = PERF_TYPE_HARDWARE,
-		.config                 = PERF_COUNT_HW_CPU_CYCLES,
-		.sample_type            = PERF_SAMPLE_REGS_INTR,
-		.precise_ip             = 1,
-		.disabled               = 1,
-		.exclude_kernel         = 1,
-	};
-	int fd;
-	u32 version;
-	u64 extended_mask = 0, mask = PERF_REGS_MASK;
-
-	/*
-	 * Get the PVR value to set the extended
-	 * mask specific to platform.
-	 */
-	version = (((mfspr(SPRN_PVR)) >>  16) & 0xFFFF);
-	if (version == PVR_POWER9)
-		extended_mask = PERF_REG_PMU_MASK_300;
-	else if ((version == PVR_POWER10) || (version == PVR_POWER11))
-		extended_mask = PERF_REG_PMU_MASK_31;
-	else
-		return mask;
-
-	attr.sample_regs_intr = extended_mask;
-	attr.sample_period = 1;
-	event_attr_init(&attr);
-
-	/*
-	 * check if the pmu supports perf extended regs, before
-	 * returning the register mask to sample.
-	 */
-	fd = sys_perf_event_open(&attr, 0, -1, -1, 0);
-	if (fd != -1) {
-		close(fd);
-		mask |= extended_mask;
-	}
-	return mask;
-}
-
-uint64_t arch__user_reg_mask(void)
-{
-	return PERF_REGS_MASK;
-}
diff --git a/tools/perf/arch/riscv/include/perf_regs.h b/tools/perf/arch/riscv/include/perf_regs.h
index 89d5bbb8d2b8..af7a1b47bf66 100644
--- a/tools/perf/arch/riscv/include/perf_regs.h
+++ b/tools/perf/arch/riscv/include/perf_regs.h
@@ -10,10 +10,15 @@
 
 #define PERF_REGS_MASK	((1ULL << PERF_REG_RISCV_MAX) - 1)
 #define PERF_REGS_MAX	PERF_REG_RISCV_MAX
+
+#if defined(__riscv_xlen)
 #if __riscv_xlen == 64
-#define PERF_SAMPLE_REGS_ABI    PERF_SAMPLE_REGS_ABI_64
+#define PERF_SAMPLE_REGS_ABI	PERF_SAMPLE_REGS_ABI_64
 #else
 #define PERF_SAMPLE_REGS_ABI	PERF_SAMPLE_REGS_ABI_32
 #endif
+#else
+#define PERF_SAMPLE_REGS_ABI	PERF_SAMPLE_REGS_NONE
+#endif
 
 #endif /* ARCH_PERF_REGS_H */
diff --git a/tools/perf/arch/riscv/util/Build b/tools/perf/arch/riscv/util/Build
index c01231bcf9c3..2328fb9a30a3 100644
--- a/tools/perf/arch/riscv/util/Build
+++ b/tools/perf/arch/riscv/util/Build
@@ -1,2 +1 @@
-perf-util-y += perf_regs.o
 perf-util-y += header.o
diff --git a/tools/perf/arch/riscv/util/perf_regs.c b/tools/perf/arch/riscv/util/perf_regs.c
deleted file mode 100644
index 2cf7a54106e0..000000000000
--- a/tools/perf/arch/riscv/util/perf_regs.c
+++ /dev/null
@@ -1,13 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "perf_regs.h"
-#include "../../util/perf_regs.h"
-
-uint64_t arch__intr_reg_mask(void)
-{
-	return PERF_REGS_MASK;
-}
-
-uint64_t arch__user_reg_mask(void)
-{
-	return PERF_REGS_MASK;
-}
diff --git a/tools/perf/arch/s390/util/Build b/tools/perf/arch/s390/util/Build
index 87229f2c4397..65d75cd5b138 100644
--- a/tools/perf/arch/s390/util/Build
+++ b/tools/perf/arch/s390/util/Build
@@ -1,5 +1,4 @@
 perf-util-y += header.o
-perf-util-y += perf_regs.o
 
 perf-util-y += machine.o
 perf-util-y += pmu.o
diff --git a/tools/perf/arch/s390/util/perf_regs.c b/tools/perf/arch/s390/util/perf_regs.c
deleted file mode 100644
index 2cf7a54106e0..000000000000
--- a/tools/perf/arch/s390/util/perf_regs.c
+++ /dev/null
@@ -1,13 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "perf_regs.h"
-#include "../../util/perf_regs.h"
-
-uint64_t arch__intr_reg_mask(void)
-{
-	return PERF_REGS_MASK;
-}
-
-uint64_t arch__user_reg_mask(void)
-{
-	return PERF_REGS_MASK;
-}
diff --git a/tools/perf/arch/x86/util/perf_regs.c b/tools/perf/arch/x86/util/perf_regs.c
index a7ca4154fdf9..41141cebe226 100644
--- a/tools/perf/arch/x86/util/perf_regs.c
+++ b/tools/perf/arch/x86/util/perf_regs.c
@@ -233,51 +233,3 @@ int arch_sdt_arg_parse_op(char *old_op, char **new_op)
 
 	return SDT_ARG_VALID;
 }
-
-uint64_t arch__intr_reg_mask(void)
-{
-	struct perf_event_attr attr = {
-		.type			= PERF_TYPE_HARDWARE,
-		.config			= PERF_COUNT_HW_CPU_CYCLES,
-		.sample_type		= PERF_SAMPLE_REGS_INTR,
-		.sample_regs_intr	= PERF_REG_EXTENDED_MASK,
-		.precise_ip		= 1,
-		.disabled 		= 1,
-		.exclude_kernel		= 1,
-	};
-	int fd;
-	/*
-	 * In an unnamed union, init it here to build on older gcc versions
-	 */
-	attr.sample_period = 1;
-
-	if (perf_pmus__num_core_pmus() > 1) {
-		struct perf_pmu *pmu = NULL;
-		__u64 type = PERF_TYPE_RAW;
-
-		/*
-		 * The same register set is supported among different hybrid PMUs.
-		 * Only check the first available one.
-		 */
-		while ((pmu = perf_pmus__scan_core(pmu)) != NULL) {
-			type = pmu->type;
-			break;
-		}
-		attr.config |= type << PERF_PMU_TYPE_SHIFT;
-	}
-
-	event_attr_init(&attr);
-
-	fd = sys_perf_event_open(&attr, 0, -1, -1, 0);
-	if (fd != -1) {
-		close(fd);
-		return (PERF_REG_EXTENDED_MASK | PERF_REGS_MASK);
-	}
-
-	return PERF_REGS_MASK;
-}
-
-uint64_t arch__user_reg_mask(void)
-{
-	return PERF_REGS_MASK;
-}
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index aff44ffd3ff1..f59228c1a39e 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -1055,13 +1055,13 @@ static void __evsel__config_callchain(struct evsel *evsel, struct record_opts *o
 			evsel__set_sample_bit(evsel, REGS_USER);
 			evsel__set_sample_bit(evsel, STACK_USER);
 			if (opts->sample_user_regs &&
-			    DWARF_MINIMAL_REGS(e_machine) != arch__user_reg_mask()) {
+			    DWARF_MINIMAL_REGS(e_machine) != perf_user_reg_mask(EM_HOST)) {
 				attr->sample_regs_user |= DWARF_MINIMAL_REGS(e_machine);
 				pr_warning("WARNING: The use of --call-graph=dwarf may require all the user registers, "
 					   "specifying a subset with --user-regs may render DWARF unwinding unreliable, "
 					   "so the minimal registers set (IP, SP) is explicitly forced.\n");
 			} else {
-				attr->sample_regs_user |= arch__user_reg_mask();
+				attr->sample_regs_user |= perf_user_reg_mask(EM_HOST);
 			}
 			attr->sample_stack_user = param->dump_size;
 			attr->exclude_callchain_user = 1;
diff --git a/tools/perf/util/parse-regs-options.c b/tools/perf/util/parse-regs-options.c
index b44b47d9059f..c93c2f0c8105 100644
--- a/tools/perf/util/parse-regs-options.c
+++ b/tools/perf/util/parse-regs-options.c
@@ -66,7 +66,7 @@ __parse_regs(const struct option *opt, const char *str, int unset, bool intr)
 	if (*mode)
 		return -1;
 
-	mask = intr ? arch__intr_reg_mask() : arch__user_reg_mask();
+	mask = intr ? perf_intr_reg_mask(EM_HOST) : perf_user_reg_mask(EM_HOST);
 
 	/* str may be NULL in case no arg is passed to -I */
 	if (!str) {
diff --git a/tools/perf/util/perf-regs-arch/perf_regs_aarch64.c b/tools/perf/util/perf-regs-arch/perf_regs_aarch64.c
index 9dcda80d310f..666874f625b6 100644
--- a/tools/perf/util/perf-regs-arch/perf_regs_aarch64.c
+++ b/tools/perf/util/perf-regs-arch/perf_regs_aarch64.c
@@ -1,7 +1,58 @@
 // SPDX-License-Identifier: GPL-2.0
+#include <errno.h>
+#include <regex.h>
+#include <string.h>
+#include <sys/auxv.h>
+#include <linux/kernel.h>
+#include <linux/zalloc.h>
 
+#include "../debug.h"
+#include "../event.h"
 #include "../perf_regs.h"
-#include "../../../arch/arm64/include/uapi/asm/perf_regs.h"
+#include "../../perf-sys.h"
+#include "../../arch/arm64/include/perf_regs.h"
+
+#define SMPL_REG_MASK(b) (1ULL << (b))
+
+#ifndef HWCAP_SVE
+#define HWCAP_SVE	(1 << 22)
+#endif
+
+uint64_t __perf_reg_mask_arm64(bool intr)
+{
+	struct perf_event_attr attr = {
+		.type                   = PERF_TYPE_HARDWARE,
+		.config                 = PERF_COUNT_HW_CPU_CYCLES,
+		.sample_type            = PERF_SAMPLE_REGS_USER,
+		.disabled               = 1,
+		.exclude_kernel         = 1,
+		.sample_period		= 1,
+		.sample_regs_user	= PERF_REGS_MASK
+	};
+	int fd;
+
+	if (intr)
+		return PERF_REGS_MASK;
+
+	if (getauxval(AT_HWCAP) & HWCAP_SVE)
+		attr.sample_regs_user |= SMPL_REG_MASK(PERF_REG_ARM64_VG);
+
+	/*
+	 * Check if the pmu supports perf extended regs, before
+	 * returning the register mask to sample. Open the event
+	 * on the perf process to check this.
+	 */
+	if (attr.sample_regs_user != PERF_REGS_MASK) {
+		event_attr_init(&attr);
+		fd = sys_perf_event_open(&attr, /*pid=*/0, /*cpu=*/-1,
+					 /*group_fd=*/-1, /*flags=*/0);
+		if (fd != -1) {
+			close(fd);
+			return attr.sample_regs_user;
+		}
+	}
+	return PERF_REGS_MASK;
+}
 
 const char *__perf_reg_name_arm64(int id)
 {
diff --git a/tools/perf/util/perf-regs-arch/perf_regs_arm.c b/tools/perf/util/perf-regs-arch/perf_regs_arm.c
index e29d130a587a..184d6e248dfc 100644
--- a/tools/perf/util/perf-regs-arch/perf_regs_arm.c
+++ b/tools/perf/util/perf-regs-arch/perf_regs_arm.c
@@ -1,7 +1,12 @@
 // SPDX-License-Identifier: GPL-2.0
 
 #include "../perf_regs.h"
-#include "../../../arch/arm/include/uapi/asm/perf_regs.h"
+#include "../../arch/arm/include/perf_regs.h"
+
+uint64_t __perf_reg_mask_arm(bool intr __maybe_unused)
+{
+	return PERF_REGS_MASK;
+}
 
 const char *__perf_reg_name_arm(int id)
 {
diff --git a/tools/perf/util/perf-regs-arch/perf_regs_csky.c b/tools/perf/util/perf-regs-arch/perf_regs_csky.c
index 95808f93d45b..16cbd8303acf 100644
--- a/tools/perf/util/perf-regs-arch/perf_regs_csky.c
+++ b/tools/perf/util/perf-regs-arch/perf_regs_csky.c
@@ -9,7 +9,12 @@
 #include "../perf_regs.h"
 #undef __CSKYABIV2__
 #define __CSKYABIV2__ 1  // Always want the V2 register definitions.
-#include "../../arch/csky/include/uapi/asm/perf_regs.h"
+#include "../../arch/csky/include/perf_regs.h"
+
+uint64_t __perf_reg_mask_csky(bool intr __maybe_unused)
+{
+	return PERF_REGS_MASK;
+}
 
 const char *__perf_reg_name_csky(int id, uint32_t e_flags)
 {
diff --git a/tools/perf/util/perf-regs-arch/perf_regs_loongarch.c b/tools/perf/util/perf-regs-arch/perf_regs_loongarch.c
index 043f97f4e3ac..478ee889afa1 100644
--- a/tools/perf/util/perf-regs-arch/perf_regs_loongarch.c
+++ b/tools/perf/util/perf-regs-arch/perf_regs_loongarch.c
@@ -1,7 +1,12 @@
 // SPDX-License-Identifier: GPL-2.0
 
 #include "../perf_regs.h"
-#include "../../../arch/loongarch/include/uapi/asm/perf_regs.h"
+#include "../../arch/loongarch/include/perf_regs.h"
+
+uint64_t __perf_reg_mask_loongarch(bool intr __maybe_unused)
+{
+	return PERF_REGS_MASK;
+}
 
 const char *__perf_reg_name_loongarch(int id)
 {
diff --git a/tools/perf/util/perf-regs-arch/perf_regs_mips.c b/tools/perf/util/perf-regs-arch/perf_regs_mips.c
index 793178fc3c78..c5a475f6ec64 100644
--- a/tools/perf/util/perf-regs-arch/perf_regs_mips.c
+++ b/tools/perf/util/perf-regs-arch/perf_regs_mips.c
@@ -1,7 +1,12 @@
 // SPDX-License-Identifier: GPL-2.0
 
 #include "../perf_regs.h"
-#include "../../../arch/mips/include/uapi/asm/perf_regs.h"
+#include "../../arch/mips/include/perf_regs.h"
+
+uint64_t __perf_reg_mask_mips(bool intr __maybe_unused)
+{
+	return PERF_REGS_MASK;
+}
 
 const char *__perf_reg_name_mips(int id)
 {
diff --git a/tools/perf/util/perf-regs-arch/perf_regs_powerpc.c b/tools/perf/util/perf-regs-arch/perf_regs_powerpc.c
index 08636bb09a3a..f0a547ad809b 100644
--- a/tools/perf/util/perf-regs-arch/perf_regs_powerpc.c
+++ b/tools/perf/util/perf-regs-arch/perf_regs_powerpc.c
@@ -1,7 +1,82 @@
 // SPDX-License-Identifier: GPL-2.0
 
+#include <errno.h>
+#include <string.h>
+#include <regex.h>
+#include <linux/zalloc.h>
+
+#include "../debug.h"
+#include "../event.h"
+#include "../header.h"
 #include "../perf_regs.h"
-#include "../../../arch/powerpc/include/uapi/asm/perf_regs.h"
+#include "../../perf-sys.h"
+#include "../../arch/powerpc/util/utils_header.h"
+#include "../../arch/powerpc/include/perf_regs.h"
+
+#include <linux/kernel.h>
+
+#define PVR_POWER9		0x004E
+#define PVR_POWER10		0x0080
+#define PVR_POWER11		0x0082
+
+/*
+ * mfspr is a POWERPC specific instruction, ensure it's only
+ * built and called on POWERPC by guarding with __powerpc64__
+ * or __powerpc__.
+ */
+#if defined(__powerpc64__) && defined(__powerpc__)
+uint64_t __perf_reg_mask_powerpc(bool intr)
+{
+	struct perf_event_attr attr = {
+		.type                   = PERF_TYPE_HARDWARE,
+		.config                 = PERF_COUNT_HW_CPU_CYCLES,
+		.sample_type            = PERF_SAMPLE_REGS_INTR,
+		.precise_ip             = 1,
+		.disabled               = 1,
+		.exclude_kernel         = 1,
+	};
+	int fd;
+	u32 version;
+	u64 extended_mask = 0, mask = PERF_REGS_MASK;
+
+	if (!intr)
+		return PERF_REGS_MASK;
+
+	/*
+	 * Get the PVR value to set the extended
+	 * mask specific to platform.
+	 */
+	version = (((mfspr(SPRN_PVR)) >>  16) & 0xFFFF);
+	if (version == PVR_POWER9)
+		extended_mask = PERF_REG_PMU_MASK_300;
+	else if ((version == PVR_POWER10) || (version == PVR_POWER11))
+		extended_mask = PERF_REG_PMU_MASK_31;
+	else
+		return mask;
+
+	attr.sample_regs_intr = extended_mask;
+	attr.sample_period = 1;
+	event_attr_init(&attr);
+
+	/*
+	 * Check if the pmu supports perf extended regs, before
+	 * returning the register mask to sample. Open the event
+	 * on the perf process to check this.
+	 */
+	fd = sys_perf_event_open(&attr, /*pid=*/0, /*cpu=*/-1,
+				 /*group_fd=*/-1, /*flags=*/0);
+	if (fd != -1) {
+		close(fd);
+		mask |= extended_mask;
+	}
+	return mask;
+}
+#else
+uint64_t __perf_reg_mask_powerpc(bool intr __maybe_unused)
+{
+	return PERF_REGS_MASK;
+}
+#endif
 
 const char *__perf_reg_name_powerpc(int id)
 {
diff --git a/tools/perf/util/perf-regs-arch/perf_regs_riscv.c b/tools/perf/util/perf-regs-arch/perf_regs_riscv.c
index 337b687c655d..5b5f21fcba8c 100644
--- a/tools/perf/util/perf-regs-arch/perf_regs_riscv.c
+++ b/tools/perf/util/perf-regs-arch/perf_regs_riscv.c
@@ -1,7 +1,12 @@
 // SPDX-License-Identifier: GPL-2.0
 
 #include "../perf_regs.h"
-#include "../../../arch/riscv/include/uapi/asm/perf_regs.h"
+#include "../../arch/riscv/include/perf_regs.h"
+
+uint64_t __perf_reg_mask_riscv(bool intr __maybe_unused)
+{
+	return PERF_REGS_MASK;
+}
 
 const char *__perf_reg_name_riscv(int id)
 {
diff --git a/tools/perf/util/perf-regs-arch/perf_regs_s390.c b/tools/perf/util/perf-regs-arch/perf_regs_s390.c
index d69bba881080..c61df24edf0f 100644
--- a/tools/perf/util/perf-regs-arch/perf_regs_s390.c
+++ b/tools/perf/util/perf-regs-arch/perf_regs_s390.c
@@ -1,7 +1,12 @@
 // SPDX-License-Identifier: GPL-2.0
 
 #include "../perf_regs.h"
-#include "../../../arch/s390/include/uapi/asm/perf_regs.h"
+#include "../../arch/s390/include/perf_regs.h"
+
+uint64_t __perf_reg_mask_s390(bool intr __maybe_unused)
+{
+	return PERF_REGS_MASK;
+}
 
 const char *__perf_reg_name_s390(int id)
 {
diff --git a/tools/perf/util/perf-regs-arch/perf_regs_x86.c b/tools/perf/util/perf-regs-arch/perf_regs_x86.c
index 708954a9d35d..d573f9a9ca46 100644
--- a/tools/perf/util/perf-regs-arch/perf_regs_x86.c
+++ b/tools/perf/util/perf-regs-arch/perf_regs_x86.c
@@ -1,7 +1,65 @@
 // SPDX-License-Identifier: GPL-2.0
 
+#include <errno.h>
+#include <string.h>
+#include <regex.h>
+#include <linux/kernel.h>
+#include <linux/zalloc.h>
+
+#include "../debug.h"
+#include "../event.h"
+#include "../pmu.h"
+#include "../pmus.h"
 #include "../perf_regs.h"
-#include "../../../arch/x86/include/uapi/asm/perf_regs.h"
+#include "../../perf-sys.h"
+#include "../../arch/x86/include/perf_regs.h"
+
+uint64_t __perf_reg_mask_x86(bool intr)
+{
+	struct perf_event_attr attr = {
+		.type			= PERF_TYPE_HARDWARE,
+		.config			= PERF_COUNT_HW_CPU_CYCLES,
+		.sample_type		= PERF_SAMPLE_REGS_INTR,
+		.sample_regs_intr	= PERF_REG_EXTENDED_MASK,
+		.precise_ip		= 1,
+		.disabled		= 1,
+		.exclude_kernel		= 1,
+	};
+	int fd;
+
+	if (!intr)
+		return PERF_REGS_MASK;
+
+	/*
+	 * In an unnamed union, init it here to build on older gcc versions
+	 */
+	attr.sample_period = 1;
+
+	if (perf_pmus__num_core_pmus() > 1) {
+		struct perf_pmu *pmu = NULL;
+		__u64 type = PERF_TYPE_RAW;
+
+		/*
+		 * The same register set is supported among different hybrid PMUs.
+		 * Only check the first available one.
+		 */
+		while ((pmu = perf_pmus__scan_core(pmu)) != NULL) {
+			type = pmu->type;
+			break;
+		}
+		attr.config |= type << PERF_PMU_TYPE_SHIFT;
+	}
+
+	event_attr_init(&attr);
+	fd = sys_perf_event_open(&attr, /*pid=*/0, /*cpu=*/-1,
+				 /*group_fd=*/-1, /*flags=*/0);
+	if (fd != -1) {
+		close(fd);
+		return (PERF_REG_EXTENDED_MASK | PERF_REGS_MASK);
+	}
+
+	return PERF_REGS_MASK;
+}
 
 const char *__perf_reg_name_x86(int id)
 {
diff --git a/tools/perf/util/perf_regs.c b/tools/perf/util/perf_regs.c
index 14b7be30ab20..4d9a286a0e56 100644
--- a/tools/perf/util/perf_regs.c
+++ b/tools/perf/util/perf_regs.c
@@ -13,14 +13,90 @@ int __weak arch_sdt_arg_parse_op(char *old_op __maybe_unused,
 	return SDT_ARG_SKIP;
 }
 
-uint64_t __weak arch__intr_reg_mask(void)
+uint64_t perf_intr_reg_mask(uint16_t e_machine)
 {
-	return 0;
+	uint64_t mask = 0;
+
+	switch (e_machine) {
+	case EM_ARM:
+		mask = __perf_reg_mask_arm(/*intr=*/true);
+		break;
+	case EM_AARCH64:
+		mask = __perf_reg_mask_arm64(/*intr=*/true);
+		break;
+	case EM_CSKY:
+		mask = __perf_reg_mask_csky(/*intr=*/true);
+		break;
+	case EM_LOONGARCH:
+		mask = __perf_reg_mask_loongarch(/*intr=*/true);
+		break;
+	case EM_MIPS:
+		mask = __perf_reg_mask_mips(/*intr=*/true);
+		break;
+	case EM_PPC:
+	case EM_PPC64:
+		mask = __perf_reg_mask_powerpc(/*intr=*/true);
+		break;
+	case EM_RISCV:
+		mask = __perf_reg_mask_riscv(/*intr=*/true);
+		break;
+	case EM_S390:
+		mask = __perf_reg_mask_s390(/*intr=*/true);
+		break;
+	case EM_386:
+	case EM_X86_64:
+		mask = __perf_reg_mask_x86(/*intr=*/true);
+		break;
+	default:
+		pr_debug("Unknown ELF machine %d, interrupt sampling register mask will be empty.\n",
+			 e_machine);
+		break;
+	}
+
+	return mask;
 }
 
-uint64_t __weak arch__user_reg_mask(void)
+uint64_t perf_user_reg_mask(uint16_t e_machine)
 {
-	return 0;
+	uint64_t mask = 0;
+
+	switch (e_machine) {
+	case EM_ARM:
+		mask = __perf_reg_mask_arm(/*intr=*/false);
+		break;
+	case EM_AARCH64:
+		mask = __perf_reg_mask_arm64(/*intr=*/false);
+		break;
+	case EM_CSKY:
+		mask = __perf_reg_mask_csky(/*intr=*/false);
+		break;
+	case EM_LOONGARCH:
+		mask = __perf_reg_mask_loongarch(/*intr=*/false);
+		break;
+	case EM_MIPS:
+		mask = __perf_reg_mask_mips(/*intr=*/false);
+		break;
+	case EM_PPC:
+	case EM_PPC64:
+		mask = __perf_reg_mask_powerpc(/*intr=*/false);
+		break;
+	case EM_RISCV:
+		mask = __perf_reg_mask_riscv(/*intr=*/false);
+		break;
+	case EM_S390:
+		mask = __perf_reg_mask_s390(/*intr=*/false);
+		break;
+	case EM_386:
+	case EM_X86_64:
+		mask = __perf_reg_mask_x86(/*intr=*/false);
+		break;
+	default:
+		pr_debug("Unknown ELF machine %d, user sampling register mask will be empty.\n",
+			 e_machine);
+		break;
+	}
+
+	return mask;
 }
 
 const char *perf_reg_name(int id, uint16_t e_machine, uint32_t e_flags)
diff --git a/tools/perf/util/perf_regs.h b/tools/perf/util/perf_regs.h
index ed7c1b1358fa..2b27139acadb 100644
--- a/tools/perf/util/perf_regs.h
+++ b/tools/perf/util/perf_regs.h
@@ -13,37 +13,55 @@ enum {
 };
 
 int arch_sdt_arg_parse_op(char *old_op, char **new_op);
-uint64_t arch__intr_reg_mask(void);
-uint64_t arch__user_reg_mask(void);
+uint64_t perf_intr_reg_mask(uint16_t e_machine);
+uint64_t perf_user_reg_mask(uint16_t e_machine);
 
 const char *perf_reg_name(int id, uint16_t e_machine, uint32_t e_flags);
 int perf_reg_value(u64 *valp, struct regs_dump *regs, int id);
 uint64_t perf_arch_reg_ip(uint16_t e_machine);
 uint64_t perf_arch_reg_sp(uint16_t e_machine);
+
+uint64_t __perf_reg_mask_arm64(bool intr);
 const char *__perf_reg_name_arm64(int id);
 uint64_t __perf_reg_ip_arm64(void);
 uint64_t __perf_reg_sp_arm64(void);
+
+uint64_t __perf_reg_mask_arm(bool intr);
 const char *__perf_reg_name_arm(int id);
 uint64_t __perf_reg_ip_arm(void);
 uint64_t __perf_reg_sp_arm(void);
+
+uint64_t __perf_reg_mask_csky(bool intr);
 const char *__perf_reg_name_csky(int id, uint32_t e_flags);
 uint64_t __perf_reg_ip_csky(void);
 uint64_t __perf_reg_sp_csky(void);
+
+uint64_t __perf_reg_mask_loongarch(bool intr);
 const char *__perf_reg_name_loongarch(int id);
 uint64_t __perf_reg_ip_loongarch(void);
 uint64_t __perf_reg_sp_loongarch(void);
+
+uint64_t __perf_reg_mask_mips(bool intr);
 const char *__perf_reg_name_mips(int id);
 uint64_t __perf_reg_ip_mips(void);
 uint64_t __perf_reg_sp_mips(void);
+
+uint64_t __perf_reg_mask_powerpc(bool intr);
 const char *__perf_reg_name_powerpc(int id);
 uint64_t __perf_reg_ip_powerpc(void);
 uint64_t __perf_reg_sp_powerpc(void);
+
+uint64_t __perf_reg_mask_riscv(bool intr);
 const char *__perf_reg_name_riscv(int id);
 uint64_t __perf_reg_ip_riscv(void);
 uint64_t __perf_reg_sp_riscv(void);
+
+uint64_t __perf_reg_mask_s390(bool intr);
 const char *__perf_reg_name_s390(int id);
 uint64_t __perf_reg_ip_s390(void);
 uint64_t __perf_reg_sp_s390(void);
+
+uint64_t __perf_reg_mask_x86(bool intr);
 const char *__perf_reg_name_x86(int id);
 uint64_t __perf_reg_ip_x86(void);
 uint64_t __perf_reg_sp_x86(void);
-- 
cgit v1.2.3


From e5e66adfe45a6480d96b4e40edc05883915f44b2 Mon Sep 17 00:00:00 2001
From: Dapeng Mi <dapeng1.mi@linux.intel.com>
Date: Tue, 3 Feb 2026 10:43:56 +0800
Subject: perf regs: Remove __weak attributive arch_sdt_arg_parse_op() function

In line with the previous patch, the __weak arch_sdt_arg_parse_op()
function is removed.

Architectural-specific implementations in the arch/ directory are now
converted into sub-functions within the util/perf-regs-arch/ directory.

The perf_sdt_arg_parse_op() function will call these sub-functions based
on the EM_HOST.

This change enables cross-architecture calls to arch_sdt_arg_parse_op().

No functional changes are intended.

Suggested-by: Ian Rogers <irogers@google.com>
Reviewed-by: Ian Rogers <irogers@google.com>
Signed-off-by: Dapeng Mi <dapeng1.mi@linux.intel.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Alexandre Ghiti <alex@ghiti.fr>
Cc: Guo Ren <guoren@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Mike Leach <mike.leach@linaro.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <pjw@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Falcon <thomas.falcon@intel.com>
Cc: Will Deacon <will@kernel.org>
Cc: Xudong Hao <xudong.hao@intel.com>
Cc: Zide Chen <zide.chen@intel.com>
[ Fixed up somme fuzz with powerpc and x86 Build files wrt removing perf_regs.o ]
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/arch/arm64/util/Build                   |   1 -
 tools/perf/arch/arm64/util/perf_regs.c             | 105 ---------
 tools/perf/arch/powerpc/util/Build                 |   1 -
 tools/perf/arch/powerpc/util/perf_regs.c           | 125 -----------
 tools/perf/arch/x86/util/Build                     |   1 -
 tools/perf/arch/x86/util/perf_regs.c               | 235 ---------------------
 tools/perf/util/perf-regs-arch/perf_regs_aarch64.c |  86 ++++++++
 tools/perf/util/perf-regs-arch/perf_regs_powerpc.c | 106 ++++++++++
 tools/perf/util/perf-regs-arch/perf_regs_x86.c     | 221 +++++++++++++++++++
 tools/perf/util/perf_regs.c                        |  25 ++-
 tools/perf/util/perf_regs.h                        |   5 +-
 tools/perf/util/probe-file.c                       |   3 +-
 12 files changed, 441 insertions(+), 473 deletions(-)
 delete mode 100644 tools/perf/arch/arm64/util/perf_regs.c
 delete mode 100644 tools/perf/arch/powerpc/util/perf_regs.c
 delete mode 100644 tools/perf/arch/x86/util/perf_regs.c

(limited to 'tools')

diff --git a/tools/perf/arch/arm64/util/Build b/tools/perf/arch/arm64/util/Build
index d25edd9e1883..4e06a08d281a 100644
--- a/tools/perf/arch/arm64/util/Build
+++ b/tools/perf/arch/arm64/util/Build
@@ -7,6 +7,5 @@ perf-util-y += header.o
 perf-util-y += hisi-ptt.o
 perf-util-y += machine.o
 perf-util-y += mem-events.o
-perf-util-y += perf_regs.o
 perf-util-y += pmu.o
 perf-util-y += tsc.o
diff --git a/tools/perf/arch/arm64/util/perf_regs.c b/tools/perf/arch/arm64/util/perf_regs.c
deleted file mode 100644
index 47f58eaba032..000000000000
--- a/tools/perf/arch/arm64/util/perf_regs.c
+++ /dev/null
@@ -1,105 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <errno.h>
-#include <regex.h>
-#include <string.h>
-#include <sys/auxv.h>
-#include <linux/kernel.h>
-#include <linux/zalloc.h>
-
-#include "perf_regs.h"
-#include "../../../perf-sys.h"
-#include "../../../util/debug.h"
-#include "../../../util/event.h"
-#include "../../../util/perf_regs.h"
-
-#define SMPL_REG_MASK(b) (1ULL << (b))
-
-#ifndef HWCAP_SVE
-#define HWCAP_SVE	(1 << 22)
-#endif
-
-/* %xNUM */
-#define SDT_OP_REGEX1  "^(x[1-2]?[0-9]|3[0-1])$"
-
-/* [sp], [sp, NUM] */
-#define SDT_OP_REGEX2  "^\\[sp(, )?([0-9]+)?\\]$"
-
-static regex_t sdt_op_regex1, sdt_op_regex2;
-
-static int sdt_init_op_regex(void)
-{
-	static int initialized;
-	int ret = 0;
-
-	if (initialized)
-		return 0;
-
-	ret = regcomp(&sdt_op_regex1, SDT_OP_REGEX1, REG_EXTENDED);
-	if (ret)
-		goto error;
-
-	ret = regcomp(&sdt_op_regex2, SDT_OP_REGEX2, REG_EXTENDED);
-	if (ret)
-		goto free_regex1;
-
-	initialized = 1;
-	return 0;
-
-free_regex1:
-	regfree(&sdt_op_regex1);
-error:
-	pr_debug4("Regex compilation error.\n");
-	return ret;
-}
-
-/*
- * SDT marker arguments on Arm64 uses %xREG or [sp, NUM], currently
- * support these two formats.
- */
-int arch_sdt_arg_parse_op(char *old_op, char **new_op)
-{
-	int ret, new_len;
-	regmatch_t rm[5];
-
-	ret = sdt_init_op_regex();
-	if (ret < 0)
-		return ret;
-
-	if (!regexec(&sdt_op_regex1, old_op, 3, rm, 0)) {
-		/* Extract xNUM */
-		new_len = 2;	/* % NULL */
-		new_len += (int)(rm[1].rm_eo - rm[1].rm_so);
-
-		*new_op = zalloc(new_len);
-		if (!*new_op)
-			return -ENOMEM;
-
-		scnprintf(*new_op, new_len, "%%%.*s",
-			(int)(rm[1].rm_eo - rm[1].rm_so), old_op + rm[1].rm_so);
-	} else if (!regexec(&sdt_op_regex2, old_op, 5, rm, 0)) {
-		/* [sp], [sp, NUM] or [sp,NUM] */
-		new_len = 7;	/* + ( % s p ) NULL */
-
-		/* If the argument is [sp], need to fill offset '0' */
-		if (rm[2].rm_so == -1)
-			new_len += 1;
-		else
-			new_len += (int)(rm[2].rm_eo - rm[2].rm_so);
-
-		*new_op = zalloc(new_len);
-		if (!*new_op)
-			return -ENOMEM;
-
-		if (rm[2].rm_so == -1)
-			scnprintf(*new_op, new_len, "+0(%%sp)");
-		else
-			scnprintf(*new_op, new_len, "+%.*s(%%sp)",
-				  (int)(rm[2].rm_eo - rm[2].rm_so),
-				  old_op + rm[2].rm_so);
-	} else {
-		pr_debug4("Skipping unsupported SDT argument: %s\n", old_op);
-		return SDT_ARG_SKIP;
-	}
-
-	return SDT_ARG_VALID;
-}
diff --git a/tools/perf/arch/powerpc/util/Build b/tools/perf/arch/powerpc/util/Build
index e091b6785674..d66574cbb9a9 100644
--- a/tools/perf/arch/powerpc/util/Build
+++ b/tools/perf/arch/powerpc/util/Build
@@ -1,5 +1,4 @@
 perf-util-y += header.o
-perf-util-y += perf_regs.o
 perf-util-y += mem-events.o
 perf-util-y += pmu.o
 perf-util-y += sym-handling.o
diff --git a/tools/perf/arch/powerpc/util/perf_regs.c b/tools/perf/arch/powerpc/util/perf_regs.c
deleted file mode 100644
index 93f929fc32e3..000000000000
--- a/tools/perf/arch/powerpc/util/perf_regs.c
+++ /dev/null
@@ -1,125 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <errno.h>
-#include <string.h>
-#include <regex.h>
-#include <linux/zalloc.h>
-
-#include "perf_regs.h"
-#include "../../../util/perf_regs.h"
-#include "../../../util/debug.h"
-#include "../../../util/event.h"
-#include "../../../util/header.h"
-#include "../../../perf-sys.h"
-#include "utils_header.h"
-
-#include <linux/kernel.h>
-
-#define PVR_POWER9		0x004E
-#define PVR_POWER10		0x0080
-#define PVR_POWER11		0x0082
-
-/* REG or %rREG */
-#define SDT_OP_REGEX1  "^(%r)?([1-2]?[0-9]|3[0-1])$"
-
-/* -NUM(REG) or NUM(REG) or -NUM(%rREG) or NUM(%rREG) */
-#define SDT_OP_REGEX2  "^(\\-)?([0-9]+)\\((%r)?([1-2]?[0-9]|3[0-1])\\)$"
-
-static regex_t sdt_op_regex1, sdt_op_regex2;
-
-static int sdt_init_op_regex(void)
-{
-	static int initialized;
-	int ret = 0;
-
-	if (initialized)
-		return 0;
-
-	ret = regcomp(&sdt_op_regex1, SDT_OP_REGEX1, REG_EXTENDED);
-	if (ret)
-		goto error;
-
-	ret = regcomp(&sdt_op_regex2, SDT_OP_REGEX2, REG_EXTENDED);
-	if (ret)
-		goto free_regex1;
-
-	initialized = 1;
-	return 0;
-
-free_regex1:
-	regfree(&sdt_op_regex1);
-error:
-	pr_debug4("Regex compilation error.\n");
-	return ret;
-}
-
-/*
- * Parse OP and convert it into uprobe format, which is, +/-NUM(%gprREG).
- * Possible variants of OP are:
- *	Format		Example
- *	-------------------------
- *	NUM(REG)	48(18)
- *	-NUM(REG)	-48(18)
- *	NUM(%rREG)	48(%r18)
- *	-NUM(%rREG)	-48(%r18)
- *	REG		18
- *	%rREG		%r18
- *	iNUM		i0
- *	i-NUM		i-1
- *
- * SDT marker arguments on Powerpc uses %rREG form with -mregnames flag
- * and REG form with -mno-regnames. Here REG is general purpose register,
- * which is in 0 to 31 range.
- */
-int arch_sdt_arg_parse_op(char *old_op, char **new_op)
-{
-	int ret, new_len;
-	regmatch_t rm[5];
-	char prefix;
-
-	/* Constant argument. Uprobe does not support it */
-	if (old_op[0] == 'i') {
-		pr_debug4("Skipping unsupported SDT argument: %s\n", old_op);
-		return SDT_ARG_SKIP;
-	}
-
-	ret = sdt_init_op_regex();
-	if (ret < 0)
-		return ret;
-
-	if (!regexec(&sdt_op_regex1, old_op, 3, rm, 0)) {
-		/* REG or %rREG --> %gprREG */
-
-		new_len = 5;	/* % g p r NULL */
-		new_len += (int)(rm[2].rm_eo - rm[2].rm_so);
-
-		*new_op = zalloc(new_len);
-		if (!*new_op)
-			return -ENOMEM;
-
-		scnprintf(*new_op, new_len, "%%gpr%.*s",
-			(int)(rm[2].rm_eo - rm[2].rm_so), old_op + rm[2].rm_so);
-	} else if (!regexec(&sdt_op_regex2, old_op, 5, rm, 0)) {
-		/*
-		 * -NUM(REG) or NUM(REG) or -NUM(%rREG) or NUM(%rREG) -->
-		 *	+/-NUM(%gprREG)
-		 */
-		prefix = (rm[1].rm_so == -1) ? '+' : '-';
-
-		new_len = 8;	/* +/- ( % g p r ) NULL */
-		new_len += (int)(rm[2].rm_eo - rm[2].rm_so);
-		new_len += (int)(rm[4].rm_eo - rm[4].rm_so);
-
-		*new_op = zalloc(new_len);
-		if (!*new_op)
-			return -ENOMEM;
-
-		scnprintf(*new_op, new_len, "%c%.*s(%%gpr%.*s)", prefix,
-			(int)(rm[2].rm_eo - rm[2].rm_so), old_op + rm[2].rm_so,
-			(int)(rm[4].rm_eo - rm[4].rm_so), old_op + rm[4].rm_so);
-	} else {
-		pr_debug4("Skipping unsupported SDT argument: %s\n", old_op);
-		return SDT_ARG_SKIP;
-	}
-
-	return SDT_ARG_VALID;
-}
diff --git a/tools/perf/arch/x86/util/Build b/tools/perf/arch/x86/util/Build
index 0c4cf1dd07bf..b94c91984c66 100644
--- a/tools/perf/arch/x86/util/Build
+++ b/tools/perf/arch/x86/util/Build
@@ -1,7 +1,6 @@
 perf-util-y += header.o
 perf-util-y += tsc.o
 perf-util-y += pmu.o
-perf-util-y += perf_regs.o
 perf-util-y += topdown.o
 perf-util-y += machine.o
 perf-util-y += event.o
diff --git a/tools/perf/arch/x86/util/perf_regs.c b/tools/perf/arch/x86/util/perf_regs.c
deleted file mode 100644
index 41141cebe226..000000000000
--- a/tools/perf/arch/x86/util/perf_regs.c
+++ /dev/null
@@ -1,235 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <errno.h>
-#include <string.h>
-#include <regex.h>
-#include <linux/kernel.h>
-#include <linux/zalloc.h>
-
-#include "perf_regs.h"
-#include "../../../perf-sys.h"
-#include "../../../util/perf_regs.h"
-#include "../../../util/debug.h"
-#include "../../../util/event.h"
-#include "../../../util/pmu.h"
-#include "../../../util/pmus.h"
-
-struct sdt_name_reg {
-	const char *sdt_name;
-	const char *uprobe_name;
-};
-#define SDT_NAME_REG(n, m) {.sdt_name = "%" #n, .uprobe_name = "%" #m}
-#define SDT_NAME_REG_END {.sdt_name = NULL, .uprobe_name = NULL}
-
-static const struct sdt_name_reg sdt_reg_tbl[] = {
-	SDT_NAME_REG(eax, ax),
-	SDT_NAME_REG(rax, ax),
-	SDT_NAME_REG(al,  ax),
-	SDT_NAME_REG(ah,  ax),
-	SDT_NAME_REG(ebx, bx),
-	SDT_NAME_REG(rbx, bx),
-	SDT_NAME_REG(bl,  bx),
-	SDT_NAME_REG(bh,  bx),
-	SDT_NAME_REG(ecx, cx),
-	SDT_NAME_REG(rcx, cx),
-	SDT_NAME_REG(cl,  cx),
-	SDT_NAME_REG(ch,  cx),
-	SDT_NAME_REG(edx, dx),
-	SDT_NAME_REG(rdx, dx),
-	SDT_NAME_REG(dl,  dx),
-	SDT_NAME_REG(dh,  dx),
-	SDT_NAME_REG(esi, si),
-	SDT_NAME_REG(rsi, si),
-	SDT_NAME_REG(sil, si),
-	SDT_NAME_REG(edi, di),
-	SDT_NAME_REG(rdi, di),
-	SDT_NAME_REG(dil, di),
-	SDT_NAME_REG(ebp, bp),
-	SDT_NAME_REG(rbp, bp),
-	SDT_NAME_REG(bpl, bp),
-	SDT_NAME_REG(rsp, sp),
-	SDT_NAME_REG(esp, sp),
-	SDT_NAME_REG(spl, sp),
-
-	/* rNN registers */
-	SDT_NAME_REG(r8b,  r8),
-	SDT_NAME_REG(r8w,  r8),
-	SDT_NAME_REG(r8d,  r8),
-	SDT_NAME_REG(r9b,  r9),
-	SDT_NAME_REG(r9w,  r9),
-	SDT_NAME_REG(r9d,  r9),
-	SDT_NAME_REG(r10b, r10),
-	SDT_NAME_REG(r10w, r10),
-	SDT_NAME_REG(r10d, r10),
-	SDT_NAME_REG(r11b, r11),
-	SDT_NAME_REG(r11w, r11),
-	SDT_NAME_REG(r11d, r11),
-	SDT_NAME_REG(r12b, r12),
-	SDT_NAME_REG(r12w, r12),
-	SDT_NAME_REG(r12d, r12),
-	SDT_NAME_REG(r13b, r13),
-	SDT_NAME_REG(r13w, r13),
-	SDT_NAME_REG(r13d, r13),
-	SDT_NAME_REG(r14b, r14),
-	SDT_NAME_REG(r14w, r14),
-	SDT_NAME_REG(r14d, r14),
-	SDT_NAME_REG(r15b, r15),
-	SDT_NAME_REG(r15w, r15),
-	SDT_NAME_REG(r15d, r15),
-	SDT_NAME_REG_END,
-};
-
-/*
- * Perf only supports OP which is in  +/-NUM(REG)  form.
- * Here plus-minus sign, NUM and parenthesis are optional,
- * only REG is mandatory.
- *
- * SDT events also supports indirect addressing mode with a
- * symbol as offset, scaled mode and constants in OP. But
- * perf does not support them yet. Below are few examples.
- *
- * OP with scaled mode:
- *     (%rax,%rsi,8)
- *     10(%ras,%rsi,8)
- *
- * OP with indirect addressing mode:
- *     check_action(%rip)
- *     mp_+52(%rip)
- *     44+mp_(%rip)
- *
- * OP with constant values:
- *     $0
- *     $123
- *     $-1
- */
-#define SDT_OP_REGEX  "^([+\\-]?)([0-9]*)(\\(?)(%[a-z][a-z0-9]+)(\\)?)$"
-
-static regex_t sdt_op_regex;
-
-static int sdt_init_op_regex(void)
-{
-	static int initialized;
-	int ret = 0;
-
-	if (initialized)
-		return 0;
-
-	ret = regcomp(&sdt_op_regex, SDT_OP_REGEX, REG_EXTENDED);
-	if (ret < 0) {
-		pr_debug4("Regex compilation error.\n");
-		return ret;
-	}
-
-	initialized = 1;
-	return 0;
-}
-
-/*
- * Max x86 register name length is 5(ex: %r15d). So, 6th char
- * should always contain NULL. This helps to find register name
- * length using strlen, instead of maintaining one more variable.
- */
-#define SDT_REG_NAME_SIZE  6
-
-/*
- * The uprobe parser does not support all gas register names;
- * so, we have to replace them (ex. for x86_64: %rax -> %ax).
- * Note: If register does not require renaming, just copy
- * paste as it is, but don't leave it empty.
- */
-static void sdt_rename_register(char *sdt_reg, int sdt_len, char *uprobe_reg)
-{
-	int i = 0;
-
-	for (i = 0; sdt_reg_tbl[i].sdt_name != NULL; i++) {
-		if (!strncmp(sdt_reg_tbl[i].sdt_name, sdt_reg, sdt_len)) {
-			strcpy(uprobe_reg, sdt_reg_tbl[i].uprobe_name);
-			return;
-		}
-	}
-
-	strncpy(uprobe_reg, sdt_reg, sdt_len);
-}
-
-int arch_sdt_arg_parse_op(char *old_op, char **new_op)
-{
-	char new_reg[SDT_REG_NAME_SIZE] = {0};
-	int new_len = 0, ret;
-	/*
-	 * rm[0]:  +/-NUM(REG)
-	 * rm[1]:  +/-
-	 * rm[2]:  NUM
-	 * rm[3]:  (
-	 * rm[4]:  REG
-	 * rm[5]:  )
-	 */
-	regmatch_t rm[6];
-	/*
-	 * Max prefix length is 2 as it may contains sign(+/-)
-	 * and displacement 0 (Both sign and displacement 0 are
-	 * optional so it may be empty). Use one more character
-	 * to hold last NULL so that strlen can be used to find
-	 * prefix length, instead of maintaining one more variable.
-	 */
-	char prefix[3] = {0};
-
-	ret = sdt_init_op_regex();
-	if (ret < 0)
-		return ret;
-
-	/*
-	 * If unsupported OR does not match with regex OR
-	 * register name too long, skip it.
-	 */
-	if (strchr(old_op, ',') || strchr(old_op, '$') ||
-	    regexec(&sdt_op_regex, old_op, 6, rm, 0)   ||
-	    rm[4].rm_eo - rm[4].rm_so > SDT_REG_NAME_SIZE) {
-		pr_debug4("Skipping unsupported SDT argument: %s\n", old_op);
-		return SDT_ARG_SKIP;
-	}
-
-	/*
-	 * Prepare prefix.
-	 * If SDT OP has parenthesis but does not provide
-	 * displacement, add 0 for displacement.
-	 *     SDT         Uprobe     Prefix
-	 *     -----------------------------
-	 *     +24(%rdi)   +24(%di)   +
-	 *     24(%rdi)    +24(%di)   +
-	 *     %rdi        %di
-	 *     (%rdi)      +0(%di)    +0
-	 *     -80(%rbx)   -80(%bx)   -
-	 */
-	if (rm[3].rm_so != rm[3].rm_eo) {
-		if (rm[1].rm_so != rm[1].rm_eo)
-			prefix[0] = *(old_op + rm[1].rm_so);
-		else if (rm[2].rm_so != rm[2].rm_eo)
-			prefix[0] = '+';
-		else
-			scnprintf(prefix, sizeof(prefix), "+0");
-	}
-
-	/* Rename register */
-	sdt_rename_register(old_op + rm[4].rm_so, rm[4].rm_eo - rm[4].rm_so,
-			    new_reg);
-
-	/* Prepare final OP which should be valid for uprobe_events */
-	new_len = strlen(prefix)              +
-		  (rm[2].rm_eo - rm[2].rm_so) +
-		  (rm[3].rm_eo - rm[3].rm_so) +
-		  strlen(new_reg)             +
-		  (rm[5].rm_eo - rm[5].rm_so) +
-		  1;					/* NULL */
-
-	*new_op = zalloc(new_len);
-	if (!*new_op)
-		return -ENOMEM;
-
-	scnprintf(*new_op, new_len, "%.*s%.*s%.*s%.*s%.*s",
-		  strlen(prefix), prefix,
-		  (int)(rm[2].rm_eo - rm[2].rm_so), old_op + rm[2].rm_so,
-		  (int)(rm[3].rm_eo - rm[3].rm_so), old_op + rm[3].rm_so,
-		  strlen(new_reg), new_reg,
-		  (int)(rm[5].rm_eo - rm[5].rm_so), old_op + rm[5].rm_so);
-
-	return SDT_ARG_VALID;
-}
diff --git a/tools/perf/util/perf-regs-arch/perf_regs_aarch64.c b/tools/perf/util/perf-regs-arch/perf_regs_aarch64.c
index 666874f625b6..6833d34dcbfd 100644
--- a/tools/perf/util/perf-regs-arch/perf_regs_aarch64.c
+++ b/tools/perf/util/perf-regs-arch/perf_regs_aarch64.c
@@ -18,6 +18,92 @@
 #define HWCAP_SVE	(1 << 22)
 #endif
 
+/* %xNUM */
+#define SDT_OP_REGEX1  "^(x[1-2]?[0-9]|3[0-1])$"
+
+/* [sp], [sp, NUM] */
+#define SDT_OP_REGEX2  "^\\[sp(, )?([0-9]+)?\\]$"
+
+static regex_t sdt_op_regex1, sdt_op_regex2;
+
+static int sdt_init_op_regex(void)
+{
+	static int initialized;
+	int ret = 0;
+
+	if (initialized)
+		return 0;
+
+	ret = regcomp(&sdt_op_regex1, SDT_OP_REGEX1, REG_EXTENDED);
+	if (ret)
+		goto error;
+
+	ret = regcomp(&sdt_op_regex2, SDT_OP_REGEX2, REG_EXTENDED);
+	if (ret)
+		goto free_regex1;
+
+	initialized = 1;
+	return 0;
+
+free_regex1:
+	regfree(&sdt_op_regex1);
+error:
+	pr_debug4("Regex compilation error.\n");
+	return ret;
+}
+
+/*
+ * SDT marker arguments on Arm64 uses %xREG or [sp, NUM], currently
+ * support these two formats.
+ */
+int __perf_sdt_arg_parse_op_arm64(char *old_op, char **new_op)
+{
+	int ret, new_len;
+	regmatch_t rm[5];
+
+	ret = sdt_init_op_regex();
+	if (ret < 0)
+		return ret;
+
+	if (!regexec(&sdt_op_regex1, old_op, 3, rm, 0)) {
+		/* Extract xNUM */
+		new_len = 2;	/* % NULL */
+		new_len += (int)(rm[1].rm_eo - rm[1].rm_so);
+
+		*new_op = zalloc(new_len);
+		if (!*new_op)
+			return -ENOMEM;
+
+		scnprintf(*new_op, new_len, "%%%.*s",
+			(int)(rm[1].rm_eo - rm[1].rm_so), old_op + rm[1].rm_so);
+	} else if (!regexec(&sdt_op_regex2, old_op, 5, rm, 0)) {
+		/* [sp], [sp, NUM] or [sp,NUM] */
+		new_len = 7;	/* + ( % s p ) NULL */
+
+		/* If the argument is [sp], need to fill offset '0' */
+		if (rm[2].rm_so == -1)
+			new_len += 1;
+		else
+			new_len += (int)(rm[2].rm_eo - rm[2].rm_so);
+
+		*new_op = zalloc(new_len);
+		if (!*new_op)
+			return -ENOMEM;
+
+		if (rm[2].rm_so == -1)
+			scnprintf(*new_op, new_len, "+0(%%sp)");
+		else
+			scnprintf(*new_op, new_len, "+%.*s(%%sp)",
+				  (int)(rm[2].rm_eo - rm[2].rm_so),
+				  old_op + rm[2].rm_so);
+	} else {
+		pr_debug4("Skipping unsupported SDT argument: %s\n", old_op);
+		return SDT_ARG_SKIP;
+	}
+
+	return SDT_ARG_VALID;
+}
+
 uint64_t __perf_reg_mask_arm64(bool intr)
 {
 	struct perf_event_attr attr = {
diff --git a/tools/perf/util/perf-regs-arch/perf_regs_powerpc.c b/tools/perf/util/perf-regs-arch/perf_regs_powerpc.c
index f0a547ad809b..217a001ccd2e 100644
--- a/tools/perf/util/perf-regs-arch/perf_regs_powerpc.c
+++ b/tools/perf/util/perf-regs-arch/perf_regs_powerpc.c
@@ -19,6 +19,112 @@
 #define PVR_POWER10		0x0080
 #define PVR_POWER11		0x0082
 
+/* REG or %rREG */
+#define SDT_OP_REGEX1  "^(%r)?([1-2]?[0-9]|3[0-1])$"
+
+/* -NUM(REG) or NUM(REG) or -NUM(%rREG) or NUM(%rREG) */
+#define SDT_OP_REGEX2  "^(\\-)?([0-9]+)\\((%r)?([1-2]?[0-9]|3[0-1])\\)$"
+
+static regex_t sdt_op_regex1, sdt_op_regex2;
+
+static int sdt_init_op_regex(void)
+{
+	static int initialized;
+	int ret = 0;
+
+	if (initialized)
+		return 0;
+
+	ret = regcomp(&sdt_op_regex1, SDT_OP_REGEX1, REG_EXTENDED);
+	if (ret)
+		goto error;
+
+	ret = regcomp(&sdt_op_regex2, SDT_OP_REGEX2, REG_EXTENDED);
+	if (ret)
+		goto free_regex1;
+
+	initialized = 1;
+	return 0;
+
+free_regex1:
+	regfree(&sdt_op_regex1);
+error:
+	pr_debug4("Regex compilation error.\n");
+	return ret;
+}
+
+/*
+ * Parse OP and convert it into uprobe format, which is, +/-NUM(%gprREG).
+ * Possible variants of OP are:
+ *	Format		Example
+ *	-------------------------
+ *	NUM(REG)	48(18)
+ *	-NUM(REG)	-48(18)
+ *	NUM(%rREG)	48(%r18)
+ *	-NUM(%rREG)	-48(%r18)
+ *	REG		18
+ *	%rREG		%r18
+ *	iNUM		i0
+ *	i-NUM		i-1
+ *
+ * SDT marker arguments on Powerpc uses %rREG form with -mregnames flag
+ * and REG form with -mno-regnames. Here REG is general purpose register,
+ * which is in 0 to 31 range.
+ */
+int __perf_sdt_arg_parse_op_powerpc(char *old_op, char **new_op)
+{
+	int ret, new_len;
+	regmatch_t rm[5];
+	char prefix;
+
+	/* Constant argument. Uprobe does not support it */
+	if (old_op[0] == 'i') {
+		pr_debug4("Skipping unsupported SDT argument: %s\n", old_op);
+		return SDT_ARG_SKIP;
+	}
+
+	ret = sdt_init_op_regex();
+	if (ret < 0)
+		return ret;
+
+	if (!regexec(&sdt_op_regex1, old_op, 3, rm, 0)) {
+		/* REG or %rREG --> %gprREG */
+
+		new_len = 5;	/* % g p r NULL */
+		new_len += (int)(rm[2].rm_eo - rm[2].rm_so);
+
+		*new_op = zalloc(new_len);
+		if (!*new_op)
+			return -ENOMEM;
+
+		scnprintf(*new_op, new_len, "%%gpr%.*s",
+			(int)(rm[2].rm_eo - rm[2].rm_so), old_op + rm[2].rm_so);
+	} else if (!regexec(&sdt_op_regex2, old_op, 5, rm, 0)) {
+		/*
+		 * -NUM(REG) or NUM(REG) or -NUM(%rREG) or NUM(%rREG) -->
+		 *	+/-NUM(%gprREG)
+		 */
+		prefix = (rm[1].rm_so == -1) ? '+' : '-';
+
+		new_len = 8;	/* +/- ( % g p r ) NULL */
+		new_len += (int)(rm[2].rm_eo - rm[2].rm_so);
+		new_len += (int)(rm[4].rm_eo - rm[4].rm_so);
+
+		*new_op = zalloc(new_len);
+		if (!*new_op)
+			return -ENOMEM;
+
+		scnprintf(*new_op, new_len, "%c%.*s(%%gpr%.*s)", prefix,
+			(int)(rm[2].rm_eo - rm[2].rm_so), old_op + rm[2].rm_so,
+			(int)(rm[4].rm_eo - rm[4].rm_so), old_op + rm[4].rm_so);
+	} else {
+		pr_debug4("Skipping unsupported SDT argument: %s\n", old_op);
+		return SDT_ARG_SKIP;
+	}
+
+	return SDT_ARG_VALID;
+}
+
 /*
  * mfspr is a POWERPC specific instruction, ensure it's only
  * built and called on POWERPC by guarding with __powerpc64__
diff --git a/tools/perf/util/perf-regs-arch/perf_regs_x86.c b/tools/perf/util/perf-regs-arch/perf_regs_x86.c
index d573f9a9ca46..b6d20522b4e8 100644
--- a/tools/perf/util/perf-regs-arch/perf_regs_x86.c
+++ b/tools/perf/util/perf-regs-arch/perf_regs_x86.c
@@ -14,6 +14,227 @@
 #include "../../perf-sys.h"
 #include "../../arch/x86/include/perf_regs.h"
 
+struct sdt_name_reg {
+	const char *sdt_name;
+	const char *uprobe_name;
+};
+#define SDT_NAME_REG(n, m) {.sdt_name = "%" #n, .uprobe_name = "%" #m}
+#define SDT_NAME_REG_END {.sdt_name = NULL, .uprobe_name = NULL}
+
+static const struct sdt_name_reg sdt_reg_tbl[] = {
+	SDT_NAME_REG(eax, ax),
+	SDT_NAME_REG(rax, ax),
+	SDT_NAME_REG(al,  ax),
+	SDT_NAME_REG(ah,  ax),
+	SDT_NAME_REG(ebx, bx),
+	SDT_NAME_REG(rbx, bx),
+	SDT_NAME_REG(bl,  bx),
+	SDT_NAME_REG(bh,  bx),
+	SDT_NAME_REG(ecx, cx),
+	SDT_NAME_REG(rcx, cx),
+	SDT_NAME_REG(cl,  cx),
+	SDT_NAME_REG(ch,  cx),
+	SDT_NAME_REG(edx, dx),
+	SDT_NAME_REG(rdx, dx),
+	SDT_NAME_REG(dl,  dx),
+	SDT_NAME_REG(dh,  dx),
+	SDT_NAME_REG(esi, si),
+	SDT_NAME_REG(rsi, si),
+	SDT_NAME_REG(sil, si),
+	SDT_NAME_REG(edi, di),
+	SDT_NAME_REG(rdi, di),
+	SDT_NAME_REG(dil, di),
+	SDT_NAME_REG(ebp, bp),
+	SDT_NAME_REG(rbp, bp),
+	SDT_NAME_REG(bpl, bp),
+	SDT_NAME_REG(rsp, sp),
+	SDT_NAME_REG(esp, sp),
+	SDT_NAME_REG(spl, sp),
+
+	/* rNN registers */
+	SDT_NAME_REG(r8b,  r8),
+	SDT_NAME_REG(r8w,  r8),
+	SDT_NAME_REG(r8d,  r8),
+	SDT_NAME_REG(r9b,  r9),
+	SDT_NAME_REG(r9w,  r9),
+	SDT_NAME_REG(r9d,  r9),
+	SDT_NAME_REG(r10b, r10),
+	SDT_NAME_REG(r10w, r10),
+	SDT_NAME_REG(r10d, r10),
+	SDT_NAME_REG(r11b, r11),
+	SDT_NAME_REG(r11w, r11),
+	SDT_NAME_REG(r11d, r11),
+	SDT_NAME_REG(r12b, r12),
+	SDT_NAME_REG(r12w, r12),
+	SDT_NAME_REG(r12d, r12),
+	SDT_NAME_REG(r13b, r13),
+	SDT_NAME_REG(r13w, r13),
+	SDT_NAME_REG(r13d, r13),
+	SDT_NAME_REG(r14b, r14),
+	SDT_NAME_REG(r14w, r14),
+	SDT_NAME_REG(r14d, r14),
+	SDT_NAME_REG(r15b, r15),
+	SDT_NAME_REG(r15w, r15),
+	SDT_NAME_REG(r15d, r15),
+	SDT_NAME_REG_END,
+};
+
+/*
+ * Perf only supports OP which is in  +/-NUM(REG)  form.
+ * Here plus-minus sign, NUM and parenthesis are optional,
+ * only REG is mandatory.
+ *
+ * SDT events also supports indirect addressing mode with a
+ * symbol as offset, scaled mode and constants in OP. But
+ * perf does not support them yet. Below are few examples.
+ *
+ * OP with scaled mode:
+ *     (%rax,%rsi,8)
+ *     10(%ras,%rsi,8)
+ *
+ * OP with indirect addressing mode:
+ *     check_action(%rip)
+ *     mp_+52(%rip)
+ *     44+mp_(%rip)
+ *
+ * OP with constant values:
+ *     $0
+ *     $123
+ *     $-1
+ */
+#define SDT_OP_REGEX  "^([+\\-]?)([0-9]*)(\\(?)(%[a-z][a-z0-9]+)(\\)?)$"
+
+static regex_t sdt_op_regex;
+
+static int sdt_init_op_regex(void)
+{
+	static int initialized;
+	int ret = 0;
+
+	if (initialized)
+		return 0;
+
+	ret = regcomp(&sdt_op_regex, SDT_OP_REGEX, REG_EXTENDED);
+	if (ret < 0) {
+		pr_debug4("Regex compilation error.\n");
+		return ret;
+	}
+
+	initialized = 1;
+	return 0;
+}
+
+/*
+ * Max x86 register name length is 5(ex: %r15d). So, 6th char
+ * should always contain NULL. This helps to find register name
+ * length using strlen, instead of maintaining one more variable.
+ */
+#define SDT_REG_NAME_SIZE  6
+
+/*
+ * The uprobe parser does not support all gas register names;
+ * so, we have to replace them (ex. for x86_64: %rax -> %ax).
+ * Note: If register does not require renaming, just copy
+ * paste as it is, but don't leave it empty.
+ */
+static void sdt_rename_register(char *sdt_reg, int sdt_len, char *uprobe_reg)
+{
+	int i = 0;
+
+	for (i = 0; sdt_reg_tbl[i].sdt_name != NULL; i++) {
+		if (!strncmp(sdt_reg_tbl[i].sdt_name, sdt_reg, sdt_len)) {
+			strcpy(uprobe_reg, sdt_reg_tbl[i].uprobe_name);
+			return;
+		}
+	}
+
+	strncpy(uprobe_reg, sdt_reg, sdt_len);
+}
+
+int __perf_sdt_arg_parse_op_x86(char *old_op, char **new_op)
+{
+	char new_reg[SDT_REG_NAME_SIZE] = {0};
+	int new_len = 0, ret;
+	/*
+	 * rm[0]:  +/-NUM(REG)
+	 * rm[1]:  +/-
+	 * rm[2]:  NUM
+	 * rm[3]:  (
+	 * rm[4]:  REG
+	 * rm[5]:  )
+	 */
+	regmatch_t rm[6];
+	/*
+	 * Max prefix length is 2 as it may contains sign(+/-)
+	 * and displacement 0 (Both sign and displacement 0 are
+	 * optional so it may be empty). Use one more character
+	 * to hold last NULL so that strlen can be used to find
+	 * prefix length, instead of maintaining one more variable.
+	 */
+	char prefix[3] = {0};
+
+	ret = sdt_init_op_regex();
+	if (ret < 0)
+		return ret;
+
+	/*
+	 * If unsupported OR does not match with regex OR
+	 * register name too long, skip it.
+	 */
+	if (strchr(old_op, ',') || strchr(old_op, '$') ||
+	    regexec(&sdt_op_regex, old_op, 6, rm, 0)   ||
+	    rm[4].rm_eo - rm[4].rm_so > SDT_REG_NAME_SIZE) {
+		pr_debug4("Skipping unsupported SDT argument: %s\n", old_op);
+		return SDT_ARG_SKIP;
+	}
+
+	/*
+	 * Prepare prefix.
+	 * If SDT OP has parenthesis but does not provide
+	 * displacement, add 0 for displacement.
+	 *     SDT         Uprobe     Prefix
+	 *     -----------------------------
+	 *     +24(%rdi)   +24(%di)   +
+	 *     24(%rdi)    +24(%di)   +
+	 *     %rdi        %di
+	 *     (%rdi)      +0(%di)    +0
+	 *     -80(%rbx)   -80(%bx)   -
+	 */
+	if (rm[3].rm_so != rm[3].rm_eo) {
+		if (rm[1].rm_so != rm[1].rm_eo)
+			prefix[0] = *(old_op + rm[1].rm_so);
+		else if (rm[2].rm_so != rm[2].rm_eo)
+			prefix[0] = '+';
+		else
+			scnprintf(prefix, sizeof(prefix), "+0");
+	}
+
+	/* Rename register */
+	sdt_rename_register(old_op + rm[4].rm_so, rm[4].rm_eo - rm[4].rm_so,
+			    new_reg);
+
+	/* Prepare final OP which should be valid for uprobe_events */
+	new_len = strlen(prefix)              +
+		  (rm[2].rm_eo - rm[2].rm_so) +
+		  (rm[3].rm_eo - rm[3].rm_so) +
+		  strlen(new_reg)             +
+		  (rm[5].rm_eo - rm[5].rm_so) +
+		  1;					/* NULL */
+
+	*new_op = zalloc(new_len);
+	if (!*new_op)
+		return -ENOMEM;
+
+	scnprintf(*new_op, new_len, "%.*s%.*s%.*s%.*s%.*s",
+		  strlen(prefix), prefix,
+		  (int)(rm[2].rm_eo - rm[2].rm_so), old_op + rm[2].rm_so,
+		  (int)(rm[3].rm_eo - rm[3].rm_so), old_op + rm[3].rm_so,
+		  strlen(new_reg), new_reg,
+		  (int)(rm[5].rm_eo - rm[5].rm_so), old_op + rm[5].rm_so);
+
+	return SDT_ARG_VALID;
+}
+
 uint64_t __perf_reg_mask_x86(bool intr)
 {
 	struct perf_event_attr attr = {
diff --git a/tools/perf/util/perf_regs.c b/tools/perf/util/perf_regs.c
index 4d9a286a0e56..5b8f34beb24e 100644
--- a/tools/perf/util/perf_regs.c
+++ b/tools/perf/util/perf_regs.c
@@ -7,10 +7,29 @@
 #include "util/sample.h"
 #include "debug.h"
 
-int __weak arch_sdt_arg_parse_op(char *old_op __maybe_unused,
-				 char **new_op __maybe_unused)
+int perf_sdt_arg_parse_op(uint16_t e_machine, char *old_op, char **new_op)
 {
-	return SDT_ARG_SKIP;
+	int ret = SDT_ARG_SKIP;
+
+	switch (e_machine) {
+	case EM_AARCH64:
+		ret = __perf_sdt_arg_parse_op_arm64(old_op, new_op);
+		break;
+	case EM_PPC:
+	case EM_PPC64:
+		ret = __perf_sdt_arg_parse_op_powerpc(old_op, new_op);
+		break;
+	case EM_386:
+	case EM_X86_64:
+		ret = __perf_sdt_arg_parse_op_x86(old_op, new_op);
+		break;
+	default:
+		pr_debug("Unknown ELF machine %d, standard arguments parse will be skipped.\n",
+			 e_machine);
+		break;
+	}
+
+	return ret;
 }
 
 uint64_t perf_intr_reg_mask(uint16_t e_machine)
diff --git a/tools/perf/util/perf_regs.h b/tools/perf/util/perf_regs.h
index 2b27139acadb..7c04700bf837 100644
--- a/tools/perf/util/perf_regs.h
+++ b/tools/perf/util/perf_regs.h
@@ -12,7 +12,7 @@ enum {
 	SDT_ARG_SKIP,
 };
 
-int arch_sdt_arg_parse_op(char *old_op, char **new_op);
+int perf_sdt_arg_parse_op(uint16_t e_machine, char *old_op, char **new_op);
 uint64_t perf_intr_reg_mask(uint16_t e_machine);
 uint64_t perf_user_reg_mask(uint16_t e_machine);
 
@@ -21,6 +21,7 @@ int perf_reg_value(u64 *valp, struct regs_dump *regs, int id);
 uint64_t perf_arch_reg_ip(uint16_t e_machine);
 uint64_t perf_arch_reg_sp(uint16_t e_machine);
 
+int __perf_sdt_arg_parse_op_arm64(char *old_op, char **new_op);
 uint64_t __perf_reg_mask_arm64(bool intr);
 const char *__perf_reg_name_arm64(int id);
 uint64_t __perf_reg_ip_arm64(void);
@@ -46,6 +47,7 @@ const char *__perf_reg_name_mips(int id);
 uint64_t __perf_reg_ip_mips(void);
 uint64_t __perf_reg_sp_mips(void);
 
+int __perf_sdt_arg_parse_op_powerpc(char *old_op, char **new_op);
 uint64_t __perf_reg_mask_powerpc(bool intr);
 const char *__perf_reg_name_powerpc(int id);
 uint64_t __perf_reg_ip_powerpc(void);
@@ -61,6 +63,7 @@ const char *__perf_reg_name_s390(int id);
 uint64_t __perf_reg_ip_s390(void);
 uint64_t __perf_reg_sp_s390(void);
 
+int __perf_sdt_arg_parse_op_x86(char *old_op, char **new_op);
 uint64_t __perf_reg_mask_x86(bool intr);
 const char *__perf_reg_name_x86(int id);
 uint64_t __perf_reg_ip_x86(void);
diff --git a/tools/perf/util/probe-file.c b/tools/perf/util/probe-file.c
index 5069fb61f48c..f78c3bc3d601 100644
--- a/tools/perf/util/probe-file.c
+++ b/tools/perf/util/probe-file.c
@@ -28,6 +28,7 @@
 #include "session.h"
 #include "perf_regs.h"
 #include "string2.h"
+#include "dwarf-regs.h"
 
 /* 4096 - 2 ('\n' + '\0') */
 #define MAX_CMDLEN 4094
@@ -784,7 +785,7 @@ static int synthesize_sdt_probe_arg(struct strbuf *buf, int i, const char *arg)
 		op = desc;
 	}
 
-	ret = arch_sdt_arg_parse_op(op, &new_op);
+	ret = perf_sdt_arg_parse_op(EM_HOST, op, &new_op);
 
 	if (ret < 0)
 		goto error;
-- 
cgit v1.2.3


From 42fc7e6543f6d17d2cf9ed3e5021f103a3d11182 Mon Sep 17 00:00:00 2001
From: Günther Noack <gnoack@google.com>
Date: Thu, 27 Nov 2025 12:51:34 +0100
Subject: landlock: Multithreading support for landlock_restrict_self()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Introduce the LANDLOCK_RESTRICT_SELF_TSYNC flag.  With this flag, a
given Landlock ruleset is applied to all threads of the calling
process, instead of only the current one.

Without this flag, multithreaded userspace programs currently resort
to using the nptl(7)/libpsx hack for multithreaded policy enforcement,
which is also used by libcap and for setuid(2).  Using this
userspace-based scheme, the threads of a process enforce the same
Landlock policy, but the resulting Landlock domains are still
separate.  The domains being separate causes multiple problems:

* When using Landlock's "scoped" access rights, the domain identity is
  used to determine whether an operation is permitted.  As a result,
  when using LANLDOCK_SCOPE_SIGNAL, signaling between sibling threads
  stops working.  This is a problem for programming languages and
  frameworks which are inherently multithreaded (e.g. Go).

* In audit logging, the domains of separate threads in a process will
  get logged with different domain IDs, even when they are based on
  the same ruleset FD, which might confuse users.

Cc: Andrew G. Morgan <morgan@kernel.org>
Cc: John Johansen <john.johansen@canonical.com>
Cc: Paul Moore <paul@paul-moore.com>
Suggested-by: Jann Horn <jannh@google.com>
Signed-off-by: Günther Noack <gnoack@google.com>
Link: https://lore.kernel.org/r/20251127115136.3064948-2-gnoack@google.com
[mic: Fix restrict_self_flags test, clean up Makefile, allign comments,
reduce local variable scope, add missing includes]
Closes: https://github.com/landlock-lsm/linux/issues/2
Signed-off-by: Mickaël Salaün <mic@digikod.net>
---
 include/uapi/linux/landlock.h                |  13 +
 security/landlock/Makefile                   |  11 +-
 security/landlock/cred.h                     |  12 +
 security/landlock/limits.h                   |   2 +-
 security/landlock/syscalls.c                 |  61 +--
 security/landlock/tsync.c                    | 561 +++++++++++++++++++++++++++
 security/landlock/tsync.h                    |  16 +
 tools/testing/selftests/landlock/base_test.c |   4 +-
 8 files changed, 650 insertions(+), 30 deletions(-)
 create mode 100644 security/landlock/tsync.c
 create mode 100644 security/landlock/tsync.h

(limited to 'tools')

diff --git a/include/uapi/linux/landlock.h b/include/uapi/linux/landlock.h
index 75fd7f5e6cc3..d5081ab4e5ef 100644
--- a/include/uapi/linux/landlock.h
+++ b/include/uapi/linux/landlock.h
@@ -117,11 +117,24 @@ struct landlock_ruleset_attr {
  *     future nested domains, not the one being created. It can also be used
  *     with a @ruleset_fd value of -1 to mute subdomain logs without creating a
  *     domain.
+ *
+ * The following flag supports policy enforcement in multithreaded processes:
+ *
+ * %LANDLOCK_RESTRICT_SELF_TSYNC
+ *     Applies the new Landlock configuration atomically to all threads of the
+ *     current process, including the Landlock domain and logging
+ *     configuration. This overrides the Landlock configuration of sibling
+ *     threads, irrespective of previously established Landlock domains and
+ *     logging configurations on these threads.
+ *
+ *     If the calling thread is running with no_new_privs, this operation
+ *     enables no_new_privs on the sibling threads as well.
  */
 /* clang-format off */
 #define LANDLOCK_RESTRICT_SELF_LOG_SAME_EXEC_OFF		(1U << 0)
 #define LANDLOCK_RESTRICT_SELF_LOG_NEW_EXEC_ON			(1U << 1)
 #define LANDLOCK_RESTRICT_SELF_LOG_SUBDOMAINS_OFF		(1U << 2)
+#define LANDLOCK_RESTRICT_SELF_TSYNC				(1U << 3)
 /* clang-format on */
 
 /**
diff --git a/security/landlock/Makefile b/security/landlock/Makefile
index 3160c2bdac1d..ffa7646d99f3 100644
--- a/security/landlock/Makefile
+++ b/security/landlock/Makefile
@@ -1,7 +1,14 @@
 obj-$(CONFIG_SECURITY_LANDLOCK) := landlock.o
 
-landlock-y := setup.o syscalls.o object.o ruleset.o \
-	cred.o task.o fs.o
+landlock-y := \
+	setup.o \
+	syscalls.o \
+	object.o \
+	ruleset.o \
+	cred.o \
+	task.o \
+	fs.o \
+	tsync.o
 
 landlock-$(CONFIG_INET) += net.o
 
diff --git a/security/landlock/cred.h b/security/landlock/cred.h
index c82fe63ec598..c10a06727eb1 100644
--- a/security/landlock/cred.h
+++ b/security/landlock/cred.h
@@ -26,6 +26,8 @@
  * This structure is packed to minimize the size of struct
  * landlock_file_security.  However, it is always aligned in the LSM cred blob,
  * see lsm_set_blob_size().
+ *
+ * When updating this, also update landlock_cred_copy() if needed.
  */
 struct landlock_cred_security {
 	/**
@@ -65,6 +67,16 @@ landlock_cred(const struct cred *cred)
 	return cred->security + landlock_blob_sizes.lbs_cred;
 }
 
+static inline void landlock_cred_copy(struct landlock_cred_security *dst,
+				      const struct landlock_cred_security *src)
+{
+	landlock_put_ruleset(dst->domain);
+
+	*dst = *src;
+
+	landlock_get_ruleset(src->domain);
+}
+
 static inline struct landlock_ruleset *landlock_get_current_domain(void)
 {
 	return landlock_cred(current_cred())->domain;
diff --git a/security/landlock/limits.h b/security/landlock/limits.h
index 65b5ff051674..eb584f47288d 100644
--- a/security/landlock/limits.h
+++ b/security/landlock/limits.h
@@ -31,7 +31,7 @@
 #define LANDLOCK_MASK_SCOPE		((LANDLOCK_LAST_SCOPE << 1) - 1)
 #define LANDLOCK_NUM_SCOPE		__const_hweight64(LANDLOCK_MASK_SCOPE)
 
-#define LANDLOCK_LAST_RESTRICT_SELF	LANDLOCK_RESTRICT_SELF_LOG_SUBDOMAINS_OFF
+#define LANDLOCK_LAST_RESTRICT_SELF	LANDLOCK_RESTRICT_SELF_TSYNC
 #define LANDLOCK_MASK_RESTRICT_SELF	((LANDLOCK_LAST_RESTRICT_SELF << 1) - 1)
 
 /* clang-format on */
diff --git a/security/landlock/syscalls.c b/security/landlock/syscalls.c
index 0116e9f93ffe..3e4e99deb7f9 100644
--- a/security/landlock/syscalls.c
+++ b/security/landlock/syscalls.c
@@ -36,6 +36,7 @@
 #include "net.h"
 #include "ruleset.h"
 #include "setup.h"
+#include "tsync.h"
 
 static bool is_initialized(void)
 {
@@ -161,7 +162,7 @@ static const struct file_operations ruleset_fops = {
  * Documentation/userspace-api/landlock.rst should be updated to reflect the
  * UAPI change.
  */
-const int landlock_abi_version = 7;
+const int landlock_abi_version = 8;
 
 /**
  * sys_landlock_create_ruleset - Create a new ruleset
@@ -454,9 +455,10 @@ SYSCALL_DEFINE4(landlock_add_rule, const int, ruleset_fd,
  *         - %LANDLOCK_RESTRICT_SELF_LOG_SAME_EXEC_OFF
  *         - %LANDLOCK_RESTRICT_SELF_LOG_NEW_EXEC_ON
  *         - %LANDLOCK_RESTRICT_SELF_LOG_SUBDOMAINS_OFF
+ *         - %LANDLOCK_RESTRICT_SELF_TSYNC
  *
- * This system call enables to enforce a Landlock ruleset on the current
- * thread.  Enforcing a ruleset requires that the task has %CAP_SYS_ADMIN in its
+ * This system call enforces a Landlock ruleset on the current thread.
+ * Enforcing a ruleset requires that the task has %CAP_SYS_ADMIN in its
  * namespace or is running with no_new_privs.  This avoids scenarios where
  * unprivileged tasks can affect the behavior of privileged children.
  *
@@ -478,8 +480,7 @@ SYSCALL_DEFINE4(landlock_add_rule, const int, ruleset_fd,
 SYSCALL_DEFINE2(landlock_restrict_self, const int, ruleset_fd, const __u32,
 		flags)
 {
-	struct landlock_ruleset *new_dom,
-		*ruleset __free(landlock_put_ruleset) = NULL;
+	struct landlock_ruleset *ruleset __free(landlock_put_ruleset) = NULL;
 	struct cred *new_cred;
 	struct landlock_cred_security *new_llcred;
 	bool __maybe_unused log_same_exec, log_new_exec, log_subdomains,
@@ -538,33 +539,43 @@ SYSCALL_DEFINE2(landlock_restrict_self, const int, ruleset_fd, const __u32,
 	 * We could optimize this case by not calling commit_creds() if this flag
 	 * was already set, but it is not worth the complexity.
 	 */
-	if (!ruleset)
-		return commit_creds(new_cred);
-
-	/*
-	 * There is no possible race condition while copying and manipulating
-	 * the current credentials because they are dedicated per thread.
-	 */
-	new_dom = landlock_merge_ruleset(new_llcred->domain, ruleset);
-	if (IS_ERR(new_dom)) {
-		abort_creds(new_cred);
-		return PTR_ERR(new_dom);
-	}
+	if (ruleset) {
+		/*
+		 * There is no possible race condition while copying and
+		 * manipulating the current credentials because they are
+		 * dedicated per thread.
+		 */
+		struct landlock_ruleset *const new_dom =
+			landlock_merge_ruleset(new_llcred->domain, ruleset);
+		if (IS_ERR(new_dom)) {
+			abort_creds(new_cred);
+			return PTR_ERR(new_dom);
+		}
 
 #ifdef CONFIG_AUDIT
-	new_dom->hierarchy->log_same_exec = log_same_exec;
-	new_dom->hierarchy->log_new_exec = log_new_exec;
-	if ((!log_same_exec && !log_new_exec) || !prev_log_subdomains)
-		new_dom->hierarchy->log_status = LANDLOCK_LOG_DISABLED;
+		new_dom->hierarchy->log_same_exec = log_same_exec;
+		new_dom->hierarchy->log_new_exec = log_new_exec;
+		if ((!log_same_exec && !log_new_exec) || !prev_log_subdomains)
+			new_dom->hierarchy->log_status = LANDLOCK_LOG_DISABLED;
 #endif /* CONFIG_AUDIT */
 
-	/* Replaces the old (prepared) domain. */
-	landlock_put_ruleset(new_llcred->domain);
-	new_llcred->domain = new_dom;
+		/* Replaces the old (prepared) domain. */
+		landlock_put_ruleset(new_llcred->domain);
+		new_llcred->domain = new_dom;
 
 #ifdef CONFIG_AUDIT
-	new_llcred->domain_exec |= BIT(new_dom->num_layers - 1);
+		new_llcred->domain_exec |= BIT(new_dom->num_layers - 1);
 #endif /* CONFIG_AUDIT */
+	}
+
+	if (flags & LANDLOCK_RESTRICT_SELF_TSYNC) {
+		const int err = landlock_restrict_sibling_threads(
+			current_cred(), new_cred);
+		if (err) {
+			abort_creds(new_cred);
+			return err;
+		}
+	}
 
 	return commit_creds(new_cred);
 }
diff --git a/security/landlock/tsync.c b/security/landlock/tsync.c
new file mode 100644
index 000000000000..0d2b9c646030
--- /dev/null
+++ b/security/landlock/tsync.c
@@ -0,0 +1,561 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Landlock - Cross-thread ruleset enforcement
+ *
+ * Copyright © 2025 Google LLC
+ */
+
+#include <linux/atomic.h>
+#include <linux/cleanup.h>
+#include <linux/completion.h>
+#include <linux/cred.h>
+#include <linux/errno.h>
+#include <linux/overflow.h>
+#include <linux/rcupdate.h>
+#include <linux/sched.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/task.h>
+#include <linux/slab.h>
+#include <linux/task_work.h>
+
+#include "cred.h"
+#include "tsync.h"
+
+/*
+ * Shared state between multiple threads which are enforcing Landlock rulesets
+ * in lockstep with each other.
+ */
+struct tsync_shared_context {
+	/* The old and tentative new creds of the calling thread. */
+	const struct cred *old_cred;
+	const struct cred *new_cred;
+
+	/* True if sibling tasks need to set the no_new_privs flag. */
+	bool set_no_new_privs;
+
+	/* An error encountered in preparation step, or 0. */
+	atomic_t preparation_error;
+
+	/*
+	 * Barrier after preparation step in restrict_one_thread.
+	 * The calling thread waits for completion.
+	 *
+	 * Re-initialized on every round of looking for newly spawned threads.
+	 */
+	atomic_t num_preparing;
+	struct completion all_prepared;
+
+	/* Sibling threads wait for completion. */
+	struct completion ready_to_commit;
+
+	/*
+	 * Barrier after commit step (used by syscall impl to wait for
+	 * completion).
+	 */
+	atomic_t num_unfinished;
+	struct completion all_finished;
+};
+
+struct tsync_work {
+	struct callback_head work;
+	struct task_struct *task;
+	struct tsync_shared_context *shared_ctx;
+};
+
+/*
+ * restrict_one_thread - update a thread's Landlock domain in lockstep with the
+ * other threads in the same process
+ *
+ * When this is run, the same function gets run in all other threads in the same
+ * process (except for the calling thread which called landlock_restrict_self).
+ * The concurrently running invocations of restrict_one_thread coordinate
+ * through the shared ctx object to do their work in lockstep to implement
+ * all-or-nothing semantics for enforcing the new Landlock domain.
+ *
+ * Afterwards, depending on the presence of an error, all threads either commit
+ * or abort the prepared credentials.  The commit operation can not fail any
+ * more.
+ */
+static void restrict_one_thread(struct tsync_shared_context *ctx)
+{
+	int err;
+	struct cred *cred = NULL;
+
+	if (current_cred() == ctx->old_cred) {
+		/*
+		 * Switch out old_cred with new_cred, if possible.
+		 *
+		 * In the common case, where all threads initially point to the same
+		 * struct cred, this optimization avoids creating separate redundant
+		 * credentials objects for each, which would all have the same contents.
+		 *
+		 * Note: We are intentionally dropping the const qualifier here, because
+		 * it is required by commit_creds() and abort_creds().
+		 */
+		cred = (struct cred *)get_cred(ctx->new_cred);
+	} else {
+		/* Else, prepare new creds and populate them. */
+		cred = prepare_creds();
+
+		if (!cred) {
+			atomic_set(&ctx->preparation_error, -ENOMEM);
+
+			/*
+			 * Even on error, we need to adhere to the protocol and coordinate
+			 * with concurrently running invocations.
+			 */
+			if (atomic_dec_return(&ctx->num_preparing) == 0)
+				complete_all(&ctx->all_prepared);
+
+			goto out;
+		}
+
+		landlock_cred_copy(landlock_cred(cred),
+				   landlock_cred(ctx->new_cred));
+	}
+
+	/*
+	 * Barrier: Wait until all threads are done preparing.
+	 * After this point, we can have no more failures.
+	 */
+	if (atomic_dec_return(&ctx->num_preparing) == 0)
+		complete_all(&ctx->all_prepared);
+
+	/*
+	 * Wait for signal from calling thread that it's safe to read the
+	 * preparation error now and we are ready to commit (or abort).
+	 */
+	wait_for_completion(&ctx->ready_to_commit);
+
+	/* Abort the commit if any of the other threads had an error. */
+	err = atomic_read(&ctx->preparation_error);
+	if (err) {
+		abort_creds(cred);
+		goto out;
+	}
+
+	/*
+	 * Make sure that all sibling tasks fulfill the no_new_privs prerequisite.
+	 * (This is in line with Seccomp's SECCOMP_FILTER_FLAG_TSYNC logic in
+	 * kernel/seccomp.c)
+	 */
+	if (ctx->set_no_new_privs)
+		task_set_no_new_privs(current);
+
+	commit_creds(cred);
+
+out:
+	/* Notify the calling thread once all threads are done */
+	if (atomic_dec_return(&ctx->num_unfinished) == 0)
+		complete_all(&ctx->all_finished);
+}
+
+/*
+ * restrict_one_thread_callback - task_work callback for restricting a thread
+ *
+ * Calls restrict_one_thread with the struct landlock_shared_tsync_context.
+ */
+static void restrict_one_thread_callback(struct callback_head *work)
+{
+	struct tsync_work *ctx = container_of(work, struct tsync_work, work);
+
+	restrict_one_thread(ctx->shared_ctx);
+}
+
+/*
+ * struct tsync_works - a growable array of per-task contexts
+ *
+ * The zero-initialized struct represents the empty array.
+ */
+struct tsync_works {
+	struct tsync_work **works;
+	size_t size;
+	size_t capacity;
+};
+
+/*
+ * tsync_works_provide - provides a preallocated tsync_work for the given task
+ *
+ * This also stores a task pointer in the context and increments the reference
+ * count of the task.
+ *
+ * This function may fail in the case where we did not preallocate sufficient
+ * capacity.  This can legitimately happen if new threads get started after we
+ * grew the capacity.
+ *
+ * Returns:
+ *   A pointer to the preallocated context struct, with task filled in.
+ *
+ *   NULL, if we ran out of preallocated context structs.
+ */
+static struct tsync_work *tsync_works_provide(struct tsync_works *s,
+					      struct task_struct *task)
+{
+	struct tsync_work *ctx;
+
+	if (s->size >= s->capacity)
+		return NULL;
+
+	ctx = s->works[s->size];
+	s->size++;
+
+	ctx->task = get_task_struct(task);
+	return ctx;
+}
+
+/*
+ * tsync_works_grow_by - preallocates space for n more contexts in s
+ *
+ * On a successful return, the subsequent n calls to tsync_works_provide() are
+ * guaranteed to succeed.  (size + n <= capacity)
+ *
+ * Returns:
+ *   -ENOMEM if the (re)allocation fails
+
+ *   0       if the allocation succeeds, partially succeeds, or no reallocation
+ *           was needed
+ */
+static int tsync_works_grow_by(struct tsync_works *s, size_t n, gfp_t flags)
+{
+	size_t i;
+	size_t new_capacity;
+	struct tsync_work **works;
+	struct tsync_work *work;
+
+	if (check_add_overflow(s->size, n, &new_capacity))
+		return -EOVERFLOW;
+
+	/* No need to reallocate if s already has sufficient capacity. */
+	if (new_capacity <= s->capacity)
+		return 0;
+
+	works = krealloc_array(s->works, new_capacity, sizeof(s->works[0]),
+			       flags);
+	if (!works)
+		return -ENOMEM;
+
+	s->works = works;
+
+	for (i = s->capacity; i < new_capacity; i++) {
+		work = kzalloc(sizeof(*work), flags);
+		if (!work) {
+			/*
+			 * Leave the object in a consistent state,
+			 * but return an error.
+			 */
+			s->capacity = i;
+			return -ENOMEM;
+		}
+		s->works[i] = work;
+	}
+	s->capacity = new_capacity;
+	return 0;
+}
+
+/*
+ * tsync_works_contains - checks for presence of task in s
+ */
+static bool tsync_works_contains_task(const struct tsync_works *s,
+				      struct task_struct *task)
+{
+	size_t i;
+
+	for (i = 0; i < s->size; i++)
+		if (s->works[i]->task == task)
+			return true;
+	return false;
+}
+
+/*
+ * tsync_works_release - frees memory held by s and drops all task references
+ *
+ * This does not free s itself, only the data structures held by it.
+ */
+static void tsync_works_release(struct tsync_works *s)
+{
+	size_t i;
+
+	for (i = 0; i < s->size; i++) {
+		if (!s->works[i]->task)
+			continue;
+
+		put_task_struct(s->works[i]->task);
+	}
+
+	for (i = 0; i < s->capacity; i++)
+		kfree(s->works[i]);
+	kfree(s->works);
+	s->works = NULL;
+	s->size = 0;
+	s->capacity = 0;
+}
+
+/*
+ * count_additional_threads - counts the sibling threads that are not in works
+ */
+static size_t count_additional_threads(const struct tsync_works *works)
+{
+	struct task_struct *thread, *caller;
+	size_t n = 0;
+
+	caller = current;
+
+	guard(rcu)();
+
+	for_each_thread(caller, thread) {
+		/* Skip current, since it is initiating the sync. */
+		if (thread == caller)
+			continue;
+
+		/* Skip exited threads. */
+		if (thread->flags & PF_EXITING)
+			continue;
+
+		/* Skip threads that we have already seen. */
+		if (tsync_works_contains_task(works, thread))
+			continue;
+
+		n++;
+	}
+	return n;
+}
+
+/*
+ * schedule_task_work - adds task_work for all eligible sibling threads
+ *                      which have not been scheduled yet
+ *
+ * For each added task_work, atomically increments shared_ctx->num_preparing and
+ * shared_ctx->num_unfinished.
+ *
+ * Returns:
+ *     true, if at least one eligible sibling thread was found
+ */
+static bool schedule_task_work(struct tsync_works *works,
+			       struct tsync_shared_context *shared_ctx)
+{
+	int err;
+	struct task_struct *thread, *caller;
+	struct tsync_work *ctx;
+	bool found_more_threads = false;
+
+	caller = current;
+
+	guard(rcu)();
+
+	for_each_thread(caller, thread) {
+		/* Skip current, since it is initiating the sync. */
+		if (thread == caller)
+			continue;
+
+		/* Skip exited threads. */
+		if (thread->flags & PF_EXITING)
+			continue;
+
+		/* Skip threads that we already looked at. */
+		if (tsync_works_contains_task(works, thread))
+			continue;
+
+		/*
+		 * We found a sibling thread that is not doing its task_work yet, and
+		 * which might spawn new threads before our task work runs, so we need
+		 * at least one more round in the outer loop.
+		 */
+		found_more_threads = true;
+
+		ctx = tsync_works_provide(works, thread);
+		if (!ctx) {
+			/*
+			 * We ran out of preallocated contexts -- we need to try again with
+			 * this thread at a later time!
+			 * found_more_threads is already true at this point.
+			 */
+			break;
+		}
+
+		ctx->shared_ctx = shared_ctx;
+
+		atomic_inc(&shared_ctx->num_preparing);
+		atomic_inc(&shared_ctx->num_unfinished);
+
+		init_task_work(&ctx->work, restrict_one_thread_callback);
+		err = task_work_add(thread, &ctx->work, TWA_SIGNAL);
+		if (err) {
+			/*
+			 * task_work_add() only fails if the task is about to exit.  We
+			 * checked that earlier, but it can happen as a race.  Resume
+			 * without setting an error, as the task is probably gone in the
+			 * next loop iteration.  For consistency, remove the task from ctx
+			 * so that it does not look like we handed it a task_work.
+			 */
+			put_task_struct(ctx->task);
+			ctx->task = NULL;
+
+			atomic_dec(&shared_ctx->num_preparing);
+			atomic_dec(&shared_ctx->num_unfinished);
+		}
+	}
+
+	return found_more_threads;
+}
+
+/*
+ * cancel_tsync_works - cancel all task works where it is possible
+ *
+ * Task works can be canceled as long as they are still queued and have not
+ * started running.  If they get canceled, we decrement
+ * shared_ctx->num_preparing and shared_ctx->num_unfished and mark the two
+ * completions if needed, as if the task was never scheduled.
+ */
+static void cancel_tsync_works(struct tsync_works *works,
+			       struct tsync_shared_context *shared_ctx)
+{
+	int i;
+
+	for (i = 0; i < works->size; i++) {
+		if (!task_work_cancel(works->works[i]->task,
+				      &works->works[i]->work))
+			continue;
+
+		/* After dequeueing, act as if the task work had executed. */
+
+		if (atomic_dec_return(&shared_ctx->num_preparing) == 0)
+			complete_all(&shared_ctx->all_prepared);
+
+		if (atomic_dec_return(&shared_ctx->num_unfinished) == 0)
+			complete_all(&shared_ctx->all_finished);
+	}
+}
+
+/*
+ * restrict_sibling_threads - enables a Landlock policy for all sibling threads
+ */
+int landlock_restrict_sibling_threads(const struct cred *old_cred,
+				      const struct cred *new_cred)
+{
+	int err;
+	struct tsync_shared_context shared_ctx;
+	struct tsync_works works = {};
+	size_t newly_discovered_threads;
+	bool found_more_threads;
+
+	atomic_set(&shared_ctx.preparation_error, 0);
+	init_completion(&shared_ctx.all_prepared);
+	init_completion(&shared_ctx.ready_to_commit);
+	atomic_set(&shared_ctx.num_unfinished, 1);
+	init_completion(&shared_ctx.all_finished);
+	shared_ctx.old_cred = old_cred;
+	shared_ctx.new_cred = new_cred;
+	shared_ctx.set_no_new_privs = task_no_new_privs(current);
+
+	/*
+	 * We schedule a pseudo-signal task_work for each of the calling task's
+	 * sibling threads.  In the task work, each thread:
+	 *
+	 * 1) runs prepare_creds() and writes back the error to
+	 *    shared_ctx.preparation_error, if needed.
+	 *
+	 * 2) signals that it's done with prepare_creds() to the calling task.
+	 *    (completion "all_prepared").
+	 *
+	 * 3) waits for the completion "ready_to_commit".  This is sent by the
+	 *    calling task after ensuring that all sibling threads have done
+	 *    with the "preparation" stage.
+	 *
+	 *    After this barrier is reached, it's safe to read
+	 *    shared_ctx.preparation_error.
+	 *
+	 * 4) reads shared_ctx.preparation_error and then either does commit_creds()
+	 *    or abort_creds().
+	 *
+	 * 5) signals that it's done altogether (barrier synchronization
+	 *    "all_finished")
+	 *
+	 * Unlike seccomp, which modifies sibling tasks directly, we do not need to
+	 * acquire the cred_guard_mutex and sighand->siglock:
+	 *
+	 * - As in our case, all threads are themselves exchanging their own struct
+	 *   cred through the credentials API, no locks are needed for that.
+	 * - Our for_each_thread() loops are protected by RCU.
+	 * - We do not acquire a lock to keep the list of sibling threads stable
+	 *   between our for_each_thread loops.  If the list of available sibling
+	 *   threads changes between these for_each_thread loops, we make up for
+	 *   that by continuing to look for threads until they are all discovered
+	 *   and have entered their task_work, where they are unable to spawn new
+	 *   threads.
+	 */
+	do {
+		/* In RCU read-lock, count the threads we need. */
+		newly_discovered_threads = count_additional_threads(&works);
+
+		if (newly_discovered_threads == 0)
+			break; /* done */
+
+		err = tsync_works_grow_by(&works, newly_discovered_threads,
+					  GFP_KERNEL_ACCOUNT);
+		if (err) {
+			atomic_set(&shared_ctx.preparation_error, err);
+			break;
+		}
+
+		/*
+		 * The "all_prepared" barrier is used locally to the loop body, this use
+		 * of for_each_thread().  We can reset it on each loop iteration because
+		 * all previous loop iterations are done with it already.
+		 *
+		 * num_preparing is initialized to 1 so that the counter can not go to 0
+		 * and mark the completion as done before all task works are registered.
+		 * We decrement it at the end of the loop body.
+		 */
+		atomic_set(&shared_ctx.num_preparing, 1);
+		reinit_completion(&shared_ctx.all_prepared);
+
+		/*
+		 * In RCU read-lock, schedule task work on newly discovered sibling
+		 * tasks.
+		 */
+		found_more_threads = schedule_task_work(&works, &shared_ctx);
+
+		/*
+		 * Decrement num_preparing for current, to undo that we initialized it
+		 * to 1 a few lines above.
+		 */
+		if (atomic_dec_return(&shared_ctx.num_preparing) > 0) {
+			if (wait_for_completion_interruptible(
+				    &shared_ctx.all_prepared)) {
+				/* In case of interruption, we need to retry the system call. */
+				atomic_set(&shared_ctx.preparation_error,
+					   -ERESTARTNOINTR);
+
+				/*
+				 * Cancel task works for tasks that did not start running yet,
+				 * and decrement all_prepared and num_unfinished accordingly.
+				 */
+				cancel_tsync_works(&works, &shared_ctx);
+
+				/*
+				 * The remaining task works have started running, so waiting for
+				 * their completion will finish.
+				 */
+				wait_for_completion(&shared_ctx.all_prepared);
+			}
+		}
+	} while (found_more_threads &&
+		 !atomic_read(&shared_ctx.preparation_error));
+
+	/*
+	 * We now have all sibling threads blocking and in "prepared" state in the
+	 * task work. Ask all threads to commit.
+	 */
+	complete_all(&shared_ctx.ready_to_commit);
+
+	/*
+	 * Decrement num_unfinished for current, to undo that we initialized it to 1
+	 * at the beginning.
+	 */
+	if (atomic_dec_return(&shared_ctx.num_unfinished) > 0)
+		wait_for_completion(&shared_ctx.all_finished);
+
+	tsync_works_release(&works);
+
+	return atomic_read(&shared_ctx.preparation_error);
+}
diff --git a/security/landlock/tsync.h b/security/landlock/tsync.h
new file mode 100644
index 000000000000..ef86bb61c2f6
--- /dev/null
+++ b/security/landlock/tsync.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Landlock - Cross-thread ruleset enforcement
+ *
+ * Copyright © 2025 Google LLC
+ */
+
+#ifndef _SECURITY_LANDLOCK_TSYNC_H
+#define _SECURITY_LANDLOCK_TSYNC_H
+
+#include <linux/cred.h>
+
+int landlock_restrict_sibling_threads(const struct cred *old_cred,
+				      const struct cred *new_cred);
+
+#endif /* _SECURITY_LANDLOCK_TSYNC_H */
diff --git a/tools/testing/selftests/landlock/base_test.c b/tools/testing/selftests/landlock/base_test.c
index 7b69002239d7..fdbb672009ac 100644
--- a/tools/testing/selftests/landlock/base_test.c
+++ b/tools/testing/selftests/landlock/base_test.c
@@ -76,7 +76,7 @@ TEST(abi_version)
 	const struct landlock_ruleset_attr ruleset_attr = {
 		.handled_access_fs = LANDLOCK_ACCESS_FS_READ_FILE,
 	};
-	ASSERT_EQ(7, landlock_create_ruleset(NULL, 0,
+	ASSERT_EQ(8, landlock_create_ruleset(NULL, 0,
 					     LANDLOCK_CREATE_RULESET_VERSION));
 
 	ASSERT_EQ(-1, landlock_create_ruleset(&ruleset_attr, 0,
@@ -306,7 +306,7 @@ TEST(restrict_self_fd_flags)
 
 TEST(restrict_self_flags)
 {
-	const __u32 last_flag = LANDLOCK_RESTRICT_SELF_LOG_SUBDOMAINS_OFF;
+	const __u32 last_flag = LANDLOCK_RESTRICT_SELF_TSYNC;
 
 	/* Tests invalid flag combinations. */
 
-- 
cgit v1.2.3


From 50c058e3eafe31a5197d4cffb599f2f5f165d4eb Mon Sep 17 00:00:00 2001
From: Günther Noack <gnoack@google.com>
Date: Thu, 27 Nov 2025 12:51:35 +0100
Subject: selftests/landlock: Add LANDLOCK_RESTRICT_SELF_TSYNC tests
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Exercise various scenarios where Landlock domains are enforced across
all of a processes' threads.

Test coverage for security/landlock is 91.6% of 2130 lines according to
LLVM 21.

Cc: Andrew G. Morgan <morgan@kernel.org>
Cc: John Johansen <john.johansen@canonical.com>
Cc: Paul Moore <paul@paul-moore.com>
Signed-off-by: Günther Noack <gnoack@google.com>
Link: https://lore.kernel.org/r/20251127115136.3064948-3-gnoack@google.com
[mic: Fix subject, use EXPECT_EQ(close()), make helpers static, add test
coverage]
Signed-off-by: Mickaël Salaün <mic@digikod.net>
---
 tools/testing/selftests/landlock/base_test.c  |   4 +-
 tools/testing/selftests/landlock/tsync_test.c | 161 ++++++++++++++++++++++++++
 2 files changed, 163 insertions(+), 2 deletions(-)
 create mode 100644 tools/testing/selftests/landlock/tsync_test.c

(limited to 'tools')

diff --git a/tools/testing/selftests/landlock/base_test.c b/tools/testing/selftests/landlock/base_test.c
index fdbb672009ac..0fea236ef4bd 100644
--- a/tools/testing/selftests/landlock/base_test.c
+++ b/tools/testing/selftests/landlock/base_test.c
@@ -288,7 +288,7 @@ TEST(restrict_self_fd)
 	EXPECT_EQ(EBADFD, errno);
 }
 
-TEST(restrict_self_fd_flags)
+TEST(restrict_self_fd_logging_flags)
 {
 	int fd;
 
@@ -304,7 +304,7 @@ TEST(restrict_self_fd_flags)
 	EXPECT_EQ(EBADFD, errno);
 }
 
-TEST(restrict_self_flags)
+TEST(restrict_self_logging_flags)
 {
 	const __u32 last_flag = LANDLOCK_RESTRICT_SELF_TSYNC;
 
diff --git a/tools/testing/selftests/landlock/tsync_test.c b/tools/testing/selftests/landlock/tsync_test.c
new file mode 100644
index 000000000000..37ef0d2270db
--- /dev/null
+++ b/tools/testing/selftests/landlock/tsync_test.c
@@ -0,0 +1,161 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Landlock tests - Enforcing the same restrictions across multiple threads
+ *
+ * Copyright © 2025 Günther Noack <gnoack3000@gmail.com>
+ */
+
+#define _GNU_SOURCE
+#include <pthread.h>
+#include <sys/prctl.h>
+#include <linux/landlock.h>
+
+#include "common.h"
+
+/* create_ruleset - Create a simple ruleset FD common to all tests */
+static int create_ruleset(struct __test_metadata *const _metadata)
+{
+	struct landlock_ruleset_attr ruleset_attr = {
+		.handled_access_fs = (LANDLOCK_ACCESS_FS_WRITE_FILE |
+				      LANDLOCK_ACCESS_FS_TRUNCATE),
+	};
+	const int ruleset_fd =
+		landlock_create_ruleset(&ruleset_attr, sizeof(ruleset_attr), 0);
+
+	ASSERT_LE(0, ruleset_fd)
+	{
+		TH_LOG("landlock_create_ruleset: %s", strerror(errno));
+	}
+	return ruleset_fd;
+}
+
+TEST(single_threaded_success)
+{
+	const int ruleset_fd = create_ruleset(_metadata);
+
+	disable_caps(_metadata);
+
+	ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0));
+	ASSERT_EQ(0, landlock_restrict_self(ruleset_fd,
+					    LANDLOCK_RESTRICT_SELF_TSYNC));
+
+	EXPECT_EQ(0, close(ruleset_fd));
+}
+
+static void store_no_new_privs(void *data)
+{
+	bool *nnp = data;
+
+	if (!nnp)
+		return;
+	*nnp = prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0);
+}
+
+static void *idle(void *data)
+{
+	pthread_cleanup_push(store_no_new_privs, data);
+
+	while (true)
+		sleep(1);
+
+	pthread_cleanup_pop(1);
+}
+
+TEST(multi_threaded_success)
+{
+	pthread_t t1, t2;
+	bool no_new_privs1, no_new_privs2;
+	const int ruleset_fd = create_ruleset(_metadata);
+
+	disable_caps(_metadata);
+
+	ASSERT_EQ(0, pthread_create(&t1, NULL, idle, &no_new_privs1));
+	ASSERT_EQ(0, pthread_create(&t2, NULL, idle, &no_new_privs2));
+
+	ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0));
+
+	EXPECT_EQ(0, landlock_restrict_self(ruleset_fd,
+					    LANDLOCK_RESTRICT_SELF_TSYNC));
+
+	ASSERT_EQ(0, pthread_cancel(t1));
+	ASSERT_EQ(0, pthread_cancel(t2));
+	ASSERT_EQ(0, pthread_join(t1, NULL));
+	ASSERT_EQ(0, pthread_join(t2, NULL));
+
+	/* The no_new_privs flag was implicitly enabled on all threads. */
+	EXPECT_TRUE(no_new_privs1);
+	EXPECT_TRUE(no_new_privs2);
+
+	EXPECT_EQ(0, close(ruleset_fd));
+}
+
+TEST(multi_threaded_success_despite_diverging_domains)
+{
+	pthread_t t1, t2;
+	const int ruleset_fd = create_ruleset(_metadata);
+
+	disable_caps(_metadata);
+
+	ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0));
+
+	ASSERT_EQ(0, pthread_create(&t1, NULL, idle, NULL));
+	ASSERT_EQ(0, pthread_create(&t2, NULL, idle, NULL));
+
+	/*
+	 * The main thread enforces a ruleset,
+	 * thereby bringing the threads' Landlock domains out of sync.
+	 */
+	EXPECT_EQ(0, landlock_restrict_self(ruleset_fd, 0));
+
+	/* Still, TSYNC succeeds, bringing the threads in sync again. */
+	EXPECT_EQ(0, landlock_restrict_self(ruleset_fd,
+					    LANDLOCK_RESTRICT_SELF_TSYNC));
+
+	ASSERT_EQ(0, pthread_cancel(t1));
+	ASSERT_EQ(0, pthread_cancel(t2));
+	ASSERT_EQ(0, pthread_join(t1, NULL));
+	ASSERT_EQ(0, pthread_join(t2, NULL));
+	EXPECT_EQ(0, close(ruleset_fd));
+}
+
+struct thread_restrict_data {
+	pthread_t t;
+	int ruleset_fd;
+	int result;
+};
+
+static void *thread_restrict(void *data)
+{
+	struct thread_restrict_data *d = data;
+
+	d->result = landlock_restrict_self(d->ruleset_fd,
+					   LANDLOCK_RESTRICT_SELF_TSYNC);
+	return NULL;
+}
+
+TEST(competing_enablement)
+{
+	const int ruleset_fd = create_ruleset(_metadata);
+	struct thread_restrict_data d[] = {
+		{ .ruleset_fd = ruleset_fd },
+		{ .ruleset_fd = ruleset_fd },
+	};
+
+	disable_caps(_metadata);
+
+	ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0));
+	ASSERT_EQ(0, pthread_create(&d[0].t, NULL, thread_restrict, &d[0]));
+	ASSERT_EQ(0, pthread_create(&d[1].t, NULL, thread_restrict, &d[1]));
+
+	/* Wait for threads to finish. */
+	ASSERT_EQ(0, pthread_join(d[0].t, NULL));
+	ASSERT_EQ(0, pthread_join(d[1].t, NULL));
+
+	/* Expect that both succeeded. */
+	EXPECT_EQ(0, d[0].result);
+	EXPECT_EQ(0, d[1].result);
+
+	EXPECT_EQ(0, close(ruleset_fd));
+}
+
+TEST_HARNESS_MAIN
-- 
cgit v1.2.3


From 42e41b2a0afa04ca49ee2725aadf90ccb058ed28 Mon Sep 17 00:00:00 2001
From: Larysa Zaremba <larysa.zaremba@intel.com>
Date: Tue, 3 Feb 2026 16:50:57 +0100
Subject: selftests/xsk: properly handle batch ending in the middle of a packet

Referenced commit reduced the scope of the variable pkt, so now it has to
be reinitialized via pkt_stream_get_next_rx_pkt(), which also increments
some counters. When the packet is interrupted by the batch ending, pkt
stream therefore proceeds to the next packet, while xsk ring still contains
the previous one, this results in a pkt_nb mismatch.

Decrement the affected counters when packet is interrupted.

Fixes: 8913e653e9b8 ("selftests/xsk: Iterate over all the sockets in the receive pkts function")
Reviewed-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
Signed-off-by: Larysa Zaremba <larysa.zaremba@intel.com>
Link: https://lore.kernel.org/r/20260203155103.2305816-2-larysa.zaremba@intel.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/prog_tests/test_xsk.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/test_xsk.c b/tools/testing/selftests/bpf/prog_tests/test_xsk.c
index 5af28f359cfd..69a5a9a5189b 100644
--- a/tools/testing/selftests/bpf/prog_tests/test_xsk.c
+++ b/tools/testing/selftests/bpf/prog_tests/test_xsk.c
@@ -1090,6 +1090,8 @@ static int __receive_pkts(struct test_spec *test, struct xsk_socket_info *xsk)
 			xsk_ring_prod__cancel(&umem->fq, nb_frags);
 		}
 		frags_processed -= nb_frags;
+		pkt_stream_cancel(pkt_stream);
+		pkts_sent--;
 	}
 
 	if (ifobj->use_fill_ring)
-- 
cgit v1.2.3


From 88af9fefed412e4bea9a1a771cbe6fe347fa3507 Mon Sep 17 00:00:00 2001
From: Larysa Zaremba <larysa.zaremba@intel.com>
Date: Tue, 3 Feb 2026 16:50:58 +0100
Subject: selftests/xsk: fix number of Tx frags in invalid packet

The issue occurs in TOO_MANY_FRAGS test case when xdp_zc_max_segs is set to
an odd number.

TOO_MANY_FRAGS test case contains an invalid packet consisting of
(xdp_zc_max_segs) frags. Every frag, even the last one has XDP_PKT_CONTD
flag set. This packet is expected to be dropped. After that, there is a
valid linear packet, which is expected to be received back.

Once (xdp_zc_max_segs) is an odd number, the last packet cannot be
received, if packet forwarding between Rx and Tx interfaces relies on
the ethernet header, e.g. checks for ETH_P_LOOPBACK. Packet is malformed,
if all traffic is looped.

Turns out, sending function processes multiple invalid frags as if they
were in 2-frag packets. So once the invalid mbuf packet contains an odd
number of those, the valid packet after gets paired with the previous
invalid descriptor, and hence does not get an ethernet header generated, so
it is either dropped or malformed.

Make invalid packets in verbatim mode always have only a single frag. For
such packets, number of frags is otherwise meaningless, as descriptor flags
are pre-configured in verbatim mode and packet data is not generated for
invalid descriptors.

Fixes: 697604492b64 ("selftests/xsk: add invalid descriptor test for multi-buffer")
Reviewed-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
Signed-off-by: Larysa Zaremba <larysa.zaremba@intel.com>
Link: https://lore.kernel.org/r/20260203155103.2305816-3-larysa.zaremba@intel.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/prog_tests/test_xsk.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/test_xsk.c b/tools/testing/selftests/bpf/prog_tests/test_xsk.c
index 69a5a9a5189b..bab4a31621c7 100644
--- a/tools/testing/selftests/bpf/prog_tests/test_xsk.c
+++ b/tools/testing/selftests/bpf/prog_tests/test_xsk.c
@@ -433,7 +433,7 @@ static u32 pkt_nb_frags(u32 frame_size, struct pkt_stream *pkt_stream, struct pk
 	}
 
 	/* Search for the end of the packet in verbatim mode */
-	if (!pkt_continues(pkt->options))
+	if (!pkt_continues(pkt->options) || !pkt->valid)
 		return nb_frags;
 
 	next_frag = pkt_stream->current_pkt_nb;
-- 
cgit v1.2.3


From a724a8fce5e25b45b2146abea61d22d6634dde59 Mon Sep 17 00:00:00 2001
From: Leo Yan <leo.yan@arm.com>
Date: Fri, 6 Feb 2026 11:00:20 +0000
Subject: perf kvm stat: Fix build error

Since commit ceea279f9376 ("perf kvm stat: Remove use of the arch
directory"), a native build on Arm64 machine reports:

  util/kvm-stat-arch/kvm-stat-x86.c:7:10: fatal error: asm/svm.h: No such file or directory
    7 | #include <asm/svm.h>
      |          ^~~~~~~~~~~
  compilation terminated.

The build fails to find x86's asm headers when building for Arm64.  Fix
this by including asm headers with relative path instead.

Fixes: ceea279f9376 ("perf kvm stat: Remove use of the arch directory")
Signed-off-by: Leo Yan <leo.yan@arm.com>
Link: https://lore.kernel.org/r/20260206-perf_fix_kvm_stat_error-v1-1-ad40115876be@arm.com
Cc: Ian Rogers <irogers@google.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: James Clark <james.clark@linaro.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: linux-kernel@vger.kernel.org
Cc: linux-perf-users@vger.kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/kvm-stat-arch/kvm-stat-x86.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/kvm-stat-arch/kvm-stat-x86.c b/tools/perf/util/kvm-stat-arch/kvm-stat-x86.c
index 1cf541385a4b..43275d25b6cb 100644
--- a/tools/perf/util/kvm-stat-arch/kvm-stat-x86.c
+++ b/tools/perf/util/kvm-stat-arch/kvm-stat-x86.c
@@ -4,9 +4,9 @@
 #include "../kvm-stat.h"
 #include "../evsel.h"
 #include "../env.h"
-#include <asm/svm.h>
-#include <asm/vmx.h>
-#include <asm/kvm.h>
+#include "../../arch/x86/include/uapi/asm/svm.h"
+#include "../../arch/x86/include/uapi/asm/vmx.h"
+#include "../../arch/x86/include/uapi/asm/kvm.h"
 #include <subcmd/parse-options.h>
 
 define_exit_reasons_table(vmx_exit_reasons, VMX_EXIT_REASONS);
-- 
cgit v1.2.3


From 04f81f45b432feab13a169a82a032987e948b1a6 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Thu, 5 Feb 2026 12:56:21 -0800
Subject: perf callchain lbr: Make the leaf IP that of the sample
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The current IP of a leaf function when reported from a perf record with
"--call-graph lbr" is the "to" field of the LBR branch stack record.

The sample for the event being recorded may be further into the function
and there may be inlining information associated with it.

Rather than use the branch stack "to" field in this case switch to the
callchain appending the sample->ip and thereby allowing the inline
information to show.

Before this change:
```
$ perf record --call-graph lbr perf test -w inlineloop
...
$ perf script --fields +srcline
...
perf-inlineloop  467586  4649.344493:     950905 cpu_core/cycles/P:
           55dfda2829c0 parent+0x0 (perf)
 inlineloop.c:31
           55dfda282a96 inlineloop+0x86 (perf)
 inlineloop.c:47
           55dfda236420 run_workload+0x59 (perf)
 builtin-test.c:715
           55dfda236b03 cmd_test+0x413 (perf)
 builtin-test.c:825
...
```

After this change:
```
$ perf record --call-graph lbr perf test -w inlineloop
...
$ perf script --fields +srcline
...
perf-inlineloop  529703 11878.680815:     950905 cpu_core/cycles/P:
            555ce86be9e6 leaf+0x26
  inlineloop.c:20 (inlined)
            555ce86be9e6 middle+0x26
  inlineloop.c:27 (inlined)
            555ce86be9e6 parent+0x26 (perf)
  inlineloop.c:32
            555ce86bea96 inlineloop+0x86 (perf)
  inlineloop.c:47
            555ce8672420 run_workload+0x59 (perf)
  builtin-test.c:715
            555ce8672b03 cmd_test+0x413 (perf)
  builtin-test.c:825
...
```

Reviewed-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Ian Rogers <irogers@google.com>
Acked-by: Dapeng Mi <dapeng1.mi@linux.intel.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Dmitriy Vyukov <dvyukov@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Krzysztof Łopatowski <krzysztof.m.lopatowski@gmail.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Weilin Wang <weilin.wang@intel.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/machine.c | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
index 5b0f5a48ffd4..e76f8c86e62a 100644
--- a/tools/perf/util/machine.c
+++ b/tools/perf/util/machine.c
@@ -2423,8 +2423,14 @@ static int lbr_callchain_add_lbr_ip(struct thread *thread,
 	}
 
 	if (callee) {
-		/* Add LBR ip from first entries.to */
-		ip = entries[0].to;
+		/*
+		 * Set the (first) leaf function's IP to sample->ip (the
+		 * location of the sample) but if not recorded use entries.to
+		 */
+		if (sample->ip)
+			ip = sample->ip;
+		else
+			ip = entries[0].to;
 		flags = &entries[0].flags;
 		*branch_from = entries[0].from;
 		err = add_callchain_ip(thread, cursor, parent,
@@ -2477,8 +2483,14 @@ static int lbr_callchain_add_lbr_ip(struct thread *thread,
 	}
 
 	if (lbr_nr > 0) {
-		/* Add LBR ip from first entries.to */
-		ip = entries[0].to;
+		/*
+		 * Set the (first) leaf function's IP to sample->ip (the
+		 * location of the sample) but if not recorded use entries.to
+		 */
+		if (sample->ip)
+			ip = sample->ip;
+		else
+			ip = entries[0].to;
 		flags = &entries[0].flags;
 		*branch_from = entries[0].from;
 		err = add_callchain_ip(thread, cursor, parent,
-- 
cgit v1.2.3


From 446c595dc0dd1759e56a7d736752d65361e13753 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Thu, 5 Feb 2026 12:56:22 -0800
Subject: perf test addr2line_inlines: Ensure inline information shows on LBR
 leaves
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Expand the addr2line inline function testing to also run for an LBR
callchain, skipping if LBR support isn't present.

Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Dapeng Mi <dapeng1.mi@linux.intel.com>
Cc: Dmitriy Vyukov <dvyukov@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Krzysztof Łopatowski <krzysztof.m.lopatowski@gmail.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Weilin Wang <weilin.wang@intel.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/tests/shell/addr2line_inlines.sh | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

(limited to 'tools')

diff --git a/tools/perf/tests/shell/addr2line_inlines.sh b/tools/perf/tests/shell/addr2line_inlines.sh
index ce30d9c7e0bf..e8754ef2d7f2 100755
--- a/tools/perf/tests/shell/addr2line_inlines.sh
+++ b/tools/perf/tests/shell/addr2line_inlines.sh
@@ -61,8 +61,36 @@ test_dwarf() {
     fi
 }
 
+test_lbr() {
+    echo "Inline unwinding LBR verification test"
+    if [ ! -f /sys/bus/event_source/devices/cpu/caps/branches ] &&
+       [ ! -f /sys/bus/event_source/devices/cpu_core/caps/branches ]
+    then
+        echo "Skip: only x86 CPUs support LBR"
+        return
+    fi
+
+    # Record data. Currently only dwarf callchains support inlined functions.
+    perf record --call-graph lbr -e cycles:u -o "${perf_data}" -- perf test -w inlineloop 1
+
+    # Check output with inline (default) and srcline
+    perf script -i "${perf_data}" --fields +srcline > "${perf_script_txt}"
+
+    # Expect the leaf and middle functions to occur on lines in the 20s, with
+    # the non-inlined parent function on a line in the 30s.
+    if grep -q "inlineloop.c:2. (inlined)" "${perf_script_txt}" &&
+       grep -q "inlineloop.c:3.$" "${perf_script_txt}"
+    then
+        echo "Inline unwinding lbr verification test [Success]"
+    else
+        echo "Inline unwinding lbr verification test [Failed missing inlined functions]"
+        err=1
+    fi
+}
+
 test_fp
 test_dwarf
+test_lbr
 
 cleanup
 exit $err
-- 
cgit v1.2.3


From bb5a920b9099127915706fdd23eb540c9a69c338 Mon Sep 17 00:00:00 2001
From: Chun-Tse Shao <ctshao@google.com>
Date: Tue, 3 Feb 2026 15:06:22 -0800
Subject: perf stat: Ensure metrics are displayed even with failed events

Currently, `perf stat` skips or hides metrics when the underlying
hardware events cannot be counted (e.g., due to insufficient permissions
or unsupported events).

In `--metric-only` mode, this often results in missing columns or blank
spaces, making the output difficult to parse.

Modify the logic to ensure metrics are consistently displayed by
propagating NAN (Not a Number) through the expression evaluator.
Specifically:

1. Update `prepare_metric()` in stat-shadow.c to treat uncounted events
   (where `run == 0`) as NAN. This leverages the existing math in expr.y
   to propagate NAN through metric expressions.

2. Remove the early return in the display logic's `printout()` function
   that was previously skipping metrics in `--metric-only` mode for
   failed events.
l
3. Simplify `perf_stat__skip_metric_event()` to no longer depend on
   event runtime.

Tested:

1. `perf all metrics test` did not crash while paranoid is 2.

2. Multiple combinations with `CPUs_utilized` while paranoid is 2.

  $ ./perf stat -M CPUs_utilized -a -- sleep 1

   Performance counter stats for 'system wide':

     <not supported> msec cpu-clock:u                      #      nan CPUs  CPUs_utilized
       1,006,356,120      duration_time

         1.004375550 seconds time elapsed

  $ ./perf stat -M CPUs_utilized -a -j -- sleep 1
  {"counter-value" : "<not supported>", "unit" : "msec", "event" : "cpu-clock:u", "event-runtime" : 0, "pcnt-running" : 100.00, "metric-value" : "nan", "metric-unit" : "CPUs  CPUs_utilized"}
  {"counter-value" : "1006642462.000000", "unit" : "", "event" : "duration_time", "event-runtime" : 1, "pcnt-running" : 100.00}

  $ ./perf stat -M CPUs_utilized -a --metric-only -- sleep 1

   Performance counter stats for 'system wide':

    CPUs  CPUs_utilized
                      nan

         1.004424652 seconds time elapsed

  $ ./perf stat -M CPUs_utilized -a --metric-only -j -- sleep 1
  {"CPUs  CPUs_utilized" : "none"}

Reviewed-by: Ian Rogers <irogers@google.com>
Signed-off-by: Chun-Tse Shao <ctshao@google.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Yang Li <yang.lee@linux.alibaba.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/stat-display.c | 59 +++++++++++++++++++-----------------------
 tools/perf/util/stat-shadow.c  |  8 ++----
 tools/perf/util/stat.h         |  2 +-
 3 files changed, 29 insertions(+), 40 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/stat-display.c b/tools/perf/util/stat-display.c
index 2ce0602974a1..dc2b66855f6c 100644
--- a/tools/perf/util/stat-display.c
+++ b/tools/perf/util/stat-display.c
@@ -820,12 +820,6 @@ static void printout(struct perf_stat_config *config, struct outstate *os,
 	}
 
 	if (run == 0 || ena == 0 || counter->counts->scaled == -1) {
-		if (config->metric_only) {
-			pm(config, os, METRIC_THRESHOLD_UNKNOWN, /*format=*/NULL,
-			   /*unit=*/NULL, /*val=*/0);
-			return;
-		}
-
 		ok = false;
 
 		if (counter->supported) {
@@ -848,33 +842,32 @@ static void printout(struct perf_stat_config *config, struct outstate *os,
 		print_running(config, os, run, ena, /*before_metric=*/true);
 	}
 
-	if (ok) {
-		if (!config->metric_only && counter->default_metricgroup && !counter->default_show_events) {
-			void *from = NULL;
-
-			aggr_printout(config, os, os->evsel, os->id, os->aggr_nr);
-			/* Print out all the metricgroup with the same metric event. */
-			do {
-				int num = 0;
-
-				/* Print out the new line for the next new metricgroup. */
-				if (from) {
-					if (config->json_output)
-						new_line_json(config, (void *)os);
-					else
-						__new_line_std_csv(config, os);
-				}
-
-				print_noise(config, os, counter, noise, /*before_metric=*/true);
-				print_running(config, os, run, ena, /*before_metric=*/true);
-				from = perf_stat__print_shadow_stats_metricgroup(config, counter, aggr_idx,
-										 &num, from, &out);
-			} while (from != NULL);
-		} else {
-			perf_stat__print_shadow_stats(config, counter, aggr_idx, &out);
-		}
+	if (!config->metric_only && counter->default_metricgroup &&
+	    !counter->default_show_events) {
+		void *from = NULL;
+
+		aggr_printout(config, os, os->evsel, os->id, os->aggr_nr);
+		/* Print out all the metricgroup with the same metric event. */
+		do {
+			int num = 0;
+
+			/* Print out the new line for the next new metricgroup. */
+			if (from) {
+				if (config->json_output)
+					new_line_json(config, (void *)os);
+				else
+					__new_line_std_csv(config, os);
+			}
+
+			print_noise(config, os, counter, noise,
+				    /*before_metric=*/true);
+			print_running(config, os, run, ena,
+				      /*before_metric=*/true);
+			from = perf_stat__print_shadow_stats_metricgroup(
+				config, counter, aggr_idx, &num, from, &out);
+		} while (from != NULL);
 	} else {
-		pm(config, os, METRIC_THRESHOLD_UNKNOWN, /*format=*/NULL, /*unit=*/NULL, /*val=*/0);
+		perf_stat__print_shadow_stats(config, counter, aggr_idx, &out);
 	}
 
 	if (!config->metric_only) {
@@ -987,7 +980,7 @@ static void print_counter_aggrdata(struct perf_stat_config *config,
 	ena = aggr->counts.ena;
 	run = aggr->counts.run;
 
-	if (perf_stat__skip_metric_event(counter, ena, run))
+	if (perf_stat__skip_metric_event(counter))
 		return;
 
 	if (val == 0 && should_skip_zero_counter(config, counter, &id))
diff --git a/tools/perf/util/stat-shadow.c b/tools/perf/util/stat-shadow.c
index 9c83f7d96caa..5d8d09e0e6ae 100644
--- a/tools/perf/util/stat-shadow.c
+++ b/tools/perf/util/stat-shadow.c
@@ -83,7 +83,7 @@ static int prepare_metric(struct perf_stat_config *config,
 		}
 		/* Time events are always on CPU0, the first aggregation index. */
 		aggr = &ps->aggr[is_tool_time ? tool_aggr_idx : aggr_idx];
-		if (!aggr || !metric_events[i]->supported) {
+		if (!aggr || !metric_events[i]->supported || aggr->counts.run == 0) {
 			/*
 			 * Not supported events will have a count of 0, which
 			 * can be confusing in a metric. Explicitly set the
@@ -335,14 +335,10 @@ void perf_stat__print_shadow_stats(struct perf_stat_config *config,
  * perf_stat__skip_metric_event - Skip the evsel in the Default metricgroup,
  *				  if it's not running or not the metric event.
  */
-bool perf_stat__skip_metric_event(struct evsel *evsel,
-				  u64 ena, u64 run)
+bool perf_stat__skip_metric_event(struct evsel *evsel)
 {
 	if (!evsel->default_metricgroup)
 		return false;
 
-	if (!ena || !run)
-		return true;
-
 	return !metricgroup__lookup(&evsel->evlist->metric_events, evsel, false);
 }
diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h
index f986911c9296..4bced233d2fc 100644
--- a/tools/perf/util/stat.h
+++ b/tools/perf/util/stat.h
@@ -163,7 +163,7 @@ void perf_stat__print_shadow_stats(struct perf_stat_config *config,
 				   struct evsel *evsel,
 				   int aggr_idx,
 				   struct perf_stat_output_ctx *out);
-bool perf_stat__skip_metric_event(struct evsel *evsel, u64 ena, u64 run);
+bool perf_stat__skip_metric_event(struct evsel *evsel);
 void *perf_stat__print_shadow_stats_metricgroup(struct perf_stat_config *config,
 						struct evsel *evsel,
 						int aggr_idx,
-- 
cgit v1.2.3


From 64ea7a4620008652c7f72065ae61efbde7af3ea0 Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@kernel.org>
Date: Thu, 5 Feb 2026 17:27:43 -0800
Subject: perf annotate: Fix register usage in data type profiling

On data type profiling, it tried to match register name with a partial
string.  For example, it allowed to match with "%rbp)" or "%rdi,8)".

But with recent change in the area, it doesn't match anymore and break
the data type profiling.

Let's pass the correct register name by removing the unwanted part.

Add arch__dwarf_regnum() to handle it in a single place.

Closes: 7d3n23li6drroxrdlpxn7ixehdeszkjdftah3zyngjl2qs22ef@yelcjv53v42o
Reported-by: Dmitry Dolgov <9erthalion6@gmail.com>
Reviewed-by: Ian Rogers <irogers@google.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Zecheng Li <zli94@ncsu.edu>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/annotate.c | 61 +++++++++++++++++++++++-----------------------
 1 file changed, 30 insertions(+), 31 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c
index 880b1bd300c2..2e3522905046 100644
--- a/tools/perf/util/annotate.c
+++ b/tools/perf/util/annotate.c
@@ -2447,6 +2447,29 @@ int annotate_check_args(void)
 	return 0;
 }
 
+static int arch__dwarf_regnum(const struct arch *arch, const char *str)
+{
+	const char *p;
+	char *regname, *q;
+	int reg;
+
+	p = strchr(str, arch->objdump.register_char);
+	if (p == NULL)
+		return -1;
+
+	regname = strdup(p);
+	if (regname == NULL)
+		return -1;
+
+	q = strpbrk(regname, ",) ");
+	if (q)
+		*q = '\0';
+
+	reg = get_dwarf_regnum(regname, arch->id.e_machine, arch->id.e_flags);
+	free(regname);
+	return reg;
+}
+
 /*
  * Get register number and access offset from the given instruction.
  * It assumes AT&T x86 asm format like OFFSET(REG).  Maybe it needs
@@ -2457,7 +2480,6 @@ static int extract_reg_offset(const struct arch *arch, const char *str,
 			      struct annotated_op_loc *op_loc)
 {
 	char *p;
-	char *regname;
 
 	if (arch->objdump.register_char == 0)
 		return -1;
@@ -2482,31 +2504,14 @@ static int extract_reg_offset(const struct arch *arch, const char *str,
 	}
 
 	op_loc->offset = strtol(str, &p, 0);
-
-	p = strchr(p, arch->objdump.register_char);
-	if (p == NULL)
+	op_loc->reg1 = arch__dwarf_regnum(arch, p);
+	if (op_loc->reg1 == -1)
 		return -1;
 
-	regname = strdup(p);
-	if (regname == NULL)
-		return -1;
-
-	op_loc->reg1 = get_dwarf_regnum(regname, arch->id.e_machine, arch->id.e_flags);
-	free(regname);
-
 	/* Get the second register */
-	if (op_loc->multi_regs) {
-		p = strchr(p + 1, arch->objdump.register_char);
-		if (p == NULL)
-			return -1;
-
-		regname = strdup(p);
-		if (regname == NULL)
-			return -1;
+	if (op_loc->multi_regs)
+		op_loc->reg2 = arch__dwarf_regnum(arch, p + 1);
 
-		op_loc->reg2 = get_dwarf_regnum(regname, arch->id.e_machine, arch->id.e_flags);
-		free(regname);
-	}
 	return 0;
 }
 
@@ -2585,7 +2590,8 @@ int annotate_get_insn_location(const struct arch *arch, struct disasm_line *dl,
 			op_loc->multi_regs = multi_regs;
 			extract_reg_offset(arch, insn_str, op_loc);
 		} else {
-			char *s, *p = NULL;
+			const char *s = insn_str;
+			char *p = NULL;
 
 			if (arch__is_x86(arch)) {
 				/* FIXME: Handle other segment registers */
@@ -2599,21 +2605,14 @@ int annotate_get_insn_location(const struct arch *arch, struct disasm_line *dl,
 				}
 			}
 
-			s = strdup(insn_str);
-			if (s == NULL)
-				return -1;
-
 			if (*s == arch->objdump.register_char) {
-				op_loc->reg1 = get_dwarf_regnum(s,
-								arch->id.e_machine,
-								arch->id.e_flags);
+				op_loc->reg1 = arch__dwarf_regnum(arch, s);
 			}
 			else if (*s == arch->objdump.imm_char) {
 				op_loc->offset = strtol(s + 1, &p, 0);
 				if (p && p != s + 1)
 					op_loc->imm = true;
 			}
-			free(s);
 		}
 	}
 
-- 
cgit v1.2.3


From c73a56ed3c97ae6571c2c50e6bc8772b1cee42e0 Mon Sep 17 00:00:00 2001
From: Thomas Richter <tmricht@linux.ibm.com>
Date: Fri, 28 Nov 2025 10:11:39 +0100
Subject: perf test: Fix test case Leader sampling on s390

The subtest 'Leader sampling' some time fails on s390.
- for z/VM guest: Disable the test for z/VM guest. There is no
  CPU Measurement facility to run the test successfully.
- for LPAR: Use correct event names.

A detailed analysis follows here:
Now to the debugging and investigation:
1. With command
       perf record -e '{cycles,cycles}:S' -- ....
   the first cycles event starts sampling.
   On s390 this sets up sampling with a frequency of 4000 Hz.
   This translates to hardware sample rate of 1377000 instructions per
   micro-second to meet a frequency of 4000 HZ.

2. With first event cycles now sampling into a hardware buffer, an
   interrupt is triggered each time a sampling buffer gets full.
   The interrupt handler is then invoked and debug output shows the
   processing of samples.  The size of one hardware sample is 32 bytes.
   With an interrupt triggered when the hardware buffer page of 4KB
   gets full, the interrupt handler processes 128 samples.
   (This is taken from s390 specific fast debug data gathering)
   2025-11-07 14:35:51.977248  000003ffe013cbfa \
	   perf_event_count_update event->count 0x0 count 0x1502e8
   2025-11-07 14:35:51.977248  000003ffe013cbfa \
	   perf_event_count_update event->count 0x1502e8 count 0x1502e8
   2025-11-07 14:35:51.977248  000003ffe013cbfa \
	   perf_event_count_update event->count 0x2a05d0 count 0x1502e8
   2025-11-07 14:35:51.977252  000003ffe013cbfa \
	   perf_event_count_update event->count 0x3f08b8 count 0x1502e8
   2025-11-07 14:35:51.977252  000003ffe013cbfa \
	   perf_event_count_update event->count 0x540ba0 count 0x1502e8
   2025-11-07 14:35:51.977253  000003ffe013cbfa \
	   perf_event_count_update event->count 0x690e88 count 0x1502e8
   2025-11-07 14:35:51.977254  000003ffe013cbfa \
	   perf_event_count_update event->count 0x7e1170 count 0x1502e8
   2025-11-07 14:35:51.977254  000003ffe013cbfa \
	   perf_event_count_update event->count 0x931458 count 0x1502e8
   2025-11-07 14:35:51.977254  000003ffe013cbfa \
	   perf_event_count_update event->count 0xa81740 count 0x1502e8

3. The value is constantly increasing by the number of instructions
   executed to generate a sample entry.  This is the first line of the
   pairs of lines. count 0x1502e8 --> 1377000

   # perf script | grep 1377000 | wc -l
   214
   # perf script | wc -l
   428
   #
   That is 428 lines in total, and half of the lines contain value
   1377000.

4. The second event cycles is opened against the counting PMU, which
   is an independent PMU and is not interrupt driven.  Once enabled it
   runs in the background and keeps running, incrementing silently
   about 400+ counters. The counter values are read via assembly
   instructions.

   This second counter PMU's read call back function is called when the
   interrupt handler of the sampling facility processes each sample. The
   function call sequence is:

   perf_event_overflow()
   +--> __perf_event_overflow()
        +--> __perf_event_output()
               +--> perf_output_sample()
                    +--> perf_output_read()
                         +--> perf_output_read_group()
	                          for_each_sibling_event(sub, leader) {
		values[n++] = perf_event_count(sub, self);
		printk("%s sub %p values %#lx\n", __func__, sub, values[n-1]);
			          }

   The last function perf_event_count() is invoked on the second event
   cylces *on* the counting PMU. An added printk statement shows the
   following lines in the dmesg output:

   # dmesg|grep perf_output_read_group |head -10
   [  332.368620] perf_output_read_group sub 00000000d80b7c1f values 0x3a80917 (1)
   [  332.368624] perf_output_read_group sub 00000000d80b7c1f values 0x3a86c7f (2)
   [  332.368627] perf_output_read_group sub 00000000d80b7c1f values 0x3a89c15 (3)
   [  332.368629] perf_output_read_group sub 00000000d80b7c1f values 0x3a8c895 (4)
   [  332.368631] perf_output_read_group sub 00000000d80b7c1f values 0x3a8f569 (5)
   [  332.368633] perf_output_read_group sub 00000000d80b7c1f values 0x3a9204b
   [  332.368635] perf_output_read_group sub 00000000d80b7c1f values 0x3a94790
   [  332.368637] perf_output_read_group sub 00000000d80b7c1f values 0x3a9704b
   [  332.368638] perf_output_read_group sub 00000000d80b7c1f values 0x3a99888
   #

   This correlates with the output of
   # perf report -D | grep 'id 00000000000000'|head -10
   ..... id 0000000000000006, value 00000000001502e8, lost 0
   ..... id 000000000000000e, value 0000000003a80917, lost 0 --> line (1) above
   ..... id 0000000000000006, value 00000000002a05d0, lost 0
   ..... id 000000000000000e, value 0000000003a86c7f, lost 0 --> line (2) above
   ..... id 0000000000000006, value 00000000003f08b8, lost 0
   ..... id 000000000000000e, value 0000000003a89c15, lost 0 --> line (3) above
   ..... id 0000000000000006, value 0000000000540ba0, lost 0
   ..... id 000000000000000e, value 0000000003a8c895, lost 0 --> line (4) above
   ..... id 0000000000000006, value 0000000000690e88, lost 0
   ..... id 000000000000000e, value 0000000003a8f569, lost 0 --> line (5) above

Summary:
- Above command starts the CPU sampling facility, with runs interrupt
  driven when a 4KB page is full. An interrupt processes the 128 samples
  and calls eventually perf_output_read_group() for each sample to save it
  in the event's ring buffer.

- At that time the CPU counting facility is invoked to read the value of
  the event cycles. This value is saved as the second value in the
  sample_read structure.

- The first and odd lines in the perf script output displays the period
  value between 2 samples being created by hardware. It is the number
  of instructions executes before the hardware writes a sample.

- The second and even lines in the perf script output displays the number
  of CPU cycles needed to process each sample and save it in the event's
  ring buffer.
These 2 different values can never be identical on s390.

Since event leader sampling is not possible on s390 the perf tool will
return EOPNOTSUPP soon. Perpare the test case for that.

Suggested-by: James Clark <james.clark@linaro.org>
Reviewed-by: Jan Polensky <japo@linux.ibm.com>
Signed-off-by: Thomas Richter <tmricht@linux.ibm.com>
Tested-by: Jan Polensky <japo@linux.ibm.com>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Sumanth Korikkar <sumanthk@linux.ibm.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/tests/shell/record.sh | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/perf/tests/shell/record.sh b/tools/perf/tests/shell/record.sh
index 0f5841c479e7..46b96d565680 100755
--- a/tools/perf/tests/shell/record.sh
+++ b/tools/perf/tests/shell/record.sh
@@ -260,7 +260,21 @@ test_uid() {
 
 test_leader_sampling() {
   echo "Basic leader sampling test"
-  if ! perf record -o "${perfdata}" -e "{cycles,cycles}:Su" -- \
+  events="{cycles,cycles}:Su"
+  [ $(uname -m) = "s390x" ] && {
+    [ ! -d /sys/devices/cpum_sf ] && {
+      echo "No CPUMF [Skipped record]"
+      return
+    }
+    events="{cpum_sf/SF_CYCLES_BASIC/,cycles}:Su"
+    perf record -o "${perfdata}" -e "$events" -- perf test -w brstack 2> /dev/null
+    # Perf grouping might be unsupported, depends on version.
+    [ "$?" -ne 0 ] && {
+      echo "Grouping not support [Skipped record]"
+      return
+    }
+  }
+  if ! perf record -o "${perfdata}" -e "$events" -- \
     perf test -w brstack 2> /dev/null
   then
     echo "Leader sampling [Failed record]"
-- 
cgit v1.2.3


From 920c5570a67549956eb4e6922eb1ed5e32169a0d Mon Sep 17 00:00:00 2001
From: Ricky Ringler <ricky.ringler@proton.me>
Date: Thu, 29 Jan 2026 00:42:27 +0000
Subject: perf sort: Replace static cacheline size with sysconf cacheline size

Testing:
- Built perf
- Executed perf mem record and report

Committer notes:

This addresses a TODO and improves the situation where record and
report/c2c are performed on the same machine or in machines with the
same cacheline size, but the proper way is to store the cacheline size
in the perf.data header at 'record' time and then use it at post
processing time.

Signed-off-by: Ricky Ringler <ricky.ringler@proton.me>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: https://lore.kernel.org/r/20260129004223.26799-1-ricky.ringler@proton.me
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/sort.c | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c
index 01a9d73ae348..42d5cd7ef4e2 100644
--- a/tools/perf/util/sort.c
+++ b/tools/perf/util/sort.c
@@ -2474,8 +2474,7 @@ struct sort_entry sort_type_offset = {
 
 /* --sort typecln */
 
-/* TODO: use actual value in the system */
-#define TYPE_CACHELINE_SIZE  64
+#define DEFAULT_CACHELINE_SIZE 64
 
 static int64_t
 sort__typecln_sort(struct hist_entry *left, struct hist_entry *right)
@@ -2484,6 +2483,10 @@ sort__typecln_sort(struct hist_entry *left, struct hist_entry *right)
 	struct annotated_data_type *right_type = right->mem_type;
 	int64_t left_cln, right_cln;
 	int64_t ret;
+	int cln_size = cacheline_size();
+
+	if (cln_size == 0)
+		cln_size = DEFAULT_CACHELINE_SIZE;
 
 	if (!left_type) {
 		sort__type_init(left);
@@ -2499,8 +2502,8 @@ sort__typecln_sort(struct hist_entry *left, struct hist_entry *right)
 	if (ret)
 		return ret;
 
-	left_cln = left->mem_type_off / TYPE_CACHELINE_SIZE;
-	right_cln = right->mem_type_off / TYPE_CACHELINE_SIZE;
+	left_cln = left->mem_type_off / cln_size;
+	right_cln = right->mem_type_off / cln_size;
 	return left_cln - right_cln;
 }
 
@@ -2508,9 +2511,13 @@ static int hist_entry__typecln_snprintf(struct hist_entry *he, char *bf,
 				     size_t size, unsigned int width __maybe_unused)
 {
 	struct annotated_data_type *he_type = he->mem_type;
+	int cln_size = cacheline_size();
+
+	if (cln_size == 0)
+		cln_size = DEFAULT_CACHELINE_SIZE;
 
 	return repsep_snprintf(bf, size, "%s: cache-line %d", he_type->self.type_name,
-			       he->mem_type_off / TYPE_CACHELINE_SIZE);
+			       he->mem_type_off / cln_size);
 }
 
 struct sort_entry sort_type_cacheline = {
-- 
cgit v1.2.3


From 4479884d1fe4d0746a59fb86eaf925478092e7ed Mon Sep 17 00:00:00 2001
From: "Tycho Andersen (AMD)" <tycho@kernel.org>
Date: Thu, 5 Feb 2026 09:32:50 -0700
Subject: perf lock contention: fix segfault in `lock contention -b/--use-bpf`

When run on a kernel without BTF info, perf crashes:

    libbpf: kernel BTF is missing at '/sys/kernel/btf/vmlinux', was CONFIG_DEBUG_INFO_BTF enabled?
    libbpf: failed to find valid kernel BTF

    Program received signal SIGSEGV, Segmentation fault.
    0x00005555556915b7 in btf.type_cnt ()
    (gdb) bt
    #0  0x00005555556915b7 in btf.type_cnt ()
    #1  0x0000555555691fbc in btf_find_by_name_kind ()
    #2  0x00005555556920d0 in btf.find_by_name_kind ()
    #3  0x00005555558a1b7c in init_numa_data (con=0x7fffffffd0a0) at util/bpf_lock_contention.c:125
    #4  0x00005555558a264b in lock_contention_prepare (con=0x7fffffffd0a0) at util/bpf_lock_contention.c:313
    #5  0x0000555555620702 in __cmd_contention (argc=0, argv=0x7fffffffea10) at builtin-lock.c:2084
    #6  0x0000555555622c8d in cmd_lock (argc=0, argv=0x7fffffffea10) at builtin-lock.c:2755
    #7  0x0000555555651451 in run_builtin (p=0x555556104f00 <commands+576>, argc=3, argv=0x7fffffffea10)
        at perf.c:349
    #8  0x00005555556516ed in handle_internal_command (argc=3, argv=0x7fffffffea10) at perf.c:401
    #9  0x000055555565184e in run_argv (argcp=0x7fffffffe7fc, argv=0x7fffffffe7f0) at perf.c:445
    #10 0x0000555555651b9f in main (argc=3, argv=0x7fffffffea10) at perf.c:553

Check if btf loading failed, and don't do anything with it in
init_numa_data(). This leads to the following error message, instead of
just a crash:

    libbpf: kernel BTF is missing at '/sys/kernel/btf/vmlinux', was CONFIG_DEBUG_INFO_BTF enabled?
    libbpf: failed to find valid kernel BTF
    libbpf: kernel BTF is missing at '/sys/kernel/btf/vmlinux', was CONFIG_DEBUG_INFO_BTF enabled?
    libbpf: failed to find valid kernel BTF
    libbpf: Error loading vmlinux BTF: -ESRCH
    libbpf: failed to load BPF skeleton 'lock_contention_bpf': -ESRCH
    Failed to load lock-contention BPF skeleton
    lock contention BPF setup failed

Signed-off-by: Tycho Andersen (AMD) <tycho@kernel.org>
Acked-by: Namhyung Kim <namhyung@kernel.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Chun-Tse Shao <ctshao@google.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: K Prateek Nayak <kprateek.nayak@amd.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ravi Bangoria <ravi.bangoria@amd.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/bpf_lock_contention.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'tools')

diff --git a/tools/perf/util/bpf_lock_contention.c b/tools/perf/util/bpf_lock_contention.c
index 788d30be2058..cbd7435579fe 100644
--- a/tools/perf/util/bpf_lock_contention.c
+++ b/tools/perf/util/bpf_lock_contention.c
@@ -117,6 +117,9 @@ static void init_numa_data(struct lock_contention *con)
 	long last = -1;
 	int ret;
 
+	if (!con->btf)
+		return;
+
 	/*
 	 * 'struct zone' is embedded in 'struct pglist_data' as an array.
 	 * As we may not have full information of the struct zone in the
-- 
cgit v1.2.3


From a55d4bbbe64494bb92b32402018efb2ffc44d796 Mon Sep 17 00:00:00 2001
From: Ted Logan <tedlogan@fb.com>
Date: Mon, 2 Feb 2026 17:23:53 -0800
Subject: vfio: selftests: only build tests on arm64 and x86_64

Only build vfio self-tests on arm64 and x86_64; these are the only
architectures where the vfio self-tests are run. Addresses compiler
warnings for format and conversions on i386.

Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202601211830.aBEjmEFD-lkp@intel.com/
Signed-off-by: Ted Logan <tedlogan@fb.com>
Reviewed-by: David Matlack <dmatlack@google.com>
Link: https://lore.kernel.org/r/20260202-vfio-selftest-only-64bit-v2-1-9c3ebb37f0f4@fb.com
Signed-off-by: Alex Williamson <alex@shazbot.org>
---
 tools/testing/selftests/vfio/Makefile | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/vfio/Makefile b/tools/testing/selftests/vfio/Makefile
index ead27892ab65..8e90e409e91d 100644
--- a/tools/testing/selftests/vfio/Makefile
+++ b/tools/testing/selftests/vfio/Makefile
@@ -1,3 +1,10 @@
+ARCH ?= $(shell uname -m)
+
+ifeq (,$(filter $(ARCH),arm64 x86_64))
+# Do nothing on unsupported architectures
+include ../lib.mk
+else
+
 CFLAGS = $(KHDR_INCLUDES)
 TEST_GEN_PROGS += vfio_dma_mapping_test
 TEST_GEN_PROGS += vfio_dma_mapping_mmio_test
@@ -28,3 +35,5 @@ TEST_DEP_FILES = $(patsubst %.o, %.d, $(TEST_GEN_PROGS_O) $(LIBVFIO_O))
 -include $(TEST_DEP_FILES)
 
 EXTRA_CLEAN += $(TEST_GEN_PROGS_O) $(TEST_DEP_FILES)
+
+endif
-- 
cgit v1.2.3


From 36a1b0061a584430277861fe5d8bd107aef26137 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 3 Feb 2026 08:43:23 -0800
Subject: perf build: Reduce pmu-events related copying and mkdirs

When building to an output directory the previous code would remove
files and then copy the source files over.

Each source file copy would have a rule to make its directory. All JSON
for every architecture was considered a source file.

This led to unnecessary copying as a file would be deleted and then the
same file copied again, unnecessary directory making, and copying of
files not used in the build.

A side-effect would be a lot of build messages.

This change makes it so that all computed output files are created and
then compared to all files in the OUTPUT directory.

By filtering out the files that would be copied, unnecessary files can
be determined and then deleted - note, this is a phony target which
would remake the pmu-events.c if always depended upon, and so the
dependency is conditional on there being files to remove.

This has some overhead as the $(OUTPUT)/pmu-events is "find" over rather
than just "rm -fr", but the savings from unnecessary copying, etc.
should make up for this new make overhead.

The copy target just does copying but has a dependency on the directory
it needs being built, avoiding repetitive mkdirs.

The source files for copying only consider the JEVENTS_ARCH unless the
JEVENTS_ARCH is all.

The metric JSON is only generated if appropriate, rather than always
being generated and jevents.py deciding whether or not to use the files.

The mypy and pylint targets are fixed as variable names had changed but
the rules not updated.

The line count of a build with "make -C tools/perf O=/tmp/perf clean all"
prior to this change was 2181 lines, after this change it is 1596
lines.

This is a reduction of 585 lines or about 27%.

The generated pmu-events.c for JEVENTS_ARCH "x86" and "all" were
validated as being identical after this change.

Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Leo Yan <leo.yan@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/pmu-events/Build | 217 ++++++++++++++++++++++++++++++--------------
 1 file changed, 149 insertions(+), 68 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/pmu-events/Build b/tools/perf/pmu-events/Build
index ec964ed05974..63c65788d442 100644
--- a/tools/perf/pmu-events/Build
+++ b/tools/perf/pmu-events/Build
@@ -1,63 +1,61 @@
-pmu-events-y	+= pmu-events.o
-JSON		=  $(shell find pmu-events/arch -name '*.json' -o -name '*.csv')
-JSON_DIRS	=  $(shell find pmu-events/arch -type d)
-JDIR_TEST	=  pmu-events/arch/test
-JSON_TEST	=  $(shell [ -d $(JDIR_TEST) ] &&			\
-			find $(JDIR_TEST) -name '*.json')
-JEVENTS_PY	=  pmu-events/jevents.py
-METRIC_PY	=  pmu-events/metric.py
-METRIC_TEST_PY	=  pmu-events/metric_test.py
 EMPTY_PMU_EVENTS_C = pmu-events/empty-pmu-events.c
+# pmu-events.c will be generated by jevents.py or copied from EMPTY_PMU_EVENTS_C
 PMU_EVENTS_C	=  $(OUTPUT)pmu-events/pmu-events.c
-METRIC_TEST_LOG	=  $(OUTPUT)pmu-events/metric_test.log
-TEST_EMPTY_PMU_EVENTS_C = $(OUTPUT)pmu-events/test-empty-pmu-events.c
-EMPTY_PMU_EVENTS_TEST_LOG = $(OUTPUT)pmu-events/empty-pmu-events.log
-LEGACY_CACHE_PY	=  pmu-events/make_legacy_cache.py
-LEGACY_CACHE_JSON = $(OUTPUT)pmu-events/arch/common/common/legacy-cache.json
+pmu-events-y	+= pmu-events.o
 
-ifeq ($(JEVENTS_ARCH),)
-JEVENTS_ARCH=$(SRCARCH)
-endif
-JEVENTS_MODEL ?= all
+# pmu-events.c file is generated in the OUTPUT directory so it needs a
+# separate rule to depend on it properly
+$(OUTPUT)pmu-events/pmu-events.o: $(PMU_EVENTS_C)
+	$(call rule_mkdir)
+	$(call if_changed_dep,cc_o_c)
 
-#
-# Locate/process JSON files in pmu-events/arch/
-# directory and create tables in pmu-events.c.
-#
+# Message for $(call echo-cmd,cp), possibly remove the src file from
+# the destination to save space in the build log.
+quiet_cmd_cp   = COPY    $(patsubst %$<,%,$@) <- $<
 
+# --- NO_JEVENTS=1 build ---
 ifeq ($(NO_JEVENTS),1)
 $(PMU_EVENTS_C): $(EMPTY_PMU_EVENTS_C)
 	$(call rule_mkdir)
-	$(Q)$(call echo-cmd,gen)cp $< $@
+	$(Q)$(call echo-cmd,cp)cp $< $@
 else
-# Functions to extract the model from a extra-metrics.json or extra-metricgroups.json path.
-model_name = $(shell echo $(1)|sed -e 's@.\+/\(.*\)/extra-metric.*\.json@\1@')
-vendor_name = $(shell echo $(1)|sed -e 's@.\+/\(.*\)/[^/]*/extra-metric.*\.json@\1@')
+# --- Regular build ---
 
-# Copy checked-in json to OUTPUT for generation if it's an out of source build
-ifneq ($(OUTPUT),)
-# Remove all output directories when any source directory timestamp changes
-# so there are no stale deleted files
-JSON_DIRS_ROOT = $(OUTPUT)pmu-events/arch/
-$(JSON_DIRS_ROOT): $(JSON_DIRS)
-	$(Q)$(call echo-cmd,gen)rm -rf $@
-	$(Q)mkdir -p $@
+# Setup the JEVENTS_ARCH and JEVENTS_MODEL
+ifeq ($(JEVENTS_ARCH),)
+JEVENTS_ARCH=$(SRCARCH)
+endif
+JEVENTS_MODEL ?= all
 
-$(OUTPUT)pmu-events/arch/%: pmu-events/arch/% $(JSON_DIRS_ROOT)
-	$(call rule_mkdir)
-	$(Q)$(call echo-cmd,gen)cp $< $@
+# The input json/csv files
+SRC_DIR		:= pmu-events/arch
+ifeq ($(JEVENTS_ARCH),all)
+SRC_JSON	:= $(shell find $(SRC_DIR) -name '*.json' -o -name '*.csv')
+else
+SRC_JSON	:= $(shell find $(SRC_DIR)/common $(SRC_DIR)/test $(SRC_DIR)/$(JEVENTS_ARCH) -name '*.json' -o -name '*.csv')
 endif
 
-$(LEGACY_CACHE_JSON): $(LEGACY_CACHE_PY) $(JSON_DIRS_ROOT)
+# Python to build the generic legacy cache events
+LEGACY_CACHE_PY	=  pmu-events/make_legacy_cache.py
+LEGACY_CACHE_JSON = $(OUTPUT)pmu-events/arch/common/common/legacy-cache.json
+GEN_JSON = $(LEGACY_CACHE_JSON)
+
+$(LEGACY_CACHE_JSON): $(LEGACY_CACHE_PY)
 	$(call rule_mkdir)
 	$(Q)$(call echo-cmd,gen)$(PYTHON) $(LEGACY_CACHE_PY) > $@
 
+# Python to generate architectural metrics
 GEN_METRIC_DEPS := pmu-events/metric.py pmu-events/common_metrics.py
+# Functions to extract the model from an extra-metrics.json or extra-metricgroups.json path.
+model_name = $(shell echo $(1)|sed -e 's@.\+/\(.*\)/extra-metric.*\.json@\1@')
+vendor_name = $(shell echo $(1)|sed -e 's@.\+/\(.*\)/[^/]*/extra-metric.*\.json@\1@')
 
+ifeq ($(JEVENTS_ARCH),$(filter $(JEVENTS_ARCH),x86 all))
 # Generate AMD Json
 ZENS = $(shell ls -d pmu-events/arch/x86/amdzen*)
 ZEN_METRICS = $(foreach x,$(ZENS),$(OUTPUT)$(x)/extra-metrics.json)
 ZEN_METRICGROUPS = $(foreach x,$(ZENS),$(OUTPUT)$(x)/extra-metricgroups.json)
+GEN_JSON += $(ZEN_METRICS) $(ZEN_METRICGROUPS)
 
 $(ZEN_METRICS): pmu-events/amd_metrics.py $(GEN_METRIC_DEPS)
 	$(call rule_mkdir)
@@ -67,10 +65,14 @@ $(ZEN_METRICGROUPS): pmu-events/amd_metrics.py $(GEN_METRIC_DEPS)
 	$(call rule_mkdir)
 	$(Q)$(call echo-cmd,gen)$(PYTHON) $< -metricgroups $(call model_name,$@) pmu-events/arch > $@
 
+endif
+
+ifeq ($(JEVENTS_ARCH),$(filter $(JEVENTS_ARCH),arm64 all))
 # Generate ARM Json
 ARMS = $(shell ls -d pmu-events/arch/arm64/arm/*|grep -v cmn)
 ARM_METRICS = $(foreach x,$(ARMS),$(OUTPUT)$(x)/extra-metrics.json)
 ARM_METRICGROUPS = $(foreach x,$(ARMS),$(OUTPUT)$(x)/extra-metricgroups.json)
+GEN_JSON += $(ARM_METRICS) $(ARM_METRICGROUPS)
 
 $(ARM_METRICS): pmu-events/arm64_metrics.py $(GEN_METRIC_DEPS)
 	$(call rule_mkdir)
@@ -80,10 +82,14 @@ $(ARM_METRICGROUPS): pmu-events/arm64_metrics.py $(GEN_METRIC_DEPS)
 	$(call rule_mkdir)
 	$(Q)$(call echo-cmd,gen)$(PYTHON) $< -metricgroups $(call vendor_name,$@) $(call model_name,$@) pmu-events/arch > $@
 
+endif
+
+ifeq ($(JEVENTS_ARCH),$(filter $(JEVENTS_ARCH),x86 all))
 # Generate Intel Json
 INTELS = $(shell ls -d pmu-events/arch/x86/*|grep -v amdzen|grep -v mapfile.csv)
 INTEL_METRICS = $(foreach x,$(INTELS),$(OUTPUT)$(x)/extra-metrics.json)
 INTEL_METRICGROUPS = $(foreach x,$(INTELS),$(OUTPUT)$(x)/extra-metricgroups.json)
+GEN_JSON += $(INTEL_METRICS) $(INTEL_METRICGROUPS)
 
 $(INTEL_METRICS): pmu-events/intel_metrics.py $(GEN_METRIC_DEPS)
 	$(call rule_mkdir)
@@ -93,18 +99,69 @@ $(INTEL_METRICGROUPS): pmu-events/intel_metrics.py $(GEN_METRIC_DEPS)
 	$(call rule_mkdir)
 	$(Q)$(call echo-cmd,gen)$(PYTHON) $< -metricgroups $(call model_name,$@) pmu-events/arch > $@
 
-GEN_JSON = $(patsubst %,$(OUTPUT)%,$(JSON)) \
-            $(LEGACY_CACHE_JSON) \
-            $(JSON_DIRS) \
-            $(ZEN_METRICS) $(ZEN_METRICGROUPS) \
-            $(ARM_METRICS) $(ARM_METRICGROUPS) \
-            $(INTEL_METRICS) $(INTEL_METRICGROUPS)
+endif
+
+OUT_DIR		:= $(OUTPUT)pmu-events/arch
+
+ifeq ($(OUTPUT),)
+OUT_JSON	:= $(SRC_JSON)
+ORPHAN_FILES	:=
+else
+# Things that need to be built in the OUTPUT directory. Note, ensure
+# there is a slash after the directory name so that it matches what
+# $(dir) gives in COPY_RULE.
+OUT_JSON	:= $(patsubst $(SRC_DIR)/%,$(OUT_DIR)/%,$(SRC_JSON))
+OUT_DIRS	:= $(sort $(patsubst %/,%,$(dir $(OUT_JSON))))
+
+# Things already in the OUTPUT directory
+CUR_OUT_JSON	:= $(shell [ -d $(OUT_DIR) ] && find $(OUT_DIR) -type f)
+
+# Things in the OUTPUT directory but shouldn't be there as computed by
+# OUT_JSON and GEN_JSON.
+ORPHAN_FILES	:= $(filter-out $(OUT_JSON) $(GEN_JSON),$(CUR_OUT_JSON))
+
+# Message for $(call echo-cmd,mkd). There is already a mkdir message
+# but it assumes $@ is a file to mkdir the directory for.
+quiet_cmd_mkd  = MKDIR   $@
+
+$(OUT_DIRS):
+	$(Q)$(call echo-cmd,mkd)mkdir -p $@
+
+# Explicitly generate rules to copy SRC_JSON files as $(dir) cannot
+# apply to $@ in a dependency. Exclude from the copy rules any that
+# look like they are copying generated json. This happens as a perf
+# build within the tools/perf directory will leave generated json
+# files within the tree, these then get picked up by SRC_JSON find.
+define COPY_RULE
+$(2): $(1) | $(3)
+	$$(Q)$$(call echo-cmd,cp)cp $(1) $(2)
+endef
+$(foreach src,$(SRC_JSON), \
+    $(eval dest := $(patsubst $(SRC_DIR)/%,$(OUT_DIR)/%,$(src))) \
+    $(eval ddir := $(patsubst %/,%,$(dir $(dest)))) \
+    $(if $(filter $(dest),$(GEN_JSON)),, \
+        $(eval $(call COPY_RULE,$(src),$(dest),$(ddir))) \
+    ) \
+)
+
+endif # ifneq ($(OUTPUT),)
+
+JEVENTS_PY	=  pmu-events/jevents.py
+METRIC_PY	=  pmu-events/metric.py
+
+# Rule to run the metric test.
+METRIC_TEST_PY	=  pmu-events/metric_test.py
+METRIC_TEST_LOG	=  $(OUTPUT)pmu-events/metric_test.log
 
 $(METRIC_TEST_LOG): $(METRIC_TEST_PY) $(METRIC_PY)
 	$(call rule_mkdir)
 	$(Q)$(call echo-cmd,test)$(PYTHON) $< 2> $@ || (cat $@ && false)
 
-$(TEST_EMPTY_PMU_EVENTS_C): $(GEN_JSON) $(JSON_TEST) $(JEVENTS_PY) $(METRIC_PY) $(METRIC_TEST_LOG)
+# Rule to create then ensure the empty-pmu-events.c is in sync.
+TEST_EMPTY_PMU_EVENTS_C = $(OUTPUT)pmu-events/test-empty-pmu-events.c
+EMPTY_PMU_EVENTS_TEST_LOG = $(OUTPUT)pmu-events/empty-pmu-events.log
+
+$(TEST_EMPTY_PMU_EVENTS_C): $(OUT_JSON) $(GEN_JSON) $(JEVENTS_PY) $(METRIC_PY)
 	$(call rule_mkdir)
 	$(Q)$(call echo-cmd,gen)$(PYTHON) $(JEVENTS_PY) none none $(OUTPUT)pmu-events/arch $@
 
@@ -112,36 +169,60 @@ $(EMPTY_PMU_EVENTS_TEST_LOG): $(EMPTY_PMU_EVENTS_C) $(TEST_EMPTY_PMU_EVENTS_C)
 	$(call rule_mkdir)
 	$(Q)$(call echo-cmd,test)diff -u $^ 2> $@ || (cat $@ && false)
 
+
+# Dependencies for jevents.py
+JEVENTS_DEPS := $(OUT_JSON) $(GEN_JSON) $(JEVENTS_PY) $(METRIC_PY) $(EMPTY_PMU_EVENTS_TEST_LOG) $(METRIC_TEST_LOG)
+
+# Rules to run mypy if enabled.
 ifdef MYPY
-  PMU_EVENTS_PY_TESTS := $(wildcard *.py)
-  PMU_EVENTS_MYPY_TEST_LOGS := $(JEVENTS_PY_TESTS:%=%.mypy_log)
-else
-  PMU_EVENTS_MYPY_TEST_LOGS :=
+define MYPY_RULE
+$(2): $(1)
+	$$(Q)$$(call echo-cmd,test)mypy $(1) > $(2) || (cat $(2) && rm $(2) && false)
+endef
+$(foreach src,$(wildcard pmu-events/*.py), \
+    $(eval dest := $(patsubst pmu-events/%,$(OUTPUT)pmu-events/%.mypy_log,$(src))) \
+    $(eval $(call MYPY_RULE,$(src),$(dest))) \
+)
+
+MYPY_INPUTS := $(wildcard pmu-events/*.py)
+MYPY_OUTPUTS := $(patsubst pmu-events/%,$(OUTPUT)pmu-events/%.mypy_log,$(MYPY_INPUTS))
+JEVENTS_DEPS += $(MYPY_OUTPUTS)
 endif
 
-$(OUTPUT)%.mypy_log: %
-	$(call rule_mkdir)
-	$(Q)$(call echo-cmd,test)mypy "$<" > $@ || (cat $@ && rm $@ && false)
-
+# Rules to run pylint if enabled.
 ifdef PYLINT
-  PMU_EVENTS_PY_TESTS := $(wildcard *.py)
-  PMU_EVENTS_PYLINT_TEST_LOGS := $(JEVENTS_PY_TESTS:%=%.pylint_log)
-else
-  PMU_EVENTS_PYLINT_TEST_LOGS :=
+define PYLINT_RULE
+$(2): $(1)
+	$$(Q)$$(call echo-cmd,test)pylint $(1) > $(2) || (cat $(2) && rm $(2) && false)
+endef
+$(foreach src,$(wildcard pmu-events/*.py), \
+    $(eval dest := $(patsubst pmu-events/%,$(OUTPUT)pmu-events/%.pylint_log,$(src))) \
+    $(eval $(call PYLINT_RULE,$(src),$(dest))) \
+)
+
+PYLINT_INPUTS := $(wildcard pmu-events/*.py)
+PYLINT_OUTPUTS := $(patsubst pmu-events/%,$(OUTPUT)pmu-events/%.pylint_log,$(PYLINT_INPUTS))
+JEVENTS_DEPS += $(PYLINT_OUTPUTS)
 endif
 
-$(OUTPUT)%.pylint_log: %
-	$(call rule_mkdir)
-	$(Q)$(call echo-cmd,test)pylint "$<" > $@ || (cat $@ && rm $@ && false)
+# If there are orphaned files remove them.
+ifneq ($(strip $(ORPHAN_FILES)),)
+.PHONY: prune_orphans
 
-$(PMU_EVENTS_C): $(GEN_JSON) $(JSON_TEST) $(JEVENTS_PY) $(METRIC_PY) $(METRIC_TEST_LOG) \
-    $(EMPTY_PMU_EVENTS_TEST_LOG) $(PMU_EVENTS_MYPY_TEST_LOGS) $(PMU_EVENTS_PYLINT_TEST_LOGS)
-	$(call rule_mkdir)
-	$(Q)$(call echo-cmd,gen)$(PYTHON) $(JEVENTS_PY) $(JEVENTS_ARCH) $(JEVENTS_MODEL) $(OUTPUT)pmu-events/arch $@
+# Message for $(call echo-cmd,rm). Generally cleaning files isn't part
+# of a build step.
+quiet_cmd_rm  = RM      $^
+
+prune_orphans: $(ORPHAN_FILES)
+	$(Q)$(call echo-cmd,rm)rm -f $^
+
+JEVENTS_DEPS += prune_orphans
 endif
 
-# pmu-events.c file is generated in the OUTPUT directory so it needs a
-# separate rule to depend on it properly
-$(OUTPUT)pmu-events/pmu-events.o: $(PMU_EVENTS_C)
+# Finally, the rule to build pmu-events.c using jevents.py. All test
+# and inputs are dependencies.
+$(PMU_EVENTS_C): $(JEVENTS_DEPS)
 	$(call rule_mkdir)
-	$(call if_changed_dep,cc_o_c)
+	$(Q)$(call echo-cmd,gen)$(PYTHON) $(JEVENTS_PY) $(JEVENTS_ARCH) $(JEVENTS_MODEL) $(OUT_DIR) $@
+
+endif # ifeq ($(NO_JEVENTS),1)
-- 
cgit v1.2.3


From d652f425d5e332125d358a92158a840084061107 Mon Sep 17 00:00:00 2001
From: Amery Hung <ameryhung@gmail.com>
Date: Thu, 5 Feb 2026 14:29:10 -0800
Subject: selftests/bpf: Update sk_storage_omem_uncharge test

Check sk_omem_alloc when the caller of bpf_local_storage_destroy()
returns. bpf_local_storage_destroy() now returns the memory to uncharge
to the caller instead of directly uncharge. Therefore, in the
sk_storage_omem_uncharge, check sk_omem_alloc when bpf_sk_storage_free()
returns instead of bpf_local_storage_destroy().

Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Amery Hung <ameryhung@gmail.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://patch.msgid.link/20260205222916.1788211-13-ameryhung@gmail.com
---
 tools/testing/selftests/bpf/progs/sk_storage_omem_uncharge.c | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/progs/sk_storage_omem_uncharge.c b/tools/testing/selftests/bpf/progs/sk_storage_omem_uncharge.c
index 46d6eb2a3b17..c8f4815c8dfb 100644
--- a/tools/testing/selftests/bpf/progs/sk_storage_omem_uncharge.c
+++ b/tools/testing/selftests/bpf/progs/sk_storage_omem_uncharge.c
@@ -6,7 +6,6 @@
 #include <bpf/bpf_tracing.h>
 #include <bpf/bpf_core_read.h>
 
-void *local_storage_ptr = NULL;
 void *sk_ptr = NULL;
 int cookie_found = 0;
 __u64 cookie = 0;
@@ -19,21 +18,17 @@ struct {
 	__type(value, int);
 } sk_storage SEC(".maps");
 
-SEC("fexit/bpf_local_storage_destroy")
-int BPF_PROG(bpf_local_storage_destroy, struct bpf_local_storage *local_storage)
+SEC("fexit/bpf_sk_storage_free")
+int BPF_PROG(bpf_sk_storage_free, struct sock *sk)
 {
-	struct sock *sk;
-
-	if (local_storage_ptr != local_storage)
+	if (sk_ptr != sk)
 		return 0;
 
-	sk = bpf_core_cast(sk_ptr, struct sock);
 	if (sk->sk_cookie.counter != cookie)
 		return 0;
 
 	cookie_found++;
 	omem = sk->sk_omem_alloc.counter;
-	local_storage_ptr = NULL;
 
 	return 0;
 }
@@ -50,7 +45,6 @@ int BPF_PROG(inet6_sock_destruct, struct sock *sk)
 	if (value && *value == 0xdeadbeef) {
 		cookie_found++;
 		sk_ptr = sk;
-		local_storage_ptr = sk->sk_bpf_storage;
 	}
 
 	return 0;
-- 
cgit v1.2.3


From e4772031d1053e7640e3094834916ee2605f288f Mon Sep 17 00:00:00 2001
From: Amery Hung <ameryhung@gmail.com>
Date: Thu, 5 Feb 2026 14:29:11 -0800
Subject: selftests/bpf: Update task_local_storage/recursion test

Update the expected result of the selftest as recursion of task local
storage syscall and helpers have been relaxed. Now that the percpu
counter is removed, task local storage helpers, bpf_task_storage_get()
and bpf_task_storage_delete() can now run on the same CPU at the same
time unless they cause deadlock.

Note that since there is no percpu counter preventing recursion in
task local storage helpers, bpf_trampoline now catches the recursion
of on_update as reported by recursion_misses.

on_enter: tp_btf/sys_enter
on_update: fentry/bpf_local_storage_update

           Old behavior                         New behavior
           ____________                         ____________
on_enter                             on_enter
  bpf_task_storage_get(&map_a)         bpf_task_storage_get(&map_a)
    bpf_task_storage_trylock succeed     bpf_local_storage_update(&map_a)
    bpf_local_storage_update(&map_a)

    on_update                            on_update
      bpf_task_storage_get(&map_a)         bpf_task_storage_get(&map_a)
        bpf_task_storage_trylock fail        on_update::misses++ (1)
        return NULL                        create and return map_a::ptr

                                           map_a::ptr += 1 (1)

                                           bpf_task_storage_delete(&map_a)
                                             return 0

      bpf_task_storage_get(&map_b)         bpf_task_storage_get(&map_b)
        bpf_task_storage_trylock fail        on_update::misses++ (2)
        return NULL                        create and return map_b::ptr

                                           map_b::ptr += 1 (1)

    create and return map_a::ptr         create and return map_a::ptr
  map_a::ptr = 200                     map_a::ptr = 200

  bpf_task_storage_get(&map_b)         bpf_task_storage_get(&map_b)
    bpf_task_storage_trylock succeed     lockless lookup succeed
    bpf_local_storage_update(&map_b)     return map_b::ptr

    on_update
      bpf_task_storage_get(&map_a)
        bpf_task_storage_trylock fail
        lockless lookup succeed
        return map_a::ptr

      map_a::ptr += 1 (201)

      bpf_task_storage_delete(&map_a)
        bpf_task_storage_trylock fail
        return -EBUSY
      nr_del_errs++ (1)

      bpf_task_storage_get(&map_b)
        bpf_task_storage_trylock fail
        return NULL

    create and return ptr

  map_b::ptr = 100

Expected result:

map_a::ptr = 201                          map_a::ptr = 200
map_b::ptr = 100                          map_b::ptr = 1
nr_del_err = 1                            nr_del_err = 0
on_update::recursion_misses = 0           on_update::recursion_misses = 2
On_enter::recursion_misses = 0            on_enter::recursion_misses = 0

Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Amery Hung <ameryhung@gmail.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://patch.msgid.link/20260205222916.1788211-14-ameryhung@gmail.com
---
 .../testing/selftests/bpf/prog_tests/task_local_storage.c  | 10 +++++-----
 tools/testing/selftests/bpf/progs/task_ls_recursion.c      | 14 ++------------
 2 files changed, 7 insertions(+), 17 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/task_local_storage.c b/tools/testing/selftests/bpf/prog_tests/task_local_storage.c
index 42e822ea352f..7bee33797c71 100644
--- a/tools/testing/selftests/bpf/prog_tests/task_local_storage.c
+++ b/tools/testing/selftests/bpf/prog_tests/task_local_storage.c
@@ -112,24 +112,24 @@ static void test_recursion(void)
 	task_ls_recursion__detach(skel);
 
 	/* Refer to the comment in BPF_PROG(on_update) for
-	 * the explanation on the value 201 and 100.
+	 * the explanation on the value 200 and 1.
 	 */
 	map_fd = bpf_map__fd(skel->maps.map_a);
 	err = bpf_map_lookup_elem(map_fd, &task_fd, &value);
 	ASSERT_OK(err, "lookup map_a");
-	ASSERT_EQ(value, 201, "map_a value");
-	ASSERT_EQ(skel->bss->nr_del_errs, 1, "bpf_task_storage_delete busy");
+	ASSERT_EQ(value, 200, "map_a value");
+	ASSERT_EQ(skel->bss->nr_del_errs, 0, "bpf_task_storage_delete busy");
 
 	map_fd = bpf_map__fd(skel->maps.map_b);
 	err = bpf_map_lookup_elem(map_fd, &task_fd, &value);
 	ASSERT_OK(err, "lookup map_b");
-	ASSERT_EQ(value, 100, "map_b value");
+	ASSERT_EQ(value, 1, "map_b value");
 
 	prog_fd = bpf_program__fd(skel->progs.on_update);
 	memset(&info, 0, sizeof(info));
 	err = bpf_prog_get_info_by_fd(prog_fd, &info, &info_len);
 	ASSERT_OK(err, "get prog info");
-	ASSERT_EQ(info.recursion_misses, 0, "on_update prog recursion");
+	ASSERT_EQ(info.recursion_misses, 2, "on_update prog recursion");
 
 	prog_fd = bpf_program__fd(skel->progs.on_enter);
 	memset(&info, 0, sizeof(info));
diff --git a/tools/testing/selftests/bpf/progs/task_ls_recursion.c b/tools/testing/selftests/bpf/progs/task_ls_recursion.c
index f1853c38aada..b37359432692 100644
--- a/tools/testing/selftests/bpf/progs/task_ls_recursion.c
+++ b/tools/testing/selftests/bpf/progs/task_ls_recursion.c
@@ -36,14 +36,9 @@ int BPF_PROG(on_update)
 	if (!test_pid || task->pid != test_pid)
 		return 0;
 
+	/* This will succeed as there is no real deadlock */
 	ptr = bpf_task_storage_get(&map_a, task, 0,
 				   BPF_LOCAL_STORAGE_GET_F_CREATE);
-	/* ptr will not be NULL when it is called from
-	 * the bpf_task_storage_get(&map_b,...F_CREATE) in
-	 * the BPF_PROG(on_enter) below.  It is because
-	 * the value can be found in map_a and the kernel
-	 * does not need to acquire any spin_lock.
-	 */
 	if (ptr) {
 		int err;
 
@@ -53,12 +48,7 @@ int BPF_PROG(on_update)
 			nr_del_errs++;
 	}
 
-	/* This will still fail because map_b is empty and
-	 * this BPF_PROG(on_update) has failed to acquire
-	 * the percpu busy lock => meaning potential
-	 * deadlock is detected and it will fail to create
-	 * new storage.
-	 */
+	/* This will succeed as there is no real deadlock */
 	ptr = bpf_task_storage_get(&map_b, task, 0,
 				   BPF_LOCAL_STORAGE_GET_F_CREATE);
 	if (ptr)
-- 
cgit v1.2.3


From 902a79b6389ff39fd736c6fd1581ded1372adbf5 Mon Sep 17 00:00:00 2001
From: Amery Hung <ameryhung@gmail.com>
Date: Thu, 5 Feb 2026 14:29:12 -0800
Subject: selftests/bpf: Update task_local_storage/task_storage_nodeadlock test

Adjust the error code we are checking against as
bpf_task_storage_delete() now returns -EDEADLK or -ETIMEDOUT when
deadlock happens.

Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Amery Hung <ameryhung@gmail.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://patch.msgid.link/20260205222916.1788211-15-ameryhung@gmail.com
---
 tools/testing/selftests/bpf/progs/task_storage_nodeadlock.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/progs/task_storage_nodeadlock.c b/tools/testing/selftests/bpf/progs/task_storage_nodeadlock.c
index 986829aaf73a..6ce98fe9f387 100644
--- a/tools/testing/selftests/bpf/progs/task_storage_nodeadlock.c
+++ b/tools/testing/selftests/bpf/progs/task_storage_nodeadlock.c
@@ -1,15 +1,12 @@
 // SPDX-License-Identifier: GPL-2.0
 
 #include "vmlinux.h"
+#include <errno.h>
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_tracing.h>
 
 char _license[] SEC("license") = "GPL";
 
-#ifndef EBUSY
-#define EBUSY 16
-#endif
-
 extern bool CONFIG_PREEMPTION __kconfig __weak;
 int nr_get_errs = 0;
 int nr_del_errs = 0;
@@ -40,7 +37,7 @@ int BPF_PROG(socket_post_create, struct socket *sock, int family, int type,
 
 	ret = bpf_task_storage_delete(&task_storage,
 				      bpf_get_current_task_btf());
-	if (ret == -EBUSY)
+	if (ret == -EDEADLK || ret == -ETIMEDOUT)
 		__sync_fetch_and_add(&nr_del_errs, 1);
 
 	return 0;
-- 
cgit v1.2.3


From e02cf06b85f8ae337c86db1bad5a0fd54c7bd301 Mon Sep 17 00:00:00 2001
From: Amery Hung <ameryhung@gmail.com>
Date: Thu, 5 Feb 2026 14:29:13 -0800
Subject: selftests/bpf: Remove test_task_storage_map_stress_lookup

Remove a test in test_maps that checks if the updating of the percpu
counter in task local storage map is preemption safe as the percpu
counter is now removed.

Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Amery Hung <ameryhung@gmail.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://patch.msgid.link/20260205222916.1788211-16-ameryhung@gmail.com
---
 .../selftests/bpf/map_tests/task_storage_map.c     | 128 ---------------------
 .../bpf/progs/read_bpf_task_storage_busy.c         |  38 ------
 2 files changed, 166 deletions(-)
 delete mode 100644 tools/testing/selftests/bpf/map_tests/task_storage_map.c
 delete mode 100644 tools/testing/selftests/bpf/progs/read_bpf_task_storage_busy.c

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/map_tests/task_storage_map.c b/tools/testing/selftests/bpf/map_tests/task_storage_map.c
deleted file mode 100644
index a4121d2248ac..000000000000
--- a/tools/testing/selftests/bpf/map_tests/task_storage_map.c
+++ /dev/null
@@ -1,128 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2022. Huawei Technologies Co., Ltd */
-#define _GNU_SOURCE
-#include <sched.h>
-#include <unistd.h>
-#include <stdlib.h>
-#include <stdbool.h>
-#include <errno.h>
-#include <string.h>
-#include <pthread.h>
-
-#include <bpf/bpf.h>
-#include <bpf/libbpf.h>
-
-#include "bpf_util.h"
-#include "test_maps.h"
-#include "task_local_storage_helpers.h"
-#include "read_bpf_task_storage_busy.skel.h"
-
-struct lookup_ctx {
-	bool start;
-	bool stop;
-	int pid_fd;
-	int map_fd;
-	int loop;
-};
-
-static void *lookup_fn(void *arg)
-{
-	struct lookup_ctx *ctx = arg;
-	long value;
-	int i = 0;
-
-	while (!ctx->start)
-		usleep(1);
-
-	while (!ctx->stop && i++ < ctx->loop)
-		bpf_map_lookup_elem(ctx->map_fd, &ctx->pid_fd, &value);
-	return NULL;
-}
-
-static void abort_lookup(struct lookup_ctx *ctx, pthread_t *tids, unsigned int nr)
-{
-	unsigned int i;
-
-	ctx->stop = true;
-	ctx->start = true;
-	for (i = 0; i < nr; i++)
-		pthread_join(tids[i], NULL);
-}
-
-void test_task_storage_map_stress_lookup(void)
-{
-#define MAX_NR_THREAD 4096
-	unsigned int i, nr = 256, loop = 8192, cpu = 0;
-	struct read_bpf_task_storage_busy *skel;
-	pthread_t tids[MAX_NR_THREAD];
-	struct lookup_ctx ctx;
-	cpu_set_t old, new;
-	const char *cfg;
-	int err;
-
-	cfg = getenv("TASK_STORAGE_MAP_NR_THREAD");
-	if (cfg) {
-		nr = atoi(cfg);
-		if (nr > MAX_NR_THREAD)
-			nr = MAX_NR_THREAD;
-	}
-	cfg = getenv("TASK_STORAGE_MAP_NR_LOOP");
-	if (cfg)
-		loop = atoi(cfg);
-	cfg = getenv("TASK_STORAGE_MAP_PIN_CPU");
-	if (cfg)
-		cpu = atoi(cfg);
-
-	skel = read_bpf_task_storage_busy__open_and_load();
-	err = libbpf_get_error(skel);
-	CHECK(err, "open_and_load", "error %d\n", err);
-
-	/* Only for a fully preemptible kernel */
-	if (!skel->kconfig->CONFIG_PREEMPTION) {
-		printf("%s SKIP (no CONFIG_PREEMPTION)\n", __func__);
-		read_bpf_task_storage_busy__destroy(skel);
-		skips++;
-		return;
-	}
-
-	/* Save the old affinity setting */
-	sched_getaffinity(getpid(), sizeof(old), &old);
-
-	/* Pinned on a specific CPU */
-	CPU_ZERO(&new);
-	CPU_SET(cpu, &new);
-	sched_setaffinity(getpid(), sizeof(new), &new);
-
-	ctx.start = false;
-	ctx.stop = false;
-	ctx.pid_fd = sys_pidfd_open(getpid(), 0);
-	ctx.map_fd = bpf_map__fd(skel->maps.task);
-	ctx.loop = loop;
-	for (i = 0; i < nr; i++) {
-		err = pthread_create(&tids[i], NULL, lookup_fn, &ctx);
-		if (err) {
-			abort_lookup(&ctx, tids, i);
-			CHECK(err, "pthread_create", "error %d\n", err);
-			goto out;
-		}
-	}
-
-	ctx.start = true;
-	for (i = 0; i < nr; i++)
-		pthread_join(tids[i], NULL);
-
-	skel->bss->pid = getpid();
-	err = read_bpf_task_storage_busy__attach(skel);
-	CHECK(err, "attach", "error %d\n", err);
-
-	/* Trigger program */
-	sys_gettid();
-	skel->bss->pid = 0;
-
-	CHECK(skel->bss->busy != 0, "bad bpf_task_storage_busy", "got %d\n", skel->bss->busy);
-out:
-	read_bpf_task_storage_busy__destroy(skel);
-	/* Restore affinity setting */
-	sched_setaffinity(getpid(), sizeof(old), &old);
-	printf("%s:PASS\n", __func__);
-}
diff --git a/tools/testing/selftests/bpf/progs/read_bpf_task_storage_busy.c b/tools/testing/selftests/bpf/progs/read_bpf_task_storage_busy.c
deleted file mode 100644
index 69da05bb6c63..000000000000
--- a/tools/testing/selftests/bpf/progs/read_bpf_task_storage_busy.c
+++ /dev/null
@@ -1,38 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2022. Huawei Technologies Co., Ltd */
-#include "vmlinux.h"
-#include <bpf/bpf_helpers.h>
-#include <bpf/bpf_tracing.h>
-
-extern bool CONFIG_PREEMPTION __kconfig __weak;
-extern const int bpf_task_storage_busy __ksym;
-
-char _license[] SEC("license") = "GPL";
-
-int pid = 0;
-int busy = 0;
-
-struct {
-	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
-	__uint(map_flags, BPF_F_NO_PREALLOC);
-	__type(key, int);
-	__type(value, long);
-} task SEC(".maps");
-
-SEC("raw_tp/sys_enter")
-int BPF_PROG(read_bpf_task_storage_busy)
-{
-	int *value;
-
-	if (!CONFIG_PREEMPTION)
-		return 0;
-
-	if (bpf_get_current_pid_tgid() >> 32 != pid)
-		return 0;
-
-	value = bpf_this_cpu_ptr(&bpf_task_storage_busy);
-	if (value)
-		busy = *value;
-
-	return 0;
-}
-- 
cgit v1.2.3


From cdce7b0848f6f2be4c6d7dbf243244981d315f6f Mon Sep 17 00:00:00 2001
From: Amery Hung <ameryhung@gmail.com>
Date: Thu, 5 Feb 2026 14:29:14 -0800
Subject: selftests/bpf: Choose another percpu variable in bpf for btf_dump
 test

bpf_cgrp_storage_busy has been removed. Use bpf_bprintf_nest_level
instead. This percpu variable is also in the bpf subsystem so that
if it is removed in the future, BPF-CI will catch this type of CI-
breaking change.

Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Amery Hung <ameryhung@gmail.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://patch.msgid.link/20260205222916.1788211-17-ameryhung@gmail.com
---
 tools/testing/selftests/bpf/prog_tests/btf_dump.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/btf_dump.c b/tools/testing/selftests/bpf/prog_tests/btf_dump.c
index 10cba526d3e6..f1642794f70e 100644
--- a/tools/testing/selftests/bpf/prog_tests/btf_dump.c
+++ b/tools/testing/selftests/bpf/prog_tests/btf_dump.c
@@ -875,8 +875,8 @@ static void test_btf_dump_var_data(struct btf *btf, struct btf_dump *d,
 	TEST_BTF_DUMP_VAR(btf, d, NULL, str, "cpu_number", int, BTF_F_COMPACT,
 			  "int cpu_number = (int)100", 100);
 #endif
-	TEST_BTF_DUMP_VAR(btf, d, NULL, str, "bpf_cgrp_storage_busy", int, BTF_F_COMPACT,
-			  "static int bpf_cgrp_storage_busy = (int)2", 2);
+	TEST_BTF_DUMP_VAR(btf, d, NULL, str, "bpf_bprintf_nest_level", int, BTF_F_COMPACT,
+			  "static int bpf_bprintf_nest_level = (int)2", 2);
 }
 
 struct btf_dump_string_ctx {
-- 
cgit v1.2.3


From 97b859b5ed04dbbe99be19895d8498009a19553f Mon Sep 17 00:00:00 2001
From: Amery Hung <ameryhung@gmail.com>
Date: Thu, 5 Feb 2026 14:29:15 -0800
Subject: selftests/bpf: Fix outdated test on storage->smap

bpf_local_storage_free() already does not rely on local_storage->smap
since switching to kmalloc_nolock(). As local_storage->smap is removed,
fix the outdated test by dropping the local_storage->smap check. Keep
the second map in task local storage map test to test that multiple
elements can be added to the storage similar to sk storage test.

Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Amery Hung <ameryhung@gmail.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://patch.msgid.link/20260205222916.1788211-18-ameryhung@gmail.com
---
 tools/testing/selftests/bpf/progs/local_storage.c | 19 ++-----------------
 1 file changed, 2 insertions(+), 17 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/progs/local_storage.c b/tools/testing/selftests/bpf/progs/local_storage.c
index 637e75df2e14..d0be77011a84 100644
--- a/tools/testing/selftests/bpf/progs/local_storage.c
+++ b/tools/testing/selftests/bpf/progs/local_storage.c
@@ -62,7 +62,6 @@ SEC("lsm/inode_unlink")
 int BPF_PROG(unlink_hook, struct inode *dir, struct dentry *victim)
 {
 	__u32 pid = bpf_get_current_pid_tgid() >> 32;
-	struct bpf_local_storage *local_storage;
 	struct local_storage *storage;
 	struct task_struct *task;
 	bool is_self_unlink;
@@ -88,15 +87,10 @@ int BPF_PROG(unlink_hook, struct inode *dir, struct dentry *victim)
 	if (!storage || storage->value)
 		return 0;
 
-	if (bpf_task_storage_delete(&task_storage_map, task))
+	if (bpf_task_storage_delete(&task_storage_map2, task))
 		return 0;
 
-	/* Ensure that the task_storage_map is disconnected from the storage.
-	 * The storage memory should not be freed back to the
-	 * bpf_mem_alloc.
-	 */
-	local_storage = task->bpf_storage;
-	if (!local_storage || local_storage->smap)
+	if (bpf_task_storage_delete(&task_storage_map, task))
 		return 0;
 
 	task_storage_result = 0;
@@ -164,18 +158,9 @@ int BPF_PROG(socket_bind, struct socket *sock, struct sockaddr *address,
 	if (bpf_sk_storage_delete(&sk_storage_map2, sk))
 		return 0;
 
-	storage = bpf_sk_storage_get(&sk_storage_map2, sk, 0,
-				     BPF_LOCAL_STORAGE_GET_F_CREATE);
-	if (!storage)
-		return 0;
-
 	if (bpf_sk_storage_delete(&sk_storage_map, sk))
 		return 0;
 
-	/* Ensure that the sk_storage_map is disconnected from the storage. */
-	if (!sk->sk_bpf_storage || sk->sk_bpf_storage->smap)
-		return 0;
-
 	sk_storage_result = 0;
 	return 0;
 }
-- 
cgit v1.2.3


From 53e553369167d361bdd550d194122ac7cdb00f3c Mon Sep 17 00:00:00 2001
From: "Matthieu Baerts (NGI0)" <matttbe@kernel.org>
Date: Thu, 5 Feb 2026 18:34:24 +0100
Subject: selftests: mptcp: connect: fix maybe-uninitialize warn
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This warning can be seen with GCC 15.2:

  mptcp_connect.c: In function ‘main_loop’:
  mptcp_connect.c:1422:37: warning: ‘peer’ may be used uninitialized [-Wmaybe-uninitialized]
   1422 |                 if (connect(fd, peer->ai_addr, peer->ai_addrlen))
        |                                 ~~~~^~~~~~~~~
  mptcp_connect.c:1377:26: note: ‘peer’ was declared here
   1377 |         struct addrinfo *peer;
        |                          ^~~~

This variable is set in sock_connect_mptcp() in some conditions. If not,
this helper returns an error, and the program stops. So this is a false
positive, but better removing it by initialising peer to NULL.

Reviewed-by: Geliang Tang <geliang@kernel.org>
Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Link: https://patch.msgid.link/20260205-net-mptcp-misc-fixes-6-19-rc8-v2-4-c2720ce75c34@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/mptcp/mptcp_connect.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.c b/tools/testing/selftests/net/mptcp/mptcp_connect.c
index 10f6f99cfd4e..24b4abac8687 100644
--- a/tools/testing/selftests/net/mptcp/mptcp_connect.c
+++ b/tools/testing/selftests/net/mptcp/mptcp_connect.c
@@ -1296,8 +1296,8 @@ void xdisconnect(int fd)
 
 int main_loop(void)
 {
+	struct addrinfo *peer = NULL;
 	int fd = 0, ret, fd_in = 0;
-	struct addrinfo *peer;
 	struct wstate winfo;
 
 	if (cfg_input && cfg_sockopt_types.mptfo) {
-- 
cgit v1.2.3


From c60ee958d625998422ff833ec0de0dcafc1165fa Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Sat, 7 Feb 2026 15:04:52 -0800
Subject: perf test record.sh: Fix shellcheck warning

Add quotes to avoid the following warning:
```
In tests/shell/record.sh line 264:
 [ $(uname -m) = "s390x" ] && {
   ^---------^ SC2046 (warning): Quote this to prevent word splitting.

For more information:
 https://www.shellcheck.net/wiki/SC2046 -- Quote this to prevent word splitt...
```

Fixes: c73a56ed3c97ae65 ("perf test: Fix test case Leader sampling on s390")
Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Thomas Richter <tmricht@linux.ibm.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/tests/shell/record.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/perf/tests/shell/record.sh b/tools/perf/tests/shell/record.sh
index 46b96d565680..7cb81cf3444a 100755
--- a/tools/perf/tests/shell/record.sh
+++ b/tools/perf/tests/shell/record.sh
@@ -261,7 +261,7 @@ test_uid() {
 test_leader_sampling() {
   echo "Basic leader sampling test"
   events="{cycles,cycles}:Su"
-  [ $(uname -m) = "s390x" ] && {
+  [ "$(uname -m)" = "s390x" ] && {
     [ ! -d /sys/devices/cpum_sf ] && {
       echo "No CPUMF [Skipped record]"
       return
-- 
cgit v1.2.3


From ff9aeb6bd14dbc70651971c81e81fa8269c3101a Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Sat, 7 Feb 2026 21:54:55 -0800
Subject: perf test parse-metric: Ensure aggregate counts appear to have run

Commit bb5a920b90991279 ("perf stat: Ensure metrics are displayed even
with failed events") with failed events") made it so that counters which
weren't enabled in the kernel were handled as NaN in metrics.

This caused the "Parse and process metrics" test to start failing as it
wasn't putting a non-zero value in these variables.

Add arbitrary values of 1 to fix the test.

Fixes: bb5a920b90991279 ("perf stat: Ensure metrics are displayed even with failed events")
Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Chun-Tse Shao <ctshao@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Yang Li <yang.lee@linux.alibaba.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/tests/parse-metric.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'tools')

diff --git a/tools/perf/tests/parse-metric.c b/tools/perf/tests/parse-metric.c
index 6bbc209a5c6a..7c7f489a5eb0 100644
--- a/tools/perf/tests/parse-metric.c
+++ b/tools/perf/tests/parse-metric.c
@@ -41,6 +41,8 @@ static void load_runtime_stat(struct evlist *evlist, struct value *vals)
 		count = find_value(evsel->name, vals);
 		evsel->supported = true;
 		evsel->stats->aggr->counts.val = count;
+		evsel->stats->aggr->counts.ena = 1;
+		evsel->stats->aggr->counts.run = 1;
 	}
 }
 
-- 
cgit v1.2.3


From 6a32fa5ccd33da5d187ec6e78f3b45683399ab66 Mon Sep 17 00:00:00 2001
From: Dmitrii Dolgov <9erthalion6@gmail.com>
Date: Sun, 8 Feb 2026 13:22:23 +0100
Subject: tools build: Add a feature test for rust compiler

Add a feature test to identify if the rust compiler is available, so
that perf could build rust based worloads based on that.

Signed-off-by: Dmitrii Dolgov <9erthalion6@gmail.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/build/Makefile.feature     |  6 ++++--
 tools/build/feature/Makefile     |  7 +++++++
 tools/build/feature/test-rust.rs |  4 ++++
 tools/perf/Makefile.config       | 11 +++++++++++
 tools/perf/builtin-check.c       |  1 +
 5 files changed, 27 insertions(+), 2 deletions(-)
 create mode 100644 tools/build/feature/test-rust.rs

(limited to 'tools')

diff --git a/tools/build/Makefile.feature b/tools/build/Makefile.feature
index 7f119eafc7c4..64d21152fc81 100644
--- a/tools/build/Makefile.feature
+++ b/tools/build/Makefile.feature
@@ -100,7 +100,8 @@ FEATURE_TESTS_BASIC :=                  \
         disassembler-four-args		\
         disassembler-init-styled	\
         file-handle			\
-        libopenssl
+        libopenssl			\
+        rust
 
 # FEATURE_TESTS_BASIC + FEATURE_TESTS_EXTRA is the complete list
 # of all feature tests
@@ -149,7 +150,8 @@ FEATURE_DISPLAY ?=              \
          bpf			\
          libaio			\
          libzstd		\
-         libopenssl
+         libopenssl		\
+         rust
 
 #
 # Declare group members of a feature to display the logical OR of the detection
diff --git a/tools/build/feature/Makefile b/tools/build/feature/Makefile
index 5c15572d505e..9ae69d857166 100644
--- a/tools/build/feature/Makefile
+++ b/tools/build/feature/Makefile
@@ -112,6 +112,9 @@ __BUILD = $(CC) $(CFLAGS) -MD -Wall -Werror -o $@ $(patsubst %.bin,%.c,$(@F)) $(
 __BUILDXX = $(CXX) $(CXXFLAGS) -MD -Wall -Werror -o $@ $(patsubst %.bin,%.cpp,$(@F)) $(LDFLAGS)
   BUILDXX = $(__BUILDXX) > $(@:.bin=.make.output) 2>&1
 
+__BUILDRS = $(RUSTC) $(RUSTC_FLAGS) -o $@ $(patsubst %.bin,%.rs,$(@F))
+  BUILDRS = $(__BUILDRS) > $(@:.bin=.make.output) 2>&1
+
 ###############################
 
 $(OUTPUT)test-all.bin:
@@ -388,6 +391,10 @@ $(OUTPUT)test-libopenssl.bin:
 $(OUTPUT)test-bpftool-skeletons.bin:
 	$(SYSTEM_BPFTOOL) version | grep '^features:.*skeletons' \
 		> $(@:.bin=.make.output) 2>&1
+
+$(OUTPUT)test-rust.bin:
+	$(BUILDRS) > $(@:.bin=.make.output) 2>&1
+
 ###############################
 
 clean:
diff --git a/tools/build/feature/test-rust.rs b/tools/build/feature/test-rust.rs
new file mode 100644
index 000000000000..f2fc91cc4f69
--- /dev/null
+++ b/tools/build/feature/test-rust.rs
@@ -0,0 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
+fn main() {
+    println!("hi")
+}
diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config
index b683aab3ab97..94aecfe38b95 100644
--- a/tools/perf/Makefile.config
+++ b/tools/perf/Makefile.config
@@ -1153,6 +1153,17 @@ ifneq ($(NO_LIBTRACEEVENT),1)
   endif
 endif
 
+ifndef NO_RUST
+  ifneq ($(feature-rust), 1)
+    $(warning Rust is not found. Test workloads with rust are disabled.)
+    NO_RUST := 1
+  else
+    NO_RUST := 0
+    CFLAGS += -DHAVE_RUST_SUPPORT
+    $(call detected,CONFIG_RUST_SUPPORT)
+  endif
+endif
+
 # Among the variables below, these:
 #   perfexecdir
 #   libbpf_include_dir
diff --git a/tools/perf/builtin-check.c b/tools/perf/builtin-check.c
index d19769a8f689..27a41beeaddf 100644
--- a/tools/perf/builtin-check.c
+++ b/tools/perf/builtin-check.c
@@ -60,6 +60,7 @@ struct feature_status supported_features[] = {
 	FEATURE_STATUS("numa_num_possible_cpus", HAVE_LIBNUMA_SUPPORT),
 	FEATURE_STATUS("zlib", HAVE_ZLIB_SUPPORT),
 	FEATURE_STATUS("zstd", HAVE_ZSTD_SUPPORT),
+	FEATURE_STATUS("rust", HAVE_RUST_SUPPORT),
 
 	/* this should remain at end, to know the array end */
 	FEATURE_STATUS(NULL, _)
-- 
cgit v1.2.3


From 2e05bb52a12d3cdb81f3b6f5de5cb3905d383552 Mon Sep 17 00:00:00 2001
From: Dmitrii Dolgov <9erthalion6@gmail.com>
Date: Sun, 8 Feb 2026 13:22:24 +0100
Subject: perf test workload: Add code_with_type test workload

The purpose of the workload is to gather samples of rust runtime. To
achieve that it has a dummy rust library linked with it.

Per recommendations for such scenarios [1], the rust library is
statically linked.

An example:

$ perf record perf test -w code_with_type
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 0.160 MB perf.data (4074 samples) ]

$ perf report --stdio --dso perf -s srcfile,srcline
    45.16%  ub_checks.rs       ub_checks.rs:72
     6.72%  code_with_type.rs  code_with_type.rs:15
     6.64%  range.rs           range.rs:767
     4.26%  code_with_type.rs  code_with_type.rs:21
     4.23%  range.rs           range.rs:0
     3.99%  code_with_type.rs  code_with_type.rs:16
    [...]

[1]: https://doc.rust-lang.org/reference/linkage.html#mixed-rust-and-foreign-codebases

Signed-off-by: Dmitrii Dolgov <9erthalion6@gmail.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/build/Makefile.build                   | 14 +++++++++
 tools/perf/Makefile.perf                     |  2 +-
 tools/perf/tests/builtin-test.c              |  4 +++
 tools/perf/tests/tests.h                     |  4 +++
 tools/perf/tests/workloads/Build             |  5 +++
 tools/perf/tests/workloads/code_with_type.c  | 46 ++++++++++++++++++++++++++++
 tools/perf/tests/workloads/code_with_type.rs | 23 ++++++++++++++
 tools/scripts/Makefile.include               |  2 ++
 8 files changed, 99 insertions(+), 1 deletion(-)
 create mode 100644 tools/perf/tests/workloads/code_with_type.c
 create mode 100644 tools/perf/tests/workloads/code_with_type.rs

(limited to 'tools')

diff --git a/tools/build/Makefile.build b/tools/build/Makefile.build
index 3584ff308607..60e65870eae1 100644
--- a/tools/build/Makefile.build
+++ b/tools/build/Makefile.build
@@ -76,6 +76,14 @@ quiet_cmd_host_ld_multi = HOSTLD  $@
       cmd_host_ld_multi = $(if $(strip $(obj-y)),\
                           $(HOSTLD) -r -o $@  $(filter $(obj-y),$^),rm -f $@; $(HOSTAR) rcs $@)
 
+rust_common_cmd = \
+	$(RUSTC) $(rust_flags) \
+	--crate-type staticlib -L $(objtree)/rust/ \
+	--emit=dep-info=$(depfile),link
+
+quiet_cmd_rustc_a_rs = $(RUSTC) $(quiet_modtag) $@
+      cmd_rustc_a_rs = $(rust_common_cmd) -o $@ -g $< $(cmd_objtool)
+
 ifneq ($(filter $(obj),$(hostprogs)),)
   host = host_
 endif
@@ -105,6 +113,12 @@ $(OUTPUT)%.s: %.c FORCE
 	$(call rule_mkdir)
 	$(call if_changed_dep,cc_s_c)
 
+# it's recommended to build a static rust library, when a foreight (to rust)
+# linker is used.
+$(OUTPUT)%.a: %.rs FORCE
+	$(call rule_mkdir)
+	$(call if_changed_dep,rustc_a_rs)
+
 # bison and flex files are generated in the OUTPUT directory
 # so it needs a separate rule to depend on them properly
 $(OUTPUT)%-bison.o: $(OUTPUT)%-bison.c FORCE
diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf
index 2a7e5814b159..a6d8ca3e9233 100644
--- a/tools/perf/Makefile.perf
+++ b/tools/perf/Makefile.perf
@@ -271,7 +271,7 @@ ifeq ($(PYLINT),1)
   PYLINT := $(shell which pylint 2> /dev/null)
 endif
 
-export srctree OUTPUT RM CC CXX LD AR CFLAGS CXXFLAGS V BISON FLEX AWK
+export srctree OUTPUT RM CC CXX RUSTC LD AR CFLAGS CXXFLAGS V BISON FLEX AWK
 export HOSTCC HOSTLD HOSTAR HOSTCFLAGS SHELLCHECK MYPY PYLINT
 
 include $(srctree)/tools/build/Makefile.include
diff --git a/tools/perf/tests/builtin-test.c b/tools/perf/tests/builtin-test.c
index e2490652f030..06507066213b 100644
--- a/tools/perf/tests/builtin-test.c
+++ b/tools/perf/tests/builtin-test.c
@@ -154,6 +154,10 @@ static struct test_workload *workloads[] = {
 	&workload__landlock,
 	&workload__traploop,
 	&workload__inlineloop,
+
+#ifdef HAVE_RUST_SUPPORT
+	&workload__code_with_type,
+#endif
 };
 
 #define workloads__for_each(workload) \
diff --git a/tools/perf/tests/tests.h b/tools/perf/tests/tests.h
index 1f0f8b267fb1..f5f1238d1f7f 100644
--- a/tools/perf/tests/tests.h
+++ b/tools/perf/tests/tests.h
@@ -242,6 +242,10 @@ DECLARE_WORKLOAD(landlock);
 DECLARE_WORKLOAD(traploop);
 DECLARE_WORKLOAD(inlineloop);
 
+#ifdef HAVE_RUST_SUPPORT
+DECLARE_WORKLOAD(code_with_type);
+#endif
+
 extern const char *dso_to_test;
 extern const char *test_objdump_path;
 
diff --git a/tools/perf/tests/workloads/Build b/tools/perf/tests/workloads/Build
index 866a00bd14a0..2ef97f7affce 100644
--- a/tools/perf/tests/workloads/Build
+++ b/tools/perf/tests/workloads/Build
@@ -10,6 +10,11 @@ perf-test-y += landlock.o
 perf-test-y += traploop.o
 perf-test-y += inlineloop.o
 
+ifeq ($(CONFIG_RUST_SUPPORT),y)
+    perf-test-y += code_with_type.o
+    perf-test-y += code_with_type.a
+endif
+
 CFLAGS_sqrtloop.o         = -g -O0 -fno-inline -U_FORTIFY_SOURCE
 CFLAGS_leafloop.o         = -g -O0 -fno-inline -fno-omit-frame-pointer -U_FORTIFY_SOURCE
 CFLAGS_brstack.o          = -g -O0 -fno-inline -U_FORTIFY_SOURCE
diff --git a/tools/perf/tests/workloads/code_with_type.c b/tools/perf/tests/workloads/code_with_type.c
new file mode 100644
index 000000000000..65d7be7dac24
--- /dev/null
+++ b/tools/perf/tests/workloads/code_with_type.c
@@ -0,0 +1,46 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <pthread.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <unistd.h>
+#include <linux/compiler.h>
+#include "../tests.h"
+
+extern void test_rs(uint count);
+
+static volatile sig_atomic_t done;
+
+static void sighandler(int sig __maybe_unused)
+{
+	done = 1;
+}
+
+static int code_with_type(int argc, const char **argv)
+{
+	int sec = 1, num_loops = 100;
+
+	pthread_setname_np(pthread_self(), "perf-code-with-type");
+	if (argc > 0)
+		sec = atoi(argv[0]);
+
+	if (argc > 1)
+		num_loops = atoi(argv[1]);
+
+	signal(SIGINT, sighandler);
+	signal(SIGALRM, sighandler);
+	alarm(sec);
+
+	/*
+	 * Rust doesn't have signal management in the standard library. To
+	 * not deal with any external crates, offload signal handling to the
+	 * outside code.
+	 */
+	while (!done) {
+		test_rs(num_loops);
+		continue;
+	}
+
+	return 0;
+}
+
+DEFINE_WORKLOAD(code_with_type);
diff --git a/tools/perf/tests/workloads/code_with_type.rs b/tools/perf/tests/workloads/code_with_type.rs
new file mode 100644
index 000000000000..3b91e51919dd
--- /dev/null
+++ b/tools/perf/tests/workloads/code_with_type.rs
@@ -0,0 +1,23 @@
+// SPDX-License-Identifier: GPL-2.0
+
+// We're going to look for this structure in the data type profiling report
+#[allow(dead_code)]
+struct Buf {
+    data1: u64,
+    data2: String,
+    data3: u64,
+}
+
+#[no_mangle]
+pub extern "C" fn test_rs(count: u32) {
+    let mut b =  Buf { data1: 0, data2: String::from("data"), data3: 0};
+
+    for _ in 1..count {
+        b.data1 += 1;
+        if b.data1 == 123 {
+            b.data1 += 1;
+        }
+
+        b.data3 += b.data1;
+    }
+}
diff --git a/tools/scripts/Makefile.include b/tools/scripts/Makefile.include
index ded48263dd5e..b5ecf137febc 100644
--- a/tools/scripts/Makefile.include
+++ b/tools/scripts/Makefile.include
@@ -94,6 +94,8 @@ LLVM_STRIP	?= llvm-strip
 # Some tools require bpftool
 SYSTEM_BPFTOOL	?= bpftool
 
+RUSTC		?= rustc
+
 ifeq ($(CC_NO_CLANG), 1)
 EXTRA_WARNINGS += -Wstrict-aliasing=3
 
-- 
cgit v1.2.3


From f60a5c22967b845d5319d4f447cb28190021795c Mon Sep 17 00:00:00 2001
From: Dmitrii Dolgov <9erthalion6@gmail.com>
Date: Sun, 8 Feb 2026 13:22:25 +0100
Subject: perf tests: Test annotate with data type profiling and rust

Exercise the annotate command with data type profiling feature on the
rust runtime. For that add a new shell test, which will profile the
code_with_type workload, then annotate the result expecting to see some
data structures from the rust code.

Committer testing:

  root@number:~# perf test 'perf data type profiling tests'
   83: perf data type profiling tests                            : Ok
  root@number:~# perf test -v 'perf data type profiling tests'
   83: perf data type profiling tests                            : Ok
  root@number:~# perf test -vv 'perf data type profiling tests'
   83: perf data type profiling tests:
  --- start ---
  test child forked, pid 111044
  Basic perf annotate test
  Basic annotate test [Success]
  Pipe perf annotate test
  Pipe annotate test [Success]
  ---- end(0) ----
   83: perf data type profiling tests                            : Ok
  root@number:~#

Signed-off-by: Dmitrii Dolgov <9erthalion6@gmail.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/tests/shell/data_type_profiling.sh | 69 +++++++++++++++++++++++++++
 1 file changed, 69 insertions(+)
 create mode 100755 tools/perf/tests/shell/data_type_profiling.sh

(limited to 'tools')

diff --git a/tools/perf/tests/shell/data_type_profiling.sh b/tools/perf/tests/shell/data_type_profiling.sh
new file mode 100755
index 000000000000..cdc9adb7d708
--- /dev/null
+++ b/tools/perf/tests/shell/data_type_profiling.sh
@@ -0,0 +1,69 @@
+#!/bin/bash
+# perf data type profiling tests
+# SPDX-License-Identifier: GPL-2.0
+
+set -e
+
+# The logic below follows the same line as the annotate test, but looks for a
+# data type profiling manifestation
+testtype="# data-type: struct Buf"
+
+err=0
+perfdata=$(mktemp /tmp/__perf_test.perf.data.XXXXX)
+perfout=$(mktemp /tmp/__perf_test.perf.out.XXXXX)
+testprog="perf test -w code_with_type"
+
+cleanup() {
+  rm -rf "${perfdata}" "${perfout}"
+  rm -rf "${perfdata}".old
+
+  trap - EXIT TERM INT
+}
+
+trap_cleanup() {
+  echo "Unexpected signal in ${FUNCNAME[1]}"
+  cleanup
+  exit 1
+}
+trap trap_cleanup EXIT TERM INT
+
+test_basic_annotate() {
+  mode=$1
+  echo "${mode} perf annotate test"
+  if [ "x${mode}" == "xBasic" ]
+  then
+    perf mem record -o "${perfdata}" ${testprog} 2> /dev/null
+  else
+    perf mem record -o - ${testprog} 2> /dev/null > "${perfdata}"
+  fi
+  if [ "x$?" != "x0" ]
+  then
+    echo "${mode} annotate [Failed: perf record]"
+    err=1
+    return
+  fi
+
+  # Generate the annotated output file
+  if [ "x${mode}" == "xBasic" ]
+  then
+    perf annotate --code-with-type -i "${perfdata}" --stdio --percent-limit 1 2> /dev/null > "${perfout}"
+  else
+    perf annotate --code-with-type -i - --stdio 2> /dev/null --percent-limit 1 < "${perfdata}" > "${perfout}"
+  fi
+
+  # check if it has the target data type
+  if ! grep -q "${testtype}" "${perfout}"
+  then
+    echo "${mode} annotate [Failed: missing target data type]"
+    cat "${perfout}"
+    err=1
+    return
+  fi
+  echo "${mode} annotate test [Success]"
+}
+
+test_basic_annotate Basic
+test_basic_annotate Pipe
+
+cleanup
+exit $err
-- 
cgit v1.2.3


From 335047109d7d488bf5ad32a4076e1a011994cd0e Mon Sep 17 00:00:00 2001
From: Dmitrii Dolgov <9erthalion6@gmail.com>
Date: Sun, 8 Feb 2026 13:22:26 +0100
Subject: perf tests: Test annotate with data type profiling and C

Exercise the annotate command with data type profiling feature with C.

For that extend the existing data type profiling shell test to profile
the datasym workload, then annotate the result expecting to see some
data structures from the C code.

Committer testing:

  root@number:~# perf test 'perf data type profiling tests'
   83: perf data type profiling tests                                  : Ok
  root@number:~# perf test -vv 'perf data type profiling tests'
   83: perf data type profiling tests:
  --- start ---
  test child forked, pid 125028
  Basic Rust perf annotate test
  Basic annotate test [Success]
  Pipe Rust perf annotate test
  Pipe annotate test [Success]
  Basic C perf annotate test
  Basic annotate test [Success]
  Pipe C perf annotate test
  Pipe annotate test [Success]
  ---- end(0) ----
   83: perf data type profiling tests                                  : Ok
  root@number:~#

Signed-off-by: Dmitrii Dolgov <9erthalion6@gmail.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/tests/shell/data_type_profiling.sh | 31 ++++++++++++++++++++-------
 1 file changed, 23 insertions(+), 8 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/tests/shell/data_type_profiling.sh b/tools/perf/tests/shell/data_type_profiling.sh
index cdc9adb7d708..a230f5d4c42c 100755
--- a/tools/perf/tests/shell/data_type_profiling.sh
+++ b/tools/perf/tests/shell/data_type_profiling.sh
@@ -6,12 +6,14 @@ set -e
 
 # The logic below follows the same line as the annotate test, but looks for a
 # data type profiling manifestation
-testtype="# data-type: struct Buf"
+
+# Values in testtypes and testprogs should match
+testtypes=("# data-type: struct Buf" "# data-type: struct _buf")
+testprogs=("perf test -w code_with_type" "perf test -w datasym")
 
 err=0
 perfdata=$(mktemp /tmp/__perf_test.perf.data.XXXXX)
 perfout=$(mktemp /tmp/__perf_test.perf.out.XXXXX)
-testprog="perf test -w code_with_type"
 
 cleanup() {
   rm -rf "${perfdata}" "${perfout}"
@@ -29,12 +31,23 @@ trap trap_cleanup EXIT TERM INT
 
 test_basic_annotate() {
   mode=$1
-  echo "${mode} perf annotate test"
+  runtime=$2
+
+  echo "${mode} ${runtime} perf annotate test"
+
+  case "x${runtime}" in
+    "xRust")
+    index=0 ;;
+
+    "xC")
+    index=1 ;;
+  esac
+
   if [ "x${mode}" == "xBasic" ]
   then
-    perf mem record -o "${perfdata}" ${testprog} 2> /dev/null
+    perf mem record -o "${perfdata}" ${testprogs[$index]} 2> /dev/null
   else
-    perf mem record -o - ${testprog} 2> /dev/null > "${perfdata}"
+    perf mem record -o - ${testprogs[$index]} 2> /dev/null > "${perfdata}"
   fi
   if [ "x$?" != "x0" ]
   then
@@ -52,7 +65,7 @@ test_basic_annotate() {
   fi
 
   # check if it has the target data type
-  if ! grep -q "${testtype}" "${perfout}"
+  if ! grep -q "${testtypes[$index]}" "${perfout}"
   then
     echo "${mode} annotate [Failed: missing target data type]"
     cat "${perfout}"
@@ -62,8 +75,10 @@ test_basic_annotate() {
   echo "${mode} annotate test [Success]"
 }
 
-test_basic_annotate Basic
-test_basic_annotate Pipe
+test_basic_annotate Basic Rust
+test_basic_annotate Pipe Rust
+test_basic_annotate Basic C
+test_basic_annotate Pipe C
 
 cleanup
 exit $err
-- 
cgit v1.2.3


From 5af56f30c4fcbade4a92f94dadfea517d1db9703 Mon Sep 17 00:00:00 2001
From: Francesco Lavra <flavra@baylibre.com>
Date: Mon, 9 Feb 2026 10:50:01 +0100
Subject: spi: tools: Add include folder to .gitignore

The Makefile for the SPI tools creates an include/linux/spi folder and some
symlinks inside it. After running `make -C spi/tools`, this folder shows up
as untracked in the git status.
Add the above folder to the .gitignore file.

Fixes: f325b73dc4db ("spi: tools: move to tools buildsystem")
Signed-off-by: Francesco Lavra <flavra@baylibre.com>
Link: https://patch.msgid.link/20260209095001.556495-1-flavra@baylibre.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 tools/spi/.gitignore | 1 +
 1 file changed, 1 insertion(+)

(limited to 'tools')

diff --git a/tools/spi/.gitignore b/tools/spi/.gitignore
index 14ddba3d2195..038261b34ed8 100644
--- a/tools/spi/.gitignore
+++ b/tools/spi/.gitignore
@@ -1,3 +1,4 @@
 # SPDX-License-Identifier: GPL-2.0-only
 spidev_fdx
 spidev_test
+include/
-- 
cgit v1.2.3


From 3f5dfa472ea6771c821ee0bb10dee7de41ef6021 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Mon, 9 Feb 2026 10:58:44 -0300
Subject: tools build: Fix rust feature detection

Features in FEATURE_TESTS_BASIC will be set as being available if
test-all.c builds, so since the rust test isn't included in test-all.c,
we can't have 'rust' in there, remove it from FEATURE_TESTS_BASIC and
use feature-check so that it tries to build test-rust.bin, doing the
actual feature detection.

On a system lacking a rust compiler:

  Makefile.config:1158: Rust is not found. Test workloads with rust are disabled.

  Auto-detecting system features:
  ...                                   libdw: [ on  ]
  ...                                   glibc: [ on  ]
  ...                                  libelf: [ on  ]
  ...                                 libnuma: [ on  ]
  ...                  numa_num_possible_cpus: [ on  ]
  ...                               libpython: [ on  ]
  ...                             libcapstone: [ on  ]
  ...                               llvm-perf: [ on  ]
  ...                                    zlib: [ on  ]
  ...                                    lzma: [ on  ]
  ...                                     bpf: [ on  ]
  ...                                  libaio: [ on  ]
  ...                                 libzstd: [ on  ]
  ...                              libopenssl: [ on  ]
  ...                                    rust: [ OFF ]

  $ cat /tmp/build/perf-tools-next/feature/test-rust.make.output
  /bin/sh: line 1: rustc: command not found
  $ file /tmp/build/perf-tools-next/feature/test-rust.bin
  /tmp/build/perf-tools-next/feature/test-rust.bin: cannot open `/tmp/build/perf-tools-next/feature/test-rust.bin' (No such file or directory)
  $
  $ perf -vv | grep RUST
                  rust: [ OFF ]  # HAVE_RUST_SUPPORT
  $

And after installing it:

  ...                                    rust: [ on  ]

  $ cat /tmp/build/perf-tools-next/feature/test-rust.make.output
  $ file /tmp/build/perf-tools-next/feature/test-rust.bin
/tmp/build/perf-tools-next/feature/test-rust.bin: ELF 64-bit LSB pie executable, x86-64, version 1 (SYSV), dynamically linked, interpreter /lib64/ld-linux-x86-64.so.2, BuildID[sha1]=9c416edf673ee3705b97bae893a99a6fcf1ee258, for GNU/Linux 3.2.0, with debug_info, not stripped
  $
  $ perf -vv | grep RUST
                  rust: [ on  ]  # HAVE_RUST_SUPPORT
  $

Fixes: 6a32fa5ccd33da5d ("tools build: Add a feature test for rust compiler")
Cc: Dmitrii Dolgov <9erthalion6@gmail.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/build/Makefile.feature | 3 +--
 tools/perf/Makefile.config   | 1 +
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/build/Makefile.feature b/tools/build/Makefile.feature
index 64d21152fc81..0b7a7c38cb88 100644
--- a/tools/build/Makefile.feature
+++ b/tools/build/Makefile.feature
@@ -100,8 +100,7 @@ FEATURE_TESTS_BASIC :=                  \
         disassembler-four-args		\
         disassembler-init-styled	\
         file-handle			\
-        libopenssl			\
-        rust
+        libopenssl
 
 # FEATURE_TESTS_BASIC + FEATURE_TESTS_EXTRA is the complete list
 # of all feature tests
diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config
index 94aecfe38b95..a8dc72cfe48e 100644
--- a/tools/perf/Makefile.config
+++ b/tools/perf/Makefile.config
@@ -1154,6 +1154,7 @@ ifneq ($(NO_LIBTRACEEVENT),1)
 endif
 
 ifndef NO_RUST
+  $(call feature_check,rust)
   ifneq ($(feature-rust), 1)
     $(warning Rust is not found. Test workloads with rust are disabled.)
     NO_RUST := 1
-- 
cgit v1.2.3


From 1d3ffe6233b1b6e8697f5027b9441ce70385c997 Mon Sep 17 00:00:00 2001
From: Dmitrii Dolgov <9erthalion6@gmail.com>
Date: Mon, 9 Feb 2026 15:05:32 +0100
Subject: perf tests workload: Formatting for code_with_type.rs

One part of the rust code for code_with_type workload wasn't properly
formatted.

Pass it through rustfmt to fix that.

Closes: https://lore.kernel.org/oe-kbuild-all/202602091357.oyRv6hgQ-lkp@intel.com/
Reported-by: kernel test robot <lkp@intel.com>
Reviewed-by: Ian Rogers <irogers@google.com>
Signed-off-by: Dmitrii Dolgov <9erthalion6@gmail.com>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/tests/workloads/code_with_type.rs | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/perf/tests/workloads/code_with_type.rs b/tools/perf/tests/workloads/code_with_type.rs
index 3b91e51919dd..3dab39b22dd7 100644
--- a/tools/perf/tests/workloads/code_with_type.rs
+++ b/tools/perf/tests/workloads/code_with_type.rs
@@ -10,7 +10,11 @@ struct Buf {
 
 #[no_mangle]
 pub extern "C" fn test_rs(count: u32) {
-    let mut b =  Buf { data1: 0, data2: String::from("data"), data3: 0};
+    let mut b = Buf {
+        data1: 0,
+        data2: String::from("data"),
+        data3: 0,
+    };
 
     for _ in 1..count {
         b.data1 += 1;
-- 
cgit v1.2.3


From 2a400eeba40b4cf1fb28f78f41bf73a898b00d06 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Mon, 9 Feb 2026 10:58:44 -0300
Subject: perf test code_with_type.sh: Skip test if rust wasn't available at
 build time

  $ perf test 'perf data type profiling tests'
   83: perf data type profiling tests                         : Skip
  $ perf test -vv 'perf data type profiling tests'
   83: perf data type profiling tests:
  --- start ---
  test child forked, pid 977213
  Skip: code_with_type workload not built in 'perf test'
  ---- end(-2) ----
   83: perf data type profiling tests                         : Skip
  $

Cc: Dmitrii Dolgov <9erthalion6@gmail.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/tests/shell/data_type_profiling.sh | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'tools')

diff --git a/tools/perf/tests/shell/data_type_profiling.sh b/tools/perf/tests/shell/data_type_profiling.sh
index a230f5d4c42c..3ef72a10850d 100755
--- a/tools/perf/tests/shell/data_type_profiling.sh
+++ b/tools/perf/tests/shell/data_type_profiling.sh
@@ -4,6 +4,11 @@
 
 set -e
 
+if ! perf test --list-workloads | grep -qw code_with_type ; then
+	echo "Skip: code_with_type workload not built in 'perf test'"
+	exit 2
+fi
+
 # The logic below follows the same line as the annotate test, but looks for a
 # data type profiling manifestation
 
-- 
cgit v1.2.3


From 3d012b8614ee020666f3dd15af9f65dc487e3f5f Mon Sep 17 00:00:00 2001
From: Thomas Richter <tmricht@linux.ibm.com>
Date: Mon, 9 Feb 2026 16:32:56 +0100
Subject: perf test: Fix test case perftool-testsuite_report for s390

Test case perftool-testsuite_report fails on s390 for some time
now.

Root cause is a time out which is too tight for large s390 machines.
The time out value addr2line_timeout_ms is per default set to 1 second.

This is the maximum time the function read_addr2line_record() waits for
a reply from the forked off tool addr2line, which is started as a child
in interactive mode.

It reads stdin (an address in hexadecimal) and replies on stdout with
function name, file name and line number. This might take more than one
second.

However one second is not always enough and the reply from addr2line
tool is not received. Function read_addr2line_record() fails and emits
a warning, which is not expected by the test case. It fails.

Output before:

 # perf test -F 133
 -- [ PASS ] -- perf_report :: setup :: prepare the perf.data file
 ==================
 [ perf record: Woken up 1 times to write data ]
 [ perf record: Captured and wrote 0.087 MB \
	/tmp/perftool-testsuite_report.FHz/perf_report/perf.data.1 \
	(207 samples) ]
 ==================
 -- [ PASS ] -- perf_report :: setup :: prepare the perf.data.1 file
 ## [ PASS ] ## perf_report :: setup SUMMARY
 -- [ SKIP ] -- perf_report :: test_basic :: help message :: testcase skipped
 Line did not match any pattern: "cmd__addr2line /usr/lib/debug/lib/modules/
 	6.19.0-20260205.rc8.git366.9845cf73f7db.300.fc43.s390x+next/
	vmlinux: could not read first record"
 Line did not match any pattern: "cmd__addr2line /usr/lib/debug/lib/modules/
	6.19.0-20260205.rc8.git366.9845cf73f7db.300.fc43.s390x+next/
	vmlinux: could not read first record"
 -- [ FAIL ] -- perf_report :: test_basic :: basic execution
	(output regexp parsing)
 ....
 133: perftool-testsuite_report      : FAILED!

Output after:

 # ./perf test -F 133
 -- [ PASS ] -- perf_report :: setup :: prepare the perf.data file
 ==================
 [ perf record: Woken up 1 times to write data ]
 [ perf record: Captured and wrote 0.087 MB \
	 /tmp/perftool-testsuite_report.Mlp/perf_report/perf.data.1
	 (188 samples) ]
 ==================
 -- [ PASS ] -- perf_report :: setup :: prepare the perf.data.1 file
 ## [ PASS ] ## perf_report :: setup SUMMARY
 -- [ SKIP ] -- perf_report :: test_basic :: help message :: testcase skipped
 -- [ PASS ] -- perf_report :: test_basic :: basic execution
 -- [ PASS ] -- perf_report :: test_basic :: number of samples
 -- [ PASS ] -- perf_report :: test_basic :: header
 -- [ PASS ] -- perf_report :: test_basic :: header timestamp
 -- [ PASS ] -- perf_report :: test_basic :: show CPU utilization
 -- [ PASS ] -- perf_report :: test_basic :: pid
 -- [ PASS ] -- perf_report :: test_basic :: non-existing symbol
 -- [ PASS ] -- perf_report :: test_basic :: symbol filter
 -- [ PASS ] -- perf_report :: test_basic :: latency header
 -- [ PASS ] -- perf_report :: test_basic :: default report for latency profile
 -- [ PASS ] -- perf_report :: test_basic :: latency report for latency profile
 -- [ PASS ] -- perf_report :: test_basic :: parallelism histogram
 ## [ PASS ] ## perf_report :: test_basic SUMMARY
 133: perftool-testsuite_report      : Ok
 #

Fixes: 257046a36750a6db ("perf srcline: Fallback between addr2line implementations")
Reviewed-by: Jan Polensky <japo@linux.ibm.com>
Signed-off-by: Thomas Richter <tmricht@linux.ibm.com>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Ian Rogers <irogers@google.com>
Cc: linux-s390@vger.kernel.org
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Sumanth Korikkar <sumanthk@linux.ibm.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/addr2line.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/addr2line.c b/tools/perf/util/addr2line.c
index 0f1499350d47..31c0391fffa3 100644
--- a/tools/perf/util/addr2line.c
+++ b/tools/perf/util/addr2line.c
@@ -18,8 +18,8 @@
 
 #define MAX_INLINE_NEST 1024
 
-/* If addr2line doesn't return data for 1 second then timeout. */
-int addr2line_timeout_ms = 1 * 1000;
+/* If addr2line doesn't return data for 5 seconds then timeout. */
+int addr2line_timeout_ms = 5 * 1000;
 
 static int filename_split(char *filename, unsigned int *line_nr)
 {
-- 
cgit v1.2.3


From b211a30690f8263b79f30b6b1770ffe216fa378c Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Thu, 5 Feb 2026 22:54:40 -0800
Subject: docs: kdoc_parser: allow __exit in function prototypes

Handle functions that are marked with __exit to prevent warnings:

Documentation/networking/iucv:35: ../net/iucv/iucv.c:1918: WARNING: Error in declarator or parameters
Invalid C declaration: Expecting "(" in parameters. [error at 12]
  void __exit iucv_exit (void)
  ------------^

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Reviewed-by: Mauro Carvalho Chehab <mchehab@kernel.org>
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
Message-ID: <20260206065440.2412185-1-rdunlap@infradead.org>
---
 tools/lib/python/kdoc/kdoc_parser.py | 1 +
 1 file changed, 1 insertion(+)

(limited to 'tools')

diff --git a/tools/lib/python/kdoc/kdoc_parser.py b/tools/lib/python/kdoc/kdoc_parser.py
index fd57944ae907..ca00695b47b3 100644
--- a/tools/lib/python/kdoc/kdoc_parser.py
+++ b/tools/lib/python/kdoc/kdoc_parser.py
@@ -175,6 +175,7 @@ function_xforms  = [
     (KernRe(r"^__FORTIFY_INLINE +"), ""),
     (KernRe(r"__init +"), ""),
     (KernRe(r"__init_or_module +"), ""),
+    (KernRe(r"__exit +"), ""),
     (KernRe(r"__deprecated +"), ""),
     (KernRe(r"__flatten +"), ""),
     (KernRe(r"__meminit +"), ""),
-- 
cgit v1.2.3


From 600f72ded8c877be95322ce806d23345ea5e89bc Mon Sep 17 00:00:00 2001
From: Sergey Matyukevich <geomatsi@gmail.com>
Date: Sun, 25 Jan 2026 21:09:57 -0700
Subject: selftests: riscv: test ptrace vector interface

Add a test case to check ptrace behavior in the case when vector
extension is supported by the system, but vector context is not
yet enabled for the traced process.

Signed-off-by: Sergey Matyukevich <geomatsi@gmail.com>
Reviewed-by: Andy Chiu <andybnac@gmail.com>
Tested-by: Andy Chiu <andybnac@gmail.com>
Link: https://patch.msgid.link/20251214163537.1054292-6-geomatsi@gmail.com
[pjw@kernel.org: dropped duplicate sys/wait.h include]
Signed-off-by: Paul Walmsley <pjw@kernel.org>
---
 tools/testing/selftests/riscv/vector/.gitignore    |  2 +
 tools/testing/selftests/riscv/vector/Makefile      | 10 ++-
 tools/testing/selftests/riscv/vector/v_helpers.c   | 23 +++++++
 tools/testing/selftests/riscv/vector/v_helpers.h   |  2 +
 .../selftests/riscv/vector/validate_v_ptrace.c     | 79 ++++++++++++++++++++++
 5 files changed, 115 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/riscv/vector/validate_v_ptrace.c

(limited to 'tools')

diff --git a/tools/testing/selftests/riscv/vector/.gitignore b/tools/testing/selftests/riscv/vector/.gitignore
index 7d9c87cd0649..40a82baf364f 100644
--- a/tools/testing/selftests/riscv/vector/.gitignore
+++ b/tools/testing/selftests/riscv/vector/.gitignore
@@ -2,3 +2,5 @@ vstate_exec_nolibc
 vstate_prctl
 v_initval
 v_exec_initval_nolibc
+vstate_ptrace
+validate_v_ptrace
diff --git a/tools/testing/selftests/riscv/vector/Makefile b/tools/testing/selftests/riscv/vector/Makefile
index 2c2a33fc083e..326dafd739bf 100644
--- a/tools/testing/selftests/riscv/vector/Makefile
+++ b/tools/testing/selftests/riscv/vector/Makefile
@@ -2,11 +2,14 @@
 # Copyright (C) 2021 ARM Limited
 # Originally tools/testing/arm64/abi/Makefile
 
-TEST_GEN_PROGS := v_initval vstate_prctl vstate_ptrace
+TEST_GEN_PROGS := v_initval vstate_prctl vstate_ptrace validate_v_ptrace
 TEST_GEN_PROGS_EXTENDED := vstate_exec_nolibc v_exec_initval_nolibc
+TEST_GEN_LIBS := v_helpers.c sys_hwprobe.c
 
 include ../../lib.mk
 
+TEST_GEN_OBJ := $(patsubst %.c, $(OUTPUT)/%.o, $(TEST_GEN_LIBS))
+
 $(OUTPUT)/sys_hwprobe.o: ../hwprobe/sys_hwprobe.S
 	$(CC) -static -c -o$@ $(CFLAGS) $^
 
@@ -29,3 +32,8 @@ $(OUTPUT)/v_exec_initval_nolibc: v_exec_initval_nolibc.c
 
 $(OUTPUT)/vstate_ptrace: vstate_ptrace.c $(OUTPUT)/sys_hwprobe.o $(OUTPUT)/v_helpers.o
 	$(CC) -static -o$@ $(CFLAGS) $(LDFLAGS) $^
+
+$(OUTPUT)/validate_v_ptrace: validate_v_ptrace.c $(OUTPUT)/sys_hwprobe.o $(OUTPUT)/v_helpers.o
+	$(CC) -static -o$@ $(CFLAGS) $(LDFLAGS) $^
+
+EXTRA_CLEAN += $(TEST_GEN_OBJ)
diff --git a/tools/testing/selftests/riscv/vector/v_helpers.c b/tools/testing/selftests/riscv/vector/v_helpers.c
index 01a8799dcb78..de6da7c8d2f1 100644
--- a/tools/testing/selftests/riscv/vector/v_helpers.c
+++ b/tools/testing/selftests/riscv/vector/v_helpers.c
@@ -26,6 +26,29 @@ bool is_vector_supported(void)
 	return pair.value & RISCV_HWPROBE_EXT_ZVE32X;
 }
 
+unsigned long get_vr_len(void)
+{
+	unsigned long vlenb;
+
+	if (is_vector_supported()) {
+		asm volatile("csrr %[vlenb], vlenb" : [vlenb] "=r"(vlenb));
+		return vlenb;
+	}
+
+	if (is_xtheadvector_supported()) {
+		asm volatile (
+			// 0 | zimm[10:0] | rs1 | 1 1 1 | rd | 1010111 | vsetvli
+			// vsetvli	t4, x0, e8, m1, d1
+			".4byte		0b00000000000000000111111011010111\n\t"
+			"mv		%[vlenb], t4\n\t"
+			: [vlenb] "=r"(vlenb) : : "memory", "t4");
+		return vlenb;
+	}
+
+	printf("WARNING: vector not supported\n");
+	return 0;
+}
+
 int launch_test(char *next_program, int test_inherit, int xtheadvector)
 {
 	char *exec_argv[4], *exec_envp[1];
diff --git a/tools/testing/selftests/riscv/vector/v_helpers.h b/tools/testing/selftests/riscv/vector/v_helpers.h
index 763cddfe26da..c538077f1195 100644
--- a/tools/testing/selftests/riscv/vector/v_helpers.h
+++ b/tools/testing/selftests/riscv/vector/v_helpers.h
@@ -5,4 +5,6 @@ bool is_xtheadvector_supported(void);
 
 bool is_vector_supported(void);
 
+unsigned long get_vr_len(void);
+
 int launch_test(char *next_program, int test_inherit, int xtheadvector);
diff --git a/tools/testing/selftests/riscv/vector/validate_v_ptrace.c b/tools/testing/selftests/riscv/vector/validate_v_ptrace.c
new file mode 100644
index 000000000000..3ffef2704b0b
--- /dev/null
+++ b/tools/testing/selftests/riscv/vector/validate_v_ptrace.c
@@ -0,0 +1,79 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <sys/ptrace.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <sys/uio.h>
+#include <unistd.h>
+#include <errno.h>
+
+#include <linux/ptrace.h>
+#include <linux/elf.h>
+
+#include "kselftest_harness.h"
+#include "v_helpers.h"
+
+volatile unsigned long chld_lock;
+
+TEST(ptrace_v_not_enabled)
+{
+	pid_t pid;
+
+	if (!(is_vector_supported() || is_xtheadvector_supported()))
+		SKIP(return, "Vector not supported");
+
+	chld_lock = 1;
+	pid = fork();
+	ASSERT_LE(0, pid)
+		TH_LOG("fork: %m");
+
+	if (pid == 0) {
+		while (chld_lock == 1)
+			asm volatile("" : : "g"(chld_lock) : "memory");
+
+		asm volatile ("ebreak" : : : );
+	} else {
+		struct __riscv_v_regset_state *regset_data;
+		unsigned long vlenb = get_vr_len();
+		size_t regset_size;
+		struct iovec iov;
+		int status;
+		int ret;
+
+		/* attach */
+
+		ASSERT_EQ(0, ptrace(PTRACE_ATTACH, pid, NULL, NULL));
+		ASSERT_EQ(pid, waitpid(pid, &status, 0));
+		ASSERT_TRUE(WIFSTOPPED(status));
+
+		/* unlock */
+
+		ASSERT_EQ(0, ptrace(PTRACE_POKEDATA, pid, &chld_lock, 0));
+
+		/* resume and wait for ebreak */
+
+		ASSERT_EQ(0, ptrace(PTRACE_CONT, pid, NULL, NULL));
+		ASSERT_EQ(pid, waitpid(pid, &status, 0));
+		ASSERT_TRUE(WIFSTOPPED(status));
+
+		/* try to read vector registers from the tracee */
+
+		regset_size = sizeof(*regset_data) + vlenb * 32;
+		regset_data = calloc(1, regset_size);
+
+		iov.iov_base = regset_data;
+		iov.iov_len = regset_size;
+
+		/* V extension is available, but not yet enabled for the tracee */
+
+		errno = 0;
+		ret = ptrace(PTRACE_GETREGSET, pid, NT_RISCV_VECTOR, &iov);
+		ASSERT_EQ(ENODATA, errno);
+		ASSERT_EQ(-1, ret);
+
+		/* cleanup */
+
+		ASSERT_EQ(0, kill(pid, SIGKILL));
+	}
+}
+
+TEST_HARNESS_MAIN
-- 
cgit v1.2.3


From 66d03044891df63c82b18ae1da07bc4bc077ae48 Mon Sep 17 00:00:00 2001
From: Sergey Matyukevich <geomatsi@gmail.com>
Date: Sun, 25 Jan 2026 21:09:57 -0700
Subject: selftests: riscv: verify initial vector state with ptrace

Add a test case that attaches to a traced process immediately after its
first executed vector instructions to verify the initial vector context.

Signed-off-by: Sergey Matyukevich <geomatsi@gmail.com>
Reviewed-by: Andy Chiu <andybnac@gmail.com>
Tested-by: Andy Chiu <andybnac@gmail.com>
Link: https://patch.msgid.link/20251214163537.1054292-7-geomatsi@gmail.com
Signed-off-by: Paul Walmsley <pjw@kernel.org>
---
 .../selftests/riscv/vector/validate_v_ptrace.c     | 135 +++++++++++++++++++++
 1 file changed, 135 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/riscv/vector/validate_v_ptrace.c b/tools/testing/selftests/riscv/vector/validate_v_ptrace.c
index 3ffef2704b0b..768ef93b33da 100644
--- a/tools/testing/selftests/riscv/vector/validate_v_ptrace.c
+++ b/tools/testing/selftests/riscv/vector/validate_v_ptrace.c
@@ -12,6 +12,9 @@
 #include "kselftest_harness.h"
 #include "v_helpers.h"
 
+#define SR_FS_DIRTY	0x00006000UL
+#define CSR_VXRM_SHIFT	1
+
 volatile unsigned long chld_lock;
 
 TEST(ptrace_v_not_enabled)
@@ -76,4 +79,136 @@ TEST(ptrace_v_not_enabled)
 	}
 }
 
+TEST(ptrace_v_early_debug)
+{
+	static volatile unsigned long vstart;
+	static volatile unsigned long vtype;
+	static volatile unsigned long vlenb;
+	static volatile unsigned long vcsr;
+	static volatile unsigned long vl;
+	bool xtheadvector;
+	pid_t pid;
+
+	if (!(is_vector_supported() || is_xtheadvector_supported()))
+		SKIP(return, "Vector not supported");
+
+	xtheadvector = is_xtheadvector_supported();
+
+	chld_lock = 1;
+	pid = fork();
+	ASSERT_LE(0, pid)
+		TH_LOG("fork: %m");
+
+	if (pid == 0) {
+		unsigned long vxsat, vxrm;
+
+		vlenb = get_vr_len();
+
+		while (chld_lock == 1)
+			asm volatile ("" : : "g"(chld_lock) : "memory");
+
+		asm volatile (
+			"csrr %[vstart], vstart\n"
+			"csrr %[vtype], vtype\n"
+			"csrr %[vl], vl\n"
+			: [vtype] "=r"(vtype), [vstart] "=r"(vstart), [vl] "=r"(vl)
+			:
+			: "memory");
+
+		/* no 'is_xtheadvector_supported()' here to avoid clobbering v-state by syscall */
+		if (xtheadvector) {
+			asm volatile (
+				"csrs sstatus, %[bit]\n"
+				"csrr %[vxsat], vxsat\n"
+				"csrr %[vxrm], vxrm\n"
+				: [vxsat] "=r"(vxsat), [vxrm] "=r"(vxrm)
+				: [bit] "r" (SR_FS_DIRTY)
+				: "memory");
+			vcsr = vxsat | vxrm << CSR_VXRM_SHIFT;
+		} else {
+			asm volatile (
+				"csrr %[vcsr], vcsr\n"
+				: [vcsr] "=r"(vcsr)
+				:
+				: "memory");
+		}
+
+		asm volatile (
+			".option push\n"
+			".option norvc\n"
+			"ebreak\n"
+			".option pop\n");
+	} else {
+		struct __riscv_v_regset_state *regset_data;
+		unsigned long vstart_csr;
+		unsigned long vlenb_csr;
+		unsigned long vtype_csr;
+		unsigned long vcsr_csr;
+		unsigned long vl_csr;
+		size_t regset_size;
+		struct iovec iov;
+		int status;
+
+		/* attach */
+
+		ASSERT_EQ(0, ptrace(PTRACE_ATTACH, pid, NULL, NULL));
+		ASSERT_EQ(pid, waitpid(pid, &status, 0));
+		ASSERT_TRUE(WIFSTOPPED(status));
+
+		/* unlock */
+
+		ASSERT_EQ(0, ptrace(PTRACE_POKEDATA, pid, &chld_lock, 0));
+
+		/* resume and wait for ebreak */
+
+		ASSERT_EQ(0, ptrace(PTRACE_CONT, pid, NULL, NULL));
+		ASSERT_EQ(pid, waitpid(pid, &status, 0));
+		ASSERT_TRUE(WIFSTOPPED(status));
+
+		/* read tracee vector csr regs using ptrace PEEKDATA */
+
+		errno = 0;
+		vstart_csr = ptrace(PTRACE_PEEKDATA, pid, &vstart, NULL);
+		ASSERT_FALSE((errno != 0) && (vstart_csr == -1));
+
+		errno = 0;
+		vl_csr = ptrace(PTRACE_PEEKDATA, pid, &vl, NULL);
+		ASSERT_FALSE((errno != 0) && (vl_csr == -1));
+
+		errno = 0;
+		vtype_csr = ptrace(PTRACE_PEEKDATA, pid, &vtype, NULL);
+		ASSERT_FALSE((errno != 0) && (vtype_csr == -1));
+
+		errno = 0;
+		vcsr_csr = ptrace(PTRACE_PEEKDATA, pid, &vcsr, NULL);
+		ASSERT_FALSE((errno != 0) && (vcsr_csr == -1));
+
+		errno = 0;
+		vlenb_csr = ptrace(PTRACE_PEEKDATA, pid, &vlenb, NULL);
+		ASSERT_FALSE((errno != 0) && (vlenb_csr == -1));
+
+		/* read tracee csr regs using ptrace GETREGSET */
+
+		regset_size = sizeof(*regset_data) + vlenb_csr * 32;
+		regset_data = calloc(1, regset_size);
+
+		iov.iov_base = regset_data;
+		iov.iov_len = regset_size;
+
+		ASSERT_EQ(0, ptrace(PTRACE_GETREGSET, pid, NT_RISCV_VECTOR, &iov));
+
+		/* compare */
+
+		EXPECT_EQ(vstart_csr, regset_data->vstart);
+		EXPECT_EQ(vtype_csr, regset_data->vtype);
+		EXPECT_EQ(vlenb_csr, regset_data->vlenb);
+		EXPECT_EQ(vcsr_csr, regset_data->vcsr);
+		EXPECT_EQ(vl_csr, regset_data->vl);
+
+		/* cleanup */
+
+		ASSERT_EQ(0, kill(pid, SIGKILL));
+	}
+}
+
 TEST_HARNESS_MAIN
-- 
cgit v1.2.3


From 3789d5eecd5ae01149d0ef5ba70e8120da2f55db Mon Sep 17 00:00:00 2001
From: Sergey Matyukevich <geomatsi@gmail.com>
Date: Sun, 25 Jan 2026 21:09:57 -0700
Subject: selftests: riscv: verify syscalls discard vector context

Add a test to v_ptrace test suite to verify that vector csr registers
are clobbered on syscalls.

Signed-off-by: Sergey Matyukevich <geomatsi@gmail.com>
Reviewed-by: Andy Chiu <andybnac@gmail.com>
Tested-by: Andy Chiu <andybnac@gmail.com>
Link: https://patch.msgid.link/20251214163537.1054292-8-geomatsi@gmail.com
[pjw@kernel.org: cleaned up a checkpatch issue]
Signed-off-by: Paul Walmsley <pjw@kernel.org>
---
 .../selftests/riscv/vector/validate_v_ptrace.c     | 123 +++++++++++++++++++++
 1 file changed, 123 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/riscv/vector/validate_v_ptrace.c b/tools/testing/selftests/riscv/vector/validate_v_ptrace.c
index 768ef93b33da..7ff9a0cf229c 100644
--- a/tools/testing/selftests/riscv/vector/validate_v_ptrace.c
+++ b/tools/testing/selftests/riscv/vector/validate_v_ptrace.c
@@ -211,4 +211,127 @@ TEST(ptrace_v_early_debug)
 	}
 }
 
+TEST(ptrace_v_syscall_clobbering)
+{
+	pid_t pid;
+
+	if (!is_vector_supported() && !is_xtheadvector_supported())
+		SKIP(return, "Vector not supported");
+
+	chld_lock = 1;
+	pid = fork();
+	ASSERT_LE(0, pid)
+		TH_LOG("fork: %m");
+
+	if (pid == 0) {
+		unsigned long vl;
+
+		while (chld_lock == 1)
+			asm volatile("" : : "g"(chld_lock) : "memory");
+
+		if (is_xtheadvector_supported()) {
+			asm volatile (
+				// 0 | zimm[10:0] | rs1 | 1 1 1 | rd |1010111| vsetvli
+				// vsetvli	t4, x0, e16, m2, d1
+				".4byte		0b00000000010100000111111011010111\n"
+				"mv		%[new_vl], t4\n"
+				: [new_vl] "=r" (vl) : : "t4");
+		} else {
+			asm volatile (
+				".option push\n"
+				".option arch, +zve32x\n"
+				"vsetvli %[new_vl], x0, e16, m2, tu, mu\n"
+				".option pop\n"
+				: [new_vl] "=r"(vl) : : );
+		}
+
+		while (1) {
+			asm volatile (
+				".option push\n"
+				".option norvc\n"
+				"ebreak\n"
+				".option pop\n");
+
+			sleep(0);
+		}
+	} else {
+		struct __riscv_v_regset_state *regset_data;
+		unsigned long vlenb = get_vr_len();
+		struct user_regs_struct regs;
+		size_t regset_size;
+		struct iovec iov;
+		int status;
+
+		/* attach */
+
+		ASSERT_EQ(0, ptrace(PTRACE_ATTACH, pid, NULL, NULL));
+		ASSERT_EQ(pid, waitpid(pid, &status, 0));
+		ASSERT_TRUE(WIFSTOPPED(status));
+
+		/* unlock */
+
+		ASSERT_EQ(0, ptrace(PTRACE_POKEDATA, pid, &chld_lock, 0));
+
+		/* resume and wait for the 1st ebreak */
+
+		ASSERT_EQ(0, ptrace(PTRACE_CONT, pid, NULL, NULL));
+		ASSERT_EQ(pid, waitpid(pid, &status, 0));
+		ASSERT_TRUE(WIFSTOPPED(status));
+
+		/* read tracee vector csr regs using ptrace GETREGSET */
+
+		regset_size = sizeof(*regset_data) + vlenb * 32;
+		regset_data = calloc(1, regset_size);
+
+		iov.iov_base = regset_data;
+		iov.iov_len = regset_size;
+
+		ASSERT_EQ(0, ptrace(PTRACE_GETREGSET, pid, NT_RISCV_VECTOR, &iov));
+
+		/* verify initial vsetvli settings */
+
+		if (is_xtheadvector_supported())
+			EXPECT_EQ(5UL, regset_data->vtype);
+		else
+			EXPECT_EQ(9UL, regset_data->vtype);
+
+		EXPECT_EQ(regset_data->vlenb, regset_data->vl);
+		EXPECT_EQ(vlenb, regset_data->vlenb);
+		EXPECT_EQ(0UL, regset_data->vstart);
+		EXPECT_EQ(0UL, regset_data->vcsr);
+
+		/* skip 1st ebreak, then resume and wait for the 2nd ebreak */
+
+		iov.iov_base = &regs;
+		iov.iov_len = sizeof(regs);
+
+		ASSERT_EQ(0, ptrace(PTRACE_GETREGSET, pid, NT_PRSTATUS, &iov));
+		regs.pc += 4;
+		ASSERT_EQ(0, ptrace(PTRACE_SETREGSET, pid, NT_PRSTATUS, &iov));
+
+		ASSERT_EQ(0, ptrace(PTRACE_CONT, pid, NULL, NULL));
+		ASSERT_EQ(pid, waitpid(pid, &status, 0));
+		ASSERT_TRUE(WIFSTOPPED(status));
+
+		/* read tracee vtype using ptrace GETREGSET */
+
+		iov.iov_base = regset_data;
+		iov.iov_len = regset_size;
+
+		ASSERT_EQ(0, ptrace(PTRACE_GETREGSET, pid, NT_RISCV_VECTOR, &iov));
+
+		/* verify that V state is illegal after syscall */
+
+		EXPECT_EQ((1UL << (__riscv_xlen - 1)), regset_data->vtype);
+		EXPECT_EQ(vlenb, regset_data->vlenb);
+		EXPECT_EQ(0UL, regset_data->vstart);
+		EXPECT_EQ(0UL, regset_data->vcsr);
+		EXPECT_EQ(0UL, regset_data->vl);
+
+		/* cleanup */
+
+		ASSERT_EQ(0, kill(pid, SIGKILL));
+	}
+}
+
 TEST_HARNESS_MAIN
-- 
cgit v1.2.3


From 30eb191c895b086c21fc04c5c1482cb1bb0f3caf Mon Sep 17 00:00:00 2001
From: Sergey Matyukevich <geomatsi@gmail.com>
Date: Sun, 25 Jan 2026 21:09:57 -0700
Subject: selftests: riscv: verify ptrace rejects invalid vector csr inputs

Add a test to v_ptrace test suite to verify that ptrace rejects the
invalid input combinations of vector csr registers. Use kselftest
fixture variants to create multiple invalid inputs for the test.

Signed-off-by: Sergey Matyukevich <geomatsi@gmail.com>
Tested-by: Andy Chiu <andybnac@gmail.com>
Link: https://patch.msgid.link/20251214163537.1054292-9-geomatsi@gmail.com
[pjw@kernel.org: cleaned up some checkpatch issues]
Signed-off-by: Paul Walmsley <pjw@kernel.org>
---
 .../selftests/riscv/vector/validate_v_ptrace.c     | 317 +++++++++++++++++++++
 1 file changed, 317 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/riscv/vector/validate_v_ptrace.c b/tools/testing/selftests/riscv/vector/validate_v_ptrace.c
index 7ff9a0cf229c..3919df68edc8 100644
--- a/tools/testing/selftests/riscv/vector/validate_v_ptrace.c
+++ b/tools/testing/selftests/riscv/vector/validate_v_ptrace.c
@@ -334,4 +334,321 @@ TEST(ptrace_v_syscall_clobbering)
 	}
 }
 
+FIXTURE(v_csr_invalid)
+{
+};
+
+FIXTURE_SETUP(v_csr_invalid)
+{
+}
+
+FIXTURE_TEARDOWN(v_csr_invalid)
+{
+}
+
+#define VECTOR_1_0		BIT(0)
+#define XTHEAD_VECTOR_0_7	BIT(1)
+
+#define vector_test(x)		((x) & VECTOR_1_0)
+#define xthead_test(x)		((x) & XTHEAD_VECTOR_0_7)
+
+/* modifications of the initial vsetvli settings */
+FIXTURE_VARIANT(v_csr_invalid)
+{
+	unsigned long vstart;
+	unsigned long vl;
+	unsigned long vtype;
+	unsigned long vcsr;
+	unsigned long vlenb_mul;
+	unsigned long vlenb_min;
+	unsigned long vlenb_max;
+	unsigned long spec;
+};
+
+/* unexpected vlenb value */
+FIXTURE_VARIANT_ADD(v_csr_invalid, new_vlenb)
+{
+	.vstart = 0x0,
+	.vl = 0x0,
+	.vtype = 0x3,
+	.vcsr = 0x0,
+	.vlenb_mul = 0x2,
+	.vlenb_min = 0x0,
+	.vlenb_max = 0x0,
+	.spec = VECTOR_1_0 | XTHEAD_VECTOR_0_7,
+};
+
+/* invalid reserved bits in vcsr */
+FIXTURE_VARIANT_ADD(v_csr_invalid, vcsr_invalid_reserved_bits)
+{
+	.vstart = 0x0,
+	.vl = 0x0,
+	.vtype = 0x3,
+	.vcsr = 0x1UL << 8,
+	.vlenb_mul = 0x1,
+	.vlenb_min = 0x0,
+	.vlenb_max = 0x0,
+	.spec = VECTOR_1_0 | XTHEAD_VECTOR_0_7,
+};
+
+/* invalid reserved bits in vtype */
+FIXTURE_VARIANT_ADD(v_csr_invalid, vtype_invalid_reserved_bits)
+{
+	.vstart = 0x0,
+	.vl = 0x0,
+	.vtype = (0x1UL << 8) | 0x3,
+	.vcsr = 0x0,
+	.vlenb_mul = 0x1,
+	.vlenb_min = 0x0,
+	.vlenb_max = 0x0,
+	.spec = VECTOR_1_0 | XTHEAD_VECTOR_0_7,
+};
+
+/* set vill bit */
+FIXTURE_VARIANT_ADD(v_csr_invalid, invalid_vill_bit)
+{
+	.vstart = 0x0,
+	.vl = 0x0,
+	.vtype = (0x1UL << (__riscv_xlen - 1)) | 0x3,
+	.vcsr = 0x0,
+	.vlenb_mul = 0x1,
+	.vlenb_min = 0x0,
+	.vlenb_max = 0x0,
+	.spec = VECTOR_1_0 | XTHEAD_VECTOR_0_7,
+};
+
+/* reserved vsew value: vsew > 3 */
+FIXTURE_VARIANT_ADD(v_csr_invalid, reserved_vsew)
+{
+	.vstart = 0x0,
+	.vl = 0x0,
+	.vtype = 0x4UL << 3,
+	.vcsr = 0x0,
+	.vlenb_mul = 0x1,
+	.vlenb_min = 0x0,
+	.vlenb_max = 0x0,
+	.spec = VECTOR_1_0,
+};
+
+/* XTheadVector: unsupported non-zero VEDIV value */
+FIXTURE_VARIANT_ADD(v_csr_invalid, reserved_vediv)
+{
+	.vstart = 0x0,
+	.vl = 0x0,
+	.vtype = 0x3UL << 5,
+	.vcsr = 0x0,
+	.vlenb_mul = 0x1,
+	.vlenb_min = 0x0,
+	.vlenb_max = 0x0,
+	.spec = XTHEAD_VECTOR_0_7,
+};
+
+/* reserved vlmul value: vlmul == 4 */
+FIXTURE_VARIANT_ADD(v_csr_invalid, reserved_vlmul)
+{
+	.vstart = 0x0,
+	.vl = 0x0,
+	.vtype = 0x4,
+	.vcsr = 0x0,
+	.vlenb_mul = 0x1,
+	.vlenb_min = 0x0,
+	.vlenb_max = 0x0,
+	.spec = VECTOR_1_0,
+};
+
+/* invalid fractional LMUL for VLEN <= 256: LMUL= 1/8, SEW = 64 */
+FIXTURE_VARIANT_ADD(v_csr_invalid, frac_lmul1)
+{
+	.vstart = 0x0,
+	.vl = 0x0,
+	.vtype = 0x1d,
+	.vcsr = 0x0,
+	.vlenb_mul = 0x1,
+	.vlenb_min = 0x0,
+	.vlenb_max = 0x20,
+	.spec = VECTOR_1_0,
+};
+
+/* invalid integral LMUL for VLEN <= 16: LMUL= 2, SEW = 64 */
+FIXTURE_VARIANT_ADD(v_csr_invalid, int_lmul1)
+{
+	.vstart = 0x0,
+	.vl = 0x0,
+	.vtype = 0x19,
+	.vcsr = 0x0,
+	.vlenb_mul = 0x1,
+	.vlenb_min = 0x0,
+	.vlenb_max = 0x2,
+	.spec = VECTOR_1_0,
+};
+
+/* XTheadVector: invalid integral LMUL for VLEN <= 16: LMUL= 2, SEW = 64 */
+FIXTURE_VARIANT_ADD(v_csr_invalid, int_lmul2)
+{
+	.vstart = 0x0,
+	.vl = 0x0,
+	.vtype = 0xd,
+	.vcsr = 0x0,
+	.vlenb_mul = 0x1,
+	.vlenb_min = 0x0,
+	.vlenb_max = 0x2,
+	.spec = XTHEAD_VECTOR_0_7,
+};
+
+/* invalid VL for VLEN <= 128: LMUL= 2, SEW = 64, VL = 8 */
+FIXTURE_VARIANT_ADD(v_csr_invalid, vl1)
+{
+	.vstart = 0x0,
+	.vl = 0x8,
+	.vtype = 0x19,
+	.vcsr = 0x0,
+	.vlenb_mul = 0x1,
+	.vlenb_min = 0x0,
+	.vlenb_max = 0x10,
+	.spec = VECTOR_1_0,
+};
+
+/* XTheadVector: invalid VL for VLEN <= 128: LMUL= 2, SEW = 64, VL = 8 */
+FIXTURE_VARIANT_ADD(v_csr_invalid, vl2)
+{
+	.vstart = 0x0,
+	.vl = 0x8,
+	.vtype = 0xd,
+	.vcsr = 0x0,
+	.vlenb_mul = 0x1,
+	.vlenb_min = 0x0,
+	.vlenb_max = 0x10,
+	.spec = XTHEAD_VECTOR_0_7,
+};
+
+TEST_F(v_csr_invalid, ptrace_v_invalid_values)
+{
+	unsigned long vlenb;
+	pid_t pid;
+
+	if (!is_vector_supported() && !is_xtheadvector_supported())
+		SKIP(return, "Vectors not supported");
+
+	if (is_vector_supported() && !vector_test(variant->spec))
+		SKIP(return, "Test not supported for Vector");
+
+	if (is_xtheadvector_supported() && !xthead_test(variant->spec))
+		SKIP(return, "Test not supported for XTheadVector");
+
+	vlenb = get_vr_len();
+
+	if (variant->vlenb_min) {
+		if (vlenb < variant->vlenb_min)
+			SKIP(return, "This test does not support VLEN < %lu\n",
+			     variant->vlenb_min * 8);
+	}
+
+	if (variant->vlenb_max) {
+		if (vlenb > variant->vlenb_max)
+			SKIP(return, "This test does not support VLEN > %lu\n",
+			     variant->vlenb_max * 8);
+	}
+
+	chld_lock = 1;
+	pid = fork();
+	ASSERT_LE(0, pid)
+		TH_LOG("fork: %m");
+
+	if (pid == 0) {
+		unsigned long vl;
+
+		while (chld_lock == 1)
+			asm volatile("" : : "g"(chld_lock) : "memory");
+
+		if (is_xtheadvector_supported()) {
+			asm volatile (
+				// 0 | zimm[10:0] | rs1 | 1 1 1 | rd |1010111| vsetvli
+				// vsetvli	t4, x0, e16, m2, d1
+				".4byte		0b00000000010100000111111011010111\n"
+				"mv		%[new_vl], t4\n"
+				: [new_vl] "=r" (vl) : : "t4");
+		} else {
+			asm volatile (
+				".option push\n"
+				".option arch, +zve32x\n"
+				"vsetvli %[new_vl], x0, e16, m2, tu, mu\n"
+				".option pop\n"
+				: [new_vl] "=r"(vl) : : );
+		}
+
+		while (1) {
+			asm volatile (
+				".option push\n"
+				".option norvc\n"
+				"ebreak\n"
+				"nop\n"
+				".option pop\n");
+		}
+	} else {
+		struct __riscv_v_regset_state *regset_data;
+		size_t regset_size;
+		struct iovec iov;
+		int status;
+		int ret;
+
+		/* attach */
+
+		ASSERT_EQ(0, ptrace(PTRACE_ATTACH, pid, NULL, NULL));
+		ASSERT_EQ(pid, waitpid(pid, &status, 0));
+		ASSERT_TRUE(WIFSTOPPED(status));
+
+		/* unlock */
+
+		ASSERT_EQ(0, ptrace(PTRACE_POKEDATA, pid, &chld_lock, 0));
+
+		/* resume and wait for the 1st ebreak */
+
+		ASSERT_EQ(0, ptrace(PTRACE_CONT, pid, NULL, NULL));
+		ASSERT_EQ(pid, waitpid(pid, &status, 0));
+		ASSERT_TRUE(WIFSTOPPED(status));
+
+		/* read tracee vector csr regs using ptrace GETREGSET */
+
+		regset_size = sizeof(*regset_data) + vlenb * 32;
+		regset_data = calloc(1, regset_size);
+
+		iov.iov_base = regset_data;
+		iov.iov_len = regset_size;
+
+		ASSERT_EQ(0, ptrace(PTRACE_GETREGSET, pid, NT_RISCV_VECTOR, &iov));
+
+		/* verify initial vsetvli settings */
+
+		if (is_xtheadvector_supported())
+			EXPECT_EQ(5UL, regset_data->vtype);
+		else
+			EXPECT_EQ(9UL, regset_data->vtype);
+
+		EXPECT_EQ(regset_data->vlenb, regset_data->vl);
+		EXPECT_EQ(vlenb, regset_data->vlenb);
+		EXPECT_EQ(0UL, regset_data->vstart);
+		EXPECT_EQ(0UL, regset_data->vcsr);
+
+		/* apply invalid settings from fixture variants */
+
+		regset_data->vlenb *= variant->vlenb_mul;
+		regset_data->vstart = variant->vstart;
+		regset_data->vtype = variant->vtype;
+		regset_data->vcsr = variant->vcsr;
+		regset_data->vl = variant->vl;
+
+		iov.iov_base = regset_data;
+		iov.iov_len = regset_size;
+
+		errno = 0;
+		ret = ptrace(PTRACE_SETREGSET, pid, NT_RISCV_VECTOR, &iov);
+		ASSERT_EQ(errno, EINVAL);
+		ASSERT_EQ(ret, -1);
+
+		/* cleanup */
+
+		ASSERT_EQ(0, kill(pid, SIGKILL));
+	}
+}
+
 TEST_HARNESS_MAIN
-- 
cgit v1.2.3


From 849f05ae1ea6e1ff621243dce27fe455fdc9d0ff Mon Sep 17 00:00:00 2001
From: Sergey Matyukevich <geomatsi@gmail.com>
Date: Sun, 25 Jan 2026 21:09:57 -0700
Subject: selftests: riscv: verify ptrace accepts valid vector csr values

Add a test to v_ptrace test suite to verify that ptrace accepts the
valid input combinations of vector csr registers. Use kselftest
fixture variants to create multiple inputs for the test.

The test simulates a debug scenario with three breakpoints:
0. init: let the tracee set up its initial vector configuration
1. 1st bp:  modify the tracee's vector csr registers from the debugger
  - resume the tracee to execute a block without vector instructions
2. 2nd bp: read back the tracees's vector csr registers from the debugger
  - compare with values set by the debugger
  - resume the tracee to execute a block with vector instructions
3. 3rd bp: read back the tracess's vector csr registers again
  - compare with values set by the debugger

The last check helps to confirm that ptrace validation check for vector
csr registers input values works properly and maintains an accurate view
of the tracee's vector context in debugger.

Signed-off-by: Sergey Matyukevich <geomatsi@gmail.com>
Tested-by: Andy Chiu <andybnac@gmail.com>
Link: https://patch.msgid.link/20251214163537.1054292-10-geomatsi@gmail.com
[pjw@kernel.org: cleaned up a checkpatch issue]
Signed-off-by: Paul Walmsley <pjw@kernel.org>
---
 .../selftests/riscv/vector/validate_v_ptrace.c     | 261 +++++++++++++++++++++
 1 file changed, 261 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/riscv/vector/validate_v_ptrace.c b/tools/testing/selftests/riscv/vector/validate_v_ptrace.c
index 3919df68edc8..3589549f7228 100644
--- a/tools/testing/selftests/riscv/vector/validate_v_ptrace.c
+++ b/tools/testing/selftests/riscv/vector/validate_v_ptrace.c
@@ -651,4 +651,265 @@ TEST_F(v_csr_invalid, ptrace_v_invalid_values)
 	}
 }
 
+FIXTURE(v_csr_valid)
+{
+};
+
+FIXTURE_SETUP(v_csr_valid)
+{
+}
+
+FIXTURE_TEARDOWN(v_csr_valid)
+{
+}
+
+/* modifications of the initial vsetvli settings */
+FIXTURE_VARIANT(v_csr_valid)
+{
+	unsigned long vstart;
+	unsigned long vl;
+	unsigned long vtype;
+	unsigned long vcsr;
+	unsigned long vlenb_mul;
+	unsigned long vlenb_min;
+	unsigned long vlenb_max;
+	unsigned long spec;
+};
+
+/* valid for VLEN >= 128: LMUL= 1/4, SEW = 32 */
+FIXTURE_VARIANT_ADD(v_csr_valid, frac_lmul1)
+{
+	.vstart = 0x0,
+	.vl = 0x0,
+	.vtype = 0x16,
+	.vcsr = 0x0,
+	.vlenb_mul = 0x1,
+	.vlenb_min = 0x10,
+	.vlenb_max = 0x0,
+	.spec = VECTOR_1_0,
+};
+
+/* valid for VLEN >= 16: LMUL= 2, SEW = 32 */
+FIXTURE_VARIANT_ADD(v_csr_valid, int_lmul1)
+{
+	.vstart = 0x0,
+	.vl = 0x0,
+	.vtype = 0x11,
+	.vcsr = 0x0,
+	.vlenb_mul = 0x1,
+	.vlenb_min = 0x2,
+	.vlenb_max = 0x0,
+	.spec = VECTOR_1_0,
+};
+
+/* valid for XTheadVector VLEN >= 16: LMUL= 2, SEW = 32 */
+FIXTURE_VARIANT_ADD(v_csr_valid, int_lmul2)
+{
+	.vstart = 0x0,
+	.vl = 0x0,
+	.vtype = 0x9,
+	.vcsr = 0x0,
+	.vlenb_mul = 0x1,
+	.vlenb_min = 0x2,
+	.vlenb_max = 0x0,
+	.spec = XTHEAD_VECTOR_0_7,
+};
+
+/* valid for VLEN >= 32: LMUL= 2, SEW = 32, VL = 2 */
+FIXTURE_VARIANT_ADD(v_csr_valid, int_lmul3)
+{
+	.vstart = 0x0,
+	.vl = 0x2,
+	.vtype = 0x11,
+	.vcsr = 0x0,
+	.vlenb_mul = 0x1,
+	.vlenb_min = 0x4,
+	.vlenb_max = 0x0,
+	.spec = VECTOR_1_0,
+};
+
+TEST_F(v_csr_valid, ptrace_v_valid_values)
+{
+	unsigned long vlenb;
+	pid_t pid;
+
+	if (!is_vector_supported() && !is_xtheadvector_supported())
+		SKIP(return, "Vectors not supported");
+
+	if (is_vector_supported() && !vector_test(variant->spec))
+		SKIP(return, "Test not supported for Vector");
+
+	if (is_xtheadvector_supported() && !xthead_test(variant->spec))
+		SKIP(return, "Test not supported for XTheadVector");
+
+	vlenb = get_vr_len();
+
+	if (variant->vlenb_min) {
+		if (vlenb < variant->vlenb_min)
+			SKIP(return, "This test does not support VLEN < %lu\n",
+			     variant->vlenb_min * 8);
+	}
+	if (variant->vlenb_max) {
+		if (vlenb > variant->vlenb_max)
+			SKIP(return, "This test does not support VLEN > %lu\n",
+			     variant->vlenb_max * 8);
+	}
+
+	chld_lock = 1;
+	pid = fork();
+	ASSERT_LE(0, pid)
+		TH_LOG("fork: %m");
+
+	if (pid == 0) {
+		unsigned long vl;
+
+		while (chld_lock == 1)
+			asm volatile("" : : "g"(chld_lock) : "memory");
+
+		if (is_xtheadvector_supported()) {
+			asm volatile (
+				// 0 | zimm[10:0] | rs1 | 1 1 1 | rd |1010111| vsetvli
+				// vsetvli	t4, x0, e16, m2, d1
+				".4byte		0b00000000010100000111111011010111\n"
+				"mv		%[new_vl], t4\n"
+				: [new_vl] "=r" (vl) : : "t4");
+		} else {
+			asm volatile (
+				".option push\n"
+				".option arch, +zve32x\n"
+				"vsetvli %[new_vl], x0, e16, m2, tu, mu\n"
+				".option pop\n"
+				: [new_vl] "=r"(vl) : : );
+		}
+
+		asm volatile (
+			".option push\n"
+			".option norvc\n"
+			".option arch, +zve32x\n"
+			"ebreak\n" /* breakpoint 1: apply new V state using ptrace */
+			"nop\n"
+			"ebreak\n" /* breakpoint 2: V state clean - context will not be saved */
+			"vmv.v.i v0, -1\n"
+			"ebreak\n" /* breakpoint 3: V state dirty - context will be saved */
+			".option pop\n");
+	} else {
+		struct __riscv_v_regset_state *regset_data;
+		struct user_regs_struct regs;
+		size_t regset_size;
+		struct iovec iov;
+		int status;
+
+		/* attach */
+
+		ASSERT_EQ(0, ptrace(PTRACE_ATTACH, pid, NULL, NULL));
+		ASSERT_EQ(pid, waitpid(pid, &status, 0));
+		ASSERT_TRUE(WIFSTOPPED(status));
+
+		/* unlock */
+
+		ASSERT_EQ(0, ptrace(PTRACE_POKEDATA, pid, &chld_lock, 0));
+
+		/* resume and wait for the 1st ebreak */
+
+		ASSERT_EQ(0, ptrace(PTRACE_CONT, pid, NULL, NULL));
+		ASSERT_EQ(pid, waitpid(pid, &status, 0));
+		ASSERT_TRUE(WIFSTOPPED(status));
+
+		/* read tracee vector csr regs using ptrace GETREGSET */
+
+		regset_size = sizeof(*regset_data) + vlenb * 32;
+		regset_data = calloc(1, regset_size);
+
+		iov.iov_base = regset_data;
+		iov.iov_len = regset_size;
+
+		ASSERT_EQ(0, ptrace(PTRACE_GETREGSET, pid, NT_RISCV_VECTOR, &iov));
+
+		/* verify initial vsetvli settings */
+
+		if (is_xtheadvector_supported())
+			EXPECT_EQ(5UL, regset_data->vtype);
+		else
+			EXPECT_EQ(9UL, regset_data->vtype);
+
+		EXPECT_EQ(regset_data->vlenb, regset_data->vl);
+		EXPECT_EQ(vlenb, regset_data->vlenb);
+		EXPECT_EQ(0UL, regset_data->vstart);
+		EXPECT_EQ(0UL, regset_data->vcsr);
+
+		/* apply valid settings from fixture variants */
+
+		regset_data->vlenb *= variant->vlenb_mul;
+		regset_data->vstart = variant->vstart;
+		regset_data->vtype = variant->vtype;
+		regset_data->vcsr = variant->vcsr;
+		regset_data->vl = variant->vl;
+
+		iov.iov_base = regset_data;
+		iov.iov_len = regset_size;
+
+		ASSERT_EQ(0, ptrace(PTRACE_SETREGSET, pid, NT_RISCV_VECTOR, &iov));
+
+		/* skip 1st ebreak, then resume and wait for the 2nd ebreak */
+
+		iov.iov_base = &regs;
+		iov.iov_len = sizeof(regs);
+
+		ASSERT_EQ(0, ptrace(PTRACE_GETREGSET, pid, NT_PRSTATUS, &iov));
+		regs.pc += 4;
+		ASSERT_EQ(0, ptrace(PTRACE_SETREGSET, pid, NT_PRSTATUS, &iov));
+
+		ASSERT_EQ(0, ptrace(PTRACE_CONT, pid, NULL, NULL));
+		ASSERT_EQ(pid, waitpid(pid, &status, 0));
+		ASSERT_TRUE(WIFSTOPPED(status));
+
+		/* read tracee vector csr regs using ptrace GETREGSET */
+
+		iov.iov_base = regset_data;
+		iov.iov_len = regset_size;
+
+		ASSERT_EQ(0, ptrace(PTRACE_GETREGSET, pid, NT_RISCV_VECTOR, &iov));
+
+		/* verify vector csr regs from tracee context */
+
+		EXPECT_EQ(regset_data->vstart, variant->vstart);
+		EXPECT_EQ(regset_data->vtype, variant->vtype);
+		EXPECT_EQ(regset_data->vcsr, variant->vcsr);
+		EXPECT_EQ(regset_data->vl, variant->vl);
+		EXPECT_EQ(regset_data->vlenb, vlenb);
+
+		/* skip 2nd ebreak, then resume and wait for the 3rd ebreak */
+
+		iov.iov_base = &regs;
+		iov.iov_len = sizeof(regs);
+
+		ASSERT_EQ(0, ptrace(PTRACE_GETREGSET, pid, NT_PRSTATUS, &iov));
+		regs.pc += 4;
+		ASSERT_EQ(0, ptrace(PTRACE_SETREGSET, pid, NT_PRSTATUS, &iov));
+
+		ASSERT_EQ(0, ptrace(PTRACE_CONT, pid, NULL, NULL));
+		ASSERT_EQ(pid, waitpid(pid, &status, 0));
+		ASSERT_TRUE(WIFSTOPPED(status));
+
+		/* read tracee vector csr regs using ptrace GETREGSET */
+
+		iov.iov_base = regset_data;
+		iov.iov_len = regset_size;
+
+		ASSERT_EQ(0, ptrace(PTRACE_GETREGSET, pid, NT_RISCV_VECTOR, &iov));
+
+		/* verify vector csr regs from tracee context */
+
+		EXPECT_EQ(regset_data->vstart, variant->vstart);
+		EXPECT_EQ(regset_data->vtype, variant->vtype);
+		EXPECT_EQ(regset_data->vcsr, variant->vcsr);
+		EXPECT_EQ(regset_data->vl, variant->vl);
+		EXPECT_EQ(regset_data->vlenb, vlenb);
+
+		/* cleanup */
+
+		ASSERT_EQ(0, kill(pid, SIGKILL));
+	}
+}
+
 TEST_HARNESS_MAIN
-- 
cgit v1.2.3


From 098921ec6818291d98bd3a4002c9dfbe2e75aac2 Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <linux@weissschuh.net>
Date: Sun, 25 Jan 2026 21:09:57 -0700
Subject: selftests: riscv: vstate_exec_nolibc: Use the regular prctl()
 function
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The my_syscall*() macros are internal implementation details of nolibc.

Now that nolibc has a normal prctl() function, use that.

Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
Link: https://patch.msgid.link/20260117-nolibc-mysyscall-riscv-v1-1-0ae1ae3513e9@weissschuh.net
Signed-off-by: Paul Walmsley <pjw@kernel.org>
---
 tools/testing/selftests/riscv/vector/vstate_exec_nolibc.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/riscv/vector/vstate_exec_nolibc.c b/tools/testing/selftests/riscv/vector/vstate_exec_nolibc.c
index 7b7d6f21acb4..12f1b1b1c7aa 100644
--- a/tools/testing/selftests/riscv/vector/vstate_exec_nolibc.c
+++ b/tools/testing/selftests/riscv/vector/vstate_exec_nolibc.c
@@ -16,10 +16,10 @@ int main(int argc, char **argv)
 	if (argc > 2 && strcmp(argv[2], "x"))
 		xtheadvector = 1;
 
-	ctrl = my_syscall1(__NR_prctl, PR_RISCV_V_GET_CONTROL);
-	if (ctrl < 0) {
+	ctrl = prctl(PR_RISCV_V_GET_CONTROL, 0, 0, 0, 0);
+	if (ctrl == -1) {
 		puts("PR_RISCV_V_GET_CONTROL is not supported\n");
-		return ctrl;
+		exit(-1);
 	}
 
 	if (test_inherit) {
@@ -51,7 +51,7 @@ int main(int argc, char **argv)
 		}
 
 		if (!pid) {
-			rc = my_syscall1(__NR_prctl, PR_RISCV_V_GET_CONTROL);
+			rc = prctl(PR_RISCV_V_GET_CONTROL, 0, 0, 0, 0);
 			if (rc != ctrl) {
 				puts("child's vstate_ctrl not equal to parent's\n");
 				exit(-1);
-- 
cgit v1.2.3


From 5374c334d64f8e1dfc4aadcbcd3a1090fbe39acb Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Fri, 6 Feb 2026 16:35:02 -0800
Subject: tools: ynltool: factor out qstat dumping

The logic to open a socket and dump the queues is the same
across sub-commands. Factor it out, we'll need it again.

No functional changes intended.

Reviewed-by: Petr Machata <petrm@nvidia.com>
Link: https://patch.msgid.link/20260207003509.3927744-3-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/net/ynl/ynltool/qstats.c | 95 ++++++++++++++++++------------------------
 1 file changed, 41 insertions(+), 54 deletions(-)

(limited to 'tools')

diff --git a/tools/net/ynl/ynltool/qstats.c b/tools/net/ynl/ynltool/qstats.c
index 31fb45709ffa..d19acab0bf2a 100644
--- a/tools/net/ynl/ynltool/qstats.c
+++ b/tools/net/ynl/ynltool/qstats.c
@@ -237,13 +237,47 @@ static void print_plain_qstats(struct netdev_qstats_get_list *qstats)
 	}
 }
 
-static int do_show(int argc, char **argv)
+static struct netdev_qstats_get_list *
+qstats_dump(enum netdev_qstats_scope scope)
 {
 	struct netdev_qstats_get_list *qstats;
 	struct netdev_qstats_get_req *req;
 	struct ynl_error yerr;
 	struct ynl_sock *ys;
-	int ret = 0;
+
+	ys = ynl_sock_create(&ynl_netdev_family, &yerr);
+	if (!ys) {
+		p_err("YNL: %s", yerr.msg);
+		return NULL;
+	}
+
+	req = netdev_qstats_get_req_alloc();
+	if (!req) {
+		p_err("failed to allocate qstats request");
+		goto err_close;
+	}
+
+	if (scope)
+		netdev_qstats_get_req_set_scope(req, scope);
+
+	qstats = netdev_qstats_get_dump(ys, req);
+	netdev_qstats_get_req_free(req);
+	if (!qstats) {
+		p_err("failed to get queue stats: %s", ys->err.msg);
+		goto err_close;
+	}
+
+	ynl_sock_destroy(ys);
+	return qstats;
+
+err_close:
+	ynl_sock_destroy(ys);
+	return NULL;
+}
+
+static int do_show(int argc, char **argv)
+{
+	struct netdev_qstats_get_list *qstats;
 
 	/* Parse options */
 	while (argc > 0) {
@@ -268,29 +302,9 @@ static int do_show(int argc, char **argv)
 		}
 	}
 
-	ys = ynl_sock_create(&ynl_netdev_family, &yerr);
-	if (!ys) {
-		p_err("YNL: %s", yerr.msg);
+	qstats = qstats_dump(scope);
+	if (!qstats)
 		return -1;
-	}
-
-	req = netdev_qstats_get_req_alloc();
-	if (!req) {
-		p_err("failed to allocate qstats request");
-		ret = -1;
-		goto exit_close;
-	}
-
-	if (scope)
-		netdev_qstats_get_req_set_scope(req, scope);
-
-	qstats = netdev_qstats_get_dump(ys, req);
-	netdev_qstats_get_req_free(req);
-	if (!qstats) {
-		p_err("failed to get queue stats: %s", ys->err.msg);
-		ret = -1;
-		goto exit_close;
-	}
 
 	/* Print the stats as returned by the kernel */
 	if (json_output)
@@ -299,9 +313,7 @@ static int do_show(int argc, char **argv)
 		print_plain_qstats(qstats);
 
 	netdev_qstats_get_list_free(qstats);
-exit_close:
-	ynl_sock_destroy(ys);
-	return ret;
+	return 0;
 }
 
 static void compute_stats(__u64 *values, unsigned int count,
@@ -406,10 +418,7 @@ static int cmp_ifindex_type(const void *a, const void *b)
 static int do_balance(int argc, char **argv __attribute__((unused)))
 {
 	struct netdev_qstats_get_list *qstats;
-	struct netdev_qstats_get_req *req;
 	struct netdev_qstats_get_rsp **sorted;
-	struct ynl_error yerr;
-	struct ynl_sock *ys;
 	unsigned int count = 0;
 	unsigned int i, j;
 	int ret = 0;
@@ -419,29 +428,9 @@ static int do_balance(int argc, char **argv __attribute__((unused)))
 		return -1;
 	}
 
-	ys = ynl_sock_create(&ynl_netdev_family, &yerr);
-	if (!ys) {
-		p_err("YNL: %s", yerr.msg);
+	qstats = qstats_dump(NETDEV_QSTATS_SCOPE_QUEUE);
+	if (!qstats)
 		return -1;
-	}
-
-	req = netdev_qstats_get_req_alloc();
-	if (!req) {
-		p_err("failed to allocate qstats request");
-		ret = -1;
-		goto exit_close;
-	}
-
-	/* Always use queue scope for balance analysis */
-	netdev_qstats_get_req_set_scope(req, NETDEV_QSTATS_SCOPE_QUEUE);
-
-	qstats = netdev_qstats_get_dump(ys, req);
-	netdev_qstats_get_req_free(req);
-	if (!qstats) {
-		p_err("failed to get queue stats: %s", ys->err.msg);
-		ret = -1;
-		goto exit_close;
-	}
 
 	/* Count and sort queues */
 	ynl_dump_foreach(qstats, qs)
@@ -576,8 +565,6 @@ exit_free_sorted:
 	free(sorted);
 exit_free_qstats:
 	netdev_qstats_get_list_free(qstats);
-exit_close:
-	ynl_sock_destroy(ys);
 	return ret;
 }
 
-- 
cgit v1.2.3


From c61a375315c0374134b9ad883f0c64c982c2016b Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Fri, 6 Feb 2026 16:35:03 -0800
Subject: tools: ynltool: add qstats analysis for HW-GRO efficiency / savings

Extend ynltool to compute HW GRO savings metric - how many
packets has HW GRO been able to save the kernel from seeing.

Note that this definition does not actually take into account
whether the segments were or weren't eligible for HW GRO.
If a machine is receiving all-UDP traffic - new metric will show
HW-GRO savings of 0%. Conversely since the super-packet still
counts as a received packet, savings of 100% is not achievable.
Perfect HW-GRO on a machine with 4k MTU and 64kB super-frames
would show ~93.75% savings. With 1.5k MTU we may see up to
~97.8% savings (if my math is right).

Example after 10 sec of iperf on a freshly booted machine
with 1.5k MTU:

  $ ynltool qstats show
  eth0     rx-packets:  40681280               rx-bytes:   61575208437
        rx-alloc-fail:         0      rx-hw-gro-packets:       1225133
                                 rx-hw-gro-wire-packets:      40656633
  $ ynltool qstats hw-gro
  eth0: 96.9% savings

None of the NICs I have access to can report "missed" HW-GRO
opportunities so computing a true "effectiveness" metric
is not possible. One could also argue that effectiveness metric
is inferior in environments where we control both senders and
receivers, the savings metrics will capture both regressions
in receiver's HW GRO effectiveness but also regressions in senders
sending smaller TSO trains. And we care about both. The main
downside is that it's hard to tell at a glance how well the NIC
is doing because the savings will be dependent on traffic patterns.

Reviewed-by: Petr Machata <petrm@nvidia.com>
Link: https://patch.msgid.link/20260207003509.3927744-4-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/net/ynl/ynltool/qstats.c | 76 +++++++++++++++++++++++++++++++++++++++---
 1 file changed, 71 insertions(+), 5 deletions(-)

(limited to 'tools')

diff --git a/tools/net/ynl/ynltool/qstats.c b/tools/net/ynl/ynltool/qstats.c
index d19acab0bf2a..a6c28ba4f25c 100644
--- a/tools/net/ynl/ynltool/qstats.c
+++ b/tools/net/ynl/ynltool/qstats.c
@@ -568,6 +568,65 @@ exit_free_qstats:
 	return ret;
 }
 
+static int do_hw_gro(int argc, char **argv __attribute__((unused)))
+{
+	struct netdev_qstats_get_list *qstats;
+
+	if (argc > 0) {
+		p_err("hw-gro command takes no arguments");
+		return -1;
+	}
+
+	qstats = qstats_dump(0);
+	if (!qstats)
+		return -1;
+
+	if (json_output)
+		jsonw_start_array(json_wtr);
+
+	ynl_dump_foreach(qstats, qs) {
+		char ifname[IF_NAMESIZE];
+		const char *name;
+		double savings;
+
+		if (!qs->_present.rx_packets ||
+		    !qs->_present.rx_hw_gro_packets ||
+		    !qs->_present.rx_hw_gro_wire_packets)
+			continue;
+
+		if (!qs->rx_packets)
+			continue;
+
+		/* How many skbs did we avoid allocating thanks to HW GRO */
+		savings = (double)(qs->rx_hw_gro_wire_packets -
+				   qs->rx_hw_gro_packets) /
+			qs->rx_packets * 100.0;
+
+		name = if_indextoname(qs->ifindex, ifname);
+
+		if (json_output) {
+			jsonw_start_object(json_wtr);
+			jsonw_uint_field(json_wtr, "ifindex", qs->ifindex);
+			if (name)
+				jsonw_string_field(json_wtr, "ifname", name);
+			jsonw_float_field(json_wtr, "savings", savings);
+			jsonw_end_object(json_wtr);
+		} else {
+			if (name)
+				printf("%s", name);
+			else
+				printf("ifindex:%u", qs->ifindex);
+			printf(": %.1f%% savings\n", savings);
+		}
+	}
+
+	if (json_output)
+		jsonw_end_array(json_wtr);
+
+	netdev_qstats_get_list_free(qstats);
+	return 0;
+}
+
 static int do_help(int argc __attribute__((unused)),
 		   char **argv __attribute__((unused)))
 {
@@ -577,9 +636,10 @@ static int do_help(int argc __attribute__((unused)),
 	}
 
 	fprintf(stderr,
-		"Usage: %s qstats { COMMAND | help }\n"
-		"       %s qstats [ show ] [ OPTIONS ]\n"
-		"       %s qstats balance\n"
+		"Usage: %1$s qstats { COMMAND | help }\n"
+		"       %1$s qstats [ show ] [ OPTIONS ]\n"
+		"       %1$s qstats balance\n"
+		"       %1$s qstats hw-gro\n"
 		"\n"
 		"       OPTIONS := { scope queue | group-by { device | queue } }\n"
 		"\n"
@@ -588,9 +648,14 @@ static int do_help(int argc __attribute__((unused)),
 		"       show scope queue      - Display per-queue statistics\n"
 		"       show group-by device  - Display device-aggregated statistics (default)\n"
 		"       show group-by queue   - Display per-queue statistics\n"
-		"       balance               - Analyze traffic distribution balance.\n"
+		"\n"
+		"  Analysis:\n"
+		"       balance               - Traffic distribution between queues.\n"
+		"       hw-gro                - HW GRO effectiveness analysis\n"
+		"                               - savings - delta between packets received\n"
+		"                                 on the wire and packets seen by the kernel.\n"
 		"",
-		bin_name, bin_name, bin_name);
+		bin_name);
 
 	return 0;
 }
@@ -598,6 +663,7 @@ static int do_help(int argc __attribute__((unused)),
 static const struct cmd qstats_cmds[] = {
 	{ "show",	do_show },
 	{ "balance",	do_balance },
+	{ "hw-gro",	do_hw_gro },
 	{ "help",	do_help },
 	{ 0 }
 };
-- 
cgit v1.2.3


From c431b00ca6afc5da3133636ecc34ee7edd38d6cc Mon Sep 17 00:00:00 2001
From: Miguel Ojeda <ojeda@kernel.org>
Date: Fri, 6 Feb 2026 21:43:36 +0100
Subject: objtool/rust: add one more `noreturn` Rust function

`objtool` with Rust 1.84.0 reports:

    rust/kernel.o: error: objtool: _RNvXNtNtCsaRPFapPOzLs_6kernel3str9parse_intaNtNtB2_7private12FromStrRadix14from_str_radix()
    falls through to next function _RNvXNtNtCsaRPFapPOzLs_6kernel3str9parse_intaNtNtB2_7private12FromStrRadix16from_u64_negated()

This is very similar to commit c18f35e49049 ("objtool/rust: add one more
`noreturn` Rust function"), which added `from_ascii_radix_panic` for Rust
1.86.0, except that Rust 1.84.0 ends up needing `from_str_radix_panic`.

Thus add it to the list to fix the warning.

Cc: FUJITA Tomonori <fujita.tomonori@gmail.com>
Fixes: 51d9ee90ea90 ("rust: str: add radix prefixed integer parsing functions")
Reported-by: Alice Ryhl <aliceryhl@google.com>
Link: https://rust-for-linux.zulipchat.com/#narrow/channel/291565/topic/x/with/572427627
Tested-by: Alice Ryhl <aliceryhl@google.com>
Link: https://patch.msgid.link/20260206204336.38462-1-ojeda@kernel.org
Signed-off-by: Miguel Ojeda <ojeda@kernel.org>
---
 tools/objtool/check.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index 3fd98c5b6e1a..37ec0d757e9b 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -197,7 +197,8 @@ static bool is_rust_noreturn(const struct symbol *func)
 	 * as well as changes to the source code itself between versions (since
 	 * these come from the Rust standard library).
 	 */
-	return str_ends_with(func->name, "_4core3num22from_ascii_radix_panic")				||
+	return str_ends_with(func->name, "_4core3num20from_str_radix_panic")				||
+	       str_ends_with(func->name, "_4core3num22from_ascii_radix_panic")				||
 	       str_ends_with(func->name, "_4core5sliceSp15copy_from_slice17len_mismatch_fail")		||
 	       str_ends_with(func->name, "_4core6option13expect_failed")				||
 	       str_ends_with(func->name, "_4core6option13unwrap_failed")				||
-- 
cgit v1.2.3


From c01a6c700fd54dd775020a8ddfe69dedeaca73cc Mon Sep 17 00:00:00 2001
From: Felix Maurer <fmaurer@redhat.com>
Date: Thu, 5 Feb 2026 14:57:28 +0100
Subject: selftests: hsr: Add ping test for PRP

Add a selftest for PRP that performs a basic ping test on IPv4 and IPv6,
over the plain PRP interface and a VLAN interface, similar to the existing
ping test for HSR. The test first checks reachability of the other node,
then checks for no loss and no duplicates.

Signed-off-by: Felix Maurer <fmaurer@redhat.com>
Reviewed-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Link: https://patch.msgid.link/4a342189e842d7308d037da72af566729ee75834.1770299429.git.fmaurer@redhat.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 tools/testing/selftests/net/hsr/Makefile    |   1 +
 tools/testing/selftests/net/hsr/prp_ping.sh | 147 ++++++++++++++++++++++++++++
 2 files changed, 148 insertions(+)
 create mode 100755 tools/testing/selftests/net/hsr/prp_ping.sh

(limited to 'tools')

diff --git a/tools/testing/selftests/net/hsr/Makefile b/tools/testing/selftests/net/hsr/Makefile
index 4b6afc0fe9f8..1886f345897a 100644
--- a/tools/testing/selftests/net/hsr/Makefile
+++ b/tools/testing/selftests/net/hsr/Makefile
@@ -5,6 +5,7 @@ top_srcdir = ../../../../..
 TEST_PROGS := \
 	hsr_ping.sh \
 	hsr_redbox.sh \
+	prp_ping.sh \
 # end of TEST_PROGS
 
 TEST_FILES += hsr_common.sh
diff --git a/tools/testing/selftests/net/hsr/prp_ping.sh b/tools/testing/selftests/net/hsr/prp_ping.sh
new file mode 100755
index 000000000000..fd2ba9f05d4c
--- /dev/null
+++ b/tools/testing/selftests/net/hsr/prp_ping.sh
@@ -0,0 +1,147 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+ipv6=true
+
+source ./hsr_common.sh
+
+optstring="h4"
+usage() {
+	echo "Usage: $0 [OPTION]"
+	echo -e "\t-4: IPv4 only: disable IPv6 tests (default: test both IPv4 and IPv6)"
+}
+
+while getopts "$optstring" option;do
+	case "$option" in
+	"h")
+		usage "$0"
+		exit 0
+		;;
+	"4")
+		ipv6=false
+		;;
+	"?")
+		usage "$0"
+		exit 1
+		;;
+esac
+done
+
+setup_prp_interfaces()
+{
+	echo "INFO: Preparing interfaces for PRP"
+# Two PRP nodes, connected by two links (treated as LAN A and LAN B).
+#
+#       vethA ----- vethA
+#     prp1             prp2
+#       vethB ----- vethB
+#
+#     node1           node2
+
+	# Interfaces
+	# shellcheck disable=SC2154 # variables assigned by setup_ns
+	ip link add vethA netns "$node1" type veth peer name vethA netns "$node2"
+	ip link add vethB netns "$node1" type veth peer name vethB netns "$node2"
+
+	# MAC addresses will be copied from LAN A interface
+	ip -net "$node1" link set address 00:11:22:00:00:01 dev vethA
+	ip -net "$node2" link set address 00:11:22:00:00:02 dev vethA
+
+	# PRP
+	ip -net "$node1" link add name prp1 type hsr \
+		slave1 vethA slave2 vethB supervision 45 proto 1
+	ip -net "$node2" link add name prp2 type hsr \
+		slave1 vethA slave2 vethB supervision 45 proto 1
+
+	# IP addresses
+	ip -net "$node1" addr add 100.64.0.1/24 dev prp1
+	ip -net "$node1" addr add dead:beef:0::1/64 dev prp1 nodad
+	ip -net "$node2" addr add 100.64.0.2/24 dev prp2
+	ip -net "$node2" addr add dead:beef:0::2/64 dev prp2 nodad
+
+	# All links up
+	ip -net "$node1" link set vethA up
+	ip -net "$node1" link set vethB up
+	ip -net "$node1" link set prp1 up
+
+	ip -net "$node2" link set vethA up
+	ip -net "$node2" link set vethB up
+	ip -net "$node2" link set prp2 up
+}
+
+setup_vlan_interfaces()
+{
+	# Interfaces
+	ip -net "$node1" link add link prp1 name prp1.2 type vlan id 2
+	ip -net "$node2" link add link prp2 name prp2.2 type vlan id 2
+
+	# IP addresses
+	ip -net "$node1" addr add 100.64.2.1/24 dev prp1.2
+	ip -net "$node1" addr add dead:beef:2::1/64 dev prp1.2 nodad
+
+	ip -net "$node2" addr add 100.64.2.2/24 dev prp2.2
+	ip -net "$node2" addr add dead:beef:2::2/64 dev prp2.2 nodad
+
+	# All links up
+	ip -net "$node1" link set prp1.2 up
+	ip -net "$node2" link set prp2.2 up
+}
+
+do_ping_tests()
+{
+	local netid="$1"
+
+	echo "INFO: Initial validation ping"
+
+	do_ping "$node1" "100.64.$netid.2"
+	do_ping "$node2" "100.64.$netid.1"
+	stop_if_error "Initial validation failed on IPv4"
+
+	do_ping "$node1" "dead:beef:$netid::2"
+	do_ping "$node2" "dead:beef:$netid::1"
+	stop_if_error "Initial validation failed on IPv6"
+
+	echo "INFO: Longer ping test."
+
+	do_ping_long "$node1" "100.64.$netid.2"
+	do_ping_long "$node2" "100.64.$netid.1"
+	stop_if_error "Longer ping test failed on IPv4."
+
+	do_ping_long "$node1" "dead:beef:$netid::2"
+	do_ping_long "$node2" "dead:beef:$netid::1"
+	stop_if_error "Longer ping test failed on IPv6."
+}
+
+run_ping_tests()
+{
+	echo "INFO: Running ping tests"
+	do_ping_tests 0
+}
+
+run_vlan_ping_tests()
+{
+	vlan_challenged_prp1=$(ip net exec "$node1" ethtool -k prp1 | \
+		grep "vlan-challenged" | awk '{print $2}')
+	vlan_challenged_prp2=$(ip net exec "$node2" ethtool -k prp2 | \
+		grep "vlan-challenged" | awk '{print $2}')
+
+	if [[ "$vlan_challenged_prp1" = "off" || \
+	      "$vlan_challenged_prp2" = "off" ]]; then
+		echo "INFO: Running VLAN ping tests"
+		setup_vlan_interfaces
+		do_ping_tests 2
+	else
+		echo "INFO: Not Running VLAN tests as the device does not support VLAN"
+	fi
+}
+
+check_prerequisites
+trap cleanup_all_ns EXIT
+
+setup_ns node1 node2
+setup_prp_interfaces
+
+run_ping_tests
+run_vlan_ping_tests
+
+exit $ret
-- 
cgit v1.2.3


From 776b64ba12e7e2be393b3df07979c825fed47931 Mon Sep 17 00:00:00 2001
From: Felix Maurer <fmaurer@redhat.com>
Date: Thu, 5 Feb 2026 14:57:29 +0100
Subject: selftests: hsr: Check duplicates on HSR with VLAN

Previously the hsr_ping test only checked that all nodes in a VLAN are
reachable (using do_ping). Update the test to also check that there is no
packet loss and no duplicate packets by running the same tests for VLANs as
without VLANs (including using do_ping_long). This also adds tests for IPv6
over VLAN. To unify the test code, the topology without VLANs now uses IP
addresses from dead:beef:0::/64 to align with the 100.64.0.0/24 range for
IPv4. Error messages are updated across the board to make it easier to find
what actually failed.

Also update the VLAN test to only run in VLAN 2, as there is no need to
check if ping really works with VLAN IDs 2, 3, 4, and 5. This lowers the
number of long ping tests on VLANs to keep the overall test runtime in
bounds.

It's still necessary to bump the test timeout a bit, though: a ping long
tests takes 1sec, do_ping_tests performs 12 of them, do_link_problem_tests
6, and the VLAN tests again 12. With some buffer for setup and waiting and
for two protocol versions, 90sec timeout seems reasonable.

Signed-off-by: Felix Maurer <fmaurer@redhat.com>
Reviewed-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Link: https://patch.msgid.link/e3ded0e2547b5f720524b62fabeb96debc579697.1770299429.git.fmaurer@redhat.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 tools/testing/selftests/net/hsr/hsr_ping.sh | 188 ++++++++++------------------
 tools/testing/selftests/net/hsr/settings    |   2 +-
 2 files changed, 70 insertions(+), 120 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/hsr/hsr_ping.sh b/tools/testing/selftests/net/hsr/hsr_ping.sh
index 5a65f4f836be..ebee4a18fc67 100755
--- a/tools/testing/selftests/net/hsr/hsr_ping.sh
+++ b/tools/testing/selftests/net/hsr/hsr_ping.sh
@@ -27,31 +27,34 @@ while getopts "$optstring" option;do
 esac
 done
 
-do_complete_ping_test()
+do_ping_tests()
 {
-	echo "INFO: Initial validation ping."
-	# Each node has to be able each one.
-	do_ping "$ns1" 100.64.0.2
-	do_ping "$ns2" 100.64.0.1
-	do_ping "$ns3" 100.64.0.1
-	stop_if_error "Initial validation failed."
-
-	do_ping "$ns1" 100.64.0.3
-	do_ping "$ns2" 100.64.0.3
-	do_ping "$ns3" 100.64.0.2
+	local netid="$1"
 
-	do_ping "$ns1" dead:beef:1::2
-	do_ping "$ns1" dead:beef:1::3
-	do_ping "$ns2" dead:beef:1::1
-	do_ping "$ns2" dead:beef:1::2
-	do_ping "$ns3" dead:beef:1::1
-	do_ping "$ns3" dead:beef:1::2
+	echo "INFO: Running ping tests."
 
-	stop_if_error "Initial validation failed."
+	echo "INFO: Initial validation ping."
+	# Each node has to be able to reach each one.
+	do_ping "$ns1" "100.64.$netid.2"
+	do_ping "$ns1" "100.64.$netid.3"
+	do_ping "$ns2" "100.64.$netid.1"
+	do_ping "$ns2" "100.64.$netid.3"
+	do_ping "$ns3" "100.64.$netid.1"
+	do_ping "$ns3" "100.64.$netid.2"
+	stop_if_error "Initial validation failed on IPv4."
+
+	do_ping "$ns1" "dead:beef:$netid::2"
+	do_ping "$ns1" "dead:beef:$netid::3"
+	do_ping "$ns2" "dead:beef:$netid::1"
+	do_ping "$ns2" "dead:beef:$netid::2"
+	do_ping "$ns3" "dead:beef:$netid::1"
+	do_ping "$ns3" "dead:beef:$netid::2"
+	stop_if_error "Initial validation failed on IPv6."
 
 # Wait until supervisor all supervision frames have been processed and the node
 # entries have been merged. Otherwise duplicate frames will be observed which is
 # valid at this stage.
+	echo "INFO: Wait for node table entries to be merged."
 	WAIT=5
 	while [ ${WAIT} -gt 0 ]
 	do
@@ -68,24 +71,28 @@ do_complete_ping_test()
 	sleep 1
 
 	echo "INFO: Longer ping test."
-	do_ping_long "$ns1" 100.64.0.2
-	do_ping_long "$ns1" dead:beef:1::2
-	do_ping_long "$ns1" 100.64.0.3
-	do_ping_long "$ns1" dead:beef:1::3
-
-	stop_if_error "Longer ping test failed."
-
-	do_ping_long "$ns2" 100.64.0.1
-	do_ping_long "$ns2" dead:beef:1::1
-	do_ping_long "$ns2" 100.64.0.3
-	do_ping_long "$ns2" dead:beef:1::2
-	stop_if_error "Longer ping test failed."
+	do_ping_long "$ns1" "100.64.$netid.2"
+	do_ping_long "$ns1" "dead:beef:$netid::2"
+	do_ping_long "$ns1" "100.64.$netid.3"
+	do_ping_long "$ns1" "dead:beef:$netid::3"
+	stop_if_error "Longer ping test failed (ns1)."
+
+	do_ping_long "$ns2" "100.64.$netid.1"
+	do_ping_long "$ns2" "dead:beef:$netid::1"
+	do_ping_long "$ns2" "100.64.$netid.3"
+	do_ping_long "$ns2" "dead:beef:$netid::3"
+	stop_if_error "Longer ping test failed (ns2)."
+
+	do_ping_long "$ns3" "100.64.$netid.1"
+	do_ping_long "$ns3" "dead:beef:$netid::1"
+	do_ping_long "$ns3" "100.64.$netid.2"
+	do_ping_long "$ns3" "dead:beef:$netid::2"
+	stop_if_error "Longer ping test failed (ns3)."
+}
 
-	do_ping_long "$ns3" 100.64.0.1
-	do_ping_long "$ns3" dead:beef:1::1
-	do_ping_long "$ns3" 100.64.0.2
-	do_ping_long "$ns3" dead:beef:1::2
-	stop_if_error "Longer ping test failed."
+do_link_problem_tests()
+{
+	echo "INFO: Running link problem tests."
 
 	echo "INFO: Cutting one link."
 	do_ping_long "$ns1" 100.64.0.3 &
@@ -104,26 +111,22 @@ do_complete_ping_test()
 
 	do_ping_long "$ns1" 100.64.0.2
 	do_ping_long "$ns1" 100.64.0.3
-
-	stop_if_error "Failed with delay and packetloss."
+	stop_if_error "Failed with delay and packetloss (ns1)."
 
 	do_ping_long "$ns2" 100.64.0.1
 	do_ping_long "$ns2" 100.64.0.3
-
-	stop_if_error "Failed with delay and packetloss."
+	stop_if_error "Failed with delay and packetloss (ns2)."
 
 	do_ping_long "$ns3" 100.64.0.1
 	do_ping_long "$ns3" 100.64.0.2
-	stop_if_error "Failed with delay and packetloss."
-
-	echo "INFO: All good."
+	stop_if_error "Failed with delay and packetloss (ns3)."
 }
 
 setup_hsr_interfaces()
 {
 	local HSRv="$1"
 
-	echo "INFO: preparing interfaces for HSRv${HSRv}."
+	echo "INFO: Preparing interfaces for HSRv${HSRv}."
 # Three HSR nodes. Each node has one link to each of its neighbour, two links in total.
 #
 #    ns1eth1 ----- ns2eth1
@@ -140,17 +143,20 @@ setup_hsr_interfaces()
 	ip link add ns3eth2 netns "$ns3" type veth peer name ns2eth2 netns "$ns2"
 
 	# HSRv0/1
-	ip -net "$ns1" link add name hsr1 type hsr slave1 ns1eth1 slave2 ns1eth2 supervision 45 version $HSRv proto 0
-	ip -net "$ns2" link add name hsr2 type hsr slave1 ns2eth1 slave2 ns2eth2 supervision 45 version $HSRv proto 0
-	ip -net "$ns3" link add name hsr3 type hsr slave1 ns3eth1 slave2 ns3eth2 supervision 45 version $HSRv proto 0
+	ip -net "$ns1" link add name hsr1 type hsr slave1 ns1eth1 \
+		slave2 ns1eth2 supervision 45 version "$HSRv" proto 0
+	ip -net "$ns2" link add name hsr2 type hsr slave1 ns2eth1 \
+		slave2 ns2eth2 supervision 45 version "$HSRv" proto 0
+	ip -net "$ns3" link add name hsr3 type hsr slave1 ns3eth1 \
+		slave2 ns3eth2 supervision 45 version "$HSRv" proto 0
 
 	# IP for HSR
 	ip -net "$ns1" addr add 100.64.0.1/24 dev hsr1
-	ip -net "$ns1" addr add dead:beef:1::1/64 dev hsr1 nodad
+	ip -net "$ns1" addr add dead:beef:0::1/64 dev hsr1 nodad
 	ip -net "$ns2" addr add 100.64.0.2/24 dev hsr2
-	ip -net "$ns2" addr add dead:beef:1::2/64 dev hsr2 nodad
+	ip -net "$ns2" addr add dead:beef:0::2/64 dev hsr2 nodad
 	ip -net "$ns3" addr add 100.64.0.3/24 dev hsr3
-	ip -net "$ns3" addr add dead:beef:1::3/64 dev hsr3 nodad
+	ip -net "$ns3" addr add dead:beef:0::3/64 dev hsr3 nodad
 
 	ip -net "$ns1" link set address 00:11:22:00:01:01 dev ns1eth1
 	ip -net "$ns1" link set address 00:11:22:00:01:02 dev ns1eth2
@@ -177,85 +183,33 @@ setup_hsr_interfaces()
 
 setup_vlan_interfaces() {
 	ip -net "$ns1" link add link hsr1 name hsr1.2 type vlan id 2
-	ip -net "$ns1" link add link hsr1 name hsr1.3 type vlan id 3
-	ip -net "$ns1" link add link hsr1 name hsr1.4 type vlan id 4
-	ip -net "$ns1" link add link hsr1 name hsr1.5 type vlan id 5
-
 	ip -net "$ns2" link add link hsr2 name hsr2.2 type vlan id 2
-	ip -net "$ns2" link add link hsr2 name hsr2.3 type vlan id 3
-	ip -net "$ns2" link add link hsr2 name hsr2.4 type vlan id 4
-	ip -net "$ns2" link add link hsr2 name hsr2.5 type vlan id 5
-
 	ip -net "$ns3" link add link hsr3 name hsr3.2 type vlan id 2
-	ip -net "$ns3" link add link hsr3 name hsr3.3 type vlan id 3
-	ip -net "$ns3" link add link hsr3 name hsr3.4 type vlan id 4
-	ip -net "$ns3" link add link hsr3 name hsr3.5 type vlan id 5
 
 	ip -net "$ns1" addr add 100.64.2.1/24 dev hsr1.2
-	ip -net "$ns1" addr add 100.64.3.1/24 dev hsr1.3
-	ip -net "$ns1" addr add 100.64.4.1/24 dev hsr1.4
-	ip -net "$ns1" addr add 100.64.5.1/24 dev hsr1.5
+	ip -net "$ns1" addr add dead:beef:2::1/64 dev hsr1.2 nodad
 
 	ip -net "$ns2" addr add 100.64.2.2/24 dev hsr2.2
-	ip -net "$ns2" addr add 100.64.3.2/24 dev hsr2.3
-	ip -net "$ns2" addr add 100.64.4.2/24 dev hsr2.4
-	ip -net "$ns2" addr add 100.64.5.2/24 dev hsr2.5
+	ip -net "$ns2" addr add dead:beef:2::2/64 dev hsr2.2 nodad
 
 	ip -net "$ns3" addr add 100.64.2.3/24 dev hsr3.2
-	ip -net "$ns3" addr add 100.64.3.3/24 dev hsr3.3
-	ip -net "$ns3" addr add 100.64.4.3/24 dev hsr3.4
-	ip -net "$ns3" addr add 100.64.5.3/24 dev hsr3.5
+	ip -net "$ns3" addr add dead:beef:2::3/64 dev hsr3.2 nodad
 
 	ip -net "$ns1" link set dev hsr1.2 up
-	ip -net "$ns1" link set dev hsr1.3 up
-	ip -net "$ns1" link set dev hsr1.4 up
-	ip -net "$ns1" link set dev hsr1.5 up
-
 	ip -net "$ns2" link set dev hsr2.2 up
-	ip -net "$ns2" link set dev hsr2.3 up
-	ip -net "$ns2" link set dev hsr2.4 up
-	ip -net "$ns2" link set dev hsr2.5 up
-
 	ip -net "$ns3" link set dev hsr3.2 up
-	ip -net "$ns3" link set dev hsr3.3 up
-	ip -net "$ns3" link set dev hsr3.4 up
-	ip -net "$ns3" link set dev hsr3.5 up
 
 }
 
-hsr_vlan_ping() {
-	do_ping "$ns1" 100.64.2.2
-	do_ping "$ns1" 100.64.3.2
-	do_ping "$ns1" 100.64.4.2
-	do_ping "$ns1" 100.64.5.2
-
-	do_ping "$ns1" 100.64.2.3
-	do_ping "$ns1" 100.64.3.3
-	do_ping "$ns1" 100.64.4.3
-	do_ping "$ns1" 100.64.5.3
-
-	do_ping "$ns2" 100.64.2.1
-	do_ping "$ns2" 100.64.3.1
-	do_ping "$ns2" 100.64.4.1
-	do_ping "$ns2" 100.64.5.1
-
-	do_ping "$ns2" 100.64.2.3
-	do_ping "$ns2" 100.64.3.3
-	do_ping "$ns2" 100.64.4.3
-	do_ping "$ns2" 100.64.5.3
-
-	do_ping "$ns3" 100.64.2.1
-	do_ping "$ns3" 100.64.3.1
-	do_ping "$ns3" 100.64.4.1
-	do_ping "$ns3" 100.64.5.1
-
-	do_ping "$ns3" 100.64.2.2
-	do_ping "$ns3" 100.64.3.2
-	do_ping "$ns3" 100.64.4.2
-	do_ping "$ns3" 100.64.5.2
+run_complete_ping_tests()
+{
+	echo "INFO: Running complete ping tests."
+	do_ping_tests 0
+	do_link_problem_tests
 }
 
-run_vlan_tests() {
+run_vlan_tests()
+{
 	vlan_challenged_hsr1=$(ip net exec "$ns1" ethtool -k hsr1 | grep "vlan-challenged" | awk '{print $2}')
 	vlan_challenged_hsr2=$(ip net exec "$ns2" ethtool -k hsr2 | grep "vlan-challenged" | awk '{print $2}')
 	vlan_challenged_hsr3=$(ip net exec "$ns3" ethtool -k hsr3 | grep "vlan-challenged" | awk '{print $2}')
@@ -263,27 +217,23 @@ run_vlan_tests() {
 	if [[ "$vlan_challenged_hsr1" = "off" || "$vlan_challenged_hsr2" = "off" || "$vlan_challenged_hsr3" = "off" ]]; then
 		echo "INFO: Running VLAN tests"
 		setup_vlan_interfaces
-		hsr_vlan_ping
+		do_ping_tests 2
 	else
 		echo "INFO: Not Running VLAN tests as the device does not support VLAN"
 	fi
 }
 
 check_prerequisites
-setup_ns ns1 ns2 ns3
-
 trap cleanup_all_ns EXIT
 
+setup_ns ns1 ns2 ns3
 setup_hsr_interfaces 0
-do_complete_ping_test
-
+run_complete_ping_tests
 run_vlan_tests
 
 setup_ns ns1 ns2 ns3
-
 setup_hsr_interfaces 1
-do_complete_ping_test
-
+run_complete_ping_tests
 run_vlan_tests
 
 exit $ret
diff --git a/tools/testing/selftests/net/hsr/settings b/tools/testing/selftests/net/hsr/settings
index 0fbc037f2aa8..ba4d85f74cd6 100644
--- a/tools/testing/selftests/net/hsr/settings
+++ b/tools/testing/selftests/net/hsr/settings
@@ -1 +1 @@
-timeout=50
+timeout=90
-- 
cgit v1.2.3


From ca4a09a950d27909a16cebe512544bb01b8ce2e5 Mon Sep 17 00:00:00 2001
From: Felix Maurer <fmaurer@redhat.com>
Date: Thu, 5 Feb 2026 14:57:30 +0100
Subject: selftests: hsr: Add tests for faulty links

Add a test case that can support different types of faulty links for all
protocol versions (HSRv0, HSRv1, PRPv1). It starts with a baseline with
fully functional links. The first faulty case is one link being cut during
the ping. This test uses a different function for ping that sends more
packets in shorter intervals to stress the duplicate detection algorithms a
bit more and allow for future tests with other link faults (packet loss,
reordering, etc.).

As the link fault tests now cover the cut link for HSR and PRP, it can be
removed from the hsr_ping test. Note that the removed cut link test did not
really test the fault because do_ping_long takes about 1sec while the link
is only cut after a 3sec sleep.

Signed-off-by: Felix Maurer <fmaurer@redhat.com>
Reviewed-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Link: https://patch.msgid.link/dad52276e2c349ecb96168bef7e3001bf7becc81.1770299429.git.fmaurer@redhat.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 tools/testing/selftests/net/hsr/Makefile       |   1 +
 tools/testing/selftests/net/hsr/hsr_ping.sh    |  11 -
 tools/testing/selftests/net/hsr/link_faults.sh | 271 +++++++++++++++++++++++++
 3 files changed, 272 insertions(+), 11 deletions(-)
 create mode 100755 tools/testing/selftests/net/hsr/link_faults.sh

(limited to 'tools')

diff --git a/tools/testing/selftests/net/hsr/Makefile b/tools/testing/selftests/net/hsr/Makefile
index 1886f345897a..31fb9326cf53 100644
--- a/tools/testing/selftests/net/hsr/Makefile
+++ b/tools/testing/selftests/net/hsr/Makefile
@@ -5,6 +5,7 @@ top_srcdir = ../../../../..
 TEST_PROGS := \
 	hsr_ping.sh \
 	hsr_redbox.sh \
+	link_faults.sh \
 	prp_ping.sh \
 # end of TEST_PROGS
 
diff --git a/tools/testing/selftests/net/hsr/hsr_ping.sh b/tools/testing/selftests/net/hsr/hsr_ping.sh
index ebee4a18fc67..0ec71b20ab75 100755
--- a/tools/testing/selftests/net/hsr/hsr_ping.sh
+++ b/tools/testing/selftests/net/hsr/hsr_ping.sh
@@ -94,17 +94,6 @@ do_link_problem_tests()
 {
 	echo "INFO: Running link problem tests."
 
-	echo "INFO: Cutting one link."
-	do_ping_long "$ns1" 100.64.0.3 &
-
-	sleep 3
-	ip -net "$ns3" link set ns3eth1 down
-	wait
-
-	ip -net "$ns3" link set ns3eth1 up
-
-	stop_if_error "Failed with one link down."
-
 	echo "INFO: Delay the link and drop a few packages."
 	tc -net "$ns3" qdisc add dev ns3eth1 root netem delay 50ms
 	tc -net "$ns2" qdisc add dev ns2eth1 root netem delay 5ms loss 25%
diff --git a/tools/testing/selftests/net/hsr/link_faults.sh b/tools/testing/selftests/net/hsr/link_faults.sh
new file mode 100755
index 000000000000..7ff14dcd32e7
--- /dev/null
+++ b/tools/testing/selftests/net/hsr/link_faults.sh
@@ -0,0 +1,271 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# shellcheck disable=SC2329
+
+source ../lib.sh
+
+ALL_TESTS="
+	test_clean_hsrv0
+	test_cut_link_hsrv0
+	test_clean_hsrv1
+	test_cut_link_hsrv1
+	test_clean_prp
+	test_cut_link_prp
+"
+
+# The tests are running ping for 5sec with a relatively short interval with a
+# cut link, which should be recoverable by HSR/PRP.
+
+setup_hsr_topo()
+{
+	# Three HSR nodes in a ring, every node has a LAN A interface connected
+	# to the LAN B interface of the next node.
+	#
+	#    node1            node2
+	#
+	#     vethA -------- vethB
+	#   hsr1                 hsr2
+	#     vethB          vethA
+	#         \          /
+	#         vethA  vethB
+	#             hsr3
+	#
+	#            node3
+
+	local ver="$1"
+
+	setup_ns node1 node2 node3
+
+	# veth links
+	# shellcheck disable=SC2154 # variables assigned by setup_ns
+	ip link add vethA netns "$node1" type veth peer name vethB netns "$node2"
+	# shellcheck disable=SC2154 # variables assigned by setup_ns
+	ip link add vethA netns "$node2" type veth peer name vethB netns "$node3"
+	ip link add vethA netns "$node3" type veth peer name vethB netns "$node1"
+
+	# MAC addresses (not needed for HSR operation, but helps with debugging)
+	ip -net "$node1" link set address 00:11:22:00:01:01 dev vethA
+	ip -net "$node1" link set address 00:11:22:00:01:02 dev vethB
+
+	ip -net "$node2" link set address 00:11:22:00:02:01 dev vethA
+	ip -net "$node2" link set address 00:11:22:00:02:02 dev vethB
+
+	ip -net "$node3" link set address 00:11:22:00:03:01 dev vethA
+	ip -net "$node3" link set address 00:11:22:00:03:02 dev vethB
+
+	# HSR interfaces
+	ip -net "$node1" link add name hsr1 type hsr proto 0 version "$ver" \
+		slave1 vethA slave2 vethB supervision 45
+	ip -net "$node2" link add name hsr2 type hsr proto 0 version "$ver" \
+		slave1 vethA slave2 vethB supervision 45
+	ip -net "$node3" link add name hsr3 type hsr proto 0 version "$ver" \
+		slave1 vethA slave2 vethB supervision 45
+
+	# IP addresses
+	ip -net "$node1" addr add 100.64.0.1/24 dev hsr1
+	ip -net "$node2" addr add 100.64.0.2/24 dev hsr2
+	ip -net "$node3" addr add 100.64.0.3/24 dev hsr3
+
+	# Set all links up
+	ip -net "$node1" link set vethA up
+	ip -net "$node1" link set vethB up
+	ip -net "$node1" link set hsr1 up
+
+	ip -net "$node2" link set vethA up
+	ip -net "$node2" link set vethB up
+	ip -net "$node2" link set hsr2 up
+
+	ip -net "$node3" link set vethA up
+	ip -net "$node3" link set vethB up
+	ip -net "$node3" link set hsr3 up
+}
+
+setup_prp_topo()
+{
+	# Two PRP nodes, connected by two links (treated as LAN A and LAN B).
+	#
+	#       vethA ----- vethA
+	#     prp1             prp2
+	#       vethB ----- vethB
+	#
+	#     node1           node2
+
+	setup_ns node1 node2
+
+	# veth links
+	ip link add vethA netns "$node1" type veth peer name vethA netns "$node2"
+	ip link add vethB netns "$node1" type veth peer name vethB netns "$node2"
+
+	# MAC addresses will be copied from LAN A interface
+	ip -net "$node1" link set address 00:11:22:00:00:01 dev vethA
+	ip -net "$node2" link set address 00:11:22:00:00:02 dev vethA
+
+	# PRP interfaces
+	ip -net "$node1" link add name prp1 type hsr \
+		slave1 vethA slave2 vethB supervision 45 proto 1
+	ip -net "$node2" link add name prp2 type hsr \
+		slave1 vethA slave2 vethB supervision 45 proto 1
+
+	# IP addresses
+	ip -net "$node1" addr add 100.64.0.1/24 dev prp1
+	ip -net "$node2" addr add 100.64.0.2/24 dev prp2
+
+	# All links up
+	ip -net "$node1" link set vethA up
+	ip -net "$node1" link set vethB up
+	ip -net "$node1" link set prp1 up
+
+	ip -net "$node2" link set vethA up
+	ip -net "$node2" link set vethB up
+	ip -net "$node2" link set prp2 up
+}
+
+wait_for_hsr_node_table()
+{
+	log_info "Wait for node table entries to be merged."
+	WAIT=5
+	while [ "${WAIT}" -gt 0 ]; do
+		nts=$(cat /sys/kernel/debug/hsr/hsr*/node_table)
+
+		# We need entries in the node tables, and they need to be merged
+		if (echo "$nts" | grep -qE "^([0-9a-f]{2}:){5}") && \
+		    ! (echo "$nts" | grep -q "00:00:00:00:00:00"); then
+			return
+		fi
+
+		sleep 1
+		((WAIT--))
+	done
+	check_err 1 "Failed to wait for merged node table entries"
+}
+
+setup_topo()
+{
+	local proto="$1"
+
+	if [ "$proto" = "HSRv0" ]; then
+		setup_hsr_topo 0
+		wait_for_hsr_node_table
+	elif [ "$proto" = "HSRv1" ]; then
+		setup_hsr_topo 1
+		wait_for_hsr_node_table
+	elif [ "$proto" = "PRP" ]; then
+		setup_prp_topo
+	else
+		check_err 1 "Unknown protocol (${proto})"
+	fi
+}
+
+check_ping()
+{
+	local node="$1"
+	local dst="$2"
+	local ping_args="-q -i 0.01 -c 400"
+
+	log_info "Running ping $node -> $dst"
+	# shellcheck disable=SC2086
+	output=$(ip netns exec "$node" ping $ping_args "$dst" | \
+		grep "packets transmitted")
+	log_info "$output"
+
+	dups=0
+	loss=0
+
+	if [[ "$output" =~ \+([0-9]+)" duplicates" ]]; then
+		dups="${BASH_REMATCH[1]}"
+	fi
+	if [[ "$output" =~ ([0-9\.]+\%)" packet loss" ]]; then
+		loss="${BASH_REMATCH[1]}"
+	fi
+
+	check_err "$dups" "Unexpected duplicate packets (${dups})"
+	if [ "$loss" != "0%" ]; then
+		check_err 1 "Unexpected packet loss (${loss})"
+	fi
+}
+
+test_clean()
+{
+	local proto="$1"
+
+	RET=0
+	tname="${FUNCNAME[0]} - ${proto}"
+
+	setup_topo "$proto"
+	if ((RET != ksft_pass)); then
+		log_test "${tname} setup"
+		return
+	fi
+
+	check_ping "$node1" "100.64.0.2"
+
+	log_test "${tname}"
+}
+
+test_clean_hsrv0()
+{
+	test_clean "HSRv0"
+}
+
+test_clean_hsrv1()
+{
+	test_clean "HSRv1"
+}
+
+test_clean_prp()
+{
+	test_clean "PRP"
+}
+
+test_cut_link()
+{
+	local proto="$1"
+
+	RET=0
+	tname="${FUNCNAME[0]} - ${proto}"
+
+	setup_topo "$proto"
+	if ((RET != ksft_pass)); then
+		log_test "${tname} setup"
+		return
+	fi
+
+	# Cutting link from subshell, so check_ping can run in the normal shell
+	# with access to global variables from the test harness.
+	(
+		sleep 2
+		log_info "Cutting link"
+		ip -net "$node1" link set vethB down
+	) &
+	check_ping "$node1" "100.64.0.2"
+
+	wait
+	log_test "${tname}"
+}
+
+
+test_cut_link_hsrv0()
+{
+	test_cut_link "HSRv0"
+}
+
+test_cut_link_hsrv1()
+{
+	test_cut_link "HSRv1"
+}
+
+test_cut_link_prp()
+{
+	test_cut_link "PRP"
+}
+
+cleanup()
+{
+	cleanup_all_ns
+}
+
+trap cleanup EXIT
+
+tests_run
+
+exit $EXIT_STATUS
-- 
cgit v1.2.3


From 8908c3c8cef437d8d2ad41f9b23f4305029d1782 Mon Sep 17 00:00:00 2001
From: Felix Maurer <fmaurer@redhat.com>
Date: Thu, 5 Feb 2026 14:57:32 +0100
Subject: selftests: hsr: Add tests for more link faults with PRP

Add tests where one link has different rates of packet loss or reorders
packets. PRP should still be able to recover from these link faults and
show no packet loss.  However, it is acceptable to receive some level of
duplicate packets. This matches the current specification (IEC
62439-3:2021) of the duplicate discard algorithm that requires it to be
"designed such that it never rejects a legitimate frame, while occasional
acceptance of a duplicate can be tolerated." The rate of acceptable
duplicates in this test is intentionally high (10%) to make the test
stable, the values I observed in the worst test cases (20% loss) are around
5% duplicates.

The duplicates occur because of the 10ms ping interval in the test. As
blocks expire after 400ms based on the timestamp of the first received
sequence number in the block, every approx. 40th will lead to a new, clean
block being used where the sequence number hasn't been seen before. As this
occurs on both nodes in the test (for requests and replies), we observe
around 20 duplicate frames.

Signed-off-by: Felix Maurer <fmaurer@redhat.com>
Reviewed-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Link: https://patch.msgid.link/7b36506d3a80e53786fe56526cf6046c74dfeee1.1770299429.git.fmaurer@redhat.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 tools/testing/selftests/net/hsr/link_faults.sh | 79 ++++++++++++++++++++++++--
 1 file changed, 74 insertions(+), 5 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/hsr/link_faults.sh b/tools/testing/selftests/net/hsr/link_faults.sh
index 7ff14dcd32e7..1959bea17147 100755
--- a/tools/testing/selftests/net/hsr/link_faults.sh
+++ b/tools/testing/selftests/net/hsr/link_faults.sh
@@ -11,10 +11,16 @@ ALL_TESTS="
 	test_cut_link_hsrv1
 	test_clean_prp
 	test_cut_link_prp
+	test_packet_loss_prp
+	test_high_packet_loss_prp
+	test_reordering_prp
 "
 
-# The tests are running ping for 5sec with a relatively short interval with a
-# cut link, which should be recoverable by HSR/PRP.
+# The tests are running ping for 5sec with a relatively short interval in
+# different scenarios with faulty links (cut links, packet loss, delay,
+# reordering) that should be recoverable by HSR/PRP. The ping interval (10ms)
+# is short enough that the base delay (50ms) leads to a queue in the netem
+# qdiscs which is needed for reordering.
 
 setup_hsr_topo()
 {
@@ -160,6 +166,7 @@ check_ping()
 {
 	local node="$1"
 	local dst="$2"
+	local accepted_dups="$3"
 	local ping_args="-q -i 0.01 -c 400"
 
 	log_info "Running ping $node -> $dst"
@@ -178,7 +185,9 @@ check_ping()
 		loss="${BASH_REMATCH[1]}"
 	fi
 
-	check_err "$dups" "Unexpected duplicate packets (${dups})"
+	if [ "$dups" -gt "$accepted_dups" ]; then
+		check_err 1 "Unexpected duplicate packets (${dups})"
+	fi
 	if [ "$loss" != "0%" ]; then
 		check_err 1 "Unexpected packet loss (${loss})"
 	fi
@@ -197,7 +206,7 @@ test_clean()
 		return
 	fi
 
-	check_ping "$node1" "100.64.0.2"
+	check_ping "$node1" "100.64.0.2" 0
 
 	log_test "${tname}"
 }
@@ -237,7 +246,7 @@ test_cut_link()
 		log_info "Cutting link"
 		ip -net "$node1" link set vethB down
 	) &
-	check_ping "$node1" "100.64.0.2"
+	check_ping "$node1" "100.64.0.2" 0
 
 	wait
 	log_test "${tname}"
@@ -259,6 +268,66 @@ test_cut_link_prp()
 	test_cut_link "PRP"
 }
 
+test_packet_loss()
+{
+	local proto="$1"
+	local loss="$2"
+
+	RET=0
+	tname="${FUNCNAME[0]} - ${proto}, ${loss}"
+
+	setup_topo "$proto"
+	if ((RET != ksft_pass)); then
+		log_test "${tname} setup"
+		return
+	fi
+
+	# Packet loss with lower delay makes sure the packets on the lossy link
+	# arrive first.
+	tc -net "$node1" qdisc add dev vethA root netem delay 50ms
+	tc -net "$node1" qdisc add dev vethB root netem delay 20ms loss "$loss"
+
+	check_ping "$node1" "100.64.0.2" 40
+
+	log_test "${tname}"
+}
+
+test_packet_loss_prp()
+{
+	test_packet_loss "PRP" "20%"
+}
+
+test_high_packet_loss_prp()
+{
+	test_packet_loss "PRP" "80%"
+}
+
+test_reordering()
+{
+	local proto="$1"
+
+	RET=0
+	tname="${FUNCNAME[0]} - ${proto}"
+
+	setup_topo "$proto"
+	if ((RET != ksft_pass)); then
+		log_test "${tname} setup"
+		return
+	fi
+
+	tc -net "$node1" qdisc add dev vethA root netem delay 50ms
+	tc -net "$node1" qdisc add dev vethB root netem delay 50ms reorder 20%
+
+	check_ping "$node1" "100.64.0.2" 40
+
+	log_test "${tname}"
+}
+
+test_reordering_prp()
+{
+	test_reordering "PRP"
+}
+
 cleanup()
 {
 	cleanup_all_ns
-- 
cgit v1.2.3


From bbbd531faa18b778a9129938d2c8db6c33c106ab Mon Sep 17 00:00:00 2001
From: Felix Maurer <fmaurer@redhat.com>
Date: Thu, 5 Feb 2026 14:57:34 +0100
Subject: selftests: hsr: Add more link fault tests for HSR

Run the packet loss and reordering tests also for both HSR versions. Now
they can be removed from the hsr_ping tests completely. The timeout needs
to be increased because there are 15 link fault test cases now, with each
of them taking 5-6sec for the test and at most 5sec for the HSR node tables
to get merged and we also want some room to make the test runs stable.

Signed-off-by: Felix Maurer <fmaurer@redhat.com>
Reviewed-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Link: https://patch.msgid.link/eb6f667d3804ce63d86f0ee3fbc0e0ac9e1a209a.1770299429.git.fmaurer@redhat.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 tools/testing/selftests/net/hsr/hsr_ping.sh    | 32 ++++------------------
 tools/testing/selftests/net/hsr/link_faults.sh | 38 ++++++++++++++++++++++++++
 tools/testing/selftests/net/hsr/settings       |  2 +-
 3 files changed, 44 insertions(+), 28 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/hsr/hsr_ping.sh b/tools/testing/selftests/net/hsr/hsr_ping.sh
index 0ec71b20ab75..f4d685df4345 100755
--- a/tools/testing/selftests/net/hsr/hsr_ping.sh
+++ b/tools/testing/selftests/net/hsr/hsr_ping.sh
@@ -90,27 +90,6 @@ do_ping_tests()
 	stop_if_error "Longer ping test failed (ns3)."
 }
 
-do_link_problem_tests()
-{
-	echo "INFO: Running link problem tests."
-
-	echo "INFO: Delay the link and drop a few packages."
-	tc -net "$ns3" qdisc add dev ns3eth1 root netem delay 50ms
-	tc -net "$ns2" qdisc add dev ns2eth1 root netem delay 5ms loss 25%
-
-	do_ping_long "$ns1" 100.64.0.2
-	do_ping_long "$ns1" 100.64.0.3
-	stop_if_error "Failed with delay and packetloss (ns1)."
-
-	do_ping_long "$ns2" 100.64.0.1
-	do_ping_long "$ns2" 100.64.0.3
-	stop_if_error "Failed with delay and packetloss (ns2)."
-
-	do_ping_long "$ns3" 100.64.0.1
-	do_ping_long "$ns3" 100.64.0.2
-	stop_if_error "Failed with delay and packetloss (ns3)."
-}
-
 setup_hsr_interfaces()
 {
 	local HSRv="$1"
@@ -190,11 +169,10 @@ setup_vlan_interfaces() {
 
 }
 
-run_complete_ping_tests()
+run_ping_tests()
 {
-	echo "INFO: Running complete ping tests."
+	echo "INFO: Running ping tests."
 	do_ping_tests 0
-	do_link_problem_tests
 }
 
 run_vlan_tests()
@@ -204,7 +182,7 @@ run_vlan_tests()
 	vlan_challenged_hsr3=$(ip net exec "$ns3" ethtool -k hsr3 | grep "vlan-challenged" | awk '{print $2}')
 
 	if [[ "$vlan_challenged_hsr1" = "off" || "$vlan_challenged_hsr2" = "off" || "$vlan_challenged_hsr3" = "off" ]]; then
-		echo "INFO: Running VLAN tests"
+		echo "INFO: Running VLAN ping tests"
 		setup_vlan_interfaces
 		do_ping_tests 2
 	else
@@ -217,12 +195,12 @@ trap cleanup_all_ns EXIT
 
 setup_ns ns1 ns2 ns3
 setup_hsr_interfaces 0
-run_complete_ping_tests
+run_ping_tests
 run_vlan_tests
 
 setup_ns ns1 ns2 ns3
 setup_hsr_interfaces 1
-run_complete_ping_tests
+run_ping_tests
 run_vlan_tests
 
 exit $ret
diff --git a/tools/testing/selftests/net/hsr/link_faults.sh b/tools/testing/selftests/net/hsr/link_faults.sh
index 1959bea17147..be526281571c 100755
--- a/tools/testing/selftests/net/hsr/link_faults.sh
+++ b/tools/testing/selftests/net/hsr/link_faults.sh
@@ -7,8 +7,16 @@ source ../lib.sh
 ALL_TESTS="
 	test_clean_hsrv0
 	test_cut_link_hsrv0
+	test_packet_loss_hsrv0
+	test_high_packet_loss_hsrv0
+	test_reordering_hsrv0
+
 	test_clean_hsrv1
 	test_cut_link_hsrv1
+	test_packet_loss_hsrv1
+	test_high_packet_loss_hsrv1
+	test_reordering_hsrv1
+
 	test_clean_prp
 	test_cut_link_prp
 	test_packet_loss_prp
@@ -292,11 +300,31 @@ test_packet_loss()
 	log_test "${tname}"
 }
 
+test_packet_loss_hsrv0()
+{
+	test_packet_loss "HSRv0" "20%"
+}
+
+test_packet_loss_hsrv1()
+{
+	test_packet_loss "HSRv1" "20%"
+}
+
 test_packet_loss_prp()
 {
 	test_packet_loss "PRP" "20%"
 }
 
+test_high_packet_loss_hsrv0()
+{
+	test_packet_loss "HSRv0" "80%"
+}
+
+test_high_packet_loss_hsrv1()
+{
+	test_packet_loss "HSRv1" "80%"
+}
+
 test_high_packet_loss_prp()
 {
 	test_packet_loss "PRP" "80%"
@@ -323,6 +351,16 @@ test_reordering()
 	log_test "${tname}"
 }
 
+test_reordering_hsrv0()
+{
+	test_reordering "HSRv0"
+}
+
+test_reordering_hsrv1()
+{
+	test_reordering "HSRv1"
+}
+
 test_reordering_prp()
 {
 	test_reordering "PRP"
diff --git a/tools/testing/selftests/net/hsr/settings b/tools/testing/selftests/net/hsr/settings
index ba4d85f74cd6..a953c96aa16e 100644
--- a/tools/testing/selftests/net/hsr/settings
+++ b/tools/testing/selftests/net/hsr/settings
@@ -1 +1 @@
-timeout=90
+timeout=180
-- 
cgit v1.2.3


From abca6583a2aa00ed856907d86446ae527442a754 Mon Sep 17 00:00:00 2001
From: "Lain \"Fearyncess\" Yang" <fearyncess@aosc.io>
Date: Tue, 10 Feb 2026 19:31:12 +0800
Subject: LoongArch: Wire up memfd_secret system call

LoongArch supports ARCH_HAS_SET_DIRECT_MAP, therefore wire up the
memfd_secret system call, which just depends on it.

Acked-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Acked-by: David Hildenbrand (Red Hat) <david@kernel.org>
Signed-off-by: Lain "Fearyncess" Yang <fearyncess@aosc.io>
Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
---
 arch/loongarch/include/asm/unistd.h     | 1 +
 arch/loongarch/kernel/Makefile.syscalls | 5 ++---
 tools/testing/selftests/mm/Makefile     | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/arch/loongarch/include/asm/unistd.h b/arch/loongarch/include/asm/unistd.h
index e2c0f3d86c7b..e7649c158248 100644
--- a/arch/loongarch/include/asm/unistd.h
+++ b/arch/loongarch/include/asm/unistd.h
@@ -10,5 +10,6 @@
 
 #define __ARCH_WANT_NEW_STAT
 #define __ARCH_WANT_SYS_CLONE
+#define __ARCH_WANT_MEMFD_SECRET
 
 #define NR_syscalls (__NR_syscalls)
diff --git a/arch/loongarch/kernel/Makefile.syscalls b/arch/loongarch/kernel/Makefile.syscalls
index cd46c2b69c7f..06f160502537 100644
--- a/arch/loongarch/kernel/Makefile.syscalls
+++ b/arch/loongarch/kernel/Makefile.syscalls
@@ -1,5 +1,4 @@
 # SPDX-License-Identifier: GPL-2.0
 
-# No special ABIs on loongarch so far
-syscall_abis_32 +=
-syscall_abis_64 +=
+syscall_abis_32 += memfd_secret
+syscall_abis_64 += memfd_secret
diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile
index eaf9312097f7..79582438efc4 100644
--- a/tools/testing/selftests/mm/Makefile
+++ b/tools/testing/selftests/mm/Makefile
@@ -72,7 +72,7 @@ TEST_GEN_FILES += madv_populate
 TEST_GEN_FILES += map_fixed_noreplace
 TEST_GEN_FILES += map_hugetlb
 TEST_GEN_FILES += map_populate
-ifneq (,$(filter $(ARCH),arm64 riscv riscv64 x86 x86_64))
+ifneq (,$(filter $(ARCH),arm64 riscv riscv64 x86 x86_64 loongarch32 loongarch64))
 TEST_GEN_FILES += memfd_secret
 endif
 TEST_GEN_FILES += migration
-- 
cgit v1.2.3


From 84a654f786414a74a15a7b61929d1ced06687310 Mon Sep 17 00:00:00 2001
From: Dmitrii Dolgov <9erthalion6@gmail.com>
Date: Tue, 10 Feb 2026 09:01:18 -0300
Subject: tools build: Make test-rust.bin be removed by the 'clean' target

test-rust.bin is missing from the list of FILES, and thus is not removed by the
clean target. This could lead to a false feature detection, since the binary
stays there. Fix it.

Fixes: 6a32fa5ccd33da5d ("tools build: Add a feature test for rust compiler")
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Dmitrii Dolgov <9erthalion6@gmail.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/build/feature/Makefile | 1 +
 1 file changed, 1 insertion(+)

(limited to 'tools')

diff --git a/tools/build/feature/Makefile b/tools/build/feature/Makefile
index 9ae69d857166..cc53d4e5f8d8 100644
--- a/tools/build/feature/Makefile
+++ b/tools/build/feature/Makefile
@@ -73,6 +73,7 @@ FILES=                                          \
          test-clang-bpf-co-re.bin		\
          test-file-handle.bin			\
          test-libpfm4.bin			\
+         test-rust.bin				\
          test-libopenssl.bin
 
 FILES := $(addprefix $(OUTPUT),$(FILES))
-- 
cgit v1.2.3


From 1f12fb138de81e63c1140689257548966cfb88a3 Mon Sep 17 00:00:00 2001
From: Dmitry Dolgov <9erthalion6@gmail.com>
Date: Tue, 10 Feb 2026 11:02:56 +0100
Subject: tools build: Emit dependencies file for test-rust.bin

Test it first by having rust installed, then removing it and building again.

Fixes: 6a32fa5ccd33da5d ("tools build: Add a feature test for rust compiler")
Signed-off-by: Dmitry Dolgov <9erthalion6@gmail.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/build/feature/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/build/feature/Makefile b/tools/build/feature/Makefile
index cc53d4e5f8d8..e959caa7f1c7 100644
--- a/tools/build/feature/Makefile
+++ b/tools/build/feature/Makefile
@@ -113,7 +113,7 @@ __BUILD = $(CC) $(CFLAGS) -MD -Wall -Werror -o $@ $(patsubst %.bin,%.c,$(@F)) $(
 __BUILDXX = $(CXX) $(CXXFLAGS) -MD -Wall -Werror -o $@ $(patsubst %.bin,%.cpp,$(@F)) $(LDFLAGS)
   BUILDXX = $(__BUILDXX) > $(@:.bin=.make.output) 2>&1
 
-__BUILDRS = $(RUSTC) $(RUSTC_FLAGS) -o $@ $(patsubst %.bin,%.rs,$(@F))
+__BUILDRS = $(RUSTC) $(RUSTC_FLAGS) --emit=dep-info=$(patsubst %.bin,%.d,$(@F)),link -o $@ $(patsubst %.bin,%.rs,$(@F))
   BUILDRS = $(__BUILDRS) > $(@:.bin=.make.output) 2>&1
 
 ###############################
-- 
cgit v1.2.3


From bc105a8918fd8458ec3a5b947018b0f8059da529 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Mon, 9 Feb 2026 22:03:54 -0800
Subject: Revert "perf tool_pmu: More accurately set the cpus for tool events"

This reverts commit d8d8a0b3603a9a8fa207cf9e4f292e81dc5d1008.

The setting of a user CPU map can cause an empty intersection when
combined with CPU 0 and the event removed. This later triggers a segv in
the stat-shadow logic. Let's put back a full online CPU map for now by
reverting this patch.

Closes: https://lore.kernel.org/linux-perf-users/cgja46br2smmznxs7kbeabs6zgv3b4olfqgh2fdp5mxk2yom4v@w6jjgov6hdi6/
Reported-by: Andres Freund <andres@anarazel.de>
Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Dapeng Mi <dapeng1.mi@linux.intel.com>
Cc: Dr. David Alan Gilbert <linux@treblig.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Falcon <thomas.falcon@intel.com>
Cc: Thomas Richter <tmricht@linux.ibm.com>
Cc: Yang Li <yang.lee@linux.alibaba.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/parse-events.c |  9 ++-------
 tools/perf/util/tool_pmu.c     | 19 -------------------
 tools/perf/util/tool_pmu.h     |  1 -
 3 files changed, 2 insertions(+), 27 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index d4647ded340f..f631bf7a919f 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -30,7 +30,6 @@
 #include "util/event.h"
 #include "util/bpf-filter.h"
 #include "util/stat.h"
-#include "util/tool_pmu.h"
 #include "util/util.h"
 #include "tracepoint.h"
 #include <api/fs/tracing_path.h>
@@ -230,12 +229,8 @@ __add_event(struct list_head *list, int *idx,
 	if (pmu) {
 		is_pmu_core = pmu->is_core;
 		pmu_cpus = perf_cpu_map__get(pmu->cpus);
-		if (perf_cpu_map__is_empty(pmu_cpus)) {
-			if (perf_pmu__is_tool(pmu))
-				pmu_cpus = tool_pmu__cpus(attr);
-			else
-				pmu_cpus = cpu_map__online();
-		}
+		if (perf_cpu_map__is_empty(pmu_cpus))
+			pmu_cpus = cpu_map__online();
 	} else {
 		is_pmu_core = (attr->type == PERF_TYPE_HARDWARE ||
 			       attr->type == PERF_TYPE_HW_CACHE);
diff --git a/tools/perf/util/tool_pmu.c b/tools/perf/util/tool_pmu.c
index 37c4eae0bef1..6a9df3dc0e07 100644
--- a/tools/perf/util/tool_pmu.c
+++ b/tools/perf/util/tool_pmu.c
@@ -2,7 +2,6 @@
 #include "cgroup.h"
 #include "counts.h"
 #include "cputopo.h"
-#include "debug.h"
 #include "evsel.h"
 #include "pmu.h"
 #include "print-events.h"
@@ -14,7 +13,6 @@
 #include <api/fs/fs.h>
 #include <api/io.h>
 #include <internal/threadmap.h>
-#include <perf/cpumap.h>
 #include <perf/threadmap.h>
 #include <fcntl.h>
 #include <strings.h>
@@ -111,23 +109,6 @@ const char *evsel__tool_pmu_event_name(const struct evsel *evsel)
 	return tool_pmu__event_to_str(evsel->core.attr.config);
 }
 
-struct perf_cpu_map *tool_pmu__cpus(struct perf_event_attr *attr)
-{
-	static struct perf_cpu_map *cpu0_map;
-	enum tool_pmu_event event = (enum tool_pmu_event)attr->config;
-
-	if (event <= TOOL_PMU__EVENT_NONE || event >= TOOL_PMU__EVENT_MAX) {
-		pr_err("Invalid tool PMU event config %llx\n", attr->config);
-		return NULL;
-	}
-	if (event == TOOL_PMU__EVENT_USER_TIME || event == TOOL_PMU__EVENT_SYSTEM_TIME)
-		return cpu_map__online();
-
-	if (!cpu0_map)
-		cpu0_map = perf_cpu_map__new_int(0);
-	return perf_cpu_map__get(cpu0_map);
-}
-
 static bool read_until_char(struct io *io, char e)
 {
 	int c;
diff --git a/tools/perf/util/tool_pmu.h b/tools/perf/util/tool_pmu.h
index ea343d1983d3..f1714001bc1d 100644
--- a/tools/perf/util/tool_pmu.h
+++ b/tools/perf/util/tool_pmu.h
@@ -46,7 +46,6 @@ bool tool_pmu__read_event(enum tool_pmu_event ev,
 u64 tool_pmu__cpu_slots_per_cycle(void);
 
 bool perf_pmu__is_tool(const struct perf_pmu *pmu);
-struct perf_cpu_map *tool_pmu__cpus(struct perf_event_attr *attr);
 
 bool evsel__is_tool(const struct evsel *evsel);
 enum tool_pmu_event evsel__tool_event(const struct evsel *evsel);
-- 
cgit v1.2.3


From 63b320aaac08ba267268ec21a195ce3c82dcb8ab Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Mon, 9 Feb 2026 22:03:55 -0800
Subject: perf stat-shadow: In prepare_metric fix guard on reading NULL
 perf_stat_evsel

The aggr value is setup to always be non-null creating a redundant
guard for reading from it. Switch to using the perf_stat_evsel (ps)
and narrow the scope of aggr so that it is known valid when used.

Fixes: 3d65f6445fd93e3e ("perf stat-shadow: Read tool events directly")
Reported-by: Andres Freund <andres@anarazel.de>
Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Dapeng Mi <dapeng1.mi@linux.intel.com>
Cc: Dr. David Alan Gilbert <linux@treblig.org>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Falcon <thomas.falcon@intel.com>
Cc: Thomas Richter <tmricht@linux.ibm.com>
Cc: Yang Li <yang.lee@linux.alibaba.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/stat-shadow.c | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/stat-shadow.c b/tools/perf/util/stat-shadow.c
index 5d8d09e0e6ae..59d2cd4f2188 100644
--- a/tools/perf/util/stat-shadow.c
+++ b/tools/perf/util/stat-shadow.c
@@ -57,7 +57,6 @@ static int prepare_metric(struct perf_stat_config *config,
 		bool is_tool_time =
 			tool_pmu__is_time_event(config, metric_events[i], &tool_aggr_idx);
 		struct perf_stat_evsel *ps = metric_events[i]->stats;
-		struct perf_stat_aggr *aggr;
 		char *n;
 		double val;
 
@@ -82,8 +81,7 @@ static int prepare_metric(struct perf_stat_config *config,
 			}
 		}
 		/* Time events are always on CPU0, the first aggregation index. */
-		aggr = &ps->aggr[is_tool_time ? tool_aggr_idx : aggr_idx];
-		if (!aggr || !metric_events[i]->supported || aggr->counts.run == 0) {
+		if (!ps || !metric_events[i]->supported) {
 			/*
 			 * Not supported events will have a count of 0, which
 			 * can be confusing in a metric. Explicitly set the
@@ -93,11 +91,21 @@ static int prepare_metric(struct perf_stat_config *config,
 			val = NAN;
 			source_count = 0;
 		} else {
-			val = aggr->counts.val;
-			if (is_tool_time)
-				val *= 1e-9; /* Convert time event nanoseconds to seconds. */
-			if (!source_count)
-				source_count = evsel__source_count(metric_events[i]);
+			struct perf_stat_aggr *aggr =
+				&ps->aggr[is_tool_time ? tool_aggr_idx : aggr_idx];
+
+			if (aggr->counts.run == 0) {
+				val = NAN;
+				source_count = 0;
+			} else {
+				val = aggr->counts.val;
+				if (is_tool_time) {
+					/* Convert time event nanoseconds to seconds. */
+					val *= 1e-9;
+				}
+				if (!source_count)
+					source_count = evsel__source_count(metric_events[i]);
+			}
 		}
 		n = strdup(evsel__metric_id(metric_events[i]));
 		if (!n)
-- 
cgit v1.2.3


From ff8548172f63288b5c2b0fda5a4a4be294f610f1 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Mon, 9 Feb 2026 22:03:56 -0800
Subject: perf evlist: Special map propagation for tool events that read on 1
 CPU

Tool events like duration_time don't need a perf_cpu_map that contains
all online CPUs.

Having such a perf_cpu_map causes overheads when iterating between
events for CPU affinity.

During parsing mark events that just read on a single CPU map index as
such, then during map propagation set up the evsel's CPUs and thereby
the evlists accordingly.

The setting cannot be done early in parsing as user CPUs are only fully
known when evlist__create_maps is called.

Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Andres Freund <andres@anarazel.de>
Cc: Dapeng Mi <dapeng1.mi@linux.intel.com>
Cc: Dr. David Alan Gilbert <linux@treblig.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Falcon <thomas.falcon@intel.com>
Cc: Thomas Richter <tmricht@linux.ibm.com>
Cc: Yang Li <yang.lee@linux.alibaba.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/lib/perf/evlist.c                 | 36 +++++++++++++++++++++++++++++----
 tools/lib/perf/include/internal/evsel.h |  2 ++
 tools/perf/util/parse-events.c          |  1 +
 tools/perf/util/pmu.c                   | 11 ++++++++++
 tools/perf/util/pmu.h                   |  2 ++
 5 files changed, 48 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/perf/evlist.c b/tools/lib/perf/evlist.c
index 3ed023f4b190..1f210dadd666 100644
--- a/tools/lib/perf/evlist.c
+++ b/tools/lib/perf/evlist.c
@@ -101,6 +101,28 @@ static void __perf_evlist__propagate_maps(struct perf_evlist *evlist,
 		evsel->cpus = perf_cpu_map__get(evlist->user_requested_cpus);
 	}
 
+	/*
+	 * Tool events may only read on the first CPU index to avoid double
+	 * counting things like duration_time. Make the evsel->cpus contain just
+	 * that single entry otherwise we may spend time changing affinity to
+	 * CPUs that just have tool events, etc.
+	 */
+	if (evsel->reads_only_on_cpu_idx0 && perf_cpu_map__nr(evsel->cpus) > 0) {
+		struct perf_cpu_map *srcs[3] = {
+			evlist->all_cpus,
+			evlist->user_requested_cpus,
+			evsel->pmu_cpus,
+		};
+		for (size_t i = 0; i < ARRAY_SIZE(srcs); i++) {
+			if (!srcs[i])
+				continue;
+
+			perf_cpu_map__put(evsel->cpus);
+			evsel->cpus = perf_cpu_map__new_int(perf_cpu_map__cpu(srcs[i], 0).cpu);
+			break;
+		}
+	}
+
 	/* Sanity check assert before the evsel is potentially removed. */
 	assert(!evsel->requires_cpu || !perf_cpu_map__has_any_cpu(evsel->cpus));
 
@@ -133,16 +155,22 @@ static void __perf_evlist__propagate_maps(struct perf_evlist *evlist,
 
 static void perf_evlist__propagate_maps(struct perf_evlist *evlist)
 {
-	struct perf_evsel *evsel, *n;
-
 	evlist->needs_map_propagation = true;
 
 	/* Clear the all_cpus set which will be merged into during propagation. */
 	perf_cpu_map__put(evlist->all_cpus);
 	evlist->all_cpus = NULL;
 
-	list_for_each_entry_safe(evsel, n, &evlist->entries, node)
-		__perf_evlist__propagate_maps(evlist, evsel);
+	/* 2 rounds so that reads_only_on_cpu_idx0 benefit from knowing the other CPU maps. */
+	for (int round = 0; round < 2; round++) {
+		struct perf_evsel *evsel, *n;
+
+		list_for_each_entry_safe(evsel, n, &evlist->entries, node) {
+			if ((!evsel->reads_only_on_cpu_idx0 && round == 0) ||
+			    (evsel->reads_only_on_cpu_idx0 && round == 1))
+				__perf_evlist__propagate_maps(evlist, evsel);
+		}
+	}
 }
 
 void perf_evlist__add(struct perf_evlist *evlist,
diff --git a/tools/lib/perf/include/internal/evsel.h b/tools/lib/perf/include/internal/evsel.h
index fefe64ba5e26..b988034f1371 100644
--- a/tools/lib/perf/include/internal/evsel.h
+++ b/tools/lib/perf/include/internal/evsel.h
@@ -128,6 +128,8 @@ struct perf_evsel {
 	bool			 requires_cpu;
 	/** Is the PMU for the event a core one? Effects the handling of own_cpus. */
 	bool			 is_pmu_core;
+	/** Does the evsel on read on the first CPU index such as tool time events? */
+	bool			 reads_only_on_cpu_idx0;
 	int			 idx;
 };
 
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index f631bf7a919f..b9efb296bba5 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -269,6 +269,7 @@ __add_event(struct list_head *list, int *idx,
 	evsel->core.pmu_cpus = pmu_cpus;
 	evsel->core.requires_cpu = pmu ? pmu->is_uncore : false;
 	evsel->core.is_pmu_core = is_pmu_core;
+	evsel->core.reads_only_on_cpu_idx0 = perf_pmu__reads_only_on_cpu_idx0(attr);
 	evsel->pmu = pmu;
 	evsel->alternate_hw_config = alternate_hw_config;
 	evsel->first_wildcard_match = first_wildcard_match;
diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c
index bb399a47d2b4..81ab74681c9b 100644
--- a/tools/perf/util/pmu.c
+++ b/tools/perf/util/pmu.c
@@ -2718,3 +2718,14 @@ const char *perf_pmu__name_from_config(struct perf_pmu *pmu, u64 config)
 	}
 	return NULL;
 }
+
+bool perf_pmu__reads_only_on_cpu_idx0(const struct perf_event_attr *attr)
+{
+	enum tool_pmu_event event;
+
+	if (attr->type != PERF_PMU_TYPE_TOOL)
+		return false;
+
+	event = (enum tool_pmu_event)attr->config;
+	return event != TOOL_PMU__EVENT_USER_TIME && event != TOOL_PMU__EVENT_SYSTEM_TIME;
+}
diff --git a/tools/perf/util/pmu.h b/tools/perf/util/pmu.h
index 7ef90b54a149..41c21389f393 100644
--- a/tools/perf/util/pmu.h
+++ b/tools/perf/util/pmu.h
@@ -350,6 +350,8 @@ void perf_pmu__delete(struct perf_pmu *pmu);
 const char *perf_pmu__name_from_config(struct perf_pmu *pmu, u64 config);
 bool perf_pmu__is_fake(const struct perf_pmu *pmu);
 
+bool perf_pmu__reads_only_on_cpu_idx0(const struct perf_event_attr *attr);
+
 static inline enum pmu_kind perf_pmu__kind(const struct perf_pmu *pmu)
 {
 	__u32 type;
-- 
cgit v1.2.3


From 47172912c9933103bc2c68627b1dafd9058d035e Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Mon, 9 Feb 2026 22:03:57 -0800
Subject: perf evlist: Missing TPEBS close in evlist__close()

The libperf evsel close won't close TPEBS events properly.

Add a test to do this. The libperf close routine is used in
evlist__close() for affinity reasons.

Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Andres Freund <andres@anarazel.de>
Cc: Dapeng Mi <dapeng1.mi@linux.intel.com>
Cc: Dr. David Alan Gilbert <linux@treblig.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Falcon <thomas.falcon@intel.com>
Cc: Thomas Richter <tmricht@linux.ibm.com>
Cc: Yang Li <yang.lee@linux.alibaba.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/evlist.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'tools')

diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
index 3b0d837e3046..3abc2215e790 100644
--- a/tools/perf/util/evlist.c
+++ b/tools/perf/util/evlist.c
@@ -1356,6 +1356,8 @@ void evlist__close(struct evlist *evlist)
 		return;
 
 	evlist__for_each_cpu(evlist_cpu_itr, evlist, &affinity) {
+		if (evlist_cpu_itr.cpu_map_idx == 0 && evsel__is_retire_lat(evlist_cpu_itr.evsel))
+			evsel__tpebs_close(evlist_cpu_itr.evsel);
 		perf_evsel__close_cpu(&evlist_cpu_itr.evsel->core,
 				      evlist_cpu_itr.cpu_map_idx);
 	}
-- 
cgit v1.2.3


From d484361550ebdc4da77ea16f6cb08badde33e799 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Mon, 9 Feb 2026 22:03:58 -0800
Subject: perf evlist: Reduce affinity use and move into iterator, fix no
 affinity

The evlist__for_each_cpu iterator will call sched_setaffitinity when
moving between CPUs to avoid IPIs.

If only 1 IPI is saved then this may be unprofitable as the delay to get
scheduled may be considerable.

This may be particularly true if reading an event group in `perf stat`
in interval mode.

Move the affinity handling completely into the iterator so that a single
evlist__use_affinity can determine whether CPU affinities will be used.

For `perf record` the change is minimal as the dummy event and the real
event will always make the use of affinities the thing to do.

In `perf stat`, tool events are ignored and affinities only used if >1
event on the same CPU occur.

Determining if affinities are useful is done by evlist__use_affinity
which tests per-event whether the event's PMU benefits from affinity use
- it is assumed only perf event using PMUs do.

Fix a bug where when there are no affinities that the CPU map iterator
may reference a CPU not present in the initial evsel. Fix by making the
iterator and non-iterator code common.

Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Andres Freund <andres@anarazel.de>
Cc: Dapeng Mi <dapeng1.mi@linux.intel.com>
Cc: Dr. David Alan Gilbert <linux@treblig.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Falcon <thomas.falcon@intel.com>
Cc: Thomas Richter <tmricht@linux.ibm.com>
Cc: Yang Li <yang.lee@linux.alibaba.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-stat.c | 108 +++++++++++++------------------
 tools/perf/util/evlist.c  | 158 ++++++++++++++++++++++++++++------------------
 tools/perf/util/evlist.h  |  26 ++++++--
 tools/perf/util/pmu.c     |  12 ++++
 tools/perf/util/pmu.h     |   1 +
 5 files changed, 174 insertions(+), 131 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 2895b809607f..c1bb40b99176 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -369,19 +369,11 @@ static int read_counter_cpu(struct evsel *counter, int cpu_map_idx)
 static int read_counters_with_affinity(void)
 {
 	struct evlist_cpu_iterator evlist_cpu_itr;
-	struct affinity saved_affinity, *affinity;
 
 	if (all_counters_use_bpf)
 		return 0;
 
-	if (!target__has_cpu(&target) || target__has_per_thread(&target))
-		affinity = NULL;
-	else if (affinity__setup(&saved_affinity) < 0)
-		return -1;
-	else
-		affinity = &saved_affinity;
-
-	evlist__for_each_cpu(evlist_cpu_itr, evsel_list, affinity) {
+	evlist__for_each_cpu(evlist_cpu_itr, evsel_list) {
 		struct evsel *counter = evlist_cpu_itr.evsel;
 
 		if (evsel__is_bpf(counter))
@@ -393,8 +385,6 @@ static int read_counters_with_affinity(void)
 		if (!counter->err)
 			counter->err = read_counter_cpu(counter, evlist_cpu_itr.cpu_map_idx);
 	}
-	if (affinity)
-		affinity__cleanup(&saved_affinity);
 
 	return 0;
 }
@@ -793,7 +783,6 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx)
 	const bool forks = (argc > 0);
 	bool is_pipe = STAT_RECORD ? perf_stat.data.is_pipe : false;
 	struct evlist_cpu_iterator evlist_cpu_itr;
-	struct affinity saved_affinity, *affinity = NULL;
 	int err, open_err = 0;
 	bool second_pass = false, has_supported_counters;
 
@@ -805,14 +794,6 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx)
 		child_pid = evsel_list->workload.pid;
 	}
 
-	if (!cpu_map__is_dummy(evsel_list->core.user_requested_cpus)) {
-		if (affinity__setup(&saved_affinity) < 0) {
-			err = -1;
-			goto err_out;
-		}
-		affinity = &saved_affinity;
-	}
-
 	evlist__for_each_entry(evsel_list, counter) {
 		counter->reset_group = false;
 		if (bpf_counter__load(counter, &target)) {
@@ -825,49 +806,48 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx)
 
 	evlist__reset_aggr_stats(evsel_list);
 
-	evlist__for_each_cpu(evlist_cpu_itr, evsel_list, affinity) {
-		counter = evlist_cpu_itr.evsel;
+	/*
+	 * bperf calls evsel__open_per_cpu() in bperf__load(), so
+	 * no need to call it again here.
+	 */
+	if (!target.use_bpf) {
+		evlist__for_each_cpu(evlist_cpu_itr, evsel_list) {
+			counter = evlist_cpu_itr.evsel;
 
-		/*
-		 * bperf calls evsel__open_per_cpu() in bperf__load(), so
-		 * no need to call it again here.
-		 */
-		if (target.use_bpf)
-			break;
+			if (counter->reset_group || !counter->supported)
+				continue;
+			if (evsel__is_bperf(counter))
+				continue;
 
-		if (counter->reset_group || !counter->supported)
-			continue;
-		if (evsel__is_bperf(counter))
-			continue;
+			while (true) {
+				if (create_perf_stat_counter(counter, &stat_config,
+							      evlist_cpu_itr.cpu_map_idx) == 0)
+					break;
 
-		while (true) {
-			if (create_perf_stat_counter(counter, &stat_config,
-						     evlist_cpu_itr.cpu_map_idx) == 0)
-				break;
+				open_err = errno;
+				/*
+				 * Weak group failed. We cannot just undo this
+				 * here because earlier CPUs might be in group
+				 * mode, and the kernel doesn't support mixing
+				 * group and non group reads. Defer it to later.
+				 * Don't close here because we're in the wrong
+				 * affinity.
+				 */
+				if ((open_err == EINVAL || open_err == EBADF) &&
+					evsel__leader(counter) != counter &&
+					counter->weak_group) {
+					evlist__reset_weak_group(evsel_list, counter, false);
+					assert(counter->reset_group);
+					counter->supported = true;
+					second_pass = true;
+					break;
+				}
 
-			open_err = errno;
-			/*
-			 * Weak group failed. We cannot just undo this here
-			 * because earlier CPUs might be in group mode, and the kernel
-			 * doesn't support mixing group and non group reads. Defer
-			 * it to later.
-			 * Don't close here because we're in the wrong affinity.
-			 */
-			if ((open_err == EINVAL || open_err == EBADF) &&
-				evsel__leader(counter) != counter &&
-				counter->weak_group) {
-				evlist__reset_weak_group(evsel_list, counter, false);
-				assert(counter->reset_group);
-				counter->supported = true;
-				second_pass = true;
-				break;
+				if (stat_handle_error(counter, open_err) != COUNTER_RETRY)
+					break;
 			}
-
-			if (stat_handle_error(counter, open_err) != COUNTER_RETRY)
-				break;
 		}
 	}
-
 	if (second_pass) {
 		/*
 		 * Now redo all the weak group after closing them,
@@ -875,7 +855,7 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx)
 		 */
 
 		/* First close errored or weak retry */
-		evlist__for_each_cpu(evlist_cpu_itr, evsel_list, affinity) {
+		evlist__for_each_cpu(evlist_cpu_itr, evsel_list) {
 			counter = evlist_cpu_itr.evsel;
 
 			if (!counter->reset_group && counter->supported)
@@ -884,7 +864,7 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx)
 			perf_evsel__close_cpu(&counter->core, evlist_cpu_itr.cpu_map_idx);
 		}
 		/* Now reopen weak */
-		evlist__for_each_cpu(evlist_cpu_itr, evsel_list, affinity) {
+		evlist__for_each_cpu(evlist_cpu_itr, evsel_list) {
 			counter = evlist_cpu_itr.evsel;
 
 			if (!counter->reset_group)
@@ -893,17 +873,18 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx)
 			while (true) {
 				pr_debug2("reopening weak %s\n", evsel__name(counter));
 				if (create_perf_stat_counter(counter, &stat_config,
-							     evlist_cpu_itr.cpu_map_idx) == 0)
+							     evlist_cpu_itr.cpu_map_idx) == 0) {
+					evlist_cpu_iterator__exit(&evlist_cpu_itr);
 					break;
-
+				}
 				open_err = errno;
-				if (stat_handle_error(counter, open_err) != COUNTER_RETRY)
+				if (stat_handle_error(counter, open_err) != COUNTER_RETRY) {
+					evlist_cpu_iterator__exit(&evlist_cpu_itr);
 					break;
+				}
 			}
 		}
 	}
-	affinity__cleanup(affinity);
-	affinity = NULL;
 
 	has_supported_counters = false;
 	evlist__for_each_entry(evsel_list, counter) {
@@ -1065,7 +1046,6 @@ err_out:
 	if (forks)
 		evlist__cancel_workload(evsel_list);
 
-	affinity__cleanup(affinity);
 	return err;
 }
 
diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
index 3abc2215e790..45833244daf3 100644
--- a/tools/perf/util/evlist.c
+++ b/tools/perf/util/evlist.c
@@ -359,36 +359,111 @@ int evlist__add_newtp(struct evlist *evlist, const char *sys, const char *name,
 }
 #endif
 
-struct evlist_cpu_iterator evlist__cpu_begin(struct evlist *evlist, struct affinity *affinity)
+/*
+ * Should sched_setaffinity be used with evlist__for_each_cpu? Determine if
+ * migrating the thread will avoid possibly numerous IPIs.
+ */
+static bool evlist__use_affinity(struct evlist *evlist)
+{
+	struct evsel *pos;
+	struct perf_cpu_map *used_cpus = NULL;
+	bool ret = false;
+
+	/*
+	 * With perf record core.user_requested_cpus is usually NULL.
+	 * Use the old method to handle this for now.
+	 */
+	if (!evlist->core.user_requested_cpus ||
+	    cpu_map__is_dummy(evlist->core.user_requested_cpus))
+		return false;
+
+	evlist__for_each_entry(evlist, pos) {
+		struct perf_cpu_map *intersect;
+
+		if (!perf_pmu__benefits_from_affinity(pos->pmu))
+			continue;
+
+		if (evsel__is_dummy_event(pos)) {
+			/*
+			 * The dummy event is opened on all CPUs so assume >1
+			 * event with shared CPUs.
+			 */
+			ret = true;
+			break;
+		}
+		if (evsel__is_retire_lat(pos)) {
+			/*
+			 * Retirement latency events are similar to tool ones in
+			 * their implementation, and so don't require affinity.
+			 */
+			continue;
+		}
+		if (perf_cpu_map__is_empty(used_cpus)) {
+			/* First benefitting event, we want >1 on a common CPU. */
+			used_cpus = perf_cpu_map__get(pos->core.cpus);
+			continue;
+		}
+		if ((pos->core.attr.read_format & PERF_FORMAT_GROUP) &&
+		    evsel__leader(pos) != pos) {
+			/* Skip members of the same sample group. */
+			continue;
+		}
+		intersect = perf_cpu_map__intersect(used_cpus, pos->core.cpus);
+		if (!perf_cpu_map__is_empty(intersect)) {
+			/* >1 event with shared CPUs. */
+			perf_cpu_map__put(intersect);
+			ret = true;
+			break;
+		}
+		perf_cpu_map__put(intersect);
+		perf_cpu_map__merge(&used_cpus, pos->core.cpus);
+	}
+	perf_cpu_map__put(used_cpus);
+	return ret;
+}
+
+void evlist_cpu_iterator__init(struct evlist_cpu_iterator *itr, struct evlist *evlist)
 {
-	struct evlist_cpu_iterator itr = {
+	*itr = (struct evlist_cpu_iterator){
 		.container = evlist,
 		.evsel = NULL,
 		.cpu_map_idx = 0,
 		.evlist_cpu_map_idx = 0,
 		.evlist_cpu_map_nr = perf_cpu_map__nr(evlist->core.all_cpus),
 		.cpu = (struct perf_cpu){ .cpu = -1},
-		.affinity = affinity,
+		.affinity = NULL,
 	};
 
 	if (evlist__empty(evlist)) {
 		/* Ensure the empty list doesn't iterate. */
-		itr.evlist_cpu_map_idx = itr.evlist_cpu_map_nr;
-	} else {
-		itr.evsel = evlist__first(evlist);
-		if (itr.affinity) {
-			itr.cpu = perf_cpu_map__cpu(evlist->core.all_cpus, 0);
-			affinity__set(itr.affinity, itr.cpu.cpu);
-			itr.cpu_map_idx = perf_cpu_map__idx(itr.evsel->core.cpus, itr.cpu);
-			/*
-			 * If this CPU isn't in the evsel's cpu map then advance
-			 * through the list.
-			 */
-			if (itr.cpu_map_idx == -1)
-				evlist_cpu_iterator__next(&itr);
-		}
+		itr->evlist_cpu_map_idx = itr->evlist_cpu_map_nr;
+		return;
 	}
-	return itr;
+
+	if (evlist__use_affinity(evlist)) {
+		if (affinity__setup(&itr->saved_affinity) == 0)
+			itr->affinity = &itr->saved_affinity;
+	}
+	itr->evsel = evlist__first(evlist);
+	itr->cpu = perf_cpu_map__cpu(evlist->core.all_cpus, 0);
+	if (itr->affinity)
+		affinity__set(itr->affinity, itr->cpu.cpu);
+	itr->cpu_map_idx = perf_cpu_map__idx(itr->evsel->core.cpus, itr->cpu);
+	/*
+	 * If this CPU isn't in the evsel's cpu map then advance
+	 * through the list.
+	 */
+	if (itr->cpu_map_idx == -1)
+		evlist_cpu_iterator__next(itr);
+}
+
+void evlist_cpu_iterator__exit(struct evlist_cpu_iterator *itr)
+{
+	if (!itr->affinity)
+		return;
+
+	affinity__cleanup(itr->affinity);
+	itr->affinity = NULL;
 }
 
 void evlist_cpu_iterator__next(struct evlist_cpu_iterator *evlist_cpu_itr)
@@ -418,14 +493,11 @@ void evlist_cpu_iterator__next(struct evlist_cpu_iterator *evlist_cpu_itr)
 		 */
 		if (evlist_cpu_itr->cpu_map_idx == -1)
 			evlist_cpu_iterator__next(evlist_cpu_itr);
+	} else {
+		evlist_cpu_iterator__exit(evlist_cpu_itr);
 	}
 }
 
-bool evlist_cpu_iterator__end(const struct evlist_cpu_iterator *evlist_cpu_itr)
-{
-	return evlist_cpu_itr->evlist_cpu_map_idx >= evlist_cpu_itr->evlist_cpu_map_nr;
-}
-
 static int evsel__strcmp(struct evsel *pos, char *evsel_name)
 {
 	if (!evsel_name)
@@ -453,19 +525,11 @@ static void __evlist__disable(struct evlist *evlist, char *evsel_name, bool excl
 {
 	struct evsel *pos;
 	struct evlist_cpu_iterator evlist_cpu_itr;
-	struct affinity saved_affinity, *affinity = NULL;
 	bool has_imm = false;
 
-	// See explanation in evlist__close()
-	if (!cpu_map__is_dummy(evlist->core.user_requested_cpus)) {
-		if (affinity__setup(&saved_affinity) < 0)
-			return;
-		affinity = &saved_affinity;
-	}
-
 	/* Disable 'immediate' events last */
 	for (int imm = 0; imm <= 1; imm++) {
-		evlist__for_each_cpu(evlist_cpu_itr, evlist, affinity) {
+		evlist__for_each_cpu(evlist_cpu_itr, evlist) {
 			pos = evlist_cpu_itr.evsel;
 			if (evsel__strcmp(pos, evsel_name))
 				continue;
@@ -483,7 +547,6 @@ static void __evlist__disable(struct evlist *evlist, char *evsel_name, bool excl
 			break;
 	}
 
-	affinity__cleanup(affinity);
 	evlist__for_each_entry(evlist, pos) {
 		if (evsel__strcmp(pos, evsel_name))
 			continue;
@@ -523,16 +586,8 @@ static void __evlist__enable(struct evlist *evlist, char *evsel_name, bool excl_
 {
 	struct evsel *pos;
 	struct evlist_cpu_iterator evlist_cpu_itr;
-	struct affinity saved_affinity, *affinity = NULL;
 
-	// See explanation in evlist__close()
-	if (!cpu_map__is_dummy(evlist->core.user_requested_cpus)) {
-		if (affinity__setup(&saved_affinity) < 0)
-			return;
-		affinity = &saved_affinity;
-	}
-
-	evlist__for_each_cpu(evlist_cpu_itr, evlist, affinity) {
+	evlist__for_each_cpu(evlist_cpu_itr, evlist) {
 		pos = evlist_cpu_itr.evsel;
 		if (evsel__strcmp(pos, evsel_name))
 			continue;
@@ -542,7 +597,6 @@ static void __evlist__enable(struct evlist *evlist, char *evsel_name, bool excl_
 			continue;
 		evsel__enable_cpu(pos, evlist_cpu_itr.cpu_map_idx);
 	}
-	affinity__cleanup(affinity);
 	evlist__for_each_entry(evlist, pos) {
 		if (evsel__strcmp(pos, evsel_name))
 			continue;
@@ -1339,30 +1393,14 @@ void evlist__close(struct evlist *evlist)
 {
 	struct evsel *evsel;
 	struct evlist_cpu_iterator evlist_cpu_itr;
-	struct affinity affinity;
-
-	/*
-	 * With perf record core.user_requested_cpus is usually NULL.
-	 * Use the old method to handle this for now.
-	 */
-	if (!evlist->core.user_requested_cpus ||
-	    cpu_map__is_dummy(evlist->core.user_requested_cpus)) {
-		evlist__for_each_entry_reverse(evlist, evsel)
-			evsel__close(evsel);
-		return;
-	}
-
-	if (affinity__setup(&affinity) < 0)
-		return;
 
-	evlist__for_each_cpu(evlist_cpu_itr, evlist, &affinity) {
+	evlist__for_each_cpu(evlist_cpu_itr, evlist) {
 		if (evlist_cpu_itr.cpu_map_idx == 0 && evsel__is_retire_lat(evlist_cpu_itr.evsel))
 			evsel__tpebs_close(evlist_cpu_itr.evsel);
 		perf_evsel__close_cpu(&evlist_cpu_itr.evsel->core,
 				      evlist_cpu_itr.cpu_map_idx);
 	}
 
-	affinity__cleanup(&affinity);
 	evlist__for_each_entry_reverse(evlist, evsel) {
 		perf_evsel__free_fd(&evsel->core);
 		perf_evsel__free_id(&evsel->core);
diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h
index 911834ae7c2a..30dff7484d3c 100644
--- a/tools/perf/util/evlist.h
+++ b/tools/perf/util/evlist.h
@@ -10,6 +10,7 @@
 #include <internal/evlist.h>
 #include <internal/evsel.h>
 #include <perf/evlist.h>
+#include "affinity.h"
 #include "events_stats.h"
 #include "evsel.h"
 #include "rblist.h"
@@ -363,6 +364,8 @@ struct evlist_cpu_iterator {
 	struct perf_cpu cpu;
 	/** If present, used to set the affinity when switching between CPUs. */
 	struct affinity *affinity;
+	/** Maybe be used to hold affinity state prior to iterating. */
+	struct affinity saved_affinity;
 };
 
 /**
@@ -370,22 +373,31 @@ struct evlist_cpu_iterator {
  *                        affinity, iterate over all CPUs and then the evlist
  *                        for each evsel on that CPU. When switching between
  *                        CPUs the affinity is set to the CPU to avoid IPIs
- *                        during syscalls.
+ *                        during syscalls. The affinity is set up and removed
+ *                        automatically, if the loop is broken a call to
+ *                        evlist_cpu_iterator__exit is necessary.
  * @evlist_cpu_itr: the iterator instance.
  * @evlist: evlist instance to iterate.
- * @affinity: NULL or used to set the affinity to the current CPU.
  */
-#define evlist__for_each_cpu(evlist_cpu_itr, evlist, affinity)		\
-	for ((evlist_cpu_itr) = evlist__cpu_begin(evlist, affinity);	\
+#define evlist__for_each_cpu(evlist_cpu_itr, evlist)			\
+	for (evlist_cpu_iterator__init(&(evlist_cpu_itr), evlist);	\
 	     !evlist_cpu_iterator__end(&evlist_cpu_itr);		\
 	     evlist_cpu_iterator__next(&evlist_cpu_itr))
 
-/** Returns an iterator set to the first CPU/evsel of evlist. */
-struct evlist_cpu_iterator evlist__cpu_begin(struct evlist *evlist, struct affinity *affinity);
+/** Setup an iterator set to the first CPU/evsel of evlist. */
+void evlist_cpu_iterator__init(struct evlist_cpu_iterator *itr, struct evlist *evlist);
+/**
+ * Cleans up the iterator, automatically done by evlist_cpu_iterator__next when
+ * the end of the list is reached. Multiple calls are safe.
+ */
+void evlist_cpu_iterator__exit(struct evlist_cpu_iterator *itr);
 /** Move to next element in iterator, updating CPU, evsel and the affinity. */
 void evlist_cpu_iterator__next(struct evlist_cpu_iterator *evlist_cpu_itr);
 /** Returns true when iterator is at the end of the CPUs and evlist. */
-bool evlist_cpu_iterator__end(const struct evlist_cpu_iterator *evlist_cpu_itr);
+static inline bool evlist_cpu_iterator__end(const struct evlist_cpu_iterator *evlist_cpu_itr)
+{
+	return evlist_cpu_itr->evlist_cpu_map_idx >= evlist_cpu_itr->evlist_cpu_map_nr;
+}
 
 struct evsel *evlist__get_tracking_event(struct evlist *evlist);
 void evlist__set_tracking_event(struct evlist *evlist, struct evsel *tracking_evsel);
diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c
index 81ab74681c9b..5cdd350e8885 100644
--- a/tools/perf/util/pmu.c
+++ b/tools/perf/util/pmu.c
@@ -2375,6 +2375,18 @@ bool perf_pmu__is_software(const struct perf_pmu *pmu)
 	return false;
 }
 
+bool perf_pmu__benefits_from_affinity(struct perf_pmu *pmu)
+{
+	if (!pmu)
+		return true; /* Assume is core. */
+
+	/*
+	 * All perf event PMUs should benefit from accessing the perf event
+	 * contexts on the local CPU.
+	 */
+	return pmu->type <= PERF_PMU_TYPE_PE_END;
+}
+
 FILE *perf_pmu__open_file(const struct perf_pmu *pmu, const char *name)
 {
 	char path[PATH_MAX];
diff --git a/tools/perf/util/pmu.h b/tools/perf/util/pmu.h
index 41c21389f393..0d9f3c57e8e8 100644
--- a/tools/perf/util/pmu.h
+++ b/tools/perf/util/pmu.h
@@ -303,6 +303,7 @@ bool perf_pmu__name_no_suffix_match(const struct perf_pmu *pmu, const char *to_m
  *                        perf_sw_context in the kernel?
  */
 bool perf_pmu__is_software(const struct perf_pmu *pmu);
+bool perf_pmu__benefits_from_affinity(struct perf_pmu *pmu);
 
 FILE *perf_pmu__open_file(const struct perf_pmu *pmu, const char *name);
 FILE *perf_pmu__open_file_at(const struct perf_pmu *pmu, int dirfd, const char *name);
-- 
cgit v1.2.3


From 5d1ab659fb93eed85d6d8b2937013360157032f4 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Mon, 9 Feb 2026 22:03:59 -0800
Subject: perf stat: Add no-affinity flag

Add flag that disables affinity behavior.

Using sched_setaffinity() to place a perf thread on a CPU can avoid
certain interprocessor interrupts but may introduce a delay due to the
scheduling, particularly on loaded machines.

Add a command line option to disable the behavior.

This behavior is less present in other tools like `perf record`, as it
uses a ring buffer and doesn't make repeated system calls.

Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Andres Freund <andres@anarazel.de>
Cc: Dapeng Mi <dapeng1.mi@linux.intel.com>
Cc: Dr. David Alan Gilbert <linux@treblig.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Falcon <thomas.falcon@intel.com>
Cc: Thomas Richter <tmricht@linux.ibm.com>
Cc: Yang Li <yang.lee@linux.alibaba.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/Documentation/perf-stat.txt | 5 +++++
 tools/perf/builtin-stat.c              | 6 ++++++
 tools/perf/util/evlist.c               | 6 +-----
 tools/perf/util/evlist.h               | 1 +
 4 files changed, 13 insertions(+), 5 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt
index 1a766d4a2233..7cccc3a847d1 100644
--- a/tools/perf/Documentation/perf-stat.txt
+++ b/tools/perf/Documentation/perf-stat.txt
@@ -382,6 +382,11 @@ color the metric's computed value.
 Don't print output, warnings or messages. This is useful with perf stat
 record below to only write data to the perf.data file.
 
+--no-affinity::
+Don't change scheduler CPU affinities when iterating over
+CPUs. Disables an optimization aimed at minimizing interprocessor
+interrupts.
+
 STAT RECORD
 -----------
 Stores stat data into perf data file.
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index c1bb40b99176..73c2ba7e3076 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -2426,6 +2426,7 @@ static int parse_tpebs_mode(const struct option *opt, const char *str,
 int cmd_stat(int argc, const char **argv)
 {
 	struct opt_aggr_mode opt_mode = {};
+	bool affinity = true, affinity_set = false;
 	struct option stat_options[] = {
 		OPT_BOOLEAN('T', "transaction", &transaction_run,
 			"hardware transaction statistics"),
@@ -2554,6 +2555,8 @@ int cmd_stat(int argc, const char **argv)
 			"don't print 'summary' for CSV summary output"),
 		OPT_BOOLEAN(0, "quiet", &quiet,
 			"don't print any output, messages or warnings (useful with record)"),
+		OPT_BOOLEAN_SET(0, "affinity", &affinity, &affinity_set,
+			"enable (default) or disable affinity optimizations to reduce IPIs"),
 		OPT_CALLBACK(0, "cputype", &evsel_list, "hybrid cpu type",
 			"Only enable events on applying cpu with this type "
 			"for hybrid platform (e.g. core or atom)",
@@ -2611,6 +2614,9 @@ int cmd_stat(int argc, const char **argv)
 	} else
 		stat_config.csv_sep = DEFAULT_SEPARATOR;
 
+	if (affinity_set)
+		evsel_list->no_affinity = !affinity;
+
 	if (argc && strlen(argv[0]) > 2 && strstarts("record", argv[0])) {
 		argc = __cmd_record(stat_options, &opt_mode, argc, argv);
 		if (argc < 0)
diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
index 45833244daf3..591bdf0b3e2a 100644
--- a/tools/perf/util/evlist.c
+++ b/tools/perf/util/evlist.c
@@ -369,11 +369,7 @@ static bool evlist__use_affinity(struct evlist *evlist)
 	struct perf_cpu_map *used_cpus = NULL;
 	bool ret = false;
 
-	/*
-	 * With perf record core.user_requested_cpus is usually NULL.
-	 * Use the old method to handle this for now.
-	 */
-	if (!evlist->core.user_requested_cpus ||
+	if (evlist->no_affinity || !evlist->core.user_requested_cpus ||
 	    cpu_map__is_dummy(evlist->core.user_requested_cpus))
 		return false;
 
diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h
index 30dff7484d3c..d17c3b57a409 100644
--- a/tools/perf/util/evlist.h
+++ b/tools/perf/util/evlist.h
@@ -59,6 +59,7 @@ struct event_enable_timer;
 struct evlist {
 	struct perf_evlist core;
 	bool		 enabled;
+	bool		 no_affinity;
 	int		 id_pos;
 	int		 is_pos;
 	int		 nr_br_cntr;
-- 
cgit v1.2.3


From 9adbe8935152c511c1e43a47d69f44f0e969afc8 Mon Sep 17 00:00:00 2001
From: Günther Noack <gnoack3000@gmail.com>
Date: Fri, 6 Feb 2026 16:11:53 +0100
Subject: selftests/landlock: Add filesystem access benchmark
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

fs_bench benchmarks the performance of Landlock's path walk
by exercising it in a scenario that amplifies Landlock's overhead:

* Create a large number of nested directories
* Enforce a Landlock policy in which a rule is associated with each of
  these subdirectories
* Benchmark openat() applied to the deepest directory,
  forcing Landlock to walk the entire path.

Signed-off-by: Günther Noack <gnoack3000@gmail.com>
Link: https://lore.kernel.org/r/20260206151154.97915-3-gnoack3000@gmail.com
[mic: Fix missing mode with O_CREAT, improve text consistency, sort
includes]
Signed-off-by: Mickaël Salaün <mic@digikod.net>
---
 tools/testing/selftests/landlock/.gitignore |   1 +
 tools/testing/selftests/landlock/Makefile   |   1 +
 tools/testing/selftests/landlock/fs_bench.c | 214 ++++++++++++++++++++++++++++
 3 files changed, 216 insertions(+)
 create mode 100644 tools/testing/selftests/landlock/fs_bench.c

(limited to 'tools')

diff --git a/tools/testing/selftests/landlock/.gitignore b/tools/testing/selftests/landlock/.gitignore
index a820329cae0d..1974e17a2611 100644
--- a/tools/testing/selftests/landlock/.gitignore
+++ b/tools/testing/selftests/landlock/.gitignore
@@ -1,4 +1,5 @@
 /*_test
+/fs_bench
 /sandbox-and-launch
 /true
 /wait-pipe
diff --git a/tools/testing/selftests/landlock/Makefile b/tools/testing/selftests/landlock/Makefile
index 044b83bde16e..fc43225d319a 100644
--- a/tools/testing/selftests/landlock/Makefile
+++ b/tools/testing/selftests/landlock/Makefile
@@ -9,6 +9,7 @@ LOCAL_HDRS += $(wildcard *.h)
 src_test := $(wildcard *_test.c)
 
 TEST_GEN_PROGS := $(src_test:.c=)
+TEST_GEN_PROGS += fs_bench
 
 TEST_GEN_PROGS_EXTENDED := \
 	true \
diff --git a/tools/testing/selftests/landlock/fs_bench.c b/tools/testing/selftests/landlock/fs_bench.c
new file mode 100644
index 000000000000..d13a88dcd1ed
--- /dev/null
+++ b/tools/testing/selftests/landlock/fs_bench.c
@@ -0,0 +1,214 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Landlock filesystem benchmark
+ *
+ * This program benchmarks the time required for file access checks.  We use a
+ * large number (-d flag) of nested directories where each directory inode has
+ * an associated Landlock rule, and we repeatedly (-n flag) exercise a file
+ * access for which Landlock has to walk the path all the way up to the root.
+ *
+ * With an increasing number of nested subdirectories, Landlock's portion of the
+ * overall system call time increases, which makes the effects of Landlock
+ * refactorings more measurable.
+ *
+ * This benchmark does *not* measure the building of the Landlock ruleset.  The
+ * time required to add all these rules is not large enough to be easily
+ * measurable.  A separate benchmark tool would be better to test that, and that
+ * tool could then also use a simpler file system layout.
+ *
+ * Copyright © 2026 Google LLC
+ */
+
+#define _GNU_SOURCE
+#include <err.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <linux/landlock.h>
+#include <linux/prctl.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/prctl.h>
+#include <sys/stat.h>
+#include <sys/times.h>
+#include <time.h>
+#include <unistd.h>
+
+#include "wrappers.h"
+
+static void usage(const char *const argv0)
+{
+	printf("Usage:\n");
+	printf("  %s [OPTIONS]\n", argv0);
+	printf("\n");
+	printf("  Benchmark expensive Landlock checks for D nested dirs\n");
+	printf("\n");
+	printf("Options:\n");
+	printf("  -h	help\n");
+	printf("  -L	disable Landlock (as a baseline)\n");
+	printf("  -d D	set directory depth to D\n");
+	printf("  -n N	set number of benchmark iterations to N\n");
+}
+
+/*
+ * Build a deep directory, enforce Landlock and return the FD to the
+ * deepest dir.  On any failure, exit the process with an error.
+ */
+static int build_directory(size_t depth, const bool use_landlock)
+{
+	const char *path = "d"; /* directory name */
+	int abi, ruleset_fd, curr, prev;
+
+	if (use_landlock) {
+		abi = landlock_create_ruleset(NULL, 0,
+					      LANDLOCK_CREATE_RULESET_VERSION);
+		if (abi < 7)
+			err(1, "Landlock ABI too low: got %d, wanted 7+", abi);
+	}
+
+	ruleset_fd = -1;
+	if (use_landlock) {
+		struct landlock_ruleset_attr attr = {
+			.handled_access_fs = LANDLOCK_ACCESS_FS_IOCTL_DEV |
+					     LANDLOCK_ACCESS_FS_WRITE_FILE |
+					     LANDLOCK_ACCESS_FS_MAKE_REG,
+		};
+		ruleset_fd = landlock_create_ruleset(&attr, sizeof(attr), 0U);
+		if (ruleset_fd < 0)
+			err(1, "landlock_create_ruleset");
+	}
+
+	curr = open(".", O_PATH);
+	if (curr < 0)
+		err(1, "open(.)");
+
+	while (depth--) {
+		if (use_landlock) {
+			struct landlock_path_beneath_attr attr = {
+				.allowed_access = LANDLOCK_ACCESS_FS_IOCTL_DEV,
+				.parent_fd = curr,
+			};
+			if (landlock_add_rule(ruleset_fd,
+					      LANDLOCK_RULE_PATH_BENEATH, &attr,
+					      0) < 0)
+				err(1, "landlock_add_rule");
+		}
+
+		if (mkdirat(curr, path, 0700) < 0)
+			err(1, "mkdirat(%s)", path);
+
+		prev = curr;
+		curr = openat(curr, path, O_PATH);
+		if (curr < 0)
+			err(1, "openat(%s)", path);
+
+		close(prev);
+	}
+
+	if (use_landlock) {
+		if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0)
+			err(1, "prctl");
+
+		if (landlock_restrict_self(ruleset_fd, 0) < 0)
+			err(1, "landlock_restrict_self");
+	}
+
+	close(ruleset_fd);
+	return curr;
+}
+
+static void remove_recursively(const size_t depth)
+{
+	const char *path = "d"; /* directory name */
+
+	int fd = openat(AT_FDCWD, ".", O_PATH);
+
+	if (fd < 0)
+		err(1, "openat(.)");
+
+	for (size_t i = 0; i < depth - 1; i++) {
+		int oldfd = fd;
+
+		fd = openat(fd, path, O_PATH);
+		if (fd < 0)
+			err(1, "openat(%s)", path);
+		close(oldfd);
+	}
+
+	for (size_t i = 0; i < depth; i++) {
+		if (unlinkat(fd, path, AT_REMOVEDIR) < 0)
+			err(1, "unlinkat(%s)", path);
+		int newfd = openat(fd, "..", O_PATH);
+
+		close(fd);
+		fd = newfd;
+	}
+	close(fd);
+}
+
+int main(int argc, char *argv[])
+{
+	bool use_landlock = true;
+	size_t num_iterations = 100000;
+	size_t num_subdirs = 10000;
+	int c, curr, fd;
+	struct tms start_time, end_time;
+
+	setbuf(stdout, NULL);
+	while ((c = getopt(argc, argv, "hLd:n:")) != -1) {
+		switch (c) {
+		case 'h':
+			usage(argv[0]);
+			return EXIT_SUCCESS;
+		case 'L':
+			use_landlock = false;
+			break;
+		case 'd':
+			num_subdirs = atoi(optarg);
+			break;
+		case 'n':
+			num_iterations = atoi(optarg);
+			break;
+		default:
+			usage(argv[0]);
+			return EXIT_FAILURE;
+		}
+	}
+
+	printf("*** Benchmark ***\n");
+	printf("%zu dirs, %zu iterations, %s Landlock\n", num_subdirs,
+	       num_iterations, use_landlock ? "with" : "without");
+
+	if (times(&start_time) == -1)
+		err(1, "times");
+
+	curr = build_directory(num_subdirs, use_landlock);
+
+	for (int i = 0; i < num_iterations; i++) {
+		fd = openat(curr, "file.txt", O_CREAT | O_TRUNC | O_WRONLY,
+			    0600);
+		if (use_landlock) {
+			if (fd == 0)
+				errx(1, "openat succeeded, expected EACCES");
+			if (errno != EACCES)
+				err(1, "openat expected EACCES, but got");
+		}
+		if (fd != -1)
+			close(fd);
+	}
+
+	if (times(&end_time) == -1)
+		err(1, "times");
+
+	printf("*** Benchmark concluded ***\n");
+	printf("System: %ld clocks\n",
+	       end_time.tms_stime - start_time.tms_stime);
+	printf("User  : %ld clocks\n",
+	       end_time.tms_utime - start_time.tms_utime);
+	printf("Clocks per second: %ld\n", CLOCKS_PER_SEC);
+
+	close(curr);
+
+	remove_recursively(num_subdirs);
+}
-- 
cgit v1.2.3


From ccb3272666989effd24a3354696e4cc5dea80661 Mon Sep 17 00:00:00 2001
From: Dimitri Daskalakis <dimitri.daskalakis1@gmail.com>
Date: Fri, 6 Feb 2026 17:30:18 -0800
Subject: selftests: drivers: net: hw: Modify toeplitz.c to poll for packets

Prior to this the receiver would sleep for the configured timeout,
then attempt to receive as many packets as possible. This would result
in a large burst of packets, and we don't necessarily need that many samples.

The tests now run faster.

Before

 ok 12 toeplitz.test.rps_udp_ipv6
 # Totals: pass:12 fail:0 xfail:0 xpass:0 skip:0 error:0

 real	0m54.792s
 user	0m12.486s
 sys	0m10.887s

After

 ok 12 toeplitz.test.rps_udp_ipv6
 # Totals: pass:12 fail:0 xfail:0 xpass:0 skip:0 error:0

 real	0m36.892s
 user	0m4.203s
 sys	0m8.314s

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Dimitri Daskalakis <dimitri.daskalakis1@gmail.com>
Link: https://patch.msgid.link/20260207013018.551347-1-dimitri.daskalakis1@gmail.com
[pabeni@redhat.com: whitespaces fixes]
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 tools/testing/selftests/drivers/net/hw/toeplitz.c | 26 +++++++++++++++++++----
 1 file changed, 22 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/drivers/net/hw/toeplitz.c b/tools/testing/selftests/drivers/net/hw/toeplitz.c
index cd4bf58a44ee..035bf908d8d9 100644
--- a/tools/testing/selftests/drivers/net/hw/toeplitz.c
+++ b/tools/testing/selftests/drivers/net/hw/toeplitz.c
@@ -72,6 +72,8 @@
 
 #define RPS_MAX_CPUS 16UL	/* must be a power of 2 */
 
+#define MIN_PKT_SAMPLES 40	/* minimum number of packets to receive */
+
 /* configuration options (cmdline arguments) */
 static uint16_t cfg_dport =	8000;
 static int cfg_family =		AF_INET6;
@@ -251,15 +253,31 @@ static bool recv_block(struct ring_state *ring)
 	return true;
 }
 
-/* simple test: sleep once unconditionally and then process all rings */
+/* simple test: process all rings until MIN_PKT_SAMPLES packets are received,
+ * or the test times out.
+ */
 static void process_rings(void)
 {
+	struct timeval start, now;
+	bool pkts_found = true;
+	long elapsed_usec;
 	int i;
 
-	usleep(1000 * cfg_timeout_msec);
+	gettimeofday(&start, NULL);
 
-	for (i = 0; i < num_cpus; i++)
-		do {} while (recv_block(&rings[i]));
+	do {
+		if (!pkts_found)
+			usleep(100);
+
+		pkts_found = false;
+		for (i = 0; i < num_cpus; i++)
+			pkts_found |= recv_block(&rings[i]);
+
+		gettimeofday(&now, NULL);
+		elapsed_usec = (now.tv_sec - start.tv_sec) * 1000000 +
+			       (now.tv_usec - start.tv_usec);
+	} while (frames_received - frames_nohash < MIN_PKT_SAMPLES &&
+		 elapsed_usec < cfg_timeout_msec * 1000);
 
 	fprintf(stderr, "count: pass=%u nohash=%u fail=%u\n",
 		frames_received - frames_nohash - frames_error,
-- 
cgit v1.2.3


From 68b4fe32d73789dea23e356f468de67c8367ef8f Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@kernel.org>
Date: Tue, 10 Feb 2026 13:45:22 -0800
Subject: kbuild: Add objtool to top-level clean target

Objtool is an integral part of the build, make sure it gets cleaned by
"make clean" and "make mrproper".

Fixes: 442f04c34a1a ("objtool: Add tool to perform compile-time stack metadata validation")
Reported-by: Jens Remus <jremus@linux.ibm.com>
Closes: https://lore.kernel.org/15f2af3b-be33-46fc-b972-6b8e7e0aa52e@linux.ibm.com
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Tested-by: Jens Remus <jremus@linux.ibm.com>
Link: https://patch.msgid.link/968faf2ed30fa8b3519f79f01a1ecfe7929553e5.1770759919.git.jpoimboe@kernel.org
[nathan: use Closes: instead of Link: per checkpatch.pl]
Signed-off-by: Nathan Chancellor <nathan@kernel.org>
---
 Makefile               | 11 ++++++++++-
 tools/objtool/Makefile |  2 ++
 2 files changed, 12 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/Makefile b/Makefile
index ff6926d3f97b..777dc5d70273 100644
--- a/Makefile
+++ b/Makefile
@@ -1481,6 +1481,15 @@ ifneq ($(wildcard $(resolve_btfids_O)),)
 	$(Q)$(MAKE) -sC $(srctree)/tools/bpf/resolve_btfids O=$(resolve_btfids_O) clean
 endif
 
+PHONY += objtool_clean
+
+objtool_O = $(abspath $(objtree))/tools/objtool
+
+objtool_clean:
+ifneq ($(wildcard $(objtool_O)),)
+	$(Q)$(MAKE) -sC $(abs_srctree)/tools/objtool O=$(objtool_O) srctree=$(abs_srctree) clean
+endif
+
 tools/: FORCE
 	$(Q)mkdir -p $(objtree)/tools
 	$(Q)$(MAKE) O=$(abspath $(objtree)) subdir=tools -C $(srctree)/tools/
@@ -1644,7 +1653,7 @@ vmlinuxclean:
 	$(Q)$(CONFIG_SHELL) $(srctree)/scripts/link-vmlinux.sh clean
 	$(Q)$(if $(ARCH_POSTLINK), $(MAKE) -f $(ARCH_POSTLINK) clean)
 
-clean: archclean vmlinuxclean resolve_btfids_clean
+clean: archclean vmlinuxclean resolve_btfids_clean objtool_clean
 
 # mrproper - Delete all generated files, including .config
 #
diff --git a/tools/objtool/Makefile b/tools/objtool/Makefile
index a40f30232929..6964175abdfd 100644
--- a/tools/objtool/Makefile
+++ b/tools/objtool/Makefile
@@ -29,6 +29,8 @@ srctree := $(patsubst %/,%,$(dir $(CURDIR)))
 srctree := $(patsubst %/,%,$(dir $(srctree)))
 endif
 
+RM ?= rm -f
+
 LIBSUBCMD_DIR = $(srctree)/tools/lib/subcmd/
 ifneq ($(OUTPUT),)
   LIBSUBCMD_OUTPUT = $(abspath $(OUTPUT))/libsubcmd
-- 
cgit v1.2.3


From 11fece49e956ef97318177f5af15a84317594244 Mon Sep 17 00:00:00 2001
From: David Carlier <devnexen@gmail.com>
Date: Wed, 11 Feb 2026 21:52:13 +0000
Subject: tools/sched_ext: scx_flatcg: zero-initialize stats counter array

The local cnts array in read_stats() is not initialized before being
accumulated into per-CPU stats, which may lead to reading garbage
values. Zero it out with memset alongside the existing stats array
initialization.

Signed-off-by: David Carlier <devnexen@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 tools/sched_ext/scx_flatcg.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'tools')

diff --git a/tools/sched_ext/scx_flatcg.c b/tools/sched_ext/scx_flatcg.c
index cd85eb401179..bea76d060201 100644
--- a/tools/sched_ext/scx_flatcg.c
+++ b/tools/sched_ext/scx_flatcg.c
@@ -106,6 +106,7 @@ static void fcg_read_stats(struct scx_flatcg *skel, __u64 *stats)
 	__u32 idx;
 
 	memset(stats, 0, sizeof(stats[0]) * FCG_NR_STATS);
+	memset(cnts, 0, sizeof(cnts));
 
 	for (idx = 0; idx < FCG_NR_STATS; idx++) {
 		int ret, cpu;
-- 
cgit v1.2.3


From 988369d236e46e6bc68d2616fbc008aa6b06a454 Mon Sep 17 00:00:00 2001
From: David Carlier <devnexen@gmail.com>
Date: Wed, 11 Feb 2026 21:30:27 +0000
Subject: tools/sched_ext: scx_central: fix sched_setaffinity() call with the
 set size

The cpu set is dynamically allocated for nr_cpu_ids using CPU_ALLOC(),
so the size passed to sched_setaffinity() should be CPU_ALLOC_SIZE()
rather than sizeof(cpu_set_t). Valgrind flagged this as accessing
unaddressable bytes past the allocation.

Signed-off-by: David Carlier <devnexen@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 tools/sched_ext/scx_central.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/sched_ext/scx_central.c b/tools/sched_ext/scx_central.c
index 55931a4cd71c..a6dfd45de70c 100644
--- a/tools/sched_ext/scx_central.c
+++ b/tools/sched_ext/scx_central.c
@@ -50,6 +50,7 @@ int main(int argc, char **argv)
 	__u64 seq = 0, ecode;
 	__s32 opt;
 	cpu_set_t *cpuset;
+	size_t cpuset_size;
 
 	libbpf_set_print(libbpf_print_fn);
 	signal(SIGINT, sigint_handler);
@@ -106,9 +107,10 @@ restart:
 	 */
 	cpuset = CPU_ALLOC(skel->rodata->nr_cpu_ids);
 	SCX_BUG_ON(!cpuset, "Failed to allocate cpuset");
-	CPU_ZERO_S(CPU_ALLOC_SIZE(skel->rodata->nr_cpu_ids), cpuset);
+	cpuset_size = CPU_ALLOC_SIZE(skel->rodata->nr_cpu_ids);
+	CPU_ZERO_S(cpuset_size, cpuset);
 	CPU_SET(skel->rodata->central_cpu, cpuset);
-	SCX_BUG_ON(sched_setaffinity(0, sizeof(*cpuset), cpuset),
+	SCX_BUG_ON(sched_setaffinity(0, cpuset_size, cpuset),
 		   "Failed to affinitize to central CPU %d (max %d)",
 		   skel->rodata->central_cpu, skel->rodata->nr_cpu_ids - 1);
 	CPU_FREE(cpuset);
-- 
cgit v1.2.3


From 1a6c45969a85d1be43dbbf0705aef4bc8eb515a8 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 10 Feb 2026 21:38:27 -0800
Subject: perf libunwind: Fix calls to thread__e_machine()

Add the missing 'e_flags' option to fix the build.

Fixes: 4e66527f8859a661 ("perf thread: Add optional e_flags output argument to thread__e_machine")
Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/unwind-libunwind-local.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/unwind-libunwind-local.c b/tools/perf/util/unwind-libunwind-local.c
index ecf0c01fe51f..5b39ce21e333 100644
--- a/tools/perf/util/unwind-libunwind-local.c
+++ b/tools/perf/util/unwind-libunwind-local.c
@@ -584,7 +584,9 @@ static int access_mem(unw_addr_space_t __maybe_unused as,
 	}
 
 	ret = perf_reg_value(&start, perf_sample__user_regs(ui->sample),
-			     perf_arch_reg_sp(thread__e_machine(ui->thread, ui->machine)));
+			     perf_arch_reg_sp(thread__e_machine(ui->thread,
+								ui->machine,
+								/*e_flags=*/NULL)));
 	if (ret)
 		return ret;
 
@@ -733,7 +735,7 @@ static void _unwind__finish_access(struct maps *maps)
 static int get_entries(struct unwind_info *ui, unwind_entry_cb_t cb,
 		       void *arg, int max_stack)
 {
-	uint16_t e_machine = thread__e_machine(ui->thread, ui->machine);
+	uint16_t e_machine = thread__e_machine(ui->thread, ui->machine, /*e_flags=*/NULL);
 	u64 val;
 	unw_word_t ips[max_stack];
 	unw_addr_space_t addr_space;
-- 
cgit v1.2.3


From 804490c3eb26098b60c5e858fa20c0e6f2c2c1d8 Mon Sep 17 00:00:00 2001
From: Dmitrii Dolgov <9erthalion6@gmail.com>
Date: Wed, 11 Feb 2026 10:58:01 +0100
Subject: tools build: Fix feature test for rust compiler

Currently a dummy rust code is compiled to detect if the rust feature
could be enabled. It turns out that in this case rust emits a dependency
file without any external references:

    /perf/feature/test-rust.d: test-rust.rs

    /perf/feature/test-rust.bin: test-rust.rs

    test-rust.rs:

This can lead to a situation, when rustc was removed after a successful build,
but the build process still thinks it's there and the feature is enabled on
subsequent runs.

Instead simply check the compiler presence to detect the feature, as
suggested by Arnaldo.

This way no actual test-rust.bin will be created, meaning the feature
check will not be cached and always performed. That's exactly what we
want, and the overhead of doing this every time is minimal.

Tested with multiple rounds of install/remove of the rust package.

Reported-by: Arnaldo Carvalho de Melo <acme@kernel.org>
Suggested-by: Arnaldo Carvalho de Melo <acme@kernel.org>
Signed-off-by: Dmitrii Dolgov <9erthalion6@gmail.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/build/feature/Makefile     | 10 ++++++----
 tools/build/feature/test-rust.rs |  4 ----
 2 files changed, 6 insertions(+), 8 deletions(-)
 delete mode 100644 tools/build/feature/test-rust.rs

(limited to 'tools')

diff --git a/tools/build/feature/Makefile b/tools/build/feature/Makefile
index e959caa7f1c7..1fbcb3ce74d2 100644
--- a/tools/build/feature/Makefile
+++ b/tools/build/feature/Makefile
@@ -113,9 +113,6 @@ __BUILD = $(CC) $(CFLAGS) -MD -Wall -Werror -o $@ $(patsubst %.bin,%.c,$(@F)) $(
 __BUILDXX = $(CXX) $(CXXFLAGS) -MD -Wall -Werror -o $@ $(patsubst %.bin,%.cpp,$(@F)) $(LDFLAGS)
   BUILDXX = $(__BUILDXX) > $(@:.bin=.make.output) 2>&1
 
-__BUILDRS = $(RUSTC) $(RUSTC_FLAGS) --emit=dep-info=$(patsubst %.bin,%.d,$(@F)),link -o $@ $(patsubst %.bin,%.rs,$(@F))
-  BUILDRS = $(__BUILDRS) > $(@:.bin=.make.output) 2>&1
-
 ###############################
 
 $(OUTPUT)test-all.bin:
@@ -393,8 +390,13 @@ $(OUTPUT)test-bpftool-skeletons.bin:
 	$(SYSTEM_BPFTOOL) version | grep '^features:.*skeletons' \
 		> $(@:.bin=.make.output) 2>&1
 
+# Testing Rust is special: we don't compile anything, it's enough to check the
+# compiler presence. Compiling a test code for this purposes is problematic,
+# because Rust will emit a dependency file without any external references,
+# meaning that if rustc will be removed the build process will still think it's
+# there.
 $(OUTPUT)test-rust.bin:
-	$(BUILDRS) > $(@:.bin=.make.output) 2>&1
+	$(RUSTC) --version > /dev/null 2>&1
 
 ###############################
 
diff --git a/tools/build/feature/test-rust.rs b/tools/build/feature/test-rust.rs
deleted file mode 100644
index f2fc91cc4f69..000000000000
--- a/tools/build/feature/test-rust.rs
+++ /dev/null
@@ -1,4 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-fn main() {
-    println!("hi")
-}
-- 
cgit v1.2.3


From adc1284bae3cfd25df785d55b900a8778ad79366 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Wed, 11 Feb 2026 09:46:22 -0300
Subject: perf test data_type_profiling.sh: Skip just the Rust tests if
 code_with_type workload is missing

Namhyung suggested skipping only the rust tests when the code_with_type
'perf test' workload is not built into perf, do it so that we can
continue to test the C based workloads:

With rust:

  root@number:/# perf test -vv "data type"
   83: perf data type profiling tests:
  --- start ---
  test child forked, pid 2645245
  Basic Rust perf annotate test
  Basic annotate test [Success]
  Pipe Rust perf annotate test
  Pipe annotate test [Success]
  Basic C perf annotate test
  Basic annotate test [Success]
  Pipe C perf annotate test
  Pipe annotate test [Success]
  ---- end(0) ----
   83: perf data type profiling tests                                  : Ok
  root@number:/#

Without:

  root@number:/# perf test "data type"
   83: perf data type profiling tests                                  : Ok
  root@number:/# perf test -vv "data type"
   83: perf data type profiling tests:
  --- start ---
  test child forked, pid 2634759
  Basic Rust perf annotate test
  Skip: code_with_type workload not built in 'perf test'
  Pipe Rust perf annotate test
  Skip: code_with_type workload not built in 'perf test'
  Basic C perf annotate test
  Basic annotate test [Success]
  Pipe C perf annotate test
  Pipe annotate test [Success]
  ---- end(0) ----
   83: perf data type profiling tests                                  : Ok
  root@number:/#

Suggested-by: Namhyung Kim <namhyung@kernel.org>
Cc: Dmitrii Dolgov <9erthalion6@gmail.com>
Cc: Miguel Ojeda <ojeda@kernel.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/tests/shell/data_type_profiling.sh | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/tests/shell/data_type_profiling.sh b/tools/perf/tests/shell/data_type_profiling.sh
index 3ef72a10850d..2a7f8f7c42d0 100755
--- a/tools/perf/tests/shell/data_type_profiling.sh
+++ b/tools/perf/tests/shell/data_type_profiling.sh
@@ -4,11 +4,6 @@
 
 set -e
 
-if ! perf test --list-workloads | grep -qw code_with_type ; then
-	echo "Skip: code_with_type workload not built in 'perf test'"
-	exit 2
-fi
-
 # The logic below follows the same line as the annotate test, but looks for a
 # data type profiling manifestation
 
@@ -42,6 +37,11 @@ test_basic_annotate() {
 
   case "x${runtime}" in
     "xRust")
+    if ! perf check feature -q rust
+    then
+      echo "Skip: code_with_type workload not built in 'perf test'"
+      return
+    fi
     index=0 ;;
 
     "xC")
-- 
cgit v1.2.3


From 9eb1760f846a38ea4ef1e5e177a2a0415e34c267 Mon Sep 17 00:00:00 2001
From: hupu <hupu.gm@gmail.com>
Date: Tue, 23 Dec 2025 16:43:34 +0800
Subject: perf build: Allow passing extra Clang flags via EXTRA_BPF_FLAGS

Add support for EXTRA_BPF_FLAGS in the eBPF skeleton build, allowing
users to pass additional clang options such as --sysroot or custom
include paths when cross-compiling perf.

This is primarily intended for cross-build scenarios where the default
host include paths do not match the target kernel version.

Example usage:

    make perf ARCH="arm64" EXTRA_BPF_FLAGS="--sysroot=..."

Reviewed-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: hupu <hupu.gm@gmail.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Bill Wendling <morbo@google.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Justin Stitt <justinstitt@google.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Nathan Chancellor <nathan@kernel.org>
Cc: Nick Desaulniers <nick.desaulniers+lkml@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/Makefile.perf | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf
index a6d8ca3e9233..11b63bafdb23 100644
--- a/tools/perf/Makefile.perf
+++ b/tools/perf/Makefile.perf
@@ -35,6 +35,9 @@ include ../scripts/utilities.mak
 #
 # Define EXTRA_CFLAGS=-m64 or EXTRA_CFLAGS=-m32 as appropriate for cross-builds.
 #
+# Define EXTRA_BPF_FLAGS="--sysroot=<path>" or other custom include paths for
+# cross-compiling BPF skeletons
+#
 # Define EXCLUDE_EXTLIBS=-lmylib to exclude libmylib from the auto-generated
 # EXTLIBS.
 #
@@ -1244,7 +1247,7 @@ endif
 $(SKEL_TMP_OUT)/%.bpf.o: $(OUTPUT)PERF-VERSION-FILE util/bpf_skel/perf_version.h | $(SKEL_TMP_OUT)
 $(SKEL_TMP_OUT)/%.bpf.o: util/bpf_skel/%.bpf.c $(LIBBPF) $(SKEL_OUT)/vmlinux.h
 	$(QUIET_CLANG)$(CLANG) -g -O2 -fno-stack-protector --target=bpf \
-	  $(CLANG_OPTIONS) $(BPF_INCLUDE) $(TOOLS_UAPI_INCLUDE) \
+	  $(CLANG_OPTIONS) $(EXTRA_BPF_FLAGS) $(BPF_INCLUDE) $(TOOLS_UAPI_INCLUDE) \
 	  -include $(OUTPUT)PERF-VERSION-FILE -include util/bpf_skel/perf_version.h \
 	  -c $(filter util/bpf_skel/%.bpf.c,$^) -o $@
 
-- 
cgit v1.2.3


From 8772598b7801a2bb235bd35a858017e0fb939c38 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 10 Feb 2026 17:52:38 -0800
Subject: perf check: Add libbabeltrace to the listed features

This enables scripts to more easily determine if `perf data --to-ctf`
is supported.

Committer testing:

  $ perf check feature libbabeltrace
         libbabeltrace: [ on  ]  # HAVE_LIBBABELTRACE_SUPPORT
  $ perf check feature -q libbabeltrace && echo have libbabeltrace support
  have libbabeltrace support
  $

Signed-off-by: Ian Rogers <irogers@google.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Derek Foreman <derek.foreman@collabora.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-check.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'tools')

diff --git a/tools/perf/builtin-check.c b/tools/perf/builtin-check.c
index 27a41beeaddf..3641d263b345 100644
--- a/tools/perf/builtin-check.c
+++ b/tools/perf/builtin-check.c
@@ -43,6 +43,7 @@ struct feature_status supported_features[] = {
 	FEATURE_STATUS("dwarf_getlocations", HAVE_LIBDW_SUPPORT),
 	FEATURE_STATUS("dwarf-unwind", HAVE_DWARF_UNWIND_SUPPORT),
 	FEATURE_STATUS_TIP("libbfd", HAVE_LIBBFD_SUPPORT, "Deprecated, license incompatibility, use BUILD_NONDISTRO=1 and install binutils-dev[el]"),
+	FEATURE_STATUS("libbabeltrace", HAVE_LIBBABELTRACE_SUPPORT),
 	FEATURE_STATUS("libbpf-strings", HAVE_LIBBPF_STRINGS_SUPPORT),
 	FEATURE_STATUS("libcapstone", HAVE_LIBCAPSTONE_SUPPORT),
 	FEATURE_STATUS("libdw-dwarf-unwind", HAVE_LIBDW_SUPPORT),
-- 
cgit v1.2.3


From 6db2f7c67b2804bc13fa385ff4e462fc3b366f8f Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 10 Feb 2026 17:52:39 -0800
Subject: perf json: Pipe mode --to-json support

In pipe mode the environment may not be fully initialized so be robust
to fields being NULL. Add default handling of feature and attr events.

Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Derek Foreman <derek.foreman@collabora.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/data-convert-json.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'tools')

diff --git a/tools/perf/util/data-convert-json.c b/tools/perf/util/data-convert-json.c
index eefa3a94c813..6a626322476a 100644
--- a/tools/perf/util/data-convert-json.c
+++ b/tools/perf/util/data-convert-json.c
@@ -48,6 +48,9 @@ struct convert_json {
 static void output_json_string(FILE *out, const char *s)
 {
 	fputc('"', out);
+	if (!s)
+		goto out;
+
 	while (*s) {
 		switch (*s) {
 
@@ -71,6 +74,7 @@ static void output_json_string(FILE *out, const char *s)
 
 		++s;
 	}
+out:
 	fputc('"', out);
 }
 
@@ -322,6 +326,16 @@ static void output_headers(struct perf_session *session, struct convert_json *c)
 	output_json_format(out, false, 2, "]");
 }
 
+static int process_feature_event(const struct perf_tool *tool __maybe_unused,
+				 struct perf_session *session,
+				 union perf_event *event)
+{
+	if (event->feat.feat_id < HEADER_LAST_FEATURE)
+		return perf_event__process_feature(session, event);
+
+	return 0;
+}
+
 int bt_convert__perf2json(const char *input_name, const char *output_name,
 		struct perf_data_convert_opts *opts __maybe_unused)
 {
@@ -360,6 +374,8 @@ int bt_convert__perf2json(const char *input_name, const char *output_name,
 	c.tool.auxtrace_info  = perf_event__process_auxtrace_info;
 	c.tool.auxtrace       = perf_event__process_auxtrace;
 	c.tool.event_update   = perf_event__process_event_update;
+	c.tool.attr           = perf_event__process_attr;
+	c.tool.feature        = process_feature_event;
 	c.tool.ordering_requires_timestamps = true;
 
 	if (opts->all) {
-- 
cgit v1.2.3


From 5b92fc082c835bebfa94f790e428b8c039afc457 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 10 Feb 2026 17:52:40 -0800
Subject: perf json: Pipe mode --to-ctf support

In pipe mode the environment may not be fully initialized so be robust
to fields being NULL.

Add default handling of attr events, use the feature events to populate
the ctf writer environment.

Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Derek Foreman <derek.foreman@collabora.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/data-convert-bt.c | 54 +++++++++++++++++++++++++++++++++++++--
 1 file changed, 52 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/util/data-convert-bt.c b/tools/perf/util/data-convert-bt.c
index a22e9049ff30..ba1c8e48d495 100644
--- a/tools/perf/util/data-convert-bt.c
+++ b/tools/perf/util/data-convert-bt.c
@@ -35,6 +35,7 @@
 #include "clockid.h"
 #include "util/sample.h"
 #include "util/time-utils.h"
+#include "header.h"
 
 #ifdef HAVE_LIBTRACEEVENT
 #include <event-parse.h>
@@ -1338,7 +1339,8 @@ static void cleanup_events(struct perf_session *session)
 		struct evsel_priv *priv;
 
 		priv = evsel->priv;
-		bt_ctf_event_class_put(priv->event_class);
+		if (priv)
+			bt_ctf_event_class_put(priv->event_class);
 		zfree(&evsel->priv);
 	}
 
@@ -1387,7 +1389,7 @@ static int ctf_writer__setup_env(struct ctf_writer *cw,
 
 #define ADD(__n, __v)							\
 do {									\
-	if (bt_ctf_writer_add_environment_field(writer, __n, __v))	\
+	if (__v && bt_ctf_writer_add_environment_field(writer, __n, __v))	\
 		return -1;						\
 } while (0)
 
@@ -1403,6 +1405,52 @@ do {									\
 	return 0;
 }
 
+static int process_feature_event(const struct perf_tool *tool,
+				 struct perf_session *session,
+				 union perf_event *event)
+{
+	struct convert *c = container_of(tool, struct convert, tool);
+	struct ctf_writer *cw = &c->writer;
+	struct perf_record_header_feature *fe = &event->feat;
+
+	if (event->feat.feat_id < HEADER_LAST_FEATURE) {
+		int ret = perf_event__process_feature(session, event);
+
+		if (ret)
+			return ret;
+	}
+
+	switch (fe->feat_id) {
+	case HEADER_HOSTNAME:
+		if (session->header.env.hostname) {
+			return bt_ctf_writer_add_environment_field(cw->writer, "host",
+								   session->header.env.hostname);
+		}
+		break;
+	case HEADER_OSRELEASE:
+		if (session->header.env.os_release) {
+			return bt_ctf_writer_add_environment_field(cw->writer, "release",
+								   session->header.env.os_release);
+		}
+		break;
+	case HEADER_VERSION:
+		if (session->header.env.version) {
+			return bt_ctf_writer_add_environment_field(cw->writer, "version",
+								   session->header.env.version);
+		}
+		break;
+	case HEADER_ARCH:
+		if (session->header.env.arch) {
+			return bt_ctf_writer_add_environment_field(cw->writer, "machine",
+								   session->header.env.arch);
+		}
+		break;
+	default:
+		break;
+	}
+	return 0;
+}
+
 static int ctf_writer__setup_clock(struct ctf_writer *cw,
 				   struct perf_session *session,
 				   bool tod)
@@ -1635,6 +1683,8 @@ int bt_convert__perf2ctf(const char *input, const char *path,
 	c.tool.tracing_data    = perf_event__process_tracing_data;
 	c.tool.build_id        = perf_event__process_build_id;
 	c.tool.namespaces      = perf_event__process_namespaces;
+	c.tool.attr            = perf_event__process_attr;
+	c.tool.feature         = process_feature_event;
 	c.tool.ordering_requires_timestamps = true;
 
 	if (opts->all) {
-- 
cgit v1.2.3


From fc4577b52a891da3828af52c1e1c7167b9dcd4dc Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 10 Feb 2026 17:52:41 -0800
Subject: perf test: Test pipe mode with data conversion --to-json

Add pipe mode test for json data conversion. Tidy up exit and cleanup
code.

Committer testing:

  $ perf test 'perf data convert --to-json'
  124: 'perf data convert --to-json' command test                      : Ok
  $ perf test -vv 'perf data convert --to-json'
  124: 'perf data convert --to-json' command test:
  --- start ---
  test child forked, pid 548738
  Testing Perf Data Conversion Command to JSON
  [ perf record: Woken up 1 times to write data ]
  [ perf record: Captured and wrote 0.020 MB /tmp/__perf_test.perf.data.krxvl (104 samples) ]
  [ perf data convert: Converted '/tmp/__perf_test.perf.data.krxvl' into JSON data '/tmp/__perf_test.output.json.0z60p' ]
  [ perf data convert: Converted and wrote 0.075 MB (104 samples) ]
  Perf Data Converter Command to JSON [SUCCESS]
  Validating Perf Data Converted JSON file
  The file contains valid JSON format [SUCCESS]
  Testing Perf Data Conversion Command to JSON (Pipe mode)
  [ perf record: Woken up 2 times to write data ]
  [ perf record: Captured and wrote 0.046 MB - ]
  [ perf data convert: Converted '-' into JSON data '/tmp/__perf_test.output.json.0z60p' ]
  [ perf data convert: Converted and wrote 0.081 MB (110 samples) ]
  Perf Data Converter Command to JSON (Pipe mode) [SUCCESS]
  Validating Perf Data Converted JSON file
  The file contains valid JSON format [SUCCESS]
  ---- end(0) ----
  124: 'perf data convert --to-json' command test                      : Ok
  $

Signed-off-by: Ian Rogers <irogers@google.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Derek Foreman <derek.foreman@collabora.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 .../tests/shell/test_perf_data_converter_json.sh   | 33 ++++++++++++++++------
 1 file changed, 24 insertions(+), 9 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/tests/shell/test_perf_data_converter_json.sh b/tools/perf/tests/shell/test_perf_data_converter_json.sh
index c4f1b59d116f..35d81e39a26c 100755
--- a/tools/perf/tests/shell/test_perf_data_converter_json.sh
+++ b/tools/perf/tests/shell/test_perf_data_converter_json.sh
@@ -15,29 +15,42 @@ result=$(mktemp /tmp/__perf_test.output.json.XXXXX)
 
 cleanup()
 {
-	rm -f "${perfdata}"
+	rm -f "${perfdata}*"
 	rm -f "${result}"
 	trap - exit term int
 }
 
 trap_cleanup()
 {
+	echo "Unexpected signal in ${FUNCNAME[1]}"
 	cleanup
-	exit ${err}
+	exit 1
 }
 trap trap_cleanup exit term int
 
 test_json_converter_command()
 {
-	echo "Testing Perf Data Convertion Command to JSON"
-	perf record -o "$perfdata" -F 99 -g -- perf test -w noploop > /dev/null 2>&1
-	perf data convert --to-json "$result" --force -i "$perfdata" >/dev/null 2>&1
+	echo "Testing Perf Data Conversion Command to JSON"
+	perf record -o "$perfdata" -F 99 -g -- perf test -w noploop
+	perf data convert --to-json "$result" --force -i "$perfdata"
 	if [ "$(cat ${result} | wc -l)" -gt "0" ] ; then
 		echo "Perf Data Converter Command to JSON [SUCCESS]"
 	else
 		echo "Perf Data Converter Command to JSON [FAILED]"
 		err=1
-		exit
+	fi
+}
+
+test_json_converter_pipe()
+{
+	echo "Testing Perf Data Conversion Command to JSON (Pipe mode)"
+	perf record -o - -F 99 -g -- perf test -w noploop > "$perfdata"
+	cat "$perfdata" | perf data convert --to-json "$result" --force -i -
+	if [ "$(cat ${result} | wc -l)" -gt "0" ] ; then
+		echo "Perf Data Converter Command to JSON (Pipe mode) [SUCCESS]"
+	else
+		echo "Perf Data Converter Command to JSON (Pipe mode) [FAILED]"
+		err=1
 	fi
 }
 
@@ -50,16 +63,18 @@ validate_json_format()
 		else
 			echo "The file does not contain valid JSON format [FAILED]"
 			err=1
-			exit
 		fi
 	else
 		echo "File not found [FAILED]"
-		err=2
-		exit
+		err=1
 	fi
 }
 
 test_json_converter_command
 validate_json_format
 
+test_json_converter_pipe
+validate_json_format
+
+cleanup
 exit ${err}
-- 
cgit v1.2.3


From 9083ce531a1f7fb5186be934f42d884de34698da Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 10 Feb 2026 17:52:42 -0800
Subject: perf test: perf data --to-ctf testing

If babeltrace is detected check that --to-ctf functions with a data
file and in pipe mode.

Committer testing:

  $ perf test 'perf data convert --to-ctf'
  124: 'perf data convert --to-ctf' command test                       : Ok
  $ perf test -vv 'perf data convert --to-ctf'
  124: 'perf data convert --to-ctf' command test:
  --- start ---
  test child forked, pid 556008
           libbabeltrace: [ on  ]  # HAVE_LIBBABELTRACE_SUPPORT
  Testing Perf Data Conversion Command to CTF (File input)
  [ perf record: Woken up 1 times to write data ]
  [ perf record: Captured and wrote 0.021 MB /tmp/__perf_test.perf.data.9TxzZ (115 samples) ]
  [ perf data convert: Converted '/tmp/__perf_test.perf.data.9TxzZ' into CTF data '/tmp/__perf_test.ctf.f5EkS' ]
  [ perf data convert: Converted and wrote 0.012 MB (115 samples) ]
  Perf Data Converter Command to CTF (File input) [SUCCESS]
  Testing Perf Data Conversion Command to CTF (Pipe mode)
  [ perf record: Woken up 2 times to write data ]
  [ perf record: Captured and wrote 0.047 MB - ]
  Failed to setup all events.
  [ perf data convert: Converted '/tmp/__perf_test.perf.data.9TxzZ' into CTF data '/tmp/__perf_test.ctf.f5EkS' ]
  [ perf data convert: Converted and wrote 0.000 MB (0 samples) ]
  Perf Data Converter Command to CTF (Pipe mode) [SUCCESS]
  Unexpected signal in main
  ---- end(0) ----
  124: 'perf data convert --to-ctf' command test                       : Ok
  $

Signed-off-by: Ian Rogers <irogers@google.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Derek Foreman <derek.foreman@collabora.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 .../tests/shell/test_perf_data_converter_ctf.sh    | 104 +++++++++++++++++++++
 1 file changed, 104 insertions(+)
 create mode 100755 tools/perf/tests/shell/test_perf_data_converter_ctf.sh

(limited to 'tools')

diff --git a/tools/perf/tests/shell/test_perf_data_converter_ctf.sh b/tools/perf/tests/shell/test_perf_data_converter_ctf.sh
new file mode 100755
index 000000000000..334eebc9945e
--- /dev/null
+++ b/tools/perf/tests/shell/test_perf_data_converter_ctf.sh
@@ -0,0 +1,104 @@
+#!/bin/bash
+# 'perf data convert --to-ctf' command test
+# SPDX-License-Identifier: GPL-2.0
+
+set -e
+
+err=0
+
+perfdata=$(mktemp /tmp/__perf_test.perf.data.XXXXX)
+ctf_dir=$(mktemp -d /tmp/__perf_test.ctf.XXXXX)
+
+cleanup()
+{
+	rm -f "${perfdata}"
+	rm -rf "${ctf_dir}"
+	trap - exit term int
+}
+
+trap_cleanup()
+{
+	echo "Unexpected signal in ${FUNCNAME[1]}"
+	cleanup
+	exit ${err}
+}
+trap trap_cleanup exit term int
+
+check_babeltrace_support()
+{
+	if ! perf check feature libbabeltrace
+	then
+		echo "perf not linked with libbabeltrace, skipping test"
+		exit 2
+	fi
+}
+
+test_ctf_converter_file()
+{
+	echo "Testing Perf Data Conversion Command to CTF (File input)"
+	# Record some data
+	if ! perf record -o "$perfdata" -F 99 -g -- perf test -w noploop
+	then
+		echo "Failed to record perf data"
+		err=1
+		return
+	fi
+
+	# Cleanup previous ctf dir
+	rm -rf "${ctf_dir}"
+
+	# Convert
+	if ! perf data convert --to-ctf "$ctf_dir" --force -i "$perfdata"
+	then
+		echo "Perf Data Converter Command to CTF (File input) [FAILED]"
+		err=1
+		return
+	fi
+
+	if [ -d "${ctf_dir}" ] && [ "$(ls -A "${ctf_dir}")" ]
+	then
+		echo "Perf Data Converter Command to CTF (File input) [SUCCESS]"
+	else
+		echo "Perf Data Converter Command to CTF (File input) [FAILED]"
+		echo "    Output directory empty or missing"
+		err=1
+	fi
+}
+
+test_ctf_converter_pipe()
+{
+	echo "Testing Perf Data Conversion Command to CTF (Pipe mode)"
+
+	# Cleanup previous ctf dir
+	rm -rf "${ctf_dir}"
+
+	# Record to stdout and pipe to $perfdata file
+	if ! perf record -o - -F 99 -g -- perf test -w noploop > "$perfdata"
+	then
+		echo "Failed to record perf data"
+		err=1
+		return
+	fi
+
+	if ! perf data convert --to-ctf "$ctf_dir" --force -i "$perfdata"
+	then
+		echo "Perf Data Converter Command to CTF (Pipe mode) [FAILED]"
+		err=1
+		return
+	fi
+
+	if [ -d "${ctf_dir}" ] && [ "$(ls -A "${ctf_dir}")" ]
+	then
+		echo "Perf Data Converter Command to CTF (Pipe mode) [SUCCESS]"
+	else
+		echo "Perf Data Converter Command to CTF (Pipe mode) [FAILED]"
+		echo "    Output directory empty or missing"
+		err=1
+	fi
+}
+
+check_babeltrace_support
+test_ctf_converter_file
+test_ctf_converter_pipe
+
+exit ${err}
-- 
cgit v1.2.3


From 22ca2f7f32cc783b57bc1223b84d6f5ba3e5d1e2 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Mon, 9 Feb 2026 12:22:06 -0800
Subject: perf script: Allow the generated script to be a path

Allow the script generated by "perf script -g <language>" to be a file
path and the language determined by the file extension.

This is useful in testing so that the generated script file can be
written to a test directory.

Committer testing:

  $ perf record ls a.a
  ls: cannot access 'a.a': No such file or directory
  [ perf record: Woken up 2 times to write data ]
  [ perf record: Captured and wrote 0.003 MB perf.data (7 samples) ]
  $ perf script -g python
  generated Python script: perf-script.py
  $ perf script -g myscript.py
  generated Python script: myscript.py
  $ diff -u perf-script.py myscript.py
  $ tail myscript.py
  def trace_unhandled(event_name, context, event_fields_dict, perf_sample_dict):
  		print(get_dict_as_string(event_fields_dict))
  		print('Sample: {'+get_dict_as_string(perf_sample_dict['sample'], ', ')+'}')

  def print_header(event_name, cpu, secs, nsecs, pid, comm):
  	print("%-20s %5u %05u.%09u %8u %-20s " % \
  	(event_name, cpu, secs, nsecs, pid, comm), end="")

  def get_dict_as_string(a_dict, delimiter=' '):
  	return delimiter.join(['%s=%s'%(k,str(v))for k,v in sorted(a_dict.items())])
  $

Signed-off-by: Ian Rogers <irogers@google.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Leo Yan <leo.yan@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sandipan Das <sandipan.das@amd.com>
Cc: Yujie Liu <yujie.liu@intel.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/Documentation/perf-script.txt |  6 ++++--
 tools/perf/builtin-script.c              | 24 ++++++++++++++++++++----
 2 files changed, 24 insertions(+), 6 deletions(-)

(limited to 'tools')

diff --git a/tools/perf/Documentation/perf-script.txt b/tools/perf/Documentation/perf-script.txt
index 03d112960632..ddf92f9c7821 100644
--- a/tools/perf/Documentation/perf-script.txt
+++ b/tools/perf/Documentation/perf-script.txt
@@ -98,8 +98,10 @@ OPTIONS
 
 -g::
 --gen-script=::
-        Generate perf-script.[ext] starter script for given language,
-        using current perf.data.
+	Generate a starter script. If a language is given then the
+        script is named perf-script.[ext] according to the
+        language. If a file path is given then python is used for
+        files ending '.py' and perl used for files ending '.pl'.
 
 --dlfilter=<file>::
 	Filter sample events using the given shared object file.
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index 14c6f6c3c4f2..7c743a303507 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -4489,6 +4489,7 @@ script_found:
 	if (generate_script_lang) {
 		struct stat perf_stat;
 		int input;
+		char *filename = strdup("perf-script");
 
 		if (output_set_by_user()) {
 			fprintf(stderr,
@@ -4516,17 +4517,32 @@ script_found:
 		}
 
 		scripting_ops = script_spec__lookup(generate_script_lang);
+		if (!scripting_ops && ends_with(generate_script_lang, ".py")) {
+			scripting_ops = script_spec__lookup("python");
+			free(filename);
+			filename = strdup(generate_script_lang);
+			filename[strlen(filename) - 3] = '\0';
+		} else if (!scripting_ops && ends_with(generate_script_lang, ".pl")) {
+			scripting_ops = script_spec__lookup("perl");
+			free(filename);
+			filename = strdup(generate_script_lang);
+			filename[strlen(filename) - 3] = '\0';
+		}
 		if (!scripting_ops) {
-			fprintf(stderr, "invalid language specifier");
+			fprintf(stderr, "invalid language specifier '%s'\n", generate_script_lang);
 			err = -ENOENT;
 			goto out_delete;
 		}
+		if (!filename) {
+			err = -ENOMEM;
+			goto out_delete;
+		}
 #ifdef HAVE_LIBTRACEEVENT
-		err = scripting_ops->generate_script(session->tevent.pevent,
-						     "perf-script");
+		err = scripting_ops->generate_script(session->tevent.pevent, filename);
 #else
-		err = scripting_ops->generate_script(NULL, "perf-script");
+		err = scripting_ops->generate_script(NULL, filename);
 #endif
+		free(filename);
 		goto out_delete;
 	}
 
-- 
cgit v1.2.3


From 2273697781d27c6ac033cdca7b5f5f5ad12e28f9 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Mon, 9 Feb 2026 12:22:07 -0800
Subject: perf test script: Add perl script testing support

Basic coverage of perl script support from `perf script`. This is
disabled by default and so the test will most normally skip.

Committer testing:

  $ perf test 'perf script perl'
  106: perf script perl tests                                          : Skip
  $ perf test -vv 'perf script perl'
  106: perf script perl tests:
  --- start ---
  test child forked, pid 578323
  perf script perl test [Skipped: no libperl support]
  ---- end(-2) ----
  106: perf script perl tests                                          : Skip
  $ perf check feature libperl
                 libperl: [ OFF ]  # HAVE_LIBPERL_SUPPORT ( tip: Deprecated, use LIBPERL=1 and install perl-ExtUtils-Embed/libperl-dev to build with it )
  $

Install perl-ExtUtils-Embed, build with LIBPERL=1, rebuild:

  $ perf check feature libperl
                 libperl: [ on  ]  # HAVE_LIBPERL_SUPPORT
  $ perf test 'perf script perl'
  106: perf script perl tests                                          : Ok
  $ perf test -vv 'perf script perl'
  106: perf script perl tests:
  --- start ---
  test child forked, pid 588206
  Testing event: sched:sched_switch
  perf script perl test [Skipped: failed to record sched:sched_switch]
  Testing event: task-clock
  Generating perl script...
  generated Perl script: /tmp/__perf_test_script.RpMn5.pl
  Executing perl script...
  perf script perl test [Success: task-clock triggered $VAR1]
  ---- end(0) ----
  106: perf script perl tests                                          : Ok
  $

Signed-off-by: Ian Rogers <irogers@google.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Leo Yan <leo.yan@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sandipan Das <sandipan.das@amd.com>
Cc: Yujie Liu <yujie.liu@intel.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/tests/shell/script_perl.sh | 102 ++++++++++++++++++++++++++++++++++
 1 file changed, 102 insertions(+)
 create mode 100755 tools/perf/tests/shell/script_perl.sh

(limited to 'tools')

diff --git a/tools/perf/tests/shell/script_perl.sh b/tools/perf/tests/shell/script_perl.sh
new file mode 100755
index 000000000000..b6d65b6fbda1
--- /dev/null
+++ b/tools/perf/tests/shell/script_perl.sh
@@ -0,0 +1,102 @@
+#!/bin/bash
+# perf script perl tests
+# SPDX-License-Identifier: GPL-2.0
+
+set -e
+
+# set PERF_EXEC_PATH to find scripts in the source directory
+perfdir=$(dirname "$0")/../..
+if [ -e "$perfdir/scripts/perl/Perf-Trace-Util" ]; then
+  export PERF_EXEC_PATH=$perfdir
+fi
+
+
+perfdata=$(mktemp /tmp/__perf_test_script_perl.perf.data.XXXXX)
+generated_script=$(mktemp /tmp/__perf_test_script.XXXXX.pl)
+
+cleanup() {
+  rm -f "${perfdata}"
+  rm -f "${generated_script}"
+  trap - EXIT TERM INT
+}
+
+trap_cleanup() {
+  echo "Unexpected signal in ${FUNCNAME[1]}"
+  cleanup
+  exit 1
+}
+trap trap_cleanup TERM INT
+trap cleanup EXIT
+
+check_perl_support() {
+	if perf check feature -q libperl; then
+		return 0
+	fi
+	echo "perf script perl test [Skipped: no libperl support]"
+	return 2
+}
+
+test_script() {
+	local event_name=$1
+	local expected_output=$2
+	local record_opts=$3
+
+	echo "Testing event: $event_name"
+
+	# Try to record. If this fails, it might be permissions or lack of support.
+	# We return 2 to indicate "skip this event" rather than "fail test".
+	if ! perf record -o "${perfdata}" -e "$event_name" $record_opts -- perf test -w thloop > /dev/null 2>&1; then
+		echo "perf script perl test [Skipped: failed to record $event_name]"
+		return 2
+	fi
+
+	echo "Generating perl script..."
+	if ! perf script -i "${perfdata}" -g "${generated_script}"; then
+		echo "perf script perl test [Failed: script generation for $event_name]"
+		return 1
+	fi
+
+	if [ ! -f "${generated_script}" ]; then
+		echo "perf script perl test [Failed: script not generated for $event_name]"
+		return 1
+	fi
+
+	echo "Executing perl script..."
+	output=$(perf script -i "${perfdata}" -s "${generated_script}" 2>&1)
+
+	if echo "$output" | grep -q "$expected_output"; then
+		echo "perf script perl test [Success: $event_name triggered $expected_output]"
+		return 0
+	else
+		echo "perf script perl test [Failed: $event_name did not trigger $expected_output]"
+		echo "Output was:"
+		echo "$output" | head -n 20
+		return 1
+	fi
+}
+
+check_perl_support || exit 2
+
+# Try tracepoint first
+test_script "sched:sched_switch" "sched::sched_switch" "-c 1" && res=0 || res=$?
+
+if [ $res -eq 0 ]; then
+	exit 0
+elif [ $res -eq 1 ]; then
+	exit 1
+fi
+
+# If tracepoint skipped (res=2), try task-clock
+# For generic events like task-clock, the generated script uses process_event()
+# which dumps data using Data::Dumper. We check for "$VAR1" which is standard Dumper output.
+test_script "task-clock" "\$VAR1" "-c 100" && res=0 || res=$?
+
+if [ $res -eq 0 ]; then
+	exit 0
+elif [ $res -eq 1 ]; then
+	exit 1
+fi
+
+# If both skipped
+echo "perf script perl test [Skipped: Could not record tracepoint or task-clock]"
+exit 2
-- 
cgit v1.2.3


From dbf0108347bdb5d4ccef8910555b16c1f1a505f8 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Mon, 9 Feb 2026 12:22:08 -0800
Subject: perf test script: Add python script testing support

Basic coverage of python script support from `perf script`.

Committer testing:

  $ perf test 'perf script python'
  107: perf script python tests                                        : Ok
  $ perf test -vv 'perf script python'
  107: perf script python tests:
  --- start ---
  test child forked, pid 595537
  Testing event: sched:sched_switch
  perf script python test [Skipped: failed to record sched:sched_switch]
  Testing event: task-clock
  Generating python script...
  generated Python script: /tmp/__perf_test_script.J4rWj.py
  Executing python script...
  perf script python test [Success: task-clock triggered param_dict]
  ---- end(0) ----
  107: perf script python tests                                        : Ok
  $

Signed-off-by: Ian Rogers <irogers@google.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Leo Yan <leo.yan@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sandipan Das <sandipan.das@amd.com>
Cc: Yujie Liu <yujie.liu@intel.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/tests/shell/script_python.sh | 113 ++++++++++++++++++++++++++++++++
 1 file changed, 113 insertions(+)
 create mode 100755 tools/perf/tests/shell/script_python.sh

(limited to 'tools')

diff --git a/tools/perf/tests/shell/script_python.sh b/tools/perf/tests/shell/script_python.sh
new file mode 100755
index 000000000000..6bc66074a31f
--- /dev/null
+++ b/tools/perf/tests/shell/script_python.sh
@@ -0,0 +1,113 @@
+#!/bin/bash
+# perf script python tests
+# SPDX-License-Identifier: GPL-2.0
+
+set -e
+
+# set PERF_EXEC_PATH to find scripts in the source directory
+perfdir=$(dirname "$0")/../..
+if [ -e "$perfdir/scripts/python/Perf-Trace-Util" ]; then
+  export PERF_EXEC_PATH=$perfdir
+fi
+
+
+perfdata=$(mktemp /tmp/__perf_test_script_python.perf.data.XXXXX)
+generated_script=$(mktemp /tmp/__perf_test_script.XXXXX.py)
+
+cleanup() {
+  rm -f "${perfdata}"
+  rm -f "${generated_script}"
+  trap - EXIT TERM INT
+}
+
+trap_cleanup() {
+  echo "Unexpected signal in ${FUNCNAME[1]}"
+  cleanup
+  exit 1
+}
+trap trap_cleanup TERM INT
+trap cleanup EXIT
+
+check_python_support() {
+	if perf check feature -q libpython; then
+		return 0
+	fi
+	echo "perf script python test [Skipped: no libpython support]"
+	return 2
+}
+
+test_script() {
+	local event_name=$1
+	local expected_output=$2
+	local record_opts=$3
+
+	echo "Testing event: $event_name"
+
+	# Try to record. If this fails, it might be permissions or lack of
+	# support. Return 2 to indicate "skip this event" rather than "fail
+	# test".
+	if ! perf record -o "${perfdata}" -e "$event_name" $record_opts -- perf test -w thloop > /dev/null 2>&1; then
+		echo "perf script python test [Skipped: failed to record $event_name]"
+		return 2
+	fi
+
+	echo "Generating python script..."
+	if ! perf script -i "${perfdata}" -g "${generated_script}"; then
+		echo "perf script python test [Failed: script generation for $event_name]"
+		return 1
+	fi
+
+	if [ ! -f "${generated_script}" ]; then
+		echo "perf script python test [Failed: script not generated for $event_name]"
+		return 1
+	fi
+
+	# Perf script -g python doesn't generate process_event for generic
+	# events so append it manually to test that the callback works.
+	if ! grep -q "def process_event" "${generated_script}"; then
+		cat <<EOF >> "${generated_script}"
+
+def process_event(param_dict):
+	print("param_dict: %s" % param_dict)
+EOF
+	fi
+
+	echo "Executing python script..."
+	output=$(perf script -i "${perfdata}" -s "${generated_script}" 2>&1)
+
+	if echo "$output" | grep -q "$expected_output"; then
+		echo "perf script python test [Success: $event_name triggered $expected_output]"
+		return 0
+	else
+		echo "perf script python test [Failed: $event_name did not trigger $expected_output]"
+		echo "Output was:"
+		echo "$output" | head -n 20
+		return 1
+	fi
+}
+
+check_python_support || exit 2
+
+# Try tracepoint first
+test_script "sched:sched_switch" "sched__sched_switch" "-c 1" && res=0 || res=$?
+
+if [ $res -eq 0 ]; then
+	exit 0
+elif [ $res -eq 1 ]; then
+	exit 1
+fi
+
+# If tracepoint skipped (res=2), try task-clock
+# For generic events like task-clock, the generated script uses process_event()
+# which prints the param_dict.
+test_script "task-clock" "param_dict" "-c 100" && res=0 || res=$?
+
+if [ $res -eq 0 ]; then
+	exit 0
+elif [ $res -eq 1 ]; then
+	exit 1
+fi
+
+# If both skipped
+echo "perf script python test [Skipped: Could not record tracepoint or task-clock]"
+exit 2
-- 
cgit v1.2.3


From 048714d9df73a724d3f84b587f1110963e32f9b3 Mon Sep 17 00:00:00 2001
From: David Carlier <devnexen@gmail.com>
Date: Thu, 12 Feb 2026 20:35:19 +0000
Subject: tools/sched_ext: scx_userland: fix restart and stats thread lifecycle
 bugs

Fix three issues in scx_userland's restart path:

- exit_req is not reset on restart, causing sched_main_loop() to exit
  immediately without doing any scheduling work.

- stats_printer thread handle is local to spawn_stats_thread(), making
  it impossible to join from main(). Promote it to file scope.

- The stats thread continues reading skel->bss after the skeleton is
  destroyed on restart, causing a use-after-free. Join the stats thread
  before destroying the skeleton to ensure it has exited.

Signed-off-by: David Carlier <devnexen@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 tools/sched_ext/scx_userland.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/sched_ext/scx_userland.c b/tools/sched_ext/scx_userland.c
index 10b31020f44f..63f89b35d999 100644
--- a/tools/sched_ext/scx_userland.c
+++ b/tools/sched_ext/scx_userland.c
@@ -54,6 +54,7 @@ static bool verbose;
 static volatile int exit_req;
 static int enqueued_fd, dispatched_fd;
 
+static pthread_t stats_printer;
 static struct scx_userland *skel;
 static struct bpf_link *ops_link;
 
@@ -319,8 +320,6 @@ static void *run_stats_printer(void *arg)
 
 static int spawn_stats_thread(void)
 {
-	pthread_t stats_printer;
-
 	return pthread_create(&stats_printer, NULL, run_stats_printer, NULL);
 }
 
@@ -375,6 +374,7 @@ static void pre_bootstrap(int argc, char **argv)
 
 static void bootstrap(char *comm)
 {
+	exit_req = 0;
 	skel = SCX_OPS_OPEN(userland_ops, scx_userland);
 
 	skel->rodata->num_possible_cpus = libbpf_num_possible_cpus();
@@ -428,6 +428,7 @@ restart:
 
 	exit_req = 1;
 	bpf_link__destroy(ops_link);
+	pthread_join(stats_printer, NULL);
 	ecode = UEI_REPORT(skel, uei);
 	scx_userland__destroy(skel);
 
-- 
cgit v1.2.3


From eda8c5e776220ce9c869f5c714ccef90c6e1966b Mon Sep 17 00:00:00 2001
From: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Date: Wed, 21 Jan 2026 11:49:40 -0500
Subject: mm/memory: add tree limit to free_pgtables()

The ceiling and tree search limit need to be different arguments for the
future change in the failed fork attempt.  The ceiling and floor variables
are not very descriptive, so change them to pg_start/pg_end.

Adding a new variable for the vma_end to the function as it will differ
from the pg_end in the later patches in the series.

Add a kernel doc about the free_pgtables() function.

Test code also updated.

No functional changes intended.

Link: https://lkml.kernel.org/r/20260121164946.2093480-6-Liam.Howlett@oracle.com
Signed-off-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Pedro Falcato <pfalcato@suse.de>
Cc: Baoquan He <bhe@redhat.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Chris Li <chrisl@kernel.org>
Cc: David Hildenbrand <david@kernel.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: Kairui Song <kasong@tencent.com>
Cc: Kemeng Shi <shikemeng@huaweicloud.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: SeongJae Park <sj@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/internal.h                    |  6 ++++--
 mm/memory.c                      | 42 +++++++++++++++++++++++++++++++---------
 mm/mmap.c                        |  2 +-
 mm/vma.c                         |  3 ++-
 tools/testing/vma/vma_internal.h |  3 ++-
 5 files changed, 42 insertions(+), 14 deletions(-)

(limited to 'tools')

diff --git a/mm/internal.h b/mm/internal.h
index 5fe6bb96c23c..2a0e42e36b48 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -510,8 +510,10 @@ void deactivate_file_folio(struct folio *folio);
 void folio_activate(struct folio *folio);
 
 void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
-		   struct vm_area_struct *start_vma, unsigned long floor,
-		   unsigned long ceiling, bool mm_wr_locked);
+		   struct vm_area_struct *vma, unsigned long pg_start,
+		   unsigned long pg_end, unsigned long vma_end,
+		   bool mm_wr_locked);
+
 void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte);
 
 struct zap_details;
diff --git a/mm/memory.c b/mm/memory.c
index cf32e2666e71..98c407622dea 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -370,23 +370,47 @@ void free_pgd_range(struct mmu_gather *tlb,
 	} while (pgd++, addr = next, addr != end);
 }
 
+/**
+ * free_pgtables() - Free a range of page tables
+ * @tlb: The mmu gather
+ * @mas: The maple state
+ * @vma: The first vma
+ * @pg_start: The lowest page table address (floor)
+ * @pg_end: The highest page table address (ceiling)
+ * @vma_end: The highest vma tree search address
+ * @mm_wr_locked: boolean indicating if the mm is write locked
+ *
+ * Note: pg_start and pg_end are provided to indicate the absolute range of the
+ * page tables that should be removed.  This can differ from the vma mappings on
+ * some archs that may have mappings that need to be removed outside the vmas.
+ * Note that the prev->vm_end and next->vm_start are often used.
+ *
+ * The vma_end differs from the pg_end when a dup_mmap() failed and the tree has
+ * unrelated data to the mm_struct being torn down.
+ */
 void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
-		   struct vm_area_struct *vma, unsigned long floor,
-		   unsigned long ceiling, bool mm_wr_locked)
+		   struct vm_area_struct *vma, unsigned long pg_start,
+		   unsigned long pg_end, unsigned long vma_end,
+		   bool mm_wr_locked)
 {
 	struct unlink_vma_file_batch vb;
 
+	/*
+	 * Note: USER_PGTABLES_CEILING may be passed as the value of pg_end and
+	 * may be 0.  Underflow is expected in this case.  Otherwise the
+	 * pagetable end is exclusive.
+	 * vma_end is exclusive.
+	 * The last vma address should never be larger than the pagetable end.
+	 */
+	WARN_ON_ONCE(vma_end - 1 > pg_end - 1);
+
 	tlb_free_vmas(tlb);
 
 	do {
 		unsigned long addr = vma->vm_start;
 		struct vm_area_struct *next;
 
-		/*
-		 * Note: USER_PGTABLES_CEILING may be passed as ceiling and may
-		 * be 0.  This will underflow and is okay.
-		 */
-		next = mas_find(mas, ceiling - 1);
+		next = mas_find(mas, vma_end - 1);
 		if (unlikely(xa_is_zero(next)))
 			next = NULL;
 
@@ -406,7 +430,7 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
 		 */
 		while (next && next->vm_start <= vma->vm_end + PMD_SIZE) {
 			vma = next;
-			next = mas_find(mas, ceiling - 1);
+			next = mas_find(mas, vma_end - 1);
 			if (unlikely(xa_is_zero(next)))
 				next = NULL;
 			if (mm_wr_locked)
@@ -417,7 +441,7 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
 		unlink_file_vma_batch_final(&vb);
 
 		free_pgd_range(tlb, addr, vma->vm_end,
-			floor, next ? next->vm_start : ceiling);
+			pg_start, next ? next->vm_start : pg_end);
 		vma = next;
 	} while (vma);
 }
diff --git a/mm/mmap.c b/mm/mmap.c
index 9c8adc505d3d..827a64cdcc68 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1308,7 +1308,7 @@ void exit_mmap(struct mm_struct *mm)
 	mt_clear_in_rcu(&mm->mm_mt);
 	vma_iter_set(&vmi, vma->vm_end);
 	free_pgtables(&tlb, &vmi.mas, vma, FIRST_USER_ADDRESS,
-		      USER_PGTABLES_CEILING, true);
+		      USER_PGTABLES_CEILING, USER_PGTABLES_CEILING, true);
 	tlb_finish_mmu(&tlb);
 
 	/*
diff --git a/mm/vma.c b/mm/vma.c
index 0c35cdc0d3b7..b2b9e7b3284f 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -484,6 +484,7 @@ void unmap_region(struct ma_state *mas, struct vm_area_struct *vma,
 	unmap_vmas(&tlb, mas, vma, vma_start, vma_end, vma_end);
 	mas_set(mas, vma->vm_end);
 	free_pgtables(&tlb, mas, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
+		      next ? next->vm_start : USER_PGTABLES_CEILING,
 		      next ? next->vm_start : USER_PGTABLES_CEILING,
 		      /* mm_wr_locked = */ true);
 	tlb_finish_mmu(&tlb);
@@ -1275,7 +1276,7 @@ static inline void vms_clear_ptes(struct vma_munmap_struct *vms,
 	mas_set(mas_detach, 1);
 	/* start and end may be different if there is no prev or next vma. */
 	free_pgtables(&tlb, mas_detach, vms->vma, vms->unmap_start,
-		      vms->unmap_end, mm_wr_locked);
+		      vms->unmap_end, vms->unmap_end, mm_wr_locked);
 	tlb_finish_mmu(&tlb);
 	vms->clear_ptes = false;
 }
diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
index 7fa56dcc53a6..f50b8ddee612 100644
--- a/tools/testing/vma/vma_internal.h
+++ b/tools/testing/vma/vma_internal.h
@@ -1139,7 +1139,8 @@ static inline void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
 
 static inline void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
 		   struct vm_area_struct *vma, unsigned long floor,
-		   unsigned long ceiling, bool mm_wr_locked)
+		   unsigned long ceiling, unsigned long tree_max,
+		   bool mm_wr_locked)
 {
 }
 
-- 
cgit v1.2.3


From 0df5a8d3948da979b8ab811a692b34635e1b146d Mon Sep 17 00:00:00 2001
From: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Date: Wed, 21 Jan 2026 11:49:44 -0500
Subject: mm/vma: use unmap_desc in exit_mmap() and vms_clear_ptes()

Convert vms_clear_ptes() to use unmap_desc to call unmap_vmas() instead of
the large argument list.  The UNMAP_STATE() cannot be used because the vma
iterator in the vms does not point to the correct maple state
(mas_detach), and the tree_end will be set incorrectly.  Setting up the
arguments manually avoids setting the struct up incorrectly and doing
extra work to get the correct pagetable range.

exit_mmap() also calls unmap_vmas() with many arguments.  Using the
unmap_all_init() function to set the unmap descriptor for all vmas makes
this a bit easier to read.

Update to the vma test code is necessary to ensure testing continues to
function.

No functional changes intended.

Link: https://lkml.kernel.org/r/20260121164946.2093480-10-Liam.Howlett@oracle.com
Signed-off-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Chris Li <chrisl@kernel.org>
Cc: David Hildenbrand <david@kernel.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: Kairui Song <kasong@tencent.com>
Cc: Kemeng Shi <shikemeng@huaweicloud.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Pedro Falcato <pfalcato@suse.de>
Cc: SeongJae Park <sj@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h               |  4 ----
 mm/internal.h                    |  3 +++
 mm/memory.c                      | 20 ++++++++------------
 mm/mmap.c                        |  4 +++-
 mm/vma.c                         | 27 ++++++++++++++++++++++-----
 mm/vma.h                         | 14 ++++++++++++++
 tools/testing/vma/vma_internal.h |  6 +++---
 7 files changed, 53 insertions(+), 25 deletions(-)

(limited to 'tools')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 2c6c6d00ed73..945902d23d47 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2625,10 +2625,6 @@ static inline void zap_vma_pages(struct vm_area_struct *vma)
 	zap_page_range_single(vma, vma->vm_start,
 			      vma->vm_end - vma->vm_start, NULL);
 }
-void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
-		struct vm_area_struct *start_vma, unsigned long start,
-		unsigned long end, unsigned long tree_end);
-
 struct mmu_notifier_range;
 
 void free_pgd_range(struct mmu_gather *tlb, unsigned long addr,
diff --git a/mm/internal.h b/mm/internal.h
index 2a0e42e36b48..0f3ad8665d95 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -197,6 +197,9 @@ static inline void vma_close(struct vm_area_struct *vma)
 	}
 }
 
+/* unmap_vmas is in mm/memory.c */
+void unmap_vmas(struct mmu_gather *tlb, struct unmap_desc *unmap);
+
 #ifdef CONFIG_MMU
 
 static inline void get_anon_vma(struct anon_vma *anon_vma)
diff --git a/mm/memory.c b/mm/memory.c
index 6033cf6c93de..d68f8f082b1c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2144,11 +2144,7 @@ static void unmap_single_vma(struct mmu_gather *tlb,
 /**
  * unmap_vmas - unmap a range of memory covered by a list of vma's
  * @tlb: address of the caller's struct mmu_gather
- * @mas: the maple state
- * @vma: the starting vma
- * @start_addr: virtual address at which to start unmapping
- * @end_addr: virtual address at which to end unmapping
- * @tree_end: The maximum index to check
+ * @unmap: The unmap_desc
  *
  * Unmap all pages in the vma list.
  *
@@ -2161,10 +2157,9 @@ static void unmap_single_vma(struct mmu_gather *tlb,
  * ensure that any thus-far unmapped pages are flushed before unmap_vmas()
  * drops the lock and schedules.
  */
-void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
-		struct vm_area_struct *vma, unsigned long start_addr,
-		unsigned long end_addr, unsigned long tree_end)
+void unmap_vmas(struct mmu_gather *tlb, struct unmap_desc *unmap)
 {
+	struct vm_area_struct *vma;
 	struct mmu_notifier_range range;
 	struct zap_details details = {
 		.zap_flags = ZAP_FLAG_DROP_MARKER | ZAP_FLAG_UNMAP,
@@ -2172,16 +2167,17 @@ void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
 		.even_cows = true,
 	};
 
+	vma = unmap->first;
 	mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma->vm_mm,
-				start_addr, end_addr);
+				unmap->vma_start, unmap->vma_end);
 	mmu_notifier_invalidate_range_start(&range);
 	do {
-		unsigned long start = start_addr;
-		unsigned long end = end_addr;
+		unsigned long start = unmap->vma_start;
+		unsigned long end = unmap->vma_end;
 		hugetlb_zap_begin(vma, &start, &end);
 		unmap_single_vma(tlb, vma, start, end, &details);
 		hugetlb_zap_end(vma, &details);
-		vma = mas_find(mas, tree_end - 1);
+		vma = mas_find(unmap->mas, unmap->tree_end - 1);
 	} while (vma);
 	mmu_notifier_invalidate_range_end(&range);
 }
diff --git a/mm/mmap.c b/mm/mmap.c
index 4500e61a0d5e..042b6b4b6ab8 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1277,6 +1277,7 @@ void exit_mmap(struct mm_struct *mm)
 	struct vm_area_struct *vma;
 	unsigned long nr_accounted = 0;
 	VMA_ITERATOR(vmi, mm, 0);
+	struct unmap_desc unmap;
 
 	/* mm's last user has gone, and its about to be pulled down */
 	mmu_notifier_release(mm);
@@ -1292,11 +1293,12 @@ void exit_mmap(struct mm_struct *mm)
 		goto destroy;
 	}
 
+	unmap_all_init(&unmap, &vmi, vma);
 	flush_cache_mm(mm);
 	tlb_gather_mmu_fullmm(&tlb, mm);
 	/* update_hiwater_rss(mm) here? but nobody should be looking */
 	/* Use ULONG_MAX here to ensure all VMAs in the mm are unmapped */
-	unmap_vmas(&tlb, &vmi.mas, vma, 0, ULONG_MAX, ULONG_MAX);
+	unmap_vmas(&tlb, &unmap);
 	mmap_read_unlock(mm);
 
 	/*
diff --git a/mm/vma.c b/mm/vma.c
index 75c68c74c062..b46c869d4bb0 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -480,8 +480,7 @@ void unmap_region(struct unmap_desc *unmap)
 
 	tlb_gather_mmu(&tlb, mm);
 	update_hiwater_rss(mm);
-	unmap_vmas(&tlb, mas, unmap->first, unmap->vma_start, unmap->vma_end,
-		   unmap->vma_end);
+	unmap_vmas(&tlb, unmap);
 	mas_set(mas, unmap->tree_reset);
 	free_pgtables(&tlb, mas, unmap->first, unmap->pg_start, unmap->pg_end,
 		      unmap->tree_end, unmap->mm_wr_locked);
@@ -1257,6 +1256,26 @@ static inline void vms_clear_ptes(struct vma_munmap_struct *vms,
 		    struct ma_state *mas_detach, bool mm_wr_locked)
 {
 	struct mmu_gather tlb;
+	struct unmap_desc unmap = {
+		.mas = mas_detach,
+		.first = vms->vma,
+		/* start and end may be different if there is no prev or next vma. */
+		.pg_start = vms->unmap_start,
+		.pg_end = vms->unmap_end,
+		.vma_start = vms->start,
+		.vma_end = vms->end,
+		/*
+		 * The tree limits and reset differ from the normal case since it's a
+		 * side-tree
+		 */
+		.tree_reset = 1,
+		.tree_end = vms->vma_count,
+		/*
+		 * We can free page tables without write-locking mmap_lock because VMAs
+		 * were isolated before we downgraded mmap_lock.
+		 */
+		.mm_wr_locked = mm_wr_locked,
+	};
 
 	if (!vms->clear_ptes) /* Nothing to do */
 		return;
@@ -1268,9 +1287,7 @@ static inline void vms_clear_ptes(struct vma_munmap_struct *vms,
 	mas_set(mas_detach, 1);
 	tlb_gather_mmu(&tlb, vms->vma->vm_mm);
 	update_hiwater_rss(vms->vma->vm_mm);
-	unmap_vmas(&tlb, mas_detach, vms->vma, vms->start, vms->end,
-		   vms->vma_count);
-
+	unmap_vmas(&tlb, &unmap);
 	mas_set(mas_detach, 1);
 	/* start and end may be different if there is no prev or next vma. */
 	free_pgtables(&tlb, mas_detach, vms->vma, vms->unmap_start,
diff --git a/mm/vma.h b/mm/vma.h
index cca7553c7d64..bb7fa5d2bde2 100644
--- a/mm/vma.h
+++ b/mm/vma.h
@@ -167,6 +167,20 @@ struct unmap_desc {
 	bool mm_wr_locked;            /* If the mmap write lock is held */
 };
 
+static inline void unmap_all_init(struct unmap_desc *unmap,
+		struct vma_iterator *vmi, struct vm_area_struct *vma)
+{
+	unmap->mas = &vmi->mas;
+	unmap->first = vma;
+	unmap->pg_start = FIRST_USER_ADDRESS;
+	unmap->pg_end = USER_PGTABLES_CEILING;
+	unmap->vma_start = 0;
+	unmap->vma_end = ULONG_MAX;
+	unmap->tree_end = ULONG_MAX;
+	unmap->tree_reset = vma->vm_end;
+	unmap->mm_wr_locked = false;
+}
+
 #define UNMAP_STATE(name, _vmi, _vma, _vma_start, _vma_end, _prev, _next)      \
 	struct unmap_desc name = {                                             \
 		.mas = &(_vmi)->mas,                                           \
diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
index f50b8ddee612..0b4918aac8d6 100644
--- a/tools/testing/vma/vma_internal.h
+++ b/tools/testing/vma/vma_internal.h
@@ -1131,9 +1131,9 @@ static inline void update_hiwater_vm(struct mm_struct *mm)
 {
 }
 
-static inline void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
-		      struct vm_area_struct *vma, unsigned long start_addr,
-		      unsigned long end_addr, unsigned long tree_end)
+struct unmap_desc;
+
+static inline void unmap_vmas(struct mmu_gather *tlb, struct unmap_desc *unmap)
 {
 }
 
-- 
cgit v1.2.3


From a8700d42b0af3a1751f70d53ee90c97fb4dc50f2 Mon Sep 17 00:00:00 2001
From: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Date: Wed, 21 Jan 2026 11:49:46 -0500
Subject: mm: use unmap_desc struct for freeing page tables

Pass through the unmap_desc to free_pgtables() because it almost has
everything necessary and is already on the stack.

Updates testing code as necessary.

No functional changes intended.

[Liam.Howlett@oracle.com: fix up unmap desc use on exit_mmap()]
  Link: https://lkml.kernel.org/r/20260210214214.364856-1-Liam.Howlett@oracle.com
Link: https://lkml.kernel.org/r/20260121164946.2093480-12-Liam.Howlett@oracle.com
Signed-off-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Chris Li <chrisl@kernel.org>
Cc: David Hildenbrand <david@kernel.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: Kairui Song <kasong@tencent.com>
Cc: Kemeng Shi <shikemeng@huaweicloud.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Pedro Falcato <pfalcato@suse.de>
Cc: SeongJae Park <sj@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/internal.h                    |  5 +----
 mm/memory.c                      | 33 +++++++++++++--------------------
 mm/mmap.c                        |  6 +++---
 mm/vma.c                         |  6 ++----
 mm/vma.h                         | 23 +++++++++++++++++++++++
 tools/testing/vma/vma_internal.h |  7 +++----
 6 files changed, 45 insertions(+), 35 deletions(-)

(limited to 'tools')

diff --git a/mm/internal.h b/mm/internal.h
index 0f3ad8665d95..ef71a1d9991f 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -512,10 +512,7 @@ bool __folio_end_writeback(struct folio *folio);
 void deactivate_file_folio(struct folio *folio);
 void folio_activate(struct folio *folio);
 
-void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
-		   struct vm_area_struct *vma, unsigned long pg_start,
-		   unsigned long pg_end, unsigned long vma_end,
-		   bool mm_wr_locked);
+void free_pgtables(struct mmu_gather *tlb, struct unmap_desc *desc);
 
 void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte);
 
diff --git a/mm/memory.c b/mm/memory.c
index d68f8f082b1c..136b80ca357b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -373,12 +373,7 @@ void free_pgd_range(struct mmu_gather *tlb,
 /**
  * free_pgtables() - Free a range of page tables
  * @tlb: The mmu gather
- * @mas: The maple state
- * @vma: The first vma
- * @pg_start: The lowest page table address (floor)
- * @pg_end: The highest page table address (ceiling)
- * @vma_end: The highest vma tree search address
- * @mm_wr_locked: boolean indicating if the mm is write locked
+ * @unmap: The unmap_desc
  *
  * Note: pg_start and pg_end are provided to indicate the absolute range of the
  * page tables that should be removed.  This can differ from the vma mappings on
@@ -388,21 +383,19 @@ void free_pgd_range(struct mmu_gather *tlb,
  * The vma_end differs from the pg_end when a dup_mmap() failed and the tree has
  * unrelated data to the mm_struct being torn down.
  */
-void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
-		   struct vm_area_struct *vma, unsigned long pg_start,
-		   unsigned long pg_end, unsigned long vma_end,
-		   bool mm_wr_locked)
+void free_pgtables(struct mmu_gather *tlb, struct unmap_desc *unmap)
 {
 	struct unlink_vma_file_batch vb;
+	struct ma_state *mas = unmap->mas;
+	struct vm_area_struct *vma = unmap->first;
 
 	/*
 	 * Note: USER_PGTABLES_CEILING may be passed as the value of pg_end and
 	 * may be 0.  Underflow is expected in this case.  Otherwise the
-	 * pagetable end is exclusive.
-	 * vma_end is exclusive.
-	 * The last vma address should never be larger than the pagetable end.
+	 * pagetable end is exclusive.  vma_end is exclusive.  The last vma
+	 * address should never be larger than the pagetable end.
 	 */
-	WARN_ON_ONCE(vma_end - 1 > pg_end - 1);
+	WARN_ON_ONCE(unmap->vma_end - 1 > unmap->pg_end - 1);
 
 	tlb_free_vmas(tlb);
 
@@ -410,13 +403,13 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
 		unsigned long addr = vma->vm_start;
 		struct vm_area_struct *next;
 
-		next = mas_find(mas, vma_end - 1);
+		next = mas_find(mas, unmap->tree_end - 1);
 
 		/*
 		 * Hide vma from rmap and truncate_pagecache before freeing
 		 * pgtables
 		 */
-		if (mm_wr_locked)
+		if (unmap->mm_wr_locked)
 			vma_start_write(vma);
 		unlink_anon_vmas(vma);
 
@@ -428,16 +421,16 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
 		 */
 		while (next && next->vm_start <= vma->vm_end + PMD_SIZE) {
 			vma = next;
-			next = mas_find(mas, vma_end - 1);
-			if (mm_wr_locked)
+			next = mas_find(mas, unmap->tree_end - 1);
+			if (unmap->mm_wr_locked)
 				vma_start_write(vma);
 			unlink_anon_vmas(vma);
 			unlink_file_vma_batch_add(&vb, vma);
 		}
 		unlink_file_vma_batch_final(&vb);
 
-		free_pgd_range(tlb, addr, vma->vm_end,
-			pg_start, next ? next->vm_start : pg_end);
+		free_pgd_range(tlb, addr, vma->vm_end, unmap->pg_start,
+			       next ? next->vm_start : unmap->pg_end);
 		vma = next;
 	} while (vma);
 }
diff --git a/mm/mmap.c b/mm/mmap.c
index 042b6b4b6ab8..a03b7681e13c 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1307,10 +1307,10 @@ void exit_mmap(struct mm_struct *mm)
 	 */
 	mm_flags_set(MMF_OOM_SKIP, mm);
 	mmap_write_lock(mm);
+	unmap.mm_wr_locked = true;
 	mt_clear_in_rcu(&mm->mm_mt);
-	vma_iter_set(&vmi, vma->vm_end);
-	free_pgtables(&tlb, &vmi.mas, vma, FIRST_USER_ADDRESS,
-		      USER_PGTABLES_CEILING, USER_PGTABLES_CEILING, true);
+	unmap_pgtable_init(&unmap, &vmi);
+	free_pgtables(&tlb, &unmap);
 	tlb_finish_mmu(&tlb);
 
 	/*
diff --git a/mm/vma.c b/mm/vma.c
index 876d2db5329d..f352d5c72212 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -475,15 +475,13 @@ void remove_vma(struct vm_area_struct *vma)
 void unmap_region(struct unmap_desc *unmap)
 {
 	struct mm_struct *mm = unmap->first->vm_mm;
-	struct ma_state *mas = unmap->mas;
 	struct mmu_gather tlb;
 
 	tlb_gather_mmu(&tlb, mm);
 	update_hiwater_rss(mm);
 	unmap_vmas(&tlb, unmap);
-	mas_set(mas, unmap->tree_reset);
-	free_pgtables(&tlb, mas, unmap->first, unmap->pg_start, unmap->pg_end,
-		      unmap->tree_end, unmap->mm_wr_locked);
+	mas_set(unmap->mas, unmap->tree_reset);
+	free_pgtables(&tlb, unmap);
 	tlb_finish_mmu(&tlb);
 }
 
diff --git a/mm/vma.h b/mm/vma.h
index bb7fa5d2bde2..de30c69bceaf 100644
--- a/mm/vma.h
+++ b/mm/vma.h
@@ -167,6 +167,10 @@ struct unmap_desc {
 	bool mm_wr_locked;            /* If the mmap write lock is held */
 };
 
+/*
+ * unmap_all_init() - Initialize unmap_desc to remove all vmas, point the
+ * pg_start and pg_end to a safe location.
+ */
 static inline void unmap_all_init(struct unmap_desc *unmap,
 		struct vma_iterator *vmi, struct vm_area_struct *vma)
 {
@@ -181,6 +185,25 @@ static inline void unmap_all_init(struct unmap_desc *unmap,
 	unmap->mm_wr_locked = false;
 }
 
+/*
+ * unmap_pgtable_init() - Initialize unmap_desc to remove all page tables within
+ * the user range.
+ *
+ * ARM can have mappings outside of vmas.
+ * See: e2cdef8c847b4 ("[PATCH] freepgt: free_pgtables from FIRST_USER_ADDRESS")
+ *
+ * ARM LPAE uses page table mappings beyond the USER_PGTABLES_CEILING
+ * See: CONFIG_ARM_LPAE in arch/arm/include/asm/pgtable.h
+ */
+static inline void unmap_pgtable_init(struct unmap_desc *unmap,
+				      struct vma_iterator *vmi)
+{
+	vma_iter_set(vmi, unmap->tree_reset);
+	unmap->vma_start = FIRST_USER_ADDRESS;
+	unmap->vma_end = USER_PGTABLES_CEILING;
+	unmap->tree_end = USER_PGTABLES_CEILING;
+}
+
 #define UNMAP_STATE(name, _vmi, _vma, _vma_start, _vma_end, _prev, _next)      \
 	struct unmap_desc name = {                                             \
 		.mas = &(_vmi)->mas,                                           \
diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
index 0b4918aac8d6..ca4eb563b29b 100644
--- a/tools/testing/vma/vma_internal.h
+++ b/tools/testing/vma/vma_internal.h
@@ -1137,11 +1137,10 @@ static inline void unmap_vmas(struct mmu_gather *tlb, struct unmap_desc *unmap)
 {
 }
 
-static inline void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
-		   struct vm_area_struct *vma, unsigned long floor,
-		   unsigned long ceiling, unsigned long tree_max,
-		   bool mm_wr_locked)
+static inline void free_pgtables(struct mmu_gather *tlb, struct unmap_desc *desc)
 {
+	(void)tlb;
+	(void)desc;
 }
 
 static inline void mapping_unmap_writable(struct address_space *mapping)
-- 
cgit v1.2.3


From 21c8a5bae7bd594f5b89db551b618d60b994b8cf Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Date: Thu, 22 Jan 2026 16:06:13 +0000
Subject: tools: bitmap: add missing bitmap_[subset(), andnot()]

The bitmap_subset() and bitmap_andnot() functions are not present in the
tools version of include/linux/bitmap.h, so add them as subsequent patches
implement test code that requires them.

We also add the missing __bitmap_subset() to tools/lib/bitmap.c.

Link: https://lkml.kernel.org/r/0fd0d4ec868297f522003cb4b5898b53b498805b.1769097829.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Acked-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Barry Song <baohua@kernel.org>
Cc: David Hildenbrand <david@kernel.org>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Zi Yan <ziy@nvidia.com>
Cc: Damien Le Moal <dlemoal@kernel.org>
Cc: "Darrick J. Wong" <djwong@kernel.org>
Cc: Jarkko Sakkinen <jarkko@kernel.org>
Cc: Yury Norov <ynorov@nvidia.com>
Cc: Chris Mason <clm@fb.com>
Cc: Pedro Falcato <pfalcato@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/include/linux/bitmap.h | 22 ++++++++++++++++++++++
 tools/lib/bitmap.c           | 29 +++++++++++++++++++++++++++++
 2 files changed, 51 insertions(+)

(limited to 'tools')

diff --git a/tools/include/linux/bitmap.h b/tools/include/linux/bitmap.h
index 0d992245c600..250883090a5d 100644
--- a/tools/include/linux/bitmap.h
+++ b/tools/include/linux/bitmap.h
@@ -24,6 +24,10 @@ void __bitmap_set(unsigned long *map, unsigned int start, int len);
 void __bitmap_clear(unsigned long *map, unsigned int start, int len);
 bool __bitmap_intersects(const unsigned long *bitmap1,
 			 const unsigned long *bitmap2, unsigned int bits);
+bool __bitmap_subset(const unsigned long *bitmap1,
+		     const unsigned long *bitmap2, unsigned int nbits);
+bool __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1,
+		    const unsigned long *bitmap2, unsigned int nbits);
 
 #define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) & (BITS_PER_LONG - 1)))
 #define BITMAP_LAST_WORD_MASK(nbits) (~0UL >> (-(nbits) & (BITS_PER_LONG - 1)))
@@ -81,6 +85,15 @@ static inline void bitmap_or(unsigned long *dst, const unsigned long *src1,
 		__bitmap_or(dst, src1, src2, nbits);
 }
 
+static __always_inline
+bool bitmap_andnot(unsigned long *dst, const unsigned long *src1,
+		   const unsigned long *src2, unsigned int nbits)
+{
+	if (small_const_nbits(nbits))
+		return (*dst = *src1 & ~(*src2) & BITMAP_LAST_WORD_MASK(nbits)) != 0;
+	return __bitmap_andnot(dst, src1, src2, nbits);
+}
+
 static inline unsigned long *bitmap_alloc(unsigned int nbits, gfp_t flags __maybe_unused)
 {
 	return malloc(bitmap_size(nbits));
@@ -157,6 +170,15 @@ static inline bool bitmap_intersects(const unsigned long *src1,
 		return __bitmap_intersects(src1, src2, nbits);
 }
 
+static __always_inline
+bool bitmap_subset(const unsigned long *src1, const unsigned long *src2, unsigned int nbits)
+{
+	if (small_const_nbits(nbits))
+		return ! ((*src1 & ~(*src2)) & BITMAP_LAST_WORD_MASK(nbits));
+	else
+		return __bitmap_subset(src1, src2, nbits);
+}
+
 static inline void bitmap_set(unsigned long *map, unsigned int start, unsigned int nbits)
 {
 	if (__builtin_constant_p(nbits) && nbits == 1)
diff --git a/tools/lib/bitmap.c b/tools/lib/bitmap.c
index 51255c69754d..aa83d22c45e3 100644
--- a/tools/lib/bitmap.c
+++ b/tools/lib/bitmap.c
@@ -140,3 +140,32 @@ void __bitmap_clear(unsigned long *map, unsigned int start, int len)
 		*p &= ~mask_to_clear;
 	}
 }
+
+bool __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1,
+				const unsigned long *bitmap2, unsigned int bits)
+{
+	unsigned int k;
+	unsigned int lim = bits/BITS_PER_LONG;
+	unsigned long result = 0;
+
+	for (k = 0; k < lim; k++)
+		result |= (dst[k] = bitmap1[k] & ~bitmap2[k]);
+	if (bits % BITS_PER_LONG)
+		result |= (dst[k] = bitmap1[k] & ~bitmap2[k] &
+			   BITMAP_LAST_WORD_MASK(bits));
+	return result != 0;
+}
+
+bool __bitmap_subset(const unsigned long *bitmap1,
+		     const unsigned long *bitmap2, unsigned int bits)
+{
+	unsigned int k, lim = bits/BITS_PER_LONG;
+	for (k = 0; k < lim; ++k)
+		if (bitmap1[k] & ~bitmap2[k])
+			return false;
+
+	if (bits % BITS_PER_LONG)
+		if ((bitmap1[k] & ~bitmap2[k]) & BITMAP_LAST_WORD_MASK(bits))
+			return false;
+	return true;
+}
-- 
cgit v1.2.3


From bae0ba7c7c0a022287d8b093da63ebcb794d77ea Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Date: Thu, 22 Jan 2026 16:06:14 +0000
Subject: mm: add basic VMA flag operation helper functions

Now we have the mk_vma_flags() macro helper which permits easy
specification of any number of VMA flags, add helper functions which
operate with vma_flags_t parameters.

This patch provides vma_flags_test[_mask](), vma_flags_set[_mask]() and
vma_flags_clear[_mask]() respectively testing, setting and clearing flags
with the _mask variants accepting vma_flag_t parameters, and the non-mask
variants implemented as macros which accept a list of flags.

This allows us to trivially test/set/clear aggregate VMA flag values as
necessary, for instance:

	if (vma_flags_test(&flags, VMA_READ_BIT, VMA_WRITE_BIT))
		goto readwrite;

	vma_flags_set(&flags, VMA_READ_BIT, VMA_WRITE_BIT);

	vma_flags_clear(&flags, VMA_READ_BIT, VMA_WRITE_BIT);

We also add a function for testing that ALL flags are set for convenience,
e.g.:

	if (vma_flags_test_all(&flags, VMA_READ_BIT, VMA_MAYREAD_BIT)) {
		/* Both READ and MAYREAD flags set */
		...
	}

The compiler generates optimal assembly for each such that they behave as
if the caller were setting the bitmap flags manually.

This is important for e.g.  drivers which manipulate flag values rather
than a VMA's specific flag values.

We also add helpers for testing, setting and clearing flags for VMA's and
VMA descriptors to reduce boilerplate.

Also add the EMPTY_VMA_FLAGS define to aid initialisation of empty flags.

Finally, update the userland VMA tests to add the helpers there so they
can be utilised as part of userland testing.

Link: https://lkml.kernel.org/r/885d4897d67a6a57c0b07fa182a7055ad752df11.1769097829.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Pedro Falcato <pfalcato@suse.de>
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Barry Song <baohua@kernel.org>
Cc: David Hildenbrand <david@kernel.org>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Zi Yan <ziy@nvidia.com>
Cc: Damien Le Moal <dlemoal@kernel.org>
Cc: "Darrick J. Wong" <djwong@kernel.org>
Cc: Jarkko Sakkinen <jarkko@kernel.org>
Cc: Yury Norov <ynorov@nvidia.com>
Cc: Chris Mason <clm@fb.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h               | 165 +++++++++++++++++++++++++++++++++++++++
 include/linux/mm_types.h         |   4 +-
 tools/testing/vma/vma_internal.h | 147 +++++++++++++++++++++++++++++-----
 3 files changed, 295 insertions(+), 21 deletions(-)

(limited to 'tools')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index d3d10c769d6f..aa99b28e7a8a 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1059,6 +1059,171 @@ static inline vma_flags_t __mk_vma_flags(size_t count, const vma_flag_t *bits)
 #define mk_vma_flags(...) __mk_vma_flags(COUNT_ARGS(__VA_ARGS__), \
 					 (const vma_flag_t []){__VA_ARGS__})
 
+/*  Test each of to_test flags in flags, non-atomically. */
+static __always_inline bool vma_flags_test_mask(const vma_flags_t *flags,
+		vma_flags_t to_test)
+{
+	const unsigned long *bitmap = flags->__vma_flags;
+	const unsigned long *bitmap_to_test = to_test.__vma_flags;
+
+	return bitmap_intersects(bitmap_to_test, bitmap, NUM_VMA_FLAG_BITS);
+}
+
+/*
+ * Test whether any specified VMA flag is set, e.g.:
+ *
+ * if (vma_flags_test(flags, VMA_READ_BIT, VMA_MAYREAD_BIT)) { ... }
+ */
+#define vma_flags_test(flags, ...) \
+	vma_flags_test_mask(flags, mk_vma_flags(__VA_ARGS__))
+
+/* Test that ALL of the to_test flags are set, non-atomically. */
+static __always_inline bool vma_flags_test_all_mask(const vma_flags_t *flags,
+		vma_flags_t to_test)
+{
+	const unsigned long *bitmap = flags->__vma_flags;
+	const unsigned long *bitmap_to_test = to_test.__vma_flags;
+
+	return bitmap_subset(bitmap_to_test, bitmap, NUM_VMA_FLAG_BITS);
+}
+
+/*
+ * Test whether ALL specified VMA flags are set, e.g.:
+ *
+ * if (vma_flags_test_all(flags, VMA_READ_BIT, VMA_MAYREAD_BIT)) { ... }
+ */
+#define vma_flags_test_all(flags, ...) \
+	vma_flags_test_all_mask(flags, mk_vma_flags(__VA_ARGS__))
+
+/* Set each of the to_set flags in flags, non-atomically. */
+static __always_inline void vma_flags_set_mask(vma_flags_t *flags, vma_flags_t to_set)
+{
+	unsigned long *bitmap = flags->__vma_flags;
+	const unsigned long *bitmap_to_set = to_set.__vma_flags;
+
+	bitmap_or(bitmap, bitmap, bitmap_to_set, NUM_VMA_FLAG_BITS);
+}
+
+/*
+ * Set all specified VMA flags, e.g.:
+ *
+ * vma_flags_set(&flags, VMA_READ_BIT, VMA_WRITE_BIT, VMA_EXEC_BIT);
+ */
+#define vma_flags_set(flags, ...) \
+	vma_flags_set_mask(flags, mk_vma_flags(__VA_ARGS__))
+
+/* Clear all of the to-clear flags in flags, non-atomically. */
+static __always_inline void vma_flags_clear_mask(vma_flags_t *flags, vma_flags_t to_clear)
+{
+	unsigned long *bitmap = flags->__vma_flags;
+	const unsigned long *bitmap_to_clear = to_clear.__vma_flags;
+
+	bitmap_andnot(bitmap, bitmap, bitmap_to_clear, NUM_VMA_FLAG_BITS);
+}
+
+/*
+ * Clear all specified individual flags, e.g.:
+ *
+ * vma_flags_clear(&flags, VMA_READ_BIT, VMA_WRITE_BIT, VMA_EXEC_BIT);
+ */
+#define vma_flags_clear(flags, ...) \
+	vma_flags_clear_mask(flags, mk_vma_flags(__VA_ARGS__))
+
+/*
+ * Helper to test that ALL specified flags are set in a VMA.
+ *
+ * Note: appropriate locks must be held, this function does not acquire them for
+ * you.
+ */
+static inline bool vma_test_all_flags_mask(const struct vm_area_struct *vma,
+					   vma_flags_t flags)
+{
+	return vma_flags_test_all_mask(&vma->flags, flags);
+}
+
+/*
+ * Helper macro for checking that ALL specified flags are set in a VMA, e.g.:
+ *
+ * if (vma_test_all_flags(vma, VMA_READ_BIT, VMA_MAYREAD_BIT) { ... }
+ */
+#define vma_test_all_flags(vma, ...) \
+	vma_test_all_flags_mask(vma, mk_vma_flags(__VA_ARGS__))
+
+/*
+ * Helper to set all VMA flags in a VMA.
+ *
+ * Note: appropriate locks must be held, this function does not acquire them for
+ * you.
+ */
+static inline void vma_set_flags_mask(struct vm_area_struct *vma,
+				      vma_flags_t flags)
+{
+	vma_flags_set_mask(&vma->flags, flags);
+}
+
+/*
+ * Helper macro for specifying VMA flags in a VMA, e.g.:
+ *
+ * vma_set_flags(vma, VMA_IO_BIT, VMA_PFNMAP_BIT, VMA_DONTEXPAND_BIT,
+ * 		VMA_DONTDUMP_BIT);
+ *
+ * Note: appropriate locks must be held, this function does not acquire them for
+ * you.
+ */
+#define vma_set_flags(vma, ...) \
+	vma_set_flags_mask(vma, mk_vma_flags(__VA_ARGS__))
+
+/* Helper to test all VMA flags in a VMA descriptor. */
+static inline bool vma_desc_test_flags_mask(const struct vm_area_desc *desc,
+					    vma_flags_t flags)
+{
+	return vma_flags_test_mask(&desc->vma_flags, flags);
+}
+
+/*
+ * Helper macro for testing VMA flags for an input pointer to a struct
+ * vm_area_desc object describing a proposed VMA, e.g.:
+ *
+ * if (vma_desc_test_flags(desc, VMA_IO_BIT, VMA_PFNMAP_BIT,
+ *		VMA_DONTEXPAND_BIT, VMA_DONTDUMP_BIT)) { ... }
+ */
+#define vma_desc_test_flags(desc, ...) \
+	vma_desc_test_flags_mask(desc, mk_vma_flags(__VA_ARGS__))
+
+/* Helper to set all VMA flags in a VMA descriptor. */
+static inline void vma_desc_set_flags_mask(struct vm_area_desc *desc,
+					   vma_flags_t flags)
+{
+	vma_flags_set_mask(&desc->vma_flags, flags);
+}
+
+/*
+ * Helper macro for specifying VMA flags for an input pointer to a struct
+ * vm_area_desc object describing a proposed VMA, e.g.:
+ *
+ * vma_desc_set_flags(desc, VMA_IO_BIT, VMA_PFNMAP_BIT, VMA_DONTEXPAND_BIT,
+ * 		VMA_DONTDUMP_BIT);
+ */
+#define vma_desc_set_flags(desc, ...) \
+	vma_desc_set_flags_mask(desc, mk_vma_flags(__VA_ARGS__))
+
+/* Helper to clear all VMA flags in a VMA descriptor. */
+static inline void vma_desc_clear_flags_mask(struct vm_area_desc *desc,
+					     vma_flags_t flags)
+{
+	vma_flags_clear_mask(&desc->vma_flags, flags);
+}
+
+/*
+ * Helper macro for clearing VMA flags for an input pointer to a struct
+ * vm_area_desc object describing a proposed VMA, e.g.:
+ *
+ * vma_desc_clear_flags(desc, VMA_IO_BIT, VMA_PFNMAP_BIT, VMA_DONTEXPAND_BIT,
+ * 		VMA_DONTDUMP_BIT);
+ */
+#define vma_desc_clear_flags(desc, ...) \
+	vma_desc_clear_flags_mask(desc, mk_vma_flags(__VA_ARGS__))
+
 static inline void vma_set_anonymous(struct vm_area_struct *vma)
 {
 	vma->vm_ops = NULL;
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index ed0e128361f7..9b4311cfd5e8 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -844,7 +844,7 @@ struct mmap_action {
 
 	/*
 	 * If specified, this hook is invoked when an error occurred when
-	 * attempting the selection action.
+	 * attempting the selected action.
 	 *
 	 * The hook can return an error code in order to filter the error, but
 	 * it is not valid to clear the error here.
@@ -868,6 +868,8 @@ typedef struct {
 	DECLARE_BITMAP(__vma_flags, NUM_VMA_FLAG_BITS);
 } vma_flags_t;
 
+#define EMPTY_VMA_FLAGS ((vma_flags_t){ })
+
 /*
  * Describes a VMA that is about to be mmap()'ed. Drivers may choose to
  * manipulate mutable fields which will cause those fields to be updated in the
diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
index ca4eb563b29b..2b01794cbd61 100644
--- a/tools/testing/vma/vma_internal.h
+++ b/tools/testing/vma/vma_internal.h
@@ -21,7 +21,13 @@
 
 #include <stdlib.h>
 
+#ifdef __CONCAT
+#undef __CONCAT
+#endif
+
+#include <linux/args.h>
 #include <linux/atomic.h>
+#include <linux/bitmap.h>
 #include <linux/list.h>
 #include <linux/maple_tree.h>
 #include <linux/mm.h>
@@ -38,6 +44,8 @@ extern unsigned long dac_mmap_min_addr;
 #define dac_mmap_min_addr	0UL
 #endif
 
+#define ACCESS_PRIVATE(p, member) ((p)->member)
+
 #define VM_WARN_ON(_expr) (WARN_ON(_expr))
 #define VM_WARN_ON_ONCE(_expr) (WARN_ON_ONCE(_expr))
 #define VM_WARN_ON_VMG(_expr, _vmg) (WARN_ON(_expr))
@@ -533,6 +541,8 @@ typedef struct {
 	DECLARE_BITMAP(__vma_flags, NUM_VMA_FLAG_BITS);
 } __private vma_flags_t;
 
+#define EMPTY_VMA_FLAGS ((vma_flags_t){ })
+
 struct mm_struct {
 	struct maple_tree mm_mt;
 	int map_count;			/* number of VMAs */
@@ -882,6 +892,123 @@ static inline pgprot_t vm_get_page_prot(vm_flags_t vm_flags)
 	return __pgprot(vm_flags);
 }
 
+static inline void vma_flags_clear_all(vma_flags_t *flags)
+{
+	bitmap_zero(flags->__vma_flags, NUM_VMA_FLAG_BITS);
+}
+
+static inline void vma_flag_set(vma_flags_t *flags, vma_flag_t bit)
+{
+	unsigned long *bitmap = flags->__vma_flags;
+
+	__set_bit((__force int)bit, bitmap);
+}
+
+static inline vma_flags_t __mk_vma_flags(size_t count, const vma_flag_t *bits)
+{
+	vma_flags_t flags;
+	int i;
+
+	vma_flags_clear_all(&flags);
+	for (i = 0; i < count; i++)
+		vma_flag_set(&flags, bits[i]);
+	return flags;
+}
+
+#define mk_vma_flags(...) __mk_vma_flags(COUNT_ARGS(__VA_ARGS__), \
+					 (const vma_flag_t []){__VA_ARGS__})
+
+static __always_inline bool vma_flags_test_mask(const vma_flags_t *flags,
+		vma_flags_t to_test)
+{
+	const unsigned long *bitmap = flags->__vma_flags;
+	const unsigned long *bitmap_to_test = to_test.__vma_flags;
+
+	return bitmap_intersects(bitmap_to_test, bitmap, NUM_VMA_FLAG_BITS);
+}
+
+#define vma_flags_test(flags, ...) \
+	vma_flags_test_mask(flags, mk_vma_flags(__VA_ARGS__))
+
+static __always_inline bool vma_flags_test_all_mask(const vma_flags_t *flags,
+		vma_flags_t to_test)
+{
+	const unsigned long *bitmap = flags->__vma_flags;
+	const unsigned long *bitmap_to_test = to_test.__vma_flags;
+
+	return bitmap_subset(bitmap_to_test, bitmap, NUM_VMA_FLAG_BITS);
+}
+
+#define vma_flags_test_all(flags, ...) \
+	vma_flags_test_all_mask(flags, mk_vma_flags(__VA_ARGS__))
+
+static __always_inline void vma_flags_set_mask(vma_flags_t *flags, vma_flags_t to_set)
+{
+	unsigned long *bitmap = flags->__vma_flags;
+	const unsigned long *bitmap_to_set = to_set.__vma_flags;
+
+	bitmap_or(bitmap, bitmap, bitmap_to_set, NUM_VMA_FLAG_BITS);
+}
+
+#define vma_flags_set(flags, ...) \
+	vma_flags_set_mask(flags, mk_vma_flags(__VA_ARGS__))
+
+static __always_inline void vma_flags_clear_mask(vma_flags_t *flags, vma_flags_t to_clear)
+{
+	unsigned long *bitmap = flags->__vma_flags;
+	const unsigned long *bitmap_to_clear = to_clear.__vma_flags;
+
+	bitmap_andnot(bitmap, bitmap, bitmap_to_clear, NUM_VMA_FLAG_BITS);
+}
+
+#define vma_flags_clear(flags, ...) \
+	vma_flags_clear_mask(flags, mk_vma_flags(__VA_ARGS__))
+
+static inline bool vma_test_all_flags_mask(const struct vm_area_struct *vma,
+					   vma_flags_t flags)
+{
+	return vma_flags_test_all_mask(&vma->flags, flags);
+}
+
+#define vma_test_all_flags(vma, ...) \
+	vma_test_all_flags_mask(vma, mk_vma_flags(__VA_ARGS__))
+
+static inline void vma_set_flags_mask(struct vm_area_struct *vma,
+				      vma_flags_t flags)
+{
+	vma_flags_set_mask(&vma->flags, flags);
+}
+
+#define vma_set_flags(vma, ...) \
+	vma_set_flags_mask(vma, mk_vma_flags(__VA_ARGS__))
+
+static inline bool vma_desc_test_flags_mask(const struct vm_area_desc *desc,
+					    vma_flags_t flags)
+{
+	return vma_flags_test_mask(&desc->vma_flags, flags);
+}
+
+#define vma_desc_test_flags(desc, ...) \
+	vma_desc_test_flags_mask(desc, mk_vma_flags(__VA_ARGS__))
+
+static inline void vma_desc_set_flags_mask(struct vm_area_desc *desc,
+					   vma_flags_t flags)
+{
+	vma_flags_set_mask(&desc->vma_flags, flags);
+}
+
+#define vma_desc_set_flags(desc, ...) \
+	vma_desc_set_flags_mask(desc, mk_vma_flags(__VA_ARGS__))
+
+static inline void vma_desc_clear_flags_mask(struct vm_area_desc *desc,
+					     vma_flags_t flags)
+{
+	vma_flags_clear_mask(&desc->vma_flags, flags);
+}
+
+#define vma_desc_clear_flags(desc, ...) \
+	vma_desc_clear_flags_mask(desc, mk_vma_flags(__VA_ARGS__))
+
 static inline bool is_shared_maywrite(vm_flags_t vm_flags)
 {
 	return (vm_flags & (VM_SHARED | VM_MAYWRITE)) ==
@@ -1540,31 +1667,11 @@ static inline void userfaultfd_unmap_complete(struct mm_struct *mm,
 {
 }
 
-#define ACCESS_PRIVATE(p, member) ((p)->member)
-
-#define bitmap_size(nbits)	(ALIGN(nbits, BITS_PER_LONG) / BITS_PER_BYTE)
-
-static __always_inline void bitmap_zero(unsigned long *dst, unsigned int nbits)
-{
-	unsigned int len = bitmap_size(nbits);
-
-	if (small_const_nbits(nbits))
-		*dst = 0;
-	else
-		memset(dst, 0, len);
-}
-
 static inline bool mm_flags_test(int flag, const struct mm_struct *mm)
 {
 	return test_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags));
 }
 
-/* Clears all bits in the VMA flags bitmap, non-atomically. */
-static inline void vma_flags_clear_all(vma_flags_t *flags)
-{
-	bitmap_zero(ACCESS_PRIVATE(flags, __vma_flags), NUM_VMA_FLAG_BITS);
-}
-
 /*
  * Copy value to the first system word of VMA flags, non-atomically.
  *
-- 
cgit v1.2.3


From 53f1d936445131cb5da2212c2b60884a25cb0330 Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Date: Thu, 22 Jan 2026 16:06:19 +0000
Subject: mm: make vm_area_desc utilise vma_flags_t only

Now we have eliminated all uses of vm_area_desc->vm_flags, eliminate this
field, and have mmap_prepare users utilise the vma_flags_t
vm_area_desc->vma_flags field only.

As part of this change we alter is_shared_maywrite() to accept a
vma_flags_t parameter, and introduce is_shared_maywrite_vm_flags() for use
with legacy vm_flags_t flags.

We also update struct mmap_state to add a union between vma_flags and
vm_flags temporarily until the mmap logic is also converted to using
vma_flags_t.

Also update the VMA userland tests to reflect this change.

Link: https://lkml.kernel.org/r/fd2a2938b246b4505321954062b1caba7acfc77a.1769097829.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Pedro Falcato <pfalcato@suse.de>
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Barry Song <baohua@kernel.org>
Cc: David Hildenbrand <david@kernel.org>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Zi Yan <ziy@nvidia.com>
Cc: Damien Le Moal <dlemoal@kernel.org>
Cc: "Darrick J. Wong" <djwong@kernel.org>
Cc: Jarkko Sakkinen <jarkko@kernel.org>
Cc: Yury Norov <ynorov@nvidia.com>
Cc: Chris Mason <clm@fb.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h               |  9 +++++++--
 include/linux/mm_types.h         |  5 +----
 mm/filemap.c                     |  2 +-
 mm/util.c                        |  2 +-
 mm/vma.c                         | 11 +++++++----
 mm/vma.h                         |  3 +--
 tools/testing/vma/vma_internal.h |  9 +++++++--
 7 files changed, 25 insertions(+), 16 deletions(-)

(limited to 'tools')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 05d950805701..f8a8fd47399c 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1290,15 +1290,20 @@ static inline bool vma_is_accessible(const struct vm_area_struct *vma)
 	return vma->vm_flags & VM_ACCESS_FLAGS;
 }
 
-static inline bool is_shared_maywrite(vm_flags_t vm_flags)
+static inline bool is_shared_maywrite_vm_flags(vm_flags_t vm_flags)
 {
 	return (vm_flags & (VM_SHARED | VM_MAYWRITE)) ==
 		(VM_SHARED | VM_MAYWRITE);
 }
 
+static inline bool is_shared_maywrite(const vma_flags_t *flags)
+{
+	return vma_flags_test_all(flags, VMA_SHARED_BIT, VMA_MAYWRITE_BIT);
+}
+
 static inline bool vma_is_shared_maywrite(const struct vm_area_struct *vma)
 {
-	return is_shared_maywrite(vma->vm_flags);
+	return is_shared_maywrite(&vma->flags);
 }
 
 static inline
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 9b4311cfd5e8..3cc8ae722886 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -887,10 +887,7 @@ struct vm_area_desc {
 	/* Mutable fields. Populated with initial state. */
 	pgoff_t pgoff;
 	struct file *vm_file;
-	union {
-		vm_flags_t vm_flags;
-		vma_flags_t vma_flags;
-	};
+	vma_flags_t vma_flags;
 	pgprot_t page_prot;
 
 	/* Write-only fields. */
diff --git a/mm/filemap.c b/mm/filemap.c
index ebd75684cb0a..6cd7974d4ada 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -4012,7 +4012,7 @@ int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
 
 int generic_file_readonly_mmap_prepare(struct vm_area_desc *desc)
 {
-	if (is_shared_maywrite(desc->vm_flags))
+	if (is_shared_maywrite(&desc->vma_flags))
 		return -EINVAL;
 	return generic_file_mmap_prepare(desc);
 }
diff --git a/mm/util.c b/mm/util.c
index 97cae40c0209..b05ab6f97e11 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -1154,7 +1154,7 @@ int __compat_vma_mmap(const struct file_operations *f_op,
 
 		.pgoff = vma->vm_pgoff,
 		.vm_file = vma->vm_file,
-		.vm_flags = vma->vm_flags,
+		.vma_flags = vma->flags,
 		.page_prot = vma->vm_page_prot,
 
 		.action.type = MMAP_NOTHING, /* Default */
diff --git a/mm/vma.c b/mm/vma.c
index 39dcd9ddd4ba..be64f781a3aa 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -15,7 +15,10 @@ struct mmap_state {
 	unsigned long end;
 	pgoff_t pgoff;
 	unsigned long pglen;
-	vm_flags_t vm_flags;
+	union {
+		vm_flags_t vm_flags;
+		vma_flags_t vma_flags;
+	};
 	struct file *file;
 	pgprot_t page_prot;
 
@@ -2369,7 +2372,7 @@ static void set_desc_from_map(struct vm_area_desc *desc,
 
 	desc->pgoff = map->pgoff;
 	desc->vm_file = map->file;
-	desc->vm_flags = map->vm_flags;
+	desc->vma_flags = map->vma_flags;
 	desc->page_prot = map->page_prot;
 }
 
@@ -2650,7 +2653,7 @@ static int call_mmap_prepare(struct mmap_state *map,
 		map->file_doesnt_need_get = true;
 		map->file = desc->vm_file;
 	}
-	map->vm_flags = desc->vm_flags;
+	map->vma_flags = desc->vma_flags;
 	map->page_prot = desc->page_prot;
 	/* User-defined fields. */
 	map->vm_ops = desc->vm_ops;
@@ -2823,7 +2826,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
 		return -EINVAL;
 
 	/* Map writable and ensure this isn't a sealed memfd. */
-	if (file && is_shared_maywrite(vm_flags)) {
+	if (file && is_shared_maywrite_vm_flags(vm_flags)) {
 		int error = mapping_map_writable(file->f_mapping);
 
 		if (error)
diff --git a/mm/vma.h b/mm/vma.h
index de30c69bceaf..eba388c61ef4 100644
--- a/mm/vma.h
+++ b/mm/vma.h
@@ -309,8 +309,7 @@ static inline void set_vma_from_desc(struct vm_area_struct *vma,
 	vma->vm_pgoff = desc->pgoff;
 	if (desc->vm_file != vma->vm_file)
 		vma_set_file(vma, desc->vm_file);
-	if (desc->vm_flags != vma->vm_flags)
-		vm_flags_set(vma, desc->vm_flags);
+	vma->flags = desc->vma_flags;
 	vma->vm_page_prot = desc->page_prot;
 
 	/* User-defined fields. */
diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
index 2b01794cbd61..2743f12ecf32 100644
--- a/tools/testing/vma/vma_internal.h
+++ b/tools/testing/vma/vma_internal.h
@@ -1009,15 +1009,20 @@ static inline void vma_desc_clear_flags_mask(struct vm_area_desc *desc,
 #define vma_desc_clear_flags(desc, ...) \
 	vma_desc_clear_flags_mask(desc, mk_vma_flags(__VA_ARGS__))
 
-static inline bool is_shared_maywrite(vm_flags_t vm_flags)
+static inline bool is_shared_maywrite_vm_flags(vm_flags_t vm_flags)
 {
 	return (vm_flags & (VM_SHARED | VM_MAYWRITE)) ==
 		(VM_SHARED | VM_MAYWRITE);
 }
 
+static inline bool is_shared_maywrite(const vma_flags_t *flags)
+{
+	return vma_flags_test_all(flags, VMA_SHARED_BIT, VMA_MAYWRITE_BIT);
+}
+
 static inline bool vma_is_shared_maywrite(struct vm_area_struct *vma)
 {
-	return is_shared_maywrite(vma->vm_flags);
+	return is_shared_maywrite(&vma->flags);
 }
 
 static inline struct vm_area_struct *vma_next(struct vma_iterator *vmi)
-- 
cgit v1.2.3


From 6aacab308a5dfd222b2d23662bbae60c11007cfb Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Date: Thu, 22 Jan 2026 16:06:20 +0000
Subject: tools/testing/vma: separate VMA userland tests into separate files

So far the userland VMA tests have been established as a rough expression
of what's been possible.

Adapt it into a more usable form by separating out tests and shared
helper functions.

Since we test functions that are declared statically in mm/vma.c, we make
use of the trick of #include'ing kernel C files directly.

In order for the tests to continue to function, we must therefore also
this way into the tests/ directory.

We try to keep as much shared logic actually modularised into a separate
compilation unit in shared.c, however the merge_existing() and
attach_vma() helpers rely on statically declared mm/vma.c functions so
these must be declared in main.c.

Link: https://lkml.kernel.org/r/a0455ccfe4fdcd1c962c64f76304f612e5662a4e.1769097829.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Barry Song <baohua@kernel.org>
Cc: David Hildenbrand <david@kernel.org>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Zi Yan <ziy@nvidia.com>
Cc: Damien Le Moal <dlemoal@kernel.org>
Cc: "Darrick J. Wong" <djwong@kernel.org>
Cc: Jarkko Sakkinen <jarkko@kernel.org>
Cc: Yury Norov <ynorov@nvidia.com>
Cc: Chris Mason <clm@fb.com>
Cc: Pedro Falcato <pfalcato@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/testing/vma/Makefile       |    4 +-
 tools/testing/vma/main.c         |   55 ++
 tools/testing/vma/shared.c       |  131 +++
 tools/testing/vma/shared.h       |  114 +++
 tools/testing/vma/tests/merge.c  | 1469 +++++++++++++++++++++++++++++++
 tools/testing/vma/tests/mmap.c   |   57 ++
 tools/testing/vma/tests/vma.c    |   39 +
 tools/testing/vma/vma.c          | 1785 --------------------------------------
 tools/testing/vma/vma_internal.h |    9 -
 9 files changed, 1867 insertions(+), 1796 deletions(-)
 create mode 100644 tools/testing/vma/main.c
 create mode 100644 tools/testing/vma/shared.c
 create mode 100644 tools/testing/vma/shared.h
 create mode 100644 tools/testing/vma/tests/merge.c
 create mode 100644 tools/testing/vma/tests/mmap.c
 create mode 100644 tools/testing/vma/tests/vma.c
 delete mode 100644 tools/testing/vma/vma.c

(limited to 'tools')

diff --git a/tools/testing/vma/Makefile b/tools/testing/vma/Makefile
index 66f3831a668f..94133d9d3955 100644
--- a/tools/testing/vma/Makefile
+++ b/tools/testing/vma/Makefile
@@ -6,10 +6,10 @@ default: vma
 
 include ../shared/shared.mk
 
-OFILES = $(SHARED_OFILES) vma.o maple-shim.o
+OFILES = $(SHARED_OFILES) main.o shared.o maple-shim.o
 TARGETS = vma
 
-vma.o: vma.c vma_internal.h ../../../mm/vma.c ../../../mm/vma_init.c ../../../mm/vma_exec.c ../../../mm/vma.h
+main.o: main.c shared.c shared.h vma_internal.h tests/merge.c tests/mmap.c tests/vma.c ../../../mm/vma.c ../../../mm/vma_init.c ../../../mm/vma_exec.c ../../../mm/vma.h
 
 vma:	$(OFILES)
 	$(CC) $(CFLAGS) -o $@ $(OFILES) $(LDLIBS)
diff --git a/tools/testing/vma/main.c b/tools/testing/vma/main.c
new file mode 100644
index 000000000000..49b09e97a51f
--- /dev/null
+++ b/tools/testing/vma/main.c
@@ -0,0 +1,55 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include "shared.h"
+/*
+ * Directly import the VMA implementation here. Our vma_internal.h wrapper
+ * provides userland-equivalent functionality for everything vma.c uses.
+ */
+#include "../../../mm/vma_init.c"
+#include "../../../mm/vma_exec.c"
+#include "../../../mm/vma.c"
+
+/* Tests are included directly so they can test static functions in mm/vma.c. */
+#include "tests/merge.c"
+#include "tests/mmap.c"
+#include "tests/vma.c"
+
+/* Helper functions which utilise static kernel functions. */
+
+struct vm_area_struct *merge_existing(struct vma_merge_struct *vmg)
+{
+	struct vm_area_struct *vma;
+
+	vma = vma_merge_existing_range(vmg);
+	if (vma)
+		vma_assert_attached(vma);
+	return vma;
+}
+
+int attach_vma(struct mm_struct *mm, struct vm_area_struct *vma)
+{
+	int res;
+
+	res = vma_link(mm, vma);
+	if (!res)
+		vma_assert_attached(vma);
+	return res;
+}
+
+/* Main test running which invokes tests/ *.c runners. */
+int main(void)
+{
+	int num_tests = 0, num_fail = 0;
+
+	maple_tree_init();
+	vma_state_init();
+
+	run_merge_tests(&num_tests, &num_fail);
+	run_mmap_tests(&num_tests, &num_fail);
+	run_vma_tests(&num_tests, &num_fail);
+
+	printf("%d tests run, %d passed, %d failed.\n",
+	       num_tests, num_tests - num_fail, num_fail);
+
+	return num_fail == 0 ? EXIT_SUCCESS : EXIT_FAILURE;
+}
diff --git a/tools/testing/vma/shared.c b/tools/testing/vma/shared.c
new file mode 100644
index 000000000000..bda578cc3304
--- /dev/null
+++ b/tools/testing/vma/shared.c
@@ -0,0 +1,131 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include "shared.h"
+
+
+bool fail_prealloc;
+unsigned long mmap_min_addr = CONFIG_DEFAULT_MMAP_MIN_ADDR;
+unsigned long dac_mmap_min_addr = CONFIG_DEFAULT_MMAP_MIN_ADDR;
+unsigned long stack_guard_gap = 256UL<<PAGE_SHIFT;
+
+const struct vm_operations_struct vma_dummy_vm_ops;
+struct anon_vma dummy_anon_vma;
+struct task_struct __current;
+
+struct vm_area_struct *alloc_vma(struct mm_struct *mm,
+		unsigned long start, unsigned long end,
+		pgoff_t pgoff, vm_flags_t vm_flags)
+{
+	struct vm_area_struct *vma = vm_area_alloc(mm);
+
+	if (vma == NULL)
+		return NULL;
+
+	vma->vm_start = start;
+	vma->vm_end = end;
+	vma->vm_pgoff = pgoff;
+	vm_flags_reset(vma, vm_flags);
+	vma_assert_detached(vma);
+
+	return vma;
+}
+
+void detach_free_vma(struct vm_area_struct *vma)
+{
+	vma_mark_detached(vma);
+	vm_area_free(vma);
+}
+
+struct vm_area_struct *alloc_and_link_vma(struct mm_struct *mm,
+		unsigned long start, unsigned long end,
+		pgoff_t pgoff, vm_flags_t vm_flags)
+{
+	struct vm_area_struct *vma = alloc_vma(mm, start, end, pgoff, vm_flags);
+
+	if (vma == NULL)
+		return NULL;
+
+	if (attach_vma(mm, vma)) {
+		detach_free_vma(vma);
+		return NULL;
+	}
+
+	/*
+	 * Reset this counter which we use to track whether writes have
+	 * begun. Linking to the tree will have caused this to be incremented,
+	 * which means we will get a false positive otherwise.
+	 */
+	vma->vm_lock_seq = UINT_MAX;
+
+	return vma;
+}
+
+void reset_dummy_anon_vma(void)
+{
+	dummy_anon_vma.was_cloned = false;
+	dummy_anon_vma.was_unlinked = false;
+}
+
+int cleanup_mm(struct mm_struct *mm, struct vma_iterator *vmi)
+{
+	struct vm_area_struct *vma;
+	int count = 0;
+
+	fail_prealloc = false;
+	reset_dummy_anon_vma();
+
+	vma_iter_set(vmi, 0);
+	for_each_vma(*vmi, vma) {
+		detach_free_vma(vma);
+		count++;
+	}
+
+	mtree_destroy(&mm->mm_mt);
+	mm->map_count = 0;
+	return count;
+}
+
+bool vma_write_started(struct vm_area_struct *vma)
+{
+	int seq = vma->vm_lock_seq;
+
+	/* We reset after each check. */
+	vma->vm_lock_seq = UINT_MAX;
+
+	/* The vma_start_write() stub simply increments this value. */
+	return seq > -1;
+}
+
+void __vma_set_dummy_anon_vma(struct vm_area_struct *vma,
+		struct anon_vma_chain *avc, struct anon_vma *anon_vma)
+{
+	vma->anon_vma = anon_vma;
+	INIT_LIST_HEAD(&vma->anon_vma_chain);
+	list_add(&avc->same_vma, &vma->anon_vma_chain);
+	avc->anon_vma = vma->anon_vma;
+}
+
+void vma_set_dummy_anon_vma(struct vm_area_struct *vma,
+		struct anon_vma_chain *avc)
+{
+	__vma_set_dummy_anon_vma(vma, avc, &dummy_anon_vma);
+}
+
+struct task_struct *get_current(void)
+{
+	return &__current;
+}
+
+unsigned long rlimit(unsigned int limit)
+{
+	return (unsigned long)-1;
+}
+
+void vma_set_range(struct vm_area_struct *vma,
+		   unsigned long start, unsigned long end,
+		   pgoff_t pgoff)
+{
+	vma->vm_start = start;
+	vma->vm_end = end;
+	vma->vm_pgoff = pgoff;
+}
diff --git a/tools/testing/vma/shared.h b/tools/testing/vma/shared.h
new file mode 100644
index 000000000000..6c64211cfa22
--- /dev/null
+++ b/tools/testing/vma/shared.h
@@ -0,0 +1,114 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#pragma once
+
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "generated/bit-length.h"
+#include "maple-shared.h"
+#include "vma_internal.h"
+#include "../../../mm/vma.h"
+
+/* Simple test runner. Assumes local num_[fail, tests] counters. */
+#define TEST(name)							\
+	do {								\
+		(*num_tests)++;						\
+		if (!test_##name()) {					\
+			(*num_fail)++;					\
+			fprintf(stderr, "Test " #name " FAILED\n");	\
+		}							\
+	} while (0)
+
+#define ASSERT_TRUE(_expr)						\
+	do {								\
+		if (!(_expr)) {						\
+			fprintf(stderr,					\
+				"Assert FAILED at %s:%d:%s(): %s is FALSE.\n", \
+				__FILE__, __LINE__, __FUNCTION__, #_expr); \
+			return false;					\
+		}							\
+	} while (0)
+
+#define ASSERT_FALSE(_expr) ASSERT_TRUE(!(_expr))
+#define ASSERT_EQ(_val1, _val2) ASSERT_TRUE((_val1) == (_val2))
+#define ASSERT_NE(_val1, _val2) ASSERT_TRUE((_val1) != (_val2))
+
+#define IS_SET(_val, _flags) ((_val & _flags) == _flags)
+
+extern bool fail_prealloc;
+
+/* Override vma_iter_prealloc() so we can choose to fail it. */
+#define vma_iter_prealloc(vmi, vma)					\
+	(fail_prealloc ? -ENOMEM : mas_preallocate(&(vmi)->mas, (vma), GFP_KERNEL))
+
+#define CONFIG_DEFAULT_MMAP_MIN_ADDR 65536
+
+extern unsigned long mmap_min_addr;
+extern unsigned long dac_mmap_min_addr;
+extern unsigned long stack_guard_gap;
+
+extern const struct vm_operations_struct vma_dummy_vm_ops;
+extern struct anon_vma dummy_anon_vma;
+extern struct task_struct __current;
+
+/*
+ * Helper function which provides a wrapper around a merge existing VMA
+ * operation.
+ *
+ * Declared in main.c as uses static VMA function.
+ */
+struct vm_area_struct *merge_existing(struct vma_merge_struct *vmg);
+
+/*
+ * Helper function to allocate a VMA and link it to the tree.
+ *
+ * Declared in main.c as uses static VMA function.
+ */
+int attach_vma(struct mm_struct *mm, struct vm_area_struct *vma);
+
+/* Helper function providing a dummy vm_ops->close() method.*/
+static inline void dummy_close(struct vm_area_struct *)
+{
+}
+
+/* Helper function to simply allocate a VMA. */
+struct vm_area_struct *alloc_vma(struct mm_struct *mm,
+		unsigned long start, unsigned long end,
+		pgoff_t pgoff, vm_flags_t vm_flags);
+
+/* Helper function to detach and free a VMA. */
+void detach_free_vma(struct vm_area_struct *vma);
+
+/* Helper function to allocate a VMA and link it to the tree. */
+struct vm_area_struct *alloc_and_link_vma(struct mm_struct *mm,
+		unsigned long start, unsigned long end,
+		pgoff_t pgoff, vm_flags_t vm_flags);
+
+/*
+ * Helper function to reset the dummy anon_vma to indicate it has not been
+ * duplicated.
+ */
+void reset_dummy_anon_vma(void);
+
+/*
+ * Helper function to remove all VMAs and destroy the maple tree associated with
+ * a virtual address space. Returns a count of VMAs in the tree.
+ */
+int cleanup_mm(struct mm_struct *mm, struct vma_iterator *vmi);
+
+/* Helper function to determine if VMA has had vma_start_write() performed. */
+bool vma_write_started(struct vm_area_struct *vma);
+
+void __vma_set_dummy_anon_vma(struct vm_area_struct *vma,
+		struct anon_vma_chain *avc, struct anon_vma *anon_vma);
+
+/* Provide a simple dummy VMA/anon_vma dummy setup for testing. */
+void vma_set_dummy_anon_vma(struct vm_area_struct *vma,
+			    struct anon_vma_chain *avc);
+
+/* Helper function to specify a VMA's range. */
+void vma_set_range(struct vm_area_struct *vma,
+		   unsigned long start, unsigned long end,
+		   pgoff_t pgoff);
diff --git a/tools/testing/vma/tests/merge.c b/tools/testing/vma/tests/merge.c
new file mode 100644
index 000000000000..3708dc6945b0
--- /dev/null
+++ b/tools/testing/vma/tests/merge.c
@@ -0,0 +1,1469 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+/* Helper function which provides a wrapper around a merge new VMA operation. */
+static struct vm_area_struct *merge_new(struct vma_merge_struct *vmg)
+{
+	struct vm_area_struct *vma;
+	/*
+	 * For convenience, get prev and next VMAs. Which the new VMA operation
+	 * requires.
+	 */
+	vmg->next = vma_next(vmg->vmi);
+	vmg->prev = vma_prev(vmg->vmi);
+	vma_iter_next_range(vmg->vmi);
+
+	vma = vma_merge_new_range(vmg);
+	if (vma)
+		vma_assert_attached(vma);
+
+	return vma;
+}
+
+/*
+ * Helper function which provides a wrapper around the expansion of an existing
+ * VMA.
+ */
+static int expand_existing(struct vma_merge_struct *vmg)
+{
+	return vma_expand(vmg);
+}
+
+/*
+ * Helper function to reset merge state the associated VMA iterator to a
+ * specified new range.
+ */
+void vmg_set_range(struct vma_merge_struct *vmg, unsigned long start,
+		   unsigned long end, pgoff_t pgoff, vm_flags_t vm_flags)
+{
+	vma_iter_set(vmg->vmi, start);
+
+	vmg->prev = NULL;
+	vmg->middle = NULL;
+	vmg->next = NULL;
+	vmg->target = NULL;
+
+	vmg->start = start;
+	vmg->end = end;
+	vmg->pgoff = pgoff;
+	vmg->vm_flags = vm_flags;
+
+	vmg->just_expand = false;
+	vmg->__remove_middle = false;
+	vmg->__remove_next = false;
+	vmg->__adjust_middle_start = false;
+	vmg->__adjust_next_start = false;
+}
+
+/* Helper function to set both the VMG range and its anon_vma. */
+static void vmg_set_range_anon_vma(struct vma_merge_struct *vmg, unsigned long start,
+		unsigned long end, pgoff_t pgoff, vm_flags_t vm_flags,
+		struct anon_vma *anon_vma)
+{
+	vmg_set_range(vmg, start, end, pgoff, vm_flags);
+	vmg->anon_vma = anon_vma;
+}
+
+/*
+ * Helper function to try to merge a new VMA.
+ *
+ * Update vmg and the iterator for it and try to merge, otherwise allocate a new
+ * VMA, link it to the maple tree and return it.
+ */
+static struct vm_area_struct *try_merge_new_vma(struct mm_struct *mm,
+		struct vma_merge_struct *vmg, unsigned long start,
+		unsigned long end, pgoff_t pgoff, vm_flags_t vm_flags,
+		bool *was_merged)
+{
+	struct vm_area_struct *merged;
+
+	vmg_set_range(vmg, start, end, pgoff, vm_flags);
+
+	merged = merge_new(vmg);
+	if (merged) {
+		*was_merged = true;
+		ASSERT_EQ(vmg->state, VMA_MERGE_SUCCESS);
+		return merged;
+	}
+
+	*was_merged = false;
+
+	ASSERT_EQ(vmg->state, VMA_MERGE_NOMERGE);
+
+	return alloc_and_link_vma(mm, start, end, pgoff, vm_flags);
+}
+
+static bool test_simple_merge(void)
+{
+	struct vm_area_struct *vma;
+	vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
+	struct mm_struct mm = {};
+	struct vm_area_struct *vma_left = alloc_vma(&mm, 0, 0x1000, 0, vm_flags);
+	struct vm_area_struct *vma_right = alloc_vma(&mm, 0x2000, 0x3000, 2, vm_flags);
+	VMA_ITERATOR(vmi, &mm, 0x1000);
+	struct vma_merge_struct vmg = {
+		.mm = &mm,
+		.vmi = &vmi,
+		.start = 0x1000,
+		.end = 0x2000,
+		.vm_flags = vm_flags,
+		.pgoff = 1,
+	};
+
+	ASSERT_FALSE(attach_vma(&mm, vma_left));
+	ASSERT_FALSE(attach_vma(&mm, vma_right));
+
+	vma = merge_new(&vmg);
+	ASSERT_NE(vma, NULL);
+
+	ASSERT_EQ(vma->vm_start, 0);
+	ASSERT_EQ(vma->vm_end, 0x3000);
+	ASSERT_EQ(vma->vm_pgoff, 0);
+	ASSERT_EQ(vma->vm_flags, vm_flags);
+
+	detach_free_vma(vma);
+	mtree_destroy(&mm.mm_mt);
+
+	return true;
+}
+
+static bool test_simple_modify(void)
+{
+	struct vm_area_struct *vma;
+	vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
+	struct mm_struct mm = {};
+	struct vm_area_struct *init_vma = alloc_vma(&mm, 0, 0x3000, 0, vm_flags);
+	VMA_ITERATOR(vmi, &mm, 0x1000);
+	vm_flags_t flags = VM_READ | VM_MAYREAD;
+
+	ASSERT_FALSE(attach_vma(&mm, init_vma));
+
+	/*
+	 * The flags will not be changed, the vma_modify_flags() function
+	 * performs the merge/split only.
+	 */
+	vma = vma_modify_flags(&vmi, init_vma, init_vma,
+			       0x1000, 0x2000, &flags);
+	ASSERT_NE(vma, NULL);
+	/* We modify the provided VMA, and on split allocate new VMAs. */
+	ASSERT_EQ(vma, init_vma);
+
+	ASSERT_EQ(vma->vm_start, 0x1000);
+	ASSERT_EQ(vma->vm_end, 0x2000);
+	ASSERT_EQ(vma->vm_pgoff, 1);
+
+	/*
+	 * Now walk through the three split VMAs and make sure they are as
+	 * expected.
+	 */
+
+	vma_iter_set(&vmi, 0);
+	vma = vma_iter_load(&vmi);
+
+	ASSERT_EQ(vma->vm_start, 0);
+	ASSERT_EQ(vma->vm_end, 0x1000);
+	ASSERT_EQ(vma->vm_pgoff, 0);
+
+	detach_free_vma(vma);
+	vma_iter_clear(&vmi);
+
+	vma = vma_next(&vmi);
+
+	ASSERT_EQ(vma->vm_start, 0x1000);
+	ASSERT_EQ(vma->vm_end, 0x2000);
+	ASSERT_EQ(vma->vm_pgoff, 1);
+
+	detach_free_vma(vma);
+	vma_iter_clear(&vmi);
+
+	vma = vma_next(&vmi);
+
+	ASSERT_EQ(vma->vm_start, 0x2000);
+	ASSERT_EQ(vma->vm_end, 0x3000);
+	ASSERT_EQ(vma->vm_pgoff, 2);
+
+	detach_free_vma(vma);
+	mtree_destroy(&mm.mm_mt);
+
+	return true;
+}
+
+static bool test_simple_expand(void)
+{
+	vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
+	struct mm_struct mm = {};
+	struct vm_area_struct *vma = alloc_vma(&mm, 0, 0x1000, 0, vm_flags);
+	VMA_ITERATOR(vmi, &mm, 0);
+	struct vma_merge_struct vmg = {
+		.vmi = &vmi,
+		.target = vma,
+		.start = 0,
+		.end = 0x3000,
+		.pgoff = 0,
+	};
+
+	ASSERT_FALSE(attach_vma(&mm, vma));
+
+	ASSERT_FALSE(expand_existing(&vmg));
+
+	ASSERT_EQ(vma->vm_start, 0);
+	ASSERT_EQ(vma->vm_end, 0x3000);
+	ASSERT_EQ(vma->vm_pgoff, 0);
+
+	detach_free_vma(vma);
+	mtree_destroy(&mm.mm_mt);
+
+	return true;
+}
+
+static bool test_simple_shrink(void)
+{
+	vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
+	struct mm_struct mm = {};
+	struct vm_area_struct *vma = alloc_vma(&mm, 0, 0x3000, 0, vm_flags);
+	VMA_ITERATOR(vmi, &mm, 0);
+
+	ASSERT_FALSE(attach_vma(&mm, vma));
+
+	ASSERT_FALSE(vma_shrink(&vmi, vma, 0, 0x1000, 0));
+
+	ASSERT_EQ(vma->vm_start, 0);
+	ASSERT_EQ(vma->vm_end, 0x1000);
+	ASSERT_EQ(vma->vm_pgoff, 0);
+
+	detach_free_vma(vma);
+	mtree_destroy(&mm.mm_mt);
+
+	return true;
+}
+
+static bool __test_merge_new(bool is_sticky, bool a_is_sticky, bool b_is_sticky, bool c_is_sticky)
+{
+	vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
+	struct mm_struct mm = {};
+	VMA_ITERATOR(vmi, &mm, 0);
+	struct vma_merge_struct vmg = {
+		.mm = &mm,
+		.vmi = &vmi,
+	};
+	struct anon_vma_chain dummy_anon_vma_chain_a = {
+		.anon_vma = &dummy_anon_vma,
+	};
+	struct anon_vma_chain dummy_anon_vma_chain_b = {
+		.anon_vma = &dummy_anon_vma,
+	};
+	struct anon_vma_chain dummy_anon_vma_chain_c = {
+		.anon_vma = &dummy_anon_vma,
+	};
+	struct anon_vma_chain dummy_anon_vma_chain_d = {
+		.anon_vma = &dummy_anon_vma,
+	};
+	const struct vm_operations_struct vm_ops = {
+		.close = dummy_close,
+	};
+	int count;
+	struct vm_area_struct *vma, *vma_a, *vma_b, *vma_c, *vma_d;
+	bool merged;
+
+	if (is_sticky)
+		vm_flags |= VM_STICKY;
+
+	/*
+	 * 0123456789abc
+	 * AA B       CC
+	 */
+	vma_a = alloc_and_link_vma(&mm, 0, 0x2000, 0, vm_flags);
+	ASSERT_NE(vma_a, NULL);
+	if (a_is_sticky)
+		vm_flags_set(vma_a, VM_STICKY);
+	/* We give each VMA a single avc so we can test anon_vma duplication. */
+	INIT_LIST_HEAD(&vma_a->anon_vma_chain);
+	list_add(&dummy_anon_vma_chain_a.same_vma, &vma_a->anon_vma_chain);
+
+	vma_b = alloc_and_link_vma(&mm, 0x3000, 0x4000, 3, vm_flags);
+	ASSERT_NE(vma_b, NULL);
+	if (b_is_sticky)
+		vm_flags_set(vma_b, VM_STICKY);
+	INIT_LIST_HEAD(&vma_b->anon_vma_chain);
+	list_add(&dummy_anon_vma_chain_b.same_vma, &vma_b->anon_vma_chain);
+
+	vma_c = alloc_and_link_vma(&mm, 0xb000, 0xc000, 0xb, vm_flags);
+	ASSERT_NE(vma_c, NULL);
+	if (c_is_sticky)
+		vm_flags_set(vma_c, VM_STICKY);
+	INIT_LIST_HEAD(&vma_c->anon_vma_chain);
+	list_add(&dummy_anon_vma_chain_c.same_vma, &vma_c->anon_vma_chain);
+
+	/*
+	 * NO merge.
+	 *
+	 * 0123456789abc
+	 * AA B   **  CC
+	 */
+	vma_d = try_merge_new_vma(&mm, &vmg, 0x7000, 0x9000, 7, vm_flags, &merged);
+	ASSERT_NE(vma_d, NULL);
+	INIT_LIST_HEAD(&vma_d->anon_vma_chain);
+	list_add(&dummy_anon_vma_chain_d.same_vma, &vma_d->anon_vma_chain);
+	ASSERT_FALSE(merged);
+	ASSERT_EQ(mm.map_count, 4);
+
+	/*
+	 * Merge BOTH sides.
+	 *
+	 * 0123456789abc
+	 * AA*B   DD  CC
+	 */
+	vma_a->vm_ops = &vm_ops; /* This should have no impact. */
+	vma_b->anon_vma = &dummy_anon_vma;
+	vma = try_merge_new_vma(&mm, &vmg, 0x2000, 0x3000, 2, vm_flags, &merged);
+	ASSERT_EQ(vma, vma_a);
+	/* Merge with A, delete B. */
+	ASSERT_TRUE(merged);
+	ASSERT_EQ(vma->vm_start, 0);
+	ASSERT_EQ(vma->vm_end, 0x4000);
+	ASSERT_EQ(vma->vm_pgoff, 0);
+	ASSERT_EQ(vma->anon_vma, &dummy_anon_vma);
+	ASSERT_TRUE(vma_write_started(vma));
+	ASSERT_EQ(mm.map_count, 3);
+	if (is_sticky || a_is_sticky || b_is_sticky)
+		ASSERT_TRUE(IS_SET(vma->vm_flags, VM_STICKY));
+
+	/*
+	 * Merge to PREVIOUS VMA.
+	 *
+	 * 0123456789abc
+	 * AAAA*  DD  CC
+	 */
+	vma = try_merge_new_vma(&mm, &vmg, 0x4000, 0x5000, 4, vm_flags, &merged);
+	ASSERT_EQ(vma, vma_a);
+	/* Extend A. */
+	ASSERT_TRUE(merged);
+	ASSERT_EQ(vma->vm_start, 0);
+	ASSERT_EQ(vma->vm_end, 0x5000);
+	ASSERT_EQ(vma->vm_pgoff, 0);
+	ASSERT_EQ(vma->anon_vma, &dummy_anon_vma);
+	ASSERT_TRUE(vma_write_started(vma));
+	ASSERT_EQ(mm.map_count, 3);
+	if (is_sticky || a_is_sticky)
+		ASSERT_TRUE(IS_SET(vma->vm_flags, VM_STICKY));
+
+	/*
+	 * Merge to NEXT VMA.
+	 *
+	 * 0123456789abc
+	 * AAAAA *DD  CC
+	 */
+	vma_d->anon_vma = &dummy_anon_vma;
+	vma_d->vm_ops = &vm_ops; /* This should have no impact. */
+	vma = try_merge_new_vma(&mm, &vmg, 0x6000, 0x7000, 6, vm_flags, &merged);
+	ASSERT_EQ(vma, vma_d);
+	/* Prepend. */
+	ASSERT_TRUE(merged);
+	ASSERT_EQ(vma->vm_start, 0x6000);
+	ASSERT_EQ(vma->vm_end, 0x9000);
+	ASSERT_EQ(vma->vm_pgoff, 6);
+	ASSERT_EQ(vma->anon_vma, &dummy_anon_vma);
+	ASSERT_TRUE(vma_write_started(vma));
+	ASSERT_EQ(mm.map_count, 3);
+	if (is_sticky) /* D uses is_sticky. */
+		ASSERT_TRUE(IS_SET(vma->vm_flags, VM_STICKY));
+
+	/*
+	 * Merge BOTH sides.
+	 *
+	 * 0123456789abc
+	 * AAAAA*DDD  CC
+	 */
+	vma_d->vm_ops = NULL; /* This would otherwise degrade the merge. */
+	vma = try_merge_new_vma(&mm, &vmg, 0x5000, 0x6000, 5, vm_flags, &merged);
+	ASSERT_EQ(vma, vma_a);
+	/* Merge with A, delete D. */
+	ASSERT_TRUE(merged);
+	ASSERT_EQ(vma->vm_start, 0);
+	ASSERT_EQ(vma->vm_end, 0x9000);
+	ASSERT_EQ(vma->vm_pgoff, 0);
+	ASSERT_EQ(vma->anon_vma, &dummy_anon_vma);
+	ASSERT_TRUE(vma_write_started(vma));
+	ASSERT_EQ(mm.map_count, 2);
+	if (is_sticky || a_is_sticky)
+		ASSERT_TRUE(IS_SET(vma->vm_flags, VM_STICKY));
+
+	/*
+	 * Merge to NEXT VMA.
+	 *
+	 * 0123456789abc
+	 * AAAAAAAAA *CC
+	 */
+	vma_c->anon_vma = &dummy_anon_vma;
+	vma = try_merge_new_vma(&mm, &vmg, 0xa000, 0xb000, 0xa, vm_flags, &merged);
+	ASSERT_EQ(vma, vma_c);
+	/* Prepend C. */
+	ASSERT_TRUE(merged);
+	ASSERT_EQ(vma->vm_start, 0xa000);
+	ASSERT_EQ(vma->vm_end, 0xc000);
+	ASSERT_EQ(vma->vm_pgoff, 0xa);
+	ASSERT_EQ(vma->anon_vma, &dummy_anon_vma);
+	ASSERT_TRUE(vma_write_started(vma));
+	ASSERT_EQ(mm.map_count, 2);
+	if (is_sticky || c_is_sticky)
+		ASSERT_TRUE(IS_SET(vma->vm_flags, VM_STICKY));
+
+	/*
+	 * Merge BOTH sides.
+	 *
+	 * 0123456789abc
+	 * AAAAAAAAA*CCC
+	 */
+	vma = try_merge_new_vma(&mm, &vmg, 0x9000, 0xa000, 0x9, vm_flags, &merged);
+	ASSERT_EQ(vma, vma_a);
+	/* Extend A and delete C. */
+	ASSERT_TRUE(merged);
+	ASSERT_EQ(vma->vm_start, 0);
+	ASSERT_EQ(vma->vm_end, 0xc000);
+	ASSERT_EQ(vma->vm_pgoff, 0);
+	ASSERT_EQ(vma->anon_vma, &dummy_anon_vma);
+	ASSERT_TRUE(vma_write_started(vma));
+	ASSERT_EQ(mm.map_count, 1);
+	if (is_sticky || a_is_sticky || c_is_sticky)
+		ASSERT_TRUE(IS_SET(vma->vm_flags, VM_STICKY));
+
+	/*
+	 * Final state.
+	 *
+	 * 0123456789abc
+	 * AAAAAAAAAAAAA
+	 */
+
+	count = 0;
+	vma_iter_set(&vmi, 0);
+	for_each_vma(vmi, vma) {
+		ASSERT_NE(vma, NULL);
+		ASSERT_EQ(vma->vm_start, 0);
+		ASSERT_EQ(vma->vm_end, 0xc000);
+		ASSERT_EQ(vma->vm_pgoff, 0);
+		ASSERT_EQ(vma->anon_vma, &dummy_anon_vma);
+
+		detach_free_vma(vma);
+		count++;
+	}
+
+	/* Should only have one VMA left (though freed) after all is done.*/
+	ASSERT_EQ(count, 1);
+
+	mtree_destroy(&mm.mm_mt);
+	return true;
+}
+
+static bool test_merge_new(void)
+{
+	int i, j, k, l;
+
+	/* Generate every possible permutation of sticky flags. */
+	for (i = 0; i < 2; i++)
+		for (j = 0; j < 2; j++)
+			for (k = 0; k < 2; k++)
+				for (l = 0; l < 2; l++)
+					ASSERT_TRUE(__test_merge_new(i, j, k, l));
+
+	return true;
+}
+
+static bool test_vma_merge_special_flags(void)
+{
+	vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
+	struct mm_struct mm = {};
+	VMA_ITERATOR(vmi, &mm, 0);
+	struct vma_merge_struct vmg = {
+		.mm = &mm,
+		.vmi = &vmi,
+	};
+	vm_flags_t special_flags[] = { VM_IO, VM_DONTEXPAND, VM_PFNMAP, VM_MIXEDMAP };
+	vm_flags_t all_special_flags = 0;
+	int i;
+	struct vm_area_struct *vma_left, *vma;
+
+	/* Make sure there aren't new VM_SPECIAL flags. */
+	for (i = 0; i < ARRAY_SIZE(special_flags); i++) {
+		all_special_flags |= special_flags[i];
+	}
+	ASSERT_EQ(all_special_flags, VM_SPECIAL);
+
+	/*
+	 * 01234
+	 * AAA
+	 */
+	vma_left = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags);
+	ASSERT_NE(vma_left, NULL);
+
+	/* 1. Set up new VMA with special flag that would otherwise merge. */
+
+	/*
+	 * 01234
+	 * AAA*
+	 *
+	 * This should merge if not for the VM_SPECIAL flag.
+	 */
+	vmg_set_range(&vmg, 0x3000, 0x4000, 3, vm_flags);
+	for (i = 0; i < ARRAY_SIZE(special_flags); i++) {
+		vm_flags_t special_flag = special_flags[i];
+
+		vm_flags_reset(vma_left, vm_flags | special_flag);
+		vmg.vm_flags = vm_flags | special_flag;
+		vma = merge_new(&vmg);
+		ASSERT_EQ(vma, NULL);
+		ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE);
+	}
+
+	/* 2. Modify VMA with special flag that would otherwise merge. */
+
+	/*
+	 * 01234
+	 * AAAB
+	 *
+	 * Create a VMA to modify.
+	 */
+	vma = alloc_and_link_vma(&mm, 0x3000, 0x4000, 3, vm_flags);
+	ASSERT_NE(vma, NULL);
+	vmg.middle = vma;
+
+	for (i = 0; i < ARRAY_SIZE(special_flags); i++) {
+		vm_flags_t special_flag = special_flags[i];
+
+		vm_flags_reset(vma_left, vm_flags | special_flag);
+		vmg.vm_flags = vm_flags | special_flag;
+		vma = merge_existing(&vmg);
+		ASSERT_EQ(vma, NULL);
+		ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE);
+	}
+
+	cleanup_mm(&mm, &vmi);
+	return true;
+}
+
+static bool test_vma_merge_with_close(void)
+{
+	vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
+	struct mm_struct mm = {};
+	VMA_ITERATOR(vmi, &mm, 0);
+	struct vma_merge_struct vmg = {
+		.mm = &mm,
+		.vmi = &vmi,
+	};
+	const struct vm_operations_struct vm_ops = {
+		.close = dummy_close,
+	};
+	struct vm_area_struct *vma_prev, *vma_next, *vma;
+
+	/*
+	 * When merging VMAs we are not permitted to remove any VMA that has a
+	 * vm_ops->close() hook.
+	 *
+	 * Considering the two possible adjacent VMAs to which a VMA can be
+	 * merged:
+	 *
+	 * [ prev ][ vma ][ next ]
+	 *
+	 * In no case will we need to delete prev. If the operation is
+	 * mergeable, then prev will be extended with one or both of vma and
+	 * next deleted.
+	 *
+	 * As a result, during initial mergeability checks, only
+	 * can_vma_merge_before() (which implies the VMA being merged with is
+	 * 'next' as shown above) bothers to check to see whether the next VMA
+	 * has a vm_ops->close() callback that will need to be called when
+	 * removed.
+	 *
+	 * If it does, then we cannot merge as the resources that the close()
+	 * operation potentially clears down are tied only to the existing VMA
+	 * range and we have no way of extending those to the nearly merged one.
+	 *
+	 * We must consider two scenarios:
+	 *
+	 * A.
+	 *
+	 * vm_ops->close:     -       -    !NULL
+	 *                 [ prev ][ vma ][ next ]
+	 *
+	 * Where prev may or may not be present/mergeable.
+	 *
+	 * This is picked up by a specific check in can_vma_merge_before().
+	 *
+	 * B.
+	 *
+	 * vm_ops->close:     -     !NULL
+	 *                 [ prev ][ vma ]
+	 *
+	 * Where prev and vma are present and mergeable.
+	 *
+	 * This is picked up by a specific check in the modified VMA merge.
+	 *
+	 * IMPORTANT NOTE: We make the assumption that the following case:
+	 *
+	 *    -     !NULL   NULL
+	 * [ prev ][ vma ][ next ]
+	 *
+	 * Cannot occur, because vma->vm_ops being the same implies the same
+	 * vma->vm_file, and therefore this would mean that next->vm_ops->close
+	 * would be set too, and thus scenario A would pick this up.
+	 */
+
+	/*
+	 * The only case of a new VMA merge that results in a VMA being deleted
+	 * is one where both the previous and next VMAs are merged - in this
+	 * instance the next VMA is deleted, and the previous VMA is extended.
+	 *
+	 * If we are unable to do so, we reduce the operation to simply
+	 * extending the prev VMA and not merging next.
+	 *
+	 * 0123456789
+	 * PPP**NNNN
+	 *             ->
+	 * 0123456789
+	 * PPPPPPNNN
+	 */
+
+	vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags);
+	vma_next = alloc_and_link_vma(&mm, 0x5000, 0x9000, 5, vm_flags);
+	vma_next->vm_ops = &vm_ops;
+
+	vmg_set_range(&vmg, 0x3000, 0x5000, 3, vm_flags);
+	ASSERT_EQ(merge_new(&vmg), vma_prev);
+	ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS);
+	ASSERT_EQ(vma_prev->vm_start, 0);
+	ASSERT_EQ(vma_prev->vm_end, 0x5000);
+	ASSERT_EQ(vma_prev->vm_pgoff, 0);
+
+	ASSERT_EQ(cleanup_mm(&mm, &vmi), 2);
+
+	/*
+	 * When modifying an existing VMA there are further cases where we
+	 * delete VMAs.
+	 *
+	 *    <>
+	 * 0123456789
+	 * PPPVV
+	 *
+	 * In this instance, if vma has a close hook, the merge simply cannot
+	 * proceed.
+	 */
+
+	vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags);
+	vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, vm_flags);
+	vma->vm_ops = &vm_ops;
+
+	vmg_set_range(&vmg, 0x3000, 0x5000, 3, vm_flags);
+	vmg.prev = vma_prev;
+	vmg.middle = vma;
+
+	/*
+	 * The VMA being modified in a way that would otherwise merge should
+	 * also fail.
+	 */
+	ASSERT_EQ(merge_existing(&vmg), NULL);
+	ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE);
+
+	ASSERT_EQ(cleanup_mm(&mm, &vmi), 2);
+
+	/*
+	 * This case is mirrored if merging with next.
+	 *
+	 *    <>
+	 * 0123456789
+	 *    VVNNNN
+	 *
+	 * In this instance, if vma has a close hook, the merge simply cannot
+	 * proceed.
+	 */
+
+	vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, vm_flags);
+	vma_next = alloc_and_link_vma(&mm, 0x5000, 0x9000, 5, vm_flags);
+	vma->vm_ops = &vm_ops;
+
+	vmg_set_range(&vmg, 0x3000, 0x5000, 3, vm_flags);
+	vmg.middle = vma;
+	ASSERT_EQ(merge_existing(&vmg), NULL);
+	/*
+	 * Initially this is misapprehended as an out of memory report, as the
+	 * close() check is handled in the same way as anon_vma duplication
+	 * failures, however a subsequent patch resolves this.
+	 */
+	ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE);
+
+	ASSERT_EQ(cleanup_mm(&mm, &vmi), 2);
+
+	/*
+	 * Finally, we consider two variants of the case where we modify a VMA
+	 * to merge with both the previous and next VMAs.
+	 *
+	 * The first variant is where vma has a close hook. In this instance, no
+	 * merge can proceed.
+	 *
+	 *    <>
+	 * 0123456789
+	 * PPPVVNNNN
+	 */
+
+	vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags);
+	vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, vm_flags);
+	vma_next = alloc_and_link_vma(&mm, 0x5000, 0x9000, 5, vm_flags);
+	vma->vm_ops = &vm_ops;
+
+	vmg_set_range(&vmg, 0x3000, 0x5000, 3, vm_flags);
+	vmg.prev = vma_prev;
+	vmg.middle = vma;
+
+	ASSERT_EQ(merge_existing(&vmg), NULL);
+	ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE);
+
+	ASSERT_EQ(cleanup_mm(&mm, &vmi), 3);
+
+	/*
+	 * The second variant is where next has a close hook. In this instance,
+	 * we reduce the operation to a merge between prev and vma.
+	 *
+	 *    <>
+	 * 0123456789
+	 * PPPVVNNNN
+	 *            ->
+	 * 0123456789
+	 * PPPPPNNNN
+	 */
+
+	vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags);
+	vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, vm_flags);
+	vma_next = alloc_and_link_vma(&mm, 0x5000, 0x9000, 5, vm_flags);
+	vma_next->vm_ops = &vm_ops;
+
+	vmg_set_range(&vmg, 0x3000, 0x5000, 3, vm_flags);
+	vmg.prev = vma_prev;
+	vmg.middle = vma;
+
+	ASSERT_EQ(merge_existing(&vmg), vma_prev);
+	ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS);
+	ASSERT_EQ(vma_prev->vm_start, 0);
+	ASSERT_EQ(vma_prev->vm_end, 0x5000);
+	ASSERT_EQ(vma_prev->vm_pgoff, 0);
+
+	ASSERT_EQ(cleanup_mm(&mm, &vmi), 2);
+
+	return true;
+}
+
+static bool test_vma_merge_new_with_close(void)
+{
+	vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
+	struct mm_struct mm = {};
+	VMA_ITERATOR(vmi, &mm, 0);
+	struct vma_merge_struct vmg = {
+		.mm = &mm,
+		.vmi = &vmi,
+	};
+	struct vm_area_struct *vma_prev = alloc_and_link_vma(&mm, 0, 0x2000, 0, vm_flags);
+	struct vm_area_struct *vma_next = alloc_and_link_vma(&mm, 0x5000, 0x7000, 5, vm_flags);
+	const struct vm_operations_struct vm_ops = {
+		.close = dummy_close,
+	};
+	struct vm_area_struct *vma;
+
+	/*
+	 * We should allow the partial merge of a proposed new VMA if the
+	 * surrounding VMAs have vm_ops->close() hooks (but are otherwise
+	 * compatible), e.g.:
+	 *
+	 *        New VMA
+	 *    A  v-------v  B
+	 * |-----|       |-----|
+	 *  close         close
+	 *
+	 * Since the rule is to not DELETE a VMA with a close operation, this
+	 * should be permitted, only rather than expanding A and deleting B, we
+	 * should simply expand A and leave B intact, e.g.:
+	 *
+	 *        New VMA
+	 *       A          B
+	 * |------------||-----|
+	 *  close         close
+	 */
+
+	/* Have prev and next have a vm_ops->close() hook. */
+	vma_prev->vm_ops = &vm_ops;
+	vma_next->vm_ops = &vm_ops;
+
+	vmg_set_range(&vmg, 0x2000, 0x5000, 2, vm_flags);
+	vma = merge_new(&vmg);
+	ASSERT_NE(vma, NULL);
+	ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS);
+	ASSERT_EQ(vma->vm_start, 0);
+	ASSERT_EQ(vma->vm_end, 0x5000);
+	ASSERT_EQ(vma->vm_pgoff, 0);
+	ASSERT_EQ(vma->vm_ops, &vm_ops);
+	ASSERT_TRUE(vma_write_started(vma));
+	ASSERT_EQ(mm.map_count, 2);
+
+	cleanup_mm(&mm, &vmi);
+	return true;
+}
+
+static bool __test_merge_existing(bool prev_is_sticky, bool middle_is_sticky, bool next_is_sticky)
+{
+	vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
+	vm_flags_t prev_flags = vm_flags;
+	vm_flags_t next_flags = vm_flags;
+	struct mm_struct mm = {};
+	VMA_ITERATOR(vmi, &mm, 0);
+	struct vm_area_struct *vma, *vma_prev, *vma_next;
+	struct vma_merge_struct vmg = {
+		.mm = &mm,
+		.vmi = &vmi,
+	};
+	const struct vm_operations_struct vm_ops = {
+		.close = dummy_close,
+	};
+	struct anon_vma_chain avc = {};
+
+	if (prev_is_sticky)
+		prev_flags |= VM_STICKY;
+	if (middle_is_sticky)
+		vm_flags |= VM_STICKY;
+	if (next_is_sticky)
+		next_flags |= VM_STICKY;
+
+	/*
+	 * Merge right case - partial span.
+	 *
+	 *    <->
+	 * 0123456789
+	 *   VVVVNNN
+	 *            ->
+	 * 0123456789
+	 *   VNNNNNN
+	 */
+	vma = alloc_and_link_vma(&mm, 0x2000, 0x6000, 2, vm_flags);
+	vma->vm_ops = &vm_ops; /* This should have no impact. */
+	vma_next = alloc_and_link_vma(&mm, 0x6000, 0x9000, 6, next_flags);
+	vma_next->vm_ops = &vm_ops; /* This should have no impact. */
+	vmg_set_range_anon_vma(&vmg, 0x3000, 0x6000, 3, vm_flags, &dummy_anon_vma);
+	vmg.middle = vma;
+	vmg.prev = vma;
+	vma_set_dummy_anon_vma(vma, &avc);
+	ASSERT_EQ(merge_existing(&vmg), vma_next);
+	ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS);
+	ASSERT_EQ(vma_next->vm_start, 0x3000);
+	ASSERT_EQ(vma_next->vm_end, 0x9000);
+	ASSERT_EQ(vma_next->vm_pgoff, 3);
+	ASSERT_EQ(vma_next->anon_vma, &dummy_anon_vma);
+	ASSERT_EQ(vma->vm_start, 0x2000);
+	ASSERT_EQ(vma->vm_end, 0x3000);
+	ASSERT_EQ(vma->vm_pgoff, 2);
+	ASSERT_TRUE(vma_write_started(vma));
+	ASSERT_TRUE(vma_write_started(vma_next));
+	ASSERT_EQ(mm.map_count, 2);
+	if (middle_is_sticky || next_is_sticky)
+		ASSERT_TRUE(IS_SET(vma_next->vm_flags, VM_STICKY));
+
+	/* Clear down and reset. */
+	ASSERT_EQ(cleanup_mm(&mm, &vmi), 2);
+
+	/*
+	 * Merge right case - full span.
+	 *
+	 *   <-->
+	 * 0123456789
+	 *   VVVVNNN
+	 *            ->
+	 * 0123456789
+	 *   NNNNNNN
+	 */
+	vma = alloc_and_link_vma(&mm, 0x2000, 0x6000, 2, vm_flags);
+	vma_next = alloc_and_link_vma(&mm, 0x6000, 0x9000, 6, next_flags);
+	vma_next->vm_ops = &vm_ops; /* This should have no impact. */
+	vmg_set_range_anon_vma(&vmg, 0x2000, 0x6000, 2, vm_flags, &dummy_anon_vma);
+	vmg.middle = vma;
+	vma_set_dummy_anon_vma(vma, &avc);
+	ASSERT_EQ(merge_existing(&vmg), vma_next);
+	ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS);
+	ASSERT_EQ(vma_next->vm_start, 0x2000);
+	ASSERT_EQ(vma_next->vm_end, 0x9000);
+	ASSERT_EQ(vma_next->vm_pgoff, 2);
+	ASSERT_EQ(vma_next->anon_vma, &dummy_anon_vma);
+	ASSERT_TRUE(vma_write_started(vma_next));
+	ASSERT_EQ(mm.map_count, 1);
+	if (middle_is_sticky || next_is_sticky)
+		ASSERT_TRUE(IS_SET(vma_next->vm_flags, VM_STICKY));
+
+	/* Clear down and reset. We should have deleted vma. */
+	ASSERT_EQ(cleanup_mm(&mm, &vmi), 1);
+
+	/*
+	 * Merge left case - partial span.
+	 *
+	 *    <->
+	 * 0123456789
+	 * PPPVVVV
+	 *            ->
+	 * 0123456789
+	 * PPPPPPV
+	 */
+	vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, prev_flags);
+	vma_prev->vm_ops = &vm_ops; /* This should have no impact. */
+	vma = alloc_and_link_vma(&mm, 0x3000, 0x7000, 3, vm_flags);
+	vma->vm_ops = &vm_ops; /* This should have no impact. */
+	vmg_set_range_anon_vma(&vmg, 0x3000, 0x6000, 3, vm_flags, &dummy_anon_vma);
+	vmg.prev = vma_prev;
+	vmg.middle = vma;
+	vma_set_dummy_anon_vma(vma, &avc);
+	ASSERT_EQ(merge_existing(&vmg), vma_prev);
+	ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS);
+	ASSERT_EQ(vma_prev->vm_start, 0);
+	ASSERT_EQ(vma_prev->vm_end, 0x6000);
+	ASSERT_EQ(vma_prev->vm_pgoff, 0);
+	ASSERT_EQ(vma_prev->anon_vma, &dummy_anon_vma);
+	ASSERT_EQ(vma->vm_start, 0x6000);
+	ASSERT_EQ(vma->vm_end, 0x7000);
+	ASSERT_EQ(vma->vm_pgoff, 6);
+	ASSERT_TRUE(vma_write_started(vma_prev));
+	ASSERT_TRUE(vma_write_started(vma));
+	ASSERT_EQ(mm.map_count, 2);
+	if (prev_is_sticky || middle_is_sticky)
+		ASSERT_TRUE(IS_SET(vma_prev->vm_flags, VM_STICKY));
+
+	/* Clear down and reset. */
+	ASSERT_EQ(cleanup_mm(&mm, &vmi), 2);
+
+	/*
+	 * Merge left case - full span.
+	 *
+	 *    <-->
+	 * 0123456789
+	 * PPPVVVV
+	 *            ->
+	 * 0123456789
+	 * PPPPPPP
+	 */
+	vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, prev_flags);
+	vma_prev->vm_ops = &vm_ops; /* This should have no impact. */
+	vma = alloc_and_link_vma(&mm, 0x3000, 0x7000, 3, vm_flags);
+	vmg_set_range_anon_vma(&vmg, 0x3000, 0x7000, 3, vm_flags, &dummy_anon_vma);
+	vmg.prev = vma_prev;
+	vmg.middle = vma;
+	vma_set_dummy_anon_vma(vma, &avc);
+	ASSERT_EQ(merge_existing(&vmg), vma_prev);
+	ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS);
+	ASSERT_EQ(vma_prev->vm_start, 0);
+	ASSERT_EQ(vma_prev->vm_end, 0x7000);
+	ASSERT_EQ(vma_prev->vm_pgoff, 0);
+	ASSERT_EQ(vma_prev->anon_vma, &dummy_anon_vma);
+	ASSERT_TRUE(vma_write_started(vma_prev));
+	ASSERT_EQ(mm.map_count, 1);
+	if (prev_is_sticky || middle_is_sticky)
+		ASSERT_TRUE(IS_SET(vma_prev->vm_flags, VM_STICKY));
+
+	/* Clear down and reset. We should have deleted vma. */
+	ASSERT_EQ(cleanup_mm(&mm, &vmi), 1);
+
+	/*
+	 * Merge both case.
+	 *
+	 *    <-->
+	 * 0123456789
+	 * PPPVVVVNNN
+	 *             ->
+	 * 0123456789
+	 * PPPPPPPPPP
+	 */
+	vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, prev_flags);
+	vma_prev->vm_ops = &vm_ops; /* This should have no impact. */
+	vma = alloc_and_link_vma(&mm, 0x3000, 0x7000, 3, vm_flags);
+	vma_next = alloc_and_link_vma(&mm, 0x7000, 0x9000, 7, next_flags);
+	vmg_set_range_anon_vma(&vmg, 0x3000, 0x7000, 3, vm_flags, &dummy_anon_vma);
+	vmg.prev = vma_prev;
+	vmg.middle = vma;
+	vma_set_dummy_anon_vma(vma, &avc);
+	ASSERT_EQ(merge_existing(&vmg), vma_prev);
+	ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS);
+	ASSERT_EQ(vma_prev->vm_start, 0);
+	ASSERT_EQ(vma_prev->vm_end, 0x9000);
+	ASSERT_EQ(vma_prev->vm_pgoff, 0);
+	ASSERT_EQ(vma_prev->anon_vma, &dummy_anon_vma);
+	ASSERT_TRUE(vma_write_started(vma_prev));
+	ASSERT_EQ(mm.map_count, 1);
+	if (prev_is_sticky || middle_is_sticky || next_is_sticky)
+		ASSERT_TRUE(IS_SET(vma_prev->vm_flags, VM_STICKY));
+
+	/* Clear down and reset. We should have deleted prev and next. */
+	ASSERT_EQ(cleanup_mm(&mm, &vmi), 1);
+
+	/*
+	 * Non-merge ranges. the modified VMA merge operation assumes that the
+	 * caller always specifies ranges within the input VMA so we need only
+	 * examine these cases.
+	 *
+	 *     -
+	 *      -
+	 *       -
+	 *     <->
+	 *     <>
+	 *      <>
+	 * 0123456789a
+	 * PPPVVVVVNNN
+	 */
+
+	vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, prev_flags);
+	vma = alloc_and_link_vma(&mm, 0x3000, 0x8000, 3, vm_flags);
+	vma_next = alloc_and_link_vma(&mm, 0x8000, 0xa000, 8, next_flags);
+
+	vmg_set_range(&vmg, 0x4000, 0x5000, 4, vm_flags);
+	vmg.prev = vma;
+	vmg.middle = vma;
+	ASSERT_EQ(merge_existing(&vmg), NULL);
+	ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE);
+
+	vmg_set_range(&vmg, 0x5000, 0x6000, 5, vm_flags);
+	vmg.prev = vma;
+	vmg.middle = vma;
+	ASSERT_EQ(merge_existing(&vmg), NULL);
+	ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE);
+
+	vmg_set_range(&vmg, 0x6000, 0x7000, 6, vm_flags);
+	vmg.prev = vma;
+	vmg.middle = vma;
+	ASSERT_EQ(merge_existing(&vmg), NULL);
+	ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE);
+
+	vmg_set_range(&vmg, 0x4000, 0x7000, 4, vm_flags);
+	vmg.prev = vma;
+	vmg.middle = vma;
+	ASSERT_EQ(merge_existing(&vmg), NULL);
+	ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE);
+
+	vmg_set_range(&vmg, 0x4000, 0x6000, 4, vm_flags);
+	vmg.prev = vma;
+	vmg.middle = vma;
+	ASSERT_EQ(merge_existing(&vmg), NULL);
+	ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE);
+
+	vmg_set_range(&vmg, 0x5000, 0x6000, 5, vm_flags);
+	vmg.prev = vma;
+	vmg.middle = vma;
+	ASSERT_EQ(merge_existing(&vmg), NULL);
+	ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE);
+
+	ASSERT_EQ(cleanup_mm(&mm, &vmi), 3);
+
+	return true;
+}
+
+static bool test_merge_existing(void)
+{
+	int i, j, k;
+
+	/* Generate every possible permutation of sticky flags. */
+	for (i = 0; i < 2; i++)
+		for (j = 0; j < 2; j++)
+			for (k = 0; k < 2; k++)
+				ASSERT_TRUE(__test_merge_existing(i, j, k));
+
+	return true;
+}
+
+static bool test_anon_vma_non_mergeable(void)
+{
+	vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
+	struct mm_struct mm = {};
+	VMA_ITERATOR(vmi, &mm, 0);
+	struct vm_area_struct *vma, *vma_prev, *vma_next;
+	struct vma_merge_struct vmg = {
+		.mm = &mm,
+		.vmi = &vmi,
+	};
+	struct anon_vma_chain dummy_anon_vma_chain_1 = {};
+	struct anon_vma_chain dummy_anon_vma_chain_2 = {};
+	struct anon_vma dummy_anon_vma_2;
+
+	/*
+	 * In the case of modified VMA merge, merging both left and right VMAs
+	 * but where prev and next have incompatible anon_vma objects, we revert
+	 * to a merge of prev and VMA:
+	 *
+	 *    <-->
+	 * 0123456789
+	 * PPPVVVVNNN
+	 *            ->
+	 * 0123456789
+	 * PPPPPPPNNN
+	 */
+	vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags);
+	vma = alloc_and_link_vma(&mm, 0x3000, 0x7000, 3, vm_flags);
+	vma_next = alloc_and_link_vma(&mm, 0x7000, 0x9000, 7, vm_flags);
+
+	/*
+	 * Give both prev and next single anon_vma_chain fields, so they will
+	 * merge with the NULL vmg->anon_vma.
+	 *
+	 * However, when prev is compared to next, the merge should fail.
+	 */
+	vmg_set_range_anon_vma(&vmg, 0x3000, 0x7000, 3, vm_flags, NULL);
+	vmg.prev = vma_prev;
+	vmg.middle = vma;
+	vma_set_dummy_anon_vma(vma_prev, &dummy_anon_vma_chain_1);
+	__vma_set_dummy_anon_vma(vma_next, &dummy_anon_vma_chain_2, &dummy_anon_vma_2);
+
+	ASSERT_EQ(merge_existing(&vmg), vma_prev);
+	ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS);
+	ASSERT_EQ(vma_prev->vm_start, 0);
+	ASSERT_EQ(vma_prev->vm_end, 0x7000);
+	ASSERT_EQ(vma_prev->vm_pgoff, 0);
+	ASSERT_TRUE(vma_write_started(vma_prev));
+	ASSERT_FALSE(vma_write_started(vma_next));
+
+	/* Clear down and reset. */
+	ASSERT_EQ(cleanup_mm(&mm, &vmi), 2);
+
+	/*
+	 * Now consider the new VMA case. This is equivalent, only adding a new
+	 * VMA in a gap between prev and next.
+	 *
+	 *    <-->
+	 * 0123456789
+	 * PPP****NNN
+	 *            ->
+	 * 0123456789
+	 * PPPPPPPNNN
+	 */
+	vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags);
+	vma_next = alloc_and_link_vma(&mm, 0x7000, 0x9000, 7, vm_flags);
+
+	vmg_set_range_anon_vma(&vmg, 0x3000, 0x7000, 3, vm_flags, NULL);
+	vmg.prev = vma_prev;
+	vma_set_dummy_anon_vma(vma_prev, &dummy_anon_vma_chain_1);
+	__vma_set_dummy_anon_vma(vma_next, &dummy_anon_vma_chain_2, &dummy_anon_vma_2);
+
+	vmg.anon_vma = NULL;
+	ASSERT_EQ(merge_new(&vmg), vma_prev);
+	ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS);
+	ASSERT_EQ(vma_prev->vm_start, 0);
+	ASSERT_EQ(vma_prev->vm_end, 0x7000);
+	ASSERT_EQ(vma_prev->vm_pgoff, 0);
+	ASSERT_TRUE(vma_write_started(vma_prev));
+	ASSERT_FALSE(vma_write_started(vma_next));
+
+	/* Final cleanup. */
+	ASSERT_EQ(cleanup_mm(&mm, &vmi), 2);
+
+	return true;
+}
+
+static bool test_dup_anon_vma(void)
+{
+	vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
+	struct mm_struct mm = {};
+	VMA_ITERATOR(vmi, &mm, 0);
+	struct vma_merge_struct vmg = {
+		.mm = &mm,
+		.vmi = &vmi,
+	};
+	struct anon_vma_chain dummy_anon_vma_chain = {
+		.anon_vma = &dummy_anon_vma,
+	};
+	struct vm_area_struct *vma_prev, *vma_next, *vma;
+
+	reset_dummy_anon_vma();
+
+	/*
+	 * Expanding a VMA delete the next one duplicates next's anon_vma and
+	 * assigns it to the expanded VMA.
+	 *
+	 * This covers new VMA merging, as these operations amount to a VMA
+	 * expand.
+	 */
+	vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags);
+	vma_next = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, vm_flags);
+	vma_next->anon_vma = &dummy_anon_vma;
+
+	vmg_set_range(&vmg, 0, 0x5000, 0, vm_flags);
+	vmg.target = vma_prev;
+	vmg.next = vma_next;
+
+	ASSERT_EQ(expand_existing(&vmg), 0);
+
+	/* Will have been cloned. */
+	ASSERT_EQ(vma_prev->anon_vma, &dummy_anon_vma);
+	ASSERT_TRUE(vma_prev->anon_vma->was_cloned);
+
+	/* Cleanup ready for next run. */
+	cleanup_mm(&mm, &vmi);
+
+	/*
+	 * next has anon_vma, we assign to prev.
+	 *
+	 *         |<----->|
+	 * |-------*********-------|
+	 *   prev     vma     next
+	 *  extend   delete  delete
+	 */
+
+	vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags);
+	vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, vm_flags);
+	vma_next = alloc_and_link_vma(&mm, 0x5000, 0x8000, 5, vm_flags);
+
+	/* Initialise avc so mergeability check passes. */
+	INIT_LIST_HEAD(&vma_next->anon_vma_chain);
+	list_add(&dummy_anon_vma_chain.same_vma, &vma_next->anon_vma_chain);
+
+	vma_next->anon_vma = &dummy_anon_vma;
+	vmg_set_range(&vmg, 0x3000, 0x5000, 3, vm_flags);
+	vmg.prev = vma_prev;
+	vmg.middle = vma;
+
+	ASSERT_EQ(merge_existing(&vmg), vma_prev);
+	ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS);
+
+	ASSERT_EQ(vma_prev->vm_start, 0);
+	ASSERT_EQ(vma_prev->vm_end, 0x8000);
+
+	ASSERT_EQ(vma_prev->anon_vma, &dummy_anon_vma);
+	ASSERT_TRUE(vma_prev->anon_vma->was_cloned);
+
+	cleanup_mm(&mm, &vmi);
+
+	/*
+	 * vma has anon_vma, we assign to prev.
+	 *
+	 *         |<----->|
+	 * |-------*********-------|
+	 *   prev     vma     next
+	 *  extend   delete  delete
+	 */
+
+	vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags);
+	vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, vm_flags);
+	vma_next = alloc_and_link_vma(&mm, 0x5000, 0x8000, 5, vm_flags);
+	vmg.anon_vma = &dummy_anon_vma;
+	vma_set_dummy_anon_vma(vma, &dummy_anon_vma_chain);
+	vmg_set_range(&vmg, 0x3000, 0x5000, 3, vm_flags);
+	vmg.prev = vma_prev;
+	vmg.middle = vma;
+
+	ASSERT_EQ(merge_existing(&vmg), vma_prev);
+	ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS);
+
+	ASSERT_EQ(vma_prev->vm_start, 0);
+	ASSERT_EQ(vma_prev->vm_end, 0x8000);
+
+	ASSERT_EQ(vma_prev->anon_vma, &dummy_anon_vma);
+	ASSERT_TRUE(vma_prev->anon_vma->was_cloned);
+
+	cleanup_mm(&mm, &vmi);
+
+	/*
+	 * vma has anon_vma, we assign to prev.
+	 *
+	 *         |<----->|
+	 * |-------*************
+	 *   prev       vma
+	 *  extend shrink/delete
+	 */
+
+	vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags);
+	vma = alloc_and_link_vma(&mm, 0x3000, 0x8000, 3, vm_flags);
+
+	vma_set_dummy_anon_vma(vma, &dummy_anon_vma_chain);
+	vmg_set_range(&vmg, 0x3000, 0x5000, 3, vm_flags);
+	vmg.prev = vma_prev;
+	vmg.middle = vma;
+
+	ASSERT_EQ(merge_existing(&vmg), vma_prev);
+	ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS);
+
+	ASSERT_EQ(vma_prev->vm_start, 0);
+	ASSERT_EQ(vma_prev->vm_end, 0x5000);
+
+	ASSERT_EQ(vma_prev->anon_vma, &dummy_anon_vma);
+	ASSERT_TRUE(vma_prev->anon_vma->was_cloned);
+
+	cleanup_mm(&mm, &vmi);
+
+	/*
+	 * vma has anon_vma, we assign to next.
+	 *
+	 *     |<----->|
+	 * *************-------|
+	 *      vma       next
+	 * shrink/delete extend
+	 */
+
+	vma = alloc_and_link_vma(&mm, 0, 0x5000, 0, vm_flags);
+	vma_next = alloc_and_link_vma(&mm, 0x5000, 0x8000, 5, vm_flags);
+
+	vma_set_dummy_anon_vma(vma, &dummy_anon_vma_chain);
+	vmg_set_range(&vmg, 0x3000, 0x5000, 3, vm_flags);
+	vmg.prev = vma;
+	vmg.middle = vma;
+
+	ASSERT_EQ(merge_existing(&vmg), vma_next);
+	ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS);
+
+	ASSERT_EQ(vma_next->vm_start, 0x3000);
+	ASSERT_EQ(vma_next->vm_end, 0x8000);
+
+	ASSERT_EQ(vma_next->anon_vma, &dummy_anon_vma);
+	ASSERT_TRUE(vma_next->anon_vma->was_cloned);
+
+	cleanup_mm(&mm, &vmi);
+	return true;
+}
+
+static bool test_vmi_prealloc_fail(void)
+{
+	vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
+	struct mm_struct mm = {};
+	VMA_ITERATOR(vmi, &mm, 0);
+	struct vma_merge_struct vmg = {
+		.mm = &mm,
+		.vmi = &vmi,
+	};
+	struct anon_vma_chain avc = {};
+	struct vm_area_struct *vma_prev, *vma;
+
+	/*
+	 * We are merging vma into prev, with vma possessing an anon_vma, which
+	 * will be duplicated. We cause the vmi preallocation to fail and assert
+	 * the duplicated anon_vma is unlinked.
+	 */
+
+	vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags);
+	vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, vm_flags);
+	vma->anon_vma = &dummy_anon_vma;
+
+	vmg_set_range_anon_vma(&vmg, 0x3000, 0x5000, 3, vm_flags, &dummy_anon_vma);
+	vmg.prev = vma_prev;
+	vmg.middle = vma;
+	vma_set_dummy_anon_vma(vma, &avc);
+
+	fail_prealloc = true;
+
+	/* This will cause the merge to fail. */
+	ASSERT_EQ(merge_existing(&vmg), NULL);
+	ASSERT_EQ(vmg.state, VMA_MERGE_ERROR_NOMEM);
+	/* We will already have assigned the anon_vma. */
+	ASSERT_EQ(vma_prev->anon_vma, &dummy_anon_vma);
+	/* And it was both cloned and unlinked. */
+	ASSERT_TRUE(dummy_anon_vma.was_cloned);
+	ASSERT_TRUE(dummy_anon_vma.was_unlinked);
+
+	cleanup_mm(&mm, &vmi); /* Resets fail_prealloc too. */
+
+	/*
+	 * We repeat the same operation for expanding a VMA, which is what new
+	 * VMA merging ultimately uses too. This asserts that unlinking is
+	 * performed in this case too.
+	 */
+
+	vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags);
+	vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, vm_flags);
+	vma->anon_vma = &dummy_anon_vma;
+
+	vmg_set_range(&vmg, 0, 0x5000, 3, vm_flags);
+	vmg.target = vma_prev;
+	vmg.next = vma;
+
+	fail_prealloc = true;
+	ASSERT_EQ(expand_existing(&vmg), -ENOMEM);
+	ASSERT_EQ(vmg.state, VMA_MERGE_ERROR_NOMEM);
+
+	ASSERT_EQ(vma_prev->anon_vma, &dummy_anon_vma);
+	ASSERT_TRUE(dummy_anon_vma.was_cloned);
+	ASSERT_TRUE(dummy_anon_vma.was_unlinked);
+
+	cleanup_mm(&mm, &vmi);
+	return true;
+}
+
+static bool test_merge_extend(void)
+{
+	vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
+	struct mm_struct mm = {};
+	VMA_ITERATOR(vmi, &mm, 0x1000);
+	struct vm_area_struct *vma;
+
+	vma = alloc_and_link_vma(&mm, 0, 0x1000, 0, vm_flags);
+	alloc_and_link_vma(&mm, 0x3000, 0x4000, 3, vm_flags);
+
+	/*
+	 * Extend a VMA into the gap between itself and the following VMA.
+	 * This should result in a merge.
+	 *
+	 * <->
+	 * *  *
+	 *
+	 */
+
+	ASSERT_EQ(vma_merge_extend(&vmi, vma, 0x2000), vma);
+	ASSERT_EQ(vma->vm_start, 0);
+	ASSERT_EQ(vma->vm_end, 0x4000);
+	ASSERT_EQ(vma->vm_pgoff, 0);
+	ASSERT_TRUE(vma_write_started(vma));
+	ASSERT_EQ(mm.map_count, 1);
+
+	cleanup_mm(&mm, &vmi);
+	return true;
+}
+
+static bool test_expand_only_mode(void)
+{
+	vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
+	struct mm_struct mm = {};
+	VMA_ITERATOR(vmi, &mm, 0);
+	struct vm_area_struct *vma_prev, *vma;
+	VMG_STATE(vmg, &mm, &vmi, 0x5000, 0x9000, vm_flags, 5);
+
+	/*
+	 * Place a VMA prior to the one we're expanding so we assert that we do
+	 * not erroneously try to traverse to the previous VMA even though we
+	 * have, through the use of the just_expand flag, indicated we do not
+	 * need to do so.
+	 */
+	alloc_and_link_vma(&mm, 0, 0x2000, 0, vm_flags);
+
+	/*
+	 * We will be positioned at the prev VMA, but looking to expand to
+	 * 0x9000.
+	 */
+	vma_iter_set(&vmi, 0x3000);
+	vma_prev = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, vm_flags);
+	vmg.prev = vma_prev;
+	vmg.just_expand = true;
+
+	vma = vma_merge_new_range(&vmg);
+	ASSERT_NE(vma, NULL);
+	ASSERT_EQ(vma, vma_prev);
+	ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS);
+	ASSERT_EQ(vma->vm_start, 0x3000);
+	ASSERT_EQ(vma->vm_end, 0x9000);
+	ASSERT_EQ(vma->vm_pgoff, 3);
+	ASSERT_TRUE(vma_write_started(vma));
+	ASSERT_EQ(vma_iter_addr(&vmi), 0x3000);
+	vma_assert_attached(vma);
+
+	cleanup_mm(&mm, &vmi);
+	return true;
+}
+
+static void run_merge_tests(int *num_tests, int *num_fail)
+{
+	/* Very simple tests to kick the tyres. */
+	TEST(simple_merge);
+	TEST(simple_modify);
+	TEST(simple_expand);
+	TEST(simple_shrink);
+
+	TEST(merge_new);
+	TEST(vma_merge_special_flags);
+	TEST(vma_merge_with_close);
+	TEST(vma_merge_new_with_close);
+	TEST(merge_existing);
+	TEST(anon_vma_non_mergeable);
+	TEST(dup_anon_vma);
+	TEST(vmi_prealloc_fail);
+	TEST(merge_extend);
+	TEST(expand_only_mode);
+}
diff --git a/tools/testing/vma/tests/mmap.c b/tools/testing/vma/tests/mmap.c
new file mode 100644
index 000000000000..bded4ecbe5db
--- /dev/null
+++ b/tools/testing/vma/tests/mmap.c
@@ -0,0 +1,57 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+static bool test_mmap_region_basic(void)
+{
+	struct mm_struct mm = {};
+	unsigned long addr;
+	struct vm_area_struct *vma;
+	VMA_ITERATOR(vmi, &mm, 0);
+
+	current->mm = &mm;
+
+	/* Map at 0x300000, length 0x3000. */
+	addr = __mmap_region(NULL, 0x300000, 0x3000,
+			     VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE,
+			     0x300, NULL);
+	ASSERT_EQ(addr, 0x300000);
+
+	/* Map at 0x250000, length 0x3000. */
+	addr = __mmap_region(NULL, 0x250000, 0x3000,
+			     VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE,
+			     0x250, NULL);
+	ASSERT_EQ(addr, 0x250000);
+
+	/* Map at 0x303000, merging to 0x300000 of length 0x6000. */
+	addr = __mmap_region(NULL, 0x303000, 0x3000,
+			     VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE,
+			     0x303, NULL);
+	ASSERT_EQ(addr, 0x303000);
+
+	/* Map at 0x24d000, merging to 0x250000 of length 0x6000. */
+	addr = __mmap_region(NULL, 0x24d000, 0x3000,
+			     VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE,
+			     0x24d, NULL);
+	ASSERT_EQ(addr, 0x24d000);
+
+	ASSERT_EQ(mm.map_count, 2);
+
+	for_each_vma(vmi, vma) {
+		if (vma->vm_start == 0x300000) {
+			ASSERT_EQ(vma->vm_end, 0x306000);
+			ASSERT_EQ(vma->vm_pgoff, 0x300);
+		} else if (vma->vm_start == 0x24d000) {
+			ASSERT_EQ(vma->vm_end, 0x253000);
+			ASSERT_EQ(vma->vm_pgoff, 0x24d);
+		} else {
+			ASSERT_FALSE(true);
+		}
+	}
+
+	cleanup_mm(&mm, &vmi);
+	return true;
+}
+
+static void run_mmap_tests(int *num_tests, int *num_fail)
+{
+	TEST(mmap_region_basic);
+}
diff --git a/tools/testing/vma/tests/vma.c b/tools/testing/vma/tests/vma.c
new file mode 100644
index 000000000000..6d9775aee243
--- /dev/null
+++ b/tools/testing/vma/tests/vma.c
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+static bool test_copy_vma(void)
+{
+	vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
+	struct mm_struct mm = {};
+	bool need_locks = false;
+	VMA_ITERATOR(vmi, &mm, 0);
+	struct vm_area_struct *vma, *vma_new, *vma_next;
+
+	/* Move backwards and do not merge. */
+
+	vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, vm_flags);
+	vma_new = copy_vma(&vma, 0, 0x2000, 0, &need_locks);
+	ASSERT_NE(vma_new, vma);
+	ASSERT_EQ(vma_new->vm_start, 0);
+	ASSERT_EQ(vma_new->vm_end, 0x2000);
+	ASSERT_EQ(vma_new->vm_pgoff, 0);
+	vma_assert_attached(vma_new);
+
+	cleanup_mm(&mm, &vmi);
+
+	/* Move a VMA into position next to another and merge the two. */
+
+	vma = alloc_and_link_vma(&mm, 0, 0x2000, 0, vm_flags);
+	vma_next = alloc_and_link_vma(&mm, 0x6000, 0x8000, 6, vm_flags);
+	vma_new = copy_vma(&vma, 0x4000, 0x2000, 4, &need_locks);
+	vma_assert_attached(vma_new);
+
+	ASSERT_EQ(vma_new, vma_next);
+
+	cleanup_mm(&mm, &vmi);
+	return true;
+}
+
+static void run_vma_tests(int *num_tests, int *num_fail)
+{
+	TEST(copy_vma);
+}
diff --git a/tools/testing/vma/vma.c b/tools/testing/vma/vma.c
deleted file mode 100644
index 93d21bc7e112..000000000000
--- a/tools/testing/vma/vma.c
+++ /dev/null
@@ -1,1785 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-
-#include <stdbool.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-#include "generated/bit-length.h"
-
-#include "maple-shared.h"
-#include "vma_internal.h"
-
-/* Include so header guard set. */
-#include "../../../mm/vma.h"
-
-static bool fail_prealloc;
-
-/* Then override vma_iter_prealloc() so we can choose to fail it. */
-#define vma_iter_prealloc(vmi, vma)					\
-	(fail_prealloc ? -ENOMEM : mas_preallocate(&(vmi)->mas, (vma), GFP_KERNEL))
-
-#define CONFIG_DEFAULT_MMAP_MIN_ADDR 65536
-
-unsigned long mmap_min_addr = CONFIG_DEFAULT_MMAP_MIN_ADDR;
-unsigned long dac_mmap_min_addr = CONFIG_DEFAULT_MMAP_MIN_ADDR;
-unsigned long stack_guard_gap = 256UL<<PAGE_SHIFT;
-
-/*
- * Directly import the VMA implementation here. Our vma_internal.h wrapper
- * provides userland-equivalent functionality for everything vma.c uses.
- */
-#include "../../../mm/vma_init.c"
-#include "../../../mm/vma_exec.c"
-#include "../../../mm/vma.c"
-
-const struct vm_operations_struct vma_dummy_vm_ops;
-static struct anon_vma dummy_anon_vma;
-
-#define ASSERT_TRUE(_expr)						\
-	do {								\
-		if (!(_expr)) {						\
-			fprintf(stderr,					\
-				"Assert FAILED at %s:%d:%s(): %s is FALSE.\n", \
-				__FILE__, __LINE__, __FUNCTION__, #_expr); \
-			return false;					\
-		}							\
-	} while (0)
-#define ASSERT_FALSE(_expr) ASSERT_TRUE(!(_expr))
-#define ASSERT_EQ(_val1, _val2) ASSERT_TRUE((_val1) == (_val2))
-#define ASSERT_NE(_val1, _val2) ASSERT_TRUE((_val1) != (_val2))
-
-#define IS_SET(_val, _flags) ((_val & _flags) == _flags)
-
-static struct task_struct __current;
-
-struct task_struct *get_current(void)
-{
-	return &__current;
-}
-
-unsigned long rlimit(unsigned int limit)
-{
-	return (unsigned long)-1;
-}
-
-/* Helper function to simply allocate a VMA. */
-static struct vm_area_struct *alloc_vma(struct mm_struct *mm,
-					unsigned long start,
-					unsigned long end,
-					pgoff_t pgoff,
-					vm_flags_t vm_flags)
-{
-	struct vm_area_struct *vma = vm_area_alloc(mm);
-
-	if (vma == NULL)
-		return NULL;
-
-	vma->vm_start = start;
-	vma->vm_end = end;
-	vma->vm_pgoff = pgoff;
-	vm_flags_reset(vma, vm_flags);
-	vma_assert_detached(vma);
-
-	return vma;
-}
-
-/* Helper function to allocate a VMA and link it to the tree. */
-static int attach_vma(struct mm_struct *mm, struct vm_area_struct *vma)
-{
-	int res;
-
-	res = vma_link(mm, vma);
-	if (!res)
-		vma_assert_attached(vma);
-	return res;
-}
-
-static void detach_free_vma(struct vm_area_struct *vma)
-{
-	vma_mark_detached(vma);
-	vm_area_free(vma);
-}
-
-/* Helper function to allocate a VMA and link it to the tree. */
-static struct vm_area_struct *alloc_and_link_vma(struct mm_struct *mm,
-						 unsigned long start,
-						 unsigned long end,
-						 pgoff_t pgoff,
-						 vm_flags_t vm_flags)
-{
-	struct vm_area_struct *vma = alloc_vma(mm, start, end, pgoff, vm_flags);
-
-	if (vma == NULL)
-		return NULL;
-
-	if (attach_vma(mm, vma)) {
-		detach_free_vma(vma);
-		return NULL;
-	}
-
-	/*
-	 * Reset this counter which we use to track whether writes have
-	 * begun. Linking to the tree will have caused this to be incremented,
-	 * which means we will get a false positive otherwise.
-	 */
-	vma->vm_lock_seq = UINT_MAX;
-
-	return vma;
-}
-
-/* Helper function which provides a wrapper around a merge new VMA operation. */
-static struct vm_area_struct *merge_new(struct vma_merge_struct *vmg)
-{
-	struct vm_area_struct *vma;
-	/*
-	 * For convenience, get prev and next VMAs. Which the new VMA operation
-	 * requires.
-	 */
-	vmg->next = vma_next(vmg->vmi);
-	vmg->prev = vma_prev(vmg->vmi);
-	vma_iter_next_range(vmg->vmi);
-
-	vma = vma_merge_new_range(vmg);
-	if (vma)
-		vma_assert_attached(vma);
-
-	return vma;
-}
-
-/*
- * Helper function which provides a wrapper around a merge existing VMA
- * operation.
- */
-static struct vm_area_struct *merge_existing(struct vma_merge_struct *vmg)
-{
-	struct vm_area_struct *vma;
-
-	vma = vma_merge_existing_range(vmg);
-	if (vma)
-		vma_assert_attached(vma);
-	return vma;
-}
-
-/*
- * Helper function which provides a wrapper around the expansion of an existing
- * VMA.
- */
-static int expand_existing(struct vma_merge_struct *vmg)
-{
-	return vma_expand(vmg);
-}
-
-/*
- * Helper function to reset merge state the associated VMA iterator to a
- * specified new range.
- */
-static void vmg_set_range(struct vma_merge_struct *vmg, unsigned long start,
-			  unsigned long end, pgoff_t pgoff, vm_flags_t vm_flags)
-{
-	vma_iter_set(vmg->vmi, start);
-
-	vmg->prev = NULL;
-	vmg->middle = NULL;
-	vmg->next = NULL;
-	vmg->target = NULL;
-
-	vmg->start = start;
-	vmg->end = end;
-	vmg->pgoff = pgoff;
-	vmg->vm_flags = vm_flags;
-
-	vmg->just_expand = false;
-	vmg->__remove_middle = false;
-	vmg->__remove_next = false;
-	vmg->__adjust_middle_start = false;
-	vmg->__adjust_next_start = false;
-}
-
-/* Helper function to set both the VMG range and its anon_vma. */
-static void vmg_set_range_anon_vma(struct vma_merge_struct *vmg, unsigned long start,
-				   unsigned long end, pgoff_t pgoff, vm_flags_t vm_flags,
-				   struct anon_vma *anon_vma)
-{
-	vmg_set_range(vmg, start, end, pgoff, vm_flags);
-	vmg->anon_vma = anon_vma;
-}
-
-/*
- * Helper function to try to merge a new VMA.
- *
- * Update vmg and the iterator for it and try to merge, otherwise allocate a new
- * VMA, link it to the maple tree and return it.
- */
-static struct vm_area_struct *try_merge_new_vma(struct mm_struct *mm,
-						struct vma_merge_struct *vmg,
-						unsigned long start, unsigned long end,
-						pgoff_t pgoff, vm_flags_t vm_flags,
-						bool *was_merged)
-{
-	struct vm_area_struct *merged;
-
-	vmg_set_range(vmg, start, end, pgoff, vm_flags);
-
-	merged = merge_new(vmg);
-	if (merged) {
-		*was_merged = true;
-		ASSERT_EQ(vmg->state, VMA_MERGE_SUCCESS);
-		return merged;
-	}
-
-	*was_merged = false;
-
-	ASSERT_EQ(vmg->state, VMA_MERGE_NOMERGE);
-
-	return alloc_and_link_vma(mm, start, end, pgoff, vm_flags);
-}
-
-/*
- * Helper function to reset the dummy anon_vma to indicate it has not been
- * duplicated.
- */
-static void reset_dummy_anon_vma(void)
-{
-	dummy_anon_vma.was_cloned = false;
-	dummy_anon_vma.was_unlinked = false;
-}
-
-/*
- * Helper function to remove all VMAs and destroy the maple tree associated with
- * a virtual address space. Returns a count of VMAs in the tree.
- */
-static int cleanup_mm(struct mm_struct *mm, struct vma_iterator *vmi)
-{
-	struct vm_area_struct *vma;
-	int count = 0;
-
-	fail_prealloc = false;
-	reset_dummy_anon_vma();
-
-	vma_iter_set(vmi, 0);
-	for_each_vma(*vmi, vma) {
-		detach_free_vma(vma);
-		count++;
-	}
-
-	mtree_destroy(&mm->mm_mt);
-	mm->map_count = 0;
-	return count;
-}
-
-/* Helper function to determine if VMA has had vma_start_write() performed. */
-static bool vma_write_started(struct vm_area_struct *vma)
-{
-	int seq = vma->vm_lock_seq;
-
-	/* We reset after each check. */
-	vma->vm_lock_seq = UINT_MAX;
-
-	/* The vma_start_write() stub simply increments this value. */
-	return seq > -1;
-}
-
-/* Helper function providing a dummy vm_ops->close() method.*/
-static void dummy_close(struct vm_area_struct *)
-{
-}
-
-static void __vma_set_dummy_anon_vma(struct vm_area_struct *vma,
-				     struct anon_vma_chain *avc,
-				     struct anon_vma *anon_vma)
-{
-	vma->anon_vma = anon_vma;
-	INIT_LIST_HEAD(&vma->anon_vma_chain);
-	list_add(&avc->same_vma, &vma->anon_vma_chain);
-	avc->anon_vma = vma->anon_vma;
-}
-
-static void vma_set_dummy_anon_vma(struct vm_area_struct *vma,
-				   struct anon_vma_chain *avc)
-{
-	__vma_set_dummy_anon_vma(vma, avc, &dummy_anon_vma);
-}
-
-static bool test_simple_merge(void)
-{
-	struct vm_area_struct *vma;
-	vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
-	struct mm_struct mm = {};
-	struct vm_area_struct *vma_left = alloc_vma(&mm, 0, 0x1000, 0, vm_flags);
-	struct vm_area_struct *vma_right = alloc_vma(&mm, 0x2000, 0x3000, 2, vm_flags);
-	VMA_ITERATOR(vmi, &mm, 0x1000);
-	struct vma_merge_struct vmg = {
-		.mm = &mm,
-		.vmi = &vmi,
-		.start = 0x1000,
-		.end = 0x2000,
-		.vm_flags = vm_flags,
-		.pgoff = 1,
-	};
-
-	ASSERT_FALSE(attach_vma(&mm, vma_left));
-	ASSERT_FALSE(attach_vma(&mm, vma_right));
-
-	vma = merge_new(&vmg);
-	ASSERT_NE(vma, NULL);
-
-	ASSERT_EQ(vma->vm_start, 0);
-	ASSERT_EQ(vma->vm_end, 0x3000);
-	ASSERT_EQ(vma->vm_pgoff, 0);
-	ASSERT_EQ(vma->vm_flags, vm_flags);
-
-	detach_free_vma(vma);
-	mtree_destroy(&mm.mm_mt);
-
-	return true;
-}
-
-static bool test_simple_modify(void)
-{
-	struct vm_area_struct *vma;
-	vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
-	struct mm_struct mm = {};
-	struct vm_area_struct *init_vma = alloc_vma(&mm, 0, 0x3000, 0, vm_flags);
-	VMA_ITERATOR(vmi, &mm, 0x1000);
-	vm_flags_t flags = VM_READ | VM_MAYREAD;
-
-	ASSERT_FALSE(attach_vma(&mm, init_vma));
-
-	/*
-	 * The flags will not be changed, the vma_modify_flags() function
-	 * performs the merge/split only.
-	 */
-	vma = vma_modify_flags(&vmi, init_vma, init_vma,
-			       0x1000, 0x2000, &flags);
-	ASSERT_NE(vma, NULL);
-	/* We modify the provided VMA, and on split allocate new VMAs. */
-	ASSERT_EQ(vma, init_vma);
-
-	ASSERT_EQ(vma->vm_start, 0x1000);
-	ASSERT_EQ(vma->vm_end, 0x2000);
-	ASSERT_EQ(vma->vm_pgoff, 1);
-
-	/*
-	 * Now walk through the three split VMAs and make sure they are as
-	 * expected.
-	 */
-
-	vma_iter_set(&vmi, 0);
-	vma = vma_iter_load(&vmi);
-
-	ASSERT_EQ(vma->vm_start, 0);
-	ASSERT_EQ(vma->vm_end, 0x1000);
-	ASSERT_EQ(vma->vm_pgoff, 0);
-
-	detach_free_vma(vma);
-	vma_iter_clear(&vmi);
-
-	vma = vma_next(&vmi);
-
-	ASSERT_EQ(vma->vm_start, 0x1000);
-	ASSERT_EQ(vma->vm_end, 0x2000);
-	ASSERT_EQ(vma->vm_pgoff, 1);
-
-	detach_free_vma(vma);
-	vma_iter_clear(&vmi);
-
-	vma = vma_next(&vmi);
-
-	ASSERT_EQ(vma->vm_start, 0x2000);
-	ASSERT_EQ(vma->vm_end, 0x3000);
-	ASSERT_EQ(vma->vm_pgoff, 2);
-
-	detach_free_vma(vma);
-	mtree_destroy(&mm.mm_mt);
-
-	return true;
-}
-
-static bool test_simple_expand(void)
-{
-	vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
-	struct mm_struct mm = {};
-	struct vm_area_struct *vma = alloc_vma(&mm, 0, 0x1000, 0, vm_flags);
-	VMA_ITERATOR(vmi, &mm, 0);
-	struct vma_merge_struct vmg = {
-		.vmi = &vmi,
-		.target = vma,
-		.start = 0,
-		.end = 0x3000,
-		.pgoff = 0,
-	};
-
-	ASSERT_FALSE(attach_vma(&mm, vma));
-
-	ASSERT_FALSE(expand_existing(&vmg));
-
-	ASSERT_EQ(vma->vm_start, 0);
-	ASSERT_EQ(vma->vm_end, 0x3000);
-	ASSERT_EQ(vma->vm_pgoff, 0);
-
-	detach_free_vma(vma);
-	mtree_destroy(&mm.mm_mt);
-
-	return true;
-}
-
-static bool test_simple_shrink(void)
-{
-	vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
-	struct mm_struct mm = {};
-	struct vm_area_struct *vma = alloc_vma(&mm, 0, 0x3000, 0, vm_flags);
-	VMA_ITERATOR(vmi, &mm, 0);
-
-	ASSERT_FALSE(attach_vma(&mm, vma));
-
-	ASSERT_FALSE(vma_shrink(&vmi, vma, 0, 0x1000, 0));
-
-	ASSERT_EQ(vma->vm_start, 0);
-	ASSERT_EQ(vma->vm_end, 0x1000);
-	ASSERT_EQ(vma->vm_pgoff, 0);
-
-	detach_free_vma(vma);
-	mtree_destroy(&mm.mm_mt);
-
-	return true;
-}
-
-static bool __test_merge_new(bool is_sticky, bool a_is_sticky, bool b_is_sticky, bool c_is_sticky)
-{
-	vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
-	struct mm_struct mm = {};
-	VMA_ITERATOR(vmi, &mm, 0);
-	struct vma_merge_struct vmg = {
-		.mm = &mm,
-		.vmi = &vmi,
-	};
-	struct anon_vma_chain dummy_anon_vma_chain_a = {
-		.anon_vma = &dummy_anon_vma,
-	};
-	struct anon_vma_chain dummy_anon_vma_chain_b = {
-		.anon_vma = &dummy_anon_vma,
-	};
-	struct anon_vma_chain dummy_anon_vma_chain_c = {
-		.anon_vma = &dummy_anon_vma,
-	};
-	struct anon_vma_chain dummy_anon_vma_chain_d = {
-		.anon_vma = &dummy_anon_vma,
-	};
-	const struct vm_operations_struct vm_ops = {
-		.close = dummy_close,
-	};
-	int count;
-	struct vm_area_struct *vma, *vma_a, *vma_b, *vma_c, *vma_d;
-	bool merged;
-
-	if (is_sticky)
-		vm_flags |= VM_STICKY;
-
-	/*
-	 * 0123456789abc
-	 * AA B       CC
-	 */
-	vma_a = alloc_and_link_vma(&mm, 0, 0x2000, 0, vm_flags);
-	ASSERT_NE(vma_a, NULL);
-	if (a_is_sticky)
-		vm_flags_set(vma_a, VM_STICKY);
-	/* We give each VMA a single avc so we can test anon_vma duplication. */
-	INIT_LIST_HEAD(&vma_a->anon_vma_chain);
-	list_add(&dummy_anon_vma_chain_a.same_vma, &vma_a->anon_vma_chain);
-
-	vma_b = alloc_and_link_vma(&mm, 0x3000, 0x4000, 3, vm_flags);
-	ASSERT_NE(vma_b, NULL);
-	if (b_is_sticky)
-		vm_flags_set(vma_b, VM_STICKY);
-	INIT_LIST_HEAD(&vma_b->anon_vma_chain);
-	list_add(&dummy_anon_vma_chain_b.same_vma, &vma_b->anon_vma_chain);
-
-	vma_c = alloc_and_link_vma(&mm, 0xb000, 0xc000, 0xb, vm_flags);
-	ASSERT_NE(vma_c, NULL);
-	if (c_is_sticky)
-		vm_flags_set(vma_c, VM_STICKY);
-	INIT_LIST_HEAD(&vma_c->anon_vma_chain);
-	list_add(&dummy_anon_vma_chain_c.same_vma, &vma_c->anon_vma_chain);
-
-	/*
-	 * NO merge.
-	 *
-	 * 0123456789abc
-	 * AA B   **  CC
-	 */
-	vma_d = try_merge_new_vma(&mm, &vmg, 0x7000, 0x9000, 7, vm_flags, &merged);
-	ASSERT_NE(vma_d, NULL);
-	INIT_LIST_HEAD(&vma_d->anon_vma_chain);
-	list_add(&dummy_anon_vma_chain_d.same_vma, &vma_d->anon_vma_chain);
-	ASSERT_FALSE(merged);
-	ASSERT_EQ(mm.map_count, 4);
-
-	/*
-	 * Merge BOTH sides.
-	 *
-	 * 0123456789abc
-	 * AA*B   DD  CC
-	 */
-	vma_a->vm_ops = &vm_ops; /* This should have no impact. */
-	vma_b->anon_vma = &dummy_anon_vma;
-	vma = try_merge_new_vma(&mm, &vmg, 0x2000, 0x3000, 2, vm_flags, &merged);
-	ASSERT_EQ(vma, vma_a);
-	/* Merge with A, delete B. */
-	ASSERT_TRUE(merged);
-	ASSERT_EQ(vma->vm_start, 0);
-	ASSERT_EQ(vma->vm_end, 0x4000);
-	ASSERT_EQ(vma->vm_pgoff, 0);
-	ASSERT_EQ(vma->anon_vma, &dummy_anon_vma);
-	ASSERT_TRUE(vma_write_started(vma));
-	ASSERT_EQ(mm.map_count, 3);
-	if (is_sticky || a_is_sticky || b_is_sticky)
-		ASSERT_TRUE(IS_SET(vma->vm_flags, VM_STICKY));
-
-	/*
-	 * Merge to PREVIOUS VMA.
-	 *
-	 * 0123456789abc
-	 * AAAA*  DD  CC
-	 */
-	vma = try_merge_new_vma(&mm, &vmg, 0x4000, 0x5000, 4, vm_flags, &merged);
-	ASSERT_EQ(vma, vma_a);
-	/* Extend A. */
-	ASSERT_TRUE(merged);
-	ASSERT_EQ(vma->vm_start, 0);
-	ASSERT_EQ(vma->vm_end, 0x5000);
-	ASSERT_EQ(vma->vm_pgoff, 0);
-	ASSERT_EQ(vma->anon_vma, &dummy_anon_vma);
-	ASSERT_TRUE(vma_write_started(vma));
-	ASSERT_EQ(mm.map_count, 3);
-	if (is_sticky || a_is_sticky)
-		ASSERT_TRUE(IS_SET(vma->vm_flags, VM_STICKY));
-
-	/*
-	 * Merge to NEXT VMA.
-	 *
-	 * 0123456789abc
-	 * AAAAA *DD  CC
-	 */
-	vma_d->anon_vma = &dummy_anon_vma;
-	vma_d->vm_ops = &vm_ops; /* This should have no impact. */
-	vma = try_merge_new_vma(&mm, &vmg, 0x6000, 0x7000, 6, vm_flags, &merged);
-	ASSERT_EQ(vma, vma_d);
-	/* Prepend. */
-	ASSERT_TRUE(merged);
-	ASSERT_EQ(vma->vm_start, 0x6000);
-	ASSERT_EQ(vma->vm_end, 0x9000);
-	ASSERT_EQ(vma->vm_pgoff, 6);
-	ASSERT_EQ(vma->anon_vma, &dummy_anon_vma);
-	ASSERT_TRUE(vma_write_started(vma));
-	ASSERT_EQ(mm.map_count, 3);
-	if (is_sticky) /* D uses is_sticky. */
-		ASSERT_TRUE(IS_SET(vma->vm_flags, VM_STICKY));
-
-	/*
-	 * Merge BOTH sides.
-	 *
-	 * 0123456789abc
-	 * AAAAA*DDD  CC
-	 */
-	vma_d->vm_ops = NULL; /* This would otherwise degrade the merge. */
-	vma = try_merge_new_vma(&mm, &vmg, 0x5000, 0x6000, 5, vm_flags, &merged);
-	ASSERT_EQ(vma, vma_a);
-	/* Merge with A, delete D. */
-	ASSERT_TRUE(merged);
-	ASSERT_EQ(vma->vm_start, 0);
-	ASSERT_EQ(vma->vm_end, 0x9000);
-	ASSERT_EQ(vma->vm_pgoff, 0);
-	ASSERT_EQ(vma->anon_vma, &dummy_anon_vma);
-	ASSERT_TRUE(vma_write_started(vma));
-	ASSERT_EQ(mm.map_count, 2);
-	if (is_sticky || a_is_sticky)
-		ASSERT_TRUE(IS_SET(vma->vm_flags, VM_STICKY));
-
-	/*
-	 * Merge to NEXT VMA.
-	 *
-	 * 0123456789abc
-	 * AAAAAAAAA *CC
-	 */
-	vma_c->anon_vma = &dummy_anon_vma;
-	vma = try_merge_new_vma(&mm, &vmg, 0xa000, 0xb000, 0xa, vm_flags, &merged);
-	ASSERT_EQ(vma, vma_c);
-	/* Prepend C. */
-	ASSERT_TRUE(merged);
-	ASSERT_EQ(vma->vm_start, 0xa000);
-	ASSERT_EQ(vma->vm_end, 0xc000);
-	ASSERT_EQ(vma->vm_pgoff, 0xa);
-	ASSERT_EQ(vma->anon_vma, &dummy_anon_vma);
-	ASSERT_TRUE(vma_write_started(vma));
-	ASSERT_EQ(mm.map_count, 2);
-	if (is_sticky || c_is_sticky)
-		ASSERT_TRUE(IS_SET(vma->vm_flags, VM_STICKY));
-
-	/*
-	 * Merge BOTH sides.
-	 *
-	 * 0123456789abc
-	 * AAAAAAAAA*CCC
-	 */
-	vma = try_merge_new_vma(&mm, &vmg, 0x9000, 0xa000, 0x9, vm_flags, &merged);
-	ASSERT_EQ(vma, vma_a);
-	/* Extend A and delete C. */
-	ASSERT_TRUE(merged);
-	ASSERT_EQ(vma->vm_start, 0);
-	ASSERT_EQ(vma->vm_end, 0xc000);
-	ASSERT_EQ(vma->vm_pgoff, 0);
-	ASSERT_EQ(vma->anon_vma, &dummy_anon_vma);
-	ASSERT_TRUE(vma_write_started(vma));
-	ASSERT_EQ(mm.map_count, 1);
-	if (is_sticky || a_is_sticky || c_is_sticky)
-		ASSERT_TRUE(IS_SET(vma->vm_flags, VM_STICKY));
-
-	/*
-	 * Final state.
-	 *
-	 * 0123456789abc
-	 * AAAAAAAAAAAAA
-	 */
-
-	count = 0;
-	vma_iter_set(&vmi, 0);
-	for_each_vma(vmi, vma) {
-		ASSERT_NE(vma, NULL);
-		ASSERT_EQ(vma->vm_start, 0);
-		ASSERT_EQ(vma->vm_end, 0xc000);
-		ASSERT_EQ(vma->vm_pgoff, 0);
-		ASSERT_EQ(vma->anon_vma, &dummy_anon_vma);
-
-		detach_free_vma(vma);
-		count++;
-	}
-
-	/* Should only have one VMA left (though freed) after all is done.*/
-	ASSERT_EQ(count, 1);
-
-	mtree_destroy(&mm.mm_mt);
-	return true;
-}
-
-static bool test_merge_new(void)
-{
-	int i, j, k, l;
-
-	/* Generate every possible permutation of sticky flags. */
-	for (i = 0; i < 2; i++)
-		for (j = 0; j < 2; j++)
-			for (k = 0; k < 2; k++)
-				for (l = 0; l < 2; l++)
-					ASSERT_TRUE(__test_merge_new(i, j, k, l));
-
-	return true;
-}
-
-static bool test_vma_merge_special_flags(void)
-{
-	vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
-	struct mm_struct mm = {};
-	VMA_ITERATOR(vmi, &mm, 0);
-	struct vma_merge_struct vmg = {
-		.mm = &mm,
-		.vmi = &vmi,
-	};
-	vm_flags_t special_flags[] = { VM_IO, VM_DONTEXPAND, VM_PFNMAP, VM_MIXEDMAP };
-	vm_flags_t all_special_flags = 0;
-	int i;
-	struct vm_area_struct *vma_left, *vma;
-
-	/* Make sure there aren't new VM_SPECIAL flags. */
-	for (i = 0; i < ARRAY_SIZE(special_flags); i++) {
-		all_special_flags |= special_flags[i];
-	}
-	ASSERT_EQ(all_special_flags, VM_SPECIAL);
-
-	/*
-	 * 01234
-	 * AAA
-	 */
-	vma_left = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags);
-	ASSERT_NE(vma_left, NULL);
-
-	/* 1. Set up new VMA with special flag that would otherwise merge. */
-
-	/*
-	 * 01234
-	 * AAA*
-	 *
-	 * This should merge if not for the VM_SPECIAL flag.
-	 */
-	vmg_set_range(&vmg, 0x3000, 0x4000, 3, vm_flags);
-	for (i = 0; i < ARRAY_SIZE(special_flags); i++) {
-		vm_flags_t special_flag = special_flags[i];
-
-		vm_flags_reset(vma_left, vm_flags | special_flag);
-		vmg.vm_flags = vm_flags | special_flag;
-		vma = merge_new(&vmg);
-		ASSERT_EQ(vma, NULL);
-		ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE);
-	}
-
-	/* 2. Modify VMA with special flag that would otherwise merge. */
-
-	/*
-	 * 01234
-	 * AAAB
-	 *
-	 * Create a VMA to modify.
-	 */
-	vma = alloc_and_link_vma(&mm, 0x3000, 0x4000, 3, vm_flags);
-	ASSERT_NE(vma, NULL);
-	vmg.middle = vma;
-
-	for (i = 0; i < ARRAY_SIZE(special_flags); i++) {
-		vm_flags_t special_flag = special_flags[i];
-
-		vm_flags_reset(vma_left, vm_flags | special_flag);
-		vmg.vm_flags = vm_flags | special_flag;
-		vma = merge_existing(&vmg);
-		ASSERT_EQ(vma, NULL);
-		ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE);
-	}
-
-	cleanup_mm(&mm, &vmi);
-	return true;
-}
-
-static bool test_vma_merge_with_close(void)
-{
-	vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
-	struct mm_struct mm = {};
-	VMA_ITERATOR(vmi, &mm, 0);
-	struct vma_merge_struct vmg = {
-		.mm = &mm,
-		.vmi = &vmi,
-	};
-	const struct vm_operations_struct vm_ops = {
-		.close = dummy_close,
-	};
-	struct vm_area_struct *vma_prev, *vma_next, *vma;
-
-	/*
-	 * When merging VMAs we are not permitted to remove any VMA that has a
-	 * vm_ops->close() hook.
-	 *
-	 * Considering the two possible adjacent VMAs to which a VMA can be
-	 * merged:
-	 *
-	 * [ prev ][ vma ][ next ]
-	 *
-	 * In no case will we need to delete prev. If the operation is
-	 * mergeable, then prev will be extended with one or both of vma and
-	 * next deleted.
-	 *
-	 * As a result, during initial mergeability checks, only
-	 * can_vma_merge_before() (which implies the VMA being merged with is
-	 * 'next' as shown above) bothers to check to see whether the next VMA
-	 * has a vm_ops->close() callback that will need to be called when
-	 * removed.
-	 *
-	 * If it does, then we cannot merge as the resources that the close()
-	 * operation potentially clears down are tied only to the existing VMA
-	 * range and we have no way of extending those to the nearly merged one.
-	 *
-	 * We must consider two scenarios:
-	 *
-	 * A.
-	 *
-	 * vm_ops->close:     -       -    !NULL
-	 *                 [ prev ][ vma ][ next ]
-	 *
-	 * Where prev may or may not be present/mergeable.
-	 *
-	 * This is picked up by a specific check in can_vma_merge_before().
-	 *
-	 * B.
-	 *
-	 * vm_ops->close:     -     !NULL
-	 *                 [ prev ][ vma ]
-	 *
-	 * Where prev and vma are present and mergeable.
-	 *
-	 * This is picked up by a specific check in the modified VMA merge.
-	 *
-	 * IMPORTANT NOTE: We make the assumption that the following case:
-	 *
-	 *    -     !NULL   NULL
-	 * [ prev ][ vma ][ next ]
-	 *
-	 * Cannot occur, because vma->vm_ops being the same implies the same
-	 * vma->vm_file, and therefore this would mean that next->vm_ops->close
-	 * would be set too, and thus scenario A would pick this up.
-	 */
-
-	/*
-	 * The only case of a new VMA merge that results in a VMA being deleted
-	 * is one where both the previous and next VMAs are merged - in this
-	 * instance the next VMA is deleted, and the previous VMA is extended.
-	 *
-	 * If we are unable to do so, we reduce the operation to simply
-	 * extending the prev VMA and not merging next.
-	 *
-	 * 0123456789
-	 * PPP**NNNN
-	 *             ->
-	 * 0123456789
-	 * PPPPPPNNN
-	 */
-
-	vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags);
-	vma_next = alloc_and_link_vma(&mm, 0x5000, 0x9000, 5, vm_flags);
-	vma_next->vm_ops = &vm_ops;
-
-	vmg_set_range(&vmg, 0x3000, 0x5000, 3, vm_flags);
-	ASSERT_EQ(merge_new(&vmg), vma_prev);
-	ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS);
-	ASSERT_EQ(vma_prev->vm_start, 0);
-	ASSERT_EQ(vma_prev->vm_end, 0x5000);
-	ASSERT_EQ(vma_prev->vm_pgoff, 0);
-
-	ASSERT_EQ(cleanup_mm(&mm, &vmi), 2);
-
-	/*
-	 * When modifying an existing VMA there are further cases where we
-	 * delete VMAs.
-	 *
-	 *    <>
-	 * 0123456789
-	 * PPPVV
-	 *
-	 * In this instance, if vma has a close hook, the merge simply cannot
-	 * proceed.
-	 */
-
-	vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags);
-	vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, vm_flags);
-	vma->vm_ops = &vm_ops;
-
-	vmg_set_range(&vmg, 0x3000, 0x5000, 3, vm_flags);
-	vmg.prev = vma_prev;
-	vmg.middle = vma;
-
-	/*
-	 * The VMA being modified in a way that would otherwise merge should
-	 * also fail.
-	 */
-	ASSERT_EQ(merge_existing(&vmg), NULL);
-	ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE);
-
-	ASSERT_EQ(cleanup_mm(&mm, &vmi), 2);
-
-	/*
-	 * This case is mirrored if merging with next.
-	 *
-	 *    <>
-	 * 0123456789
-	 *    VVNNNN
-	 *
-	 * In this instance, if vma has a close hook, the merge simply cannot
-	 * proceed.
-	 */
-
-	vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, vm_flags);
-	vma_next = alloc_and_link_vma(&mm, 0x5000, 0x9000, 5, vm_flags);
-	vma->vm_ops = &vm_ops;
-
-	vmg_set_range(&vmg, 0x3000, 0x5000, 3, vm_flags);
-	vmg.middle = vma;
-	ASSERT_EQ(merge_existing(&vmg), NULL);
-	/*
-	 * Initially this is misapprehended as an out of memory report, as the
-	 * close() check is handled in the same way as anon_vma duplication
-	 * failures, however a subsequent patch resolves this.
-	 */
-	ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE);
-
-	ASSERT_EQ(cleanup_mm(&mm, &vmi), 2);
-
-	/*
-	 * Finally, we consider two variants of the case where we modify a VMA
-	 * to merge with both the previous and next VMAs.
-	 *
-	 * The first variant is where vma has a close hook. In this instance, no
-	 * merge can proceed.
-	 *
-	 *    <>
-	 * 0123456789
-	 * PPPVVNNNN
-	 */
-
-	vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags);
-	vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, vm_flags);
-	vma_next = alloc_and_link_vma(&mm, 0x5000, 0x9000, 5, vm_flags);
-	vma->vm_ops = &vm_ops;
-
-	vmg_set_range(&vmg, 0x3000, 0x5000, 3, vm_flags);
-	vmg.prev = vma_prev;
-	vmg.middle = vma;
-
-	ASSERT_EQ(merge_existing(&vmg), NULL);
-	ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE);
-
-	ASSERT_EQ(cleanup_mm(&mm, &vmi), 3);
-
-	/*
-	 * The second variant is where next has a close hook. In this instance,
-	 * we reduce the operation to a merge between prev and vma.
-	 *
-	 *    <>
-	 * 0123456789
-	 * PPPVVNNNN
-	 *            ->
-	 * 0123456789
-	 * PPPPPNNNN
-	 */
-
-	vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags);
-	vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, vm_flags);
-	vma_next = alloc_and_link_vma(&mm, 0x5000, 0x9000, 5, vm_flags);
-	vma_next->vm_ops = &vm_ops;
-
-	vmg_set_range(&vmg, 0x3000, 0x5000, 3, vm_flags);
-	vmg.prev = vma_prev;
-	vmg.middle = vma;
-
-	ASSERT_EQ(merge_existing(&vmg), vma_prev);
-	ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS);
-	ASSERT_EQ(vma_prev->vm_start, 0);
-	ASSERT_EQ(vma_prev->vm_end, 0x5000);
-	ASSERT_EQ(vma_prev->vm_pgoff, 0);
-
-	ASSERT_EQ(cleanup_mm(&mm, &vmi), 2);
-
-	return true;
-}
-
-static bool test_vma_merge_new_with_close(void)
-{
-	vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
-	struct mm_struct mm = {};
-	VMA_ITERATOR(vmi, &mm, 0);
-	struct vma_merge_struct vmg = {
-		.mm = &mm,
-		.vmi = &vmi,
-	};
-	struct vm_area_struct *vma_prev = alloc_and_link_vma(&mm, 0, 0x2000, 0, vm_flags);
-	struct vm_area_struct *vma_next = alloc_and_link_vma(&mm, 0x5000, 0x7000, 5, vm_flags);
-	const struct vm_operations_struct vm_ops = {
-		.close = dummy_close,
-	};
-	struct vm_area_struct *vma;
-
-	/*
-	 * We should allow the partial merge of a proposed new VMA if the
-	 * surrounding VMAs have vm_ops->close() hooks (but are otherwise
-	 * compatible), e.g.:
-	 *
-	 *        New VMA
-	 *    A  v-------v  B
-	 * |-----|       |-----|
-	 *  close         close
-	 *
-	 * Since the rule is to not DELETE a VMA with a close operation, this
-	 * should be permitted, only rather than expanding A and deleting B, we
-	 * should simply expand A and leave B intact, e.g.:
-	 *
-	 *        New VMA
-	 *       A          B
-	 * |------------||-----|
-	 *  close         close
-	 */
-
-	/* Have prev and next have a vm_ops->close() hook. */
-	vma_prev->vm_ops = &vm_ops;
-	vma_next->vm_ops = &vm_ops;
-
-	vmg_set_range(&vmg, 0x2000, 0x5000, 2, vm_flags);
-	vma = merge_new(&vmg);
-	ASSERT_NE(vma, NULL);
-	ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS);
-	ASSERT_EQ(vma->vm_start, 0);
-	ASSERT_EQ(vma->vm_end, 0x5000);
-	ASSERT_EQ(vma->vm_pgoff, 0);
-	ASSERT_EQ(vma->vm_ops, &vm_ops);
-	ASSERT_TRUE(vma_write_started(vma));
-	ASSERT_EQ(mm.map_count, 2);
-
-	cleanup_mm(&mm, &vmi);
-	return true;
-}
-
-static bool __test_merge_existing(bool prev_is_sticky, bool middle_is_sticky, bool next_is_sticky)
-{
-	vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
-	vm_flags_t prev_flags = vm_flags;
-	vm_flags_t next_flags = vm_flags;
-	struct mm_struct mm = {};
-	VMA_ITERATOR(vmi, &mm, 0);
-	struct vm_area_struct *vma, *vma_prev, *vma_next;
-	struct vma_merge_struct vmg = {
-		.mm = &mm,
-		.vmi = &vmi,
-	};
-	const struct vm_operations_struct vm_ops = {
-		.close = dummy_close,
-	};
-	struct anon_vma_chain avc = {};
-
-	if (prev_is_sticky)
-		prev_flags |= VM_STICKY;
-	if (middle_is_sticky)
-		vm_flags |= VM_STICKY;
-	if (next_is_sticky)
-		next_flags |= VM_STICKY;
-
-	/*
-	 * Merge right case - partial span.
-	 *
-	 *    <->
-	 * 0123456789
-	 *   VVVVNNN
-	 *            ->
-	 * 0123456789
-	 *   VNNNNNN
-	 */
-	vma = alloc_and_link_vma(&mm, 0x2000, 0x6000, 2, vm_flags);
-	vma->vm_ops = &vm_ops; /* This should have no impact. */
-	vma_next = alloc_and_link_vma(&mm, 0x6000, 0x9000, 6, next_flags);
-	vma_next->vm_ops = &vm_ops; /* This should have no impact. */
-	vmg_set_range_anon_vma(&vmg, 0x3000, 0x6000, 3, vm_flags, &dummy_anon_vma);
-	vmg.middle = vma;
-	vmg.prev = vma;
-	vma_set_dummy_anon_vma(vma, &avc);
-	ASSERT_EQ(merge_existing(&vmg), vma_next);
-	ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS);
-	ASSERT_EQ(vma_next->vm_start, 0x3000);
-	ASSERT_EQ(vma_next->vm_end, 0x9000);
-	ASSERT_EQ(vma_next->vm_pgoff, 3);
-	ASSERT_EQ(vma_next->anon_vma, &dummy_anon_vma);
-	ASSERT_EQ(vma->vm_start, 0x2000);
-	ASSERT_EQ(vma->vm_end, 0x3000);
-	ASSERT_EQ(vma->vm_pgoff, 2);
-	ASSERT_TRUE(vma_write_started(vma));
-	ASSERT_TRUE(vma_write_started(vma_next));
-	ASSERT_EQ(mm.map_count, 2);
-	if (middle_is_sticky || next_is_sticky)
-		ASSERT_TRUE(IS_SET(vma_next->vm_flags, VM_STICKY));
-
-	/* Clear down and reset. */
-	ASSERT_EQ(cleanup_mm(&mm, &vmi), 2);
-
-	/*
-	 * Merge right case - full span.
-	 *
-	 *   <-->
-	 * 0123456789
-	 *   VVVVNNN
-	 *            ->
-	 * 0123456789
-	 *   NNNNNNN
-	 */
-	vma = alloc_and_link_vma(&mm, 0x2000, 0x6000, 2, vm_flags);
-	vma_next = alloc_and_link_vma(&mm, 0x6000, 0x9000, 6, next_flags);
-	vma_next->vm_ops = &vm_ops; /* This should have no impact. */
-	vmg_set_range_anon_vma(&vmg, 0x2000, 0x6000, 2, vm_flags, &dummy_anon_vma);
-	vmg.middle = vma;
-	vma_set_dummy_anon_vma(vma, &avc);
-	ASSERT_EQ(merge_existing(&vmg), vma_next);
-	ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS);
-	ASSERT_EQ(vma_next->vm_start, 0x2000);
-	ASSERT_EQ(vma_next->vm_end, 0x9000);
-	ASSERT_EQ(vma_next->vm_pgoff, 2);
-	ASSERT_EQ(vma_next->anon_vma, &dummy_anon_vma);
-	ASSERT_TRUE(vma_write_started(vma_next));
-	ASSERT_EQ(mm.map_count, 1);
-	if (middle_is_sticky || next_is_sticky)
-		ASSERT_TRUE(IS_SET(vma_next->vm_flags, VM_STICKY));
-
-	/* Clear down and reset. We should have deleted vma. */
-	ASSERT_EQ(cleanup_mm(&mm, &vmi), 1);
-
-	/*
-	 * Merge left case - partial span.
-	 *
-	 *    <->
-	 * 0123456789
-	 * PPPVVVV
-	 *            ->
-	 * 0123456789
-	 * PPPPPPV
-	 */
-	vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, prev_flags);
-	vma_prev->vm_ops = &vm_ops; /* This should have no impact. */
-	vma = alloc_and_link_vma(&mm, 0x3000, 0x7000, 3, vm_flags);
-	vma->vm_ops = &vm_ops; /* This should have no impact. */
-	vmg_set_range_anon_vma(&vmg, 0x3000, 0x6000, 3, vm_flags, &dummy_anon_vma);
-	vmg.prev = vma_prev;
-	vmg.middle = vma;
-	vma_set_dummy_anon_vma(vma, &avc);
-	ASSERT_EQ(merge_existing(&vmg), vma_prev);
-	ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS);
-	ASSERT_EQ(vma_prev->vm_start, 0);
-	ASSERT_EQ(vma_prev->vm_end, 0x6000);
-	ASSERT_EQ(vma_prev->vm_pgoff, 0);
-	ASSERT_EQ(vma_prev->anon_vma, &dummy_anon_vma);
-	ASSERT_EQ(vma->vm_start, 0x6000);
-	ASSERT_EQ(vma->vm_end, 0x7000);
-	ASSERT_EQ(vma->vm_pgoff, 6);
-	ASSERT_TRUE(vma_write_started(vma_prev));
-	ASSERT_TRUE(vma_write_started(vma));
-	ASSERT_EQ(mm.map_count, 2);
-	if (prev_is_sticky || middle_is_sticky)
-		ASSERT_TRUE(IS_SET(vma_prev->vm_flags, VM_STICKY));
-
-	/* Clear down and reset. */
-	ASSERT_EQ(cleanup_mm(&mm, &vmi), 2);
-
-	/*
-	 * Merge left case - full span.
-	 *
-	 *    <-->
-	 * 0123456789
-	 * PPPVVVV
-	 *            ->
-	 * 0123456789
-	 * PPPPPPP
-	 */
-	vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, prev_flags);
-	vma_prev->vm_ops = &vm_ops; /* This should have no impact. */
-	vma = alloc_and_link_vma(&mm, 0x3000, 0x7000, 3, vm_flags);
-	vmg_set_range_anon_vma(&vmg, 0x3000, 0x7000, 3, vm_flags, &dummy_anon_vma);
-	vmg.prev = vma_prev;
-	vmg.middle = vma;
-	vma_set_dummy_anon_vma(vma, &avc);
-	ASSERT_EQ(merge_existing(&vmg), vma_prev);
-	ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS);
-	ASSERT_EQ(vma_prev->vm_start, 0);
-	ASSERT_EQ(vma_prev->vm_end, 0x7000);
-	ASSERT_EQ(vma_prev->vm_pgoff, 0);
-	ASSERT_EQ(vma_prev->anon_vma, &dummy_anon_vma);
-	ASSERT_TRUE(vma_write_started(vma_prev));
-	ASSERT_EQ(mm.map_count, 1);
-	if (prev_is_sticky || middle_is_sticky)
-		ASSERT_TRUE(IS_SET(vma_prev->vm_flags, VM_STICKY));
-
-	/* Clear down and reset. We should have deleted vma. */
-	ASSERT_EQ(cleanup_mm(&mm, &vmi), 1);
-
-	/*
-	 * Merge both case.
-	 *
-	 *    <-->
-	 * 0123456789
-	 * PPPVVVVNNN
-	 *             ->
-	 * 0123456789
-	 * PPPPPPPPPP
-	 */
-	vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, prev_flags);
-	vma_prev->vm_ops = &vm_ops; /* This should have no impact. */
-	vma = alloc_and_link_vma(&mm, 0x3000, 0x7000, 3, vm_flags);
-	vma_next = alloc_and_link_vma(&mm, 0x7000, 0x9000, 7, next_flags);
-	vmg_set_range_anon_vma(&vmg, 0x3000, 0x7000, 3, vm_flags, &dummy_anon_vma);
-	vmg.prev = vma_prev;
-	vmg.middle = vma;
-	vma_set_dummy_anon_vma(vma, &avc);
-	ASSERT_EQ(merge_existing(&vmg), vma_prev);
-	ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS);
-	ASSERT_EQ(vma_prev->vm_start, 0);
-	ASSERT_EQ(vma_prev->vm_end, 0x9000);
-	ASSERT_EQ(vma_prev->vm_pgoff, 0);
-	ASSERT_EQ(vma_prev->anon_vma, &dummy_anon_vma);
-	ASSERT_TRUE(vma_write_started(vma_prev));
-	ASSERT_EQ(mm.map_count, 1);
-	if (prev_is_sticky || middle_is_sticky || next_is_sticky)
-		ASSERT_TRUE(IS_SET(vma_prev->vm_flags, VM_STICKY));
-
-	/* Clear down and reset. We should have deleted prev and next. */
-	ASSERT_EQ(cleanup_mm(&mm, &vmi), 1);
-
-	/*
-	 * Non-merge ranges. the modified VMA merge operation assumes that the
-	 * caller always specifies ranges within the input VMA so we need only
-	 * examine these cases.
-	 *
-	 *     -
-	 *      -
-	 *       -
-	 *     <->
-	 *     <>
-	 *      <>
-	 * 0123456789a
-	 * PPPVVVVVNNN
-	 */
-
-	vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, prev_flags);
-	vma = alloc_and_link_vma(&mm, 0x3000, 0x8000, 3, vm_flags);
-	vma_next = alloc_and_link_vma(&mm, 0x8000, 0xa000, 8, next_flags);
-
-	vmg_set_range(&vmg, 0x4000, 0x5000, 4, vm_flags);
-	vmg.prev = vma;
-	vmg.middle = vma;
-	ASSERT_EQ(merge_existing(&vmg), NULL);
-	ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE);
-
-	vmg_set_range(&vmg, 0x5000, 0x6000, 5, vm_flags);
-	vmg.prev = vma;
-	vmg.middle = vma;
-	ASSERT_EQ(merge_existing(&vmg), NULL);
-	ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE);
-
-	vmg_set_range(&vmg, 0x6000, 0x7000, 6, vm_flags);
-	vmg.prev = vma;
-	vmg.middle = vma;
-	ASSERT_EQ(merge_existing(&vmg), NULL);
-	ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE);
-
-	vmg_set_range(&vmg, 0x4000, 0x7000, 4, vm_flags);
-	vmg.prev = vma;
-	vmg.middle = vma;
-	ASSERT_EQ(merge_existing(&vmg), NULL);
-	ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE);
-
-	vmg_set_range(&vmg, 0x4000, 0x6000, 4, vm_flags);
-	vmg.prev = vma;
-	vmg.middle = vma;
-	ASSERT_EQ(merge_existing(&vmg), NULL);
-	ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE);
-
-	vmg_set_range(&vmg, 0x5000, 0x6000, 5, vm_flags);
-	vmg.prev = vma;
-	vmg.middle = vma;
-	ASSERT_EQ(merge_existing(&vmg), NULL);
-	ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE);
-
-	ASSERT_EQ(cleanup_mm(&mm, &vmi), 3);
-
-	return true;
-}
-
-static bool test_merge_existing(void)
-{
-	int i, j, k;
-
-	/* Generate every possible permutation of sticky flags. */
-	for (i = 0; i < 2; i++)
-		for (j = 0; j < 2; j++)
-			for (k = 0; k < 2; k++)
-				ASSERT_TRUE(__test_merge_existing(i, j, k));
-
-	return true;
-}
-
-static bool test_anon_vma_non_mergeable(void)
-{
-	vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
-	struct mm_struct mm = {};
-	VMA_ITERATOR(vmi, &mm, 0);
-	struct vm_area_struct *vma, *vma_prev, *vma_next;
-	struct vma_merge_struct vmg = {
-		.mm = &mm,
-		.vmi = &vmi,
-	};
-	struct anon_vma_chain dummy_anon_vma_chain_1 = {};
-	struct anon_vma_chain dummy_anon_vma_chain_2 = {};
-	struct anon_vma dummy_anon_vma_2;
-
-	/*
-	 * In the case of modified VMA merge, merging both left and right VMAs
-	 * but where prev and next have incompatible anon_vma objects, we revert
-	 * to a merge of prev and VMA:
-	 *
-	 *    <-->
-	 * 0123456789
-	 * PPPVVVVNNN
-	 *            ->
-	 * 0123456789
-	 * PPPPPPPNNN
-	 */
-	vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags);
-	vma = alloc_and_link_vma(&mm, 0x3000, 0x7000, 3, vm_flags);
-	vma_next = alloc_and_link_vma(&mm, 0x7000, 0x9000, 7, vm_flags);
-
-	/*
-	 * Give both prev and next single anon_vma_chain fields, so they will
-	 * merge with the NULL vmg->anon_vma.
-	 *
-	 * However, when prev is compared to next, the merge should fail.
-	 */
-	vmg_set_range_anon_vma(&vmg, 0x3000, 0x7000, 3, vm_flags, NULL);
-	vmg.prev = vma_prev;
-	vmg.middle = vma;
-	vma_set_dummy_anon_vma(vma_prev, &dummy_anon_vma_chain_1);
-	__vma_set_dummy_anon_vma(vma_next, &dummy_anon_vma_chain_2, &dummy_anon_vma_2);
-
-	ASSERT_EQ(merge_existing(&vmg), vma_prev);
-	ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS);
-	ASSERT_EQ(vma_prev->vm_start, 0);
-	ASSERT_EQ(vma_prev->vm_end, 0x7000);
-	ASSERT_EQ(vma_prev->vm_pgoff, 0);
-	ASSERT_TRUE(vma_write_started(vma_prev));
-	ASSERT_FALSE(vma_write_started(vma_next));
-
-	/* Clear down and reset. */
-	ASSERT_EQ(cleanup_mm(&mm, &vmi), 2);
-
-	/*
-	 * Now consider the new VMA case. This is equivalent, only adding a new
-	 * VMA in a gap between prev and next.
-	 *
-	 *    <-->
-	 * 0123456789
-	 * PPP****NNN
-	 *            ->
-	 * 0123456789
-	 * PPPPPPPNNN
-	 */
-	vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags);
-	vma_next = alloc_and_link_vma(&mm, 0x7000, 0x9000, 7, vm_flags);
-
-	vmg_set_range_anon_vma(&vmg, 0x3000, 0x7000, 3, vm_flags, NULL);
-	vmg.prev = vma_prev;
-	vma_set_dummy_anon_vma(vma_prev, &dummy_anon_vma_chain_1);
-	__vma_set_dummy_anon_vma(vma_next, &dummy_anon_vma_chain_2, &dummy_anon_vma_2);
-
-	vmg.anon_vma = NULL;
-	ASSERT_EQ(merge_new(&vmg), vma_prev);
-	ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS);
-	ASSERT_EQ(vma_prev->vm_start, 0);
-	ASSERT_EQ(vma_prev->vm_end, 0x7000);
-	ASSERT_EQ(vma_prev->vm_pgoff, 0);
-	ASSERT_TRUE(vma_write_started(vma_prev));
-	ASSERT_FALSE(vma_write_started(vma_next));
-
-	/* Final cleanup. */
-	ASSERT_EQ(cleanup_mm(&mm, &vmi), 2);
-
-	return true;
-}
-
-static bool test_dup_anon_vma(void)
-{
-	vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
-	struct mm_struct mm = {};
-	VMA_ITERATOR(vmi, &mm, 0);
-	struct vma_merge_struct vmg = {
-		.mm = &mm,
-		.vmi = &vmi,
-	};
-	struct anon_vma_chain dummy_anon_vma_chain = {
-		.anon_vma = &dummy_anon_vma,
-	};
-	struct vm_area_struct *vma_prev, *vma_next, *vma;
-
-	reset_dummy_anon_vma();
-
-	/*
-	 * Expanding a VMA delete the next one duplicates next's anon_vma and
-	 * assigns it to the expanded VMA.
-	 *
-	 * This covers new VMA merging, as these operations amount to a VMA
-	 * expand.
-	 */
-	vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags);
-	vma_next = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, vm_flags);
-	vma_next->anon_vma = &dummy_anon_vma;
-
-	vmg_set_range(&vmg, 0, 0x5000, 0, vm_flags);
-	vmg.target = vma_prev;
-	vmg.next = vma_next;
-
-	ASSERT_EQ(expand_existing(&vmg), 0);
-
-	/* Will have been cloned. */
-	ASSERT_EQ(vma_prev->anon_vma, &dummy_anon_vma);
-	ASSERT_TRUE(vma_prev->anon_vma->was_cloned);
-
-	/* Cleanup ready for next run. */
-	cleanup_mm(&mm, &vmi);
-
-	/*
-	 * next has anon_vma, we assign to prev.
-	 *
-	 *         |<----->|
-	 * |-------*********-------|
-	 *   prev     vma     next
-	 *  extend   delete  delete
-	 */
-
-	vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags);
-	vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, vm_flags);
-	vma_next = alloc_and_link_vma(&mm, 0x5000, 0x8000, 5, vm_flags);
-
-	/* Initialise avc so mergeability check passes. */
-	INIT_LIST_HEAD(&vma_next->anon_vma_chain);
-	list_add(&dummy_anon_vma_chain.same_vma, &vma_next->anon_vma_chain);
-
-	vma_next->anon_vma = &dummy_anon_vma;
-	vmg_set_range(&vmg, 0x3000, 0x5000, 3, vm_flags);
-	vmg.prev = vma_prev;
-	vmg.middle = vma;
-
-	ASSERT_EQ(merge_existing(&vmg), vma_prev);
-	ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS);
-
-	ASSERT_EQ(vma_prev->vm_start, 0);
-	ASSERT_EQ(vma_prev->vm_end, 0x8000);
-
-	ASSERT_EQ(vma_prev->anon_vma, &dummy_anon_vma);
-	ASSERT_TRUE(vma_prev->anon_vma->was_cloned);
-
-	cleanup_mm(&mm, &vmi);
-
-	/*
-	 * vma has anon_vma, we assign to prev.
-	 *
-	 *         |<----->|
-	 * |-------*********-------|
-	 *   prev     vma     next
-	 *  extend   delete  delete
-	 */
-
-	vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags);
-	vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, vm_flags);
-	vma_next = alloc_and_link_vma(&mm, 0x5000, 0x8000, 5, vm_flags);
-	vmg.anon_vma = &dummy_anon_vma;
-	vma_set_dummy_anon_vma(vma, &dummy_anon_vma_chain);
-	vmg_set_range(&vmg, 0x3000, 0x5000, 3, vm_flags);
-	vmg.prev = vma_prev;
-	vmg.middle = vma;
-
-	ASSERT_EQ(merge_existing(&vmg), vma_prev);
-	ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS);
-
-	ASSERT_EQ(vma_prev->vm_start, 0);
-	ASSERT_EQ(vma_prev->vm_end, 0x8000);
-
-	ASSERT_EQ(vma_prev->anon_vma, &dummy_anon_vma);
-	ASSERT_TRUE(vma_prev->anon_vma->was_cloned);
-
-	cleanup_mm(&mm, &vmi);
-
-	/*
-	 * vma has anon_vma, we assign to prev.
-	 *
-	 *         |<----->|
-	 * |-------*************
-	 *   prev       vma
-	 *  extend shrink/delete
-	 */
-
-	vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags);
-	vma = alloc_and_link_vma(&mm, 0x3000, 0x8000, 3, vm_flags);
-
-	vma_set_dummy_anon_vma(vma, &dummy_anon_vma_chain);
-	vmg_set_range(&vmg, 0x3000, 0x5000, 3, vm_flags);
-	vmg.prev = vma_prev;
-	vmg.middle = vma;
-
-	ASSERT_EQ(merge_existing(&vmg), vma_prev);
-	ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS);
-
-	ASSERT_EQ(vma_prev->vm_start, 0);
-	ASSERT_EQ(vma_prev->vm_end, 0x5000);
-
-	ASSERT_EQ(vma_prev->anon_vma, &dummy_anon_vma);
-	ASSERT_TRUE(vma_prev->anon_vma->was_cloned);
-
-	cleanup_mm(&mm, &vmi);
-
-	/*
-	 * vma has anon_vma, we assign to next.
-	 *
-	 *     |<----->|
-	 * *************-------|
-	 *      vma       next
-	 * shrink/delete extend
-	 */
-
-	vma = alloc_and_link_vma(&mm, 0, 0x5000, 0, vm_flags);
-	vma_next = alloc_and_link_vma(&mm, 0x5000, 0x8000, 5, vm_flags);
-
-	vma_set_dummy_anon_vma(vma, &dummy_anon_vma_chain);
-	vmg_set_range(&vmg, 0x3000, 0x5000, 3, vm_flags);
-	vmg.prev = vma;
-	vmg.middle = vma;
-
-	ASSERT_EQ(merge_existing(&vmg), vma_next);
-	ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS);
-
-	ASSERT_EQ(vma_next->vm_start, 0x3000);
-	ASSERT_EQ(vma_next->vm_end, 0x8000);
-
-	ASSERT_EQ(vma_next->anon_vma, &dummy_anon_vma);
-	ASSERT_TRUE(vma_next->anon_vma->was_cloned);
-
-	cleanup_mm(&mm, &vmi);
-	return true;
-}
-
-static bool test_vmi_prealloc_fail(void)
-{
-	vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
-	struct mm_struct mm = {};
-	VMA_ITERATOR(vmi, &mm, 0);
-	struct vma_merge_struct vmg = {
-		.mm = &mm,
-		.vmi = &vmi,
-	};
-	struct anon_vma_chain avc = {};
-	struct vm_area_struct *vma_prev, *vma;
-
-	/*
-	 * We are merging vma into prev, with vma possessing an anon_vma, which
-	 * will be duplicated. We cause the vmi preallocation to fail and assert
-	 * the duplicated anon_vma is unlinked.
-	 */
-
-	vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags);
-	vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, vm_flags);
-	vma->anon_vma = &dummy_anon_vma;
-
-	vmg_set_range_anon_vma(&vmg, 0x3000, 0x5000, 3, vm_flags, &dummy_anon_vma);
-	vmg.prev = vma_prev;
-	vmg.middle = vma;
-	vma_set_dummy_anon_vma(vma, &avc);
-
-	fail_prealloc = true;
-
-	/* This will cause the merge to fail. */
-	ASSERT_EQ(merge_existing(&vmg), NULL);
-	ASSERT_EQ(vmg.state, VMA_MERGE_ERROR_NOMEM);
-	/* We will already have assigned the anon_vma. */
-	ASSERT_EQ(vma_prev->anon_vma, &dummy_anon_vma);
-	/* And it was both cloned and unlinked. */
-	ASSERT_TRUE(dummy_anon_vma.was_cloned);
-	ASSERT_TRUE(dummy_anon_vma.was_unlinked);
-
-	cleanup_mm(&mm, &vmi); /* Resets fail_prealloc too. */
-
-	/*
-	 * We repeat the same operation for expanding a VMA, which is what new
-	 * VMA merging ultimately uses too. This asserts that unlinking is
-	 * performed in this case too.
-	 */
-
-	vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags);
-	vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, vm_flags);
-	vma->anon_vma = &dummy_anon_vma;
-
-	vmg_set_range(&vmg, 0, 0x5000, 3, vm_flags);
-	vmg.target = vma_prev;
-	vmg.next = vma;
-
-	fail_prealloc = true;
-	ASSERT_EQ(expand_existing(&vmg), -ENOMEM);
-	ASSERT_EQ(vmg.state, VMA_MERGE_ERROR_NOMEM);
-
-	ASSERT_EQ(vma_prev->anon_vma, &dummy_anon_vma);
-	ASSERT_TRUE(dummy_anon_vma.was_cloned);
-	ASSERT_TRUE(dummy_anon_vma.was_unlinked);
-
-	cleanup_mm(&mm, &vmi);
-	return true;
-}
-
-static bool test_merge_extend(void)
-{
-	vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
-	struct mm_struct mm = {};
-	VMA_ITERATOR(vmi, &mm, 0x1000);
-	struct vm_area_struct *vma;
-
-	vma = alloc_and_link_vma(&mm, 0, 0x1000, 0, vm_flags);
-	alloc_and_link_vma(&mm, 0x3000, 0x4000, 3, vm_flags);
-
-	/*
-	 * Extend a VMA into the gap between itself and the following VMA.
-	 * This should result in a merge.
-	 *
-	 * <->
-	 * *  *
-	 *
-	 */
-
-	ASSERT_EQ(vma_merge_extend(&vmi, vma, 0x2000), vma);
-	ASSERT_EQ(vma->vm_start, 0);
-	ASSERT_EQ(vma->vm_end, 0x4000);
-	ASSERT_EQ(vma->vm_pgoff, 0);
-	ASSERT_TRUE(vma_write_started(vma));
-	ASSERT_EQ(mm.map_count, 1);
-
-	cleanup_mm(&mm, &vmi);
-	return true;
-}
-
-static bool test_copy_vma(void)
-{
-	vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
-	struct mm_struct mm = {};
-	bool need_locks = false;
-	VMA_ITERATOR(vmi, &mm, 0);
-	struct vm_area_struct *vma, *vma_new, *vma_next;
-
-	/* Move backwards and do not merge. */
-
-	vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, vm_flags);
-	vma_new = copy_vma(&vma, 0, 0x2000, 0, &need_locks);
-	ASSERT_NE(vma_new, vma);
-	ASSERT_EQ(vma_new->vm_start, 0);
-	ASSERT_EQ(vma_new->vm_end, 0x2000);
-	ASSERT_EQ(vma_new->vm_pgoff, 0);
-	vma_assert_attached(vma_new);
-
-	cleanup_mm(&mm, &vmi);
-
-	/* Move a VMA into position next to another and merge the two. */
-
-	vma = alloc_and_link_vma(&mm, 0, 0x2000, 0, vm_flags);
-	vma_next = alloc_and_link_vma(&mm, 0x6000, 0x8000, 6, vm_flags);
-	vma_new = copy_vma(&vma, 0x4000, 0x2000, 4, &need_locks);
-	vma_assert_attached(vma_new);
-
-	ASSERT_EQ(vma_new, vma_next);
-
-	cleanup_mm(&mm, &vmi);
-	return true;
-}
-
-static bool test_expand_only_mode(void)
-{
-	vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
-	struct mm_struct mm = {};
-	VMA_ITERATOR(vmi, &mm, 0);
-	struct vm_area_struct *vma_prev, *vma;
-	VMG_STATE(vmg, &mm, &vmi, 0x5000, 0x9000, vm_flags, 5);
-
-	/*
-	 * Place a VMA prior to the one we're expanding so we assert that we do
-	 * not erroneously try to traverse to the previous VMA even though we
-	 * have, through the use of the just_expand flag, indicated we do not
-	 * need to do so.
-	 */
-	alloc_and_link_vma(&mm, 0, 0x2000, 0, vm_flags);
-
-	/*
-	 * We will be positioned at the prev VMA, but looking to expand to
-	 * 0x9000.
-	 */
-	vma_iter_set(&vmi, 0x3000);
-	vma_prev = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, vm_flags);
-	vmg.prev = vma_prev;
-	vmg.just_expand = true;
-
-	vma = vma_merge_new_range(&vmg);
-	ASSERT_NE(vma, NULL);
-	ASSERT_EQ(vma, vma_prev);
-	ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS);
-	ASSERT_EQ(vma->vm_start, 0x3000);
-	ASSERT_EQ(vma->vm_end, 0x9000);
-	ASSERT_EQ(vma->vm_pgoff, 3);
-	ASSERT_TRUE(vma_write_started(vma));
-	ASSERT_EQ(vma_iter_addr(&vmi), 0x3000);
-	vma_assert_attached(vma);
-
-	cleanup_mm(&mm, &vmi);
-	return true;
-}
-
-static bool test_mmap_region_basic(void)
-{
-	struct mm_struct mm = {};
-	unsigned long addr;
-	struct vm_area_struct *vma;
-	VMA_ITERATOR(vmi, &mm, 0);
-
-	current->mm = &mm;
-
-	/* Map at 0x300000, length 0x3000. */
-	addr = __mmap_region(NULL, 0x300000, 0x3000,
-			     VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE,
-			     0x300, NULL);
-	ASSERT_EQ(addr, 0x300000);
-
-	/* Map at 0x250000, length 0x3000. */
-	addr = __mmap_region(NULL, 0x250000, 0x3000,
-			     VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE,
-			     0x250, NULL);
-	ASSERT_EQ(addr, 0x250000);
-
-	/* Map at 0x303000, merging to 0x300000 of length 0x6000. */
-	addr = __mmap_region(NULL, 0x303000, 0x3000,
-			     VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE,
-			     0x303, NULL);
-	ASSERT_EQ(addr, 0x303000);
-
-	/* Map at 0x24d000, merging to 0x250000 of length 0x6000. */
-	addr = __mmap_region(NULL, 0x24d000, 0x3000,
-			     VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE,
-			     0x24d, NULL);
-	ASSERT_EQ(addr, 0x24d000);
-
-	ASSERT_EQ(mm.map_count, 2);
-
-	for_each_vma(vmi, vma) {
-		if (vma->vm_start == 0x300000) {
-			ASSERT_EQ(vma->vm_end, 0x306000);
-			ASSERT_EQ(vma->vm_pgoff, 0x300);
-		} else if (vma->vm_start == 0x24d000) {
-			ASSERT_EQ(vma->vm_end, 0x253000);
-			ASSERT_EQ(vma->vm_pgoff, 0x24d);
-		} else {
-			ASSERT_FALSE(true);
-		}
-	}
-
-	cleanup_mm(&mm, &vmi);
-	return true;
-}
-
-int main(void)
-{
-	int num_tests = 0, num_fail = 0;
-
-	maple_tree_init();
-	vma_state_init();
-
-#define TEST(name)							\
-	do {								\
-		num_tests++;						\
-		if (!test_##name()) {					\
-			num_fail++;					\
-			fprintf(stderr, "Test " #name " FAILED\n");	\
-		}							\
-	} while (0)
-
-	/* Very simple tests to kick the tyres. */
-	TEST(simple_merge);
-	TEST(simple_modify);
-	TEST(simple_expand);
-	TEST(simple_shrink);
-
-	TEST(merge_new);
-	TEST(vma_merge_special_flags);
-	TEST(vma_merge_with_close);
-	TEST(vma_merge_new_with_close);
-	TEST(merge_existing);
-	TEST(anon_vma_non_mergeable);
-	TEST(dup_anon_vma);
-	TEST(vmi_prealloc_fail);
-	TEST(merge_extend);
-	TEST(copy_vma);
-	TEST(expand_only_mode);
-
-	TEST(mmap_region_basic);
-
-#undef TEST
-
-	printf("%d tests run, %d passed, %d failed.\n",
-	       num_tests, num_tests - num_fail, num_fail);
-
-	return num_fail == 0 ? EXIT_SUCCESS : EXIT_FAILURE;
-}
diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
index 2743f12ecf32..b48ebae3927d 100644
--- a/tools/testing/vma/vma_internal.h
+++ b/tools/testing/vma/vma_internal.h
@@ -1127,15 +1127,6 @@ static inline void mapping_allow_writable(struct address_space *mapping)
 	atomic_inc(&mapping->i_mmap_writable);
 }
 
-static inline void vma_set_range(struct vm_area_struct *vma,
-				 unsigned long start, unsigned long end,
-				 pgoff_t pgoff)
-{
-	vma->vm_start = start;
-	vma->vm_end = end;
-	vma->vm_pgoff = pgoff;
-}
-
 static inline
 struct vm_area_struct *vma_find(struct vma_iterator *vmi, unsigned long max)
 {
-- 
cgit v1.2.3


From a1f0dacaaba14c7f949f5c6ab876944034620904 Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Date: Thu, 22 Jan 2026 16:06:21 +0000
Subject: tools/testing/vma: separate out vma_internal.h into logical headers

The vma_internal.h file is becoming entirely unmanageable.  It combines
duplicated kernel implementation logic that needs to be kept in-sync with
the kernel, stubbed out declarations that we simply ignore for testing
purposes and custom logic added to aid testing.

If we separate each of the three things into separate headers it makes
things far more manageable, so do so:

* include/stubs.h  contains the stubbed declarations,
* include/dup.h    contains the duplicated kernel declarations, and
* include/custom.h contains declarations customised for testing.

[lorenzo.stoakes@oracle.com: avoid a duplicate struct define]
  Link: https://lkml.kernel.org/r/1e032732-61c3-485c-9aa7-6a09016fefc1@lucifer.local
Link: https://lkml.kernel.org/r/dd57baf5b5986cb96a167150ac712cbe804b63ee.1769097829.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Barry Song <baohua@kernel.org>
Cc: David Hildenbrand <david@kernel.org>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Zi Yan <ziy@nvidia.com>
Cc: Damien Le Moal <dlemoal@kernel.org>
Cc: "Darrick J. Wong" <djwong@kernel.org>
Cc: Jarkko Sakkinen <jarkko@kernel.org>
Cc: Yury Norov <ynorov@nvidia.com>
Cc: Chris Mason <clm@fb.com>
Cc: Pedro Falcato <pfalcato@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/testing/vma/Makefile         |    2 +-
 tools/testing/vma/include/custom.h |  103 ++
 tools/testing/vma/include/dup.h    | 1329 +++++++++++++++++++++++++
 tools/testing/vma/include/stubs.h  |  428 ++++++++
 tools/testing/vma/vma_internal.h   | 1936 +-----------------------------------
 5 files changed, 1873 insertions(+), 1925 deletions(-)
 create mode 100644 tools/testing/vma/include/custom.h
 create mode 100644 tools/testing/vma/include/dup.h
 create mode 100644 tools/testing/vma/include/stubs.h

(limited to 'tools')

diff --git a/tools/testing/vma/Makefile b/tools/testing/vma/Makefile
index 94133d9d3955..50aa4301b3a6 100644
--- a/tools/testing/vma/Makefile
+++ b/tools/testing/vma/Makefile
@@ -9,7 +9,7 @@ include ../shared/shared.mk
 OFILES = $(SHARED_OFILES) main.o shared.o maple-shim.o
 TARGETS = vma
 
-main.o: main.c shared.c shared.h vma_internal.h tests/merge.c tests/mmap.c tests/vma.c ../../../mm/vma.c ../../../mm/vma_init.c ../../../mm/vma_exec.c ../../../mm/vma.h
+main.o: main.c shared.c shared.h vma_internal.h tests/merge.c tests/mmap.c tests/vma.c ../../../mm/vma.c ../../../mm/vma_init.c ../../../mm/vma_exec.c ../../../mm/vma.h include/custom.h include/dup.h include/stubs.h
 
 vma:	$(OFILES)
 	$(CC) $(CFLAGS) -o $@ $(OFILES) $(LDLIBS)
diff --git a/tools/testing/vma/include/custom.h b/tools/testing/vma/include/custom.h
new file mode 100644
index 000000000000..f567127efba9
--- /dev/null
+++ b/tools/testing/vma/include/custom.h
@@ -0,0 +1,103 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+
+#pragma once
+
+/*
+ * Contains declarations that exist in the kernel which have been CUSTOMISED for
+ * testing purposes to faciliate userland VMA testing.
+ */
+
+#ifdef CONFIG_MMU
+extern unsigned long mmap_min_addr;
+extern unsigned long dac_mmap_min_addr;
+#else
+#define mmap_min_addr		0UL
+#define dac_mmap_min_addr	0UL
+#endif
+
+#define VM_WARN_ON(_expr) (WARN_ON(_expr))
+#define VM_WARN_ON_ONCE(_expr) (WARN_ON_ONCE(_expr))
+#define VM_WARN_ON_VMG(_expr, _vmg) (WARN_ON(_expr))
+#define VM_BUG_ON(_expr) (BUG_ON(_expr))
+#define VM_BUG_ON_VMA(_expr, _vma) (BUG_ON(_expr))
+
+/* We hardcode this for now. */
+#define sysctl_max_map_count 0x1000000UL
+
+#define TASK_SIZE ((1ul << 47)-PAGE_SIZE)
+
+/*
+ * The shared stubs do not implement this, it amounts to an fprintf(STDERR,...)
+ * either way :)
+ */
+#define pr_warn_once pr_err
+
+#define pgtable_supports_soft_dirty() 1
+
+struct anon_vma {
+	struct anon_vma *root;
+	struct rb_root_cached rb_root;
+
+	/* Test fields. */
+	bool was_cloned;
+	bool was_unlinked;
+};
+
+static inline void unlink_anon_vmas(struct vm_area_struct *vma)
+{
+	/* For testing purposes, indicate that the anon_vma was unlinked. */
+	vma->anon_vma->was_unlinked = true;
+}
+
+static inline void vma_start_write(struct vm_area_struct *vma)
+{
+	/* Used to indicate to tests that a write operation has begun. */
+	vma->vm_lock_seq++;
+}
+
+static inline __must_check
+int vma_start_write_killable(struct vm_area_struct *vma)
+{
+	/* Used to indicate to tests that a write operation has begun. */
+	vma->vm_lock_seq++;
+	return 0;
+}
+
+static inline int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src,
+				 enum vma_operation operation)
+{
+	/* For testing purposes. We indicate that an anon_vma has been cloned. */
+	if (src->anon_vma != NULL) {
+		dst->anon_vma = src->anon_vma;
+		dst->anon_vma->was_cloned = true;
+	}
+
+	return 0;
+}
+
+static inline int __anon_vma_prepare(struct vm_area_struct *vma)
+{
+	struct anon_vma *anon_vma = calloc(1, sizeof(struct anon_vma));
+
+	if (!anon_vma)
+		return -ENOMEM;
+
+	anon_vma->root = anon_vma;
+	vma->anon_vma = anon_vma;
+
+	return 0;
+}
+
+static inline int anon_vma_prepare(struct vm_area_struct *vma)
+{
+	if (likely(vma->anon_vma))
+		return 0;
+
+	return __anon_vma_prepare(vma);
+}
+
+static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt)
+{
+	if (reset_refcnt)
+		refcount_set(&vma->vm_refcnt, 0);
+}
diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h
new file mode 100644
index 000000000000..0accfc296615
--- /dev/null
+++ b/tools/testing/vma/include/dup.h
@@ -0,0 +1,1329 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+
+#pragma once
+
+/* Forward declarations to avoid header cycle. */
+struct vm_area_struct;
+static inline void vma_start_write(struct vm_area_struct *vma);
+
+extern const struct vm_operations_struct vma_dummy_vm_ops;
+extern unsigned long stack_guard_gap;
+extern const struct vm_operations_struct vma_dummy_vm_ops;
+extern unsigned long rlimit(unsigned int limit);
+struct task_struct *get_current(void);
+
+#define MMF_HAS_MDWE	28
+#define current get_current()
+
+/*
+ * Define the task command name length as enum, then it can be visible to
+ * BPF programs.
+ */
+enum {
+	TASK_COMM_LEN = 16,
+};
+
+/* PARTIALLY implemented types. */
+struct mm_struct {
+	struct maple_tree mm_mt;
+	int map_count;			/* number of VMAs */
+	unsigned long total_vm;	   /* Total pages mapped */
+	unsigned long locked_vm;   /* Pages that have PG_mlocked set */
+	unsigned long data_vm;	   /* VM_WRITE & ~VM_SHARED & ~VM_STACK */
+	unsigned long exec_vm;	   /* VM_EXEC & ~VM_WRITE & ~VM_STACK */
+	unsigned long stack_vm;	   /* VM_STACK */
+
+	unsigned long def_flags;
+
+	mm_flags_t flags; /* Must use mm_flags_* helpers to access */
+};
+struct address_space {
+	struct rb_root_cached	i_mmap;
+	unsigned long		flags;
+	atomic_t		i_mmap_writable;
+};
+struct file_operations {
+	int (*mmap)(struct file *, struct vm_area_struct *);
+	int (*mmap_prepare)(struct vm_area_desc *);
+};
+struct file {
+	struct address_space	*f_mapping;
+	const struct file_operations	*f_op;
+};
+struct anon_vma_chain {
+	struct anon_vma *anon_vma;
+	struct list_head same_vma;
+};
+struct task_struct {
+	char comm[TASK_COMM_LEN];
+	pid_t pid;
+	struct mm_struct *mm;
+
+	/* Used for emulating ABI behavior of previous Linux versions: */
+	unsigned int			personality;
+};
+
+struct kref {
+	refcount_t refcount;
+};
+
+struct anon_vma_name {
+	struct kref kref;
+	/* The name needs to be at the end because it is dynamically sized. */
+	char name[];
+};
+
+/*
+ * Contains declarations that are DUPLICATED from kernel source in order to
+ * faciliate userland VMA testing.
+ *
+ * These must be kept in sync with kernel source.
+ */
+
+#define VMA_LOCK_OFFSET	0x40000000
+
+typedef struct { unsigned long v; } freeptr_t;
+
+#define VM_NONE		0x00000000
+
+typedef int __bitwise vma_flag_t;
+
+#define ACCESS_PRIVATE(p, member) ((p)->member)
+
+#define DECLARE_VMA_BIT(name, bitnum) \
+	VMA_ ## name ## _BIT = ((__force vma_flag_t)bitnum)
+#define DECLARE_VMA_BIT_ALIAS(name, aliased) \
+	VMA_ ## name ## _BIT = VMA_ ## aliased ## _BIT
+enum {
+	DECLARE_VMA_BIT(READ, 0),
+	DECLARE_VMA_BIT(WRITE, 1),
+	DECLARE_VMA_BIT(EXEC, 2),
+	DECLARE_VMA_BIT(SHARED, 3),
+	/* mprotect() hardcodes VM_MAYREAD >> 4 == VM_READ, and so for r/w/x bits. */
+	DECLARE_VMA_BIT(MAYREAD, 4),	/* limits for mprotect() etc. */
+	DECLARE_VMA_BIT(MAYWRITE, 5),
+	DECLARE_VMA_BIT(MAYEXEC, 6),
+	DECLARE_VMA_BIT(MAYSHARE, 7),
+	DECLARE_VMA_BIT(GROWSDOWN, 8),	/* general info on the segment */
+#ifdef CONFIG_MMU
+	DECLARE_VMA_BIT(UFFD_MISSING, 9),/* missing pages tracking */
+#else
+	/* nommu: R/O MAP_PRIVATE mapping that might overlay a file mapping */
+	DECLARE_VMA_BIT(MAYOVERLAY, 9),
+#endif /* CONFIG_MMU */
+	/* Page-ranges managed without "struct page", just pure PFN */
+	DECLARE_VMA_BIT(PFNMAP, 10),
+	DECLARE_VMA_BIT(MAYBE_GUARD, 11),
+	DECLARE_VMA_BIT(UFFD_WP, 12),	/* wrprotect pages tracking */
+	DECLARE_VMA_BIT(LOCKED, 13),
+	DECLARE_VMA_BIT(IO, 14),	/* Memory mapped I/O or similar */
+	DECLARE_VMA_BIT(SEQ_READ, 15),	/* App will access data sequentially */
+	DECLARE_VMA_BIT(RAND_READ, 16),	/* App will not benefit from clustered reads */
+	DECLARE_VMA_BIT(DONTCOPY, 17),	/* Do not copy this vma on fork */
+	DECLARE_VMA_BIT(DONTEXPAND, 18),/* Cannot expand with mremap() */
+	DECLARE_VMA_BIT(LOCKONFAULT, 19),/* Lock pages covered when faulted in */
+	DECLARE_VMA_BIT(ACCOUNT, 20),	/* Is a VM accounted object */
+	DECLARE_VMA_BIT(NORESERVE, 21),	/* should the VM suppress accounting */
+	DECLARE_VMA_BIT(HUGETLB, 22),	/* Huge TLB Page VM */
+	DECLARE_VMA_BIT(SYNC, 23),	/* Synchronous page faults */
+	DECLARE_VMA_BIT(ARCH_1, 24),	/* Architecture-specific flag */
+	DECLARE_VMA_BIT(WIPEONFORK, 25),/* Wipe VMA contents in child. */
+	DECLARE_VMA_BIT(DONTDUMP, 26),	/* Do not include in the core dump */
+	DECLARE_VMA_BIT(SOFTDIRTY, 27),	/* NOT soft dirty clean area */
+	DECLARE_VMA_BIT(MIXEDMAP, 28),	/* Can contain struct page and pure PFN pages */
+	DECLARE_VMA_BIT(HUGEPAGE, 29),	/* MADV_HUGEPAGE marked this vma */
+	DECLARE_VMA_BIT(NOHUGEPAGE, 30),/* MADV_NOHUGEPAGE marked this vma */
+	DECLARE_VMA_BIT(MERGEABLE, 31),	/* KSM may merge identical pages */
+	/* These bits are reused, we define specific uses below. */
+	DECLARE_VMA_BIT(HIGH_ARCH_0, 32),
+	DECLARE_VMA_BIT(HIGH_ARCH_1, 33),
+	DECLARE_VMA_BIT(HIGH_ARCH_2, 34),
+	DECLARE_VMA_BIT(HIGH_ARCH_3, 35),
+	DECLARE_VMA_BIT(HIGH_ARCH_4, 36),
+	DECLARE_VMA_BIT(HIGH_ARCH_5, 37),
+	DECLARE_VMA_BIT(HIGH_ARCH_6, 38),
+	/*
+	 * This flag is used to connect VFIO to arch specific KVM code. It
+	 * indicates that the memory under this VMA is safe for use with any
+	 * non-cachable memory type inside KVM. Some VFIO devices, on some
+	 * platforms, are thought to be unsafe and can cause machine crashes
+	 * if KVM does not lock down the memory type.
+	 */
+	DECLARE_VMA_BIT(ALLOW_ANY_UNCACHED, 39),
+#ifdef CONFIG_PPC32
+	DECLARE_VMA_BIT_ALIAS(DROPPABLE, ARCH_1),
+#else
+	DECLARE_VMA_BIT(DROPPABLE, 40),
+#endif
+	DECLARE_VMA_BIT(UFFD_MINOR, 41),
+	DECLARE_VMA_BIT(SEALED, 42),
+	/* Flags that reuse flags above. */
+	DECLARE_VMA_BIT_ALIAS(PKEY_BIT0, HIGH_ARCH_0),
+	DECLARE_VMA_BIT_ALIAS(PKEY_BIT1, HIGH_ARCH_1),
+	DECLARE_VMA_BIT_ALIAS(PKEY_BIT2, HIGH_ARCH_2),
+	DECLARE_VMA_BIT_ALIAS(PKEY_BIT3, HIGH_ARCH_3),
+	DECLARE_VMA_BIT_ALIAS(PKEY_BIT4, HIGH_ARCH_4),
+#if defined(CONFIG_X86_USER_SHADOW_STACK)
+	/*
+	 * VM_SHADOW_STACK should not be set with VM_SHARED because of lack of
+	 * support core mm.
+	 *
+	 * These VMAs will get a single end guard page. This helps userspace
+	 * protect itself from attacks. A single page is enough for current
+	 * shadow stack archs (x86). See the comments near alloc_shstk() in
+	 * arch/x86/kernel/shstk.c for more details on the guard size.
+	 */
+	DECLARE_VMA_BIT_ALIAS(SHADOW_STACK, HIGH_ARCH_5),
+#elif defined(CONFIG_ARM64_GCS)
+	/*
+	 * arm64's Guarded Control Stack implements similar functionality and
+	 * has similar constraints to shadow stacks.
+	 */
+	DECLARE_VMA_BIT_ALIAS(SHADOW_STACK, HIGH_ARCH_6),
+#endif
+	DECLARE_VMA_BIT_ALIAS(SAO, ARCH_1),		/* Strong Access Ordering (powerpc) */
+	DECLARE_VMA_BIT_ALIAS(GROWSUP, ARCH_1),		/* parisc */
+	DECLARE_VMA_BIT_ALIAS(SPARC_ADI, ARCH_1),	/* sparc64 */
+	DECLARE_VMA_BIT_ALIAS(ARM64_BTI, ARCH_1),	/* arm64 */
+	DECLARE_VMA_BIT_ALIAS(ARCH_CLEAR, ARCH_1),	/* sparc64, arm64 */
+	DECLARE_VMA_BIT_ALIAS(MAPPED_COPY, ARCH_1),	/* !CONFIG_MMU */
+	DECLARE_VMA_BIT_ALIAS(MTE, HIGH_ARCH_4),	/* arm64 */
+	DECLARE_VMA_BIT_ALIAS(MTE_ALLOWED, HIGH_ARCH_5),/* arm64 */
+#ifdef CONFIG_STACK_GROWSUP
+	DECLARE_VMA_BIT_ALIAS(STACK, GROWSUP),
+	DECLARE_VMA_BIT_ALIAS(STACK_EARLY, GROWSDOWN),
+#else
+	DECLARE_VMA_BIT_ALIAS(STACK, GROWSDOWN),
+#endif
+};
+
+#define INIT_VM_FLAG(name) BIT((__force int) VMA_ ## name ## _BIT)
+#define VM_READ		INIT_VM_FLAG(READ)
+#define VM_WRITE	INIT_VM_FLAG(WRITE)
+#define VM_EXEC		INIT_VM_FLAG(EXEC)
+#define VM_SHARED	INIT_VM_FLAG(SHARED)
+#define VM_MAYREAD	INIT_VM_FLAG(MAYREAD)
+#define VM_MAYWRITE	INIT_VM_FLAG(MAYWRITE)
+#define VM_MAYEXEC	INIT_VM_FLAG(MAYEXEC)
+#define VM_MAYSHARE	INIT_VM_FLAG(MAYSHARE)
+#define VM_GROWSDOWN	INIT_VM_FLAG(GROWSDOWN)
+#ifdef CONFIG_MMU
+#define VM_UFFD_MISSING	INIT_VM_FLAG(UFFD_MISSING)
+#else
+#define VM_UFFD_MISSING	VM_NONE
+#define VM_MAYOVERLAY	INIT_VM_FLAG(MAYOVERLAY)
+#endif
+#define VM_PFNMAP	INIT_VM_FLAG(PFNMAP)
+#define VM_MAYBE_GUARD	INIT_VM_FLAG(MAYBE_GUARD)
+#define VM_UFFD_WP	INIT_VM_FLAG(UFFD_WP)
+#define VM_LOCKED	INIT_VM_FLAG(LOCKED)
+#define VM_IO		INIT_VM_FLAG(IO)
+#define VM_SEQ_READ	INIT_VM_FLAG(SEQ_READ)
+#define VM_RAND_READ	INIT_VM_FLAG(RAND_READ)
+#define VM_DONTCOPY	INIT_VM_FLAG(DONTCOPY)
+#define VM_DONTEXPAND	INIT_VM_FLAG(DONTEXPAND)
+#define VM_LOCKONFAULT	INIT_VM_FLAG(LOCKONFAULT)
+#define VM_ACCOUNT	INIT_VM_FLAG(ACCOUNT)
+#define VM_NORESERVE	INIT_VM_FLAG(NORESERVE)
+#define VM_HUGETLB	INIT_VM_FLAG(HUGETLB)
+#define VM_SYNC		INIT_VM_FLAG(SYNC)
+#define VM_ARCH_1	INIT_VM_FLAG(ARCH_1)
+#define VM_WIPEONFORK	INIT_VM_FLAG(WIPEONFORK)
+#define VM_DONTDUMP	INIT_VM_FLAG(DONTDUMP)
+#ifdef CONFIG_MEM_SOFT_DIRTY
+#define VM_SOFTDIRTY	INIT_VM_FLAG(SOFTDIRTY)
+#else
+#define VM_SOFTDIRTY	VM_NONE
+#endif
+#define VM_MIXEDMAP	INIT_VM_FLAG(MIXEDMAP)
+#define VM_HUGEPAGE	INIT_VM_FLAG(HUGEPAGE)
+#define VM_NOHUGEPAGE	INIT_VM_FLAG(NOHUGEPAGE)
+#define VM_MERGEABLE	INIT_VM_FLAG(MERGEABLE)
+#define VM_STACK	INIT_VM_FLAG(STACK)
+#ifdef CONFIG_STACK_GROWS_UP
+#define VM_STACK_EARLY	INIT_VM_FLAG(STACK_EARLY)
+#else
+#define VM_STACK_EARLY	VM_NONE
+#endif
+#ifdef CONFIG_ARCH_HAS_PKEYS
+#define VM_PKEY_SHIFT ((__force int)VMA_HIGH_ARCH_0_BIT)
+/* Despite the naming, these are FLAGS not bits. */
+#define VM_PKEY_BIT0 INIT_VM_FLAG(PKEY_BIT0)
+#define VM_PKEY_BIT1 INIT_VM_FLAG(PKEY_BIT1)
+#define VM_PKEY_BIT2 INIT_VM_FLAG(PKEY_BIT2)
+#if CONFIG_ARCH_PKEY_BITS > 3
+#define VM_PKEY_BIT3 INIT_VM_FLAG(PKEY_BIT3)
+#else
+#define VM_PKEY_BIT3  VM_NONE
+#endif /* CONFIG_ARCH_PKEY_BITS > 3 */
+#if CONFIG_ARCH_PKEY_BITS > 4
+#define VM_PKEY_BIT4 INIT_VM_FLAG(PKEY_BIT4)
+#else
+#define VM_PKEY_BIT4  VM_NONE
+#endif /* CONFIG_ARCH_PKEY_BITS > 4 */
+#endif /* CONFIG_ARCH_HAS_PKEYS */
+#if defined(CONFIG_X86_USER_SHADOW_STACK) || defined(CONFIG_ARM64_GCS)
+#define VM_SHADOW_STACK	INIT_VM_FLAG(SHADOW_STACK)
+#else
+#define VM_SHADOW_STACK	VM_NONE
+#endif
+#if defined(CONFIG_PPC64)
+#define VM_SAO		INIT_VM_FLAG(SAO)
+#elif defined(CONFIG_PARISC)
+#define VM_GROWSUP	INIT_VM_FLAG(GROWSUP)
+#elif defined(CONFIG_SPARC64)
+#define VM_SPARC_ADI	INIT_VM_FLAG(SPARC_ADI)
+#define VM_ARCH_CLEAR	INIT_VM_FLAG(ARCH_CLEAR)
+#elif defined(CONFIG_ARM64)
+#define VM_ARM64_BTI	INIT_VM_FLAG(ARM64_BTI)
+#define VM_ARCH_CLEAR	INIT_VM_FLAG(ARCH_CLEAR)
+#elif !defined(CONFIG_MMU)
+#define VM_MAPPED_COPY	INIT_VM_FLAG(MAPPED_COPY)
+#endif
+#ifndef VM_GROWSUP
+#define VM_GROWSUP	VM_NONE
+#endif
+#ifdef CONFIG_ARM64_MTE
+#define VM_MTE		INIT_VM_FLAG(MTE)
+#define VM_MTE_ALLOWED	INIT_VM_FLAG(MTE_ALLOWED)
+#else
+#define VM_MTE		VM_NONE
+#define VM_MTE_ALLOWED	VM_NONE
+#endif
+#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
+#define VM_UFFD_MINOR	INIT_VM_FLAG(UFFD_MINOR)
+#else
+#define VM_UFFD_MINOR	VM_NONE
+#endif
+#ifdef CONFIG_64BIT
+#define VM_ALLOW_ANY_UNCACHED	INIT_VM_FLAG(ALLOW_ANY_UNCACHED)
+#define VM_SEALED		INIT_VM_FLAG(SEALED)
+#else
+#define VM_ALLOW_ANY_UNCACHED	VM_NONE
+#define VM_SEALED		VM_NONE
+#endif
+#if defined(CONFIG_64BIT) || defined(CONFIG_PPC32)
+#define VM_DROPPABLE		INIT_VM_FLAG(DROPPABLE)
+#else
+#define VM_DROPPABLE		VM_NONE
+#endif
+
+/* Bits set in the VMA until the stack is in its final location */
+#define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ | VM_STACK_EARLY)
+
+#define TASK_EXEC ((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0)
+
+/* Common data flag combinations */
+#define VM_DATA_FLAGS_TSK_EXEC	(VM_READ | VM_WRITE | TASK_EXEC | \
+				 VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
+#define VM_DATA_FLAGS_NON_EXEC	(VM_READ | VM_WRITE | VM_MAYREAD | \
+				 VM_MAYWRITE | VM_MAYEXEC)
+#define VM_DATA_FLAGS_EXEC	(VM_READ | VM_WRITE | VM_EXEC | \
+				 VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
+
+#ifndef VM_DATA_DEFAULT_FLAGS		/* arch can override this */
+#define VM_DATA_DEFAULT_FLAGS  VM_DATA_FLAGS_EXEC
+#endif
+
+#ifndef VM_STACK_DEFAULT_FLAGS		/* arch can override this */
+#define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS
+#endif
+
+#define VM_STARTGAP_FLAGS (VM_GROWSDOWN | VM_SHADOW_STACK)
+
+#define VM_STACK_FLAGS	(VM_STACK | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT)
+
+/* VMA basic access permission flags */
+#define VM_ACCESS_FLAGS (VM_READ | VM_WRITE | VM_EXEC)
+
+/*
+ * Special vmas that are non-mergable, non-mlock()able.
+ */
+#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP | VM_MIXEDMAP)
+
+#define DEFAULT_MAP_WINDOW	((1UL << 47) - PAGE_SIZE)
+#define TASK_SIZE_LOW		DEFAULT_MAP_WINDOW
+#define TASK_SIZE_MAX		DEFAULT_MAP_WINDOW
+#define STACK_TOP		TASK_SIZE_LOW
+#define STACK_TOP_MAX		TASK_SIZE_MAX
+
+/* This mask represents all the VMA flag bits used by mlock */
+#define VM_LOCKED_MASK	(VM_LOCKED | VM_LOCKONFAULT)
+
+#define TASK_EXEC ((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0)
+
+#define VM_DATA_FLAGS_TSK_EXEC	(VM_READ | VM_WRITE | TASK_EXEC | \
+				 VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
+
+#define RLIMIT_STACK		3	/* max stack size */
+#define RLIMIT_MEMLOCK		8	/* max locked-in-memory address space */
+
+#define CAP_IPC_LOCK         14
+
+#define VM_STICKY (VM_SOFTDIRTY | VM_MAYBE_GUARD)
+
+#define VM_IGNORE_MERGE VM_STICKY
+
+#define VM_COPY_ON_FORK (VM_PFNMAP | VM_MIXEDMAP | VM_UFFD_WP | VM_MAYBE_GUARD)
+
+#define pgprot_val(x)		((x).pgprot)
+#define __pgprot(x)		((pgprot_t) { (x) } )
+
+#define for_each_vma(__vmi, __vma)					\
+	while (((__vma) = vma_next(&(__vmi))) != NULL)
+
+/* The MM code likes to work with exclusive end addresses */
+#define for_each_vma_range(__vmi, __vma, __end)				\
+	while (((__vma) = vma_find(&(__vmi), (__end))) != NULL)
+
+#define offset_in_page(p)	((unsigned long)(p) & ~PAGE_MASK)
+
+#define PHYS_PFN(x)	((unsigned long)((x) >> PAGE_SHIFT))
+
+#define test_and_set_bit(nr, addr) __test_and_set_bit(nr, addr)
+#define test_and_clear_bit(nr, addr) __test_and_clear_bit(nr, addr)
+
+#define AS_MM_ALL_LOCKS 2
+
+#define swap(a, b) \
+	do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0)
+
+/*
+ * Flags for bug emulation.
+ *
+ * These occupy the top three bytes.
+ */
+enum {
+	READ_IMPLIES_EXEC =	0x0400000,
+};
+
+struct vma_iterator {
+	struct ma_state mas;
+};
+
+#define VMA_ITERATOR(name, __mm, __addr)				\
+	struct vma_iterator name = {					\
+		.mas = {						\
+			.tree = &(__mm)->mm_mt,				\
+			.index = __addr,				\
+			.node = NULL,					\
+			.status = ma_start,				\
+		},							\
+	}
+
+#define DEFINE_MUTEX(mutexname) \
+	struct mutex mutexname = {}
+
+#define DECLARE_BITMAP(name, bits) \
+	unsigned long name[BITS_TO_LONGS(bits)]
+
+#define EMPTY_VMA_FLAGS ((vma_flags_t){ })
+
+/* What action should be taken after an .mmap_prepare call is complete? */
+enum mmap_action_type {
+	MMAP_NOTHING,		/* Mapping is complete, no further action. */
+	MMAP_REMAP_PFN,		/* Remap PFN range. */
+	MMAP_IO_REMAP_PFN,	/* I/O remap PFN range. */
+};
+
+/*
+ * Describes an action an mmap_prepare hook can instruct to be taken to complete
+ * the mapping of a VMA. Specified in vm_area_desc.
+ */
+struct mmap_action {
+	union {
+		/* Remap range. */
+		struct {
+			unsigned long start;
+			unsigned long start_pfn;
+			unsigned long size;
+			pgprot_t pgprot;
+		} remap;
+	};
+	enum mmap_action_type type;
+
+	/*
+	 * If specified, this hook is invoked after the selected action has been
+	 * successfully completed. Note that the VMA write lock still held.
+	 *
+	 * The absolute minimum ought to be done here.
+	 *
+	 * Returns 0 on success, or an error code.
+	 */
+	int (*success_hook)(const struct vm_area_struct *vma);
+
+	/*
+	 * If specified, this hook is invoked when an error occurred when
+	 * attempting the selection action.
+	 *
+	 * The hook can return an error code in order to filter the error, but
+	 * it is not valid to clear the error here.
+	 */
+	int (*error_hook)(int err);
+
+	/*
+	 * This should be set in rare instances where the operation required
+	 * that the rmap should not be able to access the VMA until
+	 * completely set up.
+	 */
+	bool hide_from_rmap_until_complete :1;
+};
+
+/* Operations which modify VMAs. */
+enum vma_operation {
+	VMA_OP_SPLIT,
+	VMA_OP_MERGE_UNFAULTED,
+	VMA_OP_REMAP,
+	VMA_OP_FORK,
+};
+
+/*
+ * Describes a VMA that is about to be mmap()'ed. Drivers may choose to
+ * manipulate mutable fields which will cause those fields to be updated in the
+ * resultant VMA.
+ *
+ * Helper functions are not required for manipulating any field.
+ */
+struct vm_area_desc {
+	/* Immutable state. */
+	const struct mm_struct *const mm;
+	struct file *const file; /* May vary from vm_file in stacked callers. */
+	unsigned long start;
+	unsigned long end;
+
+	/* Mutable fields. Populated with initial state. */
+	pgoff_t pgoff;
+	struct file *vm_file;
+	union {
+		vm_flags_t vm_flags;
+		vma_flags_t vma_flags;
+	};
+	pgprot_t page_prot;
+
+	/* Write-only fields. */
+	const struct vm_operations_struct *vm_ops;
+	void *private_data;
+
+	/* Take further action? */
+	struct mmap_action action;
+};
+
+struct vm_area_struct {
+	/* The first cache line has the info for VMA tree walking. */
+
+	union {
+		struct {
+			/* VMA covers [vm_start; vm_end) addresses within mm */
+			unsigned long vm_start;
+			unsigned long vm_end;
+		};
+		freeptr_t vm_freeptr; /* Pointer used by SLAB_TYPESAFE_BY_RCU */
+	};
+
+	struct mm_struct *vm_mm;	/* The address space we belong to. */
+	pgprot_t vm_page_prot;          /* Access permissions of this VMA. */
+
+	/*
+	 * Flags, see mm.h.
+	 * To modify use vm_flags_{init|reset|set|clear|mod} functions.
+	 */
+	union {
+		const vm_flags_t vm_flags;
+		vma_flags_t flags;
+	};
+
+#ifdef CONFIG_PER_VMA_LOCK
+	/*
+	 * Can only be written (using WRITE_ONCE()) while holding both:
+	 *  - mmap_lock (in write mode)
+	 *  - vm_refcnt bit at VMA_LOCK_OFFSET is set
+	 * Can be read reliably while holding one of:
+	 *  - mmap_lock (in read or write mode)
+	 *  - vm_refcnt bit at VMA_LOCK_OFFSET is set or vm_refcnt > 1
+	 * Can be read unreliably (using READ_ONCE()) for pessimistic bailout
+	 * while holding nothing (except RCU to keep the VMA struct allocated).
+	 *
+	 * This sequence counter is explicitly allowed to overflow; sequence
+	 * counter reuse can only lead to occasional unnecessary use of the
+	 * slowpath.
+	 */
+	unsigned int vm_lock_seq;
+#endif
+
+	/*
+	 * A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma
+	 * list, after a COW of one of the file pages.	A MAP_SHARED vma
+	 * can only be in the i_mmap tree.  An anonymous MAP_PRIVATE, stack
+	 * or brk vma (with NULL file) can only be in an anon_vma list.
+	 */
+	struct list_head anon_vma_chain; /* Serialized by mmap_lock &
+					  * page_table_lock */
+	struct anon_vma *anon_vma;	/* Serialized by page_table_lock */
+
+	/* Function pointers to deal with this struct. */
+	const struct vm_operations_struct *vm_ops;
+
+	/* Information about our backing store: */
+	unsigned long vm_pgoff;		/* Offset (within vm_file) in PAGE_SIZE
+					   units */
+	struct file * vm_file;		/* File we map to (can be NULL). */
+	void * vm_private_data;		/* was vm_pte (shared mem) */
+
+#ifdef CONFIG_SWAP
+	atomic_long_t swap_readahead_info;
+#endif
+#ifndef CONFIG_MMU
+	struct vm_region *vm_region;	/* NOMMU mapping region */
+#endif
+#ifdef CONFIG_NUMA
+	struct mempolicy *vm_policy;	/* NUMA policy for the VMA */
+#endif
+#ifdef CONFIG_NUMA_BALANCING
+	struct vma_numab_state *numab_state;	/* NUMA Balancing state */
+#endif
+#ifdef CONFIG_PER_VMA_LOCK
+	/* Unstable RCU readers are allowed to read this. */
+	refcount_t vm_refcnt;
+#endif
+	/*
+	 * For areas with an address space and backing store,
+	 * linkage into the address_space->i_mmap interval tree.
+	 *
+	 */
+	struct {
+		struct rb_node rb;
+		unsigned long rb_subtree_last;
+	} shared;
+#ifdef CONFIG_ANON_VMA_NAME
+	/*
+	 * For private and shared anonymous mappings, a pointer to a null
+	 * terminated string containing the name given to the vma, or NULL if
+	 * unnamed. Serialized by mmap_lock. Use anon_vma_name to access.
+	 */
+	struct anon_vma_name *anon_name;
+#endif
+	struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
+} __randomize_layout;
+
+struct vm_operations_struct {
+	void (*open)(struct vm_area_struct * area);
+	/**
+	 * @close: Called when the VMA is being removed from the MM.
+	 * Context: User context.  May sleep.  Caller holds mmap_lock.
+	 */
+	void (*close)(struct vm_area_struct * area);
+	/* Called any time before splitting to check if it's allowed */
+	int (*may_split)(struct vm_area_struct *area, unsigned long addr);
+	int (*mremap)(struct vm_area_struct *area);
+	/*
+	 * Called by mprotect() to make driver-specific permission
+	 * checks before mprotect() is finalised.   The VMA must not
+	 * be modified.  Returns 0 if mprotect() can proceed.
+	 */
+	int (*mprotect)(struct vm_area_struct *vma, unsigned long start,
+			unsigned long end, unsigned long newflags);
+	vm_fault_t (*fault)(struct vm_fault *vmf);
+	vm_fault_t (*huge_fault)(struct vm_fault *vmf, unsigned int order);
+	vm_fault_t (*map_pages)(struct vm_fault *vmf,
+			pgoff_t start_pgoff, pgoff_t end_pgoff);
+	unsigned long (*pagesize)(struct vm_area_struct * area);
+
+	/* notification that a previously read-only page is about to become
+	 * writable, if an error is returned it will cause a SIGBUS */
+	vm_fault_t (*page_mkwrite)(struct vm_fault *vmf);
+
+	/* same as page_mkwrite when using VM_PFNMAP|VM_MIXEDMAP */
+	vm_fault_t (*pfn_mkwrite)(struct vm_fault *vmf);
+
+	/* called by access_process_vm when get_user_pages() fails, typically
+	 * for use by special VMAs. See also generic_access_phys() for a generic
+	 * implementation useful for any iomem mapping.
+	 */
+	int (*access)(struct vm_area_struct *vma, unsigned long addr,
+		      void *buf, int len, int write);
+
+	/* Called by the /proc/PID/maps code to ask the vma whether it
+	 * has a special name.  Returning non-NULL will also cause this
+	 * vma to be dumped unconditionally. */
+	const char *(*name)(struct vm_area_struct *vma);
+
+#ifdef CONFIG_NUMA
+	/*
+	 * set_policy() op must add a reference to any non-NULL @new mempolicy
+	 * to hold the policy upon return.  Caller should pass NULL @new to
+	 * remove a policy and fall back to surrounding context--i.e. do not
+	 * install a MPOL_DEFAULT policy, nor the task or system default
+	 * mempolicy.
+	 */
+	int (*set_policy)(struct vm_area_struct *vma, struct mempolicy *new);
+
+	/*
+	 * get_policy() op must add reference [mpol_get()] to any policy at
+	 * (vma,addr) marked as MPOL_SHARED.  The shared policy infrastructure
+	 * in mm/mempolicy.c will do this automatically.
+	 * get_policy() must NOT add a ref if the policy at (vma,addr) is not
+	 * marked as MPOL_SHARED. vma policies are protected by the mmap_lock.
+	 * If no [shared/vma] mempolicy exists at the addr, get_policy() op
+	 * must return NULL--i.e., do not "fallback" to task or system default
+	 * policy.
+	 */
+	struct mempolicy *(*get_policy)(struct vm_area_struct *vma,
+					unsigned long addr, pgoff_t *ilx);
+#endif
+#ifdef CONFIG_FIND_NORMAL_PAGE
+	/*
+	 * Called by vm_normal_page() for special PTEs in @vma at @addr. This
+	 * allows for returning a "normal" page from vm_normal_page() even
+	 * though the PTE indicates that the "struct page" either does not exist
+	 * or should not be touched: "special".
+	 *
+	 * Do not add new users: this really only works when a "normal" page
+	 * was mapped, but then the PTE got changed to something weird (+
+	 * marked special) that would not make pte_pfn() identify the originally
+	 * inserted page.
+	 */
+	struct page *(*find_normal_page)(struct vm_area_struct *vma,
+					 unsigned long addr);
+#endif /* CONFIG_FIND_NORMAL_PAGE */
+};
+
+struct vm_unmapped_area_info {
+#define VM_UNMAPPED_AREA_TOPDOWN 1
+	unsigned long flags;
+	unsigned long length;
+	unsigned long low_limit;
+	unsigned long high_limit;
+	unsigned long align_mask;
+	unsigned long align_offset;
+	unsigned long start_gap;
+};
+
+struct pagetable_move_control {
+	struct vm_area_struct *old; /* Source VMA. */
+	struct vm_area_struct *new; /* Destination VMA. */
+	unsigned long old_addr; /* Address from which the move begins. */
+	unsigned long old_end; /* Exclusive address at which old range ends. */
+	unsigned long new_addr; /* Address to move page tables to. */
+	unsigned long len_in; /* Bytes to remap specified by user. */
+
+	bool need_rmap_locks; /* Do rmap locks need to be taken? */
+	bool for_stack; /* Is this an early temp stack being moved? */
+};
+
+#define PAGETABLE_MOVE(name, old_, new_, old_addr_, new_addr_, len_)	\
+	struct pagetable_move_control name = {				\
+		.old = old_,						\
+		.new = new_,						\
+		.old_addr = old_addr_,					\
+		.old_end = (old_addr_) + (len_),			\
+		.new_addr = new_addr_,					\
+		.len_in = len_,						\
+	}
+
+static inline void vma_iter_invalidate(struct vma_iterator *vmi)
+{
+	mas_pause(&vmi->mas);
+}
+
+static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
+{
+	return __pgprot(pgprot_val(oldprot) | pgprot_val(newprot));
+}
+
+static inline pgprot_t vm_get_page_prot(vm_flags_t vm_flags)
+{
+	return __pgprot(vm_flags);
+}
+
+static inline bool mm_flags_test(int flag, const struct mm_struct *mm)
+{
+	return test_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags));
+}
+
+/*
+ * Copy value to the first system word of VMA flags, non-atomically.
+ *
+ * IMPORTANT: This does not overwrite bytes past the first system word. The
+ * caller must account for this.
+ */
+static inline void vma_flags_overwrite_word(vma_flags_t *flags, unsigned long value)
+{
+	*ACCESS_PRIVATE(flags, __vma_flags) = value;
+}
+
+/*
+ * Copy value to the first system word of VMA flags ONCE, non-atomically.
+ *
+ * IMPORTANT: This does not overwrite bytes past the first system word. The
+ * caller must account for this.
+ */
+static inline void vma_flags_overwrite_word_once(vma_flags_t *flags, unsigned long value)
+{
+	unsigned long *bitmap = ACCESS_PRIVATE(flags, __vma_flags);
+
+	WRITE_ONCE(*bitmap, value);
+}
+
+/* Update the first system word of VMA flags setting bits, non-atomically. */
+static inline void vma_flags_set_word(vma_flags_t *flags, unsigned long value)
+{
+	unsigned long *bitmap = ACCESS_PRIVATE(flags, __vma_flags);
+
+	*bitmap |= value;
+}
+
+/* Update the first system word of VMA flags clearing bits, non-atomically. */
+static inline void vma_flags_clear_word(vma_flags_t *flags, unsigned long value)
+{
+	unsigned long *bitmap = ACCESS_PRIVATE(flags, __vma_flags);
+
+	*bitmap &= ~value;
+}
+
+static inline void vma_flags_clear_all(vma_flags_t *flags)
+{
+	bitmap_zero(ACCESS_PRIVATE(flags, __vma_flags), NUM_VMA_FLAG_BITS);
+}
+
+static inline void vma_flag_set(vma_flags_t *flags, vma_flag_t bit)
+{
+	unsigned long *bitmap = ACCESS_PRIVATE(flags, __vma_flags);
+
+	__set_bit((__force int)bit, bitmap);
+}
+
+/* Use when VMA is not part of the VMA tree and needs no locking */
+static inline void vm_flags_init(struct vm_area_struct *vma,
+				 vm_flags_t flags)
+{
+	vma_flags_clear_all(&vma->flags);
+	vma_flags_overwrite_word(&vma->flags, flags);
+}
+
+/*
+ * Use when VMA is part of the VMA tree and modifications need coordination
+ * Note: vm_flags_reset and vm_flags_reset_once do not lock the vma and
+ * it should be locked explicitly beforehand.
+ */
+static inline void vm_flags_reset(struct vm_area_struct *vma,
+				  vm_flags_t flags)
+{
+	vma_assert_write_locked(vma);
+	vm_flags_init(vma, flags);
+}
+
+static inline void vm_flags_reset_once(struct vm_area_struct *vma,
+				       vm_flags_t flags)
+{
+	vma_assert_write_locked(vma);
+	/*
+	 * The user should only be interested in avoiding reordering of
+	 * assignment to the first word.
+	 */
+	vma_flags_clear_all(&vma->flags);
+	vma_flags_overwrite_word_once(&vma->flags, flags);
+}
+
+static inline void vm_flags_set(struct vm_area_struct *vma,
+				vm_flags_t flags)
+{
+	vma_start_write(vma);
+	vma_flags_set_word(&vma->flags, flags);
+}
+
+static inline void vm_flags_clear(struct vm_area_struct *vma,
+				  vm_flags_t flags)
+{
+	vma_start_write(vma);
+	vma_flags_clear_word(&vma->flags, flags);
+}
+
+static inline vma_flags_t __mk_vma_flags(size_t count, const vma_flag_t *bits)
+{
+	vma_flags_t flags;
+	int i;
+
+	vma_flags_clear_all(&flags);
+	for (i = 0; i < count; i++)
+		vma_flag_set(&flags, bits[i]);
+	return flags;
+}
+
+#define mk_vma_flags(...) __mk_vma_flags(COUNT_ARGS(__VA_ARGS__), \
+					 (const vma_flag_t []){__VA_ARGS__})
+
+static __always_inline bool vma_flags_test_mask(const vma_flags_t *flags,
+		vma_flags_t to_test)
+{
+	const unsigned long *bitmap = flags->__vma_flags;
+	const unsigned long *bitmap_to_test = to_test.__vma_flags;
+
+	return bitmap_intersects(bitmap_to_test, bitmap, NUM_VMA_FLAG_BITS);
+}
+
+#define vma_flags_test(flags, ...) \
+	vma_flags_test_mask(flags, mk_vma_flags(__VA_ARGS__))
+
+static __always_inline bool vma_flags_test_all_mask(const vma_flags_t *flags,
+		vma_flags_t to_test)
+{
+	const unsigned long *bitmap = flags->__vma_flags;
+	const unsigned long *bitmap_to_test = to_test.__vma_flags;
+
+	return bitmap_subset(bitmap_to_test, bitmap, NUM_VMA_FLAG_BITS);
+}
+
+#define vma_flags_test_all(flags, ...) \
+	vma_flags_test_all_mask(flags, mk_vma_flags(__VA_ARGS__))
+
+static __always_inline void vma_flags_set_mask(vma_flags_t *flags, vma_flags_t to_set)
+{
+	unsigned long *bitmap = flags->__vma_flags;
+	const unsigned long *bitmap_to_set = to_set.__vma_flags;
+
+	bitmap_or(bitmap, bitmap, bitmap_to_set, NUM_VMA_FLAG_BITS);
+}
+
+#define vma_flags_set(flags, ...) \
+	vma_flags_set_mask(flags, mk_vma_flags(__VA_ARGS__))
+
+static __always_inline void vma_flags_clear_mask(vma_flags_t *flags, vma_flags_t to_clear)
+{
+	unsigned long *bitmap = flags->__vma_flags;
+	const unsigned long *bitmap_to_clear = to_clear.__vma_flags;
+
+	bitmap_andnot(bitmap, bitmap, bitmap_to_clear, NUM_VMA_FLAG_BITS);
+}
+
+#define vma_flags_clear(flags, ...) \
+	vma_flags_clear_mask(flags, mk_vma_flags(__VA_ARGS__))
+
+static inline bool vma_test_all_flags_mask(const struct vm_area_struct *vma,
+					   vma_flags_t flags)
+{
+	return vma_flags_test_all_mask(&vma->flags, flags);
+}
+
+#define vma_test_all_flags(vma, ...) \
+	vma_test_all_flags_mask(vma, mk_vma_flags(__VA_ARGS__))
+
+static inline bool is_shared_maywrite_vm_flags(vm_flags_t vm_flags)
+{
+	return (vm_flags & (VM_SHARED | VM_MAYWRITE)) ==
+		(VM_SHARED | VM_MAYWRITE);
+}
+
+static inline void vma_set_flags_mask(struct vm_area_struct *vma,
+				      vma_flags_t flags)
+{
+	vma_flags_set_mask(&vma->flags, flags);
+}
+
+#define vma_set_flags(vma, ...) \
+	vma_set_flags_mask(vma, mk_vma_flags(__VA_ARGS__))
+
+static inline bool vma_desc_test_flags_mask(const struct vm_area_desc *desc,
+					    vma_flags_t flags)
+{
+	return vma_flags_test_mask(&desc->vma_flags, flags);
+}
+
+#define vma_desc_test_flags(desc, ...) \
+	vma_desc_test_flags_mask(desc, mk_vma_flags(__VA_ARGS__))
+
+static inline void vma_desc_set_flags_mask(struct vm_area_desc *desc,
+					   vma_flags_t flags)
+{
+	vma_flags_set_mask(&desc->vma_flags, flags);
+}
+
+#define vma_desc_set_flags(desc, ...) \
+	vma_desc_set_flags_mask(desc, mk_vma_flags(__VA_ARGS__))
+
+static inline void vma_desc_clear_flags_mask(struct vm_area_desc *desc,
+					     vma_flags_t flags)
+{
+	vma_flags_clear_mask(&desc->vma_flags, flags);
+}
+
+#define vma_desc_clear_flags(desc, ...) \
+	vma_desc_clear_flags_mask(desc, mk_vma_flags(__VA_ARGS__))
+
+static inline bool is_shared_maywrite(const vma_flags_t *flags)
+{
+	return vma_flags_test_all(flags, VMA_SHARED_BIT, VMA_MAYWRITE_BIT);
+}
+
+static inline bool vma_is_shared_maywrite(struct vm_area_struct *vma)
+{
+	return is_shared_maywrite(&vma->flags);
+}
+
+static inline struct vm_area_struct *vma_next(struct vma_iterator *vmi)
+{
+	/*
+	 * Uses mas_find() to get the first VMA when the iterator starts.
+	 * Calling mas_next() could skip the first entry.
+	 */
+	return mas_find(&vmi->mas, ULONG_MAX);
+}
+
+/*
+ * WARNING: to avoid racing with vma_mark_attached()/vma_mark_detached(), these
+ * assertions should be made either under mmap_write_lock or when the object
+ * has been isolated under mmap_write_lock, ensuring no competing writers.
+ */
+static inline void vma_assert_attached(struct vm_area_struct *vma)
+{
+	WARN_ON_ONCE(!refcount_read(&vma->vm_refcnt));
+}
+
+static inline void vma_assert_detached(struct vm_area_struct *vma)
+{
+	WARN_ON_ONCE(refcount_read(&vma->vm_refcnt));
+}
+
+static inline void vma_assert_write_locked(struct vm_area_struct *);
+static inline void vma_mark_attached(struct vm_area_struct *vma)
+{
+	vma_assert_write_locked(vma);
+	vma_assert_detached(vma);
+	refcount_set_release(&vma->vm_refcnt, 1);
+}
+
+static inline void vma_mark_detached(struct vm_area_struct *vma)
+{
+	vma_assert_write_locked(vma);
+	vma_assert_attached(vma);
+	/* We are the only writer, so no need to use vma_refcount_put(). */
+	if (unlikely(!refcount_dec_and_test(&vma->vm_refcnt))) {
+		/*
+		 * Reader must have temporarily raised vm_refcnt but it will
+		 * drop it without using the vma since vma is write-locked.
+		 */
+	}
+}
+
+static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
+{
+	memset(vma, 0, sizeof(*vma));
+	vma->vm_mm = mm;
+	vma->vm_ops = &vma_dummy_vm_ops;
+	INIT_LIST_HEAD(&vma->anon_vma_chain);
+	vma->vm_lock_seq = UINT_MAX;
+}
+
+/*
+ * These are defined in vma.h, but sadly vm_stat_account() is referenced by
+ * kernel/fork.c, so we have to these broadly available there, and temporarily
+ * define them here to resolve the dependency cycle.
+ */
+#define is_exec_mapping(flags) \
+	((flags & (VM_EXEC | VM_WRITE | VM_STACK)) == VM_EXEC)
+
+#define is_stack_mapping(flags) \
+	(((flags & VM_STACK) == VM_STACK) || (flags & VM_SHADOW_STACK))
+
+#define is_data_mapping(flags) \
+	((flags & (VM_WRITE | VM_SHARED | VM_STACK)) == VM_WRITE)
+
+static inline void vm_stat_account(struct mm_struct *mm, vm_flags_t flags,
+				   long npages)
+{
+	WRITE_ONCE(mm->total_vm, READ_ONCE(mm->total_vm)+npages);
+
+	if (is_exec_mapping(flags))
+		mm->exec_vm += npages;
+	else if (is_stack_mapping(flags))
+		mm->stack_vm += npages;
+	else if (is_data_mapping(flags))
+		mm->data_vm += npages;
+}
+
+#undef is_exec_mapping
+#undef is_stack_mapping
+#undef is_data_mapping
+
+static inline void vm_unacct_memory(long pages)
+{
+	vm_acct_memory(-pages);
+}
+
+static inline void mapping_allow_writable(struct address_space *mapping)
+{
+	atomic_inc(&mapping->i_mmap_writable);
+}
+
+static inline
+struct vm_area_struct *vma_find(struct vma_iterator *vmi, unsigned long max)
+{
+	return mas_find(&vmi->mas, max - 1);
+}
+
+static inline int vma_iter_clear_gfp(struct vma_iterator *vmi,
+			unsigned long start, unsigned long end, gfp_t gfp)
+{
+	__mas_set_range(&vmi->mas, start, end - 1);
+	mas_store_gfp(&vmi->mas, NULL, gfp);
+	if (unlikely(mas_is_err(&vmi->mas)))
+		return -ENOMEM;
+
+	return 0;
+}
+
+static inline void vma_set_anonymous(struct vm_area_struct *vma)
+{
+	vma->vm_ops = NULL;
+}
+
+/* Declared in vma.h. */
+static inline void set_vma_from_desc(struct vm_area_struct *vma,
+		struct vm_area_desc *desc);
+
+static inline int __compat_vma_mmap(const struct file_operations *f_op,
+		struct file *file, struct vm_area_struct *vma)
+{
+	struct vm_area_desc desc = {
+		.mm = vma->vm_mm,
+		.file = file,
+		.start = vma->vm_start,
+		.end = vma->vm_end,
+
+		.pgoff = vma->vm_pgoff,
+		.vm_file = vma->vm_file,
+		.vm_flags = vma->vm_flags,
+		.page_prot = vma->vm_page_prot,
+
+		.action.type = MMAP_NOTHING, /* Default */
+	};
+	int err;
+
+	err = f_op->mmap_prepare(&desc);
+	if (err)
+		return err;
+
+	mmap_action_prepare(&desc.action, &desc);
+	set_vma_from_desc(vma, &desc);
+	return mmap_action_complete(&desc.action, vma);
+}
+
+static inline int compat_vma_mmap(struct file *file,
+		struct vm_area_struct *vma)
+{
+	return __compat_vma_mmap(file->f_op, file, vma);
+}
+
+
+static inline void vma_iter_init(struct vma_iterator *vmi,
+		struct mm_struct *mm, unsigned long addr)
+{
+	mas_init(&vmi->mas, &mm->mm_mt, addr);
+}
+
+static inline unsigned long vma_pages(struct vm_area_struct *vma)
+{
+	return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
+}
+
+static inline void mmap_assert_locked(struct mm_struct *);
+static inline struct vm_area_struct *find_vma_intersection(struct mm_struct *mm,
+						unsigned long start_addr,
+						unsigned long end_addr)
+{
+	unsigned long index = start_addr;
+
+	mmap_assert_locked(mm);
+	return mt_find(&mm->mm_mt, &index, end_addr - 1);
+}
+
+static inline
+struct vm_area_struct *vma_lookup(struct mm_struct *mm, unsigned long addr)
+{
+	return mtree_load(&mm->mm_mt, addr);
+}
+
+static inline struct vm_area_struct *vma_prev(struct vma_iterator *vmi)
+{
+	return mas_prev(&vmi->mas, 0);
+}
+
+static inline void vma_iter_set(struct vma_iterator *vmi, unsigned long addr)
+{
+	mas_set(&vmi->mas, addr);
+}
+
+static inline bool vma_is_anonymous(struct vm_area_struct *vma)
+{
+	return !vma->vm_ops;
+}
+
+/* Defined in vma.h, so temporarily define here to avoid circular dependency. */
+#define vma_iter_load(vmi) \
+	mas_walk(&(vmi)->mas)
+
+static inline struct vm_area_struct *
+find_vma_prev(struct mm_struct *mm, unsigned long addr,
+			struct vm_area_struct **pprev)
+{
+	struct vm_area_struct *vma;
+	VMA_ITERATOR(vmi, mm, addr);
+
+	vma = vma_iter_load(&vmi);
+	*pprev = vma_prev(&vmi);
+	if (!vma)
+		vma = vma_next(&vmi);
+	return vma;
+}
+
+#undef vma_iter_load
+
+static inline void vma_iter_free(struct vma_iterator *vmi)
+{
+	mas_destroy(&vmi->mas);
+}
+
+static inline
+struct vm_area_struct *vma_iter_next_range(struct vma_iterator *vmi)
+{
+	return mas_next_range(&vmi->mas, ULONG_MAX);
+}
+
+bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot);
+
+/* Update vma->vm_page_prot to reflect vma->vm_flags. */
+static inline void vma_set_page_prot(struct vm_area_struct *vma)
+{
+	vm_flags_t vm_flags = vma->vm_flags;
+	pgprot_t vm_page_prot;
+
+	/* testing: we inline vm_pgprot_modify() to avoid clash with vma.h. */
+	vm_page_prot = pgprot_modify(vma->vm_page_prot, vm_get_page_prot(vm_flags));
+
+	if (vma_wants_writenotify(vma, vm_page_prot)) {
+		vm_flags &= ~VM_SHARED;
+		/* testing: we inline vm_pgprot_modify() to avoid clash with vma.h. */
+		vm_page_prot = pgprot_modify(vm_page_prot, vm_get_page_prot(vm_flags));
+	}
+	/* remove_protection_ptes reads vma->vm_page_prot without mmap_lock */
+	WRITE_ONCE(vma->vm_page_prot, vm_page_prot);
+}
+
+static inline unsigned long stack_guard_start_gap(struct vm_area_struct *vma)
+{
+	if (vma->vm_flags & VM_GROWSDOWN)
+		return stack_guard_gap;
+
+	/* See reasoning around the VM_SHADOW_STACK definition */
+	if (vma->vm_flags & VM_SHADOW_STACK)
+		return PAGE_SIZE;
+
+	return 0;
+}
+
+static inline unsigned long vm_start_gap(struct vm_area_struct *vma)
+{
+	unsigned long gap = stack_guard_start_gap(vma);
+	unsigned long vm_start = vma->vm_start;
+
+	vm_start -= gap;
+	if (vm_start > vma->vm_start)
+		vm_start = 0;
+	return vm_start;
+}
+
+static inline unsigned long vm_end_gap(struct vm_area_struct *vma)
+{
+	unsigned long vm_end = vma->vm_end;
+
+	if (vma->vm_flags & VM_GROWSUP) {
+		vm_end += stack_guard_gap;
+		if (vm_end < vma->vm_end)
+			vm_end = -PAGE_SIZE;
+	}
+	return vm_end;
+}
+
+static inline bool vma_is_accessible(struct vm_area_struct *vma)
+{
+	return vma->vm_flags & VM_ACCESS_FLAGS;
+}
+
+static inline bool mlock_future_ok(const struct mm_struct *mm,
+		vm_flags_t vm_flags, unsigned long bytes)
+{
+	unsigned long locked_pages, limit_pages;
+
+	if (!(vm_flags & VM_LOCKED) || capable(CAP_IPC_LOCK))
+		return true;
+
+	locked_pages = bytes >> PAGE_SHIFT;
+	locked_pages += mm->locked_vm;
+
+	limit_pages = rlimit(RLIMIT_MEMLOCK);
+	limit_pages >>= PAGE_SHIFT;
+
+	return locked_pages <= limit_pages;
+}
+
+static inline bool map_deny_write_exec(unsigned long old, unsigned long new)
+{
+	/* If MDWE is disabled, we have nothing to deny. */
+	if (mm_flags_test(MMF_HAS_MDWE, current->mm))
+		return false;
+
+	/* If the new VMA is not executable, we have nothing to deny. */
+	if (!(new & VM_EXEC))
+		return false;
+
+	/* Under MDWE we do not accept newly writably executable VMAs... */
+	if (new & VM_WRITE)
+		return true;
+
+	/* ...nor previously non-executable VMAs becoming executable. */
+	if (!(old & VM_EXEC))
+		return true;
+
+	return false;
+}
+
+static inline int mapping_map_writable(struct address_space *mapping)
+{
+	return atomic_inc_unless_negative(&mapping->i_mmap_writable) ?
+		0 : -EPERM;
+}
+
+/* Did the driver provide valid mmap hook configuration? */
+static inline bool can_mmap_file(struct file *file)
+{
+	bool has_mmap = file->f_op->mmap;
+	bool has_mmap_prepare = file->f_op->mmap_prepare;
+
+	/* Hooks are mutually exclusive. */
+	if (WARN_ON_ONCE(has_mmap && has_mmap_prepare))
+		return false;
+	if (!has_mmap && !has_mmap_prepare)
+		return false;
+
+	return true;
+}
+
+static inline int vfs_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	if (file->f_op->mmap_prepare)
+		return compat_vma_mmap(file, vma);
+
+	return file->f_op->mmap(file, vma);
+}
+
+static inline int vfs_mmap_prepare(struct file *file, struct vm_area_desc *desc)
+{
+	return file->f_op->mmap_prepare(desc);
+}
+
+static inline void vma_set_file(struct vm_area_struct *vma, struct file *file)
+{
+	/* Changing an anonymous vma with this is illegal */
+	get_file(file);
+	swap(vma->vm_file, file);
+	fput(file);
+}
diff --git a/tools/testing/vma/include/stubs.h b/tools/testing/vma/include/stubs.h
new file mode 100644
index 000000000000..947a3a0c2566
--- /dev/null
+++ b/tools/testing/vma/include/stubs.h
@@ -0,0 +1,428 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+
+#pragma once
+
+/*
+ * Contains declarations that are STUBBED, that is that are rendered no-ops, in
+ * order to faciliate userland VMA testing.
+ */
+
+/* Forward declarations. */
+struct mm_struct;
+struct vm_area_struct;
+struct vm_area_desc;
+struct pagetable_move_control;
+struct mmap_action;
+struct file;
+struct anon_vma;
+struct anon_vma_chain;
+struct address_space;
+struct unmap_desc;
+
+#define __bitwise
+#define __randomize_layout
+
+#define FIRST_USER_ADDRESS	0UL
+#define USER_PGTABLES_CEILING	0UL
+
+#define vma_policy(vma) NULL
+
+#define down_write_nest_lock(sem, nest_lock)
+
+#define data_race(expr) expr
+
+#define ASSERT_EXCLUSIVE_WRITER(x)
+
+struct vm_userfaultfd_ctx {};
+struct mempolicy {};
+struct mmu_gather {};
+struct mutex {};
+struct vm_fault {};
+
+static inline void userfaultfd_unmap_complete(struct mm_struct *mm,
+					      struct list_head *uf)
+{
+}
+
+static inline unsigned long move_page_tables(struct pagetable_move_control *pmc)
+{
+	return 0;
+}
+
+static inline void free_pgd_range(struct mmu_gather *tlb,
+			unsigned long addr, unsigned long end,
+			unsigned long floor, unsigned long ceiling)
+{
+}
+
+static inline int ksm_execve(struct mm_struct *mm)
+{
+	return 0;
+}
+
+static inline void ksm_exit(struct mm_struct *mm)
+{
+}
+
+static inline void vma_numab_state_init(struct vm_area_struct *vma)
+{
+}
+
+static inline void vma_numab_state_free(struct vm_area_struct *vma)
+{
+}
+
+static inline void dup_anon_vma_name(struct vm_area_struct *orig_vma,
+				     struct vm_area_struct *new_vma)
+{
+}
+
+static inline void free_anon_vma_name(struct vm_area_struct *vma)
+{
+}
+
+static inline void mmap_action_prepare(struct mmap_action *action,
+					   struct vm_area_desc *desc)
+{
+}
+
+static inline int mmap_action_complete(struct mmap_action *action,
+					   struct vm_area_struct *vma)
+{
+	return 0;
+}
+
+static inline void fixup_hugetlb_reservations(struct vm_area_struct *vma)
+{
+}
+
+static inline bool shmem_file(struct file *file)
+{
+	return false;
+}
+
+static inline vm_flags_t ksm_vma_flags(const struct mm_struct *mm,
+		const struct file *file, vm_flags_t vm_flags)
+{
+	return vm_flags;
+}
+
+static inline void remap_pfn_range_prepare(struct vm_area_desc *desc, unsigned long pfn)
+{
+}
+
+static inline int remap_pfn_range_complete(struct vm_area_struct *vma, unsigned long addr,
+		unsigned long pfn, unsigned long size, pgprot_t pgprot)
+{
+	return 0;
+}
+
+static inline int do_munmap(struct mm_struct *, unsigned long, size_t,
+		struct list_head *uf)
+{
+	return 0;
+}
+
+/* Currently stubbed but we may later wish to un-stub. */
+static inline void vm_acct_memory(long pages);
+
+static inline void mmap_assert_locked(struct mm_struct *mm)
+{
+}
+
+
+static inline void anon_vma_unlock_write(struct anon_vma *anon_vma)
+{
+}
+
+static inline void i_mmap_unlock_write(struct address_space *mapping)
+{
+}
+
+static inline int userfaultfd_unmap_prep(struct vm_area_struct *vma,
+					 unsigned long start,
+					 unsigned long end,
+					 struct list_head *unmaps)
+{
+	return 0;
+}
+
+static inline void mmap_write_downgrade(struct mm_struct *mm)
+{
+}
+
+static inline void mmap_read_unlock(struct mm_struct *mm)
+{
+}
+
+static inline void mmap_write_unlock(struct mm_struct *mm)
+{
+}
+
+static inline int mmap_write_lock_killable(struct mm_struct *mm)
+{
+	return 0;
+}
+
+static inline bool can_modify_mm(struct mm_struct *mm,
+				 unsigned long start,
+				 unsigned long end)
+{
+	return true;
+}
+
+static inline void arch_unmap(struct mm_struct *mm,
+				 unsigned long start,
+				 unsigned long end)
+{
+}
+
+static inline bool mpol_equal(struct mempolicy *a, struct mempolicy *b)
+{
+	return true;
+}
+
+static inline void khugepaged_enter_vma(struct vm_area_struct *vma,
+			  vm_flags_t vm_flags)
+{
+}
+
+static inline bool mapping_can_writeback(struct address_space *mapping)
+{
+	return true;
+}
+
+static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma)
+{
+	return false;
+}
+
+static inline bool vma_soft_dirty_enabled(struct vm_area_struct *vma)
+{
+	return false;
+}
+
+static inline bool userfaultfd_wp(struct vm_area_struct *vma)
+{
+	return false;
+}
+
+static inline void mmap_assert_write_locked(struct mm_struct *mm)
+{
+}
+
+static inline void mutex_lock(struct mutex *lock)
+{
+}
+
+static inline void mutex_unlock(struct mutex *lock)
+{
+}
+
+static inline bool mutex_is_locked(struct mutex *lock)
+{
+	return true;
+}
+
+static inline bool signal_pending(void *p)
+{
+	return false;
+}
+
+static inline bool is_file_hugepages(struct file *file)
+{
+	return false;
+}
+
+static inline int security_vm_enough_memory_mm(struct mm_struct *mm, long pages)
+{
+	return 0;
+}
+
+static inline bool may_expand_vm(struct mm_struct *mm, vm_flags_t flags,
+				 unsigned long npages)
+{
+	return true;
+}
+
+static inline int shmem_zero_setup(struct vm_area_struct *vma)
+{
+	return 0;
+}
+
+
+static inline void vm_acct_memory(long pages)
+{
+}
+
+static inline void vma_interval_tree_insert(struct vm_area_struct *vma,
+					    struct rb_root_cached *rb)
+{
+}
+
+static inline void vma_interval_tree_remove(struct vm_area_struct *vma,
+					    struct rb_root_cached *rb)
+{
+}
+
+static inline void flush_dcache_mmap_unlock(struct address_space *mapping)
+{
+}
+
+static inline void anon_vma_interval_tree_insert(struct anon_vma_chain *avc,
+						 struct rb_root_cached *rb)
+{
+}
+
+static inline void anon_vma_interval_tree_remove(struct anon_vma_chain *avc,
+						 struct rb_root_cached *rb)
+{
+}
+
+static inline void uprobe_mmap(struct vm_area_struct *vma)
+{
+}
+
+static inline void uprobe_munmap(struct vm_area_struct *vma,
+				 unsigned long start, unsigned long end)
+{
+}
+
+static inline void i_mmap_lock_write(struct address_space *mapping)
+{
+}
+
+static inline void anon_vma_lock_write(struct anon_vma *anon_vma)
+{
+}
+
+static inline void vma_assert_write_locked(struct vm_area_struct *vma)
+{
+}
+
+static inline void ksm_add_vma(struct vm_area_struct *vma)
+{
+}
+
+static inline void perf_event_mmap(struct vm_area_struct *vma)
+{
+}
+
+static inline bool vma_is_dax(struct vm_area_struct *vma)
+{
+	return false;
+}
+
+static inline struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
+{
+	return NULL;
+}
+
+static inline bool arch_validate_flags(vm_flags_t flags)
+{
+	return true;
+}
+
+static inline void vma_close(struct vm_area_struct *vma)
+{
+}
+
+static inline int mmap_file(struct file *file, struct vm_area_struct *vma)
+{
+	return 0;
+}
+
+static inline int is_hugepage_only_range(struct mm_struct *mm,
+					unsigned long addr, unsigned long len)
+{
+	return 0;
+}
+
+static inline bool capable(int cap)
+{
+	return true;
+}
+
+static inline struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma)
+{
+	return NULL;
+}
+
+static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma,
+					struct vm_userfaultfd_ctx vm_ctx)
+{
+	return true;
+}
+
+static inline bool anon_vma_name_eq(struct anon_vma_name *anon_name1,
+				    struct anon_vma_name *anon_name2)
+{
+	return true;
+}
+
+static inline void might_sleep(void)
+{
+}
+
+static inline void fput(struct file *file)
+{
+}
+
+static inline void mpol_put(struct mempolicy *pol)
+{
+}
+
+static inline void lru_add_drain(void)
+{
+}
+
+static inline void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm)
+{
+}
+
+static inline void update_hiwater_rss(struct mm_struct *mm)
+{
+}
+
+static inline void update_hiwater_vm(struct mm_struct *mm)
+{
+}
+
+static inline void unmap_vmas(struct mmu_gather *tlb, struct unmap_desc *unmap)
+{
+}
+
+static inline void free_pgtables(struct mmu_gather *tlb, struct unmap_desc *unmap)
+{
+}
+
+static inline void mapping_unmap_writable(struct address_space *mapping)
+{
+}
+
+static inline void flush_dcache_mmap_lock(struct address_space *mapping)
+{
+}
+
+static inline void tlb_finish_mmu(struct mmu_gather *tlb)
+{
+}
+
+static inline struct file *get_file(struct file *f)
+{
+	return f;
+}
+
+static inline int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
+{
+	return 0;
+}
+
+static inline void vma_adjust_trans_huge(struct vm_area_struct *vma,
+					 unsigned long start,
+					 unsigned long end,
+					 struct vm_area_struct *next)
+{
+}
+
+static inline void hugetlb_split(struct vm_area_struct *, unsigned long) {}
diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
index b48ebae3927d..e3ed05b57819 100644
--- a/tools/testing/vma/vma_internal.h
+++ b/tools/testing/vma/vma_internal.h
@@ -12,15 +12,11 @@
 #ifndef __MM_VMA_INTERNAL_H
 #define __MM_VMA_INTERNAL_H
 
-#define __private
-#define __bitwise
-#define __randomize_layout
+#include <stdlib.h>
 
 #define CONFIG_MMU
 #define CONFIG_PER_VMA_LOCK
 
-#include <stdlib.h>
-
 #ifdef __CONCAT
 #undef __CONCAT
 #endif
@@ -35,1936 +31,28 @@
 #include <linux/refcount.h>
 #include <linux/slab.h>
 
-extern unsigned long stack_guard_gap;
-#ifdef CONFIG_MMU
-extern unsigned long mmap_min_addr;
-extern unsigned long dac_mmap_min_addr;
-#else
-#define mmap_min_addr		0UL
-#define dac_mmap_min_addr	0UL
-#endif
-
-#define ACCESS_PRIVATE(p, member) ((p)->member)
-
-#define VM_WARN_ON(_expr) (WARN_ON(_expr))
-#define VM_WARN_ON_ONCE(_expr) (WARN_ON_ONCE(_expr))
-#define VM_WARN_ON_VMG(_expr, _vmg) (WARN_ON(_expr))
-#define VM_BUG_ON(_expr) (BUG_ON(_expr))
-#define VM_BUG_ON_VMA(_expr, _vma) (BUG_ON(_expr))
-
-#define MMF_HAS_MDWE	28
-
-/*
- * vm_flags in vm_area_struct, see mm_types.h.
- * When changing, update also include/trace/events/mmflags.h
- */
-
-#define VM_NONE		0x00000000
-
-/**
- * typedef vma_flag_t - specifies an individual VMA flag by bit number.
- *
- * This value is made type safe by sparse to avoid passing invalid flag values
- * around.
- */
-typedef int __bitwise vma_flag_t;
-
-#define DECLARE_VMA_BIT(name, bitnum) \
-	VMA_ ## name ## _BIT = ((__force vma_flag_t)bitnum)
-#define DECLARE_VMA_BIT_ALIAS(name, aliased) \
-	VMA_ ## name ## _BIT = VMA_ ## aliased ## _BIT
-enum {
-	DECLARE_VMA_BIT(READ, 0),
-	DECLARE_VMA_BIT(WRITE, 1),
-	DECLARE_VMA_BIT(EXEC, 2),
-	DECLARE_VMA_BIT(SHARED, 3),
-	/* mprotect() hardcodes VM_MAYREAD >> 4 == VM_READ, and so for r/w/x bits. */
-	DECLARE_VMA_BIT(MAYREAD, 4),	/* limits for mprotect() etc. */
-	DECLARE_VMA_BIT(MAYWRITE, 5),
-	DECLARE_VMA_BIT(MAYEXEC, 6),
-	DECLARE_VMA_BIT(MAYSHARE, 7),
-	DECLARE_VMA_BIT(GROWSDOWN, 8),	/* general info on the segment */
-#ifdef CONFIG_MMU
-	DECLARE_VMA_BIT(UFFD_MISSING, 9),/* missing pages tracking */
-#else
-	/* nommu: R/O MAP_PRIVATE mapping that might overlay a file mapping */
-	DECLARE_VMA_BIT(MAYOVERLAY, 9),
-#endif /* CONFIG_MMU */
-	/* Page-ranges managed without "struct page", just pure PFN */
-	DECLARE_VMA_BIT(PFNMAP, 10),
-	DECLARE_VMA_BIT(MAYBE_GUARD, 11),
-	DECLARE_VMA_BIT(UFFD_WP, 12),	/* wrprotect pages tracking */
-	DECLARE_VMA_BIT(LOCKED, 13),
-	DECLARE_VMA_BIT(IO, 14),	/* Memory mapped I/O or similar */
-	DECLARE_VMA_BIT(SEQ_READ, 15),	/* App will access data sequentially */
-	DECLARE_VMA_BIT(RAND_READ, 16),	/* App will not benefit from clustered reads */
-	DECLARE_VMA_BIT(DONTCOPY, 17),	/* Do not copy this vma on fork */
-	DECLARE_VMA_BIT(DONTEXPAND, 18),/* Cannot expand with mremap() */
-	DECLARE_VMA_BIT(LOCKONFAULT, 19),/* Lock pages covered when faulted in */
-	DECLARE_VMA_BIT(ACCOUNT, 20),	/* Is a VM accounted object */
-	DECLARE_VMA_BIT(NORESERVE, 21),	/* should the VM suppress accounting */
-	DECLARE_VMA_BIT(HUGETLB, 22),	/* Huge TLB Page VM */
-	DECLARE_VMA_BIT(SYNC, 23),	/* Synchronous page faults */
-	DECLARE_VMA_BIT(ARCH_1, 24),	/* Architecture-specific flag */
-	DECLARE_VMA_BIT(WIPEONFORK, 25),/* Wipe VMA contents in child. */
-	DECLARE_VMA_BIT(DONTDUMP, 26),	/* Do not include in the core dump */
-	DECLARE_VMA_BIT(SOFTDIRTY, 27),	/* NOT soft dirty clean area */
-	DECLARE_VMA_BIT(MIXEDMAP, 28),	/* Can contain struct page and pure PFN pages */
-	DECLARE_VMA_BIT(HUGEPAGE, 29),	/* MADV_HUGEPAGE marked this vma */
-	DECLARE_VMA_BIT(NOHUGEPAGE, 30),/* MADV_NOHUGEPAGE marked this vma */
-	DECLARE_VMA_BIT(MERGEABLE, 31),	/* KSM may merge identical pages */
-	/* These bits are reused, we define specific uses below. */
-	DECLARE_VMA_BIT(HIGH_ARCH_0, 32),
-	DECLARE_VMA_BIT(HIGH_ARCH_1, 33),
-	DECLARE_VMA_BIT(HIGH_ARCH_2, 34),
-	DECLARE_VMA_BIT(HIGH_ARCH_3, 35),
-	DECLARE_VMA_BIT(HIGH_ARCH_4, 36),
-	DECLARE_VMA_BIT(HIGH_ARCH_5, 37),
-	DECLARE_VMA_BIT(HIGH_ARCH_6, 38),
-	/*
-	 * This flag is used to connect VFIO to arch specific KVM code. It
-	 * indicates that the memory under this VMA is safe for use with any
-	 * non-cachable memory type inside KVM. Some VFIO devices, on some
-	 * platforms, are thought to be unsafe and can cause machine crashes
-	 * if KVM does not lock down the memory type.
-	 */
-	DECLARE_VMA_BIT(ALLOW_ANY_UNCACHED, 39),
-#ifdef CONFIG_PPC32
-	DECLARE_VMA_BIT_ALIAS(DROPPABLE, ARCH_1),
-#else
-	DECLARE_VMA_BIT(DROPPABLE, 40),
-#endif
-	DECLARE_VMA_BIT(UFFD_MINOR, 41),
-	DECLARE_VMA_BIT(SEALED, 42),
-	/* Flags that reuse flags above. */
-	DECLARE_VMA_BIT_ALIAS(PKEY_BIT0, HIGH_ARCH_0),
-	DECLARE_VMA_BIT_ALIAS(PKEY_BIT1, HIGH_ARCH_1),
-	DECLARE_VMA_BIT_ALIAS(PKEY_BIT2, HIGH_ARCH_2),
-	DECLARE_VMA_BIT_ALIAS(PKEY_BIT3, HIGH_ARCH_3),
-	DECLARE_VMA_BIT_ALIAS(PKEY_BIT4, HIGH_ARCH_4),
-#if defined(CONFIG_X86_USER_SHADOW_STACK)
-	/*
-	 * VM_SHADOW_STACK should not be set with VM_SHARED because of lack of
-	 * support core mm.
-	 *
-	 * These VMAs will get a single end guard page. This helps userspace
-	 * protect itself from attacks. A single page is enough for current
-	 * shadow stack archs (x86). See the comments near alloc_shstk() in
-	 * arch/x86/kernel/shstk.c for more details on the guard size.
-	 */
-	DECLARE_VMA_BIT_ALIAS(SHADOW_STACK, HIGH_ARCH_5),
-#elif defined(CONFIG_ARM64_GCS)
-	/*
-	 * arm64's Guarded Control Stack implements similar functionality and
-	 * has similar constraints to shadow stacks.
-	 */
-	DECLARE_VMA_BIT_ALIAS(SHADOW_STACK, HIGH_ARCH_6),
-#endif
-	DECLARE_VMA_BIT_ALIAS(SAO, ARCH_1),		/* Strong Access Ordering (powerpc) */
-	DECLARE_VMA_BIT_ALIAS(GROWSUP, ARCH_1),		/* parisc */
-	DECLARE_VMA_BIT_ALIAS(SPARC_ADI, ARCH_1),	/* sparc64 */
-	DECLARE_VMA_BIT_ALIAS(ARM64_BTI, ARCH_1),	/* arm64 */
-	DECLARE_VMA_BIT_ALIAS(ARCH_CLEAR, ARCH_1),	/* sparc64, arm64 */
-	DECLARE_VMA_BIT_ALIAS(MAPPED_COPY, ARCH_1),	/* !CONFIG_MMU */
-	DECLARE_VMA_BIT_ALIAS(MTE, HIGH_ARCH_4),	/* arm64 */
-	DECLARE_VMA_BIT_ALIAS(MTE_ALLOWED, HIGH_ARCH_5),/* arm64 */
-#ifdef CONFIG_STACK_GROWSUP
-	DECLARE_VMA_BIT_ALIAS(STACK, GROWSUP),
-	DECLARE_VMA_BIT_ALIAS(STACK_EARLY, GROWSDOWN),
-#else
-	DECLARE_VMA_BIT_ALIAS(STACK, GROWSDOWN),
-#endif
-};
-
-#define INIT_VM_FLAG(name) BIT((__force int) VMA_ ## name ## _BIT)
-#define VM_READ		INIT_VM_FLAG(READ)
-#define VM_WRITE	INIT_VM_FLAG(WRITE)
-#define VM_EXEC		INIT_VM_FLAG(EXEC)
-#define VM_SHARED	INIT_VM_FLAG(SHARED)
-#define VM_MAYREAD	INIT_VM_FLAG(MAYREAD)
-#define VM_MAYWRITE	INIT_VM_FLAG(MAYWRITE)
-#define VM_MAYEXEC	INIT_VM_FLAG(MAYEXEC)
-#define VM_MAYSHARE	INIT_VM_FLAG(MAYSHARE)
-#define VM_GROWSDOWN	INIT_VM_FLAG(GROWSDOWN)
-#ifdef CONFIG_MMU
-#define VM_UFFD_MISSING	INIT_VM_FLAG(UFFD_MISSING)
-#else
-#define VM_UFFD_MISSING	VM_NONE
-#define VM_MAYOVERLAY	INIT_VM_FLAG(MAYOVERLAY)
-#endif
-#define VM_PFNMAP	INIT_VM_FLAG(PFNMAP)
-#define VM_MAYBE_GUARD	INIT_VM_FLAG(MAYBE_GUARD)
-#define VM_UFFD_WP	INIT_VM_FLAG(UFFD_WP)
-#define VM_LOCKED	INIT_VM_FLAG(LOCKED)
-#define VM_IO		INIT_VM_FLAG(IO)
-#define VM_SEQ_READ	INIT_VM_FLAG(SEQ_READ)
-#define VM_RAND_READ	INIT_VM_FLAG(RAND_READ)
-#define VM_DONTCOPY	INIT_VM_FLAG(DONTCOPY)
-#define VM_DONTEXPAND	INIT_VM_FLAG(DONTEXPAND)
-#define VM_LOCKONFAULT	INIT_VM_FLAG(LOCKONFAULT)
-#define VM_ACCOUNT	INIT_VM_FLAG(ACCOUNT)
-#define VM_NORESERVE	INIT_VM_FLAG(NORESERVE)
-#define VM_HUGETLB	INIT_VM_FLAG(HUGETLB)
-#define VM_SYNC		INIT_VM_FLAG(SYNC)
-#define VM_ARCH_1	INIT_VM_FLAG(ARCH_1)
-#define VM_WIPEONFORK	INIT_VM_FLAG(WIPEONFORK)
-#define VM_DONTDUMP	INIT_VM_FLAG(DONTDUMP)
-#ifdef CONFIG_MEM_SOFT_DIRTY
-#define VM_SOFTDIRTY	INIT_VM_FLAG(SOFTDIRTY)
-#else
-#define VM_SOFTDIRTY	VM_NONE
-#endif
-#define VM_MIXEDMAP	INIT_VM_FLAG(MIXEDMAP)
-#define VM_HUGEPAGE	INIT_VM_FLAG(HUGEPAGE)
-#define VM_NOHUGEPAGE	INIT_VM_FLAG(NOHUGEPAGE)
-#define VM_MERGEABLE	INIT_VM_FLAG(MERGEABLE)
-#define VM_STACK	INIT_VM_FLAG(STACK)
-#ifdef CONFIG_STACK_GROWS_UP
-#define VM_STACK_EARLY	INIT_VM_FLAG(STACK_EARLY)
-#else
-#define VM_STACK_EARLY	VM_NONE
-#endif
-#ifdef CONFIG_ARCH_HAS_PKEYS
-#define VM_PKEY_SHIFT ((__force int)VMA_HIGH_ARCH_0_BIT)
-/* Despite the naming, these are FLAGS not bits. */
-#define VM_PKEY_BIT0 INIT_VM_FLAG(PKEY_BIT0)
-#define VM_PKEY_BIT1 INIT_VM_FLAG(PKEY_BIT1)
-#define VM_PKEY_BIT2 INIT_VM_FLAG(PKEY_BIT2)
-#if CONFIG_ARCH_PKEY_BITS > 3
-#define VM_PKEY_BIT3 INIT_VM_FLAG(PKEY_BIT3)
-#else
-#define VM_PKEY_BIT3  VM_NONE
-#endif /* CONFIG_ARCH_PKEY_BITS > 3 */
-#if CONFIG_ARCH_PKEY_BITS > 4
-#define VM_PKEY_BIT4 INIT_VM_FLAG(PKEY_BIT4)
-#else
-#define VM_PKEY_BIT4  VM_NONE
-#endif /* CONFIG_ARCH_PKEY_BITS > 4 */
-#endif /* CONFIG_ARCH_HAS_PKEYS */
-#if defined(CONFIG_X86_USER_SHADOW_STACK) || defined(CONFIG_ARM64_GCS)
-#define VM_SHADOW_STACK	INIT_VM_FLAG(SHADOW_STACK)
-#else
-#define VM_SHADOW_STACK	VM_NONE
-#endif
-#if defined(CONFIG_PPC64)
-#define VM_SAO		INIT_VM_FLAG(SAO)
-#elif defined(CONFIG_PARISC)
-#define VM_GROWSUP	INIT_VM_FLAG(GROWSUP)
-#elif defined(CONFIG_SPARC64)
-#define VM_SPARC_ADI	INIT_VM_FLAG(SPARC_ADI)
-#define VM_ARCH_CLEAR	INIT_VM_FLAG(ARCH_CLEAR)
-#elif defined(CONFIG_ARM64)
-#define VM_ARM64_BTI	INIT_VM_FLAG(ARM64_BTI)
-#define VM_ARCH_CLEAR	INIT_VM_FLAG(ARCH_CLEAR)
-#elif !defined(CONFIG_MMU)
-#define VM_MAPPED_COPY	INIT_VM_FLAG(MAPPED_COPY)
-#endif
-#ifndef VM_GROWSUP
-#define VM_GROWSUP	VM_NONE
-#endif
-#ifdef CONFIG_ARM64_MTE
-#define VM_MTE		INIT_VM_FLAG(MTE)
-#define VM_MTE_ALLOWED	INIT_VM_FLAG(MTE_ALLOWED)
-#else
-#define VM_MTE		VM_NONE
-#define VM_MTE_ALLOWED	VM_NONE
-#endif
-#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
-#define VM_UFFD_MINOR	INIT_VM_FLAG(UFFD_MINOR)
-#else
-#define VM_UFFD_MINOR	VM_NONE
-#endif
-#ifdef CONFIG_64BIT
-#define VM_ALLOW_ANY_UNCACHED	INIT_VM_FLAG(ALLOW_ANY_UNCACHED)
-#define VM_SEALED		INIT_VM_FLAG(SEALED)
-#else
-#define VM_ALLOW_ANY_UNCACHED	VM_NONE
-#define VM_SEALED		VM_NONE
-#endif
-#if defined(CONFIG_64BIT) || defined(CONFIG_PPC32)
-#define VM_DROPPABLE		INIT_VM_FLAG(DROPPABLE)
-#else
-#define VM_DROPPABLE		VM_NONE
-#endif
-
-/* Bits set in the VMA until the stack is in its final location */
-#define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ | VM_STACK_EARLY)
-
-#define TASK_EXEC ((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0)
-
-/* Common data flag combinations */
-#define VM_DATA_FLAGS_TSK_EXEC	(VM_READ | VM_WRITE | TASK_EXEC | \
-				 VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
-#define VM_DATA_FLAGS_NON_EXEC	(VM_READ | VM_WRITE | VM_MAYREAD | \
-				 VM_MAYWRITE | VM_MAYEXEC)
-#define VM_DATA_FLAGS_EXEC	(VM_READ | VM_WRITE | VM_EXEC | \
-				 VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
-
-#ifndef VM_DATA_DEFAULT_FLAGS		/* arch can override this */
-#define VM_DATA_DEFAULT_FLAGS  VM_DATA_FLAGS_EXEC
-#endif
-
-#ifndef VM_STACK_DEFAULT_FLAGS		/* arch can override this */
-#define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS
-#endif
-
-#define VM_STARTGAP_FLAGS (VM_GROWSDOWN | VM_SHADOW_STACK)
-
-#define VM_STACK_FLAGS	(VM_STACK | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT)
-
-/* VMA basic access permission flags */
-#define VM_ACCESS_FLAGS (VM_READ | VM_WRITE | VM_EXEC)
-
-/*
- * Special vmas that are non-mergable, non-mlock()able.
- */
-#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP | VM_MIXEDMAP)
-
-#define DEFAULT_MAP_WINDOW	((1UL << 47) - PAGE_SIZE)
-#define TASK_SIZE_LOW		DEFAULT_MAP_WINDOW
-#define TASK_SIZE_MAX		DEFAULT_MAP_WINDOW
-#define STACK_TOP		TASK_SIZE_LOW
-#define STACK_TOP_MAX		TASK_SIZE_MAX
-
-/* This mask represents all the VMA flag bits used by mlock */
-#define VM_LOCKED_MASK	(VM_LOCKED | VM_LOCKONFAULT)
-
-#define TASK_EXEC ((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0)
-
-#define VM_DATA_FLAGS_TSK_EXEC	(VM_READ | VM_WRITE | TASK_EXEC | \
-				 VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
-
-#define RLIMIT_STACK		3	/* max stack size */
-#define RLIMIT_MEMLOCK		8	/* max locked-in-memory address space */
-
-#define CAP_IPC_LOCK         14
-
-/*
- * Flags which should be 'sticky' on merge - that is, flags which, when one VMA
- * possesses it but the other does not, the merged VMA should nonetheless have
- * applied to it:
- *
- *   VM_SOFTDIRTY - if a VMA is marked soft-dirty, that is has not had its
- *                  references cleared via /proc/$pid/clear_refs, any merged VMA
- *                  should be considered soft-dirty also as it operates at a VMA
- *                  granularity.
- */
-#define VM_STICKY (VM_SOFTDIRTY | VM_MAYBE_GUARD)
-
 /*
- * VMA flags we ignore for the purposes of merge, i.e. one VMA possessing one
- * of these flags and the other not does not preclude a merge.
- *
- *    VM_STICKY - When merging VMAs, VMA flags must match, unless they are
- *                'sticky'. If any sticky flags exist in either VMA, we simply
- *                set all of them on the merged VMA.
+ * DUPLICATE typedef definitions from kernel source that have to be declared
+ * ahead of all other headers.
  */
-#define VM_IGNORE_MERGE VM_STICKY
-
-/*
- * Flags which should result in page tables being copied on fork. These are
- * flags which indicate that the VMA maps page tables which cannot be
- * reconsistuted upon page fault, so necessitate page table copying upon
- *
- * VM_PFNMAP / VM_MIXEDMAP - These contain kernel-mapped data which cannot be
- *                           reasonably reconstructed on page fault.
- *
- *              VM_UFFD_WP - Encodes metadata about an installed uffd
- *                           write protect handler, which cannot be
- *                           reconstructed on page fault.
- *
- *                           We always copy pgtables when dst_vma has uffd-wp
- *                           enabled even if it's file-backed
- *                           (e.g. shmem). Because when uffd-wp is enabled,
- *                           pgtable contains uffd-wp protection information,
- *                           that's something we can't retrieve from page cache,
- *                           and skip copying will lose those info.
- *
- *          VM_MAYBE_GUARD - Could contain page guard region markers which
- *                           by design are a property of the page tables
- *                           only and thus cannot be reconstructed on page
- *                           fault.
- */
-#define VM_COPY_ON_FORK (VM_PFNMAP | VM_MIXEDMAP | VM_UFFD_WP | VM_MAYBE_GUARD)
-
-#define FIRST_USER_ADDRESS	0UL
-#define USER_PGTABLES_CEILING	0UL
-
-#define vma_policy(vma) NULL
-
-#define down_write_nest_lock(sem, nest_lock)
-
-#define pgprot_val(x)		((x).pgprot)
-#define __pgprot(x)		((pgprot_t) { (x) } )
-
-#define for_each_vma(__vmi, __vma)					\
-	while (((__vma) = vma_next(&(__vmi))) != NULL)
-
-/* The MM code likes to work with exclusive end addresses */
-#define for_each_vma_range(__vmi, __vma, __end)				\
-	while (((__vma) = vma_find(&(__vmi), (__end))) != NULL)
-
-#define offset_in_page(p)	((unsigned long)(p) & ~PAGE_MASK)
-
-#define PHYS_PFN(x)	((unsigned long)((x) >> PAGE_SHIFT))
-
-#define test_and_set_bit(nr, addr) __test_and_set_bit(nr, addr)
-#define test_and_clear_bit(nr, addr) __test_and_clear_bit(nr, addr)
-
-#define TASK_SIZE ((1ul << 47)-PAGE_SIZE)
-
-#define AS_MM_ALL_LOCKS 2
-
-/* We hardcode this for now. */
-#define sysctl_max_map_count 0x1000000UL
-
-#define pgoff_t unsigned long
-typedef unsigned long	pgprotval_t;
-typedef struct pgprot { pgprotval_t pgprot; } pgprot_t;
-typedef unsigned long vm_flags_t;
-typedef __bitwise unsigned int vm_fault_t;
-
-/*
- * The shared stubs do not implement this, it amounts to an fprintf(STDERR,...)
- * either way :)
- */
-#define pr_warn_once pr_err
-
-#define data_race(expr) expr
-
-#define ASSERT_EXCLUSIVE_WRITER(x)
-
-#define pgtable_supports_soft_dirty() 1
-
-/**
- * swap - swap values of @a and @b
- * @a: first value
- * @b: second value
- */
-#define swap(a, b) \
-	do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0)
-
-struct kref {
-	refcount_t refcount;
-};
-
-/*
- * Define the task command name length as enum, then it can be visible to
- * BPF programs.
- */
-enum {
-	TASK_COMM_LEN = 16,
-};
-
-/*
- * Flags for bug emulation.
- *
- * These occupy the top three bytes.
- */
-enum {
-	READ_IMPLIES_EXEC =	0x0400000,
-};
-
-struct task_struct {
-	char comm[TASK_COMM_LEN];
-	pid_t pid;
-	struct mm_struct *mm;
-
-	/* Used for emulating ABI behavior of previous Linux versions: */
-	unsigned int			personality;
-};
-
-struct task_struct *get_current(void);
-#define current get_current()
-
-struct anon_vma {
-	struct anon_vma *root;
-	struct rb_root_cached rb_root;
-
-	/* Test fields. */
-	bool was_cloned;
-	bool was_unlinked;
-};
-
-struct anon_vma_chain {
-	struct anon_vma *anon_vma;
-	struct list_head same_vma;
-};
-
-struct anon_vma_name {
-	struct kref kref;
-	/* The name needs to be at the end because it is dynamically sized. */
-	char name[];
-};
-
-struct vma_iterator {
-	struct ma_state mas;
-};
-
-#define VMA_ITERATOR(name, __mm, __addr)				\
-	struct vma_iterator name = {					\
-		.mas = {						\
-			.tree = &(__mm)->mm_mt,				\
-			.index = __addr,				\
-			.node = NULL,					\
-			.status = ma_start,				\
-		},							\
-	}
-
-struct address_space {
-	struct rb_root_cached	i_mmap;
-	unsigned long		flags;
-	atomic_t		i_mmap_writable;
-};
-
-struct vm_userfaultfd_ctx {};
-struct mempolicy {};
-struct mmu_gather {};
-struct mutex {};
-#define DEFINE_MUTEX(mutexname) \
-	struct mutex mutexname = {}
-
-#define DECLARE_BITMAP(name, bits) \
-	unsigned long name[BITS_TO_LONGS(bits)]
-
+#define __private
 #define NUM_MM_FLAG_BITS (64)
 typedef struct {
 	__private DECLARE_BITMAP(__mm_flags, NUM_MM_FLAG_BITS);
 } mm_flags_t;
-
-/*
- * Opaque type representing current VMA (vm_area_struct) flag state. Must be
- * accessed via vma_flags_xxx() helper functions.
- */
 #define NUM_VMA_FLAG_BITS BITS_PER_LONG
 typedef struct {
 	DECLARE_BITMAP(__vma_flags, NUM_VMA_FLAG_BITS);
 } __private vma_flags_t;
 
-#define EMPTY_VMA_FLAGS ((vma_flags_t){ })
-
-struct mm_struct {
-	struct maple_tree mm_mt;
-	int map_count;			/* number of VMAs */
-	unsigned long total_vm;	   /* Total pages mapped */
-	unsigned long locked_vm;   /* Pages that have PG_mlocked set */
-	unsigned long data_vm;	   /* VM_WRITE & ~VM_SHARED & ~VM_STACK */
-	unsigned long exec_vm;	   /* VM_EXEC & ~VM_WRITE & ~VM_STACK */
-	unsigned long stack_vm;	   /* VM_STACK */
-
-	unsigned long def_flags;
-
-	mm_flags_t flags; /* Must use mm_flags_* helpers to access */
-};
-
-struct vm_area_struct;
-
-
-/* What action should be taken after an .mmap_prepare call is complete? */
-enum mmap_action_type {
-	MMAP_NOTHING,		/* Mapping is complete, no further action. */
-	MMAP_REMAP_PFN,		/* Remap PFN range. */
-	MMAP_IO_REMAP_PFN,	/* I/O remap PFN range. */
-};
-
-/*
- * Describes an action an mmap_prepare hook can instruct to be taken to complete
- * the mapping of a VMA. Specified in vm_area_desc.
- */
-struct mmap_action {
-	union {
-		/* Remap range. */
-		struct {
-			unsigned long start;
-			unsigned long start_pfn;
-			unsigned long size;
-			pgprot_t pgprot;
-		} remap;
-	};
-	enum mmap_action_type type;
-
-	/*
-	 * If specified, this hook is invoked after the selected action has been
-	 * successfully completed. Note that the VMA write lock still held.
-	 *
-	 * The absolute minimum ought to be done here.
-	 *
-	 * Returns 0 on success, or an error code.
-	 */
-	int (*success_hook)(const struct vm_area_struct *vma);
-
-	/*
-	 * If specified, this hook is invoked when an error occurred when
-	 * attempting the selection action.
-	 *
-	 * The hook can return an error code in order to filter the error, but
-	 * it is not valid to clear the error here.
-	 */
-	int (*error_hook)(int err);
-
-	/*
-	 * This should be set in rare instances where the operation required
-	 * that the rmap should not be able to access the VMA until
-	 * completely set up.
-	 */
-	bool hide_from_rmap_until_complete :1;
-};
-
-/* Operations which modify VMAs. */
-enum vma_operation {
-	VMA_OP_SPLIT,
-	VMA_OP_MERGE_UNFAULTED,
-	VMA_OP_REMAP,
-	VMA_OP_FORK,
-};
-
-/*
- * Describes a VMA that is about to be mmap()'ed. Drivers may choose to
- * manipulate mutable fields which will cause those fields to be updated in the
- * resultant VMA.
- *
- * Helper functions are not required for manipulating any field.
- */
-struct vm_area_desc {
-	/* Immutable state. */
-	const struct mm_struct *const mm;
-	struct file *const file; /* May vary from vm_file in stacked callers. */
-	unsigned long start;
-	unsigned long end;
-
-	/* Mutable fields. Populated with initial state. */
-	pgoff_t pgoff;
-	struct file *vm_file;
-	union {
-		vm_flags_t vm_flags;
-		vma_flags_t vma_flags;
-	};
-	pgprot_t page_prot;
-
-	/* Write-only fields. */
-	const struct vm_operations_struct *vm_ops;
-	void *private_data;
-
-	/* Take further action? */
-	struct mmap_action action;
-};
-
-struct file_operations {
-	int (*mmap)(struct file *, struct vm_area_struct *);
-	int (*mmap_prepare)(struct vm_area_desc *);
-};
-
-struct file {
-	struct address_space	*f_mapping;
-	const struct file_operations	*f_op;
-};
-
-#define VMA_LOCK_OFFSET	0x40000000
-
-typedef struct { unsigned long v; } freeptr_t;
-
-struct vm_area_struct {
-	/* The first cache line has the info for VMA tree walking. */
-
-	union {
-		struct {
-			/* VMA covers [vm_start; vm_end) addresses within mm */
-			unsigned long vm_start;
-			unsigned long vm_end;
-		};
-		freeptr_t vm_freeptr; /* Pointer used by SLAB_TYPESAFE_BY_RCU */
-	};
-
-	struct mm_struct *vm_mm;	/* The address space we belong to. */
-	pgprot_t vm_page_prot;          /* Access permissions of this VMA. */
-
-	/*
-	 * Flags, see mm.h.
-	 * To modify use vm_flags_{init|reset|set|clear|mod} functions.
-	 */
-	union {
-		const vm_flags_t vm_flags;
-		vma_flags_t flags;
-	};
-
-#ifdef CONFIG_PER_VMA_LOCK
-	/*
-	 * Can only be written (using WRITE_ONCE()) while holding both:
-	 *  - mmap_lock (in write mode)
-	 *  - vm_refcnt bit at VMA_LOCK_OFFSET is set
-	 * Can be read reliably while holding one of:
-	 *  - mmap_lock (in read or write mode)
-	 *  - vm_refcnt bit at VMA_LOCK_OFFSET is set or vm_refcnt > 1
-	 * Can be read unreliably (using READ_ONCE()) for pessimistic bailout
-	 * while holding nothing (except RCU to keep the VMA struct allocated).
-	 *
-	 * This sequence counter is explicitly allowed to overflow; sequence
-	 * counter reuse can only lead to occasional unnecessary use of the
-	 * slowpath.
-	 */
-	unsigned int vm_lock_seq;
-#endif
-
-	/*
-	 * A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma
-	 * list, after a COW of one of the file pages.	A MAP_SHARED vma
-	 * can only be in the i_mmap tree.  An anonymous MAP_PRIVATE, stack
-	 * or brk vma (with NULL file) can only be in an anon_vma list.
-	 */
-	struct list_head anon_vma_chain; /* Serialized by mmap_lock &
-					  * page_table_lock */
-	struct anon_vma *anon_vma;	/* Serialized by page_table_lock */
-
-	/* Function pointers to deal with this struct. */
-	const struct vm_operations_struct *vm_ops;
-
-	/* Information about our backing store: */
-	unsigned long vm_pgoff;		/* Offset (within vm_file) in PAGE_SIZE
-					   units */
-	struct file * vm_file;		/* File we map to (can be NULL). */
-	void * vm_private_data;		/* was vm_pte (shared mem) */
-
-#ifdef CONFIG_SWAP
-	atomic_long_t swap_readahead_info;
-#endif
-#ifndef CONFIG_MMU
-	struct vm_region *vm_region;	/* NOMMU mapping region */
-#endif
-#ifdef CONFIG_NUMA
-	struct mempolicy *vm_policy;	/* NUMA policy for the VMA */
-#endif
-#ifdef CONFIG_NUMA_BALANCING
-	struct vma_numab_state *numab_state;	/* NUMA Balancing state */
-#endif
-#ifdef CONFIG_PER_VMA_LOCK
-	/* Unstable RCU readers are allowed to read this. */
-	refcount_t vm_refcnt;
-#endif
-	/*
-	 * For areas with an address space and backing store,
-	 * linkage into the address_space->i_mmap interval tree.
-	 *
-	 */
-	struct {
-		struct rb_node rb;
-		unsigned long rb_subtree_last;
-	} shared;
-#ifdef CONFIG_ANON_VMA_NAME
-	/*
-	 * For private and shared anonymous mappings, a pointer to a null
-	 * terminated string containing the name given to the vma, or NULL if
-	 * unnamed. Serialized by mmap_lock. Use anon_vma_name to access.
-	 */
-	struct anon_vma_name *anon_name;
-#endif
-	struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
-} __randomize_layout;
-
-struct vm_fault {};
-
-struct vm_operations_struct {
-	void (*open)(struct vm_area_struct * area);
-	/**
-	 * @close: Called when the VMA is being removed from the MM.
-	 * Context: User context.  May sleep.  Caller holds mmap_lock.
-	 */
-	void (*close)(struct vm_area_struct * area);
-	/* Called any time before splitting to check if it's allowed */
-	int (*may_split)(struct vm_area_struct *area, unsigned long addr);
-	int (*mremap)(struct vm_area_struct *area);
-	/*
-	 * Called by mprotect() to make driver-specific permission
-	 * checks before mprotect() is finalised.   The VMA must not
-	 * be modified.  Returns 0 if mprotect() can proceed.
-	 */
-	int (*mprotect)(struct vm_area_struct *vma, unsigned long start,
-			unsigned long end, unsigned long newflags);
-	vm_fault_t (*fault)(struct vm_fault *vmf);
-	vm_fault_t (*huge_fault)(struct vm_fault *vmf, unsigned int order);
-	vm_fault_t (*map_pages)(struct vm_fault *vmf,
-			pgoff_t start_pgoff, pgoff_t end_pgoff);
-	unsigned long (*pagesize)(struct vm_area_struct * area);
-
-	/* notification that a previously read-only page is about to become
-	 * writable, if an error is returned it will cause a SIGBUS */
-	vm_fault_t (*page_mkwrite)(struct vm_fault *vmf);
-
-	/* same as page_mkwrite when using VM_PFNMAP|VM_MIXEDMAP */
-	vm_fault_t (*pfn_mkwrite)(struct vm_fault *vmf);
-
-	/* called by access_process_vm when get_user_pages() fails, typically
-	 * for use by special VMAs. See also generic_access_phys() for a generic
-	 * implementation useful for any iomem mapping.
-	 */
-	int (*access)(struct vm_area_struct *vma, unsigned long addr,
-		      void *buf, int len, int write);
-
-	/* Called by the /proc/PID/maps code to ask the vma whether it
-	 * has a special name.  Returning non-NULL will also cause this
-	 * vma to be dumped unconditionally. */
-	const char *(*name)(struct vm_area_struct *vma);
-
-#ifdef CONFIG_NUMA
-	/*
-	 * set_policy() op must add a reference to any non-NULL @new mempolicy
-	 * to hold the policy upon return.  Caller should pass NULL @new to
-	 * remove a policy and fall back to surrounding context--i.e. do not
-	 * install a MPOL_DEFAULT policy, nor the task or system default
-	 * mempolicy.
-	 */
-	int (*set_policy)(struct vm_area_struct *vma, struct mempolicy *new);
-
-	/*
-	 * get_policy() op must add reference [mpol_get()] to any policy at
-	 * (vma,addr) marked as MPOL_SHARED.  The shared policy infrastructure
-	 * in mm/mempolicy.c will do this automatically.
-	 * get_policy() must NOT add a ref if the policy at (vma,addr) is not
-	 * marked as MPOL_SHARED. vma policies are protected by the mmap_lock.
-	 * If no [shared/vma] mempolicy exists at the addr, get_policy() op
-	 * must return NULL--i.e., do not "fallback" to task or system default
-	 * policy.
-	 */
-	struct mempolicy *(*get_policy)(struct vm_area_struct *vma,
-					unsigned long addr, pgoff_t *ilx);
-#endif
-#ifdef CONFIG_FIND_NORMAL_PAGE
-	/*
-	 * Called by vm_normal_page() for special PTEs in @vma at @addr. This
-	 * allows for returning a "normal" page from vm_normal_page() even
-	 * though the PTE indicates that the "struct page" either does not exist
-	 * or should not be touched: "special".
-	 *
-	 * Do not add new users: this really only works when a "normal" page
-	 * was mapped, but then the PTE got changed to something weird (+
-	 * marked special) that would not make pte_pfn() identify the originally
-	 * inserted page.
-	 */
-	struct page *(*find_normal_page)(struct vm_area_struct *vma,
-					 unsigned long addr);
-#endif /* CONFIG_FIND_NORMAL_PAGE */
-};
-
-struct vm_unmapped_area_info {
-#define VM_UNMAPPED_AREA_TOPDOWN 1
-	unsigned long flags;
-	unsigned long length;
-	unsigned long low_limit;
-	unsigned long high_limit;
-	unsigned long align_mask;
-	unsigned long align_offset;
-	unsigned long start_gap;
-};
-
-struct pagetable_move_control {
-	struct vm_area_struct *old; /* Source VMA. */
-	struct vm_area_struct *new; /* Destination VMA. */
-	unsigned long old_addr; /* Address from which the move begins. */
-	unsigned long old_end; /* Exclusive address at which old range ends. */
-	unsigned long new_addr; /* Address to move page tables to. */
-	unsigned long len_in; /* Bytes to remap specified by user. */
-
-	bool need_rmap_locks; /* Do rmap locks need to be taken? */
-	bool for_stack; /* Is this an early temp stack being moved? */
-};
-
-#define PAGETABLE_MOVE(name, old_, new_, old_addr_, new_addr_, len_)	\
-	struct pagetable_move_control name = {				\
-		.old = old_,						\
-		.new = new_,						\
-		.old_addr = old_addr_,					\
-		.old_end = (old_addr_) + (len_),			\
-		.new_addr = new_addr_,					\
-		.len_in = len_,						\
-	}
-
-static inline void vma_iter_invalidate(struct vma_iterator *vmi)
-{
-	mas_pause(&vmi->mas);
-}
-
-static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
-{
-	return __pgprot(pgprot_val(oldprot) | pgprot_val(newprot));
-}
-
-static inline pgprot_t vm_get_page_prot(vm_flags_t vm_flags)
-{
-	return __pgprot(vm_flags);
-}
-
-static inline void vma_flags_clear_all(vma_flags_t *flags)
-{
-	bitmap_zero(flags->__vma_flags, NUM_VMA_FLAG_BITS);
-}
-
-static inline void vma_flag_set(vma_flags_t *flags, vma_flag_t bit)
-{
-	unsigned long *bitmap = flags->__vma_flags;
-
-	__set_bit((__force int)bit, bitmap);
-}
-
-static inline vma_flags_t __mk_vma_flags(size_t count, const vma_flag_t *bits)
-{
-	vma_flags_t flags;
-	int i;
-
-	vma_flags_clear_all(&flags);
-	for (i = 0; i < count; i++)
-		vma_flag_set(&flags, bits[i]);
-	return flags;
-}
-
-#define mk_vma_flags(...) __mk_vma_flags(COUNT_ARGS(__VA_ARGS__), \
-					 (const vma_flag_t []){__VA_ARGS__})
-
-static __always_inline bool vma_flags_test_mask(const vma_flags_t *flags,
-		vma_flags_t to_test)
-{
-	const unsigned long *bitmap = flags->__vma_flags;
-	const unsigned long *bitmap_to_test = to_test.__vma_flags;
-
-	return bitmap_intersects(bitmap_to_test, bitmap, NUM_VMA_FLAG_BITS);
-}
-
-#define vma_flags_test(flags, ...) \
-	vma_flags_test_mask(flags, mk_vma_flags(__VA_ARGS__))
-
-static __always_inline bool vma_flags_test_all_mask(const vma_flags_t *flags,
-		vma_flags_t to_test)
-{
-	const unsigned long *bitmap = flags->__vma_flags;
-	const unsigned long *bitmap_to_test = to_test.__vma_flags;
-
-	return bitmap_subset(bitmap_to_test, bitmap, NUM_VMA_FLAG_BITS);
-}
-
-#define vma_flags_test_all(flags, ...) \
-	vma_flags_test_all_mask(flags, mk_vma_flags(__VA_ARGS__))
-
-static __always_inline void vma_flags_set_mask(vma_flags_t *flags, vma_flags_t to_set)
-{
-	unsigned long *bitmap = flags->__vma_flags;
-	const unsigned long *bitmap_to_set = to_set.__vma_flags;
-
-	bitmap_or(bitmap, bitmap, bitmap_to_set, NUM_VMA_FLAG_BITS);
-}
-
-#define vma_flags_set(flags, ...) \
-	vma_flags_set_mask(flags, mk_vma_flags(__VA_ARGS__))
-
-static __always_inline void vma_flags_clear_mask(vma_flags_t *flags, vma_flags_t to_clear)
-{
-	unsigned long *bitmap = flags->__vma_flags;
-	const unsigned long *bitmap_to_clear = to_clear.__vma_flags;
-
-	bitmap_andnot(bitmap, bitmap, bitmap_to_clear, NUM_VMA_FLAG_BITS);
-}
-
-#define vma_flags_clear(flags, ...) \
-	vma_flags_clear_mask(flags, mk_vma_flags(__VA_ARGS__))
-
-static inline bool vma_test_all_flags_mask(const struct vm_area_struct *vma,
-					   vma_flags_t flags)
-{
-	return vma_flags_test_all_mask(&vma->flags, flags);
-}
-
-#define vma_test_all_flags(vma, ...) \
-	vma_test_all_flags_mask(vma, mk_vma_flags(__VA_ARGS__))
-
-static inline void vma_set_flags_mask(struct vm_area_struct *vma,
-				      vma_flags_t flags)
-{
-	vma_flags_set_mask(&vma->flags, flags);
-}
-
-#define vma_set_flags(vma, ...) \
-	vma_set_flags_mask(vma, mk_vma_flags(__VA_ARGS__))
-
-static inline bool vma_desc_test_flags_mask(const struct vm_area_desc *desc,
-					    vma_flags_t flags)
-{
-	return vma_flags_test_mask(&desc->vma_flags, flags);
-}
-
-#define vma_desc_test_flags(desc, ...) \
-	vma_desc_test_flags_mask(desc, mk_vma_flags(__VA_ARGS__))
-
-static inline void vma_desc_set_flags_mask(struct vm_area_desc *desc,
-					   vma_flags_t flags)
-{
-	vma_flags_set_mask(&desc->vma_flags, flags);
-}
-
-#define vma_desc_set_flags(desc, ...) \
-	vma_desc_set_flags_mask(desc, mk_vma_flags(__VA_ARGS__))
-
-static inline void vma_desc_clear_flags_mask(struct vm_area_desc *desc,
-					     vma_flags_t flags)
-{
-	vma_flags_clear_mask(&desc->vma_flags, flags);
-}
-
-#define vma_desc_clear_flags(desc, ...) \
-	vma_desc_clear_flags_mask(desc, mk_vma_flags(__VA_ARGS__))
-
-static inline bool is_shared_maywrite_vm_flags(vm_flags_t vm_flags)
-{
-	return (vm_flags & (VM_SHARED | VM_MAYWRITE)) ==
-		(VM_SHARED | VM_MAYWRITE);
-}
-
-static inline bool is_shared_maywrite(const vma_flags_t *flags)
-{
-	return vma_flags_test_all(flags, VMA_SHARED_BIT, VMA_MAYWRITE_BIT);
-}
-
-static inline bool vma_is_shared_maywrite(struct vm_area_struct *vma)
-{
-	return is_shared_maywrite(&vma->flags);
-}
-
-static inline struct vm_area_struct *vma_next(struct vma_iterator *vmi)
-{
-	/*
-	 * Uses mas_find() to get the first VMA when the iterator starts.
-	 * Calling mas_next() could skip the first entry.
-	 */
-	return mas_find(&vmi->mas, ULONG_MAX);
-}
-
-/*
- * WARNING: to avoid racing with vma_mark_attached()/vma_mark_detached(), these
- * assertions should be made either under mmap_write_lock or when the object
- * has been isolated under mmap_write_lock, ensuring no competing writers.
- */
-static inline void vma_assert_attached(struct vm_area_struct *vma)
-{
-	WARN_ON_ONCE(!refcount_read(&vma->vm_refcnt));
-}
-
-static inline void vma_assert_detached(struct vm_area_struct *vma)
-{
-	WARN_ON_ONCE(refcount_read(&vma->vm_refcnt));
-}
-
-static inline void vma_assert_write_locked(struct vm_area_struct *);
-static inline void vma_mark_attached(struct vm_area_struct *vma)
-{
-	vma_assert_write_locked(vma);
-	vma_assert_detached(vma);
-	refcount_set_release(&vma->vm_refcnt, 1);
-}
-
-static inline void vma_mark_detached(struct vm_area_struct *vma)
-{
-	vma_assert_write_locked(vma);
-	vma_assert_attached(vma);
-	/* We are the only writer, so no need to use vma_refcount_put(). */
-	if (unlikely(!refcount_dec_and_test(&vma->vm_refcnt))) {
-		/*
-		 * Reader must have temporarily raised vm_refcnt but it will
-		 * drop it without using the vma since vma is write-locked.
-		 */
-	}
-}
-
-extern const struct vm_operations_struct vma_dummy_vm_ops;
-
-extern unsigned long rlimit(unsigned int limit);
-
-static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
-{
-	memset(vma, 0, sizeof(*vma));
-	vma->vm_mm = mm;
-	vma->vm_ops = &vma_dummy_vm_ops;
-	INIT_LIST_HEAD(&vma->anon_vma_chain);
-	vma->vm_lock_seq = UINT_MAX;
-}
-
-/*
- * These are defined in vma.h, but sadly vm_stat_account() is referenced by
- * kernel/fork.c, so we have to these broadly available there, and temporarily
- * define them here to resolve the dependency cycle.
- */
-
-#define is_exec_mapping(flags) \
-	((flags & (VM_EXEC | VM_WRITE | VM_STACK)) == VM_EXEC)
-
-#define is_stack_mapping(flags) \
-	(((flags & VM_STACK) == VM_STACK) || (flags & VM_SHADOW_STACK))
-
-#define is_data_mapping(flags) \
-	((flags & (VM_WRITE | VM_SHARED | VM_STACK)) == VM_WRITE)
-
-static inline void vm_stat_account(struct mm_struct *mm, vm_flags_t flags,
-				   long npages)
-{
-	WRITE_ONCE(mm->total_vm, READ_ONCE(mm->total_vm)+npages);
-
-	if (is_exec_mapping(flags))
-		mm->exec_vm += npages;
-	else if (is_stack_mapping(flags))
-		mm->stack_vm += npages;
-	else if (is_data_mapping(flags))
-		mm->data_vm += npages;
-}
-
-#undef is_exec_mapping
-#undef is_stack_mapping
-#undef is_data_mapping
-
-/* Currently stubbed but we may later wish to un-stub. */
-static inline void vm_acct_memory(long pages);
-static inline void vm_unacct_memory(long pages)
-{
-	vm_acct_memory(-pages);
-}
-
-static inline void mapping_allow_writable(struct address_space *mapping)
-{
-	atomic_inc(&mapping->i_mmap_writable);
-}
-
-static inline
-struct vm_area_struct *vma_find(struct vma_iterator *vmi, unsigned long max)
-{
-	return mas_find(&vmi->mas, max - 1);
-}
-
-static inline int vma_iter_clear_gfp(struct vma_iterator *vmi,
-			unsigned long start, unsigned long end, gfp_t gfp)
-{
-	__mas_set_range(&vmi->mas, start, end - 1);
-	mas_store_gfp(&vmi->mas, NULL, gfp);
-	if (unlikely(mas_is_err(&vmi->mas)))
-		return -ENOMEM;
-
-	return 0;
-}
-
-static inline void mmap_assert_locked(struct mm_struct *);
-static inline struct vm_area_struct *find_vma_intersection(struct mm_struct *mm,
-						unsigned long start_addr,
-						unsigned long end_addr)
-{
-	unsigned long index = start_addr;
-
-	mmap_assert_locked(mm);
-	return mt_find(&mm->mm_mt, &index, end_addr - 1);
-}
-
-static inline
-struct vm_area_struct *vma_lookup(struct mm_struct *mm, unsigned long addr)
-{
-	return mtree_load(&mm->mm_mt, addr);
-}
-
-static inline struct vm_area_struct *vma_prev(struct vma_iterator *vmi)
-{
-	return mas_prev(&vmi->mas, 0);
-}
-
-static inline void vma_iter_set(struct vma_iterator *vmi, unsigned long addr)
-{
-	mas_set(&vmi->mas, addr);
-}
-
-static inline bool vma_is_anonymous(struct vm_area_struct *vma)
-{
-	return !vma->vm_ops;
-}
-
-/* Defined in vma.h, so temporarily define here to avoid circular dependency. */
-#define vma_iter_load(vmi) \
-	mas_walk(&(vmi)->mas)
-
-static inline struct vm_area_struct *
-find_vma_prev(struct mm_struct *mm, unsigned long addr,
-			struct vm_area_struct **pprev)
-{
-	struct vm_area_struct *vma;
-	VMA_ITERATOR(vmi, mm, addr);
-
-	vma = vma_iter_load(&vmi);
-	*pprev = vma_prev(&vmi);
-	if (!vma)
-		vma = vma_next(&vmi);
-	return vma;
-}
-
-#undef vma_iter_load
-
-static inline void vma_iter_init(struct vma_iterator *vmi,
-		struct mm_struct *mm, unsigned long addr)
-{
-	mas_init(&vmi->mas, &mm->mm_mt, addr);
-}
-
-/* Stubbed functions. */
-
-static inline struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma)
-{
-	return NULL;
-}
-
-static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma,
-					struct vm_userfaultfd_ctx vm_ctx)
-{
-	return true;
-}
-
-static inline bool anon_vma_name_eq(struct anon_vma_name *anon_name1,
-				    struct anon_vma_name *anon_name2)
-{
-	return true;
-}
-
-static inline void might_sleep(void)
-{
-}
-
-static inline unsigned long vma_pages(struct vm_area_struct *vma)
-{
-	return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
-}
-
-static inline void fput(struct file *file)
-{
-}
-
-static inline void mpol_put(struct mempolicy *pol)
-{
-}
-
-static inline void lru_add_drain(void)
-{
-}
-
-static inline void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm)
-{
-}
-
-static inline void update_hiwater_rss(struct mm_struct *mm)
-{
-}
-
-static inline void update_hiwater_vm(struct mm_struct *mm)
-{
-}
-
-struct unmap_desc;
-
-static inline void unmap_vmas(struct mmu_gather *tlb, struct unmap_desc *unmap)
-{
-}
-
-static inline void free_pgtables(struct mmu_gather *tlb, struct unmap_desc *desc)
-{
-	(void)tlb;
-	(void)desc;
-}
-
-static inline void mapping_unmap_writable(struct address_space *mapping)
-{
-}
-
-static inline void flush_dcache_mmap_lock(struct address_space *mapping)
-{
-}
-
-static inline void tlb_finish_mmu(struct mmu_gather *tlb)
-{
-}
-
-static inline struct file *get_file(struct file *f)
-{
-	return f;
-}
-
-static inline int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
-{
-	return 0;
-}
-
-static inline int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src,
-				 enum vma_operation operation)
-{
-	/* For testing purposes. We indicate that an anon_vma has been cloned. */
-	if (src->anon_vma != NULL) {
-		dst->anon_vma = src->anon_vma;
-		dst->anon_vma->was_cloned = true;
-	}
-
-	return 0;
-}
-
-static inline void vma_start_write(struct vm_area_struct *vma)
-{
-	/* Used to indicate to tests that a write operation has begun. */
-	vma->vm_lock_seq++;
-}
-
-static inline __must_check
-int vma_start_write_killable(struct vm_area_struct *vma)
-{
-	/* Used to indicate to tests that a write operation has begun. */
-	vma->vm_lock_seq++;
-	return 0;
-}
-
-static inline void vma_adjust_trans_huge(struct vm_area_struct *vma,
-					 unsigned long start,
-					 unsigned long end,
-					 struct vm_area_struct *next)
-{
-}
-
-static inline void hugetlb_split(struct vm_area_struct *, unsigned long) {}
-
-static inline void vma_iter_free(struct vma_iterator *vmi)
-{
-	mas_destroy(&vmi->mas);
-}
-
-static inline
-struct vm_area_struct *vma_iter_next_range(struct vma_iterator *vmi)
-{
-	return mas_next_range(&vmi->mas, ULONG_MAX);
-}
-
-static inline void vm_acct_memory(long pages)
-{
-}
-
-static inline void vma_interval_tree_insert(struct vm_area_struct *vma,
-					    struct rb_root_cached *rb)
-{
-}
-
-static inline void vma_interval_tree_remove(struct vm_area_struct *vma,
-					    struct rb_root_cached *rb)
-{
-}
-
-static inline void flush_dcache_mmap_unlock(struct address_space *mapping)
-{
-}
-
-static inline void anon_vma_interval_tree_insert(struct anon_vma_chain *avc,
-						 struct rb_root_cached *rb)
-{
-}
-
-static inline void anon_vma_interval_tree_remove(struct anon_vma_chain *avc,
-						 struct rb_root_cached *rb)
-{
-}
-
-static inline void uprobe_mmap(struct vm_area_struct *vma)
-{
-}
-
-static inline void uprobe_munmap(struct vm_area_struct *vma,
-				 unsigned long start, unsigned long end)
-{
-}
-
-static inline void i_mmap_lock_write(struct address_space *mapping)
-{
-}
-
-static inline void anon_vma_lock_write(struct anon_vma *anon_vma)
-{
-}
-
-static inline void vma_assert_write_locked(struct vm_area_struct *vma)
-{
-}
-
-static inline void unlink_anon_vmas(struct vm_area_struct *vma)
-{
-	/* For testing purposes, indicate that the anon_vma was unlinked. */
-	vma->anon_vma->was_unlinked = true;
-}
-
-static inline void anon_vma_unlock_write(struct anon_vma *anon_vma)
-{
-}
-
-static inline void i_mmap_unlock_write(struct address_space *mapping)
-{
-}
-
-static inline int userfaultfd_unmap_prep(struct vm_area_struct *vma,
-					 unsigned long start,
-					 unsigned long end,
-					 struct list_head *unmaps)
-{
-	return 0;
-}
-
-static inline void mmap_write_downgrade(struct mm_struct *mm)
-{
-}
-
-static inline void mmap_read_unlock(struct mm_struct *mm)
-{
-}
-
-static inline void mmap_write_unlock(struct mm_struct *mm)
-{
-}
-
-static inline int mmap_write_lock_killable(struct mm_struct *mm)
-{
-	return 0;
-}
-
-static inline bool can_modify_mm(struct mm_struct *mm,
-				 unsigned long start,
-				 unsigned long end)
-{
-	return true;
-}
-
-static inline void arch_unmap(struct mm_struct *mm,
-				 unsigned long start,
-				 unsigned long end)
-{
-}
-
-static inline void mmap_assert_locked(struct mm_struct *mm)
-{
-}
-
-static inline bool mpol_equal(struct mempolicy *a, struct mempolicy *b)
-{
-	return true;
-}
-
-static inline void khugepaged_enter_vma(struct vm_area_struct *vma,
-			  vm_flags_t vm_flags)
-{
-}
-
-static inline bool mapping_can_writeback(struct address_space *mapping)
-{
-	return true;
-}
-
-static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma)
-{
-	return false;
-}
-
-static inline bool vma_soft_dirty_enabled(struct vm_area_struct *vma)
-{
-	return false;
-}
-
-static inline bool userfaultfd_wp(struct vm_area_struct *vma)
-{
-	return false;
-}
-
-static inline void mmap_assert_write_locked(struct mm_struct *mm)
-{
-}
-
-static inline void mutex_lock(struct mutex *lock)
-{
-}
-
-static inline void mutex_unlock(struct mutex *lock)
-{
-}
-
-static inline bool mutex_is_locked(struct mutex *lock)
-{
-	return true;
-}
-
-static inline bool signal_pending(void *p)
-{
-	return false;
-}
-
-static inline bool is_file_hugepages(struct file *file)
-{
-	return false;
-}
-
-static inline int security_vm_enough_memory_mm(struct mm_struct *mm, long pages)
-{
-	return 0;
-}
-
-static inline bool may_expand_vm(struct mm_struct *mm, vm_flags_t flags,
-				 unsigned long npages)
-{
-	return true;
-}
-
-static inline int shmem_zero_setup(struct vm_area_struct *vma)
-{
-	return 0;
-}
-
-static inline void vma_set_anonymous(struct vm_area_struct *vma)
-{
-	vma->vm_ops = NULL;
-}
-
-static inline void ksm_add_vma(struct vm_area_struct *vma)
-{
-}
-
-static inline void perf_event_mmap(struct vm_area_struct *vma)
-{
-}
-
-static inline bool vma_is_dax(struct vm_area_struct *vma)
-{
-	return false;
-}
-
-static inline struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
-{
-	return NULL;
-}
-
-bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot);
-
-/* Update vma->vm_page_prot to reflect vma->vm_flags. */
-static inline void vma_set_page_prot(struct vm_area_struct *vma)
-{
-	vm_flags_t vm_flags = vma->vm_flags;
-	pgprot_t vm_page_prot;
-
-	/* testing: we inline vm_pgprot_modify() to avoid clash with vma.h. */
-	vm_page_prot = pgprot_modify(vma->vm_page_prot, vm_get_page_prot(vm_flags));
-
-	if (vma_wants_writenotify(vma, vm_page_prot)) {
-		vm_flags &= ~VM_SHARED;
-		/* testing: we inline vm_pgprot_modify() to avoid clash with vma.h. */
-		vm_page_prot = pgprot_modify(vm_page_prot, vm_get_page_prot(vm_flags));
-	}
-	/* remove_protection_ptes reads vma->vm_page_prot without mmap_lock */
-	WRITE_ONCE(vma->vm_page_prot, vm_page_prot);
-}
-
-static inline bool arch_validate_flags(vm_flags_t flags)
-{
-	return true;
-}
-
-static inline void vma_close(struct vm_area_struct *vma)
-{
-}
-
-static inline int mmap_file(struct file *file, struct vm_area_struct *vma)
-{
-	return 0;
-}
-
-static inline unsigned long stack_guard_start_gap(struct vm_area_struct *vma)
-{
-	if (vma->vm_flags & VM_GROWSDOWN)
-		return stack_guard_gap;
-
-	/* See reasoning around the VM_SHADOW_STACK definition */
-	if (vma->vm_flags & VM_SHADOW_STACK)
-		return PAGE_SIZE;
-
-	return 0;
-}
-
-static inline unsigned long vm_start_gap(struct vm_area_struct *vma)
-{
-	unsigned long gap = stack_guard_start_gap(vma);
-	unsigned long vm_start = vma->vm_start;
-
-	vm_start -= gap;
-	if (vm_start > vma->vm_start)
-		vm_start = 0;
-	return vm_start;
-}
-
-static inline unsigned long vm_end_gap(struct vm_area_struct *vma)
-{
-	unsigned long vm_end = vma->vm_end;
-
-	if (vma->vm_flags & VM_GROWSUP) {
-		vm_end += stack_guard_gap;
-		if (vm_end < vma->vm_end)
-			vm_end = -PAGE_SIZE;
-	}
-	return vm_end;
-}
-
-static inline int is_hugepage_only_range(struct mm_struct *mm,
-					unsigned long addr, unsigned long len)
-{
-	return 0;
-}
-
-static inline bool vma_is_accessible(struct vm_area_struct *vma)
-{
-	return vma->vm_flags & VM_ACCESS_FLAGS;
-}
-
-static inline bool capable(int cap)
-{
-	return true;
-}
-
-static inline bool mlock_future_ok(const struct mm_struct *mm,
-		vm_flags_t vm_flags, unsigned long bytes)
-{
-	unsigned long locked_pages, limit_pages;
-
-	if (!(vm_flags & VM_LOCKED) || capable(CAP_IPC_LOCK))
-		return true;
-
-	locked_pages = bytes >> PAGE_SHIFT;
-	locked_pages += mm->locked_vm;
-
-	limit_pages = rlimit(RLIMIT_MEMLOCK);
-	limit_pages >>= PAGE_SHIFT;
-
-	return locked_pages <= limit_pages;
-}
-
-static inline int __anon_vma_prepare(struct vm_area_struct *vma)
-{
-	struct anon_vma *anon_vma = calloc(1, sizeof(struct anon_vma));
-
-	if (!anon_vma)
-		return -ENOMEM;
-
-	anon_vma->root = anon_vma;
-	vma->anon_vma = anon_vma;
-
-	return 0;
-}
-
-static inline int anon_vma_prepare(struct vm_area_struct *vma)
-{
-	if (likely(vma->anon_vma))
-		return 0;
-
-	return __anon_vma_prepare(vma);
-}
-
-static inline void userfaultfd_unmap_complete(struct mm_struct *mm,
-					      struct list_head *uf)
-{
-}
-
-static inline bool mm_flags_test(int flag, const struct mm_struct *mm)
-{
-	return test_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags));
-}
-
-/*
- * Copy value to the first system word of VMA flags, non-atomically.
- *
- * IMPORTANT: This does not overwrite bytes past the first system word. The
- * caller must account for this.
- */
-static inline void vma_flags_overwrite_word(vma_flags_t *flags, unsigned long value)
-{
-	*ACCESS_PRIVATE(flags, __vma_flags) = value;
-}
-
-/*
- * Copy value to the first system word of VMA flags ONCE, non-atomically.
- *
- * IMPORTANT: This does not overwrite bytes past the first system word. The
- * caller must account for this.
- */
-static inline void vma_flags_overwrite_word_once(vma_flags_t *flags, unsigned long value)
-{
-	unsigned long *bitmap = ACCESS_PRIVATE(flags, __vma_flags);
-
-	WRITE_ONCE(*bitmap, value);
-}
-
-/* Update the first system word of VMA flags setting bits, non-atomically. */
-static inline void vma_flags_set_word(vma_flags_t *flags, unsigned long value)
-{
-	unsigned long *bitmap = ACCESS_PRIVATE(flags, __vma_flags);
-
-	*bitmap |= value;
-}
-
-/* Update the first system word of VMA flags clearing bits, non-atomically. */
-static inline void vma_flags_clear_word(vma_flags_t *flags, unsigned long value)
-{
-	unsigned long *bitmap = ACCESS_PRIVATE(flags, __vma_flags);
-
-	*bitmap &= ~value;
-}
-
-
-/* Use when VMA is not part of the VMA tree and needs no locking */
-static inline void vm_flags_init(struct vm_area_struct *vma,
-				 vm_flags_t flags)
-{
-	vma_flags_clear_all(&vma->flags);
-	vma_flags_overwrite_word(&vma->flags, flags);
-}
-
-/*
- * Use when VMA is part of the VMA tree and modifications need coordination
- * Note: vm_flags_reset and vm_flags_reset_once do not lock the vma and
- * it should be locked explicitly beforehand.
- */
-static inline void vm_flags_reset(struct vm_area_struct *vma,
-				  vm_flags_t flags)
-{
-	vma_assert_write_locked(vma);
-	vm_flags_init(vma, flags);
-}
-
-static inline void vm_flags_reset_once(struct vm_area_struct *vma,
-				       vm_flags_t flags)
-{
-	vma_assert_write_locked(vma);
-	/*
-	 * The user should only be interested in avoiding reordering of
-	 * assignment to the first word.
-	 */
-	vma_flags_clear_all(&vma->flags);
-	vma_flags_overwrite_word_once(&vma->flags, flags);
-}
-
-static inline void vm_flags_set(struct vm_area_struct *vma,
-				vm_flags_t flags)
-{
-	vma_start_write(vma);
-	vma_flags_set_word(&vma->flags, flags);
-}
-
-static inline void vm_flags_clear(struct vm_area_struct *vma,
-				  vm_flags_t flags)
-{
-	vma_start_write(vma);
-	vma_flags_clear_word(&vma->flags, flags);
-}
-
-/*
- * Denies creating a writable executable mapping or gaining executable permissions.
- *
- * This denies the following:
- *
- *     a)      mmap(PROT_WRITE | PROT_EXEC)
- *
- *     b)      mmap(PROT_WRITE)
- *             mprotect(PROT_EXEC)
- *
- *     c)      mmap(PROT_WRITE)
- *             mprotect(PROT_READ)
- *             mprotect(PROT_EXEC)
- *
- * But allows the following:
- *
- *     d)      mmap(PROT_READ | PROT_EXEC)
- *             mmap(PROT_READ | PROT_EXEC | PROT_BTI)
- *
- * This is only applicable if the user has set the Memory-Deny-Write-Execute
- * (MDWE) protection mask for the current process.
- *
- * @old specifies the VMA flags the VMA originally possessed, and @new the ones
- * we propose to set.
- *
- * Return: false if proposed change is OK, true if not ok and should be denied.
- */
-static inline bool map_deny_write_exec(unsigned long old, unsigned long new)
-{
-	/* If MDWE is disabled, we have nothing to deny. */
-	if (mm_flags_test(MMF_HAS_MDWE, current->mm))
-		return false;
-
-	/* If the new VMA is not executable, we have nothing to deny. */
-	if (!(new & VM_EXEC))
-		return false;
-
-	/* Under MDWE we do not accept newly writably executable VMAs... */
-	if (new & VM_WRITE)
-		return true;
-
-	/* ...nor previously non-executable VMAs becoming executable. */
-	if (!(old & VM_EXEC))
-		return true;
-
-	return false;
-}
-
-static inline int mapping_map_writable(struct address_space *mapping)
-{
-	return atomic_inc_unless_negative(&mapping->i_mmap_writable) ?
-		0 : -EPERM;
-}
-
-static inline unsigned long move_page_tables(struct pagetable_move_control *pmc)
-{
-	return 0;
-}
-
-static inline void free_pgd_range(struct mmu_gather *tlb,
-			unsigned long addr, unsigned long end,
-			unsigned long floor, unsigned long ceiling)
-{
-}
-
-static inline int ksm_execve(struct mm_struct *mm)
-{
-	return 0;
-}
-
-static inline void ksm_exit(struct mm_struct *mm)
-{
-}
-
-static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt)
-{
-	if (reset_refcnt)
-		refcount_set(&vma->vm_refcnt, 0);
-}
-
-static inline void vma_numab_state_init(struct vm_area_struct *vma)
-{
-}
-
-static inline void vma_numab_state_free(struct vm_area_struct *vma)
-{
-}
-
-static inline void dup_anon_vma_name(struct vm_area_struct *orig_vma,
-				     struct vm_area_struct *new_vma)
-{
-}
-
-static inline void free_anon_vma_name(struct vm_area_struct *vma)
-{
-}
-
-/* Declared in vma.h. */
-static inline void set_vma_from_desc(struct vm_area_struct *vma,
-		struct vm_area_desc *desc);
-
-static inline void mmap_action_prepare(struct mmap_action *action,
-					   struct vm_area_desc *desc)
-{
-}
-
-static inline int mmap_action_complete(struct mmap_action *action,
-					   struct vm_area_struct *vma)
-{
-	return 0;
-}
-
-static inline int __compat_vma_mmap(const struct file_operations *f_op,
-		struct file *file, struct vm_area_struct *vma)
-{
-	struct vm_area_desc desc = {
-		.mm = vma->vm_mm,
-		.file = file,
-		.start = vma->vm_start,
-		.end = vma->vm_end,
-
-		.pgoff = vma->vm_pgoff,
-		.vm_file = vma->vm_file,
-		.vm_flags = vma->vm_flags,
-		.page_prot = vma->vm_page_prot,
-
-		.action.type = MMAP_NOTHING, /* Default */
-	};
-	int err;
-
-	err = f_op->mmap_prepare(&desc);
-	if (err)
-		return err;
-
-	mmap_action_prepare(&desc.action, &desc);
-	set_vma_from_desc(vma, &desc);
-	return mmap_action_complete(&desc.action, vma);
-}
-
-static inline int compat_vma_mmap(struct file *file,
-		struct vm_area_struct *vma)
-{
-	return __compat_vma_mmap(file->f_op, file, vma);
-}
-
-/* Did the driver provide valid mmap hook configuration? */
-static inline bool can_mmap_file(struct file *file)
-{
-	bool has_mmap = file->f_op->mmap;
-	bool has_mmap_prepare = file->f_op->mmap_prepare;
-
-	/* Hooks are mutually exclusive. */
-	if (WARN_ON_ONCE(has_mmap && has_mmap_prepare))
-		return false;
-	if (!has_mmap && !has_mmap_prepare)
-		return false;
-
-	return true;
-}
-
-static inline int vfs_mmap(struct file *file, struct vm_area_struct *vma)
-{
-	if (file->f_op->mmap_prepare)
-		return compat_vma_mmap(file, vma);
-
-	return file->f_op->mmap(file, vma);
-}
-
-static inline int vfs_mmap_prepare(struct file *file, struct vm_area_desc *desc)
-{
-	return file->f_op->mmap_prepare(desc);
-}
-
-static inline void fixup_hugetlb_reservations(struct vm_area_struct *vma)
-{
-}
-
-static inline void vma_set_file(struct vm_area_struct *vma, struct file *file)
-{
-	/* Changing an anonymous vma with this is illegal */
-	get_file(file);
-	swap(vma->vm_file, file);
-	fput(file);
-}
-
-static inline bool shmem_file(struct file *file)
-{
-	return false;
-}
-
-static inline vm_flags_t ksm_vma_flags(const struct mm_struct *mm,
-		const struct file *file, vm_flags_t vm_flags)
-{
-	return vm_flags;
-}
-
-static inline void remap_pfn_range_prepare(struct vm_area_desc *desc, unsigned long pfn)
-{
-}
-
-static inline int remap_pfn_range_complete(struct vm_area_struct *vma, unsigned long addr,
-		unsigned long pfn, unsigned long size, pgprot_t pgprot)
-{
-	return 0;
-}
+typedef unsigned long vm_flags_t;
+#define pgoff_t unsigned long
+typedef unsigned long	pgprotval_t;
+typedef struct pgprot { pgprotval_t pgprot; } pgprot_t;
+typedef __bitwise unsigned int vm_fault_t;
 
-static inline int do_munmap(struct mm_struct *, unsigned long, size_t,
-		struct list_head *uf)
-{
-	return 0;
-}
+#include "include/stubs.h"
+#include "include/dup.h"
+#include "include/custom.h"
 
 #endif	/* __MM_VMA_INTERNAL_H */
-- 
cgit v1.2.3


From f615cc92641a403d354c6ee68263074a86de49c7 Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Date: Thu, 22 Jan 2026 16:06:22 +0000
Subject: tools/testing/vma: add VMA userland tests for VMA flag functions

Now we have the capability to test the new helpers for the bitmap VMA
flags in userland, do so.

We also update the Makefile such that both VMA (and while we're here)
mm_struct flag sizes can be customised on build.  We default to 128-bit to
enable testing of flags above word size even on 64-bit systems.

We add userland tests to ensure that we do not regress VMA flag behaviour
with the introduction when using bitmap VMA flags, nor accidentally
introduce unexpected results due to for instance higher bit values not
being correctly cleared/set.

As part of this change, make __mk_vma_flags() a custom function so we can
handle specifying invalid VMA bits.  This is purposeful so we can have the
VMA tests work at lower and higher number of VMA flags without having to
duplicate code too much.

Link: https://lkml.kernel.org/r/7fe6afe9c8c61e4d3cfc9a2d50a5d24da8528e68.1769097829.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Barry Song <baohua@kernel.org>
Cc: David Hildenbrand <david@kernel.org>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Zi Yan <ziy@nvidia.com>
Cc: Damien Le Moal <dlemoal@kernel.org>
Cc: "Darrick J. Wong" <djwong@kernel.org>
Cc: Jarkko Sakkinen <jarkko@kernel.org>
Cc: Yury Norov <ynorov@nvidia.com>
Cc: Chris Mason <clm@fb.com>
Cc: Pedro Falcato <pfalcato@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/testing/vma/Makefile         |   3 +
 tools/testing/vma/include/custom.h |  16 ++
 tools/testing/vma/include/dup.h    |  11 +-
 tools/testing/vma/tests/vma.c      | 300 +++++++++++++++++++++++++++++++++++++
 tools/testing/vma/vma_internal.h   |   4 +-
 5 files changed, 322 insertions(+), 12 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/vma/Makefile b/tools/testing/vma/Makefile
index 50aa4301b3a6..e72b45dedda5 100644
--- a/tools/testing/vma/Makefile
+++ b/tools/testing/vma/Makefile
@@ -9,6 +9,9 @@ include ../shared/shared.mk
 OFILES = $(SHARED_OFILES) main.o shared.o maple-shim.o
 TARGETS = vma
 
+# These can be varied to test different sizes.
+CFLAGS += -DNUM_VMA_FLAG_BITS=128 -DNUM_MM_FLAG_BITS=128
+
 main.o: main.c shared.c shared.h vma_internal.h tests/merge.c tests/mmap.c tests/vma.c ../../../mm/vma.c ../../../mm/vma_init.c ../../../mm/vma_exec.c ../../../mm/vma.h include/custom.h include/dup.h include/stubs.h
 
 vma:	$(OFILES)
diff --git a/tools/testing/vma/include/custom.h b/tools/testing/vma/include/custom.h
index f567127efba9..802a76317245 100644
--- a/tools/testing/vma/include/custom.h
+++ b/tools/testing/vma/include/custom.h
@@ -101,3 +101,19 @@ static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt)
 	if (reset_refcnt)
 		refcount_set(&vma->vm_refcnt, 0);
 }
+
+static inline vma_flags_t __mk_vma_flags(size_t count, const vma_flag_t *bits)
+{
+	vma_flags_t flags;
+	int i;
+
+	/*
+	 * For testing purposes: allow invalid bit specification so we can
+	 * easily test.
+	 */
+	vma_flags_clear_all(&flags);
+	for (i = 0; i < count; i++)
+		if (bits[i] < NUM_VMA_FLAG_BITS)
+			vma_flag_set(&flags, bits[i]);
+	return flags;
+}
diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h
index 0accfc296615..3078ff1487d3 100644
--- a/tools/testing/vma/include/dup.h
+++ b/tools/testing/vma/include/dup.h
@@ -838,16 +838,7 @@ static inline void vm_flags_clear(struct vm_area_struct *vma,
 	vma_flags_clear_word(&vma->flags, flags);
 }
 
-static inline vma_flags_t __mk_vma_flags(size_t count, const vma_flag_t *bits)
-{
-	vma_flags_t flags;
-	int i;
-
-	vma_flags_clear_all(&flags);
-	for (i = 0; i < count; i++)
-		vma_flag_set(&flags, bits[i]);
-	return flags;
-}
+static inline vma_flags_t __mk_vma_flags(size_t count, const vma_flag_t *bits);
 
 #define mk_vma_flags(...) __mk_vma_flags(COUNT_ARGS(__VA_ARGS__), \
 					 (const vma_flag_t []){__VA_ARGS__})
diff --git a/tools/testing/vma/tests/vma.c b/tools/testing/vma/tests/vma.c
index 6d9775aee243..c54ffc954f11 100644
--- a/tools/testing/vma/tests/vma.c
+++ b/tools/testing/vma/tests/vma.c
@@ -1,5 +1,25 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
 
+static bool compare_legacy_flags(vm_flags_t legacy_flags, vma_flags_t flags)
+{
+	const unsigned long legacy_val = legacy_flags;
+	/* The lower word should contain the precise same value. */
+	const unsigned long flags_lower = flags.__vma_flags[0];
+#if NUM_VMA_FLAGS > BITS_PER_LONG
+	int i;
+
+	/* All bits in higher flag values should be zero. */
+	for (i = 1; i < NUM_VMA_FLAGS / BITS_PER_LONG; i++) {
+		if (flags.__vma_flags[i] != 0)
+			return false;
+	}
+#endif
+
+	static_assert(sizeof(legacy_flags) == sizeof(unsigned long));
+
+	return legacy_val == flags_lower;
+}
+
 static bool test_copy_vma(void)
 {
 	vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
@@ -33,7 +53,287 @@ static bool test_copy_vma(void)
 	return true;
 }
 
+static bool test_vma_flags_unchanged(void)
+{
+	vma_flags_t flags = EMPTY_VMA_FLAGS;
+	vm_flags_t legacy_flags = 0;
+	int bit;
+	struct vm_area_struct vma;
+	struct vm_area_desc desc;
+
+
+	vma.flags = EMPTY_VMA_FLAGS;
+	desc.vma_flags = EMPTY_VMA_FLAGS;
+
+	for (bit = 0; bit < BITS_PER_LONG; bit++) {
+		vma_flags_t mask = mk_vma_flags(bit);
+
+		legacy_flags |= (1UL << bit);
+
+		/* Individual flags. */
+		vma_flags_set(&flags, bit);
+		ASSERT_TRUE(compare_legacy_flags(legacy_flags, flags));
+
+		/* Via mask. */
+		vma_flags_set_mask(&flags, mask);
+		ASSERT_TRUE(compare_legacy_flags(legacy_flags, flags));
+
+		/* Same for VMA. */
+		vma_set_flags(&vma, bit);
+		ASSERT_TRUE(compare_legacy_flags(legacy_flags, vma.flags));
+		vma_set_flags_mask(&vma, mask);
+		ASSERT_TRUE(compare_legacy_flags(legacy_flags, vma.flags));
+
+		/* Same for VMA descriptor. */
+		vma_desc_set_flags(&desc, bit);
+		ASSERT_TRUE(compare_legacy_flags(legacy_flags, desc.vma_flags));
+		vma_desc_set_flags_mask(&desc, mask);
+		ASSERT_TRUE(compare_legacy_flags(legacy_flags, desc.vma_flags));
+	}
+
+	return true;
+}
+
+static bool test_vma_flags_cleared(void)
+{
+	const vma_flags_t empty = EMPTY_VMA_FLAGS;
+	vma_flags_t flags;
+	int i;
+
+	/* Set all bits high. */
+	memset(&flags, 1, sizeof(flags));
+	/* Try to clear. */
+	vma_flags_clear_all(&flags);
+	/* Equal to EMPTY_VMA_FLAGS? */
+	ASSERT_EQ(memcmp(&empty, &flags, sizeof(flags)), 0);
+	/* Make sure every unsigned long entry in bitmap array zero. */
+	for (i = 0; i < sizeof(flags) / BITS_PER_LONG; i++) {
+		const unsigned long val = flags.__vma_flags[i];
+
+		ASSERT_EQ(val, 0);
+	}
+
+	return true;
+}
+
+/*
+ * Assert that VMA flag functions that operate at the system word level function
+ * correctly.
+ */
+static bool test_vma_flags_word(void)
+{
+	vma_flags_t flags = EMPTY_VMA_FLAGS;
+	const vma_flags_t comparison =
+		mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT, 64, 65);
+
+	/* Set some custom high flags. */
+	vma_flags_set(&flags, 64, 65);
+	/* Now overwrite the first word. */
+	vma_flags_overwrite_word(&flags, VM_READ | VM_WRITE);
+	/* Ensure they are equal. */
+	ASSERT_EQ(memcmp(&flags, &comparison, sizeof(flags)), 0);
+
+	flags = EMPTY_VMA_FLAGS;
+	vma_flags_set(&flags, 64, 65);
+
+	/* Do the same with the _once() equivalent. */
+	vma_flags_overwrite_word_once(&flags, VM_READ | VM_WRITE);
+	ASSERT_EQ(memcmp(&flags, &comparison, sizeof(flags)), 0);
+
+	flags = EMPTY_VMA_FLAGS;
+	vma_flags_set(&flags, 64, 65);
+
+	/* Make sure we can set a word without disturbing other bits. */
+	vma_flags_set(&flags, VMA_WRITE_BIT);
+	vma_flags_set_word(&flags, VM_READ);
+	ASSERT_EQ(memcmp(&flags, &comparison, sizeof(flags)), 0);
+
+	flags = EMPTY_VMA_FLAGS;
+	vma_flags_set(&flags, 64, 65);
+
+	/* Make sure we can clear a word without disturbing other bits. */
+	vma_flags_set(&flags, VMA_READ_BIT, VMA_WRITE_BIT, VMA_EXEC_BIT);
+	vma_flags_clear_word(&flags, VM_EXEC);
+	ASSERT_EQ(memcmp(&flags, &comparison, sizeof(flags)), 0);
+
+	return true;
+}
+
+/* Ensure that vma_flags_test() and friends works correctly. */
+static bool test_vma_flags_test(void)
+{
+	const vma_flags_t flags = mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT,
+					       VMA_EXEC_BIT, 64, 65);
+	struct vm_area_struct vma;
+	struct vm_area_desc desc;
+
+	vma.flags = flags;
+	desc.vma_flags = flags;
+
+#define do_test(...)						\
+	ASSERT_TRUE(vma_flags_test(&flags, __VA_ARGS__));	\
+	ASSERT_TRUE(vma_desc_test_flags(&desc, __VA_ARGS__))
+
+#define do_test_all_true(...)					\
+	ASSERT_TRUE(vma_flags_test_all(&flags, __VA_ARGS__));	\
+	ASSERT_TRUE(vma_test_all_flags(&vma, __VA_ARGS__))
+
+#define do_test_all_false(...)					\
+	ASSERT_FALSE(vma_flags_test_all(&flags, __VA_ARGS__));	\
+	ASSERT_FALSE(vma_test_all_flags(&vma, __VA_ARGS__))
+
+	/*
+	 * Testing for some flags that are present, some that are not - should
+	 * pass. ANY flags matching should work.
+	 */
+	do_test(VMA_READ_BIT, VMA_MAYREAD_BIT, VMA_SEQ_READ_BIT);
+	/* However, the ...test_all() variant should NOT pass. */
+	do_test_all_false(VMA_READ_BIT, VMA_MAYREAD_BIT, VMA_SEQ_READ_BIT);
+	/* But should pass for flags present. */
+	do_test_all_true(VMA_READ_BIT, VMA_WRITE_BIT, VMA_EXEC_BIT, 64, 65);
+	/* Also subsets... */
+	do_test_all_true(VMA_READ_BIT, VMA_WRITE_BIT, VMA_EXEC_BIT, 64);
+	do_test_all_true(VMA_READ_BIT, VMA_WRITE_BIT, VMA_EXEC_BIT);
+	do_test_all_true(VMA_READ_BIT, VMA_WRITE_BIT);
+	do_test_all_true(VMA_READ_BIT);
+	/*
+	 * Check _mask variant. We don't need to test extensively as macro
+	 * helper is the equivalent.
+	 */
+	ASSERT_TRUE(vma_flags_test_mask(&flags, flags));
+	ASSERT_TRUE(vma_flags_test_all_mask(&flags, flags));
+
+	/* Single bits. */
+	do_test(VMA_READ_BIT);
+	do_test(VMA_WRITE_BIT);
+	do_test(VMA_EXEC_BIT);
+#if NUM_VMA_FLAG_BITS > 64
+	do_test(64);
+	do_test(65);
+#endif
+
+	/* Two bits. */
+	do_test(VMA_READ_BIT, VMA_WRITE_BIT);
+	do_test(VMA_READ_BIT, VMA_EXEC_BIT);
+	do_test(VMA_WRITE_BIT, VMA_EXEC_BIT);
+	/* Ordering shouldn't matter. */
+	do_test(VMA_WRITE_BIT, VMA_READ_BIT);
+	do_test(VMA_EXEC_BIT, VMA_READ_BIT);
+	do_test(VMA_EXEC_BIT, VMA_WRITE_BIT);
+#if NUM_VMA_FLAG_BITS > 64
+	do_test(VMA_READ_BIT, 64);
+	do_test(VMA_WRITE_BIT, 64);
+	do_test(64, VMA_READ_BIT);
+	do_test(64, VMA_WRITE_BIT);
+	do_test(VMA_READ_BIT, 65);
+	do_test(VMA_WRITE_BIT, 65);
+	do_test(65, VMA_READ_BIT);
+	do_test(65, VMA_WRITE_BIT);
+#endif
+	/* Three bits. */
+	do_test(VMA_READ_BIT, VMA_WRITE_BIT, VMA_EXEC_BIT);
+#if NUM_VMA_FLAG_BITS > 64
+	/* No need to consider every single permutation. */
+	do_test(VMA_READ_BIT, VMA_WRITE_BIT, 64);
+	do_test(VMA_READ_BIT, VMA_WRITE_BIT, 65);
+
+	/* Four bits. */
+	do_test(VMA_READ_BIT, VMA_WRITE_BIT, VMA_EXEC_BIT, 64);
+	do_test(VMA_READ_BIT, VMA_WRITE_BIT, VMA_EXEC_BIT, 65);
+
+	/* Five bits. */
+	do_test(VMA_READ_BIT, VMA_WRITE_BIT, VMA_EXEC_BIT, 64, 65);
+#endif
+
+#undef do_test
+#undef do_test_all_true
+#undef do_test_all_false
+
+	return true;
+}
+
+/* Ensure that vma_flags_clear() and friends works correctly. */
+static bool test_vma_flags_clear(void)
+{
+	vma_flags_t flags = mk_vma_flags(VMA_READ_BIT, VMA_WRITE_BIT,
+					 VMA_EXEC_BIT, 64, 65);
+	vma_flags_t mask = mk_vma_flags(VMA_EXEC_BIT, 64);
+	struct vm_area_struct vma;
+	struct vm_area_desc desc;
+
+	vma.flags = flags;
+	desc.vma_flags = flags;
+
+	/* Cursory check of _mask() variant, as the helper macros imply. */
+	vma_flags_clear_mask(&flags, mask);
+	vma_flags_clear_mask(&vma.flags, mask);
+	vma_desc_clear_flags_mask(&desc, mask);
+	ASSERT_FALSE(vma_flags_test(&flags, VMA_EXEC_BIT, 64));
+	ASSERT_FALSE(vma_flags_test(&vma.flags, VMA_EXEC_BIT, 64));
+	ASSERT_FALSE(vma_desc_test_flags(&desc, VMA_EXEC_BIT, 64));
+	/* Reset. */
+	vma_flags_set(&flags, VMA_EXEC_BIT, 64);
+	vma_set_flags(&vma, VMA_EXEC_BIT, 64);
+	vma_desc_set_flags(&desc, VMA_EXEC_BIT, 64);
+
+	/*
+	 * Clear the flags and assert clear worked, then reset flags back to
+	 * include specified flags.
+	 */
+#define do_test_and_reset(...)					\
+	vma_flags_clear(&flags, __VA_ARGS__);			\
+	vma_flags_clear(&vma.flags, __VA_ARGS__);		\
+	vma_desc_clear_flags(&desc, __VA_ARGS__);		\
+	ASSERT_FALSE(vma_flags_test(&flags, __VA_ARGS__));	\
+	ASSERT_FALSE(vma_flags_test(&vma.flags, __VA_ARGS__));	\
+	ASSERT_FALSE(vma_desc_test_flags(&desc, __VA_ARGS__));	\
+	vma_flags_set(&flags, __VA_ARGS__);			\
+	vma_set_flags(&vma, __VA_ARGS__);			\
+	vma_desc_set_flags(&desc, __VA_ARGS__)
+
+	/* Single flags. */
+	do_test_and_reset(VMA_READ_BIT);
+	do_test_and_reset(VMA_WRITE_BIT);
+	do_test_and_reset(VMA_EXEC_BIT);
+	do_test_and_reset(64);
+	do_test_and_reset(65);
+
+	/* Two flags, in different orders. */
+	do_test_and_reset(VMA_READ_BIT, VMA_WRITE_BIT);
+	do_test_and_reset(VMA_READ_BIT, VMA_EXEC_BIT);
+	do_test_and_reset(VMA_READ_BIT, 64);
+	do_test_and_reset(VMA_READ_BIT, 65);
+	do_test_and_reset(VMA_WRITE_BIT, VMA_READ_BIT);
+	do_test_and_reset(VMA_WRITE_BIT, VMA_EXEC_BIT);
+	do_test_and_reset(VMA_WRITE_BIT, 64);
+	do_test_and_reset(VMA_WRITE_BIT, 65);
+	do_test_and_reset(VMA_EXEC_BIT, VMA_READ_BIT);
+	do_test_and_reset(VMA_EXEC_BIT, VMA_WRITE_BIT);
+	do_test_and_reset(VMA_EXEC_BIT, 64);
+	do_test_and_reset(VMA_EXEC_BIT, 65);
+	do_test_and_reset(64, VMA_READ_BIT);
+	do_test_and_reset(64, VMA_WRITE_BIT);
+	do_test_and_reset(64, VMA_EXEC_BIT);
+	do_test_and_reset(64, 65);
+	do_test_and_reset(65, VMA_READ_BIT);
+	do_test_and_reset(65, VMA_WRITE_BIT);
+	do_test_and_reset(65, VMA_EXEC_BIT);
+	do_test_and_reset(65, 64);
+
+	/* Three flags. */
+
+#undef do_test_some_missing
+#undef do_test_and_reset
+
+	return true;
+}
+
 static void run_vma_tests(int *num_tests, int *num_fail)
 {
 	TEST(copy_vma);
+	TEST(vma_flags_unchanged);
+	TEST(vma_flags_cleared);
+	TEST(vma_flags_word);
+	TEST(vma_flags_test);
+	TEST(vma_flags_clear);
 }
diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
index e3ed05b57819..0e1121e2ef23 100644
--- a/tools/testing/vma/vma_internal.h
+++ b/tools/testing/vma/vma_internal.h
@@ -36,11 +36,11 @@
  * ahead of all other headers.
  */
 #define __private
-#define NUM_MM_FLAG_BITS (64)
+/* NUM_MM_FLAG_BITS defined by test code. */
 typedef struct {
 	__private DECLARE_BITMAP(__mm_flags, NUM_MM_FLAG_BITS);
 } mm_flags_t;
-#define NUM_VMA_FLAG_BITS BITS_PER_LONG
+/* NUM_VMA_FLAG_BITS defined by test code. */
 typedef struct {
 	DECLARE_BITMAP(__vma_flags, NUM_VMA_FLAG_BITS);
 } __private vma_flags_t;
-- 
cgit v1.2.3


From ff4ef2fbd10192357da76fd80796b7262df21b78 Mon Sep 17 00:00:00 2001
From: Miaohe Lin <linmiaohe@huawei.com>
Date: Fri, 6 Feb 2026 11:16:37 +0800
Subject: selftests/mm: add memory failure anonymous page test

Patch series "selftests/mm: add memory failure selftests", v4.

Introduce selftests to validate the functionality of memory failure.
These tests help ensure that memory failure handling for anonymous pages,
pagecaches pages works correctly, including proper SIGBUS delivery to user
processes, page isolation, and recovery paths.

Currently madvise syscall is used to inject memory failures.  And only
anonymous pages and pagecaches are tested.  More test scenarios, e.g.
hugetlb, shmem, thp, will be added.  Also more memory failure injecting
methods will be supported, e.g.  APEI Error INJection, if required.


This patch (of 3):

This patch adds a new kselftest to validate memory failure handling for
anonymous pages. The test performs the following operations:
1. Allocates anonymous pages using mmap().
2. Injects memory failure via madvise syscall.
3. Verifies expected error handling behavior.
4. Unpoison memory.

This test helps ensure that memory failure handling for anonymous pages
works correctly, including proper SIGBUS delivery to user processes, page
isolation and recovery paths.

Link: https://lkml.kernel.org/r/20260206031639.2707102-1-linmiaohe@huawei.com
Link: https://lkml.kernel.org/r/20260206031639.2707102-2-linmiaohe@huawei.com
Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
Cc: David Hildenbrand <david@kernel.org>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Mark Brown <broonie@kernel.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: kernel test robot <lkp@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 MAINTAINERS                                       |   1 +
 tools/testing/selftests/mm/.gitignore             |   1 +
 tools/testing/selftests/mm/Makefile               |   2 +
 tools/testing/selftests/mm/config                 |   2 +
 tools/testing/selftests/mm/ksft_memory_failure.sh |   4 +
 tools/testing/selftests/mm/memory-failure.c       | 239 ++++++++++++++++++++++
 tools/testing/selftests/mm/run_vmtests.sh         |  21 ++
 tools/testing/selftests/mm/vm_util.c              |  41 ++++
 tools/testing/selftests/mm/vm_util.h              |   3 +
 9 files changed, 314 insertions(+)
 create mode 100755 tools/testing/selftests/mm/ksft_memory_failure.sh
 create mode 100644 tools/testing/selftests/mm/memory-failure.c

(limited to 'tools')

diff --git a/MAINTAINERS b/MAINTAINERS
index 64006f19954e..18d1ebf053db 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -11691,6 +11691,7 @@ F:	include/linux/memory-failure.h
 F:	include/trace/events/memory-failure.h
 F:	mm/hwpoison-inject.c
 F:	mm/memory-failure.c
+F:	tools/testing/selftests/mm/memory-failure.c
 
 HYCON HY46XX TOUCHSCREEN SUPPORT
 M:	Giulio Benetti <giulio.benetti@benettiengineering.com>
diff --git a/tools/testing/selftests/mm/.gitignore b/tools/testing/selftests/mm/.gitignore
index 702e5723c35d..bfd94a79e975 100644
--- a/tools/testing/selftests/mm/.gitignore
+++ b/tools/testing/selftests/mm/.gitignore
@@ -12,6 +12,7 @@ map_hugetlb
 map_populate
 thuge-gen
 compaction_test
+memory-failure
 migration
 mlock2-tests
 mrelease_test
diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile
index 905f1e034963..4847c6d6c1b0 100644
--- a/tools/testing/selftests/mm/Makefile
+++ b/tools/testing/selftests/mm/Makefile
@@ -75,6 +75,7 @@ TEST_GEN_FILES += map_populate
 ifneq (,$(filter $(ARCH),arm64 riscv riscv64 x86 x86_64))
 TEST_GEN_FILES += memfd_secret
 endif
+TEST_GEN_FILES += memory-failure
 TEST_GEN_FILES += migration
 TEST_GEN_FILES += mkdirty
 TEST_GEN_FILES += mlock-random-test
@@ -154,6 +155,7 @@ TEST_PROGS += ksft_ksm_numa.sh
 TEST_PROGS += ksft_madv_guard.sh
 TEST_PROGS += ksft_madv_populate.sh
 TEST_PROGS += ksft_memfd_secret.sh
+TEST_PROGS += ksft_memory_failure.sh
 TEST_PROGS += ksft_migration.sh
 TEST_PROGS += ksft_mkdirty.sh
 TEST_PROGS += ksft_mlock.sh
diff --git a/tools/testing/selftests/mm/config b/tools/testing/selftests/mm/config
index deba93379c80..1dbe2b4558ab 100644
--- a/tools/testing/selftests/mm/config
+++ b/tools/testing/selftests/mm/config
@@ -11,3 +11,5 @@ CONFIG_ANON_VMA_NAME=y
 CONFIG_FTRACE=y
 CONFIG_PROFILING=y
 CONFIG_UPROBES=y
+CONFIG_MEMORY_FAILURE=y
+CONFIG_HWPOISON_INJECT=m
diff --git a/tools/testing/selftests/mm/ksft_memory_failure.sh b/tools/testing/selftests/mm/ksft_memory_failure.sh
new file mode 100755
index 000000000000..ae1614d4d49b
--- /dev/null
+++ b/tools/testing/selftests/mm/ksft_memory_failure.sh
@@ -0,0 +1,4 @@
+#!/bin/sh -e
+# SPDX-License-Identifier: GPL-2.0
+
+./run_vmtests.sh -t memory-failure
diff --git a/tools/testing/selftests/mm/memory-failure.c b/tools/testing/selftests/mm/memory-failure.c
new file mode 100644
index 000000000000..37806a58f4b4
--- /dev/null
+++ b/tools/testing/selftests/mm/memory-failure.c
@@ -0,0 +1,239 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Memory-failure functional tests.
+ *
+ * Author(s): Miaohe Lin <linmiaohe@huawei.com>
+ */
+
+#include "../kselftest_harness.h"
+
+#include <sys/mman.h>
+#include <linux/mman.h>
+#include <linux/string.h>
+#include <signal.h>
+#include <setjmp.h>
+#include <unistd.h>
+#include <fcntl.h>
+
+#include "vm_util.h"
+
+enum inject_type {
+	MADV_HARD,
+	MADV_SOFT,
+};
+
+enum result_type {
+	MADV_HARD_ANON,
+	MADV_SOFT_ANON,
+};
+
+static jmp_buf signal_jmp_buf;
+static siginfo_t siginfo;
+const char *pagemap_proc = "/proc/self/pagemap";
+const char *kpageflags_proc = "/proc/kpageflags";
+
+FIXTURE(memory_failure)
+{
+	unsigned long page_size;
+	unsigned long corrupted_size;
+	unsigned long pfn;
+	int pagemap_fd;
+	int kpageflags_fd;
+	bool triggered;
+};
+
+FIXTURE_VARIANT(memory_failure)
+{
+	enum inject_type type;
+	int (*inject)(FIXTURE_DATA(memory_failure) * self, void *vaddr);
+};
+
+static int madv_hard_inject(FIXTURE_DATA(memory_failure) * self, void *vaddr)
+{
+	return madvise(vaddr, self->page_size, MADV_HWPOISON);
+}
+
+FIXTURE_VARIANT_ADD(memory_failure, madv_hard)
+{
+	.type = MADV_HARD,
+	.inject = madv_hard_inject,
+};
+
+static int madv_soft_inject(FIXTURE_DATA(memory_failure) * self, void *vaddr)
+{
+	return madvise(vaddr, self->page_size, MADV_SOFT_OFFLINE);
+}
+
+FIXTURE_VARIANT_ADD(memory_failure, madv_soft)
+{
+	.type = MADV_SOFT,
+	.inject = madv_soft_inject,
+};
+
+static void sigbus_action(int signo, siginfo_t *si, void *args)
+{
+	memcpy(&siginfo, si, sizeof(siginfo_t));
+	siglongjmp(signal_jmp_buf, 1);
+}
+
+static int setup_sighandler(void)
+{
+	struct sigaction sa = {
+		.sa_sigaction = sigbus_action,
+		.sa_flags = SA_SIGINFO,
+	};
+
+	return sigaction(SIGBUS, &sa, NULL);
+}
+
+FIXTURE_SETUP(memory_failure)
+{
+	memset(self, 0, sizeof(*self));
+
+	self->page_size = (unsigned long)sysconf(_SC_PAGESIZE);
+
+	memset(&siginfo, 0, sizeof(siginfo));
+	if (setup_sighandler())
+		SKIP(return, "setup sighandler failed.\n");
+
+	self->pagemap_fd = open(pagemap_proc, O_RDONLY);
+	if (self->pagemap_fd == -1)
+		SKIP(return, "open %s failed.\n", pagemap_proc);
+
+	self->kpageflags_fd = open(kpageflags_proc, O_RDONLY);
+	if (self->kpageflags_fd == -1)
+		SKIP(return, "open %s failed.\n", kpageflags_proc);
+}
+
+static void teardown_sighandler(void)
+{
+	struct sigaction sa = {
+		.sa_handler = SIG_DFL,
+		.sa_flags = SA_SIGINFO,
+	};
+
+	sigaction(SIGBUS, &sa, NULL);
+}
+
+FIXTURE_TEARDOWN(memory_failure)
+{
+	close(self->kpageflags_fd);
+	close(self->pagemap_fd);
+	teardown_sighandler();
+}
+
+static void prepare(struct __test_metadata *_metadata, FIXTURE_DATA(memory_failure) * self,
+		    void *vaddr)
+{
+	self->pfn = pagemap_get_pfn(self->pagemap_fd, vaddr);
+	ASSERT_NE(self->pfn, -1UL);
+
+	ASSERT_EQ(get_hardware_corrupted_size(&self->corrupted_size), 0);
+}
+
+static bool check_memory(void *vaddr, unsigned long size)
+{
+	char buf[64];
+
+	memset(buf, 0xce, sizeof(buf));
+	while (size >= sizeof(buf)) {
+		if (memcmp(vaddr, buf, sizeof(buf)))
+			return false;
+		size -= sizeof(buf);
+		vaddr += sizeof(buf);
+	}
+
+	return true;
+}
+
+static void check(struct __test_metadata *_metadata, FIXTURE_DATA(memory_failure) * self,
+		  void *vaddr, enum result_type type, int setjmp)
+{
+	unsigned long size;
+	uint64_t pfn_flags;
+
+	switch (type) {
+	case MADV_SOFT_ANON:
+		/* It is not expected to receive a SIGBUS signal. */
+		ASSERT_EQ(setjmp, 0);
+
+		/* The page content should remain unchanged. */
+		ASSERT_TRUE(check_memory(vaddr, self->page_size));
+
+		/* The backing pfn of addr should have changed. */
+		ASSERT_NE(pagemap_get_pfn(self->pagemap_fd, vaddr), self->pfn);
+		break;
+	case MADV_HARD_ANON:
+		/* The SIGBUS signal should have been received. */
+		ASSERT_EQ(setjmp, 1);
+
+		/* Check if siginfo contains correct SIGBUS context. */
+		ASSERT_EQ(siginfo.si_signo, SIGBUS);
+		ASSERT_EQ(siginfo.si_code, BUS_MCEERR_AR);
+		ASSERT_EQ(1UL << siginfo.si_addr_lsb, self->page_size);
+		ASSERT_EQ(siginfo.si_addr, vaddr);
+
+		/* XXX Check backing pte is hwpoison entry when supported. */
+		ASSERT_TRUE(pagemap_is_swapped(self->pagemap_fd, vaddr));
+		break;
+	default:
+		SKIP(return, "unexpected inject type %d.\n", type);
+	}
+
+	/* Check if the value of HardwareCorrupted has increased. */
+	ASSERT_EQ(get_hardware_corrupted_size(&size), 0);
+	ASSERT_EQ(size, self->corrupted_size + self->page_size / 1024);
+
+	/* Check if HWPoison flag is set. */
+	ASSERT_EQ(pageflags_get(self->pfn, self->kpageflags_fd, &pfn_flags), 0);
+	ASSERT_EQ(pfn_flags & KPF_HWPOISON, KPF_HWPOISON);
+}
+
+static void cleanup(struct __test_metadata *_metadata, FIXTURE_DATA(memory_failure) * self,
+		    void *vaddr)
+{
+	unsigned long size;
+	uint64_t pfn_flags;
+
+	ASSERT_EQ(unpoison_memory(self->pfn), 0);
+
+	/* Check if HWPoison flag is cleared. */
+	ASSERT_EQ(pageflags_get(self->pfn, self->kpageflags_fd, &pfn_flags), 0);
+	ASSERT_NE(pfn_flags & KPF_HWPOISON, KPF_HWPOISON);
+
+	/* Check if the value of HardwareCorrupted has decreased. */
+	ASSERT_EQ(get_hardware_corrupted_size(&size), 0);
+	ASSERT_EQ(size, self->corrupted_size);
+}
+
+TEST_F(memory_failure, anon)
+{
+	char *addr;
+	int ret;
+
+	addr = mmap(0, self->page_size, PROT_READ | PROT_WRITE,
+		    MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+	if (addr == MAP_FAILED)
+		SKIP(return, "mmap failed, not enough memory.\n");
+	memset(addr, 0xce, self->page_size);
+
+	prepare(_metadata, self, addr);
+
+	ret = sigsetjmp(signal_jmp_buf, 1);
+	if (!self->triggered) {
+		self->triggered = true;
+		ASSERT_EQ(variant->inject(self, addr), 0);
+		FORCE_READ(*addr);
+	}
+
+	if (variant->type == MADV_HARD)
+		check(_metadata, self, addr, MADV_HARD_ANON, ret);
+	else
+		check(_metadata, self, addr, MADV_SOFT_ANON, ret);
+
+	cleanup(_metadata, self, addr);
+
+	ASSERT_EQ(munmap(addr, self->page_size), 0);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh
index 29be9038bfb0..afdcfd0d7cef 100755
--- a/tools/testing/selftests/mm/run_vmtests.sh
+++ b/tools/testing/selftests/mm/run_vmtests.sh
@@ -91,6 +91,8 @@ separated by spaces:
 	test VMA merge cases behave as expected
 - rmap
 	test rmap behaves as expected
+- memory-failure
+	test memory-failure behaves as expected
 
 example: ./run_vmtests.sh -t "hmm mmap ksm"
 EOF
@@ -527,6 +529,25 @@ CATEGORY="page_frag" run_test ./test_page_frag.sh nonaligned
 
 CATEGORY="rmap" run_test ./rmap
 
+# Try to load hwpoison_inject if not present.
+HWPOISON_DIR=/sys/kernel/debug/hwpoison/
+if [ ! -d "$HWPOISON_DIR" ]; then
+	if ! modprobe -q -R hwpoison_inject; then
+		echo "Module hwpoison_inject not found, skipping..."
+	else
+		modprobe hwpoison_inject > /dev/null 2>&1
+		LOADED_MOD=1
+	fi
+fi
+
+if [ -d "$HWPOISON_DIR" ]; then
+	CATEGORY="memory-failure" run_test ./memory-failure
+fi
+
+if [ -n "${LOADED_MOD}" ]; then
+	modprobe -r hwpoison_inject > /dev/null 2>&1
+fi
+
 if [ "${HAVE_HUGEPAGES}" = 1 ]; then
 	echo "$orig_nr_hugepgs" > /proc/sys/vm/nr_hugepages
 fi
diff --git a/tools/testing/selftests/mm/vm_util.c b/tools/testing/selftests/mm/vm_util.c
index d954bf91afd5..a6d4ff7dfdc0 100644
--- a/tools/testing/selftests/mm/vm_util.c
+++ b/tools/testing/selftests/mm/vm_util.c
@@ -723,3 +723,44 @@ int ksm_stop(void)
 	close(ksm_fd);
 	return ret == 1 ? 0 : -errno;
 }
+
+int get_hardware_corrupted_size(unsigned long *val)
+{
+	unsigned long size;
+	char *line = NULL;
+	size_t linelen = 0;
+	FILE *f = fopen("/proc/meminfo", "r");
+	int ret = -1;
+
+	if (!f)
+		return ret;
+
+	while (getline(&line, &linelen, f) > 0) {
+		if (sscanf(line, "HardwareCorrupted: %12lu kB", &size) == 1) {
+			*val = size;
+			ret = 0;
+			break;
+		}
+	}
+
+	free(line);
+	fclose(f);
+	return ret;
+}
+
+int unpoison_memory(unsigned long pfn)
+{
+	int unpoison_fd, len;
+	char buf[32];
+	ssize_t ret;
+
+	unpoison_fd = open("/sys/kernel/debug/hwpoison/unpoison-pfn", O_WRONLY);
+	if (unpoison_fd < 0)
+		return -errno;
+
+	len = sprintf(buf, "0x%lx\n", pfn);
+	ret = write(unpoison_fd, buf, len);
+	close(unpoison_fd);
+
+	return ret > 0 ? 0 : -errno;
+}
diff --git a/tools/testing/selftests/mm/vm_util.h b/tools/testing/selftests/mm/vm_util.h
index 522f7f9050f5..e9c4e24769c1 100644
--- a/tools/testing/selftests/mm/vm_util.h
+++ b/tools/testing/selftests/mm/vm_util.h
@@ -20,6 +20,7 @@
 
 #define KPF_COMPOUND_HEAD             BIT_ULL(15)
 #define KPF_COMPOUND_TAIL             BIT_ULL(16)
+#define KPF_HWPOISON                  BIT_ULL(19)
 #define KPF_THP                       BIT_ULL(22)
 /*
  * Ignore the checkpatch warning, we must read from x but don't want to do
@@ -154,6 +155,8 @@ long ksm_get_full_scans(void);
 int ksm_use_zero_pages(void);
 int ksm_start(void);
 int ksm_stop(void);
+int get_hardware_corrupted_size(unsigned long *val);
+int unpoison_memory(unsigned long pfn);
 
 /*
  * On ppc64 this will only work with radix 2M hugepage size
-- 
cgit v1.2.3


From 12e8a2fae372c55c17a410929cfa60f96b93d17a Mon Sep 17 00:00:00 2001
From: Miaohe Lin <linmiaohe@huawei.com>
Date: Fri, 6 Feb 2026 11:16:38 +0800
Subject: selftests/mm: add memory failure clean pagecache test

This patch adds a new testcase to validate memory failure handling for
clean pagecache.  This test performs similar operations as anonymous pages
except allocating memory using mmap() with a file fd.

This test helps ensure that memory failure handling for clean pagecache
works correctly, including unchanged page content, page isolation, and
recovery paths.

Link: https://lkml.kernel.org/r/20260206031639.2707102-3-linmiaohe@huawei.com
Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202601221142.mDWA1ucw-lkp@intel.com/
Cc: David Hildenbrand <david@kernel.org>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Mark Brown <broonie@kernel.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/testing/selftests/mm/memory-failure.c | 66 +++++++++++++++++++++++++++++
 1 file changed, 66 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/mm/memory-failure.c b/tools/testing/selftests/mm/memory-failure.c
index 37806a58f4b4..3aa624db9577 100644
--- a/tools/testing/selftests/mm/memory-failure.c
+++ b/tools/testing/selftests/mm/memory-failure.c
@@ -10,10 +10,14 @@
 #include <sys/mman.h>
 #include <linux/mman.h>
 #include <linux/string.h>
+#include <unistd.h>
 #include <signal.h>
 #include <setjmp.h>
 #include <unistd.h>
 #include <fcntl.h>
+#include <sys/vfs.h>
+#include <linux/magic.h>
+#include <errno.h>
 
 #include "vm_util.h"
 
@@ -24,7 +28,9 @@ enum inject_type {
 
 enum result_type {
 	MADV_HARD_ANON,
+	MADV_HARD_CLEAN_PAGECACHE,
 	MADV_SOFT_ANON,
+	MADV_SOFT_CLEAN_PAGECACHE,
 };
 
 static jmp_buf signal_jmp_buf;
@@ -154,6 +160,8 @@ static void check(struct __test_metadata *_metadata, FIXTURE_DATA(memory_failure
 
 	switch (type) {
 	case MADV_SOFT_ANON:
+	case MADV_HARD_CLEAN_PAGECACHE:
+	case MADV_SOFT_CLEAN_PAGECACHE:
 		/* It is not expected to receive a SIGBUS signal. */
 		ASSERT_EQ(setjmp, 0);
 
@@ -236,4 +244,62 @@ TEST_F(memory_failure, anon)
 	ASSERT_EQ(munmap(addr, self->page_size), 0);
 }
 
+/* Borrowed from mm/gup_longterm.c. */
+static int get_fs_type(int fd)
+{
+	struct statfs fs;
+	int ret;
+
+	do {
+		ret = fstatfs(fd, &fs);
+	} while (ret && errno == EINTR);
+
+	return ret ? 0 : (int)fs.f_type;
+}
+
+TEST_F(memory_failure, clean_pagecache)
+{
+	const char *fname = "./clean-page-cache-test-file";
+	int fd;
+	char *addr;
+	int ret;
+	int fs_type;
+
+	fd = open(fname, O_RDWR | O_CREAT, 0664);
+	if (fd < 0)
+		SKIP(return, "failed to open test file.\n");
+	unlink(fname);
+	ftruncate(fd, self->page_size);
+	fs_type = get_fs_type(fd);
+	if (!fs_type || fs_type == TMPFS_MAGIC)
+		SKIP(return, "unsupported filesystem :%x\n", fs_type);
+
+	addr = mmap(0, self->page_size, PROT_READ | PROT_WRITE,
+		    MAP_SHARED, fd, 0);
+	if (addr == MAP_FAILED)
+		SKIP(return, "mmap failed, not enough memory.\n");
+	memset(addr, 0xce, self->page_size);
+	fsync(fd);
+
+	prepare(_metadata, self, addr);
+
+	ret = sigsetjmp(signal_jmp_buf, 1);
+	if (!self->triggered) {
+		self->triggered = true;
+		ASSERT_EQ(variant->inject(self, addr), 0);
+		FORCE_READ(*addr);
+	}
+
+	if (variant->type == MADV_HARD)
+		check(_metadata, self, addr, MADV_HARD_CLEAN_PAGECACHE, ret);
+	else
+		check(_metadata, self, addr, MADV_SOFT_CLEAN_PAGECACHE, ret);
+
+	cleanup(_metadata, self, addr);
+
+	ASSERT_EQ(munmap(addr, self->page_size), 0);
+
+	ASSERT_EQ(close(fd), 0);
+}
+
 TEST_HARNESS_MAIN
-- 
cgit v1.2.3


From d51b5076c7468fad568645caf38a6979458a5de1 Mon Sep 17 00:00:00 2001
From: Miaohe Lin <linmiaohe@huawei.com>
Date: Fri, 6 Feb 2026 11:16:39 +0800
Subject: selftests/mm: add memory failure dirty pagecache test

This patch adds a new testcase to validate memory failure handling for
dirty pagecache.  This performs similar operations as clean pagecaches
except fsync() is not used to keep pages dirty.

This test helps ensure that memory failure handling for dirty pagecache
works correctly, including proper SIGBUS delivery, page isolation, and
recovery paths.

Link: https://lkml.kernel.org/r/20260206031639.2707102-4-linmiaohe@huawei.com
Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
Cc: David Hildenbrand <david@kernel.org>
Cc: kernel test robot <lkp@intel.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Mark Brown <broonie@kernel.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/testing/selftests/mm/memory-failure.c | 62 +++++++++++++++++++++++++++--
 1 file changed, 58 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/mm/memory-failure.c b/tools/testing/selftests/mm/memory-failure.c
index 3aa624db9577..3d9e0b9ffb41 100644
--- a/tools/testing/selftests/mm/memory-failure.c
+++ b/tools/testing/selftests/mm/memory-failure.c
@@ -29,8 +29,10 @@ enum inject_type {
 enum result_type {
 	MADV_HARD_ANON,
 	MADV_HARD_CLEAN_PAGECACHE,
+	MADV_HARD_DIRTY_PAGECACHE,
 	MADV_SOFT_ANON,
 	MADV_SOFT_CLEAN_PAGECACHE,
+	MADV_SOFT_DIRTY_PAGECACHE,
 };
 
 static jmp_buf signal_jmp_buf;
@@ -162,6 +164,7 @@ static void check(struct __test_metadata *_metadata, FIXTURE_DATA(memory_failure
 	case MADV_SOFT_ANON:
 	case MADV_HARD_CLEAN_PAGECACHE:
 	case MADV_SOFT_CLEAN_PAGECACHE:
+	case MADV_SOFT_DIRTY_PAGECACHE:
 		/* It is not expected to receive a SIGBUS signal. */
 		ASSERT_EQ(setjmp, 0);
 
@@ -172,6 +175,7 @@ static void check(struct __test_metadata *_metadata, FIXTURE_DATA(memory_failure
 		ASSERT_NE(pagemap_get_pfn(self->pagemap_fd, vaddr), self->pfn);
 		break;
 	case MADV_HARD_ANON:
+	case MADV_HARD_DIRTY_PAGECACHE:
 		/* The SIGBUS signal should have been received. */
 		ASSERT_EQ(setjmp, 1);
 
@@ -244,6 +248,18 @@ TEST_F(memory_failure, anon)
 	ASSERT_EQ(munmap(addr, self->page_size), 0);
 }
 
+static int prepare_file(const char *fname, unsigned long size)
+{
+	int fd;
+
+	fd = open(fname, O_RDWR | O_CREAT, 0664);
+	if (fd >= 0) {
+		unlink(fname);
+		ftruncate(fd, size);
+	}
+	return fd;
+}
+
 /* Borrowed from mm/gup_longterm.c. */
 static int get_fs_type(int fd)
 {
@@ -259,17 +275,14 @@ static int get_fs_type(int fd)
 
 TEST_F(memory_failure, clean_pagecache)
 {
-	const char *fname = "./clean-page-cache-test-file";
 	int fd;
 	char *addr;
 	int ret;
 	int fs_type;
 
-	fd = open(fname, O_RDWR | O_CREAT, 0664);
+	fd = prepare_file("./clean-page-cache-test-file", self->page_size);
 	if (fd < 0)
 		SKIP(return, "failed to open test file.\n");
-	unlink(fname);
-	ftruncate(fd, self->page_size);
 	fs_type = get_fs_type(fd);
 	if (!fs_type || fs_type == TMPFS_MAGIC)
 		SKIP(return, "unsupported filesystem :%x\n", fs_type);
@@ -302,4 +315,45 @@ TEST_F(memory_failure, clean_pagecache)
 	ASSERT_EQ(close(fd), 0);
 }
 
+TEST_F(memory_failure, dirty_pagecache)
+{
+	int fd;
+	char *addr;
+	int ret;
+	int fs_type;
+
+	fd = prepare_file("./dirty-page-cache-test-file", self->page_size);
+	if (fd < 0)
+		SKIP(return, "failed to open test file.\n");
+	fs_type = get_fs_type(fd);
+	if (!fs_type || fs_type == TMPFS_MAGIC)
+		SKIP(return, "unsupported filesystem :%x\n", fs_type);
+
+	addr = mmap(0, self->page_size, PROT_READ | PROT_WRITE,
+		    MAP_SHARED, fd, 0);
+	if (addr == MAP_FAILED)
+		SKIP(return, "mmap failed, not enough memory.\n");
+	memset(addr, 0xce, self->page_size);
+
+	prepare(_metadata, self, addr);
+
+	ret = sigsetjmp(signal_jmp_buf, 1);
+	if (!self->triggered) {
+		self->triggered = true;
+		ASSERT_EQ(variant->inject(self, addr), 0);
+		FORCE_READ(*addr);
+	}
+
+	if (variant->type == MADV_HARD)
+		check(_metadata, self, addr, MADV_HARD_DIRTY_PAGECACHE, ret);
+	else
+		check(_metadata, self, addr, MADV_SOFT_DIRTY_PAGECACHE, ret);
+
+	cleanup(_metadata, self, addr);
+
+	ASSERT_EQ(munmap(addr, self->page_size), 0);
+
+	ASSERT_EQ(close(fd), 0);
+}
+
 TEST_HARNESS_MAIN
-- 
cgit v1.2.3


From 9333980c230fc29afb41dc52b58a0dc9ea201f2a Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 10 Feb 2026 11:34:22 +0100
Subject: delayacct: fix build regression on accounting tool

The accounting tool was modified for the original ABI using a custom
'timespec64' type in linux/taskstats.h, which I changed to use the regular
__kernel_timespec type, causing a build failure:

        getdelays.c:202:45: warning: 'struct timespec64' declared inside parameter list will not be visible outside of this definition or declaration

     202 | static const char *format_timespec64(struct timespec64 *ts)
         |                                             ^~~~~~~~~~

Change the tool to match the updated header.

Link: https://lkml.kernel.org/r/20260210103427.2984963-1-arnd@kernel.org
Fixes: 503efe850c74 ("delayacct: add timestamp of delay max")
Fixes: f06e31eef4c1 ("delayacct: fix uapi timespec64 definition")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/r/202602091611.lxgINqXp-lkp@intel.com/
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/accounting/getdelays.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'tools')

diff --git a/tools/accounting/getdelays.c b/tools/accounting/getdelays.c
index 64796c0223be..50792df27707 100644
--- a/tools/accounting/getdelays.c
+++ b/tools/accounting/getdelays.c
@@ -196,20 +196,20 @@ static int get_family_id(int sd)
 #define delay_ms(t) (t / 1000000ULL)
 
 /*
- * Format timespec64 to human readable string (YYYY-MM-DD HH:MM:SS)
+ * Format __kernel_timespec to human readable string (YYYY-MM-DD HH:MM:SS)
  * Returns formatted string or "N/A" if timestamp is zero
  */
-static const char *format_timespec64(struct timespec64 *ts)
+static const char *format_timespec(struct __kernel_timespec *ts)
 {
 	static char buffer[32];
 	struct tm tm_info;
-	time_t time_sec;
+	__kernel_time_t time_sec;
 
 	/* Check if timestamp is zero (not set) */
 	if (ts->tv_sec == 0 && ts->tv_nsec == 0)
 		return "N/A";
 
-	time_sec = (time_t)ts->tv_sec;
+	time_sec = ts->tv_sec;
 
 	/* Use thread-safe localtime_r */
 	if (localtime_r(&time_sec, &tm_info) == NULL)
@@ -257,7 +257,7 @@ static const char *format_timespec64(struct timespec64 *ts)
 				average_ms((double)(t)->cpu_delay_total, (t)->cpu_count), \
 				delay_ms((double)(t)->cpu_delay_max), \
 				delay_ms((double)(t)->cpu_delay_min), \
-				format_timespec64(&(t)->cpu_delay_max_ts)); \
+				format_timespec(&(t)->cpu_delay_max_ts)); \
 		} else if (version >= 16) { \
 			printf("%-10s%15s%15s%15s%15s%15s%15s%15s\n", \
 				"CPU", "count", "real total", "virtual total", \
@@ -316,7 +316,7 @@ static const char *format_timespec64(struct timespec64 *ts)
 				average_ms((double)(t)->total, (t)->count), \
 				delay_ms((double)(t)->max), \
 				delay_ms((double)(t)->min), \
-				format_timespec64(&(t)->max_ts)); \
+				format_timespec(&(t)->max_ts)); \
 		} else if (version >= 16) { \
 			printf("%-10s%15s%15s%15s%15s%15s\n", \
 				name, "count", "delay total", "delay average", \
-- 
cgit v1.2.3


From b24335521de92fd2ee22460072b75367ca8860b0 Mon Sep 17 00:00:00 2001
From: Aristeu Rozanski <aris@redhat.com>
Date: Mon, 2 Feb 2026 09:38:05 -0500
Subject: selftests/memfd: use IPC semaphore instead of SIGSTOP/SIGCONT

selftests/memfd: use IPC semaphore instead of SIGSTOP/SIGCONT

In order to synchronize new processes to test inheritance of memfd_noexec
sysctl, memfd_test sets up the sysctl with a value before creating the new
process.  The new process then sends itself a SIGSTOP in order to wait for
the parent to flip the sysctl value and send a SIGCONT signal.

This would work as intended if it wasn't the fact that the new process is
being created with CLONE_NEWPID, which creates a new PID namespace and the
new process has PID 1 in this namespace.  There're restrictions on sending
signals to PID 1 and, although it's relaxed for other than root PID
namespace, it's biting us here.  In this specific case the SIGSTOP sent by
the new process is ignored (no error to kill() is returned) and it never
stops its execution.  This is usually not noticiable as the parent usually
manages to set the new sysctl value before the child has a chance to run
and the test succeeds.  But if you run the test in a loop, it eventually
reproduces:

	while [ 1 ]; do ./memfd_test >log 2>&1 || break; done; cat log

So this patch replaces the SIGSTOP/SIGCONT synchronization with IPC
semaphore.

Link: https://lkml.kernel.org/r/a7776389-b3d6-4b18-b438-0b0e3ed1fd3b@work
Fixes: 6469b66e3f5a ("selftests: improve vm.memfd_noexec sysctl tests")
Signed-off-by: Aristeu Rozanski <aris@redhat.com>
Cc: Aleksa Sarai <cyphar@cyphar.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: liuye <liuye@kylinos.cn>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/testing/selftests/memfd/memfd_test.c | 113 +++++++++++++++++++++++++++--
 1 file changed, 105 insertions(+), 8 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/memfd/memfd_test.c b/tools/testing/selftests/memfd/memfd_test.c
index 5b993924cc3f..2ca07ea7202a 100644
--- a/tools/testing/selftests/memfd/memfd_test.c
+++ b/tools/testing/selftests/memfd/memfd_test.c
@@ -18,6 +18,9 @@
 #include <sys/stat.h>
 #include <sys/syscall.h>
 #include <sys/wait.h>
+#include <sys/types.h>
+#include <sys/ipc.h>
+#include <sys/sem.h>
 #include <unistd.h>
 #include <ctype.h>
 
@@ -39,6 +42,20 @@
 		    F_SEAL_EXEC)
 
 #define MFD_NOEXEC_SEAL	0x0008U
+union semun {
+	int val;
+	struct semid_ds *buf;
+	unsigned short int *array;
+	struct seminfo *__buf;
+};
+
+/*
+ * we use semaphores on nested wait tasks due the use of CLONE_NEWPID: the
+ * child will be PID 1 and can't send SIGSTOP to themselves due special
+ * treatment of the init task, so the SIGSTOP/SIGCONT synchronization
+ * approach can't be used here.
+ */
+#define SEM_KEY 0xdeadbeef
 
 /*
  * Default is not to test hugetlbfs
@@ -1333,8 +1350,22 @@ static int sysctl_nested(void *arg)
 
 static int sysctl_nested_wait(void *arg)
 {
-	/* Wait for a SIGCONT. */
-	kill(getpid(), SIGSTOP);
+	int sem = semget(SEM_KEY, 1, 0600);
+	struct sembuf sembuf;
+
+	if (sem < 0) {
+		perror("semget:");
+		abort();
+	}
+	sembuf.sem_num = 0;
+	sembuf.sem_flg = 0;
+	sembuf.sem_op = 0;
+
+	if (semop(sem, &sembuf, 1) < 0) {
+		perror("semop:");
+		abort();
+	}
+
 	return sysctl_nested(arg);
 }
 
@@ -1355,7 +1386,9 @@ static void test_sysctl_sysctl2_failset(void)
 
 static int sysctl_nested_child(void *arg)
 {
-	int pid;
+	int pid, sem;
+	union semun semun;
+	struct sembuf sembuf;
 
 	printf("%s nested sysctl 0\n", memfd_str);
 	sysctl_assert_write("0");
@@ -1389,23 +1422,53 @@ static int sysctl_nested_child(void *arg)
 			   test_sysctl_sysctl2_failset);
 	join_thread(pid);
 
+	sem = semget(SEM_KEY, 1, IPC_CREAT | 0600);
+	if (sem < 0) {
+		perror("semget:");
+		return 1;
+	}
+	semun.val = 1;
+	sembuf.sem_op = -1;
+	sembuf.sem_flg = 0;
+	sembuf.sem_num = 0;
+
 	/* Verify that the rules are actually inherited after fork. */
 	printf("%s nested sysctl 0 -> 1 after fork\n", memfd_str);
 	sysctl_assert_write("0");
 
+	if (semctl(sem, 0, SETVAL, semun) < 0) {
+		perror("semctl:");
+		return 1;
+	}
+
 	pid = spawn_thread(CLONE_NEWPID, sysctl_nested_wait,
 			   test_sysctl_sysctl1_failset);
 	sysctl_assert_write("1");
-	kill(pid, SIGCONT);
+
+	/* Allow child to continue */
+	if (semop(sem, &sembuf, 1) < 0) {
+		perror("semop:");
+		return 1;
+	}
 	join_thread(pid);
 
 	printf("%s nested sysctl 0 -> 2 after fork\n", memfd_str);
 	sysctl_assert_write("0");
 
+	if (semctl(sem, 0, SETVAL, semun) < 0) {
+		perror("semctl:");
+		return 1;
+	}
+
 	pid = spawn_thread(CLONE_NEWPID, sysctl_nested_wait,
 			   test_sysctl_sysctl2_failset);
 	sysctl_assert_write("2");
-	kill(pid, SIGCONT);
+
+	/* Allow child to continue */
+	if (semop(sem, &sembuf, 1) < 0) {
+		perror("semop:");
+		return 1;
+	}
 	join_thread(pid);
 
 	/*
@@ -1415,28 +1478,62 @@ static int sysctl_nested_child(void *arg)
 	 */
 	printf("%s nested sysctl 2 -> 1 after fork\n", memfd_str);
 	sysctl_assert_write("2");
+
+	if (semctl(sem, 0, SETVAL, semun) < 0) {
+		perror("semctl:");
+		return 1;
+	}
+
 	pid = spawn_thread(CLONE_NEWPID, sysctl_nested_wait,
 			   test_sysctl_sysctl2);
 	sysctl_assert_write("1");
-	kill(pid, SIGCONT);
+
+	/* Allow child to continue */
+	if (semop(sem, &sembuf, 1) < 0) {
+		perror("semop:");
+		return 1;
+	}
 	join_thread(pid);
 
 	printf("%s nested sysctl 2 -> 0 after fork\n", memfd_str);
 	sysctl_assert_write("2");
+
+	if (semctl(sem, 0, SETVAL, semun) < 0) {
+		perror("semctl:");
+		return 1;
+	}
+
 	pid = spawn_thread(CLONE_NEWPID, sysctl_nested_wait,
 			   test_sysctl_sysctl2);
 	sysctl_assert_write("0");
-	kill(pid, SIGCONT);
+
+	/* Allow child to continue */
+	if (semop(sem, &sembuf, 1) < 0) {
+		perror("semop:");
+		return 1;
+	}
 	join_thread(pid);
 
 	printf("%s nested sysctl 1 -> 0 after fork\n", memfd_str);
 	sysctl_assert_write("1");
+
+	if (semctl(sem, 0, SETVAL, semun) < 0) {
+		perror("semctl:");
+		return 1;
+	}
+
 	pid = spawn_thread(CLONE_NEWPID, sysctl_nested_wait,
 			   test_sysctl_sysctl1);
 	sysctl_assert_write("0");
-	kill(pid, SIGCONT);
+	/* Allow child to continue */
+	if (semop(sem, &sembuf, 1) < 0) {
+		perror("semop:");
+		return 1;
+	}
 	join_thread(pid);
 
+	semctl(sem, 0, IPC_RMID);
+
 	return 0;
 }
 
-- 
cgit v1.2.3


From 8429538c2c0420c65fbb4872966622b96ec36cea Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Thu, 12 Feb 2026 17:19:36 -0800
Subject: tools/testing: keep legacy generated files around in .gitignore file

People keep removing generated files from .gitignore files even when the
files stay around.  Please don't do that: just because the file is no
longer being generated doesn't make it magically go away, and doesn't
make it suddenly be something that should now not be ignored any more.

Fixes: dd2c6ec24fca ("selftests/mm: remove virtual_address_range test")
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: SeongJae Park <sj@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 tools/testing/selftests/mm/.gitignore | 1 +
 1 file changed, 1 insertion(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/mm/.gitignore b/tools/testing/selftests/mm/.gitignore
index 702e5723c35d..c2a8586e51a1 100644
--- a/tools/testing/selftests/mm/.gitignore
+++ b/tools/testing/selftests/mm/.gitignore
@@ -32,6 +32,7 @@ uffd-unit-tests
 uffd-wp-mremap
 mlock-intersect-test
 mlock-random-test
+virtual_address_range
 gup_test
 va_128TBswitch
 map_fixed_noreplace
-- 
cgit v1.2.3


From a2646773a005b59fd1dc7ff3ba15df84889ca5d2 Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@nvidia.com>
Date: Mon, 9 Feb 2026 14:53:53 +0100
Subject: selftests: mlxsw: tc_restrictions: Fix test failure with new iproute2

As explained in [1], iproute2 started rejecting tc-police burst sizes
that result in an overflow. This can happen when the burst size is high
enough and the rate is low enough.

A couple of test cases specify such configurations, resulting in
iproute2 errors and test failure.

Fix by reducing the burst size so that the test will pass with both new
and old iproute2 versions.

[1] https://lore.kernel.org/netdev/20250916215731.3431465-1-jay.vosburgh@canonical.com/

Fixes: cb12d1763267 ("selftests: mlxsw: tc_restrictions: Test tc-police restrictions")
Signed-off-by: Ido Schimmel <idosch@nvidia.com>
Signed-off-by: Petr Machata <petrm@nvidia.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://patch.msgid.link/88b00c6e85188aa6a065dc240206119b328c46e1.1770643998.git.petrm@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/drivers/net/mlxsw/tc_restrictions.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/drivers/net/mlxsw/tc_restrictions.sh b/tools/testing/selftests/drivers/net/mlxsw/tc_restrictions.sh
index 0441a18f098b..aac8ef490feb 100755
--- a/tools/testing/selftests/drivers/net/mlxsw/tc_restrictions.sh
+++ b/tools/testing/selftests/drivers/net/mlxsw/tc_restrictions.sh
@@ -317,7 +317,7 @@ police_limits_test()
 
 	tc filter add dev $swp1 ingress pref 1 proto ip handle 101 \
 		flower skip_sw \
-		action police rate 0.5kbit burst 1m conform-exceed drop/ok
+		action police rate 0.5kbit burst 2k conform-exceed drop/ok
 	check_fail $? "Incorrect success to add police action with too low rate"
 
 	tc filter add dev $swp1 ingress pref 1 proto ip handle 101 \
@@ -327,7 +327,7 @@ police_limits_test()
 
 	tc filter add dev $swp1 ingress pref 1 proto ip handle 101 \
 		flower skip_sw \
-		action police rate 1.5kbit burst 1m conform-exceed drop/ok
+		action police rate 1.5kbit burst 2k conform-exceed drop/ok
 	check_err $? "Failed to add police action with low rate"
 
 	tc filter del dev $swp1 ingress protocol ip pref 1 handle 101 flower
-- 
cgit v1.2.3


From 10ec0fc0ccc525abc807b0ca8ad5a26a0bd56361 Mon Sep 17 00:00:00 2001
From: Yue Haibing <yuehaibing@huawei.com>
Date: Wed, 11 Feb 2026 10:21:46 +0800
Subject: selftests: net: lib: Fix jq parsing error

The testcase failed as below:
$./vlan_bridge_binding.sh
...
+ adf_ip_link_set_up d1
+ local name=d1
+ shift
+ ip_link_is_up d1
+ ip_link_has_flag d1 UP
+ local name=d1
+ shift
+ local flag=UP
+ shift
++ ip -j link show d1
++ jq --arg flag UP 'any(.[].flags.[]; . == $flag)'
jq: error: syntax error, unexpected '[', expecting FORMAT or QQSTRING_START
 (Unix shell quoting issues?) at <top-level>, line 1:
any(.[].flags.[]; . == $flag)
jq: 1 compile error

Remove the extra dot (.) after flags array to fix this.

Fixes: 4baa1d3a5080 ("selftests: net: lib: Add ip_link_has_flag()")
Signed-off-by: Yue Haibing <yuehaibing@huawei.com>
Reviewed-by: Petr Machata <petrm@nvidia.com>
Link: https://patch.msgid.link/20260211022146.190948-1-yuehaibing@huawei.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/lib.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/lib.sh b/tools/testing/selftests/net/lib.sh
index 0ec131b339bc..b40694573f4c 100644
--- a/tools/testing/selftests/net/lib.sh
+++ b/tools/testing/selftests/net/lib.sh
@@ -577,7 +577,7 @@ ip_link_has_flag()
 	local flag=$1; shift
 
 	local state=$(ip -j link show "$name" |
-		      jq --arg flag "$flag" 'any(.[].flags.[]; . == $flag)')
+		      jq --arg flag "$flag" 'any(.[].flags[]; . == $flag)')
 	[[ $state == true ]]
 }
 
-- 
cgit v1.2.3


From ed6788c5a7614d00321bc4c88fdb9d83fcba0e02 Mon Sep 17 00:00:00 2001
From: Gal Pressman <gal@nvidia.com>
Date: Tue, 10 Feb 2026 11:31:10 +0200
Subject: selftests: drv-net: limit RPS test CPUs to supported range

The _get_unused_cpus() function can return CPU numbers >= 16, which
exceeds RPS_MAX_CPUS in toeplitz.c. When this happens, the test fails
with a cryptic message:

  # Exception| Traceback (most recent call last):
  # Exception|   File "/tmp/cur/linux/tools/testing/selftests/net/lib/py/ksft.py", line 319, in ksft_run
  # Exception|     func(*args)
  # Exception|   File "/tmp/cur/linux/tools/testing/selftests/drivers/net/hw/toeplitz.py", line 189, in test
  # Exception|     with bkg(" ".join(rx_cmd), ksft_ready=True, exit_wait=True) as rx_proc:
  # Exception|          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  # Exception|   File "/tmp/cur/linux/tools/testing/selftests/net/lib/py/utils.py", line 124, in __init__
  # Exception|     super().__init__(comm, background=True,
  # Exception|   File "/tmp/cur/linux/tools/testing/selftests/net/lib/py/utils.py", line 77, in __init__
  # Exception|     raise Exception("Did not receive ready message")
  # Exception| Exception: Did not receive ready message

Rename _get_unused_cpus() to _get_unused_rps_cpus() and cap the CPU
search range to RPS_MAX_CPUS.

Reviewed-by: Nimrod Oren <noren@nvidia.com>
Signed-off-by: Gal Pressman <gal@nvidia.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://patch.msgid.link/20260210093110.1935149-1-gal@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/drivers/net/hw/toeplitz.py | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/drivers/net/hw/toeplitz.py b/tools/testing/selftests/drivers/net/hw/toeplitz.py
index d288c57894f6..cd7e080e6f84 100755
--- a/tools/testing/selftests/drivers/net/hw/toeplitz.py
+++ b/tools/testing/selftests/drivers/net/hw/toeplitz.py
@@ -19,6 +19,8 @@ from lib.py import ksft_variants, KsftNamedVariant, KsftSkipEx, KsftFailEx
 
 # "define" for the ID of the Toeplitz hash function
 ETH_RSS_HASH_TOP = 1
+# Must match RPS_MAX_CPUS in toeplitz.c
+RPS_MAX_CPUS = 16
 
 
 def _check_rps_and_rfs_not_configured(cfg):
@@ -67,23 +69,24 @@ def _get_irq_cpus(cfg):
     return cpus
 
 
-def _get_unused_cpus(cfg, count=2):
+def _get_unused_rps_cpus(cfg, count=2):
     """
-    Get CPUs that are not used by Rx queues.
-    Returns a list of at least 'count' CPU numbers.
+    Get CPUs that are not used by Rx queues for RPS.
+    Returns a list of at least 'count' CPU numbers within
+    the RPS_MAX_CPUS supported range.
     """
 
     # Get CPUs used by Rx queues
     rx_cpus = set(_get_irq_cpus(cfg))
 
-    # Get total number of CPUs
-    num_cpus = os.cpu_count()
+    # Get total number of CPUs, capped by RPS_MAX_CPUS
+    num_cpus = min(os.cpu_count(), RPS_MAX_CPUS)
 
     # Find unused CPUs
     unused_cpus = [cpu for cpu in range(num_cpus) if cpu not in rx_cpus]
 
     if len(unused_cpus) < count:
-        raise KsftSkipEx(f"Need at {count} CPUs not used by Rx queues, found {len(unused_cpus)}")
+        raise KsftSkipEx(f"Need at least {count} CPUs in range 0..{num_cpus - 1} not used by Rx queues, found {len(unused_cpus)}")
 
     return unused_cpus[:count]
 
@@ -181,7 +184,7 @@ def test(cfg, proto_flag, ipver, grp):
         ksft_pr(f"RSS using CPUs: {irq_cpus}")
     elif grp == "rps":
         # Get CPUs not used by Rx queues and configure them for RPS
-        rps_cpus = _get_unused_cpus(cfg, count=2)
+        rps_cpus = _get_unused_rps_cpus(cfg, count=2)
         rps_mask = _configure_rps(cfg, rps_cpus)
         defer(_configure_rps, cfg, [])
         rx_cmd += ["-r", rps_mask]
-- 
cgit v1.2.3


From dd23bfe4c317a9b5cbb4edfd36e5b8df12e84b8d Mon Sep 17 00:00:00 2001
From: Len Brown <len.brown@intel.com>
Date: Wed, 21 Jan 2026 21:55:39 -0600
Subject: tools/power turbostat: Add L2 cache statistics

version 2026.02.04

Add support for L2 cache statistics: L2MRPS and L2%hit
L2 statistics join the LLC in the "cache" counter group.

While the underlying LLC perf kernel support was architectural,
L2 perf counters are model-specific:

Support Intel Xeon -- Sapphire Rapids and newer.
Support Intel Atom -- Gracemont and newer.
Support Intel Hybrid -- Alder Lake and newer.

Example:

alder-lake-n$ sudo turbostat --quiet --show CPU,Busy%,cache my_workload
CPU	Busy%	LLCMRPS	LLC%hit	L2MRPS	L2%hit
-	49.82	1210	85.02	2909	31.63
0	99.14	322	88.89	767	32.38
1	0.91	1	32.47	1	18.86
2	0.20	0	40.78	0	23.34
3	99.17	295	81.79	706	31.89
4	0.68	1	58.71	1	15.61
5	99.16	299	85.65	726	31.32
6	0.08	0	45.35	0	31.71
7	99.21	293	83.63	707	30.92

where "my_workload" is a wrapper for a yogini workload
that has 4 fully-busy threads with 2MB working set each.

Note that analogous to the system summary for multiple LLC systems,
the system summary row for the L2 is the aggregate of all CPUS in the
system -- there is no per-cache roll-up.

Signed-off-by: Len Brown <len.brown@intel.com>
---
 tools/power/x86/turbostat/turbostat.8 |   6 +-
 tools/power/x86/turbostat/turbostat.c | 450 ++++++++++++++++++++++++++++++----
 2 files changed, 405 insertions(+), 51 deletions(-)

(limited to 'tools')

diff --git a/tools/power/x86/turbostat/turbostat.8 b/tools/power/x86/turbostat/turbostat.8
index 44a416a728b3..b4ef04200219 100644
--- a/tools/power/x86/turbostat/turbostat.8
+++ b/tools/power/x86/turbostat/turbostat.8
@@ -163,7 +163,11 @@ The system configuration dump (if --quiet is not used) is followed by statistics
 .PP
 \fBLLCMRPS\fP Last Level Cache Millions of References Per Second.  For CPUs with an L3 LLC, this is the number of references that CPU made to the L3 (and the number of misses that CPU made to it's L2).  For CPUs with an L2 LLC, this is the number of references to the L2 (and the number of misses to the CPU's L1).  The system summary row shows the sum for all CPUs.  In both cases, the value displayed is the actual value divided by 1,000,000.  If this value is large, then the LLC%hit column is significant.  If this value is small, then the LLC%hit column is not significant.
 .PP
-\fBLLC%hit\fP Last Level Cache Hit Rate %.  Hit Rate Percent = 100.0 * (References - Misses)/References.  The system summary row shows the weighted average for all CPUs (100.0 * (Sum_References - Sum_Misses)/Sum_References).
+\fBLLC%hit\fP Last Level Cache Hit Rate %.  Hit Rate Percent = 100.0 * Hits/References.  The system summary row shows the weighted average for all CPUs (100.0 * Sum_Hits/Sum_References).
+.PP
+\fBL2MRPS\fP Level-2 Cache Millions of References Per Second.  For CPUs with an L2 LLC, this is the same as LLC references.  The system summary row shows the sum for all CPUs.  In both cases, the value displayed is the actual value divided by 1,000,000.  If this value is large, then the L2%hit column is significant.  If this value is small, then the L2%hit column is not significant.
+.PP
+\fBL2%hit\fP Level-2 Cache Hit Rate %.  Hit Rate Percent = 100.0 * Hits/References.  The system summary row shows the weighted average for all CPUs (100.0 * (Sum_Hits)/Sum_References).
 .PP
 \fBC1, C2, C3...\fP The number times Linux requested the C1, C2, C3 idle state during the measurement interval.  The system summary line shows the sum for all CPUs.  These are C-state names as exported in /sys/devices/system/cpu/cpu*/cpuidle/state*/name.  While their names are generic, their attributes are processor specific. They the system description section of output shows what MWAIT sub-states they are mapped to on each system.  These counters are in the "cpuidle" group, which is disabled, by default.
 .PP
diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c
index c622b55c330c..0b52363d5e7c 100644
--- a/tools/power/x86/turbostat/turbostat.c
+++ b/tools/power/x86/turbostat/turbostat.c
@@ -212,6 +212,8 @@ struct msr_counter bic[] = {
 	{ 0x0, "pct_idle", NULL, 0, 0, 0, NULL, 0 },
 	{ 0x0, "LLCMRPS", NULL, 0, 0, 0, NULL, 0 },
 	{ 0x0, "LLC%hit", NULL, 0, 0, 0, NULL, 0 },
+	{ 0x0, "L2MRPS", NULL, 0, 0, 0, NULL, 0 },
+	{ 0x0, "L2%hit", NULL, 0, 0, 0, NULL, 0 },
 };
 
 /* n.b. bic_names must match the order in bic[], above */
@@ -283,6 +285,8 @@ enum bic_names {
 	BIC_pct_idle,
 	BIC_LLC_MRPS,
 	BIC_LLC_HIT,
+	BIC_L2_MRPS,
+	BIC_L2_HIT,
 	MAX_BIC
 };
 
@@ -294,12 +298,10 @@ void print_bic_set(char *s, cpu_set_t *set)
 
 	printf("%s:", s);
 
-	for (i = 0; i <= MAX_BIC; ++i) {
+	for (i = 0; i < MAX_BIC; ++i) {
 
-		if (CPU_ISSET(i, set)) {
-			assert(i < MAX_BIC);
+		if (CPU_ISSET(i, set))
 			printf(" %s", bic[i].name);
-		}
 	}
 	putchar('\n');
 }
@@ -426,6 +428,8 @@ static void bic_groups_init(void)
 	BIC_INIT(&bic_group_cache);
 	SET_BIC(BIC_LLC_MRPS, &bic_group_cache);
 	SET_BIC(BIC_LLC_HIT, &bic_group_cache);
+	SET_BIC(BIC_L2_MRPS, &bic_group_cache);
+	SET_BIC(BIC_L2_HIT, &bic_group_cache);
 
 	BIC_INIT(&bic_group_other);
 	SET_BIC(BIC_IRQ, &bic_group_other);
@@ -482,6 +486,7 @@ FILE *outf;
 int *fd_percpu;
 int *fd_instr_count_percpu;
 int *fd_llc_percpu;
+int *fd_l2_percpu;
 struct timeval interval_tv = { 5, 0 };
 struct timespec interval_ts = { 5, 0 };
 
@@ -1249,6 +1254,84 @@ static const struct platform_data turbostat_pdata[] = {
 	{ 0, NULL },
 };
 
+struct {
+	unsigned int uniform;
+	unsigned int pcore;
+	unsigned int ecore;
+	unsigned int lcore;
+} perf_pmu_types;
+
+/*
+ * Events are enumerated in https://github.com/intel/perfmon
+ * and tools/perf/pmu-events/arch/x86/.../cache.json
+ */
+struct perf_l2_events {
+	unsigned long long refs;	/* L2_REQUEST.ALL */
+	unsigned long long hits;	/* L2_REQUEST.HIT */
+};
+
+struct perf_model_support {
+	unsigned int vfm;
+	struct perf_l2_events first;
+	struct perf_l2_events second;
+	struct perf_l2_events third;
+} *perf_model_support;
+
+/* Perf Cache Events */
+#define	PCE(ext_umask, umask)	(((unsigned long long) ext_umask) << 40 | umask << 8 | 0x24)
+
+/*
+ * Enumerate up to three perf CPU PMU's in a system.
+ * The first, second, and third columns are populated without skipping, describing
+ * pcore, ecore, lcore PMUs, in order, if present.  (The associated PMU "type" field is
+ * read from sysfs in all cases.)  Eg.
+ *
+ * non-hybrid:
+ *	GNR: pcore, {}, {}
+ *	ADL-N: ecore, {}, {}
+ * hybrid:
+ *	MTL: pcore, ecore, {}%
+ *	ARL-H: pcore, ecore, lcore
+ *	LNL: ecore, ecore%%, {}
+ *
+ * % MTL physical lcore share architecture and PMU with ecore, and are thus not enumerated separately.
+ * %% LNL physical lcore is enumerated by perf as ecore
+ */
+static struct perf_model_support turbostat_perf_model_support[] = {
+	{ INTEL_SAPPHIRERAPIDS_X, { PCE(0x00, 0xFF), PCE(0x00, 0xDF)}, {}, {} },
+	{ INTEL_EMERALDRAPIDS_X, { PCE(0x00, 0xFF), PCE(0x00, 0xDF)}, {}, {} },
+	{ INTEL_GRANITERAPIDS_X, { PCE(0x00, 0xFF), PCE(0x00, 0xDF)}, {}, {} },
+	{ INTEL_GRANITERAPIDS_D, { PCE(0x00, 0xFF), PCE(0x00, 0xDF)}, {}, {} },
+	{ INTEL_DIAMONDRAPIDS_X, { PCE(0x00, 0xFF), PCE(0x00, 0x5F)}, {}, {} },
+
+	{ INTEL_ATOM_GRACEMONT, { PCE(0x00, 0x00), PCE(0x00, 0x02)}, {}, {} },	/* ADL-N */
+	{ INTEL_ATOM_CRESTMONT_X, { PCE(0x00, 0x00), PCE(0x00, 0x02)}, {}, {} },	/* SRF */
+	{ INTEL_ATOM_CRESTMONT, { PCE(0x00, 0x00), PCE(0x00, 0x02)}, {}, {} },	/* GRR */
+	{ INTEL_ATOM_DARKMONT_X, { PCE(0x01, 0xFF), PCE(0x01, 0xBF)}, {}, {} },	/* CWF */
+
+	{ INTEL_ALDERLAKE, { PCE(0x00, 0xFF), PCE(0x00, 0xDF)}, { PCE(0x00, 0x00), PCE(0x00, 0x02)}, {} },
+	{ INTEL_ALDERLAKE, { PCE(0x00, 0xFF), PCE(0x00, 0xDF)}, { PCE(0x00, 0x00), PCE(0x00, 0x02)}, {} },
+	{ INTEL_ALDERLAKE_L, { PCE(0x00, 0xFF), PCE(0x00, 0xDF)}, { PCE(0x00, 0x00), PCE(0x00, 0x02)}, {} },
+	{ INTEL_RAPTORLAKE, { PCE(0x00, 0xFF), PCE(0x00, 0xDF)}, { PCE(0x00, 0x00), PCE(0x00, 0x02)}, {} },
+	{ INTEL_RAPTORLAKE_P, { PCE(0x00, 0xFF), PCE(0x00, 0xDF)}, { PCE(0x00, 0x00), PCE(0x00, 0x02)}, {} },
+	{ INTEL_RAPTORLAKE_S, { PCE(0x00, 0xFF), PCE(0x00, 0xDF)}, { PCE(0x00, 0x00), PCE(0x00, 0x02)}, {} },
+	{ INTEL_METEORLAKE_L, { PCE(0x00, 0xFF), PCE(0x00, 0xDF)}, { PCE(0x00, 0x00), PCE(0x00, 0x02)}, {} },
+	{ INTEL_METEORLAKE, { PCE(0x00, 0xFF), PCE(0x00, 0xDF)}, { PCE(0x00, 0x00), PCE(0x00, 0x02)}, {} },
+	{ INTEL_ARROWLAKE_U, { PCE(0x00, 0xFF), PCE(0x00, 0xDF)}, { PCE(0x00, 0x00), PCE(0x00, 0x02)}, {} },
+
+	{ INTEL_LUNARLAKE_M, { PCE(0x00, 0xFF), PCE(0x00, 0x5F)}, { PCE(0x00, 0x07), PCE(0x00, 0x02)}, {} },
+	{ INTEL_ARROWLAKE_H, { PCE(0x00, 0xFF), PCE(0x00, 0x5F)}, { PCE(0x00, 0x07), PCE(0x00, 0x02)}, { PCE(0x00, 0x00), PCE(0x00, 0x02)} },
+	{ INTEL_ARROWLAKE, { PCE(0x00, 0xFF), PCE(0x00, 0x5F)}, { PCE(0x00, 0x07), PCE(0x00, 0x02)}, {} },
+
+	{ INTEL_PANTHERLAKE_L, { PCE(0x00, 0xFF), PCE(0x00, 0x5F)}, { PCE(0x01, 0xFF), PCE(0x01, 0xBF)}, {} },
+	{ INTEL_WILDCATLAKE_L, { PCE(0x00, 0xFF), PCE(0x00, 0x5F)}, { PCE(0x01, 0xFF), PCE(0x01, 0xBF)}, {} },
+
+	{ INTEL_NOVALAKE, { PCE(0x00, 0xFF), PCE(0x00, 0x5F)}, { PCE(0x01, 0xFF), PCE(0x01, 0xBF)}, {} },
+	{ INTEL_NOVALAKE_L, { PCE(0x00, 0xFF), PCE(0x00, 0x5F)}, { PCE(0x01, 0xFF), PCE(0x01, 0xBF)}, {} },
+
+	{ 0, {}, {}, {} }
+};
+
 static const struct platform_features *platform;
 
 void probe_platform_features(unsigned int family, unsigned int model)
@@ -1292,6 +1375,21 @@ end:
 	exit(1);
 }
 
+void init_perf_model_support(unsigned int family, unsigned int model)
+{
+	int i;
+
+	if (!genuine_intel)
+		return;
+
+	for (i = 0; turbostat_perf_model_support[i].vfm; i++) {
+		if (VFM_FAMILY(turbostat_perf_model_support[i].vfm) == family && VFM_MODEL(turbostat_perf_model_support[i].vfm) == model) {
+			perf_model_support = &turbostat_perf_model_support[i];
+			return;
+		}
+	}
+}
+
 /* Model specific support End */
 
 #define	TJMAX_DEFAULT	100
@@ -1308,6 +1406,7 @@ char *progname;
 
 #define CPU_SUBSET_MAXCPUS	8192	/* need to use before probe... */
 cpu_set_t *cpu_present_set, *cpu_possible_set, *cpu_effective_set, *cpu_allowed_set, *cpu_affinity_set, *cpu_subset;
+cpu_set_t *perf_pcore_set, *perf_ecore_set, *perf_lcore_set;
 size_t cpu_present_setsize, cpu_possible_setsize, cpu_effective_setsize, cpu_allowed_setsize, cpu_affinity_setsize, cpu_subset_size;
 #define MAX_ADDED_THREAD_COUNTERS 24
 #define MAX_ADDED_CORE_COUNTERS 8
@@ -2008,6 +2107,10 @@ struct llc_stats {
 	unsigned long long references;
 	unsigned long long misses;
 };
+struct l2_stats {
+	unsigned long long references;
+	unsigned long long hits;
+};
 struct thread_data {
 	struct timeval tv_begin;
 	struct timeval tv_end;
@@ -2021,6 +2124,7 @@ struct thread_data {
 	unsigned long long nmi_count;
 	unsigned int smi_count;
 	struct llc_stats llc;
+	struct l2_stats l2;
 	unsigned int cpu_id;
 	unsigned int apic_id;
 	unsigned int x2apic_id;
@@ -2442,6 +2546,8 @@ static void bic_disable_perf_access(void)
 	CLR_BIC(BIC_IPC, &bic_enabled);
 	CLR_BIC(BIC_LLC_MRPS, &bic_enabled);
 	CLR_BIC(BIC_LLC_HIT, &bic_enabled);
+	CLR_BIC(BIC_L2_MRPS, &bic_enabled);
+	CLR_BIC(BIC_L2_HIT, &bic_enabled);
 }
 
 static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid, int cpu, int group_fd, unsigned long flags)
@@ -2820,6 +2926,12 @@ void print_header(char *delim)
 	if (DO_BIC(BIC_LLC_HIT))
 		outp += sprintf(outp, "%sLLC%%hit", (printed++ ? delim : ""));
 
+	if (DO_BIC(BIC_L2_MRPS))
+		outp += sprintf(outp, "%sL2MRPS", (printed++ ? delim : ""));
+
+	if (DO_BIC(BIC_L2_HIT))
+		outp += sprintf(outp, "%sL2%%hit", (printed++ ? delim : ""));
+
 	for (mp = sys.tp; mp; mp = mp->next)
 		outp += print_name(mp->width, &printed, delim, mp->name, mp->type, mp->format);
 
@@ -3057,6 +3169,10 @@ int dump_counters(PER_THREAD_PARAMS)
 		outp += sprintf(outp, "LLC miss: %lld", t->llc.misses);
 		outp += sprintf(outp, "LLC Hit%%: %.2f", pct((t->llc.references - t->llc.misses), t->llc.references));
 
+		outp += sprintf(outp, "L2 refs: %lld", t->l2.references);
+		outp += sprintf(outp, "L2 hits: %lld", t->l2.hits);
+		outp += sprintf(outp, "L2 Hit%%: %.2f", pct(t->l2.hits, t->l2.references));
+
 		for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) {
 			outp += sprintf(outp, "tADDED [%d] %8s msr0x%x: %08llX %s\n", i, mp->name, mp->msr_num, t->counter[i], mp->sp->path);
 		}
@@ -3158,6 +3274,26 @@ void get_perf_llc_stats(int cpu, struct llc_stats *llc)
 		warn("%s: failed to read perf_data (req %zu act %zu)", __func__, expected_read_size, actual_read_size);
 }
 
+void get_perf_l2_stats(int cpu, struct l2_stats *l2)
+{
+	struct read_format {
+		unsigned long long num_read;
+		struct l2_stats l2;
+	} r;
+	const ssize_t expected_read_size = sizeof(r);
+	ssize_t actual_read_size;
+
+	actual_read_size = read(fd_l2_percpu[cpu], &r, expected_read_size);
+
+	if (actual_read_size == -1)
+		err(-1, "%s(cpu%d,) %d,,%ld", __func__, cpu, fd_l2_percpu[cpu], expected_read_size);
+
+	l2->references = r.l2.references;
+	l2->hits = r.l2.hits;
+	if (actual_read_size != expected_read_size)
+		warn("%s: cpu%d: failed to read(%d) perf_data (req %zu act %zu)", __func__, cpu, fd_l2_percpu[cpu], expected_read_size, actual_read_size);
+}
+
 /*
  * column formatting convention & formats
  */
@@ -3306,13 +3442,18 @@ int format_counters(PER_THREAD_PARAMS)
 		outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), t->smi_count);
 
 	/* LLC Stats */
-	if (DO_BIC(BIC_LLC_MRPS) || DO_BIC(BIC_LLC_HIT)) {
-		if (DO_BIC(BIC_LLC_MRPS))
-			outp += sprintf(outp, "%s%.0f", (printed++ ? delim : ""), t->llc.references / interval_float / 1000000);
+	if (DO_BIC(BIC_LLC_MRPS))
+		outp += sprintf(outp, "%s%.0f", (printed++ ? delim : ""), t->llc.references / interval_float / 1000000);
 
-		if (DO_BIC(BIC_LLC_HIT))
-			outp += sprintf(outp, fmt8, (printed++ ? delim : ""), pct((t->llc.references - t->llc.misses), t->llc.references));
-	}
+	if (DO_BIC(BIC_LLC_HIT))
+		outp += sprintf(outp, fmt8, (printed++ ? delim : ""), pct((t->llc.references - t->llc.misses), t->llc.references));
+
+	/* L2 Stats */
+	if (DO_BIC(BIC_L2_MRPS))
+		outp += sprintf(outp, "%s%.0f", (printed++ ? delim : ""), t->l2.references / interval_float / 1000000);
+
+	if (DO_BIC(BIC_L2_HIT))
+		outp += sprintf(outp, fmt8, (printed++ ? delim : ""), pct(t->l2.hits, t->l2.references));
 
 	/* Added Thread Counters */
 	for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) {
@@ -3855,12 +3996,18 @@ int delta_thread(struct thread_data *new, struct thread_data *old, struct core_d
 	if (DO_BIC(BIC_SMI))
 		old->smi_count = new->smi_count - old->smi_count;
 
-	if (DO_BIC(BIC_LLC_MRPS))
+	if (DO_BIC(BIC_LLC_MRPS) || DO_BIC(BIC_LLC_HIT))
 		old->llc.references = new->llc.references - old->llc.references;
 
 	if (DO_BIC(BIC_LLC_HIT))
 		old->llc.misses = new->llc.misses - old->llc.misses;
 
+	if (DO_BIC(BIC_L2_MRPS) || DO_BIC(BIC_L2_HIT))
+		old->l2.references = new->l2.references - old->l2.references;
+
+	if (DO_BIC(BIC_L2_HIT))
+		old->l2.hits = new->l2.hits - old->l2.hits;
+
 	for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) {
 		if (mp->format == FORMAT_RAW || mp->format == FORMAT_AVERAGE)
 			old->counter[i] = new->counter[i];
@@ -3941,6 +4088,9 @@ void clear_counters(PER_THREAD_PARAMS)
 	t->llc.references = 0;
 	t->llc.misses = 0;
 
+	t->l2.references = 0;
+	t->l2.hits = 0;
+
 	c->c3 = 0;
 	c->c6 = 0;
 	c->c7 = 0;
@@ -3949,9 +4099,6 @@ void clear_counters(PER_THREAD_PARAMS)
 	rapl_counter_clear(&c->core_energy);
 	c->core_throt_cnt = 0;
 
-	t->llc.references = 0;
-	t->llc.misses = 0;
-
 	p->pkg_wtd_core_c0 = 0;
 	p->pkg_any_core_c0 = 0;
 	p->pkg_any_gfxe_c0 = 0;
@@ -4052,6 +4199,9 @@ int sum_counters(PER_THREAD_PARAMS)
 	average.threads.llc.references += t->llc.references;
 	average.threads.llc.misses += t->llc.misses;
 
+	average.threads.l2.references += t->l2.references;
+	average.threads.l2.hits += t->l2.hits;
+
 	for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) {
 		if (mp->format == FORMAT_RAW)
 			continue;
@@ -5070,6 +5220,9 @@ int get_counters(PER_THREAD_PARAMS)
 	if (DO_BIC(BIC_LLC_MRPS) || DO_BIC(BIC_LLC_HIT))
 		get_perf_llc_stats(cpu, &t->llc);
 
+	if (DO_BIC(BIC_L2_MRPS) || DO_BIC(BIC_L2_HIT))
+		get_perf_l2_stats(cpu, &t->l2);
+
 	if (DO_BIC(BIC_IPC))
 		if (read(get_instr_count_fd(cpu), &t->instr_count, sizeof(long long)) != sizeof(long long))
 			return -4;
@@ -5685,6 +5838,26 @@ void free_fd_llc_percpu(void)
 
 	free(fd_llc_percpu);
 	fd_llc_percpu = NULL;
+
+	BIC_NOT_PRESENT(BIC_LLC_MRPS);
+	BIC_NOT_PRESENT(BIC_LLC_HIT);
+}
+
+void free_fd_l2_percpu(void)
+{
+	if (!fd_l2_percpu)
+		return;
+
+	for (int i = 0; i < topo.max_cpu_num + 1; ++i) {
+		if (fd_l2_percpu[i] != 0)
+			close(fd_l2_percpu[i]);
+	}
+
+	free(fd_l2_percpu);
+	fd_l2_percpu = NULL;
+
+	BIC_NOT_PRESENT(BIC_L2_MRPS);
+	BIC_NOT_PRESENT(BIC_L2_HIT);
 }
 
 void free_fd_cstate(void)
@@ -5789,6 +5962,21 @@ void free_all_buffers(void)
 	cpu_affinity_set = NULL;
 	cpu_affinity_setsize = 0;
 
+	if (perf_pcore_set) {
+		CPU_FREE(perf_pcore_set);
+		perf_pcore_set = NULL;
+	}
+
+	if (perf_ecore_set) {
+		CPU_FREE(perf_ecore_set);
+		perf_ecore_set = NULL;
+	}
+
+	if (perf_lcore_set) {
+		CPU_FREE(perf_lcore_set);
+		perf_lcore_set = NULL;
+	}
+
 	free(thread_even);
 	free(core_even);
 	free(package_even);
@@ -5812,6 +6000,7 @@ void free_all_buffers(void)
 	free_fd_percpu();
 	free_fd_instr_count_percpu();
 	free_fd_llc_percpu();
+	free_fd_l2_percpu();
 	free_fd_msr();
 	free_fd_rapl_percpu();
 	free_fd_cstate();
@@ -6159,6 +6348,7 @@ void msr_perf_init(void);
 void rapl_perf_init(void);
 void cstate_perf_init(void);
 void perf_llc_init(void);
+void perf_l2_init(void);
 void added_perf_counters_init(void);
 void pmt_init(void);
 
@@ -6171,6 +6361,7 @@ void re_initialize(void)
 	rapl_perf_init();
 	cstate_perf_init();
 	perf_llc_init();
+	perf_l2_init();
 	added_perf_counters_init();
 	pmt_init();
 	fprintf(outf, "turbostat: re-initialized with num_cpus %d, allowed_cpus %d\n", topo.num_cpus, topo.allowed_cpus);
@@ -8333,25 +8524,126 @@ end:
 	return ret;
 }
 
+char cpuset_buf[1024];
+int initialize_cpu_set_from_sysfs(cpu_set_t *cpu_set, char *sysfs_path, char *sysfs_file)
+{
+	FILE *fp;
+	char path[128];
+
+	if (snprintf(path, 128, "%s/%s", sysfs_path, sysfs_file) > 128)
+		err(-1, "%s %s", sysfs_path, sysfs_file);
+
+	fp = fopen(path, "r");
+	if (!fp) {
+		warn("open %s", path);
+		return -1;
+	}
+	if (fread(cpuset_buf, sizeof(char), 1024, fp) == 0) {
+		warn("read %s", sysfs_path);
+		goto err;
+	}
+	if (parse_cpu_str(cpuset_buf, cpu_set, cpu_possible_setsize)) {
+		warnx("%s: cpu str malformat %s\n", sysfs_path, cpu_effective_str);
+		goto err;
+	}
+	return 0;
+
+err:
+	fclose(fp);
+	return -1;
+}
+
+void print_cpu_set(char *s, cpu_set_t *set)
+{
+	int i;
+
+	assert(MAX_BIC < CPU_SETSIZE);
+
+	printf("%s:", s);
+
+	for (i = 0; i <= topo.max_cpu_num; ++i)
+		if (CPU_ISSET(i, set))
+			printf(" %d", i);
+	putchar('\n');
+}
+
+void linux_perf_init_hybrid_cpus(void)
+{
+	char *perf_cpu_pcore_path = "/sys/devices/cpu_core";
+	char *perf_cpu_ecore_path = "/sys/devices/cpu_atom";
+	char *perf_cpu_lcore_path = "/sys/devices/cpu_lowpower";
+	char path[128];
+
+	if (!access(perf_cpu_pcore_path, F_OK)) {
+		perf_pcore_set = CPU_ALLOC((topo.max_cpu_num + 1));
+		if (perf_pcore_set == NULL)
+			err(3, "CPU_ALLOC");
+		CPU_ZERO_S(cpu_possible_setsize, perf_pcore_set);
+		initialize_cpu_set_from_sysfs(perf_pcore_set, perf_cpu_pcore_path, "cpus");
+		if (debug)
+			print_cpu_set("perf pcores", perf_pcore_set);
+		sprintf(path, "%s/%s", perf_cpu_pcore_path, "type");
+		perf_pmu_types.pcore = snapshot_sysfs_counter(path);
+	}
+
+	if (!access(perf_cpu_ecore_path, F_OK)) {
+		perf_ecore_set = CPU_ALLOC((topo.max_cpu_num + 1));
+		if (perf_ecore_set == NULL)
+			err(3, "CPU_ALLOC");
+		CPU_ZERO_S(cpu_possible_setsize, perf_ecore_set);
+		initialize_cpu_set_from_sysfs(perf_ecore_set, perf_cpu_ecore_path, "cpus");
+		if (debug)
+			print_cpu_set("perf ecores", perf_ecore_set);
+		sprintf(path, "%s/%s", perf_cpu_ecore_path, "type");
+		perf_pmu_types.ecore = snapshot_sysfs_counter(path);
+	}
+
+	if (!access(perf_cpu_lcore_path, F_OK)) {
+		perf_lcore_set = CPU_ALLOC((topo.max_cpu_num + 1));
+		if (perf_lcore_set == NULL)
+			err(3, "CPU_ALLOC");
+		CPU_ZERO_S(cpu_possible_setsize, perf_lcore_set);
+		initialize_cpu_set_from_sysfs(perf_lcore_set, perf_cpu_lcore_path, "cpus");
+		if (debug)
+			print_cpu_set("perf lcores", perf_lcore_set);
+		sprintf(path, "%s/%s", perf_cpu_lcore_path, "type");
+		perf_pmu_types.lcore = snapshot_sysfs_counter(path);
+	}
+}
+
 /*
- * Linux-perf manages the HW instructions-retired counter
- * by enabling when requested, and hiding rollover
+ * Linux-perf related initialization
  */
 void linux_perf_init(void)
 {
+	char path[128];
+	char *perf_cpu_path = "/sys/devices/cpu";
+
 	if (access("/proc/sys/kernel/perf_event_paranoid", F_OK))
 		return;
 
+	if (!access(perf_cpu_path, F_OK)) {
+		sprintf(path, "%s/%s", perf_cpu_path, "type");
+		perf_pmu_types.uniform = snapshot_sysfs_counter(path);
+	} else {
+		linux_perf_init_hybrid_cpus();
+	}
+
 	if (BIC_IS_ENABLED(BIC_IPC) && cpuid_has_aperf_mperf) {
 		fd_instr_count_percpu = calloc(topo.max_cpu_num + 1, sizeof(int));
 		if (fd_instr_count_percpu == NULL)
 			err(-1, "calloc fd_instr_count_percpu");
 	}
-	if (BIC_IS_ENABLED(BIC_LLC_MRPS)) {
+	if (BIC_IS_ENABLED(BIC_LLC_MRPS) || BIC_IS_ENABLED(BIC_LLC_HIT)) {
 		fd_llc_percpu = calloc(topo.max_cpu_num + 1, sizeof(int));
 		if (fd_llc_percpu == NULL)
 			err(-1, "calloc fd_llc_percpu");
 	}
+	if (BIC_IS_ENABLED(BIC_L2_MRPS) || BIC_IS_ENABLED(BIC_L2_HIT)) {
+		fd_l2_percpu = calloc(topo.max_cpu_num + 1, sizeof(int));
+		if (fd_l2_percpu == NULL)
+			err(-1, "calloc fd_l2_percpu");
+	}
 }
 
 void rapl_perf_init(void)
@@ -8783,6 +9075,7 @@ void probe_pstates(void)
 	for_all_cpus(print_epb, ODD_COUNTERS);
 	for_all_cpus(print_perf_limit, ODD_COUNTERS);
 }
+
 void dump_word_chars(unsigned int word)
 {
 	int i;
@@ -8790,6 +9083,7 @@ void dump_word_chars(unsigned int word)
 	for (i = 0; i < 4; ++i)
 		fprintf(outf, "%c", (word >> (i * 8)) & 0xFF);
 }
+
 void dump_cpuid_hypervisor(void)
 {
 	unsigned int ebx = 0;
@@ -8875,6 +9169,7 @@ void process_cpuid()
 		dump_cpuid_hypervisor();
 
 	probe_platform_features(family, model);
+	init_perf_model_support(family, model);
 
 	if (!(edx_flags & (1 << 5)))
 		errx(1, "CPUID: no MSR");
@@ -9041,7 +9336,8 @@ void probe_pm_features(void)
 		decode_misc_feature_control();
 }
 
-/* perf_llc_probe
+/*
+ * has_perf_llc_access()
  *
  * return 1 on success, else 0
  */
@@ -9070,22 +9366,22 @@ void perf_llc_init(void)
 
 	if (no_perf)
 		return;
-	if (!(BIC_IS_ENABLED(BIC_LLC_MRPS) && BIC_IS_ENABLED(BIC_LLC_HIT)))
+	if (!(BIC_IS_ENABLED(BIC_LLC_MRPS) || BIC_IS_ENABLED(BIC_LLC_HIT)))
 		return;
 
+	assert(fd_llc_percpu != 0);
+
 	for (cpu = 0; cpu <= topo.max_cpu_num; ++cpu) {
 
 		if (cpu_is_not_allowed(cpu))
 			continue;
 
-		assert(fd_llc_percpu != 0);
 		fd_llc_percpu[cpu] = open_perf_counter(cpu, PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_REFERENCES, -1, PERF_FORMAT_GROUP);
 		if (fd_llc_percpu[cpu] == -1) {
 			warnx("%s: perf REFS: failed to open counter on cpu%d", __func__, cpu);
 			free_fd_llc_percpu();
 			return;
 		}
-		assert(fd_llc_percpu != 0);
 		retval = open_perf_counter(cpu, PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_MISSES, fd_llc_percpu[cpu], PERF_FORMAT_GROUP);
 		if (retval == -1) {
 			warnx("%s: perf MISS: failed to open counter on cpu%d", __func__, cpu);
@@ -9097,6 +9393,86 @@ void perf_llc_init(void)
 	BIC_PRESENT(BIC_LLC_HIT);
 }
 
+void perf_l2_init(void)
+{
+	int cpu;
+	int retval;
+
+	if (no_perf)
+		return;
+	if (!(BIC_IS_ENABLED(BIC_L2_MRPS) || BIC_IS_ENABLED(BIC_L2_HIT)))
+		return;
+	if (perf_model_support == NULL)
+		return;
+
+	assert(fd_l2_percpu != 0);
+
+	for (cpu = 0; cpu <= topo.max_cpu_num; ++cpu) {
+
+		if (cpu_is_not_allowed(cpu))
+			continue;
+
+		if (!is_hybrid) {
+			fd_l2_percpu[cpu] = open_perf_counter(cpu, perf_pmu_types.uniform, perf_model_support->first.refs, -1, PERF_FORMAT_GROUP);
+			if (fd_l2_percpu[cpu] == -1) {
+				err(-1, "%s(cpu%d, 0x%x, 0x%llx) REFS", __func__, cpu, perf_pmu_types.uniform, perf_model_support->first.refs);
+				free_fd_l2_percpu();
+				return;
+			}
+			retval = open_perf_counter(cpu, perf_pmu_types.uniform, perf_model_support->first.hits, fd_l2_percpu[cpu], PERF_FORMAT_GROUP);
+			if (retval == -1) {
+				err(-1, "%s(cpu%d, 0x%x, 0x%llx) HITS", __func__, cpu, perf_pmu_types.uniform, perf_model_support->first.hits);
+				free_fd_l2_percpu();
+				return;
+			}
+			continue;
+		}
+		if (perf_pcore_set && CPU_ISSET_S(cpu, cpu_possible_setsize, perf_pcore_set)) {
+			fd_l2_percpu[cpu] = open_perf_counter(cpu, perf_pmu_types.pcore, perf_model_support->first.refs, -1, PERF_FORMAT_GROUP);
+			if (fd_l2_percpu[cpu] == -1) {
+				err(-1, "%s(cpu%d, 0x%x, 0x%llx) REFS", __func__, cpu, perf_pmu_types.pcore, perf_model_support->first.refs);
+				free_fd_l2_percpu();
+				return;
+			}
+			retval = open_perf_counter(cpu, perf_pmu_types.pcore, perf_model_support->first.hits, fd_l2_percpu[cpu], PERF_FORMAT_GROUP);
+			if (retval == -1) {
+				err(-1, "%s(cpu%d, 0x%x, 0x%llx) HITS", __func__, cpu, perf_pmu_types.pcore, perf_model_support->first.hits);
+				free_fd_l2_percpu();
+				return;
+			}
+		} else if (perf_ecore_set && CPU_ISSET_S(cpu, cpu_possible_setsize, perf_ecore_set)) {
+			fd_l2_percpu[cpu] = open_perf_counter(cpu, perf_pmu_types.ecore, perf_model_support->second.refs, -1, PERF_FORMAT_GROUP);
+			if (fd_l2_percpu[cpu] == -1) {
+				err(-1, "%s(cpu%d, 0x%x, 0x%llx) REFS", __func__, cpu, perf_pmu_types.pcore, perf_model_support->second.refs);
+				free_fd_l2_percpu();
+				return;
+			}
+			retval = open_perf_counter(cpu, perf_pmu_types.ecore, perf_model_support->second.hits, fd_l2_percpu[cpu], PERF_FORMAT_GROUP);
+			if (retval == -1) {
+				err(-1, "%s(cpu%d, 0x%x, 0x%llx) HITS", __func__, cpu, perf_pmu_types.pcore, perf_model_support->second.hits);
+				free_fd_l2_percpu();
+				return;
+			}
+		} else if (perf_lcore_set && CPU_ISSET_S(cpu, cpu_possible_setsize, perf_lcore_set)) {
+			fd_l2_percpu[cpu] = open_perf_counter(cpu, perf_pmu_types.lcore, perf_model_support->third.refs, -1, PERF_FORMAT_GROUP);
+			if (fd_l2_percpu[cpu] == -1) {
+				err(-1, "%s(cpu%d, 0x%x, 0x%llx) REFS", __func__, cpu, perf_pmu_types.pcore, perf_model_support->third.refs);
+				free_fd_l2_percpu();
+				return;
+			}
+			retval = open_perf_counter(cpu, perf_pmu_types.lcore, perf_model_support->third.hits, fd_l2_percpu[cpu], PERF_FORMAT_GROUP);
+			if (retval == -1) {
+				err(-1, "%s(cpu%d, 0x%x, 0x%llx) HITS", __func__, cpu, perf_pmu_types.pcore, perf_model_support->third.hits);
+				free_fd_l2_percpu();
+				return;
+			}
+		} else
+			err(-1, "%s: cpu%d: type %d", __func__, cpu, cpus[cpu].type);
+	}
+	BIC_PRESENT(BIC_L2_MRPS);
+	BIC_PRESENT(BIC_L2_HIT);
+}
+
 /*
  * in /dev/cpu/ return success for names that are numbers
  * ie. filter out ".", "..", "microcode".
@@ -9109,33 +9485,6 @@ int dir_filter(const struct dirent *dirp)
 		return 0;
 }
 
-char *possible_file = "/sys/devices/system/cpu/possible";
-char possible_buf[1024];
-
-int initialize_cpu_possible_set(void)
-{
-	FILE *fp;
-
-	fp = fopen(possible_file, "r");
-	if (!fp) {
-		warn("open %s", possible_file);
-		return -1;
-	}
-	if (fread(possible_buf, sizeof(char), 1024, fp) == 0) {
-		warn("read %s", possible_file);
-		goto err;
-	}
-	if (parse_cpu_str(possible_buf, cpu_possible_set, cpu_possible_setsize)) {
-		warnx("%s: cpu str malformat %s\n", possible_file, cpu_effective_str);
-		goto err;
-	}
-	return 0;
-
-err:
-	fclose(fp);
-	return -1;
-}
-
 void topology_probe(bool startup)
 {
 	int i;
@@ -9175,7 +9524,7 @@ void topology_probe(bool startup)
 		err(3, "CPU_ALLOC");
 	cpu_possible_setsize = CPU_ALLOC_SIZE((topo.max_cpu_num + 1));
 	CPU_ZERO_S(cpu_possible_setsize, cpu_possible_set);
-	initialize_cpu_possible_set();
+	initialize_cpu_set_from_sysfs(cpu_possible_set, "/sys/devices/system/cpu", "possible");
 
 	/*
 	 * Allocate and initialize cpu_effective_set
@@ -10078,6 +10427,7 @@ void turbostat_init()
 	rapl_perf_init();
 	cstate_perf_init();
 	perf_llc_init();
+	perf_l2_init();
 	added_perf_counters_init();
 	pmt_init();
 
@@ -10183,7 +10533,7 @@ int get_and_dump_counters(void)
 
 void print_version()
 {
-	fprintf(outf, "turbostat version 2025.12.05 - Len Brown <lenb@kernel.org>\n");
+	fprintf(outf, "turbostat version 2026.02.04 - Len Brown <lenb@kernel.org>\n");
 }
 
 #define COMMAND_LINE_SIZE 2048
-- 
cgit v1.2.3


From 54ca69f33cf1b67dd3dc9555fb9d82c83ee3ba89 Mon Sep 17 00:00:00 2001
From: Len Brown <len.brown@intel.com>
Date: Sat, 7 Feb 2026 10:17:38 -0600
Subject: tools/power turbostat: Cleanup internal use of "base_cpu"

Disambiguate the uses "base_cpu":

master_cpu: lowest permitted cpu#, read global MSRs here
package_data.first_cpu: lowest permitted cpu# in that package
core_data.first_cpu: lowest permitted cpu# in the core
current_cpu: where I'm running now

No functional change.

Signed-off-by: Len Brown <len.brown@intel.com>
---
 tools/power/x86/turbostat/turbostat.c | 200 +++++++++++++++++-----------------
 1 file changed, 100 insertions(+), 100 deletions(-)

(limited to 'tools')

diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c
index 0b52363d5e7c..2c603990c0d3 100644
--- a/tools/power/x86/turbostat/turbostat.c
+++ b/tools/power/x86/turbostat/turbostat.c
@@ -534,7 +534,7 @@ double rapl_dram_energy_units, rapl_energy_units, rapl_psys_energy_units;
 double rapl_joule_counter_range;
 unsigned int crystal_hz;
 unsigned long long tsc_hz;
-int base_cpu;
+int master_cpu;
 unsigned int has_hwp;		/* IA32_PM_ENABLE, IA32_HWP_CAPABILITIES */
 			/* IA32_HWP_REQUEST, IA32_HWP_STATUS */
 unsigned int has_hwp_notify;	/* IA32_HWP_INTERRUPT */
@@ -626,7 +626,7 @@ double slm_bclk(void)
 	unsigned int i;
 	double freq;
 
-	if (get_msr(base_cpu, MSR_FSB_FREQ, &msr))
+	if (get_msr(master_cpu, MSR_FSB_FREQ, &msr))
 		fprintf(outf, "SLM BCLK: unknown\n");
 
 	i = msr & 0xf;
@@ -2136,7 +2136,7 @@ struct thread_data {
 } *thread_even, *thread_odd;
 
 struct core_data {
-	int base_cpu;
+	int first_cpu;
 	unsigned long long c3;
 	unsigned long long c6;
 	unsigned long long c7;
@@ -2151,7 +2151,7 @@ struct core_data {
 } *core_even, *core_odd;
 
 struct pkg_data {
-	int base_cpu;
+	int first_cpu;
 	unsigned long long pc2;
 	unsigned long long pc3;
 	unsigned long long pc6;
@@ -2486,12 +2486,12 @@ int for_all_cpus(int (func) (struct thread_data *, struct core_data *, struct pk
 
 int is_cpu_first_thread_in_core(struct thread_data *t, struct core_data *c)
 {
-	return ((int)t->cpu_id == c->base_cpu || c->base_cpu < 0);
+	return ((int)t->cpu_id == c->first_cpu || c->first_cpu < 0);
 }
 
 int is_cpu_first_core_in_package(struct thread_data *t, struct pkg_data *p)
 {
-	return ((int)t->cpu_id == p->base_cpu || p->base_cpu < 0);
+	return ((int)t->cpu_id == p->first_cpu || p->first_cpu < 0);
 }
 
 int is_cpu_first_thread_in_package(struct thread_data *t, struct core_data *c, struct pkg_data *p)
@@ -5455,7 +5455,7 @@ void probe_cst_limit(void)
 		return;
 	}
 
-	get_msr(base_cpu, MSR_PKG_CST_CONFIG_CONTROL, &msr);
+	get_msr(master_cpu, MSR_PKG_CST_CONFIG_CONTROL, &msr);
 	pkg_cstate_limit = pkg_cstate_limits[msr & 0xF];
 }
 
@@ -5467,9 +5467,9 @@ static void dump_platform_info(void)
 	if (!platform->has_nhm_msrs || no_msr)
 		return;
 
-	get_msr(base_cpu, MSR_PLATFORM_INFO, &msr);
+	get_msr(master_cpu, MSR_PLATFORM_INFO, &msr);
 
-	fprintf(outf, "cpu%d: MSR_PLATFORM_INFO: 0x%08llx\n", base_cpu, msr);
+	fprintf(outf, "cpu%d: MSR_PLATFORM_INFO: 0x%08llx\n", master_cpu, msr);
 
 	ratio = (msr >> 40) & 0xFF;
 	fprintf(outf, "%d * %.1f = %.1f MHz max efficiency frequency\n", ratio, bclk, ratio * bclk);
@@ -5485,8 +5485,8 @@ static void dump_power_ctl(void)
 	if (!platform->has_nhm_msrs || no_msr)
 		return;
 
-	get_msr(base_cpu, MSR_IA32_POWER_CTL, &msr);
-	fprintf(outf, "cpu%d: MSR_IA32_POWER_CTL: 0x%08llx (C1E auto-promotion: %sabled)\n", base_cpu, msr, msr & 0x2 ? "EN" : "DIS");
+	get_msr(master_cpu, MSR_IA32_POWER_CTL, &msr);
+	fprintf(outf, "cpu%d: MSR_IA32_POWER_CTL: 0x%08llx (C1E auto-promotion: %sabled)\n", master_cpu, msr, msr & 0x2 ? "EN" : "DIS");
 
 	/* C-state Pre-wake Disable (CSTATE_PREWAKE_DISABLE) */
 	if (platform->has_cst_prewake_bit)
@@ -5500,9 +5500,9 @@ static void dump_turbo_ratio_limit2(void)
 	unsigned long long msr;
 	unsigned int ratio;
 
-	get_msr(base_cpu, MSR_TURBO_RATIO_LIMIT2, &msr);
+	get_msr(master_cpu, MSR_TURBO_RATIO_LIMIT2, &msr);
 
-	fprintf(outf, "cpu%d: MSR_TURBO_RATIO_LIMIT2: 0x%08llx\n", base_cpu, msr);
+	fprintf(outf, "cpu%d: MSR_TURBO_RATIO_LIMIT2: 0x%08llx\n", master_cpu, msr);
 
 	ratio = (msr >> 8) & 0xFF;
 	if (ratio)
@@ -5519,9 +5519,9 @@ static void dump_turbo_ratio_limit1(void)
 	unsigned long long msr;
 	unsigned int ratio;
 
-	get_msr(base_cpu, MSR_TURBO_RATIO_LIMIT1, &msr);
+	get_msr(master_cpu, MSR_TURBO_RATIO_LIMIT1, &msr);
 
-	fprintf(outf, "cpu%d: MSR_TURBO_RATIO_LIMIT1: 0x%08llx\n", base_cpu, msr);
+	fprintf(outf, "cpu%d: MSR_TURBO_RATIO_LIMIT1: 0x%08llx\n", master_cpu, msr);
 
 	ratio = (msr >> 56) & 0xFF;
 	if (ratio)
@@ -5562,13 +5562,13 @@ static void dump_turbo_ratio_limits(int trl_msr_offset)
 	unsigned long long msr, core_counts;
 	int shift;
 
-	get_msr(base_cpu, trl_msr_offset, &msr);
+	get_msr(master_cpu, trl_msr_offset, &msr);
 	fprintf(outf, "cpu%d: MSR_%sTURBO_RATIO_LIMIT: 0x%08llx\n",
-		base_cpu, trl_msr_offset == MSR_SECONDARY_TURBO_RATIO_LIMIT ? "SECONDARY_" : "", msr);
+		master_cpu, trl_msr_offset == MSR_SECONDARY_TURBO_RATIO_LIMIT ? "SECONDARY_" : "", msr);
 
 	if (platform->trl_msrs & TRL_CORECOUNT) {
-		get_msr(base_cpu, MSR_TURBO_RATIO_LIMIT1, &core_counts);
-		fprintf(outf, "cpu%d: MSR_TURBO_RATIO_LIMIT1: 0x%08llx\n", base_cpu, core_counts);
+		get_msr(master_cpu, MSR_TURBO_RATIO_LIMIT1, &core_counts);
+		fprintf(outf, "cpu%d: MSR_TURBO_RATIO_LIMIT1: 0x%08llx\n", master_cpu, core_counts);
 	} else {
 		core_counts = 0x0807060504030201;
 	}
@@ -5590,8 +5590,8 @@ static void dump_atom_turbo_ratio_limits(void)
 	unsigned long long msr;
 	unsigned int ratio;
 
-	get_msr(base_cpu, MSR_ATOM_CORE_RATIOS, &msr);
-	fprintf(outf, "cpu%d: MSR_ATOM_CORE_RATIOS: 0x%08llx\n", base_cpu, msr & 0xFFFFFFFF);
+	get_msr(master_cpu, MSR_ATOM_CORE_RATIOS, &msr);
+	fprintf(outf, "cpu%d: MSR_ATOM_CORE_RATIOS: 0x%08llx\n", master_cpu, msr & 0xFFFFFFFF);
 
 	ratio = (msr >> 0) & 0x3F;
 	if (ratio)
@@ -5605,8 +5605,8 @@ static void dump_atom_turbo_ratio_limits(void)
 	if (ratio)
 		fprintf(outf, "%d * %.1f = %.1f MHz base frequency\n", ratio, bclk, ratio * bclk);
 
-	get_msr(base_cpu, MSR_ATOM_CORE_TURBO_RATIOS, &msr);
-	fprintf(outf, "cpu%d: MSR_ATOM_CORE_TURBO_RATIOS: 0x%08llx\n", base_cpu, msr & 0xFFFFFFFF);
+	get_msr(master_cpu, MSR_ATOM_CORE_TURBO_RATIOS, &msr);
+	fprintf(outf, "cpu%d: MSR_ATOM_CORE_TURBO_RATIOS: 0x%08llx\n", master_cpu, msr & 0xFFFFFFFF);
 
 	ratio = (msr >> 24) & 0x3F;
 	if (ratio)
@@ -5635,9 +5635,9 @@ static void dump_knl_turbo_ratio_limits(void)
 	unsigned int cores[buckets_no];
 	unsigned int ratio[buckets_no];
 
-	get_msr(base_cpu, MSR_TURBO_RATIO_LIMIT, &msr);
+	get_msr(master_cpu, MSR_TURBO_RATIO_LIMIT, &msr);
 
-	fprintf(outf, "cpu%d: MSR_TURBO_RATIO_LIMIT: 0x%08llx\n", base_cpu, msr);
+	fprintf(outf, "cpu%d: MSR_TURBO_RATIO_LIMIT: 0x%08llx\n", master_cpu, msr);
 
 	/*
 	 * Turbo encoding in KNL is as follows:
@@ -5687,9 +5687,9 @@ static void dump_cst_cfg(void)
 	if (!platform->has_nhm_msrs || no_msr)
 		return;
 
-	get_msr(base_cpu, MSR_PKG_CST_CONFIG_CONTROL, &msr);
+	get_msr(master_cpu, MSR_PKG_CST_CONFIG_CONTROL, &msr);
 
-	fprintf(outf, "cpu%d: MSR_PKG_CST_CONFIG_CONTROL: 0x%08llx", base_cpu, msr);
+	fprintf(outf, "cpu%d: MSR_PKG_CST_CONFIG_CONTROL: 0x%08llx", master_cpu, msr);
 
 	fprintf(outf, " (%s%s%s%s%slocked, pkg-cstate-limit=%d (%s)",
 		(msr & SNB_C3_AUTO_UNDEMOTE) ? "UNdemote-C3, " : "",
@@ -5712,12 +5712,12 @@ static void dump_config_tdp(void)
 {
 	unsigned long long msr;
 
-	get_msr(base_cpu, MSR_CONFIG_TDP_NOMINAL, &msr);
-	fprintf(outf, "cpu%d: MSR_CONFIG_TDP_NOMINAL: 0x%08llx", base_cpu, msr);
+	get_msr(master_cpu, MSR_CONFIG_TDP_NOMINAL, &msr);
+	fprintf(outf, "cpu%d: MSR_CONFIG_TDP_NOMINAL: 0x%08llx", master_cpu, msr);
 	fprintf(outf, " (base_ratio=%d)\n", (unsigned int)msr & 0xFF);
 
-	get_msr(base_cpu, MSR_CONFIG_TDP_LEVEL_1, &msr);
-	fprintf(outf, "cpu%d: MSR_CONFIG_TDP_LEVEL_1: 0x%08llx (", base_cpu, msr);
+	get_msr(master_cpu, MSR_CONFIG_TDP_LEVEL_1, &msr);
+	fprintf(outf, "cpu%d: MSR_CONFIG_TDP_LEVEL_1: 0x%08llx (", master_cpu, msr);
 	if (msr) {
 		fprintf(outf, "PKG_MIN_PWR_LVL1=%d ", (unsigned int)(msr >> 48) & 0x7FFF);
 		fprintf(outf, "PKG_MAX_PWR_LVL1=%d ", (unsigned int)(msr >> 32) & 0x7FFF);
@@ -5726,8 +5726,8 @@ static void dump_config_tdp(void)
 	}
 	fprintf(outf, ")\n");
 
-	get_msr(base_cpu, MSR_CONFIG_TDP_LEVEL_2, &msr);
-	fprintf(outf, "cpu%d: MSR_CONFIG_TDP_LEVEL_2: 0x%08llx (", base_cpu, msr);
+	get_msr(master_cpu, MSR_CONFIG_TDP_LEVEL_2, &msr);
+	fprintf(outf, "cpu%d: MSR_CONFIG_TDP_LEVEL_2: 0x%08llx (", master_cpu, msr);
 	if (msr) {
 		fprintf(outf, "PKG_MIN_PWR_LVL2=%d ", (unsigned int)(msr >> 48) & 0x7FFF);
 		fprintf(outf, "PKG_MAX_PWR_LVL2=%d ", (unsigned int)(msr >> 32) & 0x7FFF);
@@ -5736,15 +5736,15 @@ static void dump_config_tdp(void)
 	}
 	fprintf(outf, ")\n");
 
-	get_msr(base_cpu, MSR_CONFIG_TDP_CONTROL, &msr);
-	fprintf(outf, "cpu%d: MSR_CONFIG_TDP_CONTROL: 0x%08llx (", base_cpu, msr);
+	get_msr(master_cpu, MSR_CONFIG_TDP_CONTROL, &msr);
+	fprintf(outf, "cpu%d: MSR_CONFIG_TDP_CONTROL: 0x%08llx (", master_cpu, msr);
 	if ((msr) & 0x3)
 		fprintf(outf, "TDP_LEVEL=%d ", (unsigned int)(msr) & 0x3);
 	fprintf(outf, " lock=%d", (unsigned int)(msr >> 31) & 1);
 	fprintf(outf, ")\n");
 
-	get_msr(base_cpu, MSR_TURBO_ACTIVATION_RATIO, &msr);
-	fprintf(outf, "cpu%d: MSR_TURBO_ACTIVATION_RATIO: 0x%08llx (", base_cpu, msr);
+	get_msr(master_cpu, MSR_TURBO_ACTIVATION_RATIO, &msr);
+	fprintf(outf, "cpu%d: MSR_TURBO_ACTIVATION_RATIO: 0x%08llx (", master_cpu, msr);
 	fprintf(outf, "MAX_NON_TURBO_RATIO=%d", (unsigned int)(msr) & 0xFF);
 	fprintf(outf, " lock=%d", (unsigned int)(msr >> 31) & 1);
 	fprintf(outf, ")\n");
@@ -5760,38 +5760,38 @@ void print_irtl(void)
 		return;
 
 	if (platform->supported_cstates & PC3) {
-		get_msr(base_cpu, MSR_PKGC3_IRTL, &msr);
-		fprintf(outf, "cpu%d: MSR_PKGC3_IRTL: 0x%08llx (", base_cpu, msr);
+		get_msr(master_cpu, MSR_PKGC3_IRTL, &msr);
+		fprintf(outf, "cpu%d: MSR_PKGC3_IRTL: 0x%08llx (", master_cpu, msr);
 		fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT", (msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]);
 	}
 
 	if (platform->supported_cstates & PC6) {
-		get_msr(base_cpu, MSR_PKGC6_IRTL, &msr);
-		fprintf(outf, "cpu%d: MSR_PKGC6_IRTL: 0x%08llx (", base_cpu, msr);
+		get_msr(master_cpu, MSR_PKGC6_IRTL, &msr);
+		fprintf(outf, "cpu%d: MSR_PKGC6_IRTL: 0x%08llx (", master_cpu, msr);
 		fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT", (msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]);
 	}
 
 	if (platform->supported_cstates & PC7) {
-		get_msr(base_cpu, MSR_PKGC7_IRTL, &msr);
-		fprintf(outf, "cpu%d: MSR_PKGC7_IRTL: 0x%08llx (", base_cpu, msr);
+		get_msr(master_cpu, MSR_PKGC7_IRTL, &msr);
+		fprintf(outf, "cpu%d: MSR_PKGC7_IRTL: 0x%08llx (", master_cpu, msr);
 		fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT", (msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]);
 	}
 
 	if (platform->supported_cstates & PC8) {
-		get_msr(base_cpu, MSR_PKGC8_IRTL, &msr);
-		fprintf(outf, "cpu%d: MSR_PKGC8_IRTL: 0x%08llx (", base_cpu, msr);
+		get_msr(master_cpu, MSR_PKGC8_IRTL, &msr);
+		fprintf(outf, "cpu%d: MSR_PKGC8_IRTL: 0x%08llx (", master_cpu, msr);
 		fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT", (msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]);
 	}
 
 	if (platform->supported_cstates & PC9) {
-		get_msr(base_cpu, MSR_PKGC9_IRTL, &msr);
-		fprintf(outf, "cpu%d: MSR_PKGC9_IRTL: 0x%08llx (", base_cpu, msr);
+		get_msr(master_cpu, MSR_PKGC9_IRTL, &msr);
+		fprintf(outf, "cpu%d: MSR_PKGC9_IRTL: 0x%08llx (", master_cpu, msr);
 		fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT", (msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]);
 	}
 
 	if (platform->supported_cstates & PC10) {
-		get_msr(base_cpu, MSR_PKGC10_IRTL, &msr);
-		fprintf(outf, "cpu%d: MSR_PKGC10_IRTL: 0x%08llx (", base_cpu, msr);
+		get_msr(master_cpu, MSR_PKGC10_IRTL, &msr);
+		fprintf(outf, "cpu%d: MSR_PKGC10_IRTL: 0x%08llx (", master_cpu, msr);
 		fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT", (msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]);
 	}
 }
@@ -6370,14 +6370,14 @@ void re_initialize(void)
 void set_max_cpu_num(void)
 {
 	FILE *filep;
-	int base_cpu;
+	int current_cpu;
 	unsigned long dummy;
 	char pathname[64];
 
-	base_cpu = sched_getcpu();
-	if (base_cpu < 0)
+	current_cpu = sched_getcpu();
+	if (current_cpu < 0)
 		err(1, "cannot find calling cpu ID");
-	sprintf(pathname, "/sys/devices/system/cpu/cpu%d/topology/thread_siblings", base_cpu);
+	sprintf(pathname, "/sys/devices/system/cpu/cpu%d/topology/thread_siblings", current_cpu);
 
 	filep = fopen_or_die(pathname, "r");
 	topo.max_cpu_num = 0;
@@ -6940,7 +6940,7 @@ int probe_dev_msr(void)
 	struct stat sb;
 	char pathname[32];
 
-	sprintf(pathname, "/dev/msr%d", base_cpu);
+	sprintf(pathname, "/dev/msr%d", master_cpu);
 	return !stat(pathname, &sb);
 }
 
@@ -6949,7 +6949,7 @@ int probe_dev_cpu_msr(void)
 	struct stat sb;
 	char pathname[32];
 
-	sprintf(pathname, "/dev/cpu/%d/msr", base_cpu);
+	sprintf(pathname, "/dev/cpu/%d/msr", master_cpu);
 	return !stat(pathname, &sb);
 }
 
@@ -7026,7 +7026,7 @@ void check_msr_permission(void)
 	failed += check_for_cap_sys_rawio();
 
 	/* test file permissions */
-	sprintf(pathname, use_android_msr_path ? "/dev/msr%d" : "/dev/cpu/%d/msr", base_cpu);
+	sprintf(pathname, use_android_msr_path ? "/dev/msr%d" : "/dev/cpu/%d/msr", master_cpu);
 	if (euidaccess(pathname, R_OK)) {
 		failed++;
 	}
@@ -7055,7 +7055,7 @@ void probe_bclk(void)
 	else
 		return;
 
-	get_msr(base_cpu, MSR_PLATFORM_INFO, &msr);
+	get_msr(master_cpu, MSR_PLATFORM_INFO, &msr);
 	base_ratio = (msr >> 8) & 0xFF;
 
 	base_hz = base_ratio * bclk * 1000000;
@@ -7402,7 +7402,7 @@ static void dump_sysfs_cstate_config(void)
 
 	for (state = 0; state < 10; ++state) {
 
-		sprintf(path, "/sys/devices/system/cpu/cpu%d/cpuidle/state%d/name", base_cpu, state);
+		sprintf(path, "/sys/devices/system/cpu/cpu%d/cpuidle/state%d/name", master_cpu, state);
 		input = fopen(path, "r");
 		if (input == NULL)
 			continue;
@@ -7418,14 +7418,14 @@ static void dump_sysfs_cstate_config(void)
 
 		remove_underbar(name_buf);
 
-		sprintf(path, "/sys/devices/system/cpu/cpu%d/cpuidle/state%d/desc", base_cpu, state);
+		sprintf(path, "/sys/devices/system/cpu/cpu%d/cpuidle/state%d/desc", master_cpu, state);
 		input = fopen(path, "r");
 		if (input == NULL)
 			continue;
 		if (!fgets(desc, sizeof(desc), input))
 			err(1, "%s: failed to read file", path);
 
-		fprintf(outf, "cpu%d: %s: %s", base_cpu, name_buf, desc);
+		fprintf(outf, "cpu%d: %s: %s", master_cpu, name_buf, desc);
 		fclose(input);
 	}
 }
@@ -7438,7 +7438,7 @@ static void dump_sysfs_pstate_config(void)
 	FILE *input;
 	int turbo;
 
-	sprintf(path, "/sys/devices/system/cpu/cpu%d/cpufreq/scaling_driver", base_cpu);
+	sprintf(path, "/sys/devices/system/cpu/cpu%d/cpufreq/scaling_driver", master_cpu);
 	input = fopen(path, "r");
 	if (input == NULL) {
 		fprintf(outf, "NSFOD %s\n", path);
@@ -7448,7 +7448,7 @@ static void dump_sysfs_pstate_config(void)
 		err(1, "%s: failed to read file", path);
 	fclose(input);
 
-	sprintf(path, "/sys/devices/system/cpu/cpu%d/cpufreq/scaling_governor", base_cpu);
+	sprintf(path, "/sys/devices/system/cpu/cpu%d/cpufreq/scaling_governor", master_cpu);
 	input = fopen(path, "r");
 	if (input == NULL) {
 		fprintf(outf, "NSFOD %s\n", path);
@@ -7458,8 +7458,8 @@ static void dump_sysfs_pstate_config(void)
 		err(1, "%s: failed to read file", path);
 	fclose(input);
 
-	fprintf(outf, "cpu%d: cpufreq driver: %s", base_cpu, driver_buf);
-	fprintf(outf, "cpu%d: cpufreq governor: %s", base_cpu, governor_buf);
+	fprintf(outf, "cpu%d: cpufreq driver: %s", master_cpu, driver_buf);
+	fprintf(outf, "cpu%d: cpufreq governor: %s", master_cpu, governor_buf);
 
 	sprintf(path, "/sys/devices/system/cpu/cpufreq/boost");
 	input = fopen(path, "r");
@@ -7721,7 +7721,7 @@ double get_tdp_intel(void)
 	unsigned long long msr;
 
 	if (valid_rapl_msrs & RAPL_PKG_POWER_INFO)
-		if (!get_msr(base_cpu, MSR_PKG_POWER_INFO, &msr))
+		if (!get_msr(master_cpu, MSR_PKG_POWER_INFO, &msr))
 			return ((msr >> 0) & RAPL_POWER_GRANULARITY) * rapl_power_units;
 	return get_quirk_tdp();
 }
@@ -7760,7 +7760,7 @@ void rapl_probe_intel(void)
 		CLR_BIC(BIC_RAM__, &bic_enabled);
 
 	/* units on package 0, verify later other packages match */
-	if (get_msr(base_cpu, MSR_RAPL_POWER_UNIT, &msr))
+	if (get_msr(master_cpu, MSR_RAPL_POWER_UNIT, &msr))
 		return;
 
 	rapl_power_units = 1.0 / (1 << (msr & 0xF));
@@ -7808,7 +7808,7 @@ void rapl_probe_amd(void)
 	if (!valid_rapl_msrs || no_msr)
 		return;
 
-	if (get_msr(base_cpu, MSR_RAPL_PWR_UNIT, &msr))
+	if (get_msr(master_cpu, MSR_RAPL_PWR_UNIT, &msr))
 		return;
 
 	rapl_time_units = ldexp(1.0, -(msr >> 16 & 0xf));
@@ -8119,7 +8119,7 @@ void probe_rapl_msrs(void)
 	if (offset < 0)
 		return;
 
-	ret = get_msr(base_cpu, offset, &msr_value);
+	ret = get_msr(master_cpu, offset, &msr_value);
 	if (ret) {
 		if (debug)
 			fprintf(outf, "Can not read RAPL_PKG_ENERGY MSR(0x%llx)\n", (unsigned long long)offset);
@@ -8204,7 +8204,7 @@ int set_temperature_target(PER_THREAD_PARAMS)
 	if (!platform->has_nhm_msrs || no_msr)
 		goto guess;
 
-	if (get_msr(base_cpu, MSR_IA32_TEMPERATURE_TARGET, &msr))
+	if (get_msr(master_cpu, MSR_IA32_TEMPERATURE_TARGET, &msr))
 		goto guess;
 
 	tcc_default = (msr >> 16) & 0xFF;
@@ -8213,7 +8213,7 @@ int set_temperature_target(PER_THREAD_PARAMS)
 		int bits = platform->tcc_offset_bits;
 		unsigned long long enabled = 0;
 
-		if (bits && !get_msr(base_cpu, MSR_PLATFORM_INFO, &enabled))
+		if (bits && !get_msr(master_cpu, MSR_PLATFORM_INFO, &enabled))
 			enabled = (enabled >> 30) & 1;
 
 		if (bits && enabled) {
@@ -8351,9 +8351,9 @@ void decode_feature_control_msr(void)
 	if (quiet)
 		return;
 
-	if (!get_msr(base_cpu, MSR_IA32_FEAT_CTL, &msr))
+	if (!get_msr(master_cpu, MSR_IA32_FEAT_CTL, &msr))
 		fprintf(outf, "cpu%d: MSR_IA32_FEATURE_CONTROL: 0x%08llx (%sLocked %s)\n",
-			base_cpu, msr, msr & FEAT_CTL_LOCKED ? "" : "UN-", msr & (1 << 18) ? "SGX" : "");
+			master_cpu, msr, msr & FEAT_CTL_LOCKED ? "" : "UN-", msr & (1 << 18) ? "SGX" : "");
 }
 
 void decode_misc_enable_msr(void)
@@ -8366,9 +8366,9 @@ void decode_misc_enable_msr(void)
 	if (!genuine_intel)
 		return;
 
-	if (!get_msr(base_cpu, MSR_IA32_MISC_ENABLE, &msr))
+	if (!get_msr(master_cpu, MSR_IA32_MISC_ENABLE, &msr))
 		fprintf(outf, "cpu%d: MSR_IA32_MISC_ENABLE: 0x%08llx (%sTCC %sEIST %sMWAIT %sPREFETCH %sTURBO)\n",
-			base_cpu, msr,
+			master_cpu, msr,
 			msr & MSR_IA32_MISC_ENABLE_TM1 ? "" : "No-",
 			msr & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP ? "" : "No-",
 			msr & MSR_IA32_MISC_ENABLE_MWAIT ? "" : "No-",
@@ -8385,10 +8385,10 @@ void decode_misc_feature_control(void)
 	if (!platform->has_msr_misc_feature_control)
 		return;
 
-	if (!get_msr(base_cpu, MSR_MISC_FEATURE_CONTROL, &msr))
+	if (!get_msr(master_cpu, MSR_MISC_FEATURE_CONTROL, &msr))
 		fprintf(outf,
 			"cpu%d: MSR_MISC_FEATURE_CONTROL: 0x%08llx (%sL2-Prefetch %sL2-Prefetch-pair %sL1-Prefetch %sL1-IP-Prefetch)\n",
-			base_cpu, msr, msr & (0 << 0) ? "No-" : "", msr & (1 << 0) ? "No-" : "",
+			master_cpu, msr, msr & (0 << 0) ? "No-" : "", msr & (1 << 0) ? "No-" : "",
 			msr & (2 << 0) ? "No-" : "", msr & (3 << 0) ? "No-" : "");
 }
 
@@ -8409,9 +8409,9 @@ void decode_misc_pwr_mgmt_msr(void)
 	if (!platform->has_msr_misc_pwr_mgmt)
 		return;
 
-	if (!get_msr(base_cpu, MSR_MISC_PWR_MGMT, &msr))
+	if (!get_msr(master_cpu, MSR_MISC_PWR_MGMT, &msr))
 		fprintf(outf, "cpu%d: MSR_MISC_PWR_MGMT: 0x%08llx (%sable-EIST_Coordination %sable-EPB %sable-OOB)\n",
-			base_cpu, msr, msr & (1 << 0) ? "DIS" : "EN", msr & (1 << 1) ? "EN" : "DIS", msr & (1 << 8) ? "EN" : "DIS");
+			master_cpu, msr, msr & (1 << 0) ? "DIS" : "EN", msr & (1 << 1) ? "EN" : "DIS", msr & (1 << 8) ? "EN" : "DIS");
 }
 
 /*
@@ -8430,13 +8430,13 @@ void decode_c6_demotion_policy_msr(void)
 	if (!platform->has_msr_c6_demotion_policy_config)
 		return;
 
-	if (!get_msr(base_cpu, MSR_CC6_DEMOTION_POLICY_CONFIG, &msr))
+	if (!get_msr(master_cpu, MSR_CC6_DEMOTION_POLICY_CONFIG, &msr))
 		fprintf(outf, "cpu%d: MSR_CC6_DEMOTION_POLICY_CONFIG: 0x%08llx (%sable-CC6-Demotion)\n",
-			base_cpu, msr, msr & (1 << 0) ? "EN" : "DIS");
+			master_cpu, msr, msr & (1 << 0) ? "EN" : "DIS");
 
-	if (!get_msr(base_cpu, MSR_MC6_DEMOTION_POLICY_CONFIG, &msr))
+	if (!get_msr(master_cpu, MSR_MC6_DEMOTION_POLICY_CONFIG, &msr))
 		fprintf(outf, "cpu%d: MSR_MC6_DEMOTION_POLICY_CONFIG: 0x%08llx (%sable-MC6-Demotion)\n",
-			base_cpu, msr, msr & (1 << 0) ? "EN" : "DIS");
+			master_cpu, msr, msr & (1 << 0) ? "EN" : "DIS");
 }
 
 void print_dev_latency(void)
@@ -8471,7 +8471,7 @@ static int has_perf_instr_count_access(void)
 	if (no_perf)
 		return 0;
 
-	fd = open_perf_counter(base_cpu, PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS, -1, 0);
+	fd = open_perf_counter(master_cpu, PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS, -1, 0);
 	if (fd != -1)
 		close(fd);
 
@@ -8701,7 +8701,7 @@ void rapl_perf_init(void)
 
 			domain_visited[next_domain] = 1;
 
-			if ((cai->flags & RAPL_COUNTER_FLAG_PLATFORM_COUNTER) && (cpu != base_cpu))
+			if ((cai->flags & RAPL_COUNTER_FLAG_PLATFORM_COUNTER) && (cpu != master_cpu))
 				continue;
 
 			struct rapl_counter_info_t *rci = &rapl_counter_info_perdomain[next_domain];
@@ -9348,7 +9348,7 @@ int has_perf_llc_access(void)
 	if (no_perf)
 		return 0;
 
-	fd = open_perf_counter(base_cpu, PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_REFERENCES, -1, PERF_FORMAT_GROUP);
+	fd = open_perf_counter(master_cpu, PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_REFERENCES, -1, PERF_FORMAT_GROUP);
 	if (fd != -1)
 		close(fd);
 
@@ -9710,7 +9710,7 @@ void allocate_counters(struct thread_data **t, struct core_data **c, struct pkg_
 
 	for (i = 0; i < num_cores; i++) {
 		(*c)[i].core_id = -1;
-		(*c)[i].base_cpu = -1;
+		(*c)[i].first_cpu = -1;
 	}
 
 	*p = calloc(topo.num_packages, sizeof(struct pkg_data));
@@ -9719,7 +9719,7 @@ void allocate_counters(struct thread_data **t, struct core_data **c, struct pkg_
 
 	for (i = 0; i < topo.num_packages; i++) {
 		(*p)[i].package_id = i;
-		(*p)[i].base_cpu = -1;
+		(*p)[i].first_cpu = -1;
 	}
 
 	return;
@@ -9753,10 +9753,10 @@ void init_counter(struct thread_data *thread_base, struct core_data *core_base,
 	t->cpu_id = cpu_id;
 	if (!cpu_is_not_allowed(cpu_id)) {
 
-		if (c->base_cpu < 0)
-			c->base_cpu = t->cpu_id;
-		if (pkg_base[pkg_id].base_cpu < 0)
-			pkg_base[pkg_id].base_cpu = t->cpu_id;
+		if (c->first_cpu < 0)
+			c->first_cpu = t->cpu_id;
+		if (pkg_base[pkg_id].first_cpu < 0)
+			pkg_base[pkg_id].first_cpu = t->cpu_id;
 	}
 
 	c->core_id = core_id;
@@ -9803,9 +9803,9 @@ void allocate_irq_buffers(void)
 int update_topo(PER_THREAD_PARAMS)
 {
 	topo.allowed_cpus++;
-	if ((int)t->cpu_id == c->base_cpu)
+	if ((int)t->cpu_id == c->first_cpu)
 		topo.allowed_cores++;
-	if ((int)t->cpu_id == p->base_cpu)
+	if ((int)t->cpu_id == p->first_cpu)
 		topo.allowed_packages++;
 
 	return 0;
@@ -9831,16 +9831,16 @@ void setup_all_buffers(bool startup)
 	topology_update();
 }
 
-void set_base_cpu(void)
+void set_master_cpu(void)
 {
 	int i;
 
 	for (i = 0; i < topo.max_cpu_num + 1; ++i) {
 		if (cpu_is_not_allowed(i))
 			continue;
-		base_cpu = i;
+		master_cpu = i;
 		if (debug > 1)
-			fprintf(outf, "base_cpu = %d\n", base_cpu);
+			fprintf(outf, "master_cpu = %d\n", master_cpu);
 		return;
 	}
 	err(-ENODEV, "No valid cpus found");
@@ -10416,7 +10416,7 @@ next_loop_iter:
 void turbostat_init()
 {
 	setup_all_buffers(true);
-	set_base_cpu();
+	set_master_cpu();
 	check_msr_access();
 	check_perf_access();
 	process_cpuid();
@@ -10434,7 +10434,7 @@ void turbostat_init()
 	for_all_cpus(get_cpu_type, ODD_COUNTERS);
 	for_all_cpus(get_cpu_type, EVEN_COUNTERS);
 
-	if (BIC_IS_ENABLED(BIC_IPC) && has_aperf_access && get_instr_count_fd(base_cpu) != -1)
+	if (BIC_IS_ENABLED(BIC_IPC) && has_aperf_access && get_instr_count_fd(master_cpu) != -1)
 		BIC_PRESENT(BIC_IPC);
 
 	/*
@@ -11251,7 +11251,7 @@ void probe_cpuidle_residency(void)
 
 	for (state = 10; state >= 0; --state) {
 
-		sprintf(path, "/sys/devices/system/cpu/cpu%d/cpuidle/state%d/name", base_cpu, state);
+		sprintf(path, "/sys/devices/system/cpu/cpu%d/cpuidle/state%d/name", master_cpu, state);
 		input = fopen(path, "r");
 		if (input == NULL)
 			continue;
@@ -11300,7 +11300,7 @@ void probe_cpuidle_counts(void)
 
 	for (state = 10; state >= 0; --state) {
 
-		sprintf(path, "/sys/devices/system/cpu/cpu%d/cpuidle/state%d/name", base_cpu, state);
+		sprintf(path, "/sys/devices/system/cpu/cpu%d/cpuidle/state%d/name", master_cpu, state);
 		input = fopen(path, "r");
 		if (input == NULL)
 			continue;
-- 
cgit v1.2.3


From 4aaf7d07a089696e16e1ca70c2d501dc8a2224a4 Mon Sep 17 00:00:00 2001
From: Len Brown <len.brown@intel.com>
Date: Sun, 8 Feb 2026 09:09:25 -0600
Subject: tools/power turbostat: Cleanup package_id

The kernel topology sysfs uses the name "physical_package_id"
because it is allowed to be sparse.

Inside Turbostat, that physical package_id namespace is the only
package_id namespace, so re-name it to simply be "package_id"
in cpus[].

Delete the redundant copy of package_id in pkg_data.
Rely instead on the single copy of the truth in cpus[].

No functional change.

Signed-off-by: Len Brown <len.brown@intel.com>
---
 tools/power/x86/turbostat/turbostat.c | 52 ++++++++++++++++-------------------
 1 file changed, 23 insertions(+), 29 deletions(-)

(limited to 'tools')

diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c
index 2c603990c0d3..74d9f9e21e94 100644
--- a/tools/power/x86/turbostat/turbostat.c
+++ b/tools/power/x86/turbostat/turbostat.c
@@ -2171,7 +2171,6 @@ struct pkg_data {
 	long long sam_mc6_ms;
 	unsigned int sam_mhz;
 	unsigned int sam_act_mhz;
-	unsigned int package_id;
 	struct rapl_counter energy_pkg;	/* MSR_PKG_ENERGY_STATUS */
 	struct rapl_counter energy_dram;	/* MSR_DRAM_ENERGY_STATUS */
 	struct rapl_counter energy_cores;	/* MSR_PP0_ENERGY_STATUS */
@@ -2395,7 +2394,7 @@ struct platform_counters {
 } platform_counters_odd, platform_counters_even;
 
 struct cpu_topology {
-	int physical_package_id;
+	int package_id;
 	int die_id;
 	int l3_id;
 	int logical_cpu_id;
@@ -2662,7 +2661,7 @@ unsigned int cpu_to_domain(const struct perf_counter_info *pc, int cpu)
 		return cpus[cpu].physical_core_id;
 
 	case SCOPE_PACKAGE:
-		return cpus[cpu].physical_package_id;
+		return cpus[cpu].package_id;
 	}
 
 	__builtin_unreachable();
@@ -3199,8 +3198,6 @@ int dump_counters(PER_THREAD_PARAMS)
 	}
 
 	if (p && is_cpu_first_core_in_package(t, p)) {
-		outp += sprintf(outp, "package: %d\n", p->package_id);
-
 		outp += sprintf(outp, "Weighted cores: %016llX\n", p->pkg_wtd_core_c0);
 		outp += sprintf(outp, "Any cores: %016llX\n", p->pkg_any_core_c0);
 		outp += sprintf(outp, "Any GFX: %016llX\n", p->pkg_any_gfxe_c0);
@@ -3366,7 +3363,7 @@ int format_counters(PER_THREAD_PARAMS)
 	} else {
 		if (DO_BIC(BIC_Package)) {
 			if (p)
-				outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), p->package_id);
+				outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), cpus[t->cpu_id].package_id);
 			else
 				outp += sprintf(outp, "%s-", (printed++ ? delim : ""));
 		}
@@ -5180,11 +5177,11 @@ static inline int get_rapl_domain_id(int cpu)
 	int rapl_core_id;
 
 	if (!platform->has_per_core_rapl)
-		return cpus[cpu].physical_package_id;
+		return cpus[cpu].package_id;
 
 	/* Compute the system-wide unique core-id for @cpu */
 	rapl_core_id = cpus[cpu].physical_core_id;
-	rapl_core_id += cpus[cpu].physical_package_id * nr_cores_per_package;
+	rapl_core_id += cpus[cpu].package_id * nr_cores_per_package;
 
 	return rapl_core_id;
 }
@@ -5328,7 +5325,7 @@ int get_counters(PER_THREAD_PARAMS)
 	}
 
 	if (DO_BIC(BIC_UNCORE_MHZ))
-		p->uncore_mhz = get_legacy_uncore_mhz(p->package_id);
+		p->uncore_mhz = get_legacy_uncore_mhz(cpus[t->cpu_id].package_id);
 
 	if (DO_BIC(BIC_GFX_rc6))
 		p->gfx_rc6_ms = gfx_info[GFX_rc6].val_ull;
@@ -5352,9 +5349,9 @@ int get_counters(PER_THREAD_PARAMS)
 		char *path = NULL;
 
 		if (mp->msr_num == 0) {
-			path = find_sysfs_path_by_id(mp->sp, p->package_id);
+			path = find_sysfs_path_by_id(mp->sp, cpus[t->cpu_id].package_id);
 			if (path == NULL) {
-				warnx("%s: package_id %d not found", __func__, p->package_id);
+				warnx("%s: package_id %d not found", __func__, cpus[t->cpu_id].package_id);
 				return -10;
 			}
 		}
@@ -5366,7 +5363,7 @@ int get_counters(PER_THREAD_PARAMS)
 		return -10;
 
 	for (i = 0, pp = sys.pmt_pp; pp; i++, pp = pp->next)
-		p->pmt_counter[i] = pmt_read_counter(pp, p->package_id);
+		p->pmt_counter[i] = pmt_read_counter(pp, cpus[t->cpu_id].package_id);
 
 done:
 	gettimeofday(&t->tv_end, (struct timezone *)NULL);
@@ -6050,7 +6047,7 @@ int cpu_is_first_core_in_package(int cpu)
 	return cpu == parse_int_file("/sys/devices/system/cpu/cpu%d/topology/core_siblings_list", cpu);
 }
 
-int get_physical_package_id(int cpu)
+int get_package_id(int cpu)
 {
 	return parse_int_file("/sys/devices/system/cpu/cpu%d/topology/physical_package_id", cpu);
 }
@@ -6083,7 +6080,7 @@ void set_node_data(void)
 	for (pkg = 0; pkg < topo.num_packages; pkg++) {
 		lnode = 0;
 		for (cpu = 0; cpu <= topo.max_cpu_num; ++cpu) {
-			if (cpus[cpu].physical_package_id != pkg)
+			if (cpus[cpu].package_id != pkg)
 				continue;
 			/* find a cpu with an unset logical_node_id */
 			if (cpus[cpu].logical_node_id != -1)
@@ -6096,7 +6093,7 @@ void set_node_data(void)
 			 * the logical_node_id
 			 */
 			for (cpux = cpu; cpux <= topo.max_cpu_num; cpux++) {
-				if ((cpus[cpux].physical_package_id == pkg) && (cpus[cpux].physical_node_id == node)) {
+				if ((cpus[cpux].package_id == pkg) && (cpus[cpux].physical_node_id == node)) {
 					cpus[cpux].logical_node_id = lnode;
 					cpu_count++;
 				}
@@ -7206,7 +7203,7 @@ static void probe_intel_uncore_frequency_cluster(void)
 	}
 	for (i = uncore_max_id; i >= 0; --i) {
 		int k, l;
-		int package_id, domain_id, cluster_id;
+		int unc_pkg_id, domain_id, cluster_id;
 		char name_buf[16];
 
 		sprintf(path_base, "/sys/devices/system/cpu/intel_uncore_frequency/uncore%02d", i);
@@ -7215,7 +7212,7 @@ static void probe_intel_uncore_frequency_cluster(void)
 			err(1, "%s: %s", __func__, path_base);
 
 		sprintf(path, "%s/package_id", path_base);
-		package_id = read_sysfs_int(path);
+		unc_pkg_id = read_sysfs_int(path);
 
 		sprintf(path, "%s/domain_id", path_base);
 		domain_id = read_sysfs_int(path);
@@ -7238,7 +7235,7 @@ static void probe_intel_uncore_frequency_cluster(void)
 		 */
 		if BIC_IS_ENABLED
 			(BIC_UNCORE_MHZ)
-			    add_counter(0, path, name_buf, 0, SCOPE_PACKAGE, COUNTER_K2M, FORMAT_AVERAGE, 0, package_id);
+			    add_counter(0, path, name_buf, 0, SCOPE_PACKAGE, COUNTER_K2M, FORMAT_AVERAGE, 0, unc_pkg_id);
 
 		if (quiet)
 			continue;
@@ -7247,7 +7244,7 @@ static void probe_intel_uncore_frequency_cluster(void)
 		k = read_sysfs_int(path);
 		sprintf(path, "%s/max_freq_khz", path_base);
 		l = read_sysfs_int(path);
-		fprintf(outf, "Uncore Frequency package%d domain%d cluster%d: %d - %d MHz ", package_id, domain_id, cluster_id, k / 1000, l / 1000);
+		fprintf(outf, "Uncore Frequency package%d domain%d cluster%d: %d - %d MHz ", unc_pkg_id, domain_id, cluster_id, k / 1000, l / 1000);
 
 		sprintf(path, "%s/initial_min_freq_khz", path_base);
 		k = read_sysfs_int(path);
@@ -8952,7 +8949,7 @@ void cstate_perf_init_(bool soft_c1)
 				continue;
 
 			const int core_id = cpus[cpu].physical_core_id;
-			const int pkg_id = cpus[cpu].physical_package_id;
+			const int pkg_id = cpus[cpu].package_id;
 
 			assert(core_id < cores_visited_elems);
 			assert(pkg_id < pkg_visited_elems);
@@ -9612,9 +9609,9 @@ void topology_probe(bool startup)
 		cpus[i].logical_cpu_id = i;
 
 		/* get package information */
-		cpus[i].physical_package_id = get_physical_package_id(i);
-		if (cpus[i].physical_package_id > max_package_id)
-			max_package_id = cpus[i].physical_package_id;
+		cpus[i].package_id = get_package_id(i);
+		if (cpus[i].package_id > max_package_id)
+			max_package_id = cpus[i].package_id;
 
 		/* get die information */
 		cpus[i].die_id = get_die_id(i);
@@ -9685,7 +9682,7 @@ void topology_probe(bool startup)
 			continue;
 		fprintf(outf,
 			"cpu %d pkg %d die %d l3 %d node %d lnode %d core %d thread %d\n",
-			i, cpus[i].physical_package_id, cpus[i].die_id, cpus[i].l3_id,
+			i, cpus[i].package_id, cpus[i].die_id, cpus[i].l3_id,
 			cpus[i].physical_node_id, cpus[i].logical_node_id, cpus[i].physical_core_id, cpus[i].thread_id);
 	}
 
@@ -9717,10 +9714,8 @@ void allocate_counters(struct thread_data **t, struct core_data **c, struct pkg_
 	if (*p == NULL)
 		goto error;
 
-	for (i = 0; i < topo.num_packages; i++) {
-		(*p)[i].package_id = i;
+	for (i = 0; i < topo.num_packages; i++)
 		(*p)[i].first_cpu = -1;
-	}
 
 	return;
 error:
@@ -9734,7 +9729,7 @@ error:
  */
 void init_counter(struct thread_data *thread_base, struct core_data *core_base, struct pkg_data *pkg_base, int cpu_id)
 {
-	int pkg_id = cpus[cpu_id].physical_package_id;
+	int pkg_id = cpus[cpu_id].package_id;
 	int node_id = cpus[cpu_id].logical_node_id;
 	int core_id = cpus[cpu_id].physical_core_id;
 	int thread_id = cpus[cpu_id].thread_id;
@@ -9760,7 +9755,6 @@ void init_counter(struct thread_data *thread_base, struct core_data *core_base,
 	}
 
 	c->core_id = core_id;
-	pkg_base[pkg_id].package_id = pkg_id;
 }
 
 int initialize_counters(int cpu_id)
-- 
cgit v1.2.3


From 5e160646f4dbca7cf9cc09abc31a22931e362b8a Mon Sep 17 00:00:00 2001
From: Len Brown <len.brown@intel.com>
Date: Sun, 8 Feb 2026 09:25:51 -0600
Subject: tools/power turbostat: Rename physical_core_id to core_id

The Linux Kernel topology sysfs is flawed.
core_id is not globally unique, but is per-package.

Turbostat works around this when it needs to, with

        rapl_core_id = cpus[cpu].core_id;
        rapl_core_id += cpus[cpu].package_id * nr_cores_per_package

Otherwise, turbostat handles core_id as subservient to each package.

As there is only one core_id namespace, rename
physical_core_id to simply be core_id.

No functional change.

Signed-off-by: Len Brown <len.brown@intel.com>
---
 tools/power/x86/turbostat/turbostat.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

(limited to 'tools')

diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c
index 74d9f9e21e94..9f93efafbf94 100644
--- a/tools/power/x86/turbostat/turbostat.c
+++ b/tools/power/x86/turbostat/turbostat.c
@@ -2394,13 +2394,13 @@ struct platform_counters {
 } platform_counters_odd, platform_counters_even;
 
 struct cpu_topology {
+	int core_id;
 	int package_id;
 	int die_id;
 	int l3_id;
 	int logical_cpu_id;
 	int physical_node_id;
 	int logical_node_id;	/* 0-based count within the package */
-	int physical_core_id;
 	int thread_id;
 	int type;
 	cpu_set_t *put_ids;	/* Processing Unit/Thread IDs */
@@ -2658,7 +2658,7 @@ unsigned int cpu_to_domain(const struct perf_counter_info *pc, int cpu)
 		return cpu;
 
 	case SCOPE_CORE:
-		return cpus[cpu].physical_core_id;
+		return cpus[cpu].core_id;
 
 	case SCOPE_PACKAGE:
 		return cpus[cpu].package_id;
@@ -5180,7 +5180,7 @@ static inline int get_rapl_domain_id(int cpu)
 		return cpus[cpu].package_id;
 
 	/* Compute the system-wide unique core-id for @cpu */
-	rapl_core_id = cpus[cpu].physical_core_id;
+	rapl_core_id = cpus[cpu].core_id;
 	rapl_core_id += cpus[cpu].package_id * nr_cores_per_package;
 
 	return rapl_core_id;
@@ -6216,7 +6216,7 @@ int get_thread_siblings(struct cpu_topology *thiscpu)
 			if ((map >> shift) & 0x1) {
 				so = shift + offset;
 				sib_core = get_core_id(so);
-				if (sib_core == thiscpu->physical_core_id) {
+				if (sib_core == thiscpu->core_id) {
 					CPU_SET_S(so, size, thiscpu->put_ids);
 					if ((so != cpu) && (cpus[so].thread_id < 0))
 						cpus[so].thread_id = thread_id++;
@@ -8948,7 +8948,7 @@ void cstate_perf_init_(bool soft_c1)
 			if (cpu_is_not_allowed(cpu))
 				continue;
 
-			const int core_id = cpus[cpu].physical_core_id;
+			const int core_id = cpus[cpu].core_id;
 			const int pkg_id = cpus[cpu].package_id;
 
 			assert(core_id < cores_visited_elems);
@@ -9629,9 +9629,9 @@ void topology_probe(bool startup)
 			topo.max_node_num = cpus[i].physical_node_id;
 
 		/* get core information */
-		cpus[i].physical_core_id = get_core_id(i);
-		if (cpus[i].physical_core_id > max_core_id)
-			max_core_id = cpus[i].physical_core_id;
+		cpus[i].core_id = get_core_id(i);
+		if (cpus[i].core_id > max_core_id)
+			max_core_id = cpus[i].core_id;
 
 		/* get thread information */
 		siblings = get_thread_siblings(&cpus[i]);
@@ -9683,7 +9683,7 @@ void topology_probe(bool startup)
 		fprintf(outf,
 			"cpu %d pkg %d die %d l3 %d node %d lnode %d core %d thread %d\n",
 			i, cpus[i].package_id, cpus[i].die_id, cpus[i].l3_id,
-			cpus[i].physical_node_id, cpus[i].logical_node_id, cpus[i].physical_core_id, cpus[i].thread_id);
+			cpus[i].physical_node_id, cpus[i].logical_node_id, cpus[i].core_id, cpus[i].thread_id);
 	}
 
 }
@@ -9731,7 +9731,7 @@ void init_counter(struct thread_data *thread_base, struct core_data *core_base,
 {
 	int pkg_id = cpus[cpu_id].package_id;
 	int node_id = cpus[cpu_id].logical_node_id;
-	int core_id = cpus[cpu_id].physical_core_id;
+	int core_id = cpus[cpu_id].core_id;
 	int thread_id = cpus[cpu_id].thread_id;
 	struct thread_data *t;
 	struct core_data *c;
-- 
cgit v1.2.3


From 439632cf95d369ea05841d45d48fbd1d5968d268 Mon Sep 17 00:00:00 2001
From: Len Brown <len.brown@intel.com>
Date: Sun, 8 Feb 2026 09:34:34 -0600
Subject: tools/power turbostat: Delete core_data.core_id

Delete redundant core_data.core_id.
Use cpus[].core_id as the single copy of the truth.

No functional change.

Signed-off-by: Len Brown <len.brown@intel.com>
---
 tools/power/x86/turbostat/turbostat.c | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

(limited to 'tools')

diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c
index 9f93efafbf94..af6338460320 100644
--- a/tools/power/x86/turbostat/turbostat.c
+++ b/tools/power/x86/turbostat/turbostat.c
@@ -2143,7 +2143,6 @@ struct core_data {
 	unsigned long long mc6_us;	/* duplicate as per-core for now, even though per module */
 	unsigned int core_temp_c;
 	struct rapl_counter core_energy;	/* MSR_CORE_ENERGY_STAT */
-	unsigned int core_id;
 	unsigned long long core_throt_cnt;
 	unsigned long long counter[MAX_ADDED_CORE_COUNTERS];
 	unsigned long long perf_counter[MAX_ADDED_CORE_COUNTERS];
@@ -3178,7 +3177,7 @@ int dump_counters(PER_THREAD_PARAMS)
 	}
 
 	if (c && is_cpu_first_thread_in_core(t, c)) {
-		outp += sprintf(outp, "core: %d\n", c->core_id);
+		outp += sprintf(outp, "core: %d\n", cpus[t->cpu_id].core_id);
 		outp += sprintf(outp, "c3: %016llX\n", c->c3);
 		outp += sprintf(outp, "c6: %016llX\n", c->c6);
 		outp += sprintf(outp, "c7: %016llX\n", c->c7);
@@ -3387,7 +3386,7 @@ int format_counters(PER_THREAD_PARAMS)
 		}
 		if (DO_BIC(BIC_Core)) {
 			if (c)
-				outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), c->core_id);
+				outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), cpus[t->cpu_id].core_id);
 			else
 				outp += sprintf(outp, "%s-", (printed++ ? delim : ""));
 		}
@@ -5284,7 +5283,7 @@ int get_counters(PER_THREAD_PARAMS)
 		return -10;
 
 	for (i = 0, pp = sys.pmt_cp; pp; i++, pp = pp->next)
-		c->pmt_counter[i] = pmt_read_counter(pp, c->core_id);
+		c->pmt_counter[i] = pmt_read_counter(pp, cpus[t->cpu_id].core_id);
 
 	/* collect package counters only for 1st core in package */
 	if (!is_cpu_first_core_in_package(t, p))
@@ -9705,10 +9704,8 @@ void allocate_counters(struct thread_data **t, struct core_data **c, struct pkg_
 	if (*c == NULL)
 		goto error;
 
-	for (i = 0; i < num_cores; i++) {
-		(*c)[i].core_id = -1;
+	for (i = 0; i < num_cores; i++)
 		(*c)[i].first_cpu = -1;
-	}
 
 	*p = calloc(topo.num_packages, sizeof(struct pkg_data));
 	if (*p == NULL)
@@ -9753,8 +9750,6 @@ void init_counter(struct thread_data *thread_base, struct core_data *core_base,
 		if (pkg_base[pkg_id].first_cpu < 0)
 			pkg_base[pkg_id].first_cpu = t->cpu_id;
 	}
-
-	c->core_id = core_id;
 }
 
 int initialize_counters(int cpu_id)
-- 
cgit v1.2.3


From 3cecd4a62d227d86ec6eb5387add0ca3d4d02b27 Mon Sep 17 00:00:00 2001
From: Len Brown <len.brown@intel.com>
Date: Sun, 8 Feb 2026 12:08:26 -0600
Subject: tools/power turbostat: Allocate average counters dynamically

The current static definition of average{} is inconsistent with
the dynamically allocated even{} and odd{} counters.

Allocate average{} counters dynamically.

No functional change.

Signed-off-by: Len Brown <len.brown@intel.com>
---
 tools/power/x86/turbostat/turbostat.c | 243 ++++++++++++++++++----------------
 1 file changed, 131 insertions(+), 112 deletions(-)

(limited to 'tools')

diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c
index af6338460320..b3b697e669a3 100644
--- a/tools/power/x86/turbostat/turbostat.c
+++ b/tools/power/x86/turbostat/turbostat.c
@@ -2383,9 +2383,9 @@ static void free_sys_msr_counters(void)
 }
 
 struct system_summary {
-	struct thread_data threads;
-	struct core_data cores;
-	struct pkg_data packages;
+	struct thread_data *threads;
+	struct core_data *cores;
+	struct pkg_data *packages;
 } average;
 
 struct platform_counters {
@@ -3308,7 +3308,7 @@ int format_counters(PER_THREAD_PARAMS)
 	char *delim = "\t";
 	int printed = 0;
 
-	if (t == &average.threads) {
+	if (t == average.threads) {
 		pplat_cnt = count & 1 ? &platform_counters_odd : &platform_counters_even;
 		++count;
 	}
@@ -3322,7 +3322,7 @@ int format_counters(PER_THREAD_PARAMS)
 		return 0;
 
 	/*if not summary line and --cpu is used */
-	if ((t != &average.threads) && (cpu_subset && !CPU_ISSET_S(t->cpu_id, cpu_subset_size, cpu_subset)))
+	if ((t != average.threads) && (cpu_subset && !CPU_ISSET_S(t->cpu_id, cpu_subset_size, cpu_subset)))
 		return 0;
 
 	if (DO_BIC(BIC_USEC)) {
@@ -3342,7 +3342,7 @@ int format_counters(PER_THREAD_PARAMS)
 	tsc = t->tsc * tsc_tweak;
 
 	/* topo columns, print blanks on 1st (average) line */
-	if (t == &average.threads) {
+	if (t == average.threads) {
 		if (DO_BIC(BIC_Package))
 			outp += sprintf(outp, "%s-", (printed++ ? delim : ""));
 		if (DO_BIC(BIC_Die))
@@ -3723,10 +3723,10 @@ int format_counters(PER_THREAD_PARAMS)
 		}
 	}
 
-	if (DO_BIC(BIC_SysWatt) && (t == &average.threads))
+	if (DO_BIC(BIC_SysWatt) && (t == average.threads))
 		outp += sprintf(outp, fmt8, (printed++ ? delim : ""),
 				rapl_counter_get_value(&pplat_cnt->energy_psys, RAPL_UNIT_WATTS, interval_float));
-	if (DO_BIC(BIC_Sys_J) && (t == &average.threads))
+	if (DO_BIC(BIC_Sys_J) && (t == average.threads))
 		outp += sprintf(outp, fmt8, (printed++ ? delim : ""),
 				rapl_counter_get_value(&pplat_cnt->energy_psys, RAPL_UNIT_JOULES, interval_float));
 
@@ -3766,7 +3766,7 @@ void format_all_counters(PER_THREAD_PARAMS)
 	if ((!count || (header_iterations && !(count % header_iterations))) || !summary_only)
 		print_header("\t");
 
-	format_counters(&average.threads, &average.cores, &average.packages);
+	format_counters(average.threads, average.cores, average.packages);
 
 	count++;
 
@@ -4170,78 +4170,78 @@ int sum_counters(PER_THREAD_PARAMS)
 
 	/* copy un-changing apic_id's */
 	if (DO_BIC(BIC_APIC))
-		average.threads.apic_id = t->apic_id;
+		average.threads->apic_id = t->apic_id;
 	if (DO_BIC(BIC_X2APIC))
-		average.threads.x2apic_id = t->x2apic_id;
+		average.threads->x2apic_id = t->x2apic_id;
 
 	/* remember first tv_begin */
-	if (average.threads.tv_begin.tv_sec == 0)
-		average.threads.tv_begin = procsysfs_tv_begin;
+	if (average.threads->tv_begin.tv_sec == 0)
+		average.threads->tv_begin = procsysfs_tv_begin;
 
 	/* remember last tv_end */
-	average.threads.tv_end = t->tv_end;
+	average.threads->tv_end = t->tv_end;
 
-	average.threads.tsc += t->tsc;
-	average.threads.aperf += t->aperf;
-	average.threads.mperf += t->mperf;
-	average.threads.c1 += t->c1;
+	average.threads->tsc += t->tsc;
+	average.threads->aperf += t->aperf;
+	average.threads->mperf += t->mperf;
+	average.threads->c1 += t->c1;
 
-	average.threads.instr_count += t->instr_count;
+	average.threads->instr_count += t->instr_count;
 
-	average.threads.irq_count += t->irq_count;
-	average.threads.nmi_count += t->nmi_count;
-	average.threads.smi_count += t->smi_count;
+	average.threads->irq_count += t->irq_count;
+	average.threads->nmi_count += t->nmi_count;
+	average.threads->smi_count += t->smi_count;
 
-	average.threads.llc.references += t->llc.references;
-	average.threads.llc.misses += t->llc.misses;
+	average.threads->llc.references += t->llc.references;
+	average.threads->llc.misses += t->llc.misses;
 
-	average.threads.l2.references += t->l2.references;
-	average.threads.l2.hits += t->l2.hits;
+	average.threads->l2.references += t->l2.references;
+	average.threads->l2.hits += t->l2.hits;
 
 	for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) {
 		if (mp->format == FORMAT_RAW)
 			continue;
-		average.threads.counter[i] += t->counter[i];
+		average.threads->counter[i] += t->counter[i];
 	}
 
 	for (i = 0, pp = sys.perf_tp; pp; i++, pp = pp->next) {
 		if (pp->format == FORMAT_RAW)
 			continue;
-		average.threads.perf_counter[i] += t->perf_counter[i];
+		average.threads->perf_counter[i] += t->perf_counter[i];
 	}
 
 	for (i = 0, ppmt = sys.pmt_tp; ppmt; i++, ppmt = ppmt->next) {
-		average.threads.pmt_counter[i] += t->pmt_counter[i];
+		average.threads->pmt_counter[i] += t->pmt_counter[i];
 	}
 
 	/* sum per-core values only for 1st thread in core */
 	if (!is_cpu_first_thread_in_core(t, c))
 		return 0;
 
-	average.cores.c3 += c->c3;
-	average.cores.c6 += c->c6;
-	average.cores.c7 += c->c7;
-	average.cores.mc6_us += c->mc6_us;
+	average.cores->c3 += c->c3;
+	average.cores->c6 += c->c6;
+	average.cores->c7 += c->c7;
+	average.cores->mc6_us += c->mc6_us;
 
-	average.cores.core_temp_c = MAX(average.cores.core_temp_c, c->core_temp_c);
-	average.cores.core_throt_cnt = MAX(average.cores.core_throt_cnt, c->core_throt_cnt);
+	average.cores->core_temp_c = MAX(average.cores->core_temp_c, c->core_temp_c);
+	average.cores->core_throt_cnt = MAX(average.cores->core_throt_cnt, c->core_throt_cnt);
 
-	rapl_counter_accumulate(&average.cores.core_energy, &c->core_energy);
+	rapl_counter_accumulate(&average.cores->core_energy, &c->core_energy);
 
 	for (i = 0, mp = sys.cp; mp; i++, mp = mp->next) {
 		if (mp->format == FORMAT_RAW)
 			continue;
-		average.cores.counter[i] += c->counter[i];
+		average.cores->counter[i] += c->counter[i];
 	}
 
 	for (i = 0, pp = sys.perf_cp; pp; i++, pp = pp->next) {
 		if (pp->format == FORMAT_RAW)
 			continue;
-		average.cores.perf_counter[i] += c->perf_counter[i];
+		average.cores->perf_counter[i] += c->perf_counter[i];
 	}
 
 	for (i = 0, ppmt = sys.pmt_cp; ppmt; i++, ppmt = ppmt->next) {
-		average.cores.pmt_counter[i] += c->pmt_counter[i];
+		average.cores->pmt_counter[i] += c->pmt_counter[i];
 	}
 
 	/* sum per-pkg values only for 1st core in pkg */
@@ -4249,63 +4249,63 @@ int sum_counters(PER_THREAD_PARAMS)
 		return 0;
 
 	if (DO_BIC(BIC_Totl_c0))
-		average.packages.pkg_wtd_core_c0 += p->pkg_wtd_core_c0;
+		average.packages->pkg_wtd_core_c0 += p->pkg_wtd_core_c0;
 	if (DO_BIC(BIC_Any_c0))
-		average.packages.pkg_any_core_c0 += p->pkg_any_core_c0;
+		average.packages->pkg_any_core_c0 += p->pkg_any_core_c0;
 	if (DO_BIC(BIC_GFX_c0))
-		average.packages.pkg_any_gfxe_c0 += p->pkg_any_gfxe_c0;
+		average.packages->pkg_any_gfxe_c0 += p->pkg_any_gfxe_c0;
 	if (DO_BIC(BIC_CPUGFX))
-		average.packages.pkg_both_core_gfxe_c0 += p->pkg_both_core_gfxe_c0;
+		average.packages->pkg_both_core_gfxe_c0 += p->pkg_both_core_gfxe_c0;
 
-	average.packages.pc2 += p->pc2;
+	average.packages->pc2 += p->pc2;
 	if (DO_BIC(BIC_Pkgpc3))
-		average.packages.pc3 += p->pc3;
+		average.packages->pc3 += p->pc3;
 	if (DO_BIC(BIC_Pkgpc6))
-		average.packages.pc6 += p->pc6;
+		average.packages->pc6 += p->pc6;
 	if (DO_BIC(BIC_Pkgpc7))
-		average.packages.pc7 += p->pc7;
-	average.packages.pc8 += p->pc8;
-	average.packages.pc9 += p->pc9;
-	average.packages.pc10 += p->pc10;
-	average.packages.die_c6 += p->die_c6;
+		average.packages->pc7 += p->pc7;
+	average.packages->pc8 += p->pc8;
+	average.packages->pc9 += p->pc9;
+	average.packages->pc10 += p->pc10;
+	average.packages->die_c6 += p->die_c6;
 
-	average.packages.cpu_lpi = p->cpu_lpi;
-	average.packages.sys_lpi = p->sys_lpi;
+	average.packages->cpu_lpi = p->cpu_lpi;
+	average.packages->sys_lpi = p->sys_lpi;
 
-	rapl_counter_accumulate(&average.packages.energy_pkg, &p->energy_pkg);
-	rapl_counter_accumulate(&average.packages.energy_dram, &p->energy_dram);
-	rapl_counter_accumulate(&average.packages.energy_cores, &p->energy_cores);
-	rapl_counter_accumulate(&average.packages.energy_gfx, &p->energy_gfx);
+	rapl_counter_accumulate(&average.packages->energy_pkg, &p->energy_pkg);
+	rapl_counter_accumulate(&average.packages->energy_dram, &p->energy_dram);
+	rapl_counter_accumulate(&average.packages->energy_cores, &p->energy_cores);
+	rapl_counter_accumulate(&average.packages->energy_gfx, &p->energy_gfx);
 
-	average.packages.gfx_rc6_ms = p->gfx_rc6_ms;
-	average.packages.uncore_mhz = p->uncore_mhz;
-	average.packages.gfx_mhz = p->gfx_mhz;
-	average.packages.gfx_act_mhz = p->gfx_act_mhz;
-	average.packages.sam_mc6_ms = p->sam_mc6_ms;
-	average.packages.sam_mhz = p->sam_mhz;
-	average.packages.sam_act_mhz = p->sam_act_mhz;
+	average.packages->gfx_rc6_ms = p->gfx_rc6_ms;
+	average.packages->uncore_mhz = p->uncore_mhz;
+	average.packages->gfx_mhz = p->gfx_mhz;
+	average.packages->gfx_act_mhz = p->gfx_act_mhz;
+	average.packages->sam_mc6_ms = p->sam_mc6_ms;
+	average.packages->sam_mhz = p->sam_mhz;
+	average.packages->sam_act_mhz = p->sam_act_mhz;
 
-	average.packages.pkg_temp_c = MAX(average.packages.pkg_temp_c, p->pkg_temp_c);
+	average.packages->pkg_temp_c = MAX(average.packages->pkg_temp_c, p->pkg_temp_c);
 
-	rapl_counter_accumulate(&average.packages.rapl_pkg_perf_status, &p->rapl_pkg_perf_status);
-	rapl_counter_accumulate(&average.packages.rapl_dram_perf_status, &p->rapl_dram_perf_status);
+	rapl_counter_accumulate(&average.packages->rapl_pkg_perf_status, &p->rapl_pkg_perf_status);
+	rapl_counter_accumulate(&average.packages->rapl_dram_perf_status, &p->rapl_dram_perf_status);
 
 	for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) {
 		if ((mp->format == FORMAT_RAW) && (topo.num_packages == 0))
-			average.packages.counter[i] = p->counter[i];
+			average.packages->counter[i] = p->counter[i];
 		else
-			average.packages.counter[i] += p->counter[i];
+			average.packages->counter[i] += p->counter[i];
 	}
 
 	for (i = 0, pp = sys.perf_pp; pp; i++, pp = pp->next) {
 		if ((pp->format == FORMAT_RAW) && (topo.num_packages == 0))
-			average.packages.perf_counter[i] = p->perf_counter[i];
+			average.packages->perf_counter[i] = p->perf_counter[i];
 		else
-			average.packages.perf_counter[i] += p->perf_counter[i];
+			average.packages->perf_counter[i] += p->perf_counter[i];
 	}
 
 	for (i = 0, ppmt = sys.pmt_pp; ppmt; i++, ppmt = ppmt->next) {
-		average.packages.pmt_counter[i] += p->pmt_counter[i];
+		average.packages->pmt_counter[i] += p->pmt_counter[i];
 	}
 
 	return 0;
@@ -4322,117 +4322,117 @@ void compute_average(PER_THREAD_PARAMS)
 	struct perf_counter_info *pp;
 	struct pmt_counter *ppmt;
 
-	clear_counters(&average.threads, &average.cores, &average.packages);
+	clear_counters(average.threads, average.cores, average.packages);
 
 	for_all_cpus(sum_counters, t, c, p);
 
 	/* Use the global time delta for the average. */
-	average.threads.tv_delta = tv_delta;
+	average.threads->tv_delta = tv_delta;
 
-	average.threads.tsc /= topo.allowed_cpus;
-	average.threads.aperf /= topo.allowed_cpus;
-	average.threads.mperf /= topo.allowed_cpus;
-	average.threads.instr_count /= topo.allowed_cpus;
-	average.threads.c1 /= topo.allowed_cpus;
+	average.threads->tsc /= topo.allowed_cpus;
+	average.threads->aperf /= topo.allowed_cpus;
+	average.threads->mperf /= topo.allowed_cpus;
+	average.threads->instr_count /= topo.allowed_cpus;
+	average.threads->c1 /= topo.allowed_cpus;
 
-	if (average.threads.irq_count > 9999999)
+	if (average.threads->irq_count > 9999999)
 		sums_need_wide_columns = 1;
-	if (average.threads.nmi_count > 9999999)
+	if (average.threads->nmi_count > 9999999)
 		sums_need_wide_columns = 1;
 
-	average.cores.c3 /= topo.allowed_cores;
-	average.cores.c6 /= topo.allowed_cores;
-	average.cores.c7 /= topo.allowed_cores;
-	average.cores.mc6_us /= topo.allowed_cores;
+	average.cores->c3 /= topo.allowed_cores;
+	average.cores->c6 /= topo.allowed_cores;
+	average.cores->c7 /= topo.allowed_cores;
+	average.cores->mc6_us /= topo.allowed_cores;
 
 	if (DO_BIC(BIC_Totl_c0))
-		average.packages.pkg_wtd_core_c0 /= topo.allowed_packages;
+		average.packages->pkg_wtd_core_c0 /= topo.allowed_packages;
 	if (DO_BIC(BIC_Any_c0))
-		average.packages.pkg_any_core_c0 /= topo.allowed_packages;
+		average.packages->pkg_any_core_c0 /= topo.allowed_packages;
 	if (DO_BIC(BIC_GFX_c0))
-		average.packages.pkg_any_gfxe_c0 /= topo.allowed_packages;
+		average.packages->pkg_any_gfxe_c0 /= topo.allowed_packages;
 	if (DO_BIC(BIC_CPUGFX))
-		average.packages.pkg_both_core_gfxe_c0 /= topo.allowed_packages;
+		average.packages->pkg_both_core_gfxe_c0 /= topo.allowed_packages;
 
-	average.packages.pc2 /= topo.allowed_packages;
+	average.packages->pc2 /= topo.allowed_packages;
 	if (DO_BIC(BIC_Pkgpc3))
-		average.packages.pc3 /= topo.allowed_packages;
+		average.packages->pc3 /= topo.allowed_packages;
 	if (DO_BIC(BIC_Pkgpc6))
-		average.packages.pc6 /= topo.allowed_packages;
+		average.packages->pc6 /= topo.allowed_packages;
 	if (DO_BIC(BIC_Pkgpc7))
-		average.packages.pc7 /= topo.allowed_packages;
+		average.packages->pc7 /= topo.allowed_packages;
 
-	average.packages.pc8 /= topo.allowed_packages;
-	average.packages.pc9 /= topo.allowed_packages;
-	average.packages.pc10 /= topo.allowed_packages;
-	average.packages.die_c6 /= topo.allowed_packages;
+	average.packages->pc8 /= topo.allowed_packages;
+	average.packages->pc9 /= topo.allowed_packages;
+	average.packages->pc10 /= topo.allowed_packages;
+	average.packages->die_c6 /= topo.allowed_packages;
 
 	for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) {
 		if (mp->format == FORMAT_RAW)
 			continue;
 		if (mp->type == COUNTER_ITEMS) {
-			if (average.threads.counter[i] > 9999999)
+			if (average.threads->counter[i] > 9999999)
 				sums_need_wide_columns = 1;
 			continue;
 		}
-		average.threads.counter[i] /= topo.allowed_cpus;
+		average.threads->counter[i] /= topo.allowed_cpus;
 	}
 	for (i = 0, mp = sys.cp; mp; i++, mp = mp->next) {
 		if (mp->format == FORMAT_RAW)
 			continue;
 		if (mp->type == COUNTER_ITEMS) {
-			if (average.cores.counter[i] > 9999999)
+			if (average.cores->counter[i] > 9999999)
 				sums_need_wide_columns = 1;
 		}
-		average.cores.counter[i] /= topo.allowed_cores;
+		average.cores->counter[i] /= topo.allowed_cores;
 	}
 	for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) {
 		if (mp->format == FORMAT_RAW)
 			continue;
 		if (mp->type == COUNTER_ITEMS) {
-			if (average.packages.counter[i] > 9999999)
+			if (average.packages->counter[i] > 9999999)
 				sums_need_wide_columns = 1;
 		}
-		average.packages.counter[i] /= topo.allowed_packages;
+		average.packages->counter[i] /= topo.allowed_packages;
 	}
 
 	for (i = 0, pp = sys.perf_tp; pp; i++, pp = pp->next) {
 		if (pp->format == FORMAT_RAW)
 			continue;
 		if (pp->type == COUNTER_ITEMS) {
-			if (average.threads.perf_counter[i] > 9999999)
+			if (average.threads->perf_counter[i] > 9999999)
 				sums_need_wide_columns = 1;
 			continue;
 		}
-		average.threads.perf_counter[i] /= topo.allowed_cpus;
+		average.threads->perf_counter[i] /= topo.allowed_cpus;
 	}
 	for (i = 0, pp = sys.perf_cp; pp; i++, pp = pp->next) {
 		if (pp->format == FORMAT_RAW)
 			continue;
 		if (pp->type == COUNTER_ITEMS) {
-			if (average.cores.perf_counter[i] > 9999999)
+			if (average.cores->perf_counter[i] > 9999999)
 				sums_need_wide_columns = 1;
 		}
-		average.cores.perf_counter[i] /= topo.allowed_cores;
+		average.cores->perf_counter[i] /= topo.allowed_cores;
 	}
 	for (i = 0, pp = sys.perf_pp; pp; i++, pp = pp->next) {
 		if (pp->format == FORMAT_RAW)
 			continue;
 		if (pp->type == COUNTER_ITEMS) {
-			if (average.packages.perf_counter[i] > 9999999)
+			if (average.packages->perf_counter[i] > 9999999)
 				sums_need_wide_columns = 1;
 		}
-		average.packages.perf_counter[i] /= topo.allowed_packages;
+		average.packages->perf_counter[i] /= topo.allowed_packages;
 	}
 
 	for (i = 0, ppmt = sys.pmt_tp; ppmt; i++, ppmt = ppmt->next) {
-		average.threads.pmt_counter[i] /= topo.allowed_cpus;
+		average.threads->pmt_counter[i] /= topo.allowed_cpus;
 	}
 	for (i = 0, ppmt = sys.pmt_cp; ppmt; i++, ppmt = ppmt->next) {
-		average.cores.pmt_counter[i] /= topo.allowed_cores;
+		average.cores->pmt_counter[i] /= topo.allowed_cores;
 	}
 	for (i = 0, ppmt = sys.pmt_pp; ppmt; i++, ppmt = ppmt->next) {
-		average.packages.pmt_counter[i] /= topo.allowed_packages;
+		average.packages->pmt_counter[i] /= topo.allowed_packages;
 	}
 }
 
@@ -9687,6 +9687,24 @@ void topology_probe(bool startup)
 
 }
 
+void allocate_counters_1(struct thread_data **t, struct core_data **c, struct pkg_data **p)
+{
+	*t = calloc(1, sizeof(struct thread_data));
+	if (*t == NULL)
+		goto error;
+
+	*c = calloc(1, sizeof(struct core_data));
+	if (*c == NULL)
+		goto error;
+
+	*p = calloc(1, sizeof(struct pkg_data));
+	if (*p == NULL)
+		goto error;
+
+	return;
+error:
+	err(1, "calloc counters_1");
+}
 void allocate_counters(struct thread_data **t, struct core_data **c, struct pkg_data **p)
 {
 	int i;
@@ -9813,6 +9831,7 @@ void setup_all_buffers(bool startup)
 	topology_probe(startup);
 	allocate_irq_buffers();
 	allocate_fd_percpu();
+	allocate_counters_1(&average.threads, &average.cores, &average.packages);
 	allocate_counters(&thread_even, &core_even, &package_even);
 	allocate_counters(&thread_odd, &core_odd, &package_odd);
 	allocate_output_buffer();
-- 
cgit v1.2.3


From a854684986b7056dd1be8730ccce24d0d5a413e9 Mon Sep 17 00:00:00 2001
From: Len Brown <len.brown@intel.com>
Date: Sun, 8 Feb 2026 12:18:35 -0600
Subject: tools/power turbostat: Unify even/odd/average counter referencing

Update the syntax of accesses to the even and odd counters
to match the average counters.

No functional change.

Signed-off-by: Len Brown <len.brown@intel.com>
---
 tools/power/x86/turbostat/turbostat.c | 82 +++++++++++++++++------------------
 1 file changed, 41 insertions(+), 41 deletions(-)

(limited to 'tools')

diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c
index b3b697e669a3..d37f36852f0f 100644
--- a/tools/power/x86/turbostat/turbostat.c
+++ b/tools/power/x86/turbostat/turbostat.c
@@ -2133,7 +2133,7 @@ struct thread_data {
 	unsigned long long counter[MAX_ADDED_THREAD_COUNTERS];
 	unsigned long long perf_counter[MAX_ADDED_THREAD_COUNTERS];
 	unsigned long long pmt_counter[PMT_MAX_ADDED_THREAD_COUNTERS];
-} *thread_even, *thread_odd;
+};
 
 struct core_data {
 	int first_cpu;
@@ -2147,7 +2147,7 @@ struct core_data {
 	unsigned long long counter[MAX_ADDED_CORE_COUNTERS];
 	unsigned long long perf_counter[MAX_ADDED_CORE_COUNTERS];
 	unsigned long long pmt_counter[PMT_MAX_ADDED_CORE_COUNTERS];
-} *core_even, *core_odd;
+};
 
 struct pkg_data {
 	int first_cpu;
@@ -2182,10 +2182,10 @@ struct pkg_data {
 	unsigned long long counter[MAX_ADDED_PACKAGE_COUNTERS];
 	unsigned long long perf_counter[MAX_ADDED_PACKAGE_COUNTERS];
 	unsigned long long pmt_counter[PMT_MAX_ADDED_PACKAGE_COUNTERS];
-} *package_even, *package_odd;
+};
 
-#define ODD_COUNTERS thread_odd, core_odd, package_odd
-#define EVEN_COUNTERS thread_even, core_even, package_even
+#define ODD_COUNTERS odd.threads, odd.cores, odd.packages
+#define EVEN_COUNTERS even.threads, even.cores, even.packages
 
 #define GET_THREAD(thread_base, thread_no, core_no, node_no, pkg_no)	      \
 	((thread_base) +						      \
@@ -2382,11 +2382,11 @@ static void free_sys_msr_counters(void)
 	sys.added_package_counters -= free_msr_counters_(&sys.pp);
 }
 
-struct system_summary {
+struct counters {
 	struct thread_data *threads;
 	struct core_data *cores;
 	struct pkg_data *packages;
-} average;
+} average, even, odd;
 
 struct platform_counters {
 	struct rapl_counter energy_psys;	/* MSR_PLATFORM_ENERGY_STATUS */
@@ -3142,7 +3142,7 @@ int dump_counters(PER_THREAD_PARAMS)
 {
 	int i;
 	struct msr_counter *mp;
-	struct platform_counters *pplat_cnt = p == package_odd ? &platform_counters_odd : &platform_counters_even;
+	struct platform_counters *pplat_cnt = p == odd.packages ? &platform_counters_odd : &platform_counters_even;
 
 	outp += sprintf(outp, "t %p, c %p, p %p\n", t, c, p);
 
@@ -4800,7 +4800,7 @@ void write_rapl_counter(struct rapl_counter *rc, struct rapl_counter_info_t *rci
 
 int get_rapl_counters(int cpu, unsigned int domain, struct core_data *c, struct pkg_data *p)
 {
-	struct platform_counters *pplat_cnt = p == package_odd ? &platform_counters_odd : &platform_counters_even;
+	struct platform_counters *pplat_cnt = p == odd.packages ? &platform_counters_odd : &platform_counters_even;
 	unsigned long long perf_data[NUM_RAPL_COUNTERS + 1];
 	struct rapl_counter_info_t *rci;
 
@@ -5973,21 +5973,21 @@ void free_all_buffers(void)
 		perf_lcore_set = NULL;
 	}
 
-	free(thread_even);
-	free(core_even);
-	free(package_even);
+	free(even.threads);
+	free(even.cores);
+	free(even.packages);
 
-	thread_even = NULL;
-	core_even = NULL;
-	package_even = NULL;
+	even.threads = NULL;
+	even.cores = NULL;
+	even.packages = NULL;
 
-	free(thread_odd);
-	free(core_odd);
-	free(package_odd);
+	free(odd.threads);
+	free(odd.cores);
+	free(odd.packages);
 
-	thread_odd = NULL;
-	core_odd = NULL;
-	package_odd = NULL;
+	odd.threads = NULL;
+	odd.cores = NULL;
+	odd.packages = NULL;
 
 	free(output_buffer);
 	output_buffer = NULL;
@@ -9687,50 +9687,50 @@ void topology_probe(bool startup)
 
 }
 
-void allocate_counters_1(struct thread_data **t, struct core_data **c, struct pkg_data **p)
+void allocate_counters_1(struct counters *counters)
 {
-	*t = calloc(1, sizeof(struct thread_data));
-	if (*t == NULL)
+	counters->threads = calloc(1, sizeof(struct thread_data));
+	if (counters->threads == NULL)
 		goto error;
 
-	*c = calloc(1, sizeof(struct core_data));
-	if (*c == NULL)
+	counters->cores = calloc(1, sizeof(struct core_data));
+	if (counters->cores == NULL)
 		goto error;
 
-	*p = calloc(1, sizeof(struct pkg_data));
-	if (*p == NULL)
+	counters->packages = calloc(1, sizeof(struct pkg_data));
+	if (counters->packages == NULL)
 		goto error;
 
 	return;
 error:
 	err(1, "calloc counters_1");
 }
-void allocate_counters(struct thread_data **t, struct core_data **c, struct pkg_data **p)
+void allocate_counters(struct counters *counters)
 {
 	int i;
 	int num_cores = topo.cores_per_node * topo.nodes_per_pkg * topo.num_packages;
 	int num_threads = topo.threads_per_core * num_cores;
 
-	*t = calloc(num_threads, sizeof(struct thread_data));
-	if (*t == NULL)
+	counters->threads = calloc(num_threads, sizeof(struct thread_data));
+	if (counters->threads == NULL)
 		goto error;
 
 	for (i = 0; i < num_threads; i++)
-		(*t)[i].cpu_id = -1;
+		(counters->threads)[i].cpu_id = -1;
 
-	*c = calloc(num_cores, sizeof(struct core_data));
-	if (*c == NULL)
+	counters->cores = calloc(num_cores, sizeof(struct core_data));
+	if (counters->cores == NULL)
 		goto error;
 
 	for (i = 0; i < num_cores; i++)
-		(*c)[i].first_cpu = -1;
+		(counters->cores)[i].first_cpu = -1;
 
-	*p = calloc(topo.num_packages, sizeof(struct pkg_data));
-	if (*p == NULL)
+	counters->packages = calloc(topo.num_packages, sizeof(struct pkg_data));
+	if (counters->packages == NULL)
 		goto error;
 
 	for (i = 0; i < topo.num_packages; i++)
-		(*p)[i].first_cpu = -1;
+		(counters->packages)[i].first_cpu = -1;
 
 	return;
 error:
@@ -9831,9 +9831,9 @@ void setup_all_buffers(bool startup)
 	topology_probe(startup);
 	allocate_irq_buffers();
 	allocate_fd_percpu();
-	allocate_counters_1(&average.threads, &average.cores, &average.packages);
-	allocate_counters(&thread_even, &core_even, &package_even);
-	allocate_counters(&thread_odd, &core_odd, &package_odd);
+	allocate_counters_1(&average);
+	allocate_counters(&even);
+	allocate_counters(&odd);
 	allocate_output_buffer();
 	for_all_proc_cpus(initialize_counters);
 	topology_update();
-- 
cgit v1.2.3


From ddf60e38ca048842a34eb6e9d7a0d3e7d459df8d Mon Sep 17 00:00:00 2001
From: Len Brown <len.brown@intel.com>
Date: Thu, 12 Feb 2026 23:06:47 -0600
Subject: tools/power turbostat: Simplify global core_id calculation

Standardize the generation of globally unique core_id's
in a macro, and simplify the related code.

No functional change.

Signed-off-by: Len Brown <len.brown@intel.com>
---
 tools/power/x86/turbostat/turbostat.c | 27 +++++++--------------------
 1 file changed, 7 insertions(+), 20 deletions(-)

(limited to 'tools')

diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c
index d37f36852f0f..6091d0117994 100644
--- a/tools/power/x86/turbostat/turbostat.c
+++ b/tools/power/x86/turbostat/turbostat.c
@@ -2393,7 +2393,7 @@ struct platform_counters {
 } platform_counters_odd, platform_counters_even;
 
 struct cpu_topology {
-	int core_id;
+	int core_id;		/* unique within a package */
 	int package_id;
 	int die_id;
 	int l3_id;
@@ -2409,12 +2409,12 @@ struct topo_params {
 	int num_packages;
 	int num_die;
 	int num_cpus;
-	int num_cores;
+	int num_cores;		/* system wide */
 	int allowed_packages;
 	int allowed_cpus;
 	int allowed_cores;
 	int max_cpu_num;
-	int max_core_id;
+	int max_core_id;	/* within a package */
 	int max_package_id;
 	int max_die_id;
 	int max_l3_id;
@@ -2446,6 +2446,7 @@ int cpu_is_not_allowed(int cpu)
 	return !CPU_ISSET_S(cpu, cpu_allowed_setsize, cpu_allowed_set);
 }
 
+#define GLOBAL_CORE_ID(core_id, pkg_id)	(core_id + pkg_id * (topo.max_core_id + 1))
 /*
  * run func(thread, core, package) in topology order
  * skip non-present cpus
@@ -5157,32 +5158,18 @@ unsigned long pmt_read_counter(struct pmt_counter *ppmt, unsigned int domain_id)
 /* Rapl domain enumeration helpers */
 static inline int get_rapl_num_domains(void)
 {
-	int num_packages = topo.max_package_id + 1;
-	int num_cores_per_package;
-	int num_cores;
-
 	if (!platform->has_per_core_rapl)
-		return num_packages;
-
-	num_cores_per_package = topo.max_core_id + 1;
-	num_cores = num_cores_per_package * num_packages;
+		return topo.num_packages;
 
-	return num_cores;
+	return topo.num_cores;
 }
 
 static inline int get_rapl_domain_id(int cpu)
 {
-	int nr_cores_per_package = topo.max_core_id + 1;
-	int rapl_core_id;
-
 	if (!platform->has_per_core_rapl)
 		return cpus[cpu].package_id;
 
-	/* Compute the system-wide unique core-id for @cpu */
-	rapl_core_id = cpus[cpu].core_id;
-	rapl_core_id += cpus[cpu].package_id * nr_cores_per_package;
-
-	return rapl_core_id;
+	return GLOBAL_CORE_ID(cpu, cpus[cpu].package_id);
 }
 
 /*
-- 
cgit v1.2.3


From 070e92361eeca21407ce90b582698a877ece5694 Mon Sep 17 00:00:00 2001
From: Len Brown <len.brown@intel.com>
Date: Thu, 12 Feb 2026 23:31:02 -0600
Subject: tools/power turbostat: Enhance HT enumeration

Record the cpu_id of each CPU HT sibling -- will need this later.

Rename "thread_id" to "ht_id" to disambiguate that the scope
of this id is within a Core -- it is not a global cpu_id.

No functional change.

Signed-off-by: Len Brown <len.brown@intel.com>
---
 tools/power/x86/turbostat/turbostat.c | 34 +++++++++++++++++++---------------
 1 file changed, 19 insertions(+), 15 deletions(-)

(limited to 'tools')

diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c
index 6091d0117994..12d5f8112c92 100644
--- a/tools/power/x86/turbostat/turbostat.c
+++ b/tools/power/x86/turbostat/turbostat.c
@@ -2400,7 +2400,8 @@ struct cpu_topology {
 	int logical_cpu_id;
 	int physical_node_id;
 	int logical_node_id;	/* 0-based count within the package */
-	int thread_id;
+	int ht_id;		/* unique within a core */
+	int ht_sibling_cpu_id;
 	int type;
 	cpu_set_t *put_ids;	/* Processing Unit/Thread IDs */
 } *cpus;
@@ -6179,8 +6180,8 @@ int get_thread_siblings(struct cpu_topology *thiscpu)
 	int thread_id = 0;
 
 	thiscpu->put_ids = CPU_ALLOC((topo.max_cpu_num + 1));
-	if (thiscpu->thread_id < 0)
-		thiscpu->thread_id = thread_id++;
+	if (thiscpu->ht_id < 0)
+		thiscpu->ht_id = thread_id++;
 	if (!thiscpu->put_ids)
 		return -1;
 
@@ -6204,8 +6205,10 @@ int get_thread_siblings(struct cpu_topology *thiscpu)
 				sib_core = get_core_id(so);
 				if (sib_core == thiscpu->core_id) {
 					CPU_SET_S(so, size, thiscpu->put_ids);
-					if ((so != cpu) && (cpus[so].thread_id < 0))
-						cpus[so].thread_id = thread_id++;
+					if ((so != cpu) && (cpus[so].ht_id < 0)) {
+						cpus[so].ht_id = thread_id++;
+						cpus[cpu].ht_sibling_cpu_id = so;
+					}
 				}
 			}
 		}
@@ -6388,9 +6391,10 @@ int mark_cpu_present(int cpu)
 	return 0;
 }
 
-int init_thread_id(int cpu)
+int init_ht_id(int cpu)
 {
-	cpus[cpu].thread_id = -1;
+	cpus[cpu].ht_id = -1;
+	cpus[cpu].ht_sibling_cpu_id = -1;
 	return 0;
 }
 
@@ -9575,13 +9579,13 @@ void topology_probe(bool startup)
 	cpu_affinity_setsize = CPU_ALLOC_SIZE((topo.max_cpu_num + 1));
 	CPU_ZERO_S(cpu_affinity_setsize, cpu_affinity_set);
 
-	for_all_proc_cpus(init_thread_id);
+	for_all_proc_cpus(init_ht_id);
 
 	for_all_proc_cpus(set_cpu_hybrid_type);
 
 	/*
 	 * For online cpus
-	 * find max_core_id, max_package_id
+	 * find max_core_id, max_package_id, num_cores (per system)
 	 */
 	for (i = 0; i <= topo.max_cpu_num; ++i) {
 		int siblings;
@@ -9623,11 +9627,12 @@ void topology_probe(bool startup)
 		siblings = get_thread_siblings(&cpus[i]);
 		if (siblings > max_siblings)
 			max_siblings = siblings;
-		if (cpus[i].thread_id == 0)
+		if (cpus[i].ht_id == 0)
 			topo.num_cores++;
 	}
-	topo.max_core_id = max_core_id;
+	topo.max_core_id = max_core_id;			/* within a package */
 	topo.max_package_id = max_package_id;
+	topo.num_cores = (max_core_id + 1) * topo.num_packages;	/* per system */
 
 	topo.cores_per_node = max_core_id + 1;
 	if (debug > 1)
@@ -9669,7 +9674,7 @@ void topology_probe(bool startup)
 		fprintf(outf,
 			"cpu %d pkg %d die %d l3 %d node %d lnode %d core %d thread %d\n",
 			i, cpus[i].package_id, cpus[i].die_id, cpus[i].l3_id,
-			cpus[i].physical_node_id, cpus[i].logical_node_id, cpus[i].core_id, cpus[i].thread_id);
+			cpus[i].physical_node_id, cpus[i].logical_node_id, cpus[i].core_id, cpus[i].ht_id);
 	}
 
 }
@@ -9727,14 +9732,13 @@ error:
 /*
  * init_counter()
  *
- * set FIRST_THREAD_IN_CORE and FIRST_CORE_IN_PACKAGE
+ * set t->cpu_id, FIRST_THREAD_IN_CORE and FIRST_CORE_IN_PACKAGE
  */
 void init_counter(struct thread_data *thread_base, struct core_data *core_base, struct pkg_data *pkg_base, int cpu_id)
 {
 	int pkg_id = cpus[cpu_id].package_id;
 	int node_id = cpus[cpu_id].logical_node_id;
 	int core_id = cpus[cpu_id].core_id;
-	int thread_id = cpus[cpu_id].thread_id;
 	struct thread_data *t;
 	struct core_data *c;
 
@@ -9744,7 +9748,7 @@ void init_counter(struct thread_data *thread_base, struct core_data *core_base,
 	if (node_id < 0)
 		node_id = 0;
 
-	t = GET_THREAD(thread_base, thread_id, core_id, node_id, pkg_id);
+	t = GET_THREAD(thread_base, cpus[cpu_id].ht_id, core_id, node_id, pkg_id);
 	c = GET_CORE(core_base, core_id, node_id, pkg_id);
 
 	t->cpu_id = cpu_id;
-- 
cgit v1.2.3


From 6be5c151eb1ebf4d5007b9f60c729f7381255a23 Mon Sep 17 00:00:00 2001
From: Len Brown <len.brown@intel.com>
Date: Thu, 12 Feb 2026 23:52:02 -0600
Subject: tools/power turbostat: Expunge logical_cpu_id

There is only once cpu_id name space -- cpu_id.
Expunge the term logical_cpu_id.

No functional change.

Signed-off-by: Len Brown <len.brown@intel.com>
---
 tools/power/x86/turbostat/turbostat.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c
index 12d5f8112c92..9fdb41410f15 100644
--- a/tools/power/x86/turbostat/turbostat.c
+++ b/tools/power/x86/turbostat/turbostat.c
@@ -2393,11 +2393,11 @@ struct platform_counters {
 } platform_counters_odd, platform_counters_even;
 
 struct cpu_topology {
+	int cpu_id;
 	int core_id;		/* unique within a package */
 	int package_id;
 	int die_id;
 	int l3_id;
-	int logical_cpu_id;
 	int physical_node_id;
 	int logical_node_id;	/* 0-based count within the package */
 	int ht_id;		/* unique within a core */
@@ -6099,7 +6099,7 @@ int get_physical_node_id(struct cpu_topology *thiscpu)
 	char path[80];
 	FILE *filep;
 	int i;
-	int cpu = thiscpu->logical_cpu_id;
+	int cpu = thiscpu->cpu_id;
 
 	for (i = 0; i <= topo.max_cpu_num; i++) {
 		sprintf(path, "/sys/devices/system/cpu/cpu%d/node%i/cpulist", cpu, i);
@@ -6174,7 +6174,7 @@ int get_thread_siblings(struct cpu_topology *thiscpu)
 	FILE *filep;
 	unsigned long map;
 	int so, shift, sib_core;
-	int cpu = thiscpu->logical_cpu_id;
+	int cpu = thiscpu->cpu_id;
 	int offset = topo.max_cpu_num + 1;
 	size_t size;
 	int thread_id = 0;
@@ -9596,7 +9596,7 @@ void topology_probe(bool startup)
 			continue;
 		}
 
-		cpus[i].logical_cpu_id = i;
+		cpus[i].cpu_id = i;
 
 		/* get package information */
 		cpus[i].package_id = get_package_id(i);
-- 
cgit v1.2.3


From 0b82cc331d2e23537670878c62c19ee3f4147a93 Mon Sep 17 00:00:00 2001
From: Ihor Solodrai <ihor.solodrai@linux.dev>
Date: Fri, 13 Feb 2026 10:21:36 -0800
Subject: selftests/sched_ext: Fix rt_stall flaky failure
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The rt_stall test measures the runtime ratio between an EXT and an RT
task pinned to the same CPU, verifying that the deadline server prevents
RT tasks from starving SCHED_EXT tasks. It expects the EXT task to get
at least 4% of CPU time.

The test is flaky because sched_stress_test() calls sleep(RUN_TIME)
immediately after fork(), without waiting for the RT child to complete
its setup (set_affinity + set_sched). If the RT child experiences
scheduling latency before completing setup, that delay eats into the
measurement window: the RT child runs for less than RUN_TIME seconds,
and the EXT task's measured ratio drops below the 4% threshold.

For example, in the failing CI run [1]:
  EXT=0.140s RT=4.750s total=4.890s (expected ~5.0s)
  ratio=2.86% < 4% → FAIL

The 110ms gap (5.0 - 4.89) corresponds to the RT child's setup time
being counted inside the measurement window, during which fewer
deadline server ticks fire for the EXT task.

Fix by using pipes to synchronize: each child signals the parent after
completing its setup, and the parent waits for both signals before
starting sleep(RUN_TIME). This ensures the measurement window only
counts time when both tasks are fully configured and competing.

[1] https://github.com/kernel-patches/bpf/actions/runs/21961895809/job/63442490449

Fixes: be621a76341c ("selftests/sched_ext: Add test for sched_ext dl_server")
Assisted-by: claude-opus-4-6-v1
Signed-off-by: Ihor Solodrai <ihor.solodrai@linux.dev>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 tools/testing/selftests/sched_ext/rt_stall.c | 49 ++++++++++++++++++++++++++++
 1 file changed, 49 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/sched_ext/rt_stall.c b/tools/testing/selftests/sched_ext/rt_stall.c
index 015200f80f6e..ab772e336f86 100644
--- a/tools/testing/selftests/sched_ext/rt_stall.c
+++ b/tools/testing/selftests/sched_ext/rt_stall.c
@@ -23,6 +23,30 @@
 #define CORE_ID		0	/* CPU to pin tasks to */
 #define RUN_TIME        5	/* How long to run the test in seconds */
 
+/* Signal the parent that setup is complete by writing to a pipe */
+static void signal_ready(int fd)
+{
+	char c = 1;
+
+	if (write(fd, &c, 1) != 1) {
+		perror("write to ready pipe");
+		exit(EXIT_FAILURE);
+	}
+	close(fd);
+}
+
+/* Wait for a child to signal readiness via a pipe */
+static void wait_ready(int fd)
+{
+	char c;
+
+	if (read(fd, &c, 1) != 1) {
+		perror("read from ready pipe");
+		exit(EXIT_FAILURE);
+	}
+	close(fd);
+}
+
 /* Simple busy-wait function for test tasks */
 static void process_func(void)
 {
@@ -122,14 +146,24 @@ static bool sched_stress_test(bool is_ext)
 
 	float ext_runtime, rt_runtime, actual_ratio;
 	int ext_pid, rt_pid;
+	int ext_ready[2], rt_ready[2];
 
 	ksft_print_header();
 	ksft_set_plan(1);
 
+	if (pipe(ext_ready) || pipe(rt_ready)) {
+		perror("pipe");
+		ksft_exit_fail();
+	}
+
 	/* Create and set up a EXT task */
 	ext_pid = fork();
 	if (ext_pid == 0) {
+		close(ext_ready[0]);
+		close(rt_ready[0]);
+		close(rt_ready[1]);
 		set_affinity(CORE_ID);
+		signal_ready(ext_ready[1]);
 		process_func();
 		exit(0);
 	} else if (ext_pid < 0) {
@@ -140,8 +174,12 @@ static bool sched_stress_test(bool is_ext)
 	/* Create an RT task */
 	rt_pid = fork();
 	if (rt_pid == 0) {
+		close(ext_ready[0]);
+		close(ext_ready[1]);
+		close(rt_ready[0]);
 		set_affinity(CORE_ID);
 		set_sched(SCHED_FIFO, 50);
+		signal_ready(rt_ready[1]);
 		process_func();
 		exit(0);
 	} else if (rt_pid < 0) {
@@ -149,6 +187,17 @@ static bool sched_stress_test(bool is_ext)
 		ksft_exit_fail();
 	}
 
+	/*
+	 * Wait for both children to complete their setup (affinity and
+	 * scheduling policy) before starting the measurement window.
+	 * This prevents flaky failures caused by the RT child's setup
+	 * time eating into the measurement period.
+	 */
+	close(ext_ready[1]);
+	close(rt_ready[1]);
+	wait_ready(ext_ready[0]);
+	wait_ready(rt_ready[0]);
+
 	/* Let the processes run for the specified time */
 	sleep(RUN_TIME);
 
-- 
cgit v1.2.3


From a2b4d0f8bf07a4a4fe8a526e10c45e593c7a3bf0 Mon Sep 17 00:00:00 2001
From: Len Brown <len.brown@intel.com>
Date: Thu, 12 Feb 2026 14:04:25 -0600
Subject: tools/power turbostat: Favor cpu# over core#

Turbostat collects statistics and outputs results in "topology order",
which means it prioritizes the core# over the cpu#.
The strategy is to minimize wakesups to a core -- which is
important when measuring an idle system.

But core order is problematic, because Linux core#'s are physical
(within each package), and thus subject to APIC-id scrambling
that may be done by the hardware or the BIOS.

As a result users may be are faced with rows in a confusing order:

sudo turbostat -q --show topology,Busy%,CPU%c6,UncMHz sleep 1
Core	CPU	Busy%	CPU%c6	UncMHz
-	-	1.25	72.18	3400
0	4	7.74	0.00
1	5	1.77	88.59
2	6	0.48	96.73
3	7	0.21	98.34
4	8	0.14	96.85
5	9	0.26	97.55
6	10	0.44	97.24
7	11	0.12	96.18
8	0	5.41	0.31	3400
8	1	0.19
12	2	0.41	0.22
12	3	0.08
32	12	0.04	99.21
33	13	0.25	94.92

Abandon the legacy "core# topology order" in favor of simply
ordering by cpu#, with a special case to handle HT siblings
that may not have adjacent cpu#'s.

sudo ./turbostat -q --show topology,Busy%,CPU%c6,UncMHz sleep 1
1.003001 sec
Core	CPU	Busy%	CPU%c6	UncMHz
-	-	1.38	80.55	1600
8	0	10.94	0.00	1600
8	1	0.53
12	2	2.90	0.45
12	3	0.11
0	4	1.96	91.20
1	5	0.97	96.40
2	6	0.24	94.72
3	7	0.31	98.01
4	8	0.20	98.20
5	9	0.62	96.00
6	10	0.06	98.15
7	11	0.12	99.31
32	12	0.04	99.07
33	13	0.27	95.09

The result is that cpu#'s now take precedence over core#'s.

Signed-off-by: Len Brown <len.brown@intel.com>
---
 tools/power/x86/turbostat/turbostat.c | 123 +++++++++++++++++++---------------
 1 file changed, 69 insertions(+), 54 deletions(-)

(limited to 'tools')

diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c
index 9fdb41410f15..5239fd971b66 100644
--- a/tools/power/x86/turbostat/turbostat.c
+++ b/tools/power/x86/turbostat/turbostat.c
@@ -2187,20 +2187,6 @@ struct pkg_data {
 #define ODD_COUNTERS odd.threads, odd.cores, odd.packages
 #define EVEN_COUNTERS even.threads, even.cores, even.packages
 
-#define GET_THREAD(thread_base, thread_no, core_no, node_no, pkg_no)	      \
-	((thread_base) +						      \
-	 ((pkg_no) *							      \
-	  topo.nodes_per_pkg * topo.cores_per_node * topo.threads_per_core) + \
-	 ((node_no) * topo.cores_per_node * topo.threads_per_core) +	      \
-	 ((core_no) * topo.threads_per_core) +				      \
-	 (thread_no))
-
-#define GET_CORE(core_base, core_no, node_no, pkg_no)			\
-	((core_base) +							\
-	 ((pkg_no) *  topo.nodes_per_pkg * topo.cores_per_node) +	\
-	 ((node_no) * topo.cores_per_node) +				\
-	 (core_no))
-
 /*
  * The accumulated sum of MSR is defined as a monotonic
  * increasing MSR, it will be accumulated periodically,
@@ -2392,6 +2378,8 @@ struct platform_counters {
 	struct rapl_counter energy_psys;	/* MSR_PLATFORM_ENERGY_STATUS */
 } platform_counters_odd, platform_counters_even;
 
+#define	MAX_HT_ID	3	/* support SMT-4 */
+
 struct cpu_topology {
 	int cpu_id;
 	int core_id;		/* unique within a package */
@@ -2401,7 +2389,7 @@ struct cpu_topology {
 	int physical_node_id;
 	int logical_node_id;	/* 0-based count within the package */
 	int ht_id;		/* unique within a core */
-	int ht_sibling_cpu_id;
+	int ht_sibling_cpu_id[MAX_HT_ID + 1];
 	int type;
 	cpu_set_t *put_ids;	/* Processing Unit/Thread IDs */
 } *cpus;
@@ -2458,27 +2446,38 @@ int cpu_is_not_allowed(int cpu)
 int for_all_cpus(int (func) (struct thread_data *, struct core_data *, struct pkg_data *),
 		 struct thread_data *thread_base, struct core_data *core_base, struct pkg_data *pkg_base)
 {
-	int retval, pkg_no, core_no, thread_no, node_no;
+	int cpu, retval;
 
 	retval = 0;
 
-	for (pkg_no = 0; pkg_no < topo.num_packages; ++pkg_no) {
-		for (node_no = 0; node_no < topo.nodes_per_pkg; node_no++) {
-			for (core_no = 0; core_no < topo.cores_per_node; ++core_no) {
-				for (thread_no = 0; thread_no < topo.threads_per_core; ++thread_no) {
-					struct thread_data *t;
-					struct core_data *c;
+	for (cpu = 0; cpu <= topo.max_cpu_num; ++cpu) {
+		struct thread_data *t;
+		struct core_data *c;
+		struct pkg_data *p;
+
+		int pkg_id = cpus[cpu].package_id;
+
+		if (cpu_is_not_allowed(cpu))
+			continue;
+
+		if (cpus[cpu].ht_id > 0)	/* skip HT sibling */
+			continue;
 
-					t = GET_THREAD(thread_base, thread_no, core_no, node_no, pkg_no);
+		t = &thread_base[cpu];
+		c = &core_base[GLOBAL_CORE_ID(cpus[cpu].core_id, pkg_id)];
+		p = &pkg_base[pkg_id];
 
-					if (cpu_is_not_allowed(t->cpu_id))
-						continue;
+		retval |= func(t, c, p);
 
-					c = GET_CORE(core_base, core_no, node_no, pkg_no);
+		/* Handle HT sibling now */
+		int i;
 
-					retval |= func(t, c, &pkg_base[pkg_no]);
-				}
-			}
+		for (i = MAX_HT_ID; i > 0; --i) {	/* ht_id 0 is self */
+			if (cpus[cpu].ht_sibling_cpu_id[i] <= 0)
+				continue;
+			t = &thread_base[cpus[cpu].ht_sibling_cpu_id[i]];
+
+			retval |= func(t, c, p);
 		}
 	}
 	return retval;
@@ -6168,7 +6167,7 @@ static int parse_cpu_str(char *cpu_str, cpu_set_t *cpu_set, int cpu_set_size)
 	return 0;
 }
 
-int get_thread_siblings(struct cpu_topology *thiscpu)
+int set_thread_siblings(struct cpu_topology *thiscpu)
 {
 	char path[80], character;
 	FILE *filep;
@@ -6206,8 +6205,11 @@ int get_thread_siblings(struct cpu_topology *thiscpu)
 				if (sib_core == thiscpu->core_id) {
 					CPU_SET_S(so, size, thiscpu->put_ids);
 					if ((so != cpu) && (cpus[so].ht_id < 0)) {
-						cpus[so].ht_id = thread_id++;
-						cpus[cpu].ht_sibling_cpu_id = so;
+						cpus[so].ht_id = thread_id;
+						cpus[cpu].ht_sibling_cpu_id[thread_id] = so;
+						if (debug)
+							fprintf(stderr, "%s: cpu%d.ht_sibling_cpu_id[%d] = %d\n", __func__, cpu, thread_id, so);
+						thread_id += 1;
 					}
 				}
 			}
@@ -6229,30 +6231,40 @@ int for_all_cpus_2(int (func) (struct thread_data *, struct core_data *,
 		   struct core_data *core_base, struct pkg_data *pkg_base,
 		   struct thread_data *thread_base2, struct core_data *core_base2, struct pkg_data *pkg_base2)
 {
-	int retval, pkg_no, node_no, core_no, thread_no;
+	int cpu, retval;
 
 	retval = 0;
 
-	for (pkg_no = 0; pkg_no < topo.num_packages; ++pkg_no) {
-		for (node_no = 0; node_no < topo.nodes_per_pkg; ++node_no) {
-			for (core_no = 0; core_no < topo.cores_per_node; ++core_no) {
-				for (thread_no = 0; thread_no < topo.threads_per_core; ++thread_no) {
-					struct thread_data *t, *t2;
-					struct core_data *c, *c2;
+	for (cpu = 0; cpu <= topo.max_cpu_num; ++cpu) {
+		struct thread_data *t, *t2;
+		struct core_data *c, *c2;
+		struct pkg_data *p, *p2;
 
-					t = GET_THREAD(thread_base, thread_no, core_no, node_no, pkg_no);
+		if (cpu_is_not_allowed(cpu))
+			continue;
 
-					if (cpu_is_not_allowed(t->cpu_id))
-						continue;
+		if (cpus[cpu].ht_id > 0)	/* skip HT sibling */
+			continue;
 
-					t2 = GET_THREAD(thread_base2, thread_no, core_no, node_no, pkg_no);
+		t = &thread_base[cpu];
+		t2 = &thread_base2[cpu];
+		c = &core_base[GLOBAL_CORE_ID(cpus[cpu].core_id, cpus[cpu].package_id)];
+		c2 = &core_base2[GLOBAL_CORE_ID(cpus[cpu].core_id, cpus[cpu].package_id)];
+		p = &pkg_base[cpus[cpu].package_id];
+		p2 = &pkg_base2[cpus[cpu].package_id];
 
-					c = GET_CORE(core_base, core_no, node_no, pkg_no);
-					c2 = GET_CORE(core_base2, core_no, node_no, pkg_no);
+		retval |= func(t, c, p, t2, c2, p2);
 
-					retval |= func(t, c, &pkg_base[pkg_no], t2, c2, &pkg_base2[pkg_no]);
-				}
-			}
+		/* Handle HT sibling now */
+		int i;
+
+		for (i = MAX_HT_ID; i > 0; --i) {	/* ht_id 0 is self */
+			if (cpus[cpu].ht_sibling_cpu_id[i] <= 0)
+				continue;
+			t = &thread_base[cpus[cpu].ht_sibling_cpu_id[i]];
+			t2 = &thread_base2[cpus[cpu].ht_sibling_cpu_id[i]];
+
+			retval |= func(t, c, p, t2, c2, p2);
 		}
 	}
 	return retval;
@@ -6391,10 +6403,13 @@ int mark_cpu_present(int cpu)
 	return 0;
 }
 
-int init_ht_id(int cpu)
+int clear_ht_id(int cpu)
 {
+	int i;
+
 	cpus[cpu].ht_id = -1;
-	cpus[cpu].ht_sibling_cpu_id = -1;
+	for (i = 0; i <= MAX_HT_ID; ++i)
+		cpus[cpu].ht_sibling_cpu_id[i] = -1;
 	return 0;
 }
 
@@ -9579,7 +9594,7 @@ void topology_probe(bool startup)
 	cpu_affinity_setsize = CPU_ALLOC_SIZE((topo.max_cpu_num + 1));
 	CPU_ZERO_S(cpu_affinity_setsize, cpu_affinity_set);
 
-	for_all_proc_cpus(init_ht_id);
+	for_all_proc_cpus(clear_ht_id);
 
 	for_all_proc_cpus(set_cpu_hybrid_type);
 
@@ -9624,7 +9639,7 @@ void topology_probe(bool startup)
 			max_core_id = cpus[i].core_id;
 
 		/* get thread information */
-		siblings = get_thread_siblings(&cpus[i]);
+		siblings = set_thread_siblings(&cpus[i]);
 		if (siblings > max_siblings)
 			max_siblings = siblings;
 		if (cpus[i].ht_id == 0)
@@ -9748,8 +9763,8 @@ void init_counter(struct thread_data *thread_base, struct core_data *core_base,
 	if (node_id < 0)
 		node_id = 0;
 
-	t = GET_THREAD(thread_base, cpus[cpu_id].ht_id, core_id, node_id, pkg_id);
-	c = GET_CORE(core_base, core_id, node_id, pkg_id);
+	t = &thread_base[cpu_id];
+	c = &core_base[GLOBAL_CORE_ID(core_id, pkg_id)];
 
 	t->cpu_id = cpu_id;
 	if (!cpu_is_not_allowed(cpu_id)) {
-- 
cgit v1.2.3


From 8e5c0cc326f2e95a71bb6e6063e65caa60c8f951 Mon Sep 17 00:00:00 2001
From: Kaushlendra Kumar <kaushlendra.kumar@intel.com>
Date: Mon, 8 Dec 2025 08:38:04 +0530
Subject: tools/power turbostat: Use strtoul() for iteration parsing

Replace strtod() with strtoul() and check errno for -n/-N options, since
num_iterations and header_iterations are unsigned long counters. Reject
zero and conversion errors; negative inputs wrap to large positive values
per standard unsigned semantics.

Signed-off-by: Kaushlendra Kumar <kaushlendra.kumar@intel.com>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 tools/power/x86/turbostat/turbostat.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

(limited to 'tools')

diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c
index 5239fd971b66..b8cbbff95e84 100644
--- a/tools/power/x86/turbostat/turbostat.c
+++ b/tools/power/x86/turbostat/turbostat.c
@@ -11536,18 +11536,20 @@ void cmdline(int argc, char **argv)
 			/* Parsed earlier */
 			break;
 		case 'n':
-			num_iterations = strtod(optarg, NULL);
+			num_iterations = strtoul(optarg, NULL, 0);
+			errno = 0;
 
-			if (num_iterations <= 0) {
-				fprintf(outf, "iterations %d should be positive number\n", num_iterations);
+			if (errno || num_iterations == 0) {
+				fprintf(outf, "invalid iteration count: %s\n", optarg);
 				exit(2);
 			}
 			break;
 		case 'N':
-			header_iterations = strtod(optarg, NULL);
+			header_iterations = strtoul(optarg, NULL, 0);
+			errno = 0;
 
-			if (header_iterations <= 0) {
-				fprintf(outf, "iterations %d should be positive number\n", header_iterations);
+			if (errno || header_iterations == 0) {
+				fprintf(outf, "invalid header iteration count: %s\n", optarg);
 				exit(2);
 			}
 			break;
-- 
cgit v1.2.3


From 96718ad296af4a6d984b3a09276b165ab6a3b0c8 Mon Sep 17 00:00:00 2001
From: Len Brown <len.brown@intel.com>
Date: Fri, 13 Feb 2026 13:26:12 -0600
Subject: tools/power turbostat: Fix and document --header_iterations

The "header_iterations" option is commonly used to de-clutter
the screen of redundant header label rows in an interactive session:
Eg. every 10 rows:

$ sudo turbostat --header_iterations 10 -S -q -i 1

But --header_iterations was missing from turbostat.8

Also turbostat help advertised the "-N" short option
that did not actually work:

$ turbostat --help
  -N, --header_iterations num
		print header every num iterations

Repair "-N"
Document "--header_iterations" on turbostat.8

Signed-off-by: Len Brown <len.brown@intel.com>
---
 tools/power/x86/turbostat/turbostat.8 |  4 +++-
 tools/power/x86/turbostat/turbostat.c | 20 +++++++++-----------
 2 files changed, 12 insertions(+), 12 deletions(-)

(limited to 'tools')

diff --git a/tools/power/x86/turbostat/turbostat.8 b/tools/power/x86/turbostat/turbostat.8
index b4ef04200219..344ede2f8546 100644
--- a/tools/power/x86/turbostat/turbostat.8
+++ b/tools/power/x86/turbostat/turbostat.8
@@ -111,12 +111,14 @@ The column name "all" can be used to enable all disabled-by-default built-in cou
 .PP
 \fB--no-perf\fP Disable all the uses of the perf API.
 .PP
-\fB--force\fPForce turbostat to run on an unsupported platform (minimal defaults).
+\fB--force\fP Force turbostat to run on an unsupported platform (minimal defaults).
 .PP
 \fB--interval seconds\fP overrides the default 5.0 second measurement interval.
 .PP
 \fB--num_iterations num\fP number of the measurement iterations.
 .PP
+\fB--header_iterations num\fP print header every num iterations.
+.PP
 \fB--out output_file\fP turbostat output is written to the specified output_file.
 The file is truncated if it already exists, and it is created if it does not exist.
 .PP
diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c
index b8cbbff95e84..1ce175841583 100644
--- a/tools/power/x86/turbostat/turbostat.c
+++ b/tools/power/x86/turbostat/turbostat.c
@@ -11443,7 +11443,7 @@ void cmdline(int argc, char **argv)
 	 * Parse some options early, because they may make other options invalid,
 	 * like adding the MSR counter with --add and at the same time using --no-msr.
 	 */
-	while ((opt = getopt_long_only(argc, argv, "+MPn:", long_options, &option_index)) != -1) {
+	while ((opt = getopt_long_only(argc, argv, "+:MP", long_options, &option_index)) != -1) {
 		switch (opt) {
 		case 'M':
 			no_msr = 1;
@@ -11457,7 +11457,7 @@ void cmdline(int argc, char **argv)
 	}
 	optind = 0;
 
-	while ((opt = getopt_long_only(argc, argv, "+C:c:Dde:hi:Jn:o:qMST:v", long_options, &option_index)) != -1) {
+	while ((opt = getopt_long_only(argc, argv, "+C:c:Dde:hi:Jn:N:o:qMST:v", long_options, &option_index)) != -1) {
 		switch (opt) {
 		case 'a':
 			parse_add_command(optarg);
@@ -11500,7 +11500,6 @@ void cmdline(int argc, char **argv)
 			}
 			break;
 		case 'h':
-		default:
 			help();
 			exit(1);
 		case 'i':
@@ -11539,19 +11538,15 @@ void cmdline(int argc, char **argv)
 			num_iterations = strtoul(optarg, NULL, 0);
 			errno = 0;
 
-			if (errno || num_iterations == 0) {
-				fprintf(outf, "invalid iteration count: %s\n", optarg);
-				exit(2);
-			}
+			if (errno || num_iterations == 0)
+				errx(-1, "invalid iteration count: %s", optarg);
 			break;
 		case 'N':
 			header_iterations = strtoul(optarg, NULL, 0);
 			errno = 0;
 
-			if (errno || header_iterations == 0) {
-				fprintf(outf, "invalid header iteration count: %s\n", optarg);
-				exit(2);
-			}
+			if (errno || header_iterations == 0)
+				errx(-1, "invalid header iteration count: %s", optarg);
 			break;
 		case 's':
 			/*
@@ -11574,6 +11569,9 @@ void cmdline(int argc, char **argv)
 			print_version();
 			exit(0);
 			break;
+		default:
+			help();
+			exit(1);
 		}
 	}
 }
-- 
cgit v1.2.3


From 51496091dd37b405e6e399a9638da3f1da3f4c64 Mon Sep 17 00:00:00 2001
From: Len Brown <len.brown@intel.com>
Date: Fri, 13 Feb 2026 13:38:02 -0600
Subject: tools/power turbostat: version 2026.02.14

Since release 2025.12.02:

Add L2 statistics columns for recent Intel processors:
	L2MRPS = L2 Cache M-References Per Second
	L2%hit = L2 Cache Hit %

Sort work and output by cpu# rather than core#

This commit:
	Version number and white space (indent -l160)
	No functional change.

Signed-off-by: Len Brown <len.brown@intel.com>
---
 tools/power/x86/turbostat/turbostat.c | 57 +++++++++++++----------------------
 1 file changed, 21 insertions(+), 36 deletions(-)

(limited to 'tools')

diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c
index 1ce175841583..1aace9b3269e 100644
--- a/tools/power/x86/turbostat/turbostat.c
+++ b/tools/power/x86/turbostat/turbostat.c
@@ -3,7 +3,7 @@
  * turbostat -- show CPU frequency and C-state residency
  * on modern Intel and AMD processors.
  *
- * Copyright (c) 2025 Intel Corporation.
+ * Copyright (c) 2010 - 2026 Intel Corporation
  * Len Brown <len.brown@intel.com>
  */
 
@@ -2735,8 +2735,7 @@ void help(void)
 		"		sets the Thermal Control Circuit temperature in\n"
 		"		  degrees Celsius\n"
 		"  -h, --help\n"
-		"		print this help message\n"
-		"  -v, --version\n\t\tprint version information\n\nFor more help, run \"man turbostat\"\n");
+		"		print this help message\n  -v, --version\n\t\tprint version information\n\nFor more help, run \"man turbostat\"\n");
 }
 
 /*
@@ -3671,11 +3670,9 @@ int format_counters(PER_THREAD_PARAMS)
 	if (DO_BIC(BIC_RAM_J))
 		outp += sprintf(outp, fmt8, (printed++ ? delim : ""), rapl_counter_get_value(&p->energy_dram, RAPL_UNIT_JOULES, interval_float));
 	if (DO_BIC(BIC_PKG__))
-		outp +=
-		    sprintf(outp, fmt8, (printed++ ? delim : ""), rapl_counter_get_value(&p->rapl_pkg_perf_status, RAPL_UNIT_WATTS, interval_float));
+		outp += sprintf(outp, fmt8, (printed++ ? delim : ""), rapl_counter_get_value(&p->rapl_pkg_perf_status, RAPL_UNIT_WATTS, interval_float));
 	if (DO_BIC(BIC_RAM__))
-		outp +=
-		    sprintf(outp, fmt8, (printed++ ? delim : ""), rapl_counter_get_value(&p->rapl_dram_perf_status, RAPL_UNIT_WATTS, interval_float));
+		outp += sprintf(outp, fmt8, (printed++ ? delim : ""), rapl_counter_get_value(&p->rapl_dram_perf_status, RAPL_UNIT_WATTS, interval_float));
 	/* UncMHz */
 	if (DO_BIC(BIC_UNCORE_MHZ))
 		outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), p->uncore_mhz);
@@ -3725,11 +3722,9 @@ int format_counters(PER_THREAD_PARAMS)
 	}
 
 	if (DO_BIC(BIC_SysWatt) && (t == average.threads))
-		outp += sprintf(outp, fmt8, (printed++ ? delim : ""),
-				rapl_counter_get_value(&pplat_cnt->energy_psys, RAPL_UNIT_WATTS, interval_float));
+		outp += sprintf(outp, fmt8, (printed++ ? delim : ""), rapl_counter_get_value(&pplat_cnt->energy_psys, RAPL_UNIT_WATTS, interval_float));
 	if (DO_BIC(BIC_Sys_J) && (t == average.threads))
-		outp += sprintf(outp, fmt8, (printed++ ? delim : ""),
-				rapl_counter_get_value(&pplat_cnt->energy_psys, RAPL_UNIT_JOULES, interval_float));
+		outp += sprintf(outp, fmt8, (printed++ ? delim : ""), rapl_counter_get_value(&pplat_cnt->energy_psys, RAPL_UNIT_JOULES, interval_float));
 
 done:
 	if (*(outp - 1) != '\n')
@@ -3942,7 +3937,7 @@ int delta_thread(struct thread_data *new, struct thread_data *old, struct core_d
 	/* check for TSC < 1 Mcycles over interval */
 	if (old->tsc < (1000 * 1000))
 		errx(-3, "Insanely slow TSC rate, TSC stops in idle?\n"
-		     "You can disable all c-states by booting with \"idle=poll\"\n" "or just the deep ones with \"processor.max_cstate=1\"");
+		     "You can disable all c-states by booting with \"idle=poll\"\nor just the deep ones with \"processor.max_cstate=1\"");
 
 	old->c1 = new->c1 - old->c1;
 
@@ -5546,8 +5541,7 @@ static void dump_turbo_ratio_limits(int trl_msr_offset)
 	int shift;
 
 	get_msr(master_cpu, trl_msr_offset, &msr);
-	fprintf(outf, "cpu%d: MSR_%sTURBO_RATIO_LIMIT: 0x%08llx\n",
-		master_cpu, trl_msr_offset == MSR_SECONDARY_TURBO_RATIO_LIMIT ? "SECONDARY_" : "", msr);
+	fprintf(outf, "cpu%d: MSR_%sTURBO_RATIO_LIMIT: 0x%08llx\n", master_cpu, trl_msr_offset == MSR_SECONDARY_TURBO_RATIO_LIMIT ? "SECONDARY_" : "", msr);
 
 	if (platform->trl_msrs & TRL_CORECOUNT) {
 		get_msr(master_cpu, MSR_TURBO_RATIO_LIMIT1, &core_counts);
@@ -8019,8 +8013,7 @@ int print_rapl(PER_THREAD_PARAMS)
 			return -1;
 	}
 
-	fprintf(outf, "cpu%d: %s: 0x%08llx (%f Watts, %f Joules, %f sec.)\n", cpu, msr_name, msr,
-		rapl_power_units, rapl_energy_units, rapl_time_units);
+	fprintf(outf, "cpu%d: %s: 0x%08llx (%f Watts, %f Joules, %f sec.)\n", cpu, msr_name, msr, rapl_power_units, rapl_energy_units, rapl_time_units);
 
 	if (valid_rapl_msrs & RAPL_PKG_POWER_INFO) {
 
@@ -8052,8 +8045,7 @@ int print_rapl(PER_THREAD_PARAMS)
 			return -9;
 
 		fprintf(outf, "cpu%d: MSR_VR_CURRENT_CONFIG: 0x%08llx\n", cpu, msr);
-		fprintf(outf, "cpu%d: PKG Limit #4: %f Watts (%slocked)\n",
-			cpu, ((msr >> 0) & 0x1FFF) * rapl_power_units, (msr >> 31) & 1 ? "" : "UN");
+		fprintf(outf, "cpu%d: PKG Limit #4: %f Watts (%slocked)\n", cpu, ((msr >> 0) & 0x1FFF) * rapl_power_units, (msr >> 31) & 1 ? "" : "UN");
 	}
 
 	if (valid_rapl_msrs & RAPL_DRAM_POWER_INFO) {
@@ -8390,8 +8382,7 @@ void decode_misc_feature_control(void)
 	if (!get_msr(master_cpu, MSR_MISC_FEATURE_CONTROL, &msr))
 		fprintf(outf,
 			"cpu%d: MSR_MISC_FEATURE_CONTROL: 0x%08llx (%sL2-Prefetch %sL2-Prefetch-pair %sL1-Prefetch %sL1-IP-Prefetch)\n",
-			master_cpu, msr, msr & (0 << 0) ? "No-" : "", msr & (1 << 0) ? "No-" : "",
-			msr & (2 << 0) ? "No-" : "", msr & (3 << 0) ? "No-" : "");
+			master_cpu, msr, msr & (0 << 0) ? "No-" : "", msr & (1 << 0) ? "No-" : "", msr & (2 << 0) ? "No-" : "", msr & (3 << 0) ? "No-" : "");
 }
 
 /*
@@ -8433,12 +8424,10 @@ void decode_c6_demotion_policy_msr(void)
 		return;
 
 	if (!get_msr(master_cpu, MSR_CC6_DEMOTION_POLICY_CONFIG, &msr))
-		fprintf(outf, "cpu%d: MSR_CC6_DEMOTION_POLICY_CONFIG: 0x%08llx (%sable-CC6-Demotion)\n",
-			master_cpu, msr, msr & (1 << 0) ? "EN" : "DIS");
+		fprintf(outf, "cpu%d: MSR_CC6_DEMOTION_POLICY_CONFIG: 0x%08llx (%sable-CC6-Demotion)\n", master_cpu, msr, msr & (1 << 0) ? "EN" : "DIS");
 
 	if (!get_msr(master_cpu, MSR_MC6_DEMOTION_POLICY_CONFIG, &msr))
-		fprintf(outf, "cpu%d: MSR_MC6_DEMOTION_POLICY_CONFIG: 0x%08llx (%sable-MC6-Demotion)\n",
-			master_cpu, msr, msr & (1 << 0) ? "EN" : "DIS");
+		fprintf(outf, "cpu%d: MSR_MC6_DEMOTION_POLICY_CONFIG: 0x%08llx (%sable-MC6-Demotion)\n", master_cpu, msr, msr & (1 << 0) ? "EN" : "DIS");
 }
 
 void print_dev_latency(void)
@@ -8756,8 +8745,7 @@ void rapl_perf_init(void)
 /* Assumes msr_counter_info is populated */
 static int has_amperf_access(void)
 {
-	return cpuid_has_aperf_mperf && msr_counter_arch_infos[MSR_ARCH_INFO_APERF_INDEX].present &&
-	    msr_counter_arch_infos[MSR_ARCH_INFO_MPERF_INDEX].present;
+	return cpuid_has_aperf_mperf && msr_counter_arch_infos[MSR_ARCH_INFO_APERF_INDEX].present && msr_counter_arch_infos[MSR_ARCH_INFO_MPERF_INDEX].present;
 }
 
 int *get_cstate_perf_group_fd(struct cstate_counter_info_t *cci, const char *group_name)
@@ -8968,8 +8956,7 @@ void cstate_perf_init_(bool soft_c1)
 			if (!per_core && pkg_visited[pkg_id])
 				continue;
 
-			const bool counter_needed = BIC_IS_ENABLED(cai->bic_number) ||
-			    (soft_c1 && (cai->flags & CSTATE_COUNTER_FLAG_SOFT_C1_DEPENDENCY));
+			const bool counter_needed = BIC_IS_ENABLED(cai->bic_number) || (soft_c1 && (cai->flags & CSTATE_COUNTER_FLAG_SOFT_C1_DEPENDENCY));
 			const bool counter_supported = (platform->supported_cstates & cai->feature_mask);
 
 			if (counter_needed && counter_supported) {
@@ -9259,8 +9246,7 @@ void process_cpuid()
 			if (crystal_hz) {
 				tsc_hz = (unsigned long long)crystal_hz *ebx_tsc / eax_crystal;
 				if (!quiet)
-					fprintf(outf, "TSC: %lld MHz (%d Hz * %d / %d / 1000000)\n",
-						tsc_hz / 1000000, crystal_hz, ebx_tsc, eax_crystal);
+					fprintf(outf, "TSC: %lld MHz (%d Hz * %d / %d / 1000000)\n", tsc_hz / 1000000, crystal_hz, ebx_tsc, eax_crystal);
 			}
 		}
 	}
@@ -9645,7 +9631,7 @@ void topology_probe(bool startup)
 		if (cpus[i].ht_id == 0)
 			topo.num_cores++;
 	}
-	topo.max_core_id = max_core_id;			/* within a package */
+	topo.max_core_id = max_core_id;	/* within a package */
 	topo.max_package_id = max_package_id;
 	topo.num_cores = (max_core_id + 1) * topo.num_packages;	/* per system */
 
@@ -9712,6 +9698,7 @@ void allocate_counters_1(struct counters *counters)
 error:
 	err(1, "calloc counters_1");
 }
+
 void allocate_counters(struct counters *counters)
 {
 	int i;
@@ -10368,8 +10355,7 @@ void pmt_init(void)
 
 	if (BIC_IS_ENABLED(BIC_Diec6)) {
 		pmt_add_counter(PMT_MTL_DC6_GUID, PMT_MTL_DC6_SEQ, "Die%c6", PMT_TYPE_XTAL_TIME,
-				PMT_COUNTER_MTL_DC6_LSB, PMT_COUNTER_MTL_DC6_MSB, PMT_COUNTER_MTL_DC6_OFFSET,
-				SCOPE_PACKAGE, FORMAT_DELTA, 0, PMT_OPEN_TRY);
+				PMT_COUNTER_MTL_DC6_LSB, PMT_COUNTER_MTL_DC6_MSB, PMT_COUNTER_MTL_DC6_OFFSET, SCOPE_PACKAGE, FORMAT_DELTA, 0, PMT_OPEN_TRY);
 	}
 
 	if (BIC_IS_ENABLED(BIC_CPU_c1e)) {
@@ -10547,7 +10533,7 @@ int get_and_dump_counters(void)
 
 void print_version()
 {
-	fprintf(outf, "turbostat version 2026.02.04 - Len Brown <lenb@kernel.org>\n");
+	fprintf(outf, "turbostat version 2026.02.14 - Len Brown <lenb@kernel.org>\n");
 }
 
 #define COMMAND_LINE_SIZE 2048
@@ -11169,8 +11155,7 @@ next:
 	}
 
 	if (direct_path && has_guid) {
-		printf("%s: path and guid+seq parameters are mutually exclusive\n"
-		       "notice: passed guid=0x%x and path=%s\n", __func__, guid, direct_path);
+		printf("%s: path and guid+seq parameters are mutually exclusive\nnotice: passed guid=0x%x and path=%s\n", __func__, guid, direct_path);
 		exit(1);
 	}
 
-- 
cgit v1.2.3


From a68a9bd086c2822d0c629443bd16ad1317afe501 Mon Sep 17 00:00:00 2001
From: Pin-yen Lin <treapking@google.com>
Date: Mon, 9 Feb 2026 16:59:36 -0800
Subject: selftests: netconsole: Increase port listening timeout

wait_for_port() can wait up to 2 seconds with the sleep and the polling
in wait_local_port_listen() combined. So, in netcons_basic.sh, the socat
process could die before the test writes to the netconsole.

Increase the timeout to 3 seconds to make netcons_basic.sh pass
consistently.

Fixes: 3dc6c76391cb ("selftests: net: Add IPv6 support to netconsole basic tests")
Signed-off-by: Pin-yen Lin <treapking@google.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://patch.msgid.link/20260210005939.3230550-1-treapking@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh b/tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh
index b6093bcf2b06..02dcdeb723be 100644
--- a/tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh
+++ b/tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh
@@ -249,8 +249,8 @@ function listen_port_and_save_to() {
 		SOCAT_MODE="UDP6-LISTEN"
 	fi
 
-	# Just wait for 2 seconds
-	timeout 2 ip netns exec "${NAMESPACE}" \
+	# Just wait for 3 seconds
+	timeout 3 ip netns exec "${NAMESPACE}" \
 		socat "${SOCAT_MODE}":"${PORT}",fork "${OUTPUT}" 2> /dev/null
 }
 
-- 
cgit v1.2.3


From 728ff167910ef16e97717719c749ddf4064c653b Mon Sep 17 00:00:00 2001
From: Emil Tsalapatis <emil@etsalapatis.com>
Date: Tue, 10 Feb 2026 13:45:32 -0500
Subject: libbpf: Add gating for arena globals relocation feature

Add feature gating for the arena globals relocation introduced in
commit c1f61171d44b. The commit depends on a previous commit in the
same patchset that is absent from older kernels
(12a1fe6e12db "bpf/verifier: Do not limit maximum direct offset into arena map").

Without this commit, arena globals relocation with arenas >= 512MiB
fails to load and breaks libbpf's backwards compatibility.

Introduce a libbpf feature to check whether the running kernel allows for
full range ldimm64 offset, and only relocate arena globals if it does.

Fixes: c1f61171d44b ("libbpf: Move arena globals to the end of the arena")
Signed-off-by: Emil Tsalapatis <emil@etsalapatis.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20260210184532.255475-1-emil@etsalapatis.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/lib/bpf/features.c        | 64 +++++++++++++++++++++++++++++++++++++++++
 tools/lib/bpf/libbpf.c          |  7 +++--
 tools/lib/bpf/libbpf_internal.h |  2 ++
 3 files changed, 71 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/features.c b/tools/lib/bpf/features.c
index b842b83e2480..b65ab109e3ff 100644
--- a/tools/lib/bpf/features.c
+++ b/tools/lib/bpf/features.c
@@ -506,6 +506,67 @@ static int probe_kern_arg_ctx_tag(int token_fd)
 	return probe_fd(prog_fd);
 }
 
+static int probe_ldimm64_full_range_off(int token_fd)
+{
+	char log_buf[1024];
+	int prog_fd, map_fd;
+	int ret;
+	LIBBPF_OPTS(bpf_map_create_opts, map_opts,
+		.token_fd = token_fd,
+		.map_flags = token_fd ? BPF_F_TOKEN_FD : 0,
+	);
+	LIBBPF_OPTS(bpf_prog_load_opts, prog_opts,
+		.token_fd = token_fd,
+		.prog_flags = token_fd ? BPF_F_TOKEN_FD : 0,
+		.log_buf = log_buf,
+		.log_size = sizeof(log_buf),
+	);
+	struct bpf_insn insns[] = {
+		BPF_LD_MAP_VALUE(BPF_REG_1, 0, 1UL << 30),
+		BPF_EXIT_INSN(),
+	};
+	int insn_cnt = ARRAY_SIZE(insns);
+
+	map_fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, "arr", sizeof(int), 1, 1, &map_opts);
+	if (map_fd < 0) {
+		ret = -errno;
+		pr_warn("Error in %s(): %s. Couldn't create simple array map.\n",
+			__func__, errstr(ret));
+		return ret;
+	}
+	insns[0].imm = map_fd;
+
+	prog_fd = bpf_prog_load(BPF_PROG_TYPE_TRACEPOINT, "global_reloc", "GPL", insns, insn_cnt, &prog_opts);
+	ret = -errno;
+
+	close(map_fd);
+	close(prog_fd);
+
+	if (prog_fd >= 0) {
+		pr_warn("Error in %s(): Program loading unexpectedly succeeded.\n", __func__);
+		return -EINVAL;
+	}
+
+	/*
+	 * Feature is allowed if we're not failing with the error message
+	 * "direct value offset of %u is not allowed" removed in
+	 * 12a1fe6e12db ("bpf/verifier: Do not limit maximum direct offset into arena map").
+	 * We should instead fail with "invalid access to map value pointer".
+	 * Ensure we match with one of the two and we're not failing with a
+	 * different, unexpected message.
+	 */
+	if (strstr(log_buf, "direct value offset of"))
+		return 0;
+
+	if (!strstr(log_buf, "invalid access to map value pointer")) {
+		pr_warn("Error in %s(): Program unexpectedly failed with message: %s.\n",
+			__func__, log_buf);
+		return ret;
+	}
+
+	return 1;
+}
+
 typedef int (*feature_probe_fn)(int /* token_fd */);
 
 static struct kern_feature_cache feature_cache;
@@ -581,6 +642,9 @@ static struct kern_feature_desc {
 	[FEAT_BTF_QMARK_DATASEC] = {
 		"BTF DATASEC names starting from '?'", probe_kern_btf_qmark_datasec,
 	},
+	[FEAT_LDIMM64_FULL_RANGE_OFF] = {
+		"full range LDIMM64 support", probe_ldimm64_full_range_off,
+	},
 };
 
 bool feat_supported(struct kern_feature_cache *cache, enum kern_feature_id feat_id)
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 0c8bf0b5cce4..93e59ed8d9a1 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -3009,8 +3009,11 @@ static int init_arena_map_data(struct bpf_object *obj, struct bpf_map *map,
 	memcpy(obj->arena_data, data, data_sz);
 	obj->arena_data_sz = data_sz;
 
-	/* place globals at the end of the arena */
-	obj->arena_data_off = mmap_sz - data_alloc_sz;
+	/* place globals at the end of the arena (if supported) */
+	if (kernel_supports(obj, FEAT_LDIMM64_FULL_RANGE_OFF))
+		obj->arena_data_off = mmap_sz - data_alloc_sz;
+	else
+		obj->arena_data_off = 0;
 
 	/* make bpf_map__init_value() work for ARENA maps */
 	map->mmaped = obj->arena_data;
diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h
index fc59b21b51b5..974147e8a8aa 100644
--- a/tools/lib/bpf/libbpf_internal.h
+++ b/tools/lib/bpf/libbpf_internal.h
@@ -392,6 +392,8 @@ enum kern_feature_id {
 	FEAT_ARG_CTX_TAG,
 	/* Kernel supports '?' at the front of datasec names */
 	FEAT_BTF_QMARK_DATASEC,
+	/* Kernel supports LDIMM64 imm offsets past 512 MiB. */
+	FEAT_LDIMM64_FULL_RANGE_OFF,
 	__FEAT_CNT,
 };
 
-- 
cgit v1.2.3


From 04999b99e81eaa7b6223ec1c03af3bcb4ac57aaa Mon Sep 17 00:00:00 2001
From: Amery Hung <ameryhung@gmail.com>
Date: Mon, 9 Feb 2026 15:01:34 -0800
Subject: libbpf: Fix invalid write loop logic in bpf_linker__add_buf()

Fix bpf_linker__add_buf()'s logic of copying data from memory buffer into
memfd. In the event of short write not writing entire buf_sz bytes into memfd
file, we'll append bytes from the beginning of buf *again* (corrupting ELF
file contents) instead of correctly appending the rest of not-yet-read buf
contents.

Closes: https://github.com/libbpf/libbpf/issues/945
Fixes: 6d5e5e5d7ce1 ("libbpf: Extend linker API to support in-memory ELF files")
Signed-off-by: Amery Hung <ameryhung@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Link: https://lore.kernel.org/bpf/20260209230134.3530521-1-ameryhung@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/lib/bpf/linker.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/linker.c b/tools/lib/bpf/linker.c
index f4403e3cf994..78f92c39290a 100644
--- a/tools/lib/bpf/linker.c
+++ b/tools/lib/bpf/linker.c
@@ -581,7 +581,7 @@ int bpf_linker__add_buf(struct bpf_linker *linker, void *buf, size_t buf_sz,
 
 	written = 0;
 	while (written < buf_sz) {
-		ret = write(fd, buf, buf_sz);
+		ret = write(fd, buf + written, buf_sz - written);
 		if (ret < 0) {
 			ret = -errno;
 			pr_warn("failed to write '%s': %s\n", filename, errstr(ret));
-- 
cgit v1.2.3


From 48f624c3dc71c2b807ce138bb70d1f5216532874 Mon Sep 17 00:00:00 2001
From: Ihor Solodrai <ihor.solodrai@linux.dev>
Date: Tue, 10 Feb 2026 15:58:55 -0800
Subject: selftests/bpf: Adjust selftest due to function rename

do_filp_open() was renamed in commit
541003b576c3 ("rename do_filp_open() to do_file_open()")

This broke test_profiler, because it uses a kretprobe on that
function. Fix it by renaming accordingly.

Reported-by: Shung-Hsi Yu <shung-hsi.yu@suse.com>
Closes: https://lore.kernel.org/bpf/djwjf2vfb7gro3rfag666bojod6ytcectahnb5z6hx2hawimtj@sx47ghzjg4lw/
Signed-off-by: Ihor Solodrai <ihor.solodrai@linux.dev>
Link: https://lore.kernel.org/r/20260210235855.215679-1-ihor.solodrai@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/progs/profiler.h     | 2 +-
 tools/testing/selftests/bpf/progs/profiler.inc.h | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/progs/profiler.h b/tools/testing/selftests/bpf/progs/profiler.h
index 3bac4fdd4bdf..637fbf2c2652 100644
--- a/tools/testing/selftests/bpf/progs/profiler.h
+++ b/tools/testing/selftests/bpf/progs/profiler.h
@@ -169,7 +169,7 @@ enum bpf_function_id {
 	profiler_bpf_sched_process_exec,
 	profiler_bpf_sched_process_exit,
 	profiler_bpf_sys_enter_kill,
-	profiler_bpf_do_filp_open_ret,
+	profiler_bpf_do_file_open_ret,
 	profiler_bpf_sched_process_fork,
 	profiler_bpf_vfs_link,
 	profiler_bpf_vfs_symlink,
diff --git a/tools/testing/selftests/bpf/progs/profiler.inc.h b/tools/testing/selftests/bpf/progs/profiler.inc.h
index 813143b4985d..9044dd8aff11 100644
--- a/tools/testing/selftests/bpf/progs/profiler.inc.h
+++ b/tools/testing/selftests/bpf/progs/profiler.inc.h
@@ -751,11 +751,11 @@ out:
 	return 0;
 }
 
-SEC("kretprobe/do_filp_open")
-int kprobe_ret__do_filp_open(struct pt_regs* ctx)
+SEC("kretprobe/do_file_open")
+int kprobe_ret__do_file_open(struct pt_regs *ctx)
 {
 	struct bpf_func_stats_ctx stats_ctx;
-	bpf_stats_enter(&stats_ctx, profiler_bpf_do_filp_open_ret);
+	bpf_stats_enter(&stats_ctx, profiler_bpf_do_file_open_ret);
 
 	struct file* filp = (struct file*)PT_REGS_RC_CORE(ctx);
 
-- 
cgit v1.2.3


From 2669dde7a8c67e3efe8052d75d6040de2cbb5e5a Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Wed, 11 Feb 2026 10:57:47 -0800
Subject: selftests/bpf: Fix map_kptr grace period wait

Commit c27cea4416a3 ("rcu: Re-implement RCU Tasks Trace in terms of SRCU-fast")
broke map_kptr selftest since it removed the function we were kprobing.
Use a new kfunc that invokes call_rcu_tasks_trace and sets a program
provided pointer to an integer to 1. Technically this can be unsafe if
the memory being written to from the callback disappears, but this is
just for usage in a test where we ensure we spin until we see the value
to be set to 1, so it's ok.

Reported-by: Shung-Hsi Yu <shung-hsi.yu@suse.com>
Fixes: c27cea4416a3 ("rcu: Re-implement RCU Tasks Trace in terms of SRCU-fast")
Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20260211185747.3630539-1-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/prog_tests/map_kptr.c  | 15 +++++------
 .../selftests/bpf/progs/rcu_tasks_trace_gp.c       | 30 +++-------------------
 .../testing/selftests/bpf/test_kmods/bpf_testmod.c | 28 ++++++++++++++++++++
 .../selftests/bpf/test_kmods/bpf_testmod_kfunc.h   |  1 +
 4 files changed, 39 insertions(+), 35 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/prog_tests/map_kptr.c b/tools/testing/selftests/bpf/prog_tests/map_kptr.c
index f372162c0280..03b46f17cf53 100644
--- a/tools/testing/selftests/bpf/prog_tests/map_kptr.c
+++ b/tools/testing/selftests/bpf/prog_tests/map_kptr.c
@@ -118,15 +118,16 @@ exit:
 
 static int kern_sync_rcu_tasks_trace(struct rcu_tasks_trace_gp *rcu)
 {
-	long gp_seq = READ_ONCE(rcu->bss->gp_seq);
 	LIBBPF_OPTS(bpf_test_run_opts, opts);
+	int ret;
 
-	if (!ASSERT_OK(bpf_prog_test_run_opts(bpf_program__fd(rcu->progs.do_call_rcu_tasks_trace),
-					      &opts), "do_call_rcu_tasks_trace"))
+	WRITE_ONCE(rcu->bss->done, 0);
+	ret = bpf_prog_test_run_opts(bpf_program__fd(rcu->progs.call_rcu_tasks_trace), &opts);
+	if (!ASSERT_OK(ret, "call_rcu_tasks_trace"))
 		return -EFAULT;
-	if (!ASSERT_OK(opts.retval, "opts.retval == 0"))
+	if (!ASSERT_OK(opts.retval, "call_rcu_tasks_trace retval"))
 		return -EFAULT;
-	while (gp_seq == READ_ONCE(rcu->bss->gp_seq))
+	while (!READ_ONCE(rcu->bss->done))
 		sched_yield();
 	return 0;
 }
@@ -159,8 +160,6 @@ void serial_test_map_kptr(void)
 	skel = rcu_tasks_trace_gp__open_and_load();
 	if (!ASSERT_OK_PTR(skel, "rcu_tasks_trace_gp__open_and_load"))
 		return;
-	if (!ASSERT_OK(rcu_tasks_trace_gp__attach(skel), "rcu_tasks_trace_gp__attach"))
-		goto end;
 
 	if (test__start_subtest("success-map")) {
 		test_map_kptr_success(true);
@@ -180,7 +179,5 @@ void serial_test_map_kptr(void)
 		test_map_kptr_success(true);
 	}
 
-end:
 	rcu_tasks_trace_gp__destroy(skel);
-	return;
 }
diff --git a/tools/testing/selftests/bpf/progs/rcu_tasks_trace_gp.c b/tools/testing/selftests/bpf/progs/rcu_tasks_trace_gp.c
index df4873558634..189c05c6abcc 100644
--- a/tools/testing/selftests/bpf/progs/rcu_tasks_trace_gp.c
+++ b/tools/testing/selftests/bpf/progs/rcu_tasks_trace_gp.c
@@ -1,36 +1,14 @@
 // SPDX-License-Identifier: GPL-2.0
 #include <vmlinux.h>
-#include <bpf/bpf_tracing.h>
 #include <bpf/bpf_helpers.h>
+#include "../test_kmods/bpf_testmod_kfunc.h"
 
-struct task_ls_map {
-	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
-	__uint(map_flags, BPF_F_NO_PREALLOC);
-	__type(key, int);
-	__type(value, int);
-} task_ls_map SEC(".maps");
-
-long gp_seq;
+int done;
 
 SEC("syscall")
-int do_call_rcu_tasks_trace(void *ctx)
-{
-    struct task_struct *current;
-    int *v;
-
-    current = bpf_get_current_task_btf();
-    v = bpf_task_storage_get(&task_ls_map, current, NULL, BPF_LOCAL_STORAGE_GET_F_CREATE);
-    if (!v)
-        return 1;
-    /* Invoke call_rcu_tasks_trace */
-    return bpf_task_storage_delete(&task_ls_map, current);
-}
-
-SEC("kprobe/rcu_tasks_trace_postgp")
-int rcu_tasks_trace_postgp(void *ctx)
+int call_rcu_tasks_trace(void *ctx)
 {
-    __sync_add_and_fetch(&gp_seq, 1);
-    return 0;
+	return bpf_kfunc_call_test_call_rcu_tasks_trace(&done);
 }
 
 char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c
index 186a25ab429a..e62c6b78657f 100644
--- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c
+++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c
@@ -18,6 +18,7 @@
 #include <linux/in6.h>
 #include <linux/un.h>
 #include <linux/filter.h>
+#include <linux/rcupdate_trace.h>
 #include <net/sock.h>
 #include <linux/namei.h>
 #include "bpf_testmod.h"
@@ -885,6 +886,32 @@ __bpf_kfunc void bpf_kfunc_call_test_sleepable(void)
 {
 }
 
+struct bpf_kfunc_rcu_tasks_trace_data {
+	struct rcu_head rcu;
+	int *done;
+};
+
+static void bpf_kfunc_rcu_tasks_trace_cb(struct rcu_head *rhp)
+{
+	struct bpf_kfunc_rcu_tasks_trace_data *data;
+
+	data = container_of(rhp, struct bpf_kfunc_rcu_tasks_trace_data, rcu);
+	WRITE_ONCE(*data->done, 1);
+	kfree(data);
+}
+
+__bpf_kfunc int bpf_kfunc_call_test_call_rcu_tasks_trace(int *done)
+{
+	struct bpf_kfunc_rcu_tasks_trace_data *data;
+
+	data = kmalloc(sizeof(*data), GFP_ATOMIC);
+	if (!data)
+		return -ENOMEM;
+	data->done = done;
+	call_rcu_tasks_trace(&data->rcu, bpf_kfunc_rcu_tasks_trace_cb);
+	return 0;
+}
+
 __bpf_kfunc int bpf_kfunc_init_sock(struct init_sock_args *args)
 {
 	int proto;
@@ -1222,6 +1249,7 @@ BTF_ID_FLAGS(func, bpf_kfunc_call_test_destructive, KF_DESTRUCTIVE)
 BTF_ID_FLAGS(func, bpf_kfunc_call_test_static_unused_arg)
 BTF_ID_FLAGS(func, bpf_kfunc_call_test_offset)
 BTF_ID_FLAGS(func, bpf_kfunc_call_test_sleepable, KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_kfunc_call_test_call_rcu_tasks_trace)
 BTF_ID_FLAGS(func, bpf_kfunc_init_sock, KF_SLEEPABLE)
 BTF_ID_FLAGS(func, bpf_kfunc_close_sock, KF_SLEEPABLE)
 BTF_ID_FLAGS(func, bpf_kfunc_call_kernel_connect, KF_SLEEPABLE)
diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h b/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h
index d5c5454e257e..b393bf771131 100644
--- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h
+++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h
@@ -118,6 +118,7 @@ void bpf_kfunc_call_test_mem_len_fail2(__u64 *mem, int len) __ksym;
 
 void bpf_kfunc_call_test_destructive(void) __ksym;
 void bpf_kfunc_call_test_sleepable(void) __ksym;
+int bpf_kfunc_call_test_call_rcu_tasks_trace(int *done) __ksym;
 
 void bpf_kfunc_call_test_offset(struct prog_test_ref_kfunc *p);
 struct prog_test_member *bpf_kfunc_call_memb_acquire(void);
-- 
cgit v1.2.3


From 0265c1fd912ee0ea0cb00c539fb73e99578a866d Mon Sep 17 00:00:00 2001
From: Menglong Dong <menglong8.dong@gmail.com>
Date: Sun, 8 Feb 2026 13:33:11 +0800
Subject: selftests/bpf: enable fsession_test on riscv64
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Now that the RISC-V trampoline JIT supports BPF_TRACE_FSESSION, run
the fsession selftest on riscv64 as well as x86_64.

Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
Tested-by: Björn Töpel <bjorn@kernel.org>
Acked-by: Björn Töpel <bjorn@kernel.org>
Link: https://lore.kernel.org/r/20260208053311.698352-4-dongml2@chinatelecom.cn
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Reviewed-by: Pu Lehui <pulehui@huawei.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/progs/get_func_args_test.c | 2 +-
 tools/testing/selftests/bpf/progs/get_func_ip_test.c   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/progs/get_func_args_test.c b/tools/testing/selftests/bpf/progs/get_func_args_test.c
index 180ba5098ca1..075a1180ec26 100644
--- a/tools/testing/selftests/bpf/progs/get_func_args_test.c
+++ b/tools/testing/selftests/bpf/progs/get_func_args_test.c
@@ -167,7 +167,7 @@ int BPF_PROG(tp_test2)
 }
 
 __u64 test7_result = 0;
-#if defined(bpf_target_x86) || defined(bpf_target_arm64)
+#if defined(bpf_target_x86) || defined(bpf_target_arm64) || defined(bpf_target_riscv)
 SEC("fsession/bpf_fentry_test1")
 int BPF_PROG(test7)
 {
diff --git a/tools/testing/selftests/bpf/progs/get_func_ip_test.c b/tools/testing/selftests/bpf/progs/get_func_ip_test.c
index 43ff836a8ed8..45eaa54d1ac7 100644
--- a/tools/testing/selftests/bpf/progs/get_func_ip_test.c
+++ b/tools/testing/selftests/bpf/progs/get_func_ip_test.c
@@ -106,7 +106,7 @@ int BPF_URETPROBE(test8, int ret)
 
 __u64 test9_entry_result = 0;
 __u64 test9_exit_result = 0;
-#if defined(bpf_target_x86) || defined(bpf_target_arm64)
+#if defined(bpf_target_x86) || defined(bpf_target_arm64) || defined(bpf_target_riscv)
 SEC("fsession/bpf_fentry_test1")
 int BPF_PROG(test9, int a)
 {
-- 
cgit v1.2.3


From cabd76bbc03617e55c25f0b06167aa5e0b911a36 Mon Sep 17 00:00:00 2001
From: David Carlier <devnexen@gmail.com>
Date: Sat, 14 Feb 2026 07:32:05 +0000
Subject: tools/sched_ext: scx_flatcg: fix potential stack overflow from VLA in
 fcg_read_stats

fcg_read_stats() had a VLA allocating 21 * nr_cpus * 8 bytes on the
stack, risking stack overflow on large CPU counts (nr_cpus can be up
to 512).

Fix by using a single heap allocation with the correct size, reusing
it across all stat indices, and freeing it at the end.

Signed-off-by: David Carlier <devnexen@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 tools/sched_ext/scx_flatcg.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

(limited to 'tools')

diff --git a/tools/sched_ext/scx_flatcg.c b/tools/sched_ext/scx_flatcg.c
index bea76d060201..a8446509949e 100644
--- a/tools/sched_ext/scx_flatcg.c
+++ b/tools/sched_ext/scx_flatcg.c
@@ -102,22 +102,27 @@ static float read_cpu_util(__u64 *last_sum, __u64 *last_idle)
 
 static void fcg_read_stats(struct scx_flatcg *skel, __u64 *stats)
 {
-	__u64 cnts[FCG_NR_STATS][skel->rodata->nr_cpus];
+	__u64 *cnts;
 	__u32 idx;
 
+	cnts = calloc(skel->rodata->nr_cpus, sizeof(__u64));
+	if (!cnts)
+		return;
+
 	memset(stats, 0, sizeof(stats[0]) * FCG_NR_STATS);
-	memset(cnts, 0, sizeof(cnts));
 
 	for (idx = 0; idx < FCG_NR_STATS; idx++) {
 		int ret, cpu;
 
 		ret = bpf_map_lookup_elem(bpf_map__fd(skel->maps.stats),
-					  &idx, cnts[idx]);
+					  &idx, cnts);
 		if (ret < 0)
 			continue;
 		for (cpu = 0; cpu < skel->rodata->nr_cpus; cpu++)
-			stats[idx] += cnts[idx][cpu];
+			stats[idx] += cnts[cpu];
 	}
+
+	free(cnts);
 }
 
 int main(int argc, char **argv)
-- 
cgit v1.2.3


From 07676846132340c7d0f50eca189a24cea4ae3cd8 Mon Sep 17 00:00:00 2001
From: David Carlier <devnexen@gmail.com>
Date: Sat, 14 Feb 2026 08:00:33 +0000
Subject: tools/sched_ext: scx_userland: fix stale data on restart

Reset all counters, tasks and vruntime_head list on restart.

Signed-off-by: David Carlier <devnexen@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 tools/sched_ext/scx_userland.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'tools')

diff --git a/tools/sched_ext/scx_userland.c b/tools/sched_ext/scx_userland.c
index 63f89b35d999..504a80824f5c 100644
--- a/tools/sched_ext/scx_userland.c
+++ b/tools/sched_ext/scx_userland.c
@@ -375,6 +375,14 @@ static void pre_bootstrap(int argc, char **argv)
 static void bootstrap(char *comm)
 {
 	exit_req = 0;
+	min_vruntime = 0.0;
+	nr_vruntime_enqueues = 0;
+	nr_vruntime_dispatches = 0;
+	nr_vruntime_failed = 0;
+	nr_curr_enqueued = 0;
+	memset(tasks, 0, pid_max * sizeof(*tasks));
+	LIST_INIT(&vruntime_head);
+
 	skel = SCX_OPS_OPEN(userland_ops, scx_userland);
 
 	skel->rodata->num_possible_cpus = libbpf_num_possible_cpus();
-- 
cgit v1.2.3


From e7a3c1adc127f9f91a35169d34f7471d417d72a6 Mon Sep 17 00:00:00 2001
From: Bobby Eshleman <bobbyeshleman@meta.com>
Date: Wed, 11 Feb 2026 17:00:44 -0800
Subject: selftests: drv-net: add HDS payload sweep test for devmem TCP

Add check_rx_hds test that verifies header/data split works across
payload sizes. The test sweeps payload sizes from 1 byte to 8KB, if any
data propagates up to userspace as SCM_DEVMEM_LINEAR, then the test
fails. This shows that regardless of payload size, ncdevmem's
configuration of hds-thresh to 0 is respected.

Add -L (--fail-on-linear) flag to ncdevmem that causes the receiver to
fail if any SCM_DEVMEM_LINEAR cmsg is received.

Use socat option for fixed block sizing and tcp nodelay to disable
nagle's algo to avoid buffering.

Signed-off-by: Bobby Eshleman <bobbyeshleman@meta.com>
Link: https://patch.msgid.link/20260211-fbnic-tcp-hds-fixes-v1-4-55d050e6f606@meta.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 tools/testing/selftests/drivers/net/hw/devmem.py  | 19 ++++++++++++++++++-
 tools/testing/selftests/drivers/net/hw/ncdevmem.c | 11 ++++++++++-
 2 files changed, 28 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/drivers/net/hw/devmem.py b/tools/testing/selftests/drivers/net/hw/devmem.py
index 45c2d49d55b6..ee863e90d1e0 100755
--- a/tools/testing/selftests/drivers/net/hw/devmem.py
+++ b/tools/testing/selftests/drivers/net/hw/devmem.py
@@ -63,12 +63,29 @@ def check_tx_chunks(cfg) -> None:
     ksft_eq(socat.stdout.strip(), "hello\nworld")
 
 
+def check_rx_hds(cfg) -> None:
+    """Test HDS splitting across payload sizes."""
+    require_devmem(cfg)
+
+    for size in [1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192]:
+        port = rand_port()
+        listen_cmd = f"{cfg.bin_local} -L -l -f {cfg.ifname} -s {cfg.addr} -p {port}"
+
+        with bkg(listen_cmd, exit_wait=True) as ncdevmem:
+            wait_port_listen(port)
+            cmd(f"dd if=/dev/zero bs={size} count=1 2>/dev/null | " +
+                f"socat -b {size} -u - TCP{cfg.addr_ipver}:{cfg.baddr}:{port},nodelay",
+                host=cfg.remote, shell=True)
+
+        ksft_eq(ncdevmem.ret, 0, f"HDS failed for payload size {size}")
+
+
 def main() -> None:
     with NetDrvEpEnv(__file__) as cfg:
         cfg.bin_local = path.abspath(path.dirname(__file__) + "/ncdevmem")
         cfg.bin_remote = cfg.remote.deploy(cfg.bin_local)
 
-        ksft_run([check_rx, check_tx, check_tx_chunks],
+        ksft_run([check_rx, check_tx, check_tx_chunks, check_rx_hds],
                  args=(cfg, ))
     ksft_exit()
 
diff --git a/tools/testing/selftests/drivers/net/hw/ncdevmem.c b/tools/testing/selftests/drivers/net/hw/ncdevmem.c
index 16864c844108..e098d6534c3c 100644
--- a/tools/testing/selftests/drivers/net/hw/ncdevmem.c
+++ b/tools/testing/selftests/drivers/net/hw/ncdevmem.c
@@ -98,6 +98,7 @@ static unsigned int ifindex;
 static unsigned int dmabuf_id;
 static uint32_t tx_dmabuf_id;
 static int waittime_ms = 500;
+static bool fail_on_linear;
 
 /* System state loaded by current_config_load() */
 #define MAX_FLOWS	8
@@ -975,6 +976,11 @@ static int do_server(struct memory_buffer *mem)
 					"SCM_DEVMEM_LINEAR. dmabuf_cmsg->frag_size=%u\n",
 					dmabuf_cmsg->frag_size);
 
+				if (fail_on_linear) {
+					pr_err("received SCM_DEVMEM_LINEAR but --fail-on-linear (-L) set");
+					goto err_close_client;
+				}
+
 				continue;
 			}
 
@@ -1398,8 +1404,11 @@ int main(int argc, char *argv[])
 	int is_server = 0, opt;
 	int ret, err = 1;
 
-	while ((opt = getopt(argc, argv, "ls:c:p:v:q:t:f:z:")) != -1) {
+	while ((opt = getopt(argc, argv, "Lls:c:p:v:q:t:f:z:")) != -1) {
 		switch (opt) {
+		case 'L':
+			fail_on_linear = true;
+			break;
 		case 'l':
 			is_server = 1;
 			break;
-- 
cgit v1.2.3


From a8470953b4caf52b32d27e2a23797824b312a325 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@nvidia.com>
Date: Fri, 13 Feb 2026 09:00:31 +0200
Subject: selftests: forwarding: bridge_mdb_max: add tests for mdb_n_entries
 warning

Recently we were able to trigger a warning in the mdb_n_entries counting
code. Add tests that exercise different ways which used to trigger that
warning.

Reviewed-by: Ido Schimmel <idosch@nvidia.com>
Signed-off-by: Nikolay Aleksandrov <nikolay@nvidia.com>
Link: https://patch.msgid.link/20260213070031.1400003-3-nikolay@nvidia.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 .../selftests/net/forwarding/bridge_mdb_max.sh     | 90 +++++++++++++++++++++-
 1 file changed, 88 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/forwarding/bridge_mdb_max.sh b/tools/testing/selftests/net/forwarding/bridge_mdb_max.sh
index 3da9d93ab36f..625162fd7e8b 100755
--- a/tools/testing/selftests/net/forwarding/bridge_mdb_max.sh
+++ b/tools/testing/selftests/net/forwarding/bridge_mdb_max.sh
@@ -28,6 +28,7 @@ ALL_TESTS="
 	test_8021d
 	test_8021q
 	test_8021qvs
+	test_mdb_count_warning
 "
 
 NUM_NETIFS=4
@@ -83,8 +84,6 @@ switch_create_8021q()
 {
 	local br_flags=$1; shift
 
-	log_info "802.1q $br_flags${br_flags:+ }tests"
-
 	ip link add name br0 type bridge vlan_filtering 1 vlan_default_pvid 0 \
 		mcast_snooping 1 $br_flags \
 		mcast_igmp_version 3 mcast_mld_version 2
@@ -106,6 +105,7 @@ switch_create_8021q()
 
 switch_create_8021qvs()
 {
+	log_info "802.1q mcast_vlan_snooping 1 tests"
 	switch_create_8021q "mcast_vlan_snooping 1"
 	bridge vlan global set dev br0 vid 10 mcast_igmp_version 3
 	bridge vlan global set dev br0 vid 10 mcast_mld_version 2
@@ -1272,6 +1272,76 @@ test_8021qvs_toggle_vlan_snooping()
 	test_toggle_vlan_snooping_permanent
 }
 
+mdb_count_check_warn()
+{
+	local msg=$1; shift
+
+	dmesg | grep -q "WARNING:.*br_multicast_port_ngroups_dec.*"
+	check_fail $? "$msg"
+}
+
+test_mdb_count_mcast_vlan_snooping_flush()
+{
+	RET=0
+
+	# check if we already have a warning
+	mdb_count_check_warn "Check MDB entries count warning before test"
+
+	bridge mdb add dev br0 port "$swp1" grp 239.0.0.1 permanent vid 10
+	ip link set dev br0 down
+	ip link set dev br0 type bridge mcast_vlan_snooping 1
+	bridge mdb flush dev br0
+
+	mdb_count_check_warn "Check MDB entries count warning after test"
+
+	ip link set dev br0 type bridge mcast_vlan_snooping 0
+	ip link set dev br0 up
+
+	log_test "MDB count warning: mcast_vlan_snooping and MDB flush"
+}
+
+test_mdb_count_mcast_snooping_flush()
+{
+	RET=0
+
+	# check if we already have a warning
+	mdb_count_check_warn "Check MDB entries count warning before test"
+
+	bridge mdb add dev br0 port "$swp1" grp 239.0.0.1 permanent vid 10
+	ip link set dev br0 type bridge mcast_snooping 0
+	ip link set dev br0 type bridge mcast_vlan_snooping 1
+	bridge mdb flush dev br0
+
+	mdb_count_check_warn "Check MDB entries count warning after test"
+
+	ip link set dev br0 type bridge mcast_vlan_snooping 0
+	ip link set dev br0 type bridge mcast_snooping 1
+
+	log_test "MDB count warning: mcast_snooping and MDB flush"
+}
+
+test_mdb_count_vlan_state_flush()
+{
+	RET=0
+
+	# check if we already have a warning
+	mdb_count_check_warn "Check MDB entries count warning before test"
+
+	bridge mdb add dev br0 port "$swp1" grp 239.0.0.1 permanent vid 10
+	ip link set dev br0 down
+	bridge vlan set vid 10 dev "$swp1" state blocking
+	ip link set dev br0 type bridge mcast_vlan_snooping 1
+	ip link set dev br0 up
+	bridge mdb flush dev br0
+
+	mdb_count_check_warn "Check MDB entries count warning after test"
+
+	bridge vlan set vid 10 dev "$swp1" state forwarding
+	ip link set dev br0 type bridge mcast_vlan_snooping 0
+
+	log_test "MDB count warning: disabled vlan state and MDB flush"
+}
+
 # test groups
 
 test_8021d()
@@ -1297,6 +1367,7 @@ test_8021q()
 {
 	# Tests for vlan_filtering 1 mcast_vlan_snooping 0.
 
+	log_info "802.1q tests"
 	switch_create_8021q
 	setup_wait
 
@@ -1334,6 +1405,21 @@ test_8021qvs()
 	switch_destroy
 }
 
+test_mdb_count_warning()
+{
+	# Tests for mdb_n_entries warning
+
+	log_info "MDB count warning tests"
+	switch_create_8021q
+	setup_wait
+
+	test_mdb_count_mcast_vlan_snooping_flush
+	test_mdb_count_mcast_snooping_flush
+	test_mdb_count_vlan_state_flush
+
+	switch_destroy
+}
+
 if ! bridge link help 2>&1 | grep -q "mcast_max_groups"; then
 	echo "SKIP: iproute2 too old, missing bridge \"mcast_max_groups\" support"
 	exit $ksft_skip
-- 
cgit v1.2.3


From 02cb2e6bacbb08ebf6acb61be816efd11e1f4a21 Mon Sep 17 00:00:00 2001
From: Aleksei Oladko <aleksey.oladko@virtuozzo.com>
Date: Fri, 13 Feb 2026 13:19:05 +0000
Subject: selftests: forwarding: vxlan_bridge_1d: fix test failure with
 br_netfilter enabled

The test generates VXLAN traffic using mausezahn, where the encapsulated
inner IPv4 packet contains a zero IP header checksum. After VXLAN
decapsulation, such packets do not pass sanity checks in br_netfilter
and are dropped, which causes the test to fail.

Fix this by calculating and setting a valid IPv4 header checksum for the
encapsulated packet generated by mausezahn, so that the packet is accepted
by br_netfilter. Fixed by using the payload_template_calc_checksum() /
payload_template_expand_checksum() helpers that are only available
in v6.3 and newer kernels.

Fixes: a0b61f3d8ebf ("selftests: forwarding: vxlan_bridge_1d: Add an ECN decap test")
Signed-off-by: Aleksei Oladko <aleksey.oladko@virtuozzo.com>
Reviewed-by: Ido Schimmel <idosch@nvidia.com>
Link: https://patch.msgid.link/20260213131907.43351-2-aleksey.oladko@virtuozzo.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 .../selftests/net/forwarding/vxlan_bridge_1d.sh    | 26 +++++++++++++---------
 1 file changed, 16 insertions(+), 10 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/forwarding/vxlan_bridge_1d.sh b/tools/testing/selftests/net/forwarding/vxlan_bridge_1d.sh
index b43816dd998c..457f41d5e584 100755
--- a/tools/testing/selftests/net/forwarding/vxlan_bridge_1d.sh
+++ b/tools/testing/selftests/net/forwarding/vxlan_bridge_1d.sh
@@ -567,6 +567,21 @@ vxlan_encapped_ping_do()
 	local inner_tos=$1; shift
 	local outer_tos=$1; shift
 
+	local ipv4hdr=$(:
+		    )"45:"$(                      : IP version + IHL
+		    )"$inner_tos:"$(              : IP TOS
+		    )"00:54:"$(                   : IP total length
+		    )"99:83:"$(                   : IP identification
+		    )"40:00:"$(                   : IP flags + frag off
+		    )"40:"$(                      : IP TTL
+		    )"01:"$(                      : IP proto
+		    )"CHECKSUM:"$(                : IP header csum
+		    )"c0:00:02:03:"$(             : IP saddr: 192.0.2.3
+		    )"c0:00:02:01"$(              : IP daddr: 192.0.2.1
+		)
+	local checksum=$(payload_template_calc_checksum "$ipv4hdr")
+	ipv4hdr=$(payload_template_expand_checksum "$ipv4hdr" $checksum)
+
 	$MZ $dev -c $count -d 100msec -q \
 		-b $next_hop_mac -B $dest_ip \
 		-t udp tos=$outer_tos,sp=23456,dp=$VXPORT,p=$(:
@@ -577,16 +592,7 @@ vxlan_encapped_ping_do()
 		    )"$dest_mac:"$(               : ETH daddr
 		    )"$(mac_get w2):"$(           : ETH saddr
 		    )"08:00:"$(                   : ETH type
-		    )"45:"$(                      : IP version + IHL
-		    )"$inner_tos:"$(              : IP TOS
-		    )"00:54:"$(                   : IP total length
-		    )"99:83:"$(                   : IP identification
-		    )"40:00:"$(                   : IP flags + frag off
-		    )"40:"$(                      : IP TTL
-		    )"01:"$(                      : IP proto
-		    )"00:00:"$(                   : IP header csum
-		    )"c0:00:02:03:"$(             : IP saddr: 192.0.2.3
-		    )"c0:00:02:01:"$(             : IP daddr: 192.0.2.1
+		    )"$ipv4hdr:"$(                : IPv4 header
 		    )"08:"$(                      : ICMP type
 		    )"00:"$(                      : ICMP code
 		    )"8b:f2:"$(                   : ICMP csum
-- 
cgit v1.2.3


From ce9f6aec0fb780dafc1dfc5f47c688422aff464a Mon Sep 17 00:00:00 2001
From: Aleksei Oladko <aleksey.oladko@virtuozzo.com>
Date: Fri, 13 Feb 2026 13:19:06 +0000
Subject: selftests: forwarding: vxlan_bridge_1d_ipv6: fix test failure with
 br_netfilter enabled
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The test generates VXLAN traffic using mausezahn, where the encapsulated
inner IPv6 packet has an incorrect payload length set in the IPv6 header.
After VXLAN decapsulation, such packets do not pass sanity checks in
br_netfilter and are dropped, which causes the test to fail.

Fix this by setting the correct IPv6 payload length for the encapsulated
packet generated by mausezahn, so that the packet is accepted
by br_netfilter.

tools/testing/selftests/net/forwarding/vxlan_bridge_1d_ipv6.sh
lines 698-706

              )"00:03:"$(           : Payload length
              )"3a:"$(              : Next header
              )"04:"$(              : Hop limit
              )"$saddr:"$(          : IP saddr
              )"$daddr:"$(          : IP daddr
              )"80:"$(              : ICMPv6.type
              )"00:"$(              : ICMPv6.code
              )"00:"$(              : ICMPv6.checksum
              )

Data after IPv6 header:
• 80: — 1 byte (ICMPv6 type)
• 00: — 1 byte (ICMPv6 code)
• 00: — 1 byte (ICMPv6 checksum, truncated)

Total: 3 bytes → 00:03 is correct. The old value 00:08 did not match
the actual payload size.

Fixes: b07e9957f220 ("selftests: forwarding: Add VxLAN tests with a VLAN-unaware bridge for IPv6")
Signed-off-by: Aleksei Oladko <aleksey.oladko@virtuozzo.com>
Reviewed-by: Ido Schimmel <idosch@nvidia.com>
Link: https://patch.msgid.link/20260213131907.43351-3-aleksey.oladko@virtuozzo.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 tools/testing/selftests/net/forwarding/vxlan_bridge_1d_ipv6.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/forwarding/vxlan_bridge_1d_ipv6.sh b/tools/testing/selftests/net/forwarding/vxlan_bridge_1d_ipv6.sh
index a603f7b0a08f..e642feeada0e 100755
--- a/tools/testing/selftests/net/forwarding/vxlan_bridge_1d_ipv6.sh
+++ b/tools/testing/selftests/net/forwarding/vxlan_bridge_1d_ipv6.sh
@@ -695,7 +695,7 @@ vxlan_encapped_ping_do()
 		    )"6"$(			  : IP version
 		    )"$inner_tos"$(               : Traffic class
 		    )"0:00:00:"$(                 : Flow label
-		    )"00:08:"$(                   : Payload length
+		    )"00:03:"$(                   : Payload length
 		    )"3a:"$(                      : Next header
 		    )"04:"$(                      : Hop limit
 		    )"$saddr:"$(		  : IP saddr
-- 
cgit v1.2.3


From a8c198d16c64cdf57f481a4cd3e769502802369e Mon Sep 17 00:00:00 2001
From: Aleksei Oladko <aleksey.oladko@virtuozzo.com>
Date: Fri, 13 Feb 2026 13:19:07 +0000
Subject: selftests: forwarding: fix pedit tests failure with br_netfilter
 enabled

The tests use the tc pedit action to modify the IPv4 source address
("pedit ex munge ip src set"), but the IP header checksum is not
recalculated after the modification. As a result, the modified packet
fails sanity checks in br_netfilter after bridging and is dropped,
which causes the test to fail.

Fix this by ensuring net.bridge.bridge-nf-call-iptables is set to 0
during the test execution. This prevents the bridge from passing
L2 traffic to netfilter, bypassing the checksum validation that
causes the test failure.

Fixes: 92ad3828944e ("selftests: forwarding: Add a test for pedit munge SIP and DIP")
Fixes: 226657ba2389 ("selftests: forwarding: Add a forwarding test for pedit munge dsfield")
Signed-off-by: Aleksei Oladko <aleksey.oladko@virtuozzo.com>
Reviewed-by: Ido Schimmel <idosch@nvidia.com>
Link: https://patch.msgid.link/20260213131907.43351-4-aleksey.oladko@virtuozzo.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 tools/testing/selftests/net/forwarding/pedit_dsfield.sh | 8 ++++++++
 tools/testing/selftests/net/forwarding/pedit_ip.sh      | 8 ++++++++
 2 files changed, 16 insertions(+)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/forwarding/pedit_dsfield.sh b/tools/testing/selftests/net/forwarding/pedit_dsfield.sh
index af008fbf2725..eb2d8034de9c 100755
--- a/tools/testing/selftests/net/forwarding/pedit_dsfield.sh
+++ b/tools/testing/selftests/net/forwarding/pedit_dsfield.sh
@@ -98,12 +98,20 @@ setup_prepare()
 	h1_create
 	h2_create
 	switch_create
+
+	if [ -f /proc/sys/net/bridge/bridge-nf-call-iptables ]; then
+		sysctl_set net.bridge.bridge-nf-call-iptables 0
+	fi
 }
 
 cleanup()
 {
 	pre_cleanup
 
+	if [ -f /proc/sys/net/bridge/bridge-nf-call-iptables ]; then
+		sysctl_restore net.bridge.bridge-nf-call-iptables
+	fi
+
 	switch_destroy
 	h2_destroy
 	h1_destroy
diff --git a/tools/testing/selftests/net/forwarding/pedit_ip.sh b/tools/testing/selftests/net/forwarding/pedit_ip.sh
index d14efb2d23b2..9235674627ab 100755
--- a/tools/testing/selftests/net/forwarding/pedit_ip.sh
+++ b/tools/testing/selftests/net/forwarding/pedit_ip.sh
@@ -91,12 +91,20 @@ setup_prepare()
 	h1_create
 	h2_create
 	switch_create
+
+	if [ -f /proc/sys/net/bridge/bridge-nf-call-iptables ]; then
+		sysctl_set net.bridge.bridge-nf-call-iptables 0
+	fi
 }
 
 cleanup()
 {
 	pre_cleanup
 
+	if [ -f /proc/sys/net/bridge/bridge-nf-call-iptables ]; then
+		sysctl_restore net.bridge.bridge-nf-call-iptables
+	fi
+
 	switch_destroy
 	h2_destroy
 	h1_destroy
-- 
cgit v1.2.3


From 7cefbb47ccb2ded187f72db17a46f19d7cf4bf08 Mon Sep 17 00:00:00 2001
From: Emil Tsalapatis <emil@etsalapatis.com>
Date: Tue, 17 Feb 2026 15:43:44 -0500
Subject: libbpf: Do not use PROG_TYPE_TRACEPOINT program for feature gating

Commit 728ff167910e uses a PROG_TYPE_TRACEPOINT BPF test program to
check whether the running kernel supports large LDIMM64 offsets. The
feature gate incorrectly assumes that the program will fail at
verification time with one of two messages, depending on whether the
feature is supported by the running kernel. However,
PROG_TYPE_TRACEPOINT programs may fail to load before verification even
starts, e.g., if the shell does not have the appropriate capabilities.
Use a BPF_PROG_TYPE_SOCKET_FILTER program for the feature gate instead.

Also fix two minor issues. First, ensure the log buffer for the test is
initialized: Failing program load before verification led to libbpf dumping
uninitialized data to stdout. Also, ensure that close() is only called
for program_fd in the probe if the program load actually succeeded. The
call was currently failing silently with -EBADF most of the time.

Fixes: 728ff167910e ("libbpf: Add gating for arena globals relocation feature")
Reported-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Emil Tsalapatis <emil@etsalapatis.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20260217204345.548648-2-emil@etsalapatis.com
---
 tools/lib/bpf/features.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/features.c b/tools/lib/bpf/features.c
index b65ab109e3ff..2fa434f09cce 100644
--- a/tools/lib/bpf/features.c
+++ b/tools/lib/bpf/features.c
@@ -536,14 +536,15 @@ static int probe_ldimm64_full_range_off(int token_fd)
 	}
 	insns[0].imm = map_fd;
 
-	prog_fd = bpf_prog_load(BPF_PROG_TYPE_TRACEPOINT, "global_reloc", "GPL", insns, insn_cnt, &prog_opts);
+	log_buf[0] = '\0';
+	prog_fd = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, "global_reloc", "GPL", insns, insn_cnt, &prog_opts);
 	ret = -errno;
 
 	close(map_fd);
-	close(prog_fd);
 
 	if (prog_fd >= 0) {
 		pr_warn("Error in %s(): Program loading unexpectedly succeeded.\n", __func__);
+		close(prog_fd);
 		return -EINVAL;
 	}
 
-- 
cgit v1.2.3


From d7988720ef3ea5926f1b886b27eddf08abbadba0 Mon Sep 17 00:00:00 2001
From: Emil Tsalapatis <emil@etsalapatis.com>
Date: Tue, 17 Feb 2026 15:43:45 -0500
Subject: libbpf: Delay feature gate check until object prepare time

Commit 728ff167910e ("libbpf: Add gating for arena globals relocation feature")
adds a feature gate check that loads a map and BPF program to
test the running kernel supports large direct offsets for LDIMM64
instructions. This check is currently used to calculate arena symbol
offsets during bpf_object__collect_relos, itself called by
bpf_object_open.

However, the program calling bpf_object_open may not have the permissions to
load maps and programs. This is the case with the BPF selftests, where
bpftool is invoked at compilation time during skeleton generation. This
causes errors as the feature gate unexpectedly fails with -EPERM.

Avoid this by moving all the use of the FEAT_LDIMM64_FULL_RANGE_OFF feature gate
to BPF object preparation time instead.

Fixes: 728ff167910e ("libbpf: Add gating for arena globals relocation feature")
Signed-off-by: Emil Tsalapatis <emil@etsalapatis.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20260217204345.548648-3-emil@etsalapatis.com
---
 tools/lib/bpf/libbpf.c | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 93e59ed8d9a1..0be7017800fe 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -3009,12 +3009,6 @@ static int init_arena_map_data(struct bpf_object *obj, struct bpf_map *map,
 	memcpy(obj->arena_data, data, data_sz);
 	obj->arena_data_sz = data_sz;
 
-	/* place globals at the end of the arena (if supported) */
-	if (kernel_supports(obj, FEAT_LDIMM64_FULL_RANGE_OFF))
-		obj->arena_data_off = mmap_sz - data_alloc_sz;
-	else
-		obj->arena_data_off = 0;
-
 	/* make bpf_map__init_value() work for ARENA maps */
 	map->mmaped = obj->arena_data;
 
@@ -4672,7 +4666,7 @@ static int bpf_program__record_reloc(struct bpf_program *prog,
 		reloc_desc->type = RELO_DATA;
 		reloc_desc->insn_idx = insn_idx;
 		reloc_desc->map_idx = obj->arena_map_idx;
-		reloc_desc->sym_off = sym->st_value + obj->arena_data_off;
+		reloc_desc->sym_off = sym->st_value;
 
 		map = &obj->maps[obj->arena_map_idx];
 		pr_debug("prog '%s': found arena map %d (%s, sec %d, off %zu) for insn %u\n",
@@ -6386,6 +6380,10 @@ bpf_object__relocate_data(struct bpf_object *obj, struct bpf_program *prog)
 		case RELO_DATA:
 			map = &obj->maps[relo->map_idx];
 			insn[1].imm = insn[0].imm + relo->sym_off;
+
+			if (relo->map_idx == obj->arena_map_idx)
+				insn[1].imm += obj->arena_data_off;
+
 			if (obj->gen_loader) {
 				insn[0].src_reg = BPF_PSEUDO_MAP_IDX_VALUE;
 				insn[0].imm = relo->map_idx;
@@ -7387,6 +7385,14 @@ static int bpf_object__relocate(struct bpf_object *obj, const char *targ_btf_pat
 		bpf_object__sort_relos(obj);
 	}
 
+	/* place globals at the end of the arena (if supported) */
+	if (obj->arena_map_idx >= 0 && kernel_supports(obj, FEAT_LDIMM64_FULL_RANGE_OFF)) {
+		struct bpf_map *arena_map = &obj->maps[obj->arena_map_idx];
+
+		obj->arena_data_off = bpf_map_mmap_sz(arena_map) -
+				      roundup(obj->arena_data_sz, sysconf(_SC_PAGE_SIZE));
+	}
+
 	/* Before relocating calls pre-process relocations and mark
 	 * few ld_imm64 instructions that points to subprogs.
 	 * Otherwise bpf_object__reloc_code() later would have to consider
-- 
cgit v1.2.3


From 3b39d73cc3379360a33eb583b17f21fe55e1288e Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Tue, 17 Feb 2026 11:41:50 -0800
Subject: bpftool: Fix truncated netlink dumps

Netlink requires that the recv buffer used during dumps is at least
min(PAGE_SIZE, 8k) (see the man page). Otherwise the messages will
get truncated. Make sure bpftool follows this requirement, avoid
missing information on systems with large pages.

Acked-by: Quentin Monnet <qmo@kernel.org>
Fixes: 7084566a236f ("tools/bpftool: Remove libbpf_internal.h usage in bpftool")
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Link: https://lore.kernel.org/r/20260217194150.734701-1-kuba@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/bpf/bpftool/net.c | 5 ++++-
 tools/lib/bpf/netlink.c | 4 +++-
 2 files changed, 7 insertions(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/bpf/bpftool/net.c b/tools/bpf/bpftool/net.c
index f25d66c8395e..974189da8a91 100644
--- a/tools/bpf/bpftool/net.c
+++ b/tools/bpf/bpftool/net.c
@@ -156,7 +156,7 @@ static int netlink_recv(int sock, __u32 nl_pid, __u32 seq,
 	bool multipart = true;
 	struct nlmsgerr *err;
 	struct nlmsghdr *nh;
-	char buf[4096];
+	char buf[8192];
 	int len, ret;
 
 	while (multipart) {
@@ -201,6 +201,9 @@ static int netlink_recv(int sock, __u32 nl_pid, __u32 seq,
 					return ret;
 			}
 		}
+
+		if (len)
+			p_err("Invalid message or trailing data in Netlink response: %d bytes left", len);
 	}
 	ret = 0;
 done:
diff --git a/tools/lib/bpf/netlink.c b/tools/lib/bpf/netlink.c
index c997e69d507f..c9a78fb16f11 100644
--- a/tools/lib/bpf/netlink.c
+++ b/tools/lib/bpf/netlink.c
@@ -143,7 +143,7 @@ static int libbpf_netlink_recv(int sock, __u32 nl_pid, int seq,
 	struct nlmsghdr *nh;
 	int len, ret;
 
-	ret = alloc_iov(&iov, 4096);
+	ret = alloc_iov(&iov, 8192);
 	if (ret)
 		goto done;
 
@@ -212,6 +212,8 @@ start:
 				}
 			}
 		}
+		if (len)
+			pr_warn("Invalid message or trailing data in Netlink response: %d bytes left\n", len);
 	}
 	ret = 0;
 done:
-- 
cgit v1.2.3


From 32b70e62034aa72f8414ad4e9122cce7ad418c48 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Fri, 13 Feb 2026 19:51:59 -0800
Subject: selftests: tc_actions: don't dump 2MB of \0 to stdout

Since we started running selftests in NIPA we have been seeing
tc_actions.sh generate a soft lockup warning on ~20% of the runs.
On the pre-netdev foundation setup it was actually a missed irq
splat from the console. Now it's either that or a lockup.

I initially suspected a socket locking issue since the test
is exercising local loopback with act_mirred.
After hours of staring at this I noticed in strace that ncat
when -o $file is specified _both_ saves the output to the file
and still prints it to stdout. Because the file being sent
is constructed with:

  dd conv=sparse status=none if=/dev/zero bs=1M count=2 of=$mirred
                                ^^^^^^^^^

the data printed is all \0. Most terminals don't display nul
characters (and neither does vng output capture save them).
But QEMU's serial console still has to poke them thru which
is very slow and causes the lockup (if the file is >600kB).

Replace the '-o $file' with '> $file'. This speeds the test up
from 2m20s to 18s on debug kernels, and prevents the warnings.

Fixes: ca22da2fbd69 ("act_mirred: use the backlog for nested calls to mirred ingress")
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://patch.msgid.link/20260214035159.2119699-1-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/forwarding/tc_actions.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/forwarding/tc_actions.sh b/tools/testing/selftests/net/forwarding/tc_actions.sh
index ea89e558672d..86edbc7e2489 100755
--- a/tools/testing/selftests/net/forwarding/tc_actions.sh
+++ b/tools/testing/selftests/net/forwarding/tc_actions.sh
@@ -223,7 +223,7 @@ mirred_egress_to_ingress_tcp_test()
 		ip_proto icmp \
 			action drop
 
-	ip vrf exec v$h1 ncat --recv-only -w10 -l -p 12345 -o $mirred_e2i_tf2 &
+	ip vrf exec v$h1 ncat --recv-only -w10 -l -p 12345 > $mirred_e2i_tf2 &
 	local rpid=$!
 	ip vrf exec v$h1 ncat -w1 --send-only 192.0.2.2 12345 <$mirred_e2i_tf1
 	wait -n $rpid
-- 
cgit v1.2.3


From ef0e60083f768b32cda17b1b0ca9519405db89a6 Mon Sep 17 00:00:00 2001
From: Len Brown <len.brown@intel.com>
Date: Tue, 17 Feb 2026 20:31:51 -0600
Subject: tools/power turbostat: Fix AMD RAPL regression

turbostat.c:8688: rapl_perf_init: Assertion `next_domain < num_domains' failed.

Two recent cleanup patches that were not supposed to change anything
broke the core_id code needed for AMD RAPL initialization:

commit 070e92361eec ("tools/power turbostat: Enhance HT enumeration")
commit ddf60e38ca04 ("tools/power turbostat: Simplify global core_id calculation")

Reported-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 tools/power/x86/turbostat/turbostat.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'tools')

diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c
index 1aace9b3269e..1a2671c28209 100644
--- a/tools/power/x86/turbostat/turbostat.c
+++ b/tools/power/x86/turbostat/turbostat.c
@@ -5164,7 +5164,7 @@ static inline int get_rapl_domain_id(int cpu)
 	if (!platform->has_per_core_rapl)
 		return cpus[cpu].package_id;
 
-	return GLOBAL_CORE_ID(cpu, cpus[cpu].package_id);
+	return GLOBAL_CORE_ID(cpus[cpu].core_id, cpus[cpu].package_id);
 }
 
 /*
@@ -9633,7 +9633,6 @@ void topology_probe(bool startup)
 	}
 	topo.max_core_id = max_core_id;	/* within a package */
 	topo.max_package_id = max_package_id;
-	topo.num_cores = (max_core_id + 1) * topo.num_packages;	/* per system */
 
 	topo.cores_per_node = max_core_id + 1;
 	if (debug > 1)
-- 
cgit v1.2.3


From 55a24d9203979d1cd0196ba1d189860e8b828c2e Mon Sep 17 00:00:00 2001
From: David Carlier <devnexen@gmail.com>
Date: Tue, 17 Feb 2026 19:48:00 +0000
Subject: tools/sched_ext: scx_central: fix CPU_SET and skeleton leak on early
 exit

Use CPU_SET_S() instead of CPU_SET() on the dynamically allocated
cpuset to avoid a potential out-of-bounds write when nr_cpu_ids
exceeds CPU_SETSIZE.

Also destroy the skeleton before returning on invalid central CPU ID
to prevent a resource leak.

Signed-off-by: David Carlier <devnexen@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 tools/sched_ext/scx_central.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/sched_ext/scx_central.c b/tools/sched_ext/scx_central.c
index a6dfd45de70c..39f21b00a208 100644
--- a/tools/sched_ext/scx_central.c
+++ b/tools/sched_ext/scx_central.c
@@ -74,6 +74,7 @@ restart:
 			u32 central_cpu = strtoul(optarg, NULL, 0);
 			if (central_cpu >= skel->rodata->nr_cpu_ids) {
 				fprintf(stderr, "invalid central CPU id value, %u given (%u max)\n", central_cpu, skel->rodata->nr_cpu_ids);
+				scx_central__destroy(skel);
 				return -1;
 			}
 			skel->rodata->central_cpu = (s32)central_cpu;
@@ -109,7 +110,7 @@ restart:
 	SCX_BUG_ON(!cpuset, "Failed to allocate cpuset");
 	cpuset_size = CPU_ALLOC_SIZE(skel->rodata->nr_cpu_ids);
 	CPU_ZERO_S(cpuset_size, cpuset);
-	CPU_SET(skel->rodata->central_cpu, cpuset);
+	CPU_SET_S(skel->rodata->central_cpu, cpuset_size, cpuset);
 	SCX_BUG_ON(sched_setaffinity(0, cpuset_size, cpuset),
 		   "Failed to affinitize to central CPU %d (max %d)",
 		   skel->rodata->central_cpu, skel->rodata->nr_cpu_ids - 1);
-- 
cgit v1.2.3


From 625be3456b3ced6e2dca6166962c0cf6cc2e546d Mon Sep 17 00:00:00 2001
From: David Carlier <devnexen@gmail.com>
Date: Tue, 17 Feb 2026 20:08:36 +0000
Subject: tools/sched_ext: scx_pair: fix stride == 0 crash on single-CPU
 systems

nr_cpu_ids / 2 produces stride 0 on a single-CPU system, which later
causes SCX_BUG_ON(i == j) to fire. Validate stride after option
parsing to also catch invalid user-supplied values via -S.

Signed-off-by: David Carlier <devnexen@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 tools/sched_ext/scx_pair.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/sched_ext/scx_pair.c b/tools/sched_ext/scx_pair.c
index d3e97faa6334..2a82d8a8a0aa 100644
--- a/tools/sched_ext/scx_pair.c
+++ b/tools/sched_ext/scx_pair.c
@@ -56,7 +56,6 @@ restart:
 	skel = SCX_OPS_OPEN(pair_ops, scx_pair);
 
 	skel->rodata->nr_cpu_ids = libbpf_num_possible_cpus();
-	assert(skel->rodata->nr_cpu_ids > 0);
 	skel->rodata->pair_batch_dur_ns = __COMPAT_ENUM_OR_ZERO("scx_public_consts", "SCX_SLICE_DFL");
 
 	/* pair up the earlier half to the latter by default, override with -s */
@@ -76,6 +75,12 @@ restart:
 		}
 	}
 
+	/* Stride must be positive to pair distinct CPUs. */
+	if (stride <= 0) {
+		fprintf(stderr, "Invalid stride %d, must be positive\n", stride);
+		scx_pair__destroy(skel);
+		return -1;
+	}
 	bpf_map__set_max_entries(skel->maps.pair_ctx, skel->rodata->nr_cpu_ids / 2);
 
 	/* Resize arrays so their element count is equal to cpu count. */
-- 
cgit v1.2.3


From b3dfa128f7da7c4dd371a4aff685cd249604e029 Mon Sep 17 00:00:00 2001
From: Ihor Solodrai <ihor.solodrai@linux.dev>
Date: Wed, 18 Feb 2026 13:56:50 -0800
Subject: selftests/bpf: Use vmlinux.h in test_xdp_meta

- Replace linux/* includes with vmlinux.h
- Include errno.h
- Include bpf_tracing_net.h for TC_ACT_* and ETH_*
- Use BPF_STDERR instead of BPF_STREAM_STDERR

Signed-off-by: Ihor Solodrai <ihor.solodrai@linux.dev>
Link: https://lore.kernel.org/r/20260218215651.2057673-2-ihor.solodrai@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/progs/test_xdp_meta.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/progs/test_xdp_meta.c b/tools/testing/selftests/bpf/progs/test_xdp_meta.c
index 0a0f371a2dec..fa73b17cb999 100644
--- a/tools/testing/selftests/bpf/progs/test_xdp_meta.c
+++ b/tools/testing/selftests/bpf/progs/test_xdp_meta.c
@@ -1,12 +1,12 @@
-#include <stdbool.h>
-#include <linux/bpf.h>
-#include <linux/errno.h>
-#include <linux/if_ether.h>
-#include <linux/pkt_cls.h>
+// SPDX-License-Identifier: GPL-2.0
+#include <vmlinux.h>
 
 #include <bpf/bpf_endian.h>
 #include <bpf/bpf_helpers.h>
+#include <errno.h>
+
 #include "bpf_kfuncs.h"
+#include "bpf_tracing_net.h"
 
 #define META_SIZE 32
 
@@ -42,7 +42,7 @@ static bool check_metadata(const char *file, int line, __u8 *meta_have)
 	if (!__builtin_memcmp(meta_have, meta_want, META_SIZE))
 		return true;
 
-	bpf_stream_printk(BPF_STREAM_STDERR,
+	bpf_stream_printk(BPF_STDERR,
 			  "FAIL:%s:%d: metadata mismatch\n"
 			  "  have:\n    %pI6\n    %pI6\n"
 			  "  want:\n    %pI6\n    %pI6\n",
-- 
cgit v1.2.3


From 0cecd492f5165d3e7a314b87e9b7787734eab324 Mon Sep 17 00:00:00 2001
From: Ihor Solodrai <ihor.solodrai@linux.dev>
Date: Wed, 18 Feb 2026 13:56:51 -0800
Subject: libbpf: Remove extern declaration of bpf_stream_vprintk()

An issue was reported that building BPF program which includes both
vmlinux.h and bpf_helpers.h from libbpf fails due to conflicting
declarations of bpf_stream_vprintk().

Remove the extern declaration from bpf_helpers.h to address this.

In order to use bpf_stream_printk() macro, BPF programs are expected
to either include vmlinux.h of the kernel they are targeting, or add
their own extern declaration.

Reported-by: Luca Boccassi <luca.boccassi@gmail.com>
Closes: https://github.com/libbpf/libbpf/issues/947
Signed-off-by: Ihor Solodrai <ihor.solodrai@linux.dev>
Link: https://lore.kernel.org/r/20260218215651.2057673-3-ihor.solodrai@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/lib/bpf/bpf_helpers.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'tools')

diff --git a/tools/lib/bpf/bpf_helpers.h b/tools/lib/bpf/bpf_helpers.h
index c145da05a67c..9d160b5b9c0e 100644
--- a/tools/lib/bpf/bpf_helpers.h
+++ b/tools/lib/bpf/bpf_helpers.h
@@ -315,9 +315,6 @@ enum libbpf_tristate {
 			  ___param, sizeof(___param));		\
 })
 
-extern int bpf_stream_vprintk(int stream_id, const char *fmt__str, const void *args,
-			      __u32 len__sz) __weak __ksym;
-
 #define bpf_stream_printk(stream_id, fmt, args...)					\
 ({											\
 	static const char ___fmt[] = fmt;						\
-- 
cgit v1.2.3


From 1e5c009126952f673ffa2427acbd69e57493f0d2 Mon Sep 17 00:00:00 2001
From: "Matthieu Baerts (NGI0)" <matttbe@kernel.org>
Date: Wed, 18 Feb 2026 13:01:44 +0100
Subject: selftests/bpf: Remove hexdump dependency

The verification signature header generation requires converting a
binary certificate to a C array. Previously this only worked with xxd,
and a switch to hexdump has been done in commit b640d556a2b3
("selftests/bpf: Remove xxd util dependency").

hexdump is a more common utility program, yet it might not be installed
by default. When it is not installed, BPF selftests build without
errors, but tests_progs is unusable: it exits with the 255 code and
without any error messages. When manually reproducing the issue, it is
not too hard to find out that the generated verification_cert.h file is
incorrect, but that's time consuming. When digging the BPF selftests
build logs, this line can be seen amongst thousands others, but ignored:

  /bin/sh: 2: hexdump: not found

Here, od is used instead of hexdump. od is coming from the coreutils
package, and this new od command produces the same output when using od
from GNU coreutils, uutils, and even busybox. This is more portable, and
it produces a similar results to what was done before with hexdump:
there is an extra comma at the end instead of trailing whitespaces,
but the C code is not impacted.

Fixes: b640d556a2b3 ("selftests/bpf: Remove xxd util dependency")
Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Tested-by: Alan Maguire <alan.maguire@oracle.com>
Link: https://lore.kernel.org/r/20260218-bpf-sft-hexdump-od-v2-1-2f9b3ee5ab86@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index c6bf4dfb1495..6776158f1f3e 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -723,7 +723,7 @@ $(VERIFICATION_CERT) $(PRIVATE_KEY): $(VERIFY_SIG_SETUP)
 # Generates a header with C array declaration, containing test_progs_verification_cert bytes
 $(VERIFY_SIG_HDR): $(VERIFICATION_CERT)
 	$(Q)(echo "unsigned char test_progs_verification_cert[] = {"; \
-	 hexdump -v -e '12/1 "  0x%02x," "\n"' $< | sed 's/0x  ,//g; $$s/,$$//'; \
+	 od -v -t 'xC' -w12 $< | sed 's/ \(\S\+\)/ 0x\1,/g;s/^\S\+/ /;$$d'; \
 	 echo "};"; \
 	 echo "unsigned int test_progs_verification_cert_len = $$(wc -c < $<);") > $@
 
-- 
cgit v1.2.3


From 570e4549f63293ca4973ef367ff02554f3e3dfc2 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 17 Feb 2026 14:29:24 +0000
Subject: selftests/net: packetdrill: add ipv4-mapped-ipv6 tests

Add ipv4-mapped-ipv6 case to ksft_runner.sh before
an upcoming TCP fix in this area.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Link: https://patch.msgid.link/20260217142924.1853498-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/packetdrill/ksft_runner.sh | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'tools')

diff --git a/tools/testing/selftests/net/packetdrill/ksft_runner.sh b/tools/testing/selftests/net/packetdrill/ksft_runner.sh
index b34e5cf0112e..0a97d5ae3469 100755
--- a/tools/testing/selftests/net/packetdrill/ksft_runner.sh
+++ b/tools/testing/selftests/net/packetdrill/ksft_runner.sh
@@ -13,6 +13,15 @@ declare -A ip_args=(
 		-D TFO_COOKIE_ZERO=b7c12350a90dc8f5
 		-D CMSG_LEVEL_IP=SOL_IP
 		-D CMSG_TYPE_RECVERR=IP_RECVERR"
+	[ipv4-mapped-ipv6]="--ip_version=ipv4-mapped-ipv6
+		--local_ip=192.168.0.1
+		--gateway_ip=192.168.0.1
+		--netmask_ip=255.255.0.0
+		--remote_ip=192.0.2.1
+		-D TFO_COOKIE=3021b9d889017eeb
+		-D TFO_COOKIE_ZERO=b7c12350a90dc8f5
+		-D CMSG_LEVEL_IP=SOL_IPV6
+		-D CMSG_TYPE_RECVERR=IPV6_RECVERR"
 	[ipv6]="--ip_version=ipv6
 		--mtu=1520
 		--local_ip=fd3d:0a0b:17d6::1
@@ -45,7 +54,7 @@ fi
 
 ip_versions=$(grep -E '^--ip_version=' $script | cut -d '=' -f 2)
 if [[ -z $ip_versions ]]; then
-	ip_versions="ipv4 ipv6"
+	ip_versions="ipv4 ipv6 ipv4-mapped-ipv6"
 elif [[ ! "$ip_versions" =~ ^ipv[46]$ ]]; then
 	ktap_exit_fail_msg "Too many or unsupported --ip_version: $ip_versions"
 	exit "$KSFT_FAIL"
-- 
cgit v1.2.3


From f892f9f99464bead942a75d2b00dda6be07de97f Mon Sep 17 00:00:00 2001
From: David Carlier <devnexen@gmail.com>
Date: Wed, 18 Feb 2026 19:22:23 +0000
Subject: tools/sched_ext: scx_userland: fix data races on shared counters

The stats thread reads nr_vruntime_enqueues, nr_vruntime_dispatches,
nr_vruntime_failed, and nr_curr_enqueued concurrently with the main
thread writing them, with no synchronization.

Use __atomic builtins with relaxed ordering for all accesses to these
counters to eliminate the data races.

Only display accuracy is affected, not scheduling correctness.

Signed-off-by: David Carlier <devnexen@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 tools/sched_ext/scx_userland.c | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

(limited to 'tools')

diff --git a/tools/sched_ext/scx_userland.c b/tools/sched_ext/scx_userland.c
index 504a80824f5c..3f2aba658b4a 100644
--- a/tools/sched_ext/scx_userland.c
+++ b/tools/sched_ext/scx_userland.c
@@ -157,9 +157,9 @@ static int dispatch_task(__s32 pid)
 
 	err = bpf_map_update_elem(dispatched_fd, NULL, &pid, 0);
 	if (err) {
-		nr_vruntime_failed++;
+		__atomic_add_fetch(&nr_vruntime_failed, 1, __ATOMIC_RELAXED);
 	} else {
-		nr_vruntime_dispatches++;
+		__atomic_add_fetch(&nr_vruntime_dispatches, 1, __ATOMIC_RELAXED);
 	}
 
 	return err;
@@ -202,8 +202,8 @@ static int vruntime_enqueue(const struct scx_userland_enqueued_task *bpf_task)
 		return ENOENT;
 
 	update_enqueued(curr, bpf_task);
-	nr_vruntime_enqueues++;
-	nr_curr_enqueued++;
+	__atomic_add_fetch(&nr_vruntime_enqueues, 1, __ATOMIC_RELAXED);
+	__atomic_add_fetch(&nr_curr_enqueued, 1, __ATOMIC_RELAXED);
 
 	/*
 	 * Enqueue the task in a vruntime-sorted list. A more optimal data
@@ -279,9 +279,9 @@ static void dispatch_batch(void)
 			LIST_INSERT_HEAD(&vruntime_head, task, entries);
 			break;
 		}
-		nr_curr_enqueued--;
+		__atomic_sub_fetch(&nr_curr_enqueued, 1, __ATOMIC_RELAXED);
 	}
-	skel->bss->nr_scheduled = nr_curr_enqueued;
+	skel->bss->nr_scheduled = __atomic_load_n(&nr_curr_enqueued, __ATOMIC_RELAXED);
 }
 
 static void *run_stats_printer(void *arg)
@@ -306,9 +306,9 @@ static void *run_stats_printer(void *arg)
 		printf("|-----------------------|\n");
 		printf("| VRUNTIME / USER       |\n");
 		printf("|-----------------------|\n");
-		printf("|  enq:      %10llu |\n", nr_vruntime_enqueues);
-		printf("|  disp:     %10llu |\n", nr_vruntime_dispatches);
-		printf("|  failed:   %10llu |\n", nr_vruntime_failed);
+		printf("|  enq:      %10llu |\n", __atomic_load_n(&nr_vruntime_enqueues, __ATOMIC_RELAXED));
+		printf("|  disp:     %10llu |\n", __atomic_load_n(&nr_vruntime_dispatches, __ATOMIC_RELAXED));
+		printf("|  failed:   %10llu |\n", __atomic_load_n(&nr_vruntime_failed, __ATOMIC_RELAXED));
 		printf("o-----------------------o\n");
 		printf("\n\n");
 		fflush(stdout);
@@ -376,10 +376,10 @@ static void bootstrap(char *comm)
 {
 	exit_req = 0;
 	min_vruntime = 0.0;
-	nr_vruntime_enqueues = 0;
-	nr_vruntime_dispatches = 0;
-	nr_vruntime_failed = 0;
-	nr_curr_enqueued = 0;
+	__atomic_store_n(&nr_vruntime_enqueues, 0, __ATOMIC_RELAXED);
+	__atomic_store_n(&nr_vruntime_dispatches, 0, __ATOMIC_RELAXED);
+	__atomic_store_n(&nr_vruntime_failed, 0, __ATOMIC_RELAXED);
+	__atomic_store_n(&nr_curr_enqueued, 0, __ATOMIC_RELAXED);
 	memset(tasks, 0, pid_max * sizeof(*tasks));
 	LIST_INIT(&vruntime_head);
 
-- 
cgit v1.2.3


From 640c9dc72f21f325700a4b0f839ad568ff21c697 Mon Sep 17 00:00:00 2001
From: David Carlier <devnexen@gmail.com>
Date: Wed, 18 Feb 2026 19:22:35 +0000
Subject: tools/sched_ext: fix getopt not re-parsed on restart

After goto restart, optind retains its advanced position from the
previous getopt loop, causing getopt() to immediately return -1.
This silently drops all command-line options on the restarted skeleton.

Reset optind to 1 at the restart label so options are re-parsed.

Affected schedulers: scx_simple, scx_central, scx_flatcg, scx_pair,
scx_sdt, scx_cpu0.

Signed-off-by: David Carlier <devnexen@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 tools/sched_ext/scx_central.c | 1 +
 tools/sched_ext/scx_cpu0.c    | 1 +
 tools/sched_ext/scx_flatcg.c  | 1 +
 tools/sched_ext/scx_pair.c    | 1 +
 tools/sched_ext/scx_sdt.c     | 1 +
 tools/sched_ext/scx_simple.c  | 1 +
 6 files changed, 6 insertions(+)

(limited to 'tools')

diff --git a/tools/sched_ext/scx_central.c b/tools/sched_ext/scx_central.c
index 39f21b00a208..2a805f1d6c8f 100644
--- a/tools/sched_ext/scx_central.c
+++ b/tools/sched_ext/scx_central.c
@@ -56,6 +56,7 @@ int main(int argc, char **argv)
 	signal(SIGINT, sigint_handler);
 	signal(SIGTERM, sigint_handler);
 restart:
+	optind = 1;
 	skel = SCX_OPS_OPEN(central_ops, scx_central);
 
 	skel->rodata->central_cpu = 0;
diff --git a/tools/sched_ext/scx_cpu0.c b/tools/sched_ext/scx_cpu0.c
index 1e4fa4ab8da9..a6fba9978b9c 100644
--- a/tools/sched_ext/scx_cpu0.c
+++ b/tools/sched_ext/scx_cpu0.c
@@ -69,6 +69,7 @@ int main(int argc, char **argv)
 	signal(SIGINT, sigint_handler);
 	signal(SIGTERM, sigint_handler);
 restart:
+	optind = 1;
 	skel = SCX_OPS_OPEN(cpu0_ops, scx_cpu0);
 
 	skel->rodata->nr_cpus = libbpf_num_possible_cpus();
diff --git a/tools/sched_ext/scx_flatcg.c b/tools/sched_ext/scx_flatcg.c
index a8446509949e..d865c381589b 100644
--- a/tools/sched_ext/scx_flatcg.c
+++ b/tools/sched_ext/scx_flatcg.c
@@ -141,6 +141,7 @@ int main(int argc, char **argv)
 	signal(SIGINT, sigint_handler);
 	signal(SIGTERM, sigint_handler);
 restart:
+	optind = 1;
 	skel = SCX_OPS_OPEN(flatcg_ops, scx_flatcg);
 
 	skel->rodata->nr_cpus = libbpf_num_possible_cpus();
diff --git a/tools/sched_ext/scx_pair.c b/tools/sched_ext/scx_pair.c
index 2a82d8a8a0aa..2e509391f3da 100644
--- a/tools/sched_ext/scx_pair.c
+++ b/tools/sched_ext/scx_pair.c
@@ -53,6 +53,7 @@ int main(int argc, char **argv)
 	signal(SIGINT, sigint_handler);
 	signal(SIGTERM, sigint_handler);
 restart:
+	optind = 1;
 	skel = SCX_OPS_OPEN(pair_ops, scx_pair);
 
 	skel->rodata->nr_cpu_ids = libbpf_num_possible_cpus();
diff --git a/tools/sched_ext/scx_sdt.c b/tools/sched_ext/scx_sdt.c
index b0363363476d..d8ca9aa316a5 100644
--- a/tools/sched_ext/scx_sdt.c
+++ b/tools/sched_ext/scx_sdt.c
@@ -51,6 +51,7 @@ int main(int argc, char **argv)
 	signal(SIGINT, sigint_handler);
 	signal(SIGTERM, sigint_handler);
 restart:
+	optind = 1;
 	skel = SCX_OPS_OPEN(sdt_ops, scx_sdt);
 
 	while ((opt = getopt(argc, argv, "fvh")) != -1) {
diff --git a/tools/sched_ext/scx_simple.c b/tools/sched_ext/scx_simple.c
index 06d4b13bf76b..c3b48611712b 100644
--- a/tools/sched_ext/scx_simple.c
+++ b/tools/sched_ext/scx_simple.c
@@ -71,6 +71,7 @@ int main(int argc, char **argv)
 	signal(SIGINT, sigint_handler);
 	signal(SIGTERM, sigint_handler);
 restart:
+	optind = 1;
 	skel = SCX_OPS_OPEN(simple_ops, scx_simple);
 
 	while ((opt = getopt(argc, argv, "fvh")) != -1) {
-- 
cgit v1.2.3