From ca45c84afb8c91a8d688b0012657099c24f59266 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@kernel.org>
Date: Wed, 3 Dec 2025 19:32:15 -0800
Subject: bpf: Add bpf_has_frame_pointer()

Introduce a bpf_has_frame_pointer() helper that unwinders can call to
determine whether a given instruction pointer is within the valid frame
pointer region of a BPF JIT program or trampoline (i.e., after the
prologue, before the epilogue).

This will enable livepatch (with the ORC unwinder) to reliably unwind
through BPF JIT frames.

Acked-by: Song Liu <song@kernel.org>
Acked-and-tested-by: Andrey Grodzovsky <andrey.grodzovsky@crowdstrike.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Link: https://lore.kernel.org/r/fd2bc5b4e261a680774b28f6100509fd5ebad2f0.1764818927.git.jpoimboe@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Reviewed-by: Jiri Olsa <jolsa@kernel.org>
---
 include/linux/bpf.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 6498be4c44f8..e5be698256d1 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1283,6 +1283,8 @@ struct bpf_ksym {
 	struct list_head	 lnode;
 	struct latch_tree_node	 tnode;
 	bool			 prog;
+	u32			 fp_start;
+	u32			 fp_end;
 };
 
 enum bpf_tramp_prog_type {
@@ -1511,6 +1513,7 @@ void bpf_image_ksym_add(struct bpf_ksym *ksym);
 void bpf_image_ksym_del(struct bpf_ksym *ksym);
 void bpf_ksym_add(struct bpf_ksym *ksym);
 void bpf_ksym_del(struct bpf_ksym *ksym);
+bool bpf_has_frame_pointer(unsigned long ip);
 int bpf_jit_charge_modmem(u32 size);
 void bpf_jit_uncharge_modmem(u32 size);
 bool bpf_prog_has_trampoline(const struct bpf_prog *prog);
-- 
cgit v1.2.3


From d9f514d3e6ee48c34d70d637479b4c9384832d4f Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Sun, 23 Nov 2025 22:51:23 +0000
Subject: block: move around bio flagging helpers

We'll need bio_flagged() earlier in bio.h for later patches, move it
together with all related helpers, and mark the bio_flagged()'s bio
argument as const.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/bio.h | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bio.h b/include/linux/bio.h
index ad2d57908c1c..c75a9b3672aa 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -46,6 +46,21 @@ static inline unsigned int bio_max_segs(unsigned int nr_segs)
 #define bio_data_dir(bio) \
 	(op_is_write(bio_op(bio)) ? WRITE : READ)
 
+static inline bool bio_flagged(const struct bio *bio, unsigned int bit)
+{
+	return bio->bi_flags & (1U << bit);
+}
+
+static inline void bio_set_flag(struct bio *bio, unsigned int bit)
+{
+	bio->bi_flags |= (1U << bit);
+}
+
+static inline void bio_clear_flag(struct bio *bio, unsigned int bit)
+{
+	bio->bi_flags &= ~(1U << bit);
+}
+
 /*
  * Check whether this bio carries any data or not. A NULL bio is allowed.
  */
@@ -225,21 +240,6 @@ static inline void bio_cnt_set(struct bio *bio, unsigned int count)
 	atomic_set(&bio->__bi_cnt, count);
 }
 
-static inline bool bio_flagged(struct bio *bio, unsigned int bit)
-{
-	return bio->bi_flags & (1U << bit);
-}
-
-static inline void bio_set_flag(struct bio *bio, unsigned int bit)
-{
-	bio->bi_flags |= (1U << bit);
-}
-
-static inline void bio_clear_flag(struct bio *bio, unsigned int bit)
-{
-	bio->bi_flags &= ~(1U << bit);
-}
-
 static inline struct bio_vec *bio_first_bvec_all(struct bio *bio)
 {
 	WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED));
-- 
cgit v1.2.3


From 0c01ea92f545ca7fcafdda6a8e29b65ef3a5ec74 Mon Sep 17 00:00:00 2001
From: Tal Zussman <tz2294@columbia.edu>
Date: Fri, 12 Dec 2025 04:08:08 -0500
Subject: mm: Remove tlb_flush_reason::NR_TLB_FLUSH_REASONS from
 <linux/mm_types.h>

This has been unused since it was added 11 years ago in:

  d17d8f9dedb9 ("x86/mm: Add tracepoints for TLB flushes")

Signed-off-by: Tal Zussman <tz2294@columbia.edu>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Rik van Riel <riel@surriel.com>
Acked-by: David Hildenbrand <david@redhat.com>
Link: https://patch.msgid.link/20251212-tlb-trace-fix-v2-2-d322e0ad9b69@columbia.edu
---
 include/linux/mm_types.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 9f6de068295d..42af2292951d 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -1631,7 +1631,6 @@ enum tlb_flush_reason {
 	TLB_LOCAL_MM_SHOOTDOWN,
 	TLB_REMOTE_SEND_IPI,
 	TLB_REMOTE_WRONG_CPU,
-	NR_TLB_FLUSH_REASONS,
 };
 
 /**
-- 
cgit v1.2.3


From e1b4c6a58304fd490124cc2b454d80edc786665c Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 13 Dec 2025 17:50:23 -0500
Subject: shmem: fix recovery on rename failures

maple_tree insertions can fail if we are seriously short on memory;
simple_offset_rename() does not recover well if it runs into that.
The same goes for simple_offset_rename_exchange().

Moreover, shmem_whiteout() expects that if it succeeds, the caller will
progress to d_move(), i.e. that shmem_rename2() won't fail past the
successful call of shmem_whiteout().

Not hard to fix, fortunately - mtree_store() can't fail if the index we
are trying to store into is already present in the tree as a singleton.

For simple_offset_rename_exchange() that's enough - we just need to be
careful about the order of operations.

For simple_offset_rename() solution is to preinsert the target into the
tree for new_dir; the rest can be done without any potentially failing
operations.

That preinsertion has to be done in shmem_rename2() rather than in
simple_offset_rename() itself - otherwise we'd need to deal with the
possibility of failure after successful shmem_whiteout().

Fixes: a2e459555c5f ("shmem: stable directory offsets")
Reviewed-by: Christian Brauner <brauner@kernel.org>
Reviewed-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/libfs.c         | 50 +++++++++++++++++++++-----------------------------
 include/linux/fs.h |  2 +-
 mm/shmem.c         | 18 +++++++++++++-----
 3 files changed, 35 insertions(+), 35 deletions(-)

(limited to 'include/linux')

diff --git a/fs/libfs.c b/fs/libfs.c
index 9264523be85c..591eb649ebba 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -346,22 +346,22 @@ void simple_offset_remove(struct offset_ctx *octx, struct dentry *dentry)
  * User space expects the directory offset value of the replaced
  * (new) directory entry to be unchanged after a rename.
  *
- * Returns zero on success, a negative errno value on failure.
+ * Caller must have grabbed a slot for new_dentry in the maple_tree
+ * associated with new_dir, even if dentry is negative.
  */
-int simple_offset_rename(struct inode *old_dir, struct dentry *old_dentry,
-			 struct inode *new_dir, struct dentry *new_dentry)
+void simple_offset_rename(struct inode *old_dir, struct dentry *old_dentry,
+			  struct inode *new_dir, struct dentry *new_dentry)
 {
 	struct offset_ctx *old_ctx = old_dir->i_op->get_offset_ctx(old_dir);
 	struct offset_ctx *new_ctx = new_dir->i_op->get_offset_ctx(new_dir);
 	long new_offset = dentry2offset(new_dentry);
 
-	simple_offset_remove(old_ctx, old_dentry);
+	if (WARN_ON(!new_offset))
+		return;
 
-	if (new_offset) {
-		offset_set(new_dentry, 0);
-		return simple_offset_replace(new_ctx, old_dentry, new_offset);
-	}
-	return simple_offset_add(new_ctx, old_dentry);
+	simple_offset_remove(old_ctx, old_dentry);
+	offset_set(new_dentry, 0);
+	WARN_ON(simple_offset_replace(new_ctx, old_dentry, new_offset));
 }
 
 /**
@@ -388,31 +388,23 @@ int simple_offset_rename_exchange(struct inode *old_dir,
 	long new_index = dentry2offset(new_dentry);
 	int ret;
 
-	simple_offset_remove(old_ctx, old_dentry);
-	simple_offset_remove(new_ctx, new_dentry);
+	if (WARN_ON(!old_index || !new_index))
+		return -EINVAL;
 
-	ret = simple_offset_replace(new_ctx, old_dentry, new_index);
-	if (ret)
-		goto out_restore;
+	ret = mtree_store(&new_ctx->mt, new_index, old_dentry, GFP_KERNEL);
+	if (WARN_ON(ret))
+		return ret;
 
-	ret = simple_offset_replace(old_ctx, new_dentry, old_index);
-	if (ret) {
-		simple_offset_remove(new_ctx, old_dentry);
-		goto out_restore;
+	ret = mtree_store(&old_ctx->mt, old_index, new_dentry, GFP_KERNEL);
+	if (WARN_ON(ret)) {
+		mtree_store(&new_ctx->mt, new_index, new_dentry, GFP_KERNEL);
+		return ret;
 	}
 
-	ret = simple_rename_exchange(old_dir, old_dentry, new_dir, new_dentry);
-	if (ret) {
-		simple_offset_remove(new_ctx, old_dentry);
-		simple_offset_remove(old_ctx, new_dentry);
-		goto out_restore;
-	}
+	offset_set(old_dentry, new_index);
+	offset_set(new_dentry, old_index);
+	simple_rename_exchange(old_dir, old_dentry, new_dir, new_dentry);
 	return 0;
-
-out_restore:
-	(void)simple_offset_replace(old_ctx, old_dentry, old_index);
-	(void)simple_offset_replace(new_ctx, new_dentry, new_index);
-	return ret;
 }
 
 /**
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 04ceeca12a0d..f5c9cf28c4dc 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3247,7 +3247,7 @@ struct offset_ctx {
 void simple_offset_init(struct offset_ctx *octx);
 int simple_offset_add(struct offset_ctx *octx, struct dentry *dentry);
 void simple_offset_remove(struct offset_ctx *octx, struct dentry *dentry);
-int simple_offset_rename(struct inode *old_dir, struct dentry *old_dentry,
+void simple_offset_rename(struct inode *old_dir, struct dentry *old_dentry,
 			 struct inode *new_dir, struct dentry *new_dentry);
 int simple_offset_rename_exchange(struct inode *old_dir,
 				  struct dentry *old_dentry,
diff --git a/mm/shmem.c b/mm/shmem.c
index d3edc809e2e7..a9666b0599a4 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -4038,6 +4038,7 @@ static int shmem_rename2(struct mnt_idmap *idmap,
 {
 	struct inode *inode = d_inode(old_dentry);
 	int they_are_dirs = S_ISDIR(inode->i_mode);
+	bool had_offset = false;
 	int error;
 
 	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
@@ -4050,16 +4051,23 @@ static int shmem_rename2(struct mnt_idmap *idmap,
 	if (!simple_empty(new_dentry))
 		return -ENOTEMPTY;
 
+	error = simple_offset_add(shmem_get_offset_ctx(new_dir), new_dentry);
+	if (error == -EBUSY)
+		had_offset = true;
+	else if (unlikely(error))
+		return error;
+
 	if (flags & RENAME_WHITEOUT) {
 		error = shmem_whiteout(idmap, old_dir, old_dentry);
-		if (error)
+		if (error) {
+			if (!had_offset)
+				simple_offset_remove(shmem_get_offset_ctx(new_dir),
+						     new_dentry);
 			return error;
+		}
 	}
 
-	error = simple_offset_rename(old_dir, old_dentry, new_dir, new_dentry);
-	if (error)
-		return error;
-
+	simple_offset_rename(old_dir, old_dentry, new_dir, new_dentry);
 	if (d_really_is_positive(new_dentry)) {
 		(void) shmem_unlink(new_dir, new_dentry);
 		if (they_are_dirs) {
-- 
cgit v1.2.3


From dcd0b625fe440d68bb4b97c71d18ca48ecd6e594 Mon Sep 17 00:00:00 2001
From: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Date: Wed, 17 Dec 2025 07:34:55 -0800
Subject: powercap: intel_rapl: Fix possible recursive lock warning

With the RAPL PMU addition, there is a recursive locking when CPU online
callback function calls rapl_package_add_pmu(). Here cpu_hotplug_lock
is already acquired by cpuhp_thread_fun() and rapl_package_add_pmu()
tries to acquire again.

<4>[ 8.197433] ============================================
<4>[ 8.197437] WARNING: possible recursive locking detected
<4>[ 8.197440] 6.19.0-rc1-lgci-xe-xe-4242-05b7c58b3367dca84+ #1 Not tainted
<4>[ 8.197444] --------------------------------------------
<4>[ 8.197447] cpuhp/0/20 is trying to acquire lock:
<4>[ 8.197450] ffffffff83487870 (cpu_hotplug_lock){++++}-{0:0}, at:
rapl_package_add_pmu+0x37/0x370 [intel_rapl_common]
<4>[ 8.197463]
but task is already holding lock:
<4>[ 8.197466] ffffffff83487870 (cpu_hotplug_lock){++++}-{0:0}, at:
cpuhp_thread_fun+0x6d/0x290
<4>[ 8.197477]
other info that might help us debug this:
<4>[ 8.197480] Possible unsafe locking scenario:

<4>[ 8.197483] CPU0
<4>[ 8.197485] ----
<4>[ 8.197487] lock(cpu_hotplug_lock);
<4>[ 8.197490] lock(cpu_hotplug_lock);
<4>[ 8.197493]
*** DEADLOCK ***
..
..
<4>[ 8.197542] __lock_acquire+0x146e/0x2790
<4>[ 8.197548] lock_acquire+0xc4/0x2c0
<4>[ 8.197550] ? rapl_package_add_pmu+0x37/0x370 [intel_rapl_common]
<4>[ 8.197556] cpus_read_lock+0x41/0x110
<4>[ 8.197558] ? rapl_package_add_pmu+0x37/0x370 [intel_rapl_common]
<4>[ 8.197561] rapl_package_add_pmu+0x37/0x370 [intel_rapl_common]
<4>[ 8.197565] rapl_cpu_online+0x85/0x87 [intel_rapl_msr]
<4>[ 8.197568] ? __pfx_rapl_cpu_online+0x10/0x10 [intel_rapl_msr]
<4>[ 8.197570] cpuhp_invoke_callback+0x41f/0x6c0
<4>[ 8.197573] ? cpuhp_thread_fun+0x6d/0x290
<4>[ 8.197575] cpuhp_thread_fun+0x1e2/0x290
<4>[ 8.197578] ? smpboot_thread_fn+0x26/0x290
<4>[ 8.197581] smpboot_thread_fn+0x12f/0x290
<4>[ 8.197584] ? __pfx_smpboot_thread_fn+0x10/0x10
<4>[ 8.197586] kthread+0x11f/0x250
<4>[ 8.197589] ? __pfx_kthread+0x10/0x10
<4>[ 8.197592] ret_from_fork+0x344/0x3a0
<4>[ 8.197595] ? __pfx_kthread+0x10/0x10
<4>[ 8.197597] ret_from_fork_asm+0x1a/0x30
<4>[ 8.197604] </TASK>

Fix this issue in the same way as rapl powercap package domain is added
from the same CPU online callback by introducing another interface which
doesn't call cpus_read_lock(). Add rapl_package_add_pmu_locked() and
rapl_package_remove_pmu_locked() which don't call cpus_read_lock().

Fixes: 748d6ba43afd ("powercap: intel_rapl: Enable MSR-based RAPL PMU support")
Reported-by: Borah, Chaitanya Kumar <chaitanya.kumar.borah@intel.com>
Closes: https://lore.kernel.org/linux-pm/5427ede1-57a0-43d1-99f3-8ca4b0643e82@intel.com/T/#u
Tested-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
Tested-by: RavitejaX Veesam <ravitejax.veesam@intel.com>
Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Link: https://patch.msgid.link/20251217153455.3560176-1-srinivas.pandruvada@linux.intel.com
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/powercap/intel_rapl_common.c | 24 ++++++++++++++++++------
 drivers/powercap/intel_rapl_msr.c    |  4 ++--
 include/linux/intel_rapl.h           |  4 ++++
 3 files changed, 24 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c
index b9d87e56cbbc..3ff6da3bf4e6 100644
--- a/drivers/powercap/intel_rapl_common.c
+++ b/drivers/powercap/intel_rapl_common.c
@@ -2032,7 +2032,7 @@ end:
 	return ret;
 }
 
-int rapl_package_add_pmu(struct rapl_package *rp)
+int rapl_package_add_pmu_locked(struct rapl_package *rp)
 {
 	struct rapl_package_pmu_data *data = &rp->pmu_data;
 	int idx;
@@ -2040,8 +2040,6 @@ int rapl_package_add_pmu(struct rapl_package *rp)
 	if (rp->has_pmu)
 		return -EEXIST;
 
-	guard(cpus_read_lock)();
-
 	for (idx = 0; idx < rp->nr_domains; idx++) {
 		struct rapl_domain *rd = &rp->domains[idx];
 		int domain = rd->id;
@@ -2091,17 +2089,23 @@ int rapl_package_add_pmu(struct rapl_package *rp)
 
 	return rapl_pmu_update(rp);
 }
+EXPORT_SYMBOL_GPL(rapl_package_add_pmu_locked);
+
+int rapl_package_add_pmu(struct rapl_package *rp)
+{
+	guard(cpus_read_lock)();
+
+	return rapl_package_add_pmu_locked(rp);
+}
 EXPORT_SYMBOL_GPL(rapl_package_add_pmu);
 
-void rapl_package_remove_pmu(struct rapl_package *rp)
+void rapl_package_remove_pmu_locked(struct rapl_package *rp)
 {
 	struct rapl_package *pos;
 
 	if (!rp->has_pmu)
 		return;
 
-	guard(cpus_read_lock)();
-
 	list_for_each_entry(pos, &rapl_packages, plist) {
 		/* PMU is still needed */
 		if (pos->has_pmu && pos != rp)
@@ -2111,6 +2115,14 @@ void rapl_package_remove_pmu(struct rapl_package *rp)
 	perf_pmu_unregister(&rapl_pmu.pmu);
 	memset(&rapl_pmu, 0, sizeof(struct rapl_pmu));
 }
+EXPORT_SYMBOL_GPL(rapl_package_remove_pmu_locked);
+
+void rapl_package_remove_pmu(struct rapl_package *rp)
+{
+	guard(cpus_read_lock)();
+
+	rapl_package_remove_pmu_locked(rp);
+}
 EXPORT_SYMBOL_GPL(rapl_package_remove_pmu);
 #endif
 
diff --git a/drivers/powercap/intel_rapl_msr.c b/drivers/powercap/intel_rapl_msr.c
index 0ce1096b6314..9a7e150b3536 100644
--- a/drivers/powercap/intel_rapl_msr.c
+++ b/drivers/powercap/intel_rapl_msr.c
@@ -82,7 +82,7 @@ static int rapl_cpu_online(unsigned int cpu)
 		if (IS_ERR(rp))
 			return PTR_ERR(rp);
 		if (rapl_msr_pmu)
-			rapl_package_add_pmu(rp);
+			rapl_package_add_pmu_locked(rp);
 	}
 	cpumask_set_cpu(cpu, &rp->cpumask);
 	return 0;
@@ -101,7 +101,7 @@ static int rapl_cpu_down_prep(unsigned int cpu)
 	lead_cpu = cpumask_first(&rp->cpumask);
 	if (lead_cpu >= nr_cpu_ids) {
 		if (rapl_msr_pmu)
-			rapl_package_remove_pmu(rp);
+			rapl_package_remove_pmu_locked(rp);
 		rapl_remove_package_cpuslocked(rp);
 	} else if (rp->lead_cpu == cpu) {
 		rp->lead_cpu = lead_cpu;
diff --git a/include/linux/intel_rapl.h b/include/linux/intel_rapl.h
index e9ade2ff4af6..f479ef5b3341 100644
--- a/include/linux/intel_rapl.h
+++ b/include/linux/intel_rapl.h
@@ -214,10 +214,14 @@ void rapl_remove_package(struct rapl_package *rp);
 
 #ifdef CONFIG_PERF_EVENTS
 int rapl_package_add_pmu(struct rapl_package *rp);
+int rapl_package_add_pmu_locked(struct rapl_package *rp);
 void rapl_package_remove_pmu(struct rapl_package *rp);
+void rapl_package_remove_pmu_locked(struct rapl_package *rp);
 #else
 static inline int rapl_package_add_pmu(struct rapl_package *rp) { return 0; }
+static inline int rapl_package_add_pmu_locked(struct rapl_package *rp) { return 0; }
 static inline void rapl_package_remove_pmu(struct rapl_package *rp) { }
+static inline void rapl_package_remove_pmu_locked(struct rapl_package *rp) { }
 #endif
 
 #endif /* __INTEL_RAPL_H__ */
-- 
cgit v1.2.3


From 4a824c3128998158a093eaadd776a79abe3a601a Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 4 Dec 2025 15:31:27 +0000
Subject: entry: Always inline local_irq_{enable,disable}_exit_to_user()

clang needs __always_inline instead of inline, even for tiny helpers.

This saves some cycles in system call fast path, and saves 195 bytes
on x86_64 build:

$ size vmlinux.before vmlinux.after
   text	   data	    bss	    dec	    hex	filename
34652814	22291961	5875180	62819955	3be8e73	vmlinux.before
34652619	22291961	5875180	62819760	3be8db0	vmlinux.after

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20251204153127.1321824-1-edumazet@google.com
---
 include/linux/irq-entry-common.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/irq-entry-common.h b/include/linux/irq-entry-common.h
index 6ab913e57da0..d26d1b1bcbfb 100644
--- a/include/linux/irq-entry-common.h
+++ b/include/linux/irq-entry-common.h
@@ -110,7 +110,7 @@ static __always_inline void enter_from_user_mode(struct pt_regs *regs)
 static inline void local_irq_enable_exit_to_user(unsigned long ti_work);
 
 #ifndef local_irq_enable_exit_to_user
-static inline void local_irq_enable_exit_to_user(unsigned long ti_work)
+static __always_inline void local_irq_enable_exit_to_user(unsigned long ti_work)
 {
 	local_irq_enable();
 }
@@ -125,7 +125,7 @@ static inline void local_irq_enable_exit_to_user(unsigned long ti_work)
 static inline void local_irq_disable_exit_to_user(void);
 
 #ifndef local_irq_disable_exit_to_user
-static inline void local_irq_disable_exit_to_user(void)
+static __always_inline void local_irq_disable_exit_to_user(void)
 {
 	local_irq_disable();
 }
-- 
cgit v1.2.3


From 4cc5373f2e749a6c96e8b9fa971931a4dd852860 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 19 Dec 2025 11:20:06 +0000
Subject: clang: work around asm output constraint problems

Work around clang problems with "=rm" asm constraint.

clang seems to always chose the memory output, while it is almost
always the worst choice.

Add ASM_OUTPUT_RM so that we can replace "=rm" constraint
where it matters for clang, while not penalizing gcc.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Suggested-by: Uros Bizjak <ubizjak@gmail.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/compiler-clang.h | 1 +
 include/linux/compiler_types.h | 3 ++-
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h
index 107ce05bd16e..7edf1a07b535 100644
--- a/include/linux/compiler-clang.h
+++ b/include/linux/compiler-clang.h
@@ -145,6 +145,7 @@
  */
 #define ASM_INPUT_G "ir"
 #define ASM_INPUT_RM "r"
+#define ASM_OUTPUT_RM "=r"
 
 /*
  * Declare compiler support for __typeof_unqual__() operator.
diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h
index 1280693766b9..d3318a3c2577 100644
--- a/include/linux/compiler_types.h
+++ b/include/linux/compiler_types.h
@@ -548,11 +548,12 @@ struct ftrace_likely_data {
 
 /*
  * Clang has trouble with constraints with multiple
- * alternative behaviors (mainly "g" and "rm").
+ * alternative behaviors ("g" , "rm" and "=rm").
  */
 #ifndef ASM_INPUT_G
   #define ASM_INPUT_G "g"
   #define ASM_INPUT_RM "rm"
+  #define ASM_OUTPUT_RM "=rm"
 #endif
 
 #ifdef CONFIG_CC_HAS_ASM_INLINE
-- 
cgit v1.2.3


From 87e7f6019097746d1d06f98874a9f179b7a68f3e Mon Sep 17 00:00:00 2001
From: Sakari Ailus <sakari.ailus@linux.intel.com>
Date: Fri, 19 Dec 2025 10:36:38 +0200
Subject: software node: Also support referencing non-constant software nodes

Fwnode references are be implemented differently if referenced node is a
software node. _Generic() is used to differentiate between the two cases
but only const software nodes were present in the selection. Also add
non-const software nodes.

Reported-by: Kenneth Crudup <kenny@panix.com>
Closes: https://lore.kernel.org/all/af773b82-bef2-4209-baaf-526d4661b7fc@panix.com/
Fixes: d7cdbbc93c56 ("software node: allow referencing firmware nodes")
Signed-off-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Tested-By: Kenneth R. Crudup <kenny@panix.com>
Tested-by: Mehdi Djait <mehdi.djait@linux.intel.com> # Dell XPS 9315
Reviewed-by: Mehdi Djait <mehdi.djait@linux.intel.com>
Link: https://patch.msgid.link/20251219083638.2454138-1-sakari.ailus@linux.intel.com
Signed-off-by: Danilo Krummrich <dakr@kernel.org>
---
 include/linux/property.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/property.h b/include/linux/property.h
index 272bfbdea7bf..e30ef23a9af3 100644
--- a/include/linux/property.h
+++ b/include/linux/property.h
@@ -371,6 +371,7 @@ struct software_node_ref_args {
 (const struct software_node_ref_args) {				\
 	.swnode = _Generic(_ref_,				\
 			   const struct software_node *: _ref_,	\
+			   struct software_node *: _ref_,	\
 			   default: NULL),			\
 	.fwnode = _Generic(_ref_,				\
 			   struct fwnode_handle *: _ref_,	\
-- 
cgit v1.2.3


From 20e20b147cf7cb6780a5b95da2a0e37c52cd1015 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Mon, 15 Dec 2025 22:38:00 -0800
Subject: platform/x86/intel/vsec: correct kernel-doc comments
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix kernel-doc warnings in intel_vsec.h to eliminate all kernel-doc
warnings:

Warning: include/linux/intel_vsec.h:92 struct member 'read_telem' not
 described in 'pmt_callbacks'
Warning: include/linux/intel_vsec.h:146 expecting prototype for struct
 intel_sec_device.  Prototype was for struct intel_vsec_device instead
Warning: include/linux/intel_vsec.h:146 struct member 'priv_data_size'
 not described in 'intel_vsec_device'

In struct pmt_callbacks, correct the kernel-doc for @read_telem.
kernel-doc doesn't support documenting callback function parameters,
so drop the '@' signs on those and use "* *" to make them somewhat
readable in the produced documentation output.

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Link: https://patch.msgid.link/20251216063801.2896495-1-rdunlap@infradead.org
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
---
 include/linux/intel_vsec.h | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/intel_vsec.h b/include/linux/intel_vsec.h
index 53f6fe88e369..1a0f357c2427 100644
--- a/include/linux/intel_vsec.h
+++ b/include/linux/intel_vsec.h
@@ -80,13 +80,13 @@ enum intel_vsec_quirks {
 
 /**
  * struct pmt_callbacks - Callback infrastructure for PMT devices
- * ->read_telem() when specified, called by client driver to access PMT data (instead
- * of direct copy).
- * @pdev:  PCI device reference for the callback's use
- * @guid:  ID of data to acccss
- * @data:  buffer for the data to be copied
- * @off:   offset into the requested buffer
- * @count: size of buffer
+ * @read_telem: when specified, called by client driver to access PMT
+ * data (instead of direct copy).
+ * * pdev:  PCI device reference for the callback's use
+ * * guid:  ID of data to acccss
+ * * data:  buffer for the data to be copied
+ * * off:   offset into the requested buffer
+ * * count: size of buffer
  */
 struct pmt_callbacks {
 	int (*read_telem)(struct pci_dev *pdev, u32 guid, u64 *data, loff_t off, u32 count);
@@ -120,7 +120,7 @@ struct intel_vsec_platform_info {
 };
 
 /**
- * struct intel_sec_device - Auxbus specific device information
+ * struct intel_vsec_device - Auxbus specific device information
  * @auxdev:        auxbus device struct for auxbus access
  * @pcidev:        pci device associated with the device
  * @resource:      any resources shared by the parent
@@ -128,6 +128,7 @@ struct intel_vsec_platform_info {
  * @num_resources: number of resources
  * @id:            xarray id
  * @priv_data:     any private data needed
+ * @priv_data_size: size of private data area
  * @quirks:        specified quirks
  * @base_addr:     base address of entries (if specified)
  * @cap_id:        the enumerated id of the vsec feature
-- 
cgit v1.2.3


From 5393802c94e0ab1295c04c94c57bcb00222d4674 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Thu, 27 Nov 2025 10:39:24 -0800
Subject: genalloc.h: fix htmldocs warning

WARNING: include/linux/genalloc.h:52 function parameter 'start_addr' not described in 'genpool_algo_t'

Fixes: 52fbf1134d47 ("lib/genalloc.c: fix allocation of aligned buffer from non-aligned chunk")
Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Closes: https://lkml.kernel.org/r/20251127130624.563597e3@canb.auug.org.au
Acked-by: Randy Dunlap <rdunlap@infradead.org>
Tested-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Alexey Skidanov <alexey.skidanov@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/genalloc.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/genalloc.h b/include/linux/genalloc.h
index 0bd581003cd5..60de63e46b33 100644
--- a/include/linux/genalloc.h
+++ b/include/linux/genalloc.h
@@ -44,6 +44,7 @@ struct gen_pool;
  * @nr: The number of zeroed bits we're looking for
  * @data: optional additional data used by the callback
  * @pool: the pool being allocated from
+ * @start_addr: start address of memory chunk
  */
 typedef unsigned long (*genpool_algo_t)(unsigned long *map,
 			unsigned long size,
-- 
cgit v1.2.3


From 007f5da43b3d0ecff972e2616062b8da1f862f5e Mon Sep 17 00:00:00 2001
From: Jiayuan Chen <jiayuan.chen@linux.dev>
Date: Thu, 4 Dec 2025 18:59:55 +0000
Subject: mm/kasan: fix incorrect unpoisoning in vrealloc for KASAN

Patch series "kasan: vmalloc: Fixes for the percpu allocator and
vrealloc", v3.

Patches fix two issues related to KASAN and vmalloc.

The first one, a KASAN tag mismatch, possibly resulting in a kernel panic,
can be observed on systems with a tag-based KASAN enabled and with
multiple NUMA nodes.  Initially it was only noticed on x86 [1] but later a
similar issue was also reported on arm64 [2].

Specifically the problem is related to how vm_structs interact with
pcpu_chunks - both when they are allocated, assigned and when pcpu_chunk
addresses are derived.

When vm_structs are allocated they are unpoisoned, each with a different
random tag, if vmalloc support is enabled along the KASAN mode.  Later
when first pcpu chunk is allocated it gets its 'base_addr' field set to
the first allocated vm_struct.  With that it inherits that vm_struct's
tag.

When pcpu_chunk addresses are later derived (by pcpu_chunk_addr(), for
example in pcpu_alloc_noprof()) the base_addr field is used and offsets
are added to it.  If the initial conditions are satisfied then some of the
offsets will point into memory allocated with a different vm_struct.  So
while the lower bits will get accurately derived the tag bits in the top
of the pointer won't match the shadow memory contents.

The solution (proposed at v2 of the x86 KASAN series [3]) is to unpoison
the vm_structs with the same tag when allocating them for the per cpu
allocator (in pcpu_get_vm_areas()).

The second one reported by syzkaller [4] is related to vrealloc and
happens because of random tag generation when unpoisoning memory without
allocating new pages.  This breaks shadow memory tracking and needs to
reuse the existing tag instead of generating a new one.  At the same time
an inconsistency in used flags is corrected.


This patch (of 3):

Syzkaller reported a memory out-of-bounds bug [4].  This patch fixes two
issues:

1. In vrealloc the KASAN_VMALLOC_VM_ALLOC flag is missing when
   unpoisoning the extended region. This flag is required to correctly
   associate the allocation with KASAN's vmalloc tracking.

   Note: In contrast, vzalloc (via __vmalloc_node_range_noprof)
   explicitly sets KASAN_VMALLOC_VM_ALLOC and calls
   kasan_unpoison_vmalloc() with it.  vrealloc must behave consistently --
   especially when reusing existing vmalloc regions -- to ensure KASAN can
   track allocations correctly.

2. When vrealloc reuses an existing vmalloc region (without allocating
   new pages) KASAN generates a new tag, which breaks tag-based memory
   access tracking.

Introduce KASAN_VMALLOC_KEEP_TAG, a new KASAN flag that allows reusing the
tag already attached to the pointer, ensuring consistent tag behavior
during reallocation.

Pass KASAN_VMALLOC_KEEP_TAG and KASAN_VMALLOC_VM_ALLOC to the
kasan_unpoison_vmalloc inside vrealloc_node_align_noprof().

Link: https://lkml.kernel.org/r/cover.1765978969.git.m.wieczorretman@pm.me
Link: https://lkml.kernel.org/r/38dece0a4074c43e48150d1e242f8242c73bf1a5.1764874575.git.m.wieczorretman@pm.me
Link: https://lore.kernel.org/all/e7e04692866d02e6d3b32bb43b998e5d17092ba4.1738686764.git.maciej.wieczor-retman@intel.com/ [1]
Link: https://lore.kernel.org/all/aMUrW1Znp1GEj7St@MiWiFi-R3L-srv/ [2]
Link: https://lore.kernel.org/all/CAPAsAGxDRv_uFeMYu9TwhBVWHCCtkSxoWY4xmFB_vowMbi8raw@mail.gmail.com/ [3]
Link: https://syzkaller.appspot.com/bug?extid=997752115a851cb0cf36 [4]
Fixes: a0309faf1cb0 ("mm: vmalloc: support more granular vrealloc() sizing")
Signed-off-by: Jiayuan Chen <jiayuan.chen@linux.dev>
Co-developed-by: Maciej Wieczor-Retman <maciej.wieczor-retman@intel.com>
Signed-off-by: Maciej Wieczor-Retman <maciej.wieczor-retman@intel.com>
Reported-by: syzbot+997752115a851cb0cf36@syzkaller.appspotmail.com
Closes: https://lore.kernel.org/all/68e243a2.050a0220.1696c6.007d.GAE@google.com/T/
Reviewed-by: Andrey Konovalov <andreyknvl@gmail.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: Dmitriy Vyukov <dvyukov@google.com>
Cc: Kees Cook <kees@kernel.org>
Cc: Marco Elver <elver@google.com>
Cc: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Cc: Vincenzo Frascino <vincenzo.frascino@arm.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/kasan.h | 1 +
 mm/kasan/hw_tags.c    | 2 +-
 mm/kasan/shadow.c     | 4 +++-
 mm/vmalloc.c          | 4 +++-
 4 files changed, 8 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index f335c1d7b61d..df3d8567dde9 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -28,6 +28,7 @@ typedef unsigned int __bitwise kasan_vmalloc_flags_t;
 #define KASAN_VMALLOC_INIT		((__force kasan_vmalloc_flags_t)0x01u)
 #define KASAN_VMALLOC_VM_ALLOC		((__force kasan_vmalloc_flags_t)0x02u)
 #define KASAN_VMALLOC_PROT_NORMAL	((__force kasan_vmalloc_flags_t)0x04u)
+#define KASAN_VMALLOC_KEEP_TAG		((__force kasan_vmalloc_flags_t)0x08u)
 
 #define KASAN_VMALLOC_PAGE_RANGE 0x1 /* Apply exsiting page range */
 #define KASAN_VMALLOC_TLB_FLUSH  0x2 /* TLB flush */
diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c
index 1c373cc4b3fa..cbef5e450954 100644
--- a/mm/kasan/hw_tags.c
+++ b/mm/kasan/hw_tags.c
@@ -361,7 +361,7 @@ void *__kasan_unpoison_vmalloc(const void *start, unsigned long size,
 		return (void *)start;
 	}
 
-	tag = kasan_random_tag();
+	tag = (flags & KASAN_VMALLOC_KEEP_TAG) ? get_tag(start) : kasan_random_tag();
 	start = set_tag(start, tag);
 
 	/* Unpoison and initialize memory up to size. */
diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c
index 29a751a8a08d..32fbdf759ea2 100644
--- a/mm/kasan/shadow.c
+++ b/mm/kasan/shadow.c
@@ -631,7 +631,9 @@ void *__kasan_unpoison_vmalloc(const void *start, unsigned long size,
 	    !(flags & KASAN_VMALLOC_PROT_NORMAL))
 		return (void *)start;
 
-	start = set_tag(start, kasan_random_tag());
+	if (unlikely(!(flags & KASAN_VMALLOC_KEEP_TAG)))
+		start = set_tag(start, kasan_random_tag());
+
 	kasan_unpoison(start, size, false);
 	return (void *)start;
 }
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index ecbac900c35f..94c0a9262a46 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -4331,7 +4331,9 @@ void *vrealloc_node_align_noprof(const void *p, size_t size, unsigned long align
 	 */
 	if (size <= alloced_size) {
 		kasan_unpoison_vmalloc(p + old_size, size - old_size,
-				       KASAN_VMALLOC_PROT_NORMAL);
+				       KASAN_VMALLOC_PROT_NORMAL |
+				       KASAN_VMALLOC_VM_ALLOC |
+				       KASAN_VMALLOC_KEEP_TAG);
 		/*
 		 * No need to zero memory here, as unused memory will have
 		 * already been zeroed at initial allocation time or during
-- 
cgit v1.2.3


From 6f13db031e27e88213381039032a9cc061578ea6 Mon Sep 17 00:00:00 2001
From: Maciej Wieczor-Retman <maciej.wieczor-retman@intel.com>
Date: Thu, 4 Dec 2025 19:00:04 +0000
Subject: kasan: refactor pcpu kasan vmalloc unpoison

A KASAN tag mismatch, possibly causing a kernel panic, can be observed
on systems with a tag-based KASAN enabled and with multiple NUMA nodes.
It was reported on arm64 and reproduced on x86. It can be explained in
the following points:

1. There can be more than one virtual memory chunk.
2. Chunk's base address has a tag.
3. The base address points at the first chunk and thus inherits
   the tag of the first chunk.
4. The subsequent chunks will be accessed with the tag from the
   first chunk.
5. Thus, the subsequent chunks need to have their tag set to
   match that of the first chunk.

Refactor code by reusing __kasan_unpoison_vmalloc in a new helper in
preparation for the actual fix.

Link: https://lkml.kernel.org/r/eb61d93b907e262eefcaa130261a08bcb6c5ce51.1764874575.git.m.wieczorretman@pm.me
Fixes: 1d96320f8d53 ("kasan, vmalloc: add vmalloc tagging for SW_TAGS")
Signed-off-by: Maciej Wieczor-Retman <maciej.wieczor-retman@intel.com>
Reviewed-by: Andrey Konovalov <andreyknvl@gmail.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: Dmitriy Vyukov <dvyukov@google.com>
Cc: Jiayuan Chen <jiayuan.chen@linux.dev>
Cc: Kees Cook <kees@kernel.org>
Cc: Marco Elver <elver@google.com>
Cc: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Cc: Vincenzo Frascino <vincenzo.frascino@arm.com>
Cc: <stable@vger.kernel.org>	[6.1+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/kasan.h | 15 +++++++++++++++
 mm/kasan/common.c     | 17 +++++++++++++++++
 mm/vmalloc.c          |  4 +---
 3 files changed, 33 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index df3d8567dde9..9c6ac4b62eb9 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -631,6 +631,16 @@ static __always_inline void kasan_poison_vmalloc(const void *start,
 		__kasan_poison_vmalloc(start, size);
 }
 
+void __kasan_unpoison_vmap_areas(struct vm_struct **vms, int nr_vms,
+				 kasan_vmalloc_flags_t flags);
+static __always_inline void
+kasan_unpoison_vmap_areas(struct vm_struct **vms, int nr_vms,
+			  kasan_vmalloc_flags_t flags)
+{
+	if (kasan_enabled())
+		__kasan_unpoison_vmap_areas(vms, nr_vms, flags);
+}
+
 #else /* CONFIG_KASAN_VMALLOC */
 
 static inline void kasan_populate_early_vm_area_shadow(void *start,
@@ -655,6 +665,11 @@ static inline void *kasan_unpoison_vmalloc(const void *start,
 static inline void kasan_poison_vmalloc(const void *start, unsigned long size)
 { }
 
+static __always_inline void
+kasan_unpoison_vmap_areas(struct vm_struct **vms, int nr_vms,
+			  kasan_vmalloc_flags_t flags)
+{ }
+
 #endif /* CONFIG_KASAN_VMALLOC */
 
 #if (defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)) && \
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index 1d27f1bd260b..b2b40c59ce18 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -28,6 +28,7 @@
 #include <linux/string.h>
 #include <linux/types.h>
 #include <linux/bug.h>
+#include <linux/vmalloc.h>
 
 #include "kasan.h"
 #include "../slab.h"
@@ -575,3 +576,19 @@ bool __kasan_check_byte(const void *address, unsigned long ip)
 	}
 	return true;
 }
+
+#ifdef CONFIG_KASAN_VMALLOC
+void __kasan_unpoison_vmap_areas(struct vm_struct **vms, int nr_vms,
+				 kasan_vmalloc_flags_t flags)
+{
+	unsigned long size;
+	void *addr;
+	int area;
+
+	for (area = 0 ; area < nr_vms ; area++) {
+		size = vms[area]->size;
+		addr = vms[area]->addr;
+		vms[area]->addr = __kasan_unpoison_vmalloc(addr, size, flags);
+	}
+}
+#endif
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 94c0a9262a46..41dd01e8430c 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -5027,9 +5027,7 @@ retry:
 	 * With hardware tag-based KASAN, marking is skipped for
 	 * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc().
 	 */
-	for (area = 0; area < nr_vms; area++)
-		vms[area]->addr = kasan_unpoison_vmalloc(vms[area]->addr,
-				vms[area]->size, KASAN_VMALLOC_PROT_NORMAL);
+	kasan_unpoison_vmap_areas(vms, nr_vms, KASAN_VMALLOC_PROT_NORMAL);
 
 	kfree(vas);
 	return vms;
-- 
cgit v1.2.3


From 6ba776b533ca902631fa106b8a90811b3f40b08d Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Sun, 14 Dec 2025 12:15:17 -0800
Subject: mm: leafops.h: correct kernel-doc function param. names

Modify the kernel-doc function parameter names to prevent kernel-doc
warnings:

Warning: include/linux/leafops.h:135 function parameter 'entry' not
 described in 'leafent_type'
Warning: include/linux/leafops.h:540 function parameter 'pte' not
 described in 'pte_is_uffd_marker'

Link: https://lkml.kernel.org/r/20251214201517.2187051-1-rdunlap@infradead.org
Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/leafops.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/leafops.h b/include/linux/leafops.h
index cfafe7a5e7b1..a9ff94b744f2 100644
--- a/include/linux/leafops.h
+++ b/include/linux/leafops.h
@@ -133,7 +133,7 @@ static inline bool softleaf_is_none(softleaf_t entry)
 
 /**
  * softleaf_type() - Identify the type of leaf entry.
- * @enntry: Leaf entry.
+ * @entry: Leaf entry.
  *
  * Returns: the leaf entry type associated with @entry.
  */
@@ -534,7 +534,7 @@ static inline bool pte_is_uffd_wp_marker(pte_t pte)
 /**
  * pte_is_uffd_marker() - Does this PTE entry encode a userfault-specific marker
  * leaf entry?
- * @entry: Leaf entry.
+ * @pte: PTE entry.
  *
  * It's useful to be able to determine which leaf entries encode UFFD-specific
  * markers so we can handle these correctly.
-- 
cgit v1.2.3


From fe55ea85939efcbf0e6baa234f0d70acb79e7b58 Mon Sep 17 00:00:00 2001
From: Pingfan Liu <piliu@redhat.com>
Date: Tue, 16 Dec 2025 09:48:51 +0800
Subject: kernel/kexec: change the prototype of kimage_map_segment()

The kexec segment index will be required to extract the corresponding
information for that segment in kimage_map_segment().  Additionally,
kexec_segment already holds the kexec relocation destination address and
size.  Therefore, the prototype of kimage_map_segment() can be changed.

Link: https://lkml.kernel.org/r/20251216014852.8737-1-piliu@redhat.com
Fixes: 07d24902977e ("kexec: enable CMA based contiguous allocation")
Signed-off-by: Pingfan Liu <piliu@redhat.com>
Acked-by: Baoquan He <bhe@redhat.com>
Cc: Mimi Zohar <zohar@linux.ibm.com>
Cc: Roberto Sassu <roberto.sassu@huawei.com>
Cc: Alexander Graf <graf@amazon.com>
Cc: Steven Chen <chenste@linux.microsoft.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/kexec.h              | 4 ++--
 kernel/kexec_core.c                | 9 ++++++---
 security/integrity/ima/ima_kexec.c | 4 +---
 3 files changed, 9 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index ff7e231b0485..8a22bc9b8c6c 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -530,7 +530,7 @@ extern bool kexec_file_dbg_print;
 #define kexec_dprintk(fmt, arg...) \
         do { if (kexec_file_dbg_print) pr_info(fmt, ##arg); } while (0)
 
-extern void *kimage_map_segment(struct kimage *image, unsigned long addr, unsigned long size);
+extern void *kimage_map_segment(struct kimage *image, int idx);
 extern void kimage_unmap_segment(void *buffer);
 #else /* !CONFIG_KEXEC_CORE */
 struct pt_regs;
@@ -540,7 +540,7 @@ static inline void __crash_kexec(struct pt_regs *regs) { }
 static inline void crash_kexec(struct pt_regs *regs) { }
 static inline int kexec_should_crash(struct task_struct *p) { return 0; }
 static inline int kexec_crash_loaded(void) { return 0; }
-static inline void *kimage_map_segment(struct kimage *image, unsigned long addr, unsigned long size)
+static inline void *kimage_map_segment(struct kimage *image, int idx)
 { return NULL; }
 static inline void kimage_unmap_segment(void *buffer) { }
 #define kexec_in_progress false
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 0f92acdd354d..1a79c5b18d8f 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -953,17 +953,20 @@ int kimage_load_segment(struct kimage *image, int idx)
 	return result;
 }
 
-void *kimage_map_segment(struct kimage *image,
-			 unsigned long addr, unsigned long size)
+void *kimage_map_segment(struct kimage *image, int idx)
 {
+	unsigned long addr, size, eaddr;
 	unsigned long src_page_addr, dest_page_addr = 0;
-	unsigned long eaddr = addr + size;
 	kimage_entry_t *ptr, entry;
 	struct page **src_pages;
 	unsigned int npages;
 	void *vaddr = NULL;
 	int i;
 
+	addr = image->segment[idx].mem;
+	size = image->segment[idx].memsz;
+	eaddr = addr + size;
+
 	/*
 	 * Collect the source pages and map them in a contiguous VA range.
 	 */
diff --git a/security/integrity/ima/ima_kexec.c b/security/integrity/ima/ima_kexec.c
index 7362f68f2d8b..5beb69edd12f 100644
--- a/security/integrity/ima/ima_kexec.c
+++ b/security/integrity/ima/ima_kexec.c
@@ -250,9 +250,7 @@ void ima_kexec_post_load(struct kimage *image)
 	if (!image->ima_buffer_addr)
 		return;
 
-	ima_kexec_buffer = kimage_map_segment(image,
-					      image->ima_buffer_addr,
-					      image->ima_buffer_size);
+	ima_kexec_buffer = kimage_map_segment(image, image->ima_segment_index);
 	if (!ima_kexec_buffer) {
 		pr_err("Could not map measurements buffer.\n");
 		return;
-- 
cgit v1.2.3


From e6dbcb7c0e7b508d443a9aa6f77f63a2f83b1ae4 Mon Sep 17 00:00:00 2001
From: Ankit Agrawal <ankita@nvidia.com>
Date: Thu, 11 Dec 2025 07:06:01 +0000
Subject: mm: fixup pfnmap memory failure handling to use pgoff

The memory failure handling implementation for the PFNMAP memory with no
struct pages is faulty.  The VA of the mapping is determined based on the
the PFN.  It should instead be based on the file mapping offset.

At the occurrence of poison, the memory_failure_pfn is triggered on the
poisoned PFN.  Introduce a callback function that allows mm to translate
the PFN to the corresponding file page offset.  The kernel module using
the registration API must implement the callback function and provide the
translation.  The translated value is then used to determine the VA
information and sending the SIGBUS to the usermode process mapped to the
poisoned PFN.

The callback is also useful for the driver to be notified of the poisoned
PFN, which may then track it.

Link: https://lkml.kernel.org/r/20251211070603.338701-2-ankita@nvidia.com
Fixes: 2ec41967189c ("mm: handle poisoning of pfn without struct pages")
Signed-off-by: Ankit Agrawal <ankita@nvidia.com>
Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
Cc: Kevin Tian <kevin.tian@intel.com>
Cc: Matthew R. Ochs <mochs@nvidia.com>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
Cc: Neo Jia <cjia@nvidia.com>
Cc: Vikram Sethi <vsethi@nvidia.com>
Cc: Yishai Hadas <yishaih@nvidia.com>
Cc: Zhi Wang <zhiw@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memory-failure.h |  2 ++
 mm/memory-failure.c            | 29 ++++++++++++++++++-----------
 2 files changed, 20 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memory-failure.h b/include/linux/memory-failure.h
index bc326503d2d2..7b5e11cf905f 100644
--- a/include/linux/memory-failure.h
+++ b/include/linux/memory-failure.h
@@ -9,6 +9,8 @@ struct pfn_address_space;
 struct pfn_address_space {
 	struct interval_tree_node node;
 	struct address_space *mapping;
+	int (*pfn_to_vma_pgoff)(struct vm_area_struct *vma,
+				unsigned long pfn, pgoff_t *pgoff);
 };
 
 int register_pfn_address_space(struct pfn_address_space *pfn_space);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index fbc5a01260c8..c80c2907da33 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -2161,6 +2161,9 @@ int register_pfn_address_space(struct pfn_address_space *pfn_space)
 {
 	guard(mutex)(&pfn_space_lock);
 
+	if (!pfn_space->pfn_to_vma_pgoff)
+		return -EINVAL;
+
 	if (interval_tree_iter_first(&pfn_space_itree,
 				     pfn_space->node.start,
 				     pfn_space->node.last))
@@ -2183,10 +2186,10 @@ void unregister_pfn_address_space(struct pfn_address_space *pfn_space)
 }
 EXPORT_SYMBOL_GPL(unregister_pfn_address_space);
 
-static void add_to_kill_pfn(struct task_struct *tsk,
-			    struct vm_area_struct *vma,
-			    struct list_head *to_kill,
-			    unsigned long pfn)
+static void add_to_kill_pgoff(struct task_struct *tsk,
+			      struct vm_area_struct *vma,
+			      struct list_head *to_kill,
+			      pgoff_t pgoff)
 {
 	struct to_kill *tk;
 
@@ -2197,12 +2200,12 @@ static void add_to_kill_pfn(struct task_struct *tsk,
 	}
 
 	/* Check for pgoff not backed by struct page */
-	tk->addr = vma_address(vma, pfn, 1);
+	tk->addr = vma_address(vma, pgoff, 1);
 	tk->size_shift = PAGE_SHIFT;
 
 	if (tk->addr == -EFAULT)
 		pr_info("Unable to find address %lx in %s\n",
-			pfn, tsk->comm);
+			pgoff, tsk->comm);
 
 	get_task_struct(tsk);
 	tk->tsk = tsk;
@@ -2212,11 +2215,12 @@ static void add_to_kill_pfn(struct task_struct *tsk,
 /*
  * Collect processes when the error hit a PFN not backed by struct page.
  */
-static void collect_procs_pfn(struct address_space *mapping,
+static void collect_procs_pfn(struct pfn_address_space *pfn_space,
 			      unsigned long pfn, struct list_head *to_kill)
 {
 	struct vm_area_struct *vma;
 	struct task_struct *tsk;
+	struct address_space *mapping = pfn_space->mapping;
 
 	i_mmap_lock_read(mapping);
 	rcu_read_lock();
@@ -2226,9 +2230,12 @@ static void collect_procs_pfn(struct address_space *mapping,
 		t = task_early_kill(tsk, true);
 		if (!t)
 			continue;
-		vma_interval_tree_foreach(vma, &mapping->i_mmap, pfn, pfn) {
-			if (vma->vm_mm == t->mm)
-				add_to_kill_pfn(t, vma, to_kill, pfn);
+		vma_interval_tree_foreach(vma, &mapping->i_mmap, 0, ULONG_MAX) {
+			pgoff_t pgoff;
+
+			if (vma->vm_mm == t->mm &&
+			    !pfn_space->pfn_to_vma_pgoff(vma, pfn, &pgoff))
+				add_to_kill_pgoff(t, vma, to_kill, pgoff);
 		}
 	}
 	rcu_read_unlock();
@@ -2264,7 +2271,7 @@ static int memory_failure_pfn(unsigned long pfn, int flags)
 			struct pfn_address_space *pfn_space =
 				container_of(node, struct pfn_address_space, node);
 
-			collect_procs_pfn(pfn_space->mapping, pfn, &tokill);
+			collect_procs_pfn(pfn_space, pfn, &tokill);
 
 			mf_handled = true;
 		}
-- 
cgit v1.2.3


From f183663901f21fe0fba8bd31ae894bc529709ee0 Mon Sep 17 00:00:00 2001
From: Bijan Tabatabai <bijan311@gmail.com>
Date: Tue, 16 Dec 2025 14:07:27 -0600
Subject: mm: consider non-anon swap cache folios in folio_expected_ref_count()

Currently, folio_expected_ref_count() only adds references for the swap
cache if the folio is anonymous.  However, according to the comment above
the definition of PG_swapcache in enum pageflags, shmem folios can also
have PG_swapcache set.  This patch makes sure references for the swap
cache are added if folio_test_swapcache(folio) is true.

This issue was found when trying to hot-unplug memory in a QEMU/KVM
virtual machine.  When initiating hot-unplug when most of the guest memory
is allocated, hot-unplug hangs partway through removal due to migration
failures.  The following message would be printed several times, and would
be printed again about every five seconds:

[   49.641309] migrating pfn b12f25 failed ret:7
[   49.641310] page: refcount:2 mapcount:0 mapping:0000000033bd8fe2 index:0x7f404d925 pfn:0xb12f25
[   49.641311] aops:swap_aops
[   49.641313] flags: 0x300000000030508(uptodate|active|owner_priv_1|reclaim|swapbacked|node=0|zone=3)
[   49.641314] raw: 0300000000030508 ffffed312c4bc908 ffffed312c4bc9c8 0000000000000000
[   49.641315] raw: 00000007f404d925 00000000000c823b 00000002ffffffff 0000000000000000
[   49.641315] page dumped because: migration failure

When debugging this, I found that these migration failures were due to
__migrate_folio() returning -EAGAIN for a small set of folios because the
expected reference count it calculates via folio_expected_ref_count() is
one less than the actual reference count of the folios.  Furthermore, all
of the affected folios were not anonymous, but had the PG_swapcache flag
set, inspiring this patch.  After applying this patch, the memory
hot-unplug behaves as expected.

I tested this on a machine running Ubuntu 24.04 with kernel version
6.8.0-90-generic and 64GB of memory.  The guest VM is managed by libvirt
and runs Ubuntu 24.04 with kernel version 6.18 (though the head of the
mm-unstable branch as a Dec 16, 2025 was also tested and behaves the same)
and 48GB of memory.  The libvirt XML definition for the VM can be found at
[1].  CONFIG_MHP_DEFAULT_ONLINE_TYPE_ONLINE_MOVABLE is set in the guest
kernel so the hot-pluggable memory is automatically onlined.

Below are the steps to reproduce this behavior:

1) Define and start and virtual machine
  host$ virsh -c qemu:///system define ./test_vm.xml # test_vm.xml from [1]
  host$ virsh -c qemu:///system start test_vm

2) Setup swap in the guest
  guest$ sudo fallocate -l 32G /swapfile
  guest$ sudo chmod 0600 /swapfile
  guest$ sudo mkswap /swapfile
  guest$ sudo swapon /swapfile

3) Use alloc_data [2] to allocate most of the remaining guest memory
  guest$ ./alloc_data 45

4) In a separate guest terminal, monitor the amount of used memory
  guest$ watch -n1 free -h

5) When alloc_data has finished allocating, initiate the memory
hot-unplug using the provided xml file [3]
  host$ virsh -c qemu:///system detach-device test_vm ./remove.xml --live

After initiating the memory hot-unplug, you should see the amount of
available memory in the guest decrease, and the amount of used swap data
increase.  If everything works as expected, when all of the memory is
unplugged, there should be around 8.5-9GB of data in swap.  If the
unplugging is unsuccessful, the amount of used swap data will settle below
that.  If that happens, you should be able to see log messages in dmesg
similar to the one posted above.

Link: https://lkml.kernel.org/r/20251216200727.2360228-1-bijan311@gmail.com
Link: https://github.com/BijanT/linux_patch_files/blob/main/test_vm.xml [1]
Link: https://github.com/BijanT/linux_patch_files/blob/main/alloc_data.c [2]
Link: https://github.com/BijanT/linux_patch_files/blob/main/remove.xml [3]
Fixes: 86ebd50224c0 ("mm: add folio_expected_ref_count() for reference count calculation")
Signed-off-by: Bijan Tabatabai <bijan311@gmail.com>
Acked-by: David Hildenbrand (Red Hat) <david@kernel.org>
Acked-by: Zi Yan <ziy@nvidia.com>
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Shivank Garg <shivankg@amd.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Kairui Song <ryncsn@gmail.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 15076261d0c2..6f959d8ca4b4 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2459,10 +2459,10 @@ static inline int folio_expected_ref_count(const struct folio *folio)
 	if (WARN_ON_ONCE(page_has_type(&folio->page) && !folio_test_hugetlb(folio)))
 		return 0;
 
-	if (folio_test_anon(folio)) {
-		/* One reference per page from the swapcache. */
-		ref_count += folio_test_swapcache(folio) << order;
-	} else {
+	/* One reference per page from the swapcache. */
+	ref_count += folio_test_swapcache(folio) << order;
+
+	if (!folio_test_anon(folio)) {
 		/* One reference per page from the pagecache. */
 		ref_count += !!folio->mapping << order;
 		/* One reference from PG_private. */
-- 
cgit v1.2.3


From dc85a46928c41423ad89869baf05a589e2975575 Mon Sep 17 00:00:00 2001
From: Kevin Tian <kevin.tian@intel.com>
Date: Thu, 18 Dec 2025 08:16:49 +0000
Subject: vfio/pci: Disable qword access to the PCI ROM bar

Commit 2b938e3db335 ("vfio/pci: Enable iowrite64 and ioread64 for vfio
pci") enables qword access to the PCI bar resources. However certain
devices (e.g. Intel X710) are observed with problem upon qword accesses
to the rom bar, e.g. triggering PCI aer errors.

This is triggered by Qemu which caches the rom content by simply does a
pread() of the remaining size until it gets the full contents. The other
bars would only perform operations at the same access width as their
guest drivers.

Instead of trying to identify all broken devices, universally disable
qword access to the rom bar i.e. going back to the old way which worked
reliably for years.

Reported-by: Farrah Chen <farrah.chen@intel.com>
Closes: https://bugzilla.kernel.org/show_bug.cgi?id=220740
Fixes: 2b938e3db335 ("vfio/pci: Enable iowrite64 and ioread64 for vfio pci")
Cc: stable@vger.kernel.org
Signed-off-by: Kevin Tian <kevin.tian@intel.com>
Tested-by: Farrah Chen <farrah.chen@intel.com>
Link: https://lore.kernel.org/r/20251218081650.555015-2-kevin.tian@intel.com
Signed-off-by: Alex Williamson <alex@shazbot.org>
---
 drivers/vfio/pci/nvgrace-gpu/main.c |  4 ++--
 drivers/vfio/pci/vfio_pci_rdwr.c    | 25 ++++++++++++++++++-------
 include/linux/vfio_pci_core.h       | 10 +++++++++-
 3 files changed, 29 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c
index 84d142a47ec6..b45a24d00387 100644
--- a/drivers/vfio/pci/nvgrace-gpu/main.c
+++ b/drivers/vfio/pci/nvgrace-gpu/main.c
@@ -561,7 +561,7 @@ nvgrace_gpu_map_and_read(struct nvgrace_gpu_pci_core_device *nvdev,
 		ret = vfio_pci_core_do_io_rw(&nvdev->core_device, false,
 					     nvdev->resmem.ioaddr,
 					     buf, offset, mem_count,
-					     0, 0, false);
+					     0, 0, false, VFIO_PCI_IO_WIDTH_8);
 	}
 
 	return ret;
@@ -693,7 +693,7 @@ nvgrace_gpu_map_and_write(struct nvgrace_gpu_pci_core_device *nvdev,
 		ret = vfio_pci_core_do_io_rw(&nvdev->core_device, false,
 					     nvdev->resmem.ioaddr,
 					     (char __user *)buf, pos, mem_count,
-					     0, 0, true);
+					     0, 0, true, VFIO_PCI_IO_WIDTH_8);
 	}
 
 	return ret;
diff --git a/drivers/vfio/pci/vfio_pci_rdwr.c b/drivers/vfio/pci/vfio_pci_rdwr.c
index 6192788c8ba3..25380b7dfe18 100644
--- a/drivers/vfio/pci/vfio_pci_rdwr.c
+++ b/drivers/vfio/pci/vfio_pci_rdwr.c
@@ -135,7 +135,8 @@ VFIO_IORDWR(64)
 ssize_t vfio_pci_core_do_io_rw(struct vfio_pci_core_device *vdev, bool test_mem,
 			       void __iomem *io, char __user *buf,
 			       loff_t off, size_t count, size_t x_start,
-			       size_t x_end, bool iswrite)
+			       size_t x_end, bool iswrite,
+			       enum vfio_pci_io_width max_width)
 {
 	ssize_t done = 0;
 	int ret;
@@ -150,20 +151,19 @@ ssize_t vfio_pci_core_do_io_rw(struct vfio_pci_core_device *vdev, bool test_mem,
 		else
 			fillable = 0;
 
-		if (fillable >= 8 && !(off % 8)) {
+		if (fillable >= 8 && !(off % 8) && max_width >= 8) {
 			ret = vfio_pci_iordwr64(vdev, iswrite, test_mem,
 						io, buf, off, &filled);
 			if (ret)
 				return ret;
 
-		} else
-		if (fillable >= 4 && !(off % 4)) {
+		} else if (fillable >= 4 && !(off % 4) && max_width >= 4) {
 			ret = vfio_pci_iordwr32(vdev, iswrite, test_mem,
 						io, buf, off, &filled);
 			if (ret)
 				return ret;
 
-		} else if (fillable >= 2 && !(off % 2)) {
+		} else if (fillable >= 2 && !(off % 2) && max_width >= 2) {
 			ret = vfio_pci_iordwr16(vdev, iswrite, test_mem,
 						io, buf, off, &filled);
 			if (ret)
@@ -234,6 +234,7 @@ ssize_t vfio_pci_bar_rw(struct vfio_pci_core_device *vdev, char __user *buf,
 	void __iomem *io;
 	struct resource *res = &vdev->pdev->resource[bar];
 	ssize_t done;
+	enum vfio_pci_io_width max_width = VFIO_PCI_IO_WIDTH_8;
 
 	if (pci_resource_start(pdev, bar))
 		end = pci_resource_len(pdev, bar);
@@ -262,6 +263,16 @@ ssize_t vfio_pci_bar_rw(struct vfio_pci_core_device *vdev, char __user *buf,
 		if (!io)
 			return -ENOMEM;
 		x_end = end;
+
+		/*
+		 * Certain devices (e.g. Intel X710) don't support qword
+		 * access to the ROM bar. Otherwise PCI AER errors might be
+		 * triggered.
+		 *
+		 * Disable qword access to the ROM bar universally, which
+		 * worked reliably for years before qword access is enabled.
+		 */
+		max_width = VFIO_PCI_IO_WIDTH_4;
 	} else {
 		int ret = vfio_pci_core_setup_barmap(vdev, bar);
 		if (ret) {
@@ -278,7 +289,7 @@ ssize_t vfio_pci_bar_rw(struct vfio_pci_core_device *vdev, char __user *buf,
 	}
 
 	done = vfio_pci_core_do_io_rw(vdev, res->flags & IORESOURCE_MEM, io, buf, pos,
-				      count, x_start, x_end, iswrite);
+				      count, x_start, x_end, iswrite, max_width);
 
 	if (done >= 0)
 		*ppos += done;
@@ -352,7 +363,7 @@ ssize_t vfio_pci_vga_rw(struct vfio_pci_core_device *vdev, char __user *buf,
 	 * to the memory enable bit in the command register.
 	 */
 	done = vfio_pci_core_do_io_rw(vdev, false, iomem, buf, off, count,
-				      0, 0, iswrite);
+				      0, 0, iswrite, VFIO_PCI_IO_WIDTH_8);
 
 	vga_put(vdev->pdev, rsrc);
 
diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h
index 706877f998ff..1ac86896875c 100644
--- a/include/linux/vfio_pci_core.h
+++ b/include/linux/vfio_pci_core.h
@@ -145,6 +145,13 @@ struct vfio_pci_core_device {
 	struct list_head	dmabufs;
 };
 
+enum vfio_pci_io_width {
+	VFIO_PCI_IO_WIDTH_1 = 1,
+	VFIO_PCI_IO_WIDTH_2 = 2,
+	VFIO_PCI_IO_WIDTH_4 = 4,
+	VFIO_PCI_IO_WIDTH_8 = 8,
+};
+
 /* Will be exported for vfio pci drivers usage */
 int vfio_pci_core_register_dev_region(struct vfio_pci_core_device *vdev,
 				      unsigned int type, unsigned int subtype,
@@ -188,7 +195,8 @@ pci_ers_result_t vfio_pci_core_aer_err_detected(struct pci_dev *pdev,
 ssize_t vfio_pci_core_do_io_rw(struct vfio_pci_core_device *vdev, bool test_mem,
 			       void __iomem *io, char __user *buf,
 			       loff_t off, size_t count, size_t x_start,
-			       size_t x_end, bool iswrite);
+			       size_t x_end, bool iswrite,
+			       enum vfio_pci_io_width max_width);
 bool __vfio_pci_memory_enabled(struct vfio_pci_core_device *vdev);
 bool vfio_pci_core_range_intersect_range(loff_t buf_start, size_t buf_cnt,
 					 loff_t reg_start, size_t reg_cnt,
-- 
cgit v1.2.3


From f059588c552746e0fe299214f35c58effa715b74 Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Thu, 4 Dec 2025 13:31:52 -0500
Subject: virtio: make it self-contained

virtio.h uses struct module, add a forward declaration to
make the header self-contained.

Message-ID: <9171b5cac60793eb59ab044c96ee038bf1363bee.1764873799.git.mst@redhat.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 include/linux/virtio.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/virtio.h b/include/linux/virtio.h
index 132a474e5914..3626eb694728 100644
--- a/include/linux/virtio.h
+++ b/include/linux/virtio.h
@@ -13,6 +13,8 @@
 #include <linux/completion.h>
 #include <linux/virtio_features.h>
 
+struct module;
+
 /**
  * struct virtqueue - a queue to register buffers for sending or receiving.
  * @list: the chain of virtqueues for this device
-- 
cgit v1.2.3


From e88dfb93311c81359b00c12e0b396bd0ea13ad6c Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Thu, 4 Dec 2025 12:49:34 -0500
Subject: virtio_features: make it self-contained

virtio_features.h uses WARN_ON_ONCE and memset so it must
include linux/bug.h and linux/string.h

Message-ID: <579986aa9b8d023844990d2a0e267382f8ad85d5.1764873799.git.mst@redhat.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 include/linux/virtio_features.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/virtio_features.h b/include/linux/virtio_features.h
index ea2ad8717882..ce59ea91f474 100644
--- a/include/linux/virtio_features.h
+++ b/include/linux/virtio_features.h
@@ -3,6 +3,8 @@
 #define _LINUX_VIRTIO_FEATURES_H
 
 #include <linux/bits.h>
+#include <linux/bug.h>
+#include <linux/string.h>
 
 #define VIRTIO_FEATURES_U64S	2
 #define VIRTIO_FEATURES_BITS	(VIRTIO_FEATURES_U64S * 64)
-- 
cgit v1.2.3


From 5623eb1ed035f01dfa620366a82b667545b10c82 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 31 Dec 2025 08:12:46 -0700
Subject: io_uring/tctx: add separate lock for list of tctx's in ctx

ctx->tcxt_list holds the tasks using this ring, and it's currently
protected by the normal ctx->uring_lock. However, this can cause a
circular locking issue, as reported by syzbot, where cancelations off
exec end up needing to remove an entry from this list:

======================================================
WARNING: possible circular locking dependency detected
syzkaller #0 Tainted: G             L
------------------------------------------------------
syz.0.9999/12287 is trying to acquire lock:
ffff88805851c0a8 (&ctx->uring_lock){+.+.}-{4:4}, at: io_uring_del_tctx_node+0xf0/0x2c0 io_uring/tctx.c:179

but task is already holding lock:
ffff88802db5a2e0 (&sig->cred_guard_mutex){+.+.}-{4:4}, at: prepare_bprm_creds fs/exec.c:1360 [inline]
ffff88802db5a2e0 (&sig->cred_guard_mutex){+.+.}-{4:4}, at: bprm_execve+0xb9/0x1400 fs/exec.c:1733

which lock already depends on the new lock.

the existing dependency chain (in reverse order) is:

-> #2 (&sig->cred_guard_mutex){+.+.}-{4:4}:
       __mutex_lock_common kernel/locking/mutex.c:614 [inline]
       __mutex_lock+0x187/0x1350 kernel/locking/mutex.c:776
       proc_pid_attr_write+0x547/0x630 fs/proc/base.c:2837
       vfs_write+0x27e/0xb30 fs/read_write.c:684
       ksys_write+0x145/0x250 fs/read_write.c:738
       do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline]
       do_syscall_64+0xec/0xf80 arch/x86/entry/syscall_64.c:94
       entry_SYSCALL_64_after_hwframe+0x77/0x7f

-> #1 (sb_writers#3){.+.+}-{0:0}:
       percpu_down_read_internal include/linux/percpu-rwsem.h:53 [inline]
       percpu_down_read_freezable include/linux/percpu-rwsem.h:83 [inline]
       __sb_start_write include/linux/fs/super.h:19 [inline]
       sb_start_write+0x4d/0x1c0 include/linux/fs/super.h:125
       mnt_want_write+0x41/0x90 fs/namespace.c:499
       open_last_lookups fs/namei.c:4529 [inline]
       path_openat+0xadd/0x3dd0 fs/namei.c:4784
       do_filp_open+0x1fa/0x410 fs/namei.c:4814
       io_openat2+0x3e0/0x5c0 io_uring/openclose.c:143
       __io_issue_sqe+0x181/0x4b0 io_uring/io_uring.c:1792
       io_issue_sqe+0x165/0x1060 io_uring/io_uring.c:1815
       io_queue_sqe io_uring/io_uring.c:2042 [inline]
       io_submit_sqe io_uring/io_uring.c:2320 [inline]
       io_submit_sqes+0xbf4/0x2140 io_uring/io_uring.c:2434
       __do_sys_io_uring_enter io_uring/io_uring.c:3280 [inline]
       __se_sys_io_uring_enter+0x2e0/0x2b60 io_uring/io_uring.c:3219
       do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline]
       do_syscall_64+0xec/0xf80 arch/x86/entry/syscall_64.c:94
       entry_SYSCALL_64_after_hwframe+0x77/0x7f

-> #0 (&ctx->uring_lock){+.+.}-{4:4}:
       check_prev_add kernel/locking/lockdep.c:3165 [inline]
       check_prevs_add kernel/locking/lockdep.c:3284 [inline]
       validate_chain kernel/locking/lockdep.c:3908 [inline]
       __lock_acquire+0x15a6/0x2cf0 kernel/locking/lockdep.c:5237
       lock_acquire+0x107/0x340 kernel/locking/lockdep.c:5868
       __mutex_lock_common kernel/locking/mutex.c:614 [inline]
       __mutex_lock+0x187/0x1350 kernel/locking/mutex.c:776
       io_uring_del_tctx_node+0xf0/0x2c0 io_uring/tctx.c:179
       io_uring_clean_tctx+0xd4/0x1a0 io_uring/tctx.c:195
       io_uring_cancel_generic+0x6ca/0x7d0 io_uring/cancel.c:646
       io_uring_task_cancel include/linux/io_uring.h:24 [inline]
       begin_new_exec+0x10ed/0x2440 fs/exec.c:1131
       load_elf_binary+0x9f8/0x2d70 fs/binfmt_elf.c:1010
       search_binary_handler fs/exec.c:1669 [inline]
       exec_binprm fs/exec.c:1701 [inline]
       bprm_execve+0x92e/0x1400 fs/exec.c:1753
       do_execveat_common+0x510/0x6a0 fs/exec.c:1859
       do_execve fs/exec.c:1933 [inline]
       __do_sys_execve fs/exec.c:2009 [inline]
       __se_sys_execve fs/exec.c:2004 [inline]
       __x64_sys_execve+0x94/0xb0 fs/exec.c:2004
       do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline]
       do_syscall_64+0xec/0xf80 arch/x86/entry/syscall_64.c:94
       entry_SYSCALL_64_after_hwframe+0x77/0x7f

other info that might help us debug this:

Chain exists of:
  &ctx->uring_lock --> sb_writers#3 --> &sig->cred_guard_mutex

 Possible unsafe locking scenario:

       CPU0                    CPU1
       ----                    ----
  lock(&sig->cred_guard_mutex);
                               lock(sb_writers#3);
                               lock(&sig->cred_guard_mutex);
  lock(&ctx->uring_lock);

 *** DEADLOCK ***

1 lock held by syz.0.9999/12287:
 #0: ffff88802db5a2e0 (&sig->cred_guard_mutex){+.+.}-{4:4}, at: prepare_bprm_creds fs/exec.c:1360 [inline]
 #0: ffff88802db5a2e0 (&sig->cred_guard_mutex){+.+.}-{4:4}, at: bprm_execve+0xb9/0x1400 fs/exec.c:1733

stack backtrace:
CPU: 0 UID: 0 PID: 12287 Comm: syz.0.9999 Tainted: G             L      syzkaller #0 PREEMPT(full)
Tainted: [L]=SOFTLOCKUP
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 10/25/2025
Call Trace:
 <TASK>
 dump_stack_lvl+0xe8/0x150 lib/dump_stack.c:120
 print_circular_bug+0x2e2/0x300 kernel/locking/lockdep.c:2043
 check_noncircular+0x12e/0x150 kernel/locking/lockdep.c:2175
 check_prev_add kernel/locking/lockdep.c:3165 [inline]
 check_prevs_add kernel/locking/lockdep.c:3284 [inline]
 validate_chain kernel/locking/lockdep.c:3908 [inline]
 __lock_acquire+0x15a6/0x2cf0 kernel/locking/lockdep.c:5237
 lock_acquire+0x107/0x340 kernel/locking/lockdep.c:5868
 __mutex_lock_common kernel/locking/mutex.c:614 [inline]
 __mutex_lock+0x187/0x1350 kernel/locking/mutex.c:776
 io_uring_del_tctx_node+0xf0/0x2c0 io_uring/tctx.c:179
 io_uring_clean_tctx+0xd4/0x1a0 io_uring/tctx.c:195
 io_uring_cancel_generic+0x6ca/0x7d0 io_uring/cancel.c:646
 io_uring_task_cancel include/linux/io_uring.h:24 [inline]
 begin_new_exec+0x10ed/0x2440 fs/exec.c:1131
 load_elf_binary+0x9f8/0x2d70 fs/binfmt_elf.c:1010
 search_binary_handler fs/exec.c:1669 [inline]
 exec_binprm fs/exec.c:1701 [inline]
 bprm_execve+0x92e/0x1400 fs/exec.c:1753
 do_execveat_common+0x510/0x6a0 fs/exec.c:1859
 do_execve fs/exec.c:1933 [inline]
 __do_sys_execve fs/exec.c:2009 [inline]
 __se_sys_execve fs/exec.c:2004 [inline]
 __x64_sys_execve+0x94/0xb0 fs/exec.c:2004
 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline]
 do_syscall_64+0xec/0xf80 arch/x86/entry/syscall_64.c:94
 entry_SYSCALL_64_after_hwframe+0x77/0x7f
RIP: 0033:0x7ff3a8b8f749
Code: ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 a8 ff ff ff f7 d8 64 89 01 48
RSP: 002b:00007ff3a9a97038 EFLAGS: 00000246 ORIG_RAX: 000000000000003b
RAX: ffffffffffffffda RBX: 00007ff3a8de5fa0 RCX: 00007ff3a8b8f749
RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000200000000400
RBP: 00007ff3a8c13f91 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000
R13: 00007ff3a8de6038 R14: 00007ff3a8de5fa0 R15: 00007ff3a8f0fa28
 </TASK>

Add a separate lock just for the tctx_list, tctx_lock. This can nest
under ->uring_lock, where necessary, and be used separately for list
manipulation. For the cancelation off exec side, this removes the
need to grab ->uring_lock, hence fixing the circular locking
dependency.

Reported-by: syzbot+b0e3b77ffaa8a4067ce5@syzkaller.appspotmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h | 8 +++++++-
 io_uring/cancel.c              | 5 +++++
 io_uring/io_uring.c            | 5 +++++
 io_uring/register.c            | 2 ++
 io_uring/tctx.c                | 8 ++++----
 5 files changed, 23 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index e1adb0d20a0a..a3e8ddc9b380 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -424,11 +424,17 @@ struct io_ring_ctx {
 	struct user_struct		*user;
 	struct mm_struct		*mm_account;
 
+	/*
+	 * List of tctx nodes for this ctx, protected by tctx_lock. For
+	 * cancelation purposes, nests under uring_lock.
+	 */
+	struct list_head		tctx_list;
+	struct mutex			tctx_lock;
+
 	/* ctx exit and cancelation */
 	struct llist_head		fallback_llist;
 	struct delayed_work		fallback_work;
 	struct work_struct		exit_work;
-	struct list_head		tctx_list;
 	struct completion		ref_comp;
 
 	/* io-wq management, e.g. thread count */
diff --git a/io_uring/cancel.c b/io_uring/cancel.c
index ca12ac10c0ae..07b8d852218b 100644
--- a/io_uring/cancel.c
+++ b/io_uring/cancel.c
@@ -184,7 +184,9 @@ static int __io_async_cancel(struct io_cancel_data *cd,
 	} while (1);
 
 	/* slow path, try all io-wq's */
+	__set_current_state(TASK_RUNNING);
 	io_ring_submit_lock(ctx, issue_flags);
+	mutex_lock(&ctx->tctx_lock);
 	ret = -ENOENT;
 	list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
 		ret = io_async_cancel_one(node->task->io_uring, cd);
@@ -194,6 +196,7 @@ static int __io_async_cancel(struct io_cancel_data *cd,
 			nr++;
 		}
 	}
+	mutex_unlock(&ctx->tctx_lock);
 	io_ring_submit_unlock(ctx, issue_flags);
 	return all ? nr : ret;
 }
@@ -484,6 +487,7 @@ static __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx)
 	bool ret = false;
 
 	mutex_lock(&ctx->uring_lock);
+	mutex_lock(&ctx->tctx_lock);
 	list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
 		struct io_uring_task *tctx = node->task->io_uring;
 
@@ -496,6 +500,7 @@ static __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx)
 		cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_ctx_cb, ctx, true);
 		ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
 	}
+	mutex_unlock(&ctx->tctx_lock);
 	mutex_unlock(&ctx->uring_lock);
 
 	return ret;
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 709943fedaf4..87a87396e940 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -340,6 +340,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	INIT_LIST_HEAD(&ctx->ltimeout_list);
 	init_llist_head(&ctx->work_llist);
 	INIT_LIST_HEAD(&ctx->tctx_list);
+	mutex_init(&ctx->tctx_lock);
 	ctx->submit_state.free_list.next = NULL;
 	INIT_HLIST_HEAD(&ctx->waitid_list);
 	xa_init_flags(&ctx->zcrx_ctxs, XA_FLAGS_ALLOC);
@@ -3045,6 +3046,7 @@ static __cold void io_ring_exit_work(struct work_struct *work)
 	exit.ctx = ctx;
 
 	mutex_lock(&ctx->uring_lock);
+	mutex_lock(&ctx->tctx_lock);
 	while (!list_empty(&ctx->tctx_list)) {
 		WARN_ON_ONCE(time_after(jiffies, timeout));
 
@@ -3056,6 +3058,7 @@ static __cold void io_ring_exit_work(struct work_struct *work)
 		if (WARN_ON_ONCE(ret))
 			continue;
 
+		mutex_unlock(&ctx->tctx_lock);
 		mutex_unlock(&ctx->uring_lock);
 		/*
 		 * See comment above for
@@ -3064,7 +3067,9 @@ static __cold void io_ring_exit_work(struct work_struct *work)
 		 */
 		wait_for_completion_interruptible(&exit.completion);
 		mutex_lock(&ctx->uring_lock);
+		mutex_lock(&ctx->tctx_lock);
 	}
+	mutex_unlock(&ctx->tctx_lock);
 	mutex_unlock(&ctx->uring_lock);
 	spin_lock(&ctx->completion_lock);
 	spin_unlock(&ctx->completion_lock);
diff --git a/io_uring/register.c b/io_uring/register.c
index 62d39b3ff317..3d3822ff3fd9 100644
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -320,6 +320,7 @@ static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
 		return 0;
 
 	/* now propagate the restriction to all registered users */
+	mutex_lock(&ctx->tctx_lock);
 	list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
 		tctx = node->task->io_uring;
 		if (WARN_ON_ONCE(!tctx->io_wq))
@@ -330,6 +331,7 @@ static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
 		/* ignore errors, it always returns zero anyway */
 		(void)io_wq_max_workers(tctx->io_wq, new_count);
 	}
+	mutex_unlock(&ctx->tctx_lock);
 	return 0;
 err:
 	if (sqd) {
diff --git a/io_uring/tctx.c b/io_uring/tctx.c
index 5b66755579c0..6d6f44215ec8 100644
--- a/io_uring/tctx.c
+++ b/io_uring/tctx.c
@@ -136,9 +136,9 @@ int __io_uring_add_tctx_node(struct io_ring_ctx *ctx)
 			return ret;
 		}
 
-		mutex_lock(&ctx->uring_lock);
+		mutex_lock(&ctx->tctx_lock);
 		list_add(&node->ctx_node, &ctx->tctx_list);
-		mutex_unlock(&ctx->uring_lock);
+		mutex_unlock(&ctx->tctx_lock);
 	}
 	return 0;
 }
@@ -176,9 +176,9 @@ __cold void io_uring_del_tctx_node(unsigned long index)
 	WARN_ON_ONCE(current != node->task);
 	WARN_ON_ONCE(list_empty(&node->ctx_node));
 
-	mutex_lock(&node->ctx->uring_lock);
+	mutex_lock(&node->ctx->tctx_lock);
 	list_del(&node->ctx_node);
-	mutex_unlock(&node->ctx->uring_lock);
+	mutex_unlock(&node->ctx->tctx_lock);
 
 	if (tctx->last == node->ctx)
 		tctx->last = NULL;
-- 
cgit v1.2.3


From 304f5784e97281f18ef4bed574cbe5fcf6ce2f2e Mon Sep 17 00:00:00 2001
From: André Draszik <andre.draszik@linaro.org>
Date: Fri, 9 Jan 2026 08:38:43 +0000
Subject: regulator: core: reresolve unresolved supplies when available
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When a regulator A and its supply B are provided by different devices,
the driver implementing B might be last to probe (with A still
pending resolution of its supply B). While we try to resolve all
pending supplies for all regulators (including A) during
regulator_register() of B via regulator_register_resolve_supply(),
supply resolution will still not work for A as the driver for B hasn't
finished binding to the PMIC device corresponding to B at that stage
yet. The regulator core explicitly only allows supplies from other
devices to be used once the relevant driver has fully bound, mainly to
avoid having to deal with cases where B itself might -EPROBE_DEFER.

In this case, A's supply will only be resolved as part of the core's
regulator_init_complete_work_function(), which currently is scheduled
to run after 30s. This was added as a work-around in
commit 3827b64dba27 ("regulator: core: Resolve supplies before
disabling unused regulators") to cover this situation.

There are two problems with that approach:
* it potentially runs long after all our consumers have probed
* an upcoming change will allow regulator_register() to complete
  successfully even when required supplies (e.g. due to always-on or
  boot-on) are missing at register time, deferring full configuration
  of the regulator (and usability by consumers, i.e. usually consumer
  probe) until the supply becomes available.
  Resolving supplies in the late work func can therefore make it
  impossible for consumers to probe at all, as the driver core will not
  know to reprobe consumers when supplies have resolved.

We could schedule an earlier work to try to resolve supplies sooner,
but that'd be racy as consumers of A might try to probe before A's
supply gets fully resolved via this extra work.

Instead, add a very simple regulator bus and add a dummy device with a
corresponding driver to it for each regulator that is missing its
supply during regulator_register(). This way, the driver core will call
our bus' probe whenever a new (regulator) device was successfully
bound, allowing us to retry resolving the supply during (our bus) probe
and to bind this dummy device if successful. In turn this means the
driver core will see a newly bound device and retry probing of all
pending consumers, if any.

With that in place, we can avoid walking the full list of all known
regulators to try resolve missing supplies during regulator_register(),
as the driver core will invoke the bus probe for regulators that are
still pending their supplies. We can also drop the code trying to
resolve supplies one last time before unused regulators get disabled,
as all supplies should have resolved at that point in time, and if they
haven't then there's no point in trying again, as the outcome won't
change.

Note: We can not reuse the existing struct device created for each
rail, as a device can not be part of a class and a bus simultaneously.

Signed-off-by: André Draszik <andre.draszik@linaro.org>
Link: https://patch.msgid.link/20260109-regulators-defer-v2-7-1a25dc968e60@linaro.org
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/regulator/core.c         | 121 +++++++++++++++++++++++++++++++--------
 include/linux/regulator/driver.h |   1 +
 2 files changed, 98 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c
index 86dbee3ffda0..08e92b1ba2dc 100644
--- a/drivers/regulator/core.c
+++ b/drivers/regulator/core.c
@@ -44,6 +44,8 @@ static LIST_HEAD(regulator_supply_alias_list);
 static LIST_HEAD(regulator_coupler_list);
 static bool has_full_constraints;
 
+static const struct bus_type regulator_bus;
+
 static struct dentry *debugfs_root;
 
 /*
@@ -5694,17 +5696,6 @@ static void rdev_init_debugfs(struct regulator_dev *rdev)
 			   &rdev->bypass_count);
 }
 
-static int regulator_register_resolve_supply(struct device *dev, void *data)
-{
-	struct regulator_dev *rdev = dev_to_rdev(dev);
-
-	if (regulator_resolve_supply(rdev))
-		rdev_dbg(rdev, "unable to resolve supply '%s'\n",
-			 rdev->supply_name);
-
-	return 0;
-}
-
 int regulator_coupler_register(struct regulator_coupler *coupler)
 {
 	mutex_lock(&regulator_list_mutex);
@@ -5923,6 +5914,7 @@ regulator_register(struct device *dev,
 	struct regulator_config *config = NULL;
 	static atomic_t regulator_no = ATOMIC_INIT(-1);
 	struct regulator_dev *rdev;
+	bool tried_supply_resolve = false;
 	bool dangling_cfg_gpiod = false;
 	bool dangling_of_gpiod = false;
 	int ret, i;
@@ -6093,6 +6085,7 @@ regulator_register(struct device *dev,
 		else
 			rdev_dbg(rdev, "unable to resolve supply early: %pe\n",
 				 ERR_PTR(ret));
+		tried_supply_resolve = true;
 	}
 	if (ret < 0)
 		goto wash;
@@ -6124,6 +6117,37 @@ regulator_register(struct device *dev,
 	if (ret != 0)
 		goto unset_supplies;
 
+	if (!tried_supply_resolve) {
+		/*
+		 * As an optimisation, try to resolve our supply (if any) now to
+		 * avoid adding the bus device. Errors are not fatal at this
+		 * stage, we'll simply try again later.
+		 */
+		ret = regulator_resolve_supply(rdev);
+		if (ret)
+			rdev_dbg(rdev,
+				 "unable to resolve supply (ignoring): %pe\n",
+				 ERR_PTR(ret));
+	}
+
+	/*
+	 * If we have a supply but couldn't resolve it yet, register a device
+	 * with our bus, so that the bus probe gets called whenever any new
+	 * driver binds, allowing us to retry matching supplies and which then
+	 * triggers (re)probe of consumers if successful.
+	 */
+	if (rdev->supply_name && !rdev->supply) {
+		device_initialize(&rdev->bdev);
+		rdev->bdev.bus = &regulator_bus;
+		rdev->bdev.parent = &rdev->dev;
+		device_set_pm_not_required(&rdev->dev);
+		dev_set_name(&rdev->bdev, "%s.bdev", dev_name(&rdev->dev));
+
+		ret = device_add(&rdev->bdev);
+		if (ret)
+			goto del_cdev_and_bdev;
+	}
+
 	rdev_init_debugfs(rdev);
 
 	/* try to resolve regulators coupling since a new one was registered */
@@ -6131,12 +6155,13 @@ regulator_register(struct device *dev,
 	regulator_resolve_coupling(rdev);
 	mutex_unlock(&regulator_list_mutex);
 
-	/* try to resolve regulators supply since a new one was registered */
-	class_for_each_device(&regulator_class, NULL, NULL,
-			      regulator_register_resolve_supply);
 	kfree(config);
 	return rdev;
 
+del_cdev_and_bdev:
+	if (rdev->bdev.bus == &regulator_bus)
+		put_device(&rdev->bdev);
+	device_del(&rdev->dev);
 unset_supplies:
 	mutex_lock(&regulator_list_mutex);
 	unset_regulator_supplies(rdev);
@@ -6189,6 +6214,9 @@ void regulator_unregister(struct regulator_dev *rdev)
 	unset_regulator_supplies(rdev);
 	list_del(&rdev->list);
 	regulator_ena_gpio_free(rdev);
+	if (rdev->bdev.bus == &regulator_bus)
+		/* only if the device was added in the first place */
+		device_unregister(&rdev->bdev);
 	device_unregister(&rdev->dev);
 
 	mutex_unlock(&regulator_list_mutex);
@@ -6269,6 +6297,45 @@ const struct class regulator_class = {
 	.pm = &regulator_pm_ops,
 #endif
 };
+
+#define bdev_to_rdev(__bdev) container_of_const(__bdev, struct regulator_dev, bdev)
+
+static int regulator_bus_match(struct device *bdev,
+			       const struct device_driver *drv)
+{
+	/* Match always succeeds, we only have one driver */
+	return 1;
+}
+
+static int regulator_bus_probe(struct device *bdev)
+{
+	struct regulator_dev *rdev = bdev_to_rdev(bdev);
+	int ret;
+
+	ret = regulator_resolve_supply(rdev);
+	if (ret)
+		rdev_dbg(rdev,
+			 "unable to resolve supply or constraints '%s': %pe\n",
+			 rdev->supply_name, ERR_PTR(ret));
+	else
+		rdev_dbg(rdev, "resolved supply '%s'\n", rdev->supply_name);
+
+	return ret;
+}
+
+static const struct bus_type regulator_bus = {
+	.name = "regulator",
+	.match = regulator_bus_match,
+	.probe = regulator_bus_probe,
+};
+
+static struct device_driver regulator_bus_driver = {
+	.name = "regulator-bus-drv",
+	.bus = &regulator_bus,
+	.suppress_bind_attrs = true,
+	.probe_type = PROBE_PREFER_ASYNCHRONOUS,
+};
+
 /**
  * regulator_has_full_constraints - the system has fully specified constraints
  *
@@ -6602,7 +6669,17 @@ static int __init regulator_init(void)
 {
 	int ret;
 
+	ret = bus_register(&regulator_bus);
+	if (ret)
+		return ret;
+
 	ret = class_register(&regulator_class);
+	if (ret)
+		goto err_class;
+
+	ret = driver_register(&regulator_bus_driver);
+	if (ret)
+		goto err_driver;
 
 	debugfs_root = debugfs_create_dir("regulator", NULL);
 	if (IS_ERR(debugfs_root))
@@ -6619,6 +6696,12 @@ static int __init regulator_init(void)
 
 	regulator_coupler_register(&generic_regulator_coupler);
 
+	return 0;
+
+err_driver:
+	class_unregister(&regulator_class);
+err_class:
+	bus_unregister(&regulator_bus);
 	return ret;
 }
 
@@ -6679,16 +6762,6 @@ __setup("regulator_ignore_unused", regulator_ignore_unused_setup);
 
 static void regulator_init_complete_work_function(struct work_struct *work)
 {
-	/*
-	 * Regulators may had failed to resolve their input supplies
-	 * when were registered, either because the input supply was
-	 * not registered yet or because its parent device was not
-	 * bound yet. So attempt to resolve the input supplies for
-	 * pending regulators before trying to disable unused ones.
-	 */
-	class_for_each_device(&regulator_class, NULL, NULL,
-			      regulator_register_resolve_supply);
-
 	/*
 	 * For debugging purposes, it may be useful to prevent unused
 	 * regulators from being disabled.
diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h
index 978cf593b662..d38353f2b56f 100644
--- a/include/linux/regulator/driver.h
+++ b/include/linux/regulator/driver.h
@@ -635,6 +635,7 @@ struct regulator_dev {
 	int ref_cnt;
 	struct module *owner;
 	struct device dev;
+	struct device bdev;
 	struct regulation_constraints *constraints;
 	struct regulator *supply;	/* for tree */
 	const char *supply_name;
-- 
cgit v1.2.3


From 8d38423d9dea7353a8a54a3ab2e0d0aa04ed34d0 Mon Sep 17 00:00:00 2001
From: André Draszik <andre.draszik@linaro.org>
Date: Fri, 9 Jan 2026 08:38:44 +0000
Subject: regulator: core: don't fail regulator_register() with missing
 required supply
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Since commit 98e48cd9283d ("regulator: core: resolve supply for
boot-on/always-on regulators"), the regulator core returns
-EPROBE_DEFER if a supply can not be resolved at regulator_register()
time due to set_machine_constraints() requiring that supply (e.g.
because of always-on or boot-on).

In some hardware designs, multiple PMICs are used where individual
rails of each act as supplies for rails of the other, and vice-versa.
In such a design no PMIC driver can probe when registering one top-
level regulator device (as is common practice for almost all regulator
drivers in Linux) since that commit. Supplies are only considered when
their driver has fully bound, but because in a design like the above
two drivers / devices depend on each other, neither will have fully
bound while the other probes. The Google Pixel 6 and 6 Pro (oriole and
raven) are examples of such a design.

One way to make this work would be to register each rail as an
individual device, rather than just one top-level regulator device.
Then, fw-devlink and Linux' driver core could do their usual handling
of deferred device probe as each rail would be probed individually.
This approach was dismissed in [1] as each regulator driver would have
to take care of this itself.

Alternatively, we can change the regulator core to not fail
regulator_register() if a rail's required supply can not be resolved
while keeping the intended change from above mentioned commit, and
instead retry whenever a new rail is registered. This commit implements
such an approach:

    If set_machine_constraints() requests probe deferral,
    regulator_register() still succeeds and we retry setting
    constraints as part of regulator_resolve_supply().
    We still do not enable the regulator or allow consumers to use it
    until constraints have been set (including resolution of the
    supply) to prevent enabling of a regulator before its supply.

With this change, we keep track of regulators with missing required
supplies and can therefore try to resolve them again and try to set
the constraints again once more regulators become available.

Care has to be taken to not allow consumers to use regulators that
haven't had their constraints set yet. regulator_get() ensures that
and now returns -EPROBE_DEFER in that case.

The implementation is straight-forward, thanks to our newly introduced
regulator-bus. Locking in regulator_resolve_supply() has to be done
carefully, as a combination of regulator_(un)lock_two() and
regulator_(un)lock_dependent() is needed. The reason is that
set_machine_constraints() might call regulator_enable() which needs
rdev and all its dependents locked, but everything else requires to
only have rdev and its supply locked.

Link: https://lore.kernel.org/all/aRn_-o-vie_QoDXD@sirena.co.uk/ [1]
Signed-off-by: André Draszik <andre.draszik@linaro.org>
Link: https://patch.msgid.link/20260109-regulators-defer-v2-8-1a25dc968e60@linaro.org
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/regulator/core.c         | 148 +++++++++++++++++++++++++++++++--------
 include/linux/regulator/driver.h |   1 +
 2 files changed, 119 insertions(+), 30 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c
index 08e92b1ba2dc..8c2fd20edd50 100644
--- a/drivers/regulator/core.c
+++ b/drivers/regulator/core.c
@@ -98,6 +98,7 @@ struct regulator_event_work {
 	unsigned long event;
 };
 
+static int _regulator_enable(struct regulator *regulator);
 static int _regulator_is_enabled(struct regulator_dev *rdev);
 static int _regulator_disable(struct regulator *regulator);
 static int _regulator_get_error_flags(struct regulator_dev *rdev, unsigned int *flags);
@@ -1432,6 +1433,7 @@ static int handle_notify_limits(struct regulator_dev *rdev,
 /**
  * set_machine_constraints - sets regulator constraints
  * @rdev: regulator source
+ * @is_locked: whether or not this is called with locks held already
  *
  * Allows platform initialisation code to define and constrain
  * regulator circuits e.g. valid voltage/current ranges, etc.  NOTE:
@@ -1441,7 +1443,8 @@ static int handle_notify_limits(struct regulator_dev *rdev,
  *
  * Return: 0 on success or a negative error number on failure.
  */
-static int set_machine_constraints(struct regulator_dev *rdev)
+static int set_machine_constraints(struct regulator_dev *rdev,
+				   bool is_locked)
 {
 	int ret = 0;
 	const struct regulator_ops *ops = rdev->desc->ops;
@@ -1653,7 +1656,9 @@ static int set_machine_constraints(struct regulator_dev *rdev)
 		if (rdev->supply &&
 		    (rdev->constraints->always_on ||
 		     !regulator_is_enabled(rdev->supply))) {
-			ret = regulator_enable(rdev->supply);
+			ret = (is_locked
+			       ? _regulator_enable(rdev->supply)
+			       : regulator_enable(rdev->supply));
 			if (ret < 0) {
 				_regulator_put(rdev->supply);
 				rdev->supply = NULL;
@@ -1781,6 +1786,15 @@ static int register_regulator_event_forwarding(struct regulator_dev *rdev)
 	return 0;
 }
 
+static void unregister_regulator_event_forwarding(struct regulator_dev *rdev)
+{
+	if (!rdev->supply_fwd_nb.notifier_call)
+		return;
+
+	regulator_unregister_notifier(rdev->supply, &rdev->supply_fwd_nb);
+	rdev->supply_fwd_nb.notifier_call = NULL;
+}
+
 /**
  * set_supply - set regulator supply regulator
  * @rdev: regulator (locked)
@@ -2169,6 +2183,8 @@ static int regulator_resolve_supply(struct regulator_dev *rdev)
 	struct regulator_dev *r;
 	struct device *dev = rdev->dev.parent;
 	struct ww_acquire_ctx ww_ctx;
+	struct regulator *supply;
+	bool do_final_setup;
 	int ret = 0;
 
 	/* No supply to resolve? */
@@ -2176,7 +2192,7 @@ static int regulator_resolve_supply(struct regulator_dev *rdev)
 		return 0;
 
 	/* Supply already resolved? (fast-path without locking contention) */
-	if (rdev->supply)
+	if (rdev->supply && !rdev->constraints_pending)
 		return 0;
 
 	/* first do a dt based lookup on the node described in the virtual
@@ -2257,46 +2273,115 @@ static int regulator_resolve_supply(struct regulator_dev *rdev)
 
 	/* Supply just resolved by a concurrent task? */
 	if (rdev->supply) {
+		/* Constraints might still be pending due to concurrency. */
+		bool done = !rdev->constraints_pending;
+
+		supply = rdev->supply;
+
 		regulator_unlock_two(rdev, r, &ww_ctx);
 		put_device(&r->dev);
-		goto out;
-	}
 
-	ret = set_supply(rdev, r);
-	if (ret < 0) {
+		/*
+		 * Supply resolved by concurrent task, and constraints set as
+		 * well (or not required): fast path.
+		 */
+		if (done)
+			goto out;
+
+		do_final_setup = false;
+	} else {
+		ret = set_supply(rdev, r);
+		if (ret < 0) {
+			regulator_unlock_two(rdev, r, &ww_ctx);
+			put_device(&r->dev);
+			goto out;
+		}
+
+		supply = rdev->supply;
+
+		/*
+		 * Automatically register for event forwarding from the new
+		 * supply. This creates the downstream propagation link for
+		 * events like under-voltage.
+		 */
+		ret = register_regulator_event_forwarding(rdev);
+		if (ret < 0) {
+			rdev_warn(rdev,
+				  "Failed to register event forwarding: %pe\n",
+				  ERR_PTR(ret));
+
+			goto unset_supply;
+		}
+
 		regulator_unlock_two(rdev, r, &ww_ctx);
-		put_device(&r->dev);
-		goto out;
+
+		do_final_setup = true;
 	}
 
 	/*
-	 * Automatically register for event forwarding from the new supply.
-	 * This creates the downstream propagation link for events like
-	 * under-voltage.
+	 * Now that we have the supply, we can retry setting the machine
+	 * constraints, if necessary.
 	 */
-	ret = register_regulator_event_forwarding(rdev);
-	if (ret < 0) {
-		struct regulator *supply;
-
-		rdev_warn(rdev, "Failed to register event forwarding: %pe\n",
-			  ERR_PTR(ret));
-
-		supply = rdev->supply;
-		rdev->supply = NULL;
+	regulator_lock_dependent(rdev, &ww_ctx);
+	if (rdev->constraints_pending) {
+		if (!rdev->supply) {
+			/*
+			 * Supply could have been released by another task that
+			 * failed to set the constraints or event forwarding.
+			 */
+			regulator_unlock_dependent(rdev, &ww_ctx);
+			ret = -EPROBE_DEFER;
+			goto out;
+		}
 
-		regulator_unlock_two(rdev, supply->rdev, &ww_ctx);
+		ret = set_machine_constraints(rdev, true);
+		if (ret < 0) {
+			regulator_unlock_dependent(rdev, &ww_ctx);
+
+			rdev_warn(rdev,
+				  "Failed to set machine constraints: %pe\n",
+				  ERR_PTR(ret));
+
+			regulator_lock_two(rdev, r, &ww_ctx);
+
+			if (supply != rdev->supply) {
+				/*
+				 * Supply could have been released by another
+				 * task that got here before us. If it did, it
+				 * will have released 'supply' (i.e. the
+				 * previous rdev->supply) and we shouldn't do
+				 * that again via unset_supply.
+				 */
+				regulator_unlock_two(rdev, r, &ww_ctx);
+				goto out;
+			}
 
-		regulator_put(supply);
-		goto out;
+			unregister_regulator_event_forwarding(rdev);
+			rdev->constraints_pending = true;
+			goto unset_supply;
+		}
+		rdev->constraints_pending = false;
 	}
+	regulator_unlock_dependent(rdev, &ww_ctx);
 
-	regulator_unlock_two(rdev, r, &ww_ctx);
+	if (!do_final_setup)
+		goto out;
 
 	/* rdev->supply was created in set_supply() */
-	link_and_create_debugfs(rdev->supply, r, &rdev->dev);
+	link_and_create_debugfs(rdev->supply, rdev->supply->rdev, &rdev->dev);
 
 out:
 	return ret;
+
+unset_supply:
+	lockdep_assert_held_once(&rdev->mutex.base);
+	lockdep_assert_held_once(&r->mutex.base);
+	rdev->supply = NULL;
+	regulator_unlock_two(rdev, supply->rdev, &ww_ctx);
+
+	regulator_put(supply);
+
+	return ret;
 }
 
 /* common pre-checks for regulator requests */
@@ -6067,7 +6152,7 @@ regulator_register(struct device *dev,
 		dangling_of_gpiod = false;
 	}
 
-	ret = set_machine_constraints(rdev);
+	ret = set_machine_constraints(rdev, false);
 	if (ret == -EPROBE_DEFER) {
 		/* Regulator might be in bypass mode or an always-on or boot-on
 		 * regulator and so needs its supply to set the constraints or
@@ -6081,14 +6166,17 @@ regulator_register(struct device *dev,
 			 rdev->supply_name);
 		ret = regulator_resolve_supply(rdev);
 		if (!ret)
-			ret = set_machine_constraints(rdev);
+			ret = set_machine_constraints(rdev, false);
 		else
 			rdev_dbg(rdev, "unable to resolve supply early: %pe\n",
 				 ERR_PTR(ret));
 		tried_supply_resolve = true;
 	}
-	if (ret < 0)
-		goto wash;
+	if (ret < 0) {
+		if (ret != -EPROBE_DEFER)
+			goto wash;
+		rdev->constraints_pending = true;
+	}
 
 	ret = regulator_init_coupling(rdev);
 	if (ret < 0)
diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h
index d38353f2b56f..09f3b67638f9 100644
--- a/include/linux/regulator/driver.h
+++ b/include/linux/regulator/driver.h
@@ -650,6 +650,7 @@ struct regulator_dev {
 	struct regulator_enable_gpio *ena_pin;
 	unsigned int ena_gpio_state:1;
 
+	unsigned int constraints_pending:1;
 	unsigned int is_switch:1;
 
 	/* time when this regulator was disabled last time */
-- 
cgit v1.2.3