From ed61378b4dc63efe76cb8c23a36b228043332da3 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Mon, 8 Dec 2025 09:05:48 -0500 Subject: iomap: replace folio_batch allocation with stack allocation Zhang Yi points out that the dynamic folio_batch allocation in iomap_fill_dirty_folios() is problematic for the ext4 on iomap work that is under development because it doesn't sufficiently handle the allocation failure case (by allowing a retry, for example). We've also seen lockdep (via syzbot) complain recently about the scope of the allocation. The dynamic allocation was initially added for simplicity and to help indicate whether the batch was used or not by the calling fs. To address these issues, put the batch on the stack of iomap_zero_range() and use a flag to control whether the batch should be used in the iomap folio lookup path. This keeps things simple and eliminates allocation issues with lockdep and for ext4 on iomap. While here, also clean up the fill helper signature to be more consistent with the underlying filemap helper. Pass through the return value of the filemap helper (folio count) and update the lookup offset via an out param. Fixes: 395ed1ef0012 ("iomap: optional zero range dirty folio processing") Signed-off-by: Brian Foster Link: https://patch.msgid.link/20251208140548.373411-1-bfoster@redhat.com Acked-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Christian Brauner --- include/linux/iomap.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 520e967cb501..6bb941707d12 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -88,6 +88,9 @@ struct vm_fault; /* * Flags set by the core iomap code during operations: * + * IOMAP_F_FOLIO_BATCH indicates that the folio batch mechanism is active + * for this operation, set by iomap_fill_dirty_folios(). 
+ * * IOMAP_F_SIZE_CHANGED indicates to the iomap_end method that the file size * has changed as the result of this write operation. * @@ -95,6 +98,7 @@ struct vm_fault; * range it covers needs to be remapped by the high level before the operation * can proceed. */ +#define IOMAP_F_FOLIO_BATCH (1U << 13) #define IOMAP_F_SIZE_CHANGED (1U << 14) #define IOMAP_F_STALE (1U << 15) @@ -352,8 +356,8 @@ bool iomap_dirty_folio(struct address_space *mapping, struct folio *folio); int iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len, const struct iomap_ops *ops, const struct iomap_write_ops *write_ops); -loff_t iomap_fill_dirty_folios(struct iomap_iter *iter, loff_t offset, - loff_t length); +unsigned int iomap_fill_dirty_folios(struct iomap_iter *iter, loff_t *start, + loff_t end, unsigned int *iomap_flags); int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, const struct iomap_ops *ops, const struct iomap_write_ops *write_ops, void *private); -- cgit v1.2.3 From 12965a190eaea614bb49e22041e8fc0d03d0310f Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 4 Dec 2025 08:48:33 -0500 Subject: filelock: allow lease_managers to dictate what qualifies as a conflict Requesting a delegation on a file from the userland fcntl() interface currently succeeds when there are conflicting opens present. This is because the lease handling code ignores conflicting opens for FL_LAYOUT and FL_DELEG leases. This was a hack put in place long ago, because nfsd already checks for conflicts in its own way. The kernel needs to perform this check for userland delegations the same way it is done for leases, however. Make this dependent on the lease_manager by adding a new ->lm_open_conflict() lease_manager operation and have generic_add_lease() call that instead of check_conflicting_open(). Morph check_conflicting_open() into a ->lm_open_conflict() op that is only called for userland leases/delegations. 
Set the ->lm_open_conflict() operations for nfsd to trivial functions that always return 0. Reviewed-by: Chuck Lever Signed-off-by: Jeff Layton Link: https://patch.msgid.link/20251204-dir-deleg-ro-v2-2-22d37f92ce2c@kernel.org Signed-off-by: Christian Brauner --- Documentation/filesystems/locking.rst | 1 + fs/locks.c | 90 ++++++++++++++++------------------- fs/nfsd/nfs4layouts.c | 23 ++++++++- fs/nfsd/nfs4state.c | 19 ++++++++ include/linux/filelock.h | 1 + 5 files changed, 84 insertions(+), 50 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst index 77704fde9845..04c7691e50e0 100644 --- a/Documentation/filesystems/locking.rst +++ b/Documentation/filesystems/locking.rst @@ -416,6 +416,7 @@ lm_change yes no no lm_breaker_owns_lease: yes no no lm_lock_expirable yes no no lm_expire_lock no no yes +lm_open_conflict yes no no ====================== ============= ================= ========= buffer_head diff --git a/fs/locks.c b/fs/locks.c index be0b79286da8..e75c8084d937 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -585,10 +585,50 @@ lease_setup(struct file_lease *fl, void **priv) __f_setown(filp, task_pid(current), PIDTYPE_TGID, 0); } +/** + * lease_open_conflict - see if the given file points to an inode that has + * an existing open that would conflict with the + * desired lease. + * @filp: file to check + * @arg: type of lease that we're trying to acquire + * + * Check to see if there's an existing open fd on this file that would + * conflict with the lease we're trying to set. + */ +static int +lease_open_conflict(struct file *filp, const int arg) +{ + struct inode *inode = file_inode(filp); + int self_wcount = 0, self_rcount = 0; + + if (arg == F_RDLCK) + return inode_is_open_for_write(inode) ? -EAGAIN : 0; + else if (arg != F_WRLCK) + return 0; + + /* + * Make sure that only read/write count is from lease requestor. 
+ * Note that this will result in denying write leases when i_writecount + * is negative, which is what we want. (We shouldn't grant write leases + * on files open for execution.) + */ + if (filp->f_mode & FMODE_WRITE) + self_wcount = 1; + else if (filp->f_mode & FMODE_READ) + self_rcount = 1; + + if (atomic_read(&inode->i_writecount) != self_wcount || + atomic_read(&inode->i_readcount) != self_rcount) + return -EAGAIN; + + return 0; +} + static const struct lease_manager_operations lease_manager_ops = { .lm_break = lease_break_callback, .lm_change = lease_modify, .lm_setup = lease_setup, + .lm_open_conflict = lease_open_conflict, }; /* @@ -1754,52 +1794,6 @@ int fcntl_getdeleg(struct file *filp, struct delegation *deleg) return 0; } -/** - * check_conflicting_open - see if the given file points to an inode that has - * an existing open that would conflict with the - * desired lease. - * @filp: file to check - * @arg: type of lease that we're trying to acquire - * @flags: current lock flags - * - * Check to see if there's an existing open fd on this file that would - * conflict with the lease we're trying to set. - */ -static int -check_conflicting_open(struct file *filp, const int arg, int flags) -{ - struct inode *inode = file_inode(filp); - int self_wcount = 0, self_rcount = 0; - - if (flags & FL_LAYOUT) - return 0; - if (flags & FL_DELEG) - /* We leave these checks to the caller */ - return 0; - - if (arg == F_RDLCK) - return inode_is_open_for_write(inode) ? -EAGAIN : 0; - else if (arg != F_WRLCK) - return 0; - - /* - * Make sure that only read/write count is from lease requestor. - * Note that this will result in denying write leases when i_writecount - * is negative, which is what we want. (We shouldn't grant write leases - * on files open for execution.) 
- */ - if (filp->f_mode & FMODE_WRITE) - self_wcount = 1; - else if (filp->f_mode & FMODE_READ) - self_rcount = 1; - - if (atomic_read(&inode->i_writecount) != self_wcount || - atomic_read(&inode->i_readcount) != self_rcount) - return -EAGAIN; - - return 0; -} - static int generic_add_lease(struct file *filp, int arg, struct file_lease **flp, void **priv) { @@ -1836,7 +1830,7 @@ generic_add_lease(struct file *filp, int arg, struct file_lease **flp, void **pr percpu_down_read(&file_rwsem); spin_lock(&ctx->flc_lock); time_out_leases(inode, &dispose); - error = check_conflicting_open(filp, arg, lease->c.flc_flags); + error = lease->fl_lmops->lm_open_conflict(filp, arg); if (error) goto out; @@ -1893,7 +1887,7 @@ generic_add_lease(struct file *filp, int arg, struct file_lease **flp, void **pr * precedes these checks. */ smp_mb(); - error = check_conflicting_open(filp, arg, lease->c.flc_flags); + error = lease->fl_lmops->lm_open_conflict(filp, arg); if (error) { locks_unlink_lock_ctx(&lease->c); goto out; diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c index 683bd1130afe..ad7af8cfcf1f 100644 --- a/fs/nfsd/nfs4layouts.c +++ b/fs/nfsd/nfs4layouts.c @@ -764,9 +764,28 @@ nfsd4_layout_lm_change(struct file_lease *onlist, int arg, return lease_modify(onlist, arg, dispose); } +/** + * nfsd4_layout_lm_open_conflict - see if the given file points to an inode that has + * an existing open that would conflict with the + * desired lease. + * @filp: file to check + * @arg: type of lease that we're trying to acquire + * + * The kernel will call into this operation to determine whether there + * are conflicting opens that may prevent the layout from being granted. + * For nfsd, that check is done at a higher level, so this trivially + * returns 0. 
+ */ +static int +nfsd4_layout_lm_open_conflict(struct file *filp, int arg) +{ + return 0; +} + static const struct lease_manager_operations nfsd4_layouts_lm_ops = { - .lm_break = nfsd4_layout_lm_break, - .lm_change = nfsd4_layout_lm_change, + .lm_break = nfsd4_layout_lm_break, + .lm_change = nfsd4_layout_lm_change, + .lm_open_conflict = nfsd4_layout_lm_open_conflict, }; int diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 808c24fb5c9a..19d6d6db107f 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -5552,10 +5552,29 @@ nfsd_change_deleg_cb(struct file_lease *onlist, int arg, return -EAGAIN; } +/** + * nfsd4_deleg_lm_open_conflict - see if the given file points to an inode that has + * an existing open that would conflict with the + * desired lease. + * @filp: file to check + * @arg: type of lease that we're trying to acquire + * + * The kernel will call into this operation to determine whether there + * are conflicting opens that may prevent the deleg from being granted. + * For nfsd, that check is done at a higher level, so this trivially + * returns 0. 
+ */ +static int +nfsd4_deleg_lm_open_conflict(struct file *filp, int arg) +{ + return 0; +} + static const struct lease_manager_operations nfsd_lease_mng_ops = { .lm_breaker_owns_lease = nfsd_breaker_owns_lease, .lm_break = nfsd_break_deleg_cb, .lm_change = nfsd_change_deleg_cb, + .lm_open_conflict = nfsd4_deleg_lm_open_conflict, }; static __be32 nfsd4_check_seqid(struct nfsd4_compound_state *cstate, struct nfs4_stateowner *so, u32 seqid) diff --git a/include/linux/filelock.h b/include/linux/filelock.h index 54b824c05299..2f5e5588ee07 100644 --- a/include/linux/filelock.h +++ b/include/linux/filelock.h @@ -49,6 +49,7 @@ struct lease_manager_operations { int (*lm_change)(struct file_lease *, int, struct list_head *); void (*lm_setup)(struct file_lease *, void **); bool (*lm_breaker_owns_lease)(struct file_lease *); + int (*lm_open_conflict)(struct file *, int); }; struct lock_manager { -- cgit v1.2.3 From 4a824c3128998158a093eaadd776a79abe3a601a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 4 Dec 2025 15:31:27 +0000 Subject: entry: Always inline local_irq_{enable,disable}_exit_to_user() clang needs __always_inline instead of inline, even for tiny helpers. 
This saves some cycles in system call fast path, and saves 195 bytes on x86_64 build: $ size vmlinux.before vmlinux.after text data bss dec hex filename 34652814 22291961 5875180 62819955 3be8e73 vmlinux.before 34652619 22291961 5875180 62819760 3be8db0 vmlinux.after Signed-off-by: Eric Dumazet Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20251204153127.1321824-1-edumazet@google.com --- include/linux/irq-entry-common.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irq-entry-common.h b/include/linux/irq-entry-common.h index 6ab913e57da0..d26d1b1bcbfb 100644 --- a/include/linux/irq-entry-common.h +++ b/include/linux/irq-entry-common.h @@ -110,7 +110,7 @@ static __always_inline void enter_from_user_mode(struct pt_regs *regs) static inline void local_irq_enable_exit_to_user(unsigned long ti_work); #ifndef local_irq_enable_exit_to_user -static inline void local_irq_enable_exit_to_user(unsigned long ti_work) +static __always_inline void local_irq_enable_exit_to_user(unsigned long ti_work) { local_irq_enable(); } @@ -125,7 +125,7 @@ static inline void local_irq_enable_exit_to_user(unsigned long ti_work) static inline void local_irq_disable_exit_to_user(void); #ifndef local_irq_disable_exit_to_user -static inline void local_irq_disable_exit_to_user(void) +static __always_inline void local_irq_disable_exit_to_user(void) { local_irq_disable(); } -- cgit v1.2.3 From 87e7f6019097746d1d06f98874a9f179b7a68f3e Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Fri, 19 Dec 2025 10:36:38 +0200 Subject: software node: Also support referencing non-constant software nodes Fwnode references are implemented differently if the referenced node is a software node. _Generic() is used to differentiate between the two cases but only const software nodes were present in the selection. Also add non-const software nodes. 
Reported-by: Kenneth Crudup Closes: https://lore.kernel.org/all/af773b82-bef2-4209-baaf-526d4661b7fc@panix.com/ Fixes: d7cdbbc93c56 ("software node: allow referencing firmware nodes") Signed-off-by: Sakari Ailus Tested-By: Kenneth R. Crudup Tested-by: Mehdi Djait # Dell XPS 9315 Reviewed-by: Mehdi Djait Link: https://patch.msgid.link/20251219083638.2454138-1-sakari.ailus@linux.intel.com Signed-off-by: Danilo Krummrich --- include/linux/property.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/property.h b/include/linux/property.h index 272bfbdea7bf..e30ef23a9af3 100644 --- a/include/linux/property.h +++ b/include/linux/property.h @@ -371,6 +371,7 @@ struct software_node_ref_args { (const struct software_node_ref_args) { \ .swnode = _Generic(_ref_, \ const struct software_node *: _ref_, \ + struct software_node *: _ref_, \ default: NULL), \ .fwnode = _Generic(_ref_, \ struct fwnode_handle *: _ref_, \ -- cgit v1.2.3 From 20e20b147cf7cb6780a5b95da2a0e37c52cd1015 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 15 Dec 2025 22:38:00 -0800 Subject: platform/x86/intel/vsec: correct kernel-doc comments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix kernel-doc warnings in intel_vsec.h to eliminate all kernel-doc warnings: Warning: include/linux/intel_vsec.h:92 struct member 'read_telem' not described in 'pmt_callbacks' Warning: include/linux/intel_vsec.h:146 expecting prototype for struct intel_sec_device. Prototype was for struct intel_vsec_device instead Warning: include/linux/intel_vsec.h:146 struct member 'priv_data_size' not described in 'intel_vsec_device' In struct pmt_callbacks, correct the kernel-doc for @read_telem. kernel-doc doesn't support documenting callback function parameters, so drop the '@' signs on those and use "* *" to make them somewhat readable in the produced documentation output. 
Signed-off-by: Randy Dunlap Link: https://patch.msgid.link/20251216063801.2896495-1-rdunlap@infradead.org Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- include/linux/intel_vsec.h | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/intel_vsec.h b/include/linux/intel_vsec.h index 53f6fe88e369..1a0f357c2427 100644 --- a/include/linux/intel_vsec.h +++ b/include/linux/intel_vsec.h @@ -80,13 +80,13 @@ enum intel_vsec_quirks { /** * struct pmt_callbacks - Callback infrastructure for PMT devices - * ->read_telem() when specified, called by client driver to access PMT data (instead - * of direct copy). - * @pdev: PCI device reference for the callback's use - * @guid: ID of data to acccss - * @data: buffer for the data to be copied - * @off: offset into the requested buffer - * @count: size of buffer + * @read_telem: when specified, called by client driver to access PMT + * data (instead of direct copy). + * * pdev: PCI device reference for the callback's use + * * guid: ID of data to acccss + * * data: buffer for the data to be copied + * * off: offset into the requested buffer + * * count: size of buffer */ struct pmt_callbacks { int (*read_telem)(struct pci_dev *pdev, u32 guid, u64 *data, loff_t off, u32 count); @@ -120,7 +120,7 @@ struct intel_vsec_platform_info { }; /** - * struct intel_sec_device - Auxbus specific device information + * struct intel_vsec_device - Auxbus specific device information * @auxdev: auxbus device struct for auxbus access * @pcidev: pci device associated with the device * @resource: any resources shared by the parent @@ -128,6 +128,7 @@ struct intel_vsec_platform_info { * @num_resources: number of resources * @id: xarray id * @priv_data: any private data needed + * @priv_data_size: size of private data area * @quirks: specified quirks * @base_addr: base address of entries (if specified) * @cap_id: the enumerated id of the vsec feature -- cgit v1.2.3 From 
5393802c94e0ab1295c04c94c57bcb00222d4674 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 27 Nov 2025 10:39:24 -0800 Subject: genalloc.h: fix htmldocs warning WARNING: include/linux/genalloc.h:52 function parameter 'start_addr' not described in 'genpool_algo_t' Fixes: 52fbf1134d47 ("lib/genalloc.c: fix allocation of aligned buffer from non-aligned chunk") Reported-by: Stephen Rothwell Closes: https://lkml.kernel.org/r/20251127130624.563597e3@canb.auug.org.au Acked-by: Randy Dunlap Tested-by: Randy Dunlap Cc: Alexey Skidanov Signed-off-by: Andrew Morton --- include/linux/genalloc.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/genalloc.h b/include/linux/genalloc.h index 0bd581003cd5..60de63e46b33 100644 --- a/include/linux/genalloc.h +++ b/include/linux/genalloc.h @@ -44,6 +44,7 @@ struct gen_pool; * @nr: The number of zeroed bits we're looking for * @data: optional additional data used by the callback * @pool: the pool being allocated from + * @start_addr: start address of memory chunk */ typedef unsigned long (*genpool_algo_t)(unsigned long *map, unsigned long size, -- cgit v1.2.3 From 007f5da43b3d0ecff972e2616062b8da1f862f5e Mon Sep 17 00:00:00 2001 From: Jiayuan Chen Date: Thu, 4 Dec 2025 18:59:55 +0000 Subject: mm/kasan: fix incorrect unpoisoning in vrealloc for KASAN Patch series "kasan: vmalloc: Fixes for the percpu allocator and vrealloc", v3. Patches fix two issues related to KASAN and vmalloc. The first one, a KASAN tag mismatch, possibly resulting in a kernel panic, can be observed on systems with a tag-based KASAN enabled and with multiple NUMA nodes. Initially it was only noticed on x86 [1] but later a similar issue was also reported on arm64 [2]. Specifically the problem is related to how vm_structs interact with pcpu_chunks - both when they are allocated, assigned and when pcpu_chunk addresses are derived. 
When vm_structs are allocated they are unpoisoned, each with a different random tag, if vmalloc support is enabled along the KASAN mode. Later when first pcpu chunk is allocated it gets its 'base_addr' field set to the first allocated vm_struct. With that it inherits that vm_struct's tag. When pcpu_chunk addresses are later derived (by pcpu_chunk_addr(), for example in pcpu_alloc_noprof()) the base_addr field is used and offsets are added to it. If the initial conditions are satisfied then some of the offsets will point into memory allocated with a different vm_struct. So while the lower bits will get accurately derived the tag bits in the top of the pointer won't match the shadow memory contents. The solution (proposed at v2 of the x86 KASAN series [3]) is to unpoison the vm_structs with the same tag when allocating them for the per cpu allocator (in pcpu_get_vm_areas()). The second one reported by syzkaller [4] is related to vrealloc and happens because of random tag generation when unpoisoning memory without allocating new pages. This breaks shadow memory tracking and needs to reuse the existing tag instead of generating a new one. At the same time an inconsistency in used flags is corrected. This patch (of 3): Syzkaller reported a memory out-of-bounds bug [4]. This patch fixes two issues: 1. In vrealloc the KASAN_VMALLOC_VM_ALLOC flag is missing when unpoisoning the extended region. This flag is required to correctly associate the allocation with KASAN's vmalloc tracking. Note: In contrast, vzalloc (via __vmalloc_node_range_noprof) explicitly sets KASAN_VMALLOC_VM_ALLOC and calls kasan_unpoison_vmalloc() with it. vrealloc must behave consistently -- especially when reusing existing vmalloc regions -- to ensure KASAN can track allocations correctly. 2. When vrealloc reuses an existing vmalloc region (without allocating new pages) KASAN generates a new tag, which breaks tag-based memory access tracking. 
Introduce KASAN_VMALLOC_KEEP_TAG, a new KASAN flag that allows reusing the tag already attached to the pointer, ensuring consistent tag behavior during reallocation. Pass KASAN_VMALLOC_KEEP_TAG and KASAN_VMALLOC_VM_ALLOC to the kasan_unpoison_vmalloc inside vrealloc_node_align_noprof(). Link: https://lkml.kernel.org/r/cover.1765978969.git.m.wieczorretman@pm.me Link: https://lkml.kernel.org/r/38dece0a4074c43e48150d1e242f8242c73bf1a5.1764874575.git.m.wieczorretman@pm.me Link: https://lore.kernel.org/all/e7e04692866d02e6d3b32bb43b998e5d17092ba4.1738686764.git.maciej.wieczor-retman@intel.com/ [1] Link: https://lore.kernel.org/all/aMUrW1Znp1GEj7St@MiWiFi-R3L-srv/ [2] Link: https://lore.kernel.org/all/CAPAsAGxDRv_uFeMYu9TwhBVWHCCtkSxoWY4xmFB_vowMbi8raw@mail.gmail.com/ [3] Link: https://syzkaller.appspot.com/bug?extid=997752115a851cb0cf36 [4] Fixes: a0309faf1cb0 ("mm: vmalloc: support more granular vrealloc() sizing") Signed-off-by: Jiayuan Chen Co-developed-by: Maciej Wieczor-Retman Signed-off-by: Maciej Wieczor-Retman Reported-by: syzbot+997752115a851cb0cf36@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/68e243a2.050a0220.1696c6.007d.GAE@google.com/T/ Reviewed-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Danilo Krummrich Cc: Dmitriy Vyukov Cc: Kees Cook Cc: Marco Elver Cc: "Uladzislau Rezki (Sony)" Cc: Vincenzo Frascino Cc: Signed-off-by: Andrew Morton --- include/linux/kasan.h | 1 + mm/kasan/hw_tags.c | 2 +- mm/kasan/shadow.c | 4 +++- mm/vmalloc.c | 4 +++- 4 files changed, 8 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index f335c1d7b61d..df3d8567dde9 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -28,6 +28,7 @@ typedef unsigned int __bitwise kasan_vmalloc_flags_t; #define KASAN_VMALLOC_INIT ((__force kasan_vmalloc_flags_t)0x01u) #define KASAN_VMALLOC_VM_ALLOC ((__force kasan_vmalloc_flags_t)0x02u) #define KASAN_VMALLOC_PROT_NORMAL 
((__force kasan_vmalloc_flags_t)0x04u) +#define KASAN_VMALLOC_KEEP_TAG ((__force kasan_vmalloc_flags_t)0x08u) #define KASAN_VMALLOC_PAGE_RANGE 0x1 /* Apply exsiting page range */ #define KASAN_VMALLOC_TLB_FLUSH 0x2 /* TLB flush */ diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c index 1c373cc4b3fa..cbef5e450954 100644 --- a/mm/kasan/hw_tags.c +++ b/mm/kasan/hw_tags.c @@ -361,7 +361,7 @@ void *__kasan_unpoison_vmalloc(const void *start, unsigned long size, return (void *)start; } - tag = kasan_random_tag(); + tag = (flags & KASAN_VMALLOC_KEEP_TAG) ? get_tag(start) : kasan_random_tag(); start = set_tag(start, tag); /* Unpoison and initialize memory up to size. */ diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index 29a751a8a08d..32fbdf759ea2 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -631,7 +631,9 @@ void *__kasan_unpoison_vmalloc(const void *start, unsigned long size, !(flags & KASAN_VMALLOC_PROT_NORMAL)) return (void *)start; - start = set_tag(start, kasan_random_tag()); + if (unlikely(!(flags & KASAN_VMALLOC_KEEP_TAG))) + start = set_tag(start, kasan_random_tag()); + kasan_unpoison(start, size, false); return (void *)start; } diff --git a/mm/vmalloc.c b/mm/vmalloc.c index ecbac900c35f..94c0a9262a46 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -4331,7 +4331,9 @@ void *vrealloc_node_align_noprof(const void *p, size_t size, unsigned long align */ if (size <= alloced_size) { kasan_unpoison_vmalloc(p + old_size, size - old_size, - KASAN_VMALLOC_PROT_NORMAL); + KASAN_VMALLOC_PROT_NORMAL | + KASAN_VMALLOC_VM_ALLOC | + KASAN_VMALLOC_KEEP_TAG); /* * No need to zero memory here, as unused memory will have * already been zeroed at initial allocation time or during -- cgit v1.2.3 From 6f13db031e27e88213381039032a9cc061578ea6 Mon Sep 17 00:00:00 2001 From: Maciej Wieczor-Retman Date: Thu, 4 Dec 2025 19:00:04 +0000 Subject: kasan: refactor pcpu kasan vmalloc unpoison A KASAN tag mismatch, possibly causing a kernel panic, can be observed on systems with 
a tag-based KASAN enabled and with multiple NUMA nodes. It was reported on arm64 and reproduced on x86. It can be explained in the following points: 1. There can be more than one virtual memory chunk. 2. Chunk's base address has a tag. 3. The base address points at the first chunk and thus inherits the tag of the first chunk. 4. The subsequent chunks will be accessed with the tag from the first chunk. 5. Thus, the subsequent chunks need to have their tag set to match that of the first chunk. Refactor code by reusing __kasan_unpoison_vmalloc in a new helper in preparation for the actual fix. Link: https://lkml.kernel.org/r/eb61d93b907e262eefcaa130261a08bcb6c5ce51.1764874575.git.m.wieczorretman@pm.me Fixes: 1d96320f8d53 ("kasan, vmalloc: add vmalloc tagging for SW_TAGS") Signed-off-by: Maciej Wieczor-Retman Reviewed-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Danilo Krummrich Cc: Dmitriy Vyukov Cc: Jiayuan Chen Cc: Kees Cook Cc: Marco Elver Cc: "Uladzislau Rezki (Sony)" Cc: Vincenzo Frascino Cc: [6.1+] Signed-off-by: Andrew Morton --- include/linux/kasan.h | 15 +++++++++++++++ mm/kasan/common.c | 17 +++++++++++++++++ mm/vmalloc.c | 4 +--- 3 files changed, 33 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index df3d8567dde9..9c6ac4b62eb9 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -631,6 +631,16 @@ static __always_inline void kasan_poison_vmalloc(const void *start, __kasan_poison_vmalloc(start, size); } +void __kasan_unpoison_vmap_areas(struct vm_struct **vms, int nr_vms, + kasan_vmalloc_flags_t flags); +static __always_inline void +kasan_unpoison_vmap_areas(struct vm_struct **vms, int nr_vms, + kasan_vmalloc_flags_t flags) +{ + if (kasan_enabled()) + __kasan_unpoison_vmap_areas(vms, nr_vms, flags); +} + #else /* CONFIG_KASAN_VMALLOC */ static inline void kasan_populate_early_vm_area_shadow(void *start, @@ -655,6 +665,11 @@ static inline void 
*kasan_unpoison_vmalloc(const void *start, static inline void kasan_poison_vmalloc(const void *start, unsigned long size) { } +static __always_inline void +kasan_unpoison_vmap_areas(struct vm_struct **vms, int nr_vms, + kasan_vmalloc_flags_t flags) +{ } + #endif /* CONFIG_KASAN_VMALLOC */ #if (defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)) && \ diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 1d27f1bd260b..b2b40c59ce18 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -28,6 +28,7 @@ #include #include #include +#include #include "kasan.h" #include "../slab.h" @@ -575,3 +576,19 @@ bool __kasan_check_byte(const void *address, unsigned long ip) } return true; } + +#ifdef CONFIG_KASAN_VMALLOC +void __kasan_unpoison_vmap_areas(struct vm_struct **vms, int nr_vms, + kasan_vmalloc_flags_t flags) +{ + unsigned long size; + void *addr; + int area; + + for (area = 0 ; area < nr_vms ; area++) { + size = vms[area]->size; + addr = vms[area]->addr; + vms[area]->addr = __kasan_unpoison_vmalloc(addr, size, flags); + } +} +#endif diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 94c0a9262a46..41dd01e8430c 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -5027,9 +5027,7 @@ retry: * With hardware tag-based KASAN, marking is skipped for * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc(). */ - for (area = 0; area < nr_vms; area++) - vms[area]->addr = kasan_unpoison_vmalloc(vms[area]->addr, - vms[area]->size, KASAN_VMALLOC_PROT_NORMAL); + kasan_unpoison_vmap_areas(vms, nr_vms, KASAN_VMALLOC_PROT_NORMAL); kfree(vas); return vms; -- cgit v1.2.3 From 6ba776b533ca902631fa106b8a90811b3f40b08d Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 14 Dec 2025 12:15:17 -0800 Subject: mm: leafops.h: correct kernel-doc function param. 
names Modify the kernel-doc function parameter names to prevent kernel-doc warnings: Warning: include/linux/leafops.h:135 function parameter 'entry' not described in 'leafent_type' Warning: include/linux/leafops.h:540 function parameter 'pte' not described in 'pte_is_uffd_marker' Link: https://lkml.kernel.org/r/20251214201517.2187051-1-rdunlap@infradead.org Signed-off-by: Randy Dunlap Reviewed-by: Lorenzo Stoakes Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/leafops.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/leafops.h b/include/linux/leafops.h index cfafe7a5e7b1..a9ff94b744f2 100644 --- a/include/linux/leafops.h +++ b/include/linux/leafops.h @@ -133,7 +133,7 @@ static inline bool softleaf_is_none(softleaf_t entry) /** * softleaf_type() - Identify the type of leaf entry. - * @enntry: Leaf entry. + * @entry: Leaf entry. * * Returns: the leaf entry type associated with @entry. */ @@ -534,7 +534,7 @@ static inline bool pte_is_uffd_wp_marker(pte_t pte) /** * pte_is_uffd_marker() - Does this PTE entry encode a userfault-specific marker * leaf entry? - * @entry: Leaf entry. + * @pte: PTE entry. * * It's useful to be able to determine which leaf entries encode UFFD-specific * markers so we can handle these correctly. -- cgit v1.2.3 From fe55ea85939efcbf0e6baa234f0d70acb79e7b58 Mon Sep 17 00:00:00 2001 From: Pingfan Liu Date: Tue, 16 Dec 2025 09:48:51 +0800 Subject: kernel/kexec: change the prototype of kimage_map_segment() The kexec segment index will be required to extract the corresponding information for that segment in kimage_map_segment(). Additionally, kexec_segment already holds the kexec relocation destination address and size. Therefore, the prototype of kimage_map_segment() can be changed. 
Link: https://lkml.kernel.org/r/20251216014852.8737-1-piliu@redhat.com Fixes: 07d24902977e ("kexec: enable CMA based contiguous allocation") Signed-off-by: Pingfan Liu Acked-by: Baoquan He Cc: Mimi Zohar Cc: Roberto Sassu Cc: Alexander Graf Cc: Steven Chen Cc: Signed-off-by: Andrew Morton --- include/linux/kexec.h | 4 ++-- kernel/kexec_core.c | 9 ++++++--- security/integrity/ima/ima_kexec.c | 4 +--- 3 files changed, 9 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kexec.h b/include/linux/kexec.h index ff7e231b0485..8a22bc9b8c6c 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -530,7 +530,7 @@ extern bool kexec_file_dbg_print; #define kexec_dprintk(fmt, arg...) \ do { if (kexec_file_dbg_print) pr_info(fmt, ##arg); } while (0) -extern void *kimage_map_segment(struct kimage *image, unsigned long addr, unsigned long size); +extern void *kimage_map_segment(struct kimage *image, int idx); extern void kimage_unmap_segment(void *buffer); #else /* !CONFIG_KEXEC_CORE */ struct pt_regs; @@ -540,7 +540,7 @@ static inline void __crash_kexec(struct pt_regs *regs) { } static inline void crash_kexec(struct pt_regs *regs) { } static inline int kexec_should_crash(struct task_struct *p) { return 0; } static inline int kexec_crash_loaded(void) { return 0; } -static inline void *kimage_map_segment(struct kimage *image, unsigned long addr, unsigned long size) +static inline void *kimage_map_segment(struct kimage *image, int idx) { return NULL; } static inline void kimage_unmap_segment(void *buffer) { } #define kexec_in_progress false diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 0f92acdd354d..1a79c5b18d8f 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -953,17 +953,20 @@ int kimage_load_segment(struct kimage *image, int idx) return result; } -void *kimage_map_segment(struct kimage *image, - unsigned long addr, unsigned long size) +void *kimage_map_segment(struct kimage *image, int idx) { + unsigned long 
addr, size, eaddr; unsigned long src_page_addr, dest_page_addr = 0; - unsigned long eaddr = addr + size; kimage_entry_t *ptr, entry; struct page **src_pages; unsigned int npages; void *vaddr = NULL; int i; + addr = image->segment[idx].mem; + size = image->segment[idx].memsz; + eaddr = addr + size; + /* * Collect the source pages and map them in a contiguous VA range. */ diff --git a/security/integrity/ima/ima_kexec.c b/security/integrity/ima/ima_kexec.c index 7362f68f2d8b..5beb69edd12f 100644 --- a/security/integrity/ima/ima_kexec.c +++ b/security/integrity/ima/ima_kexec.c @@ -250,9 +250,7 @@ void ima_kexec_post_load(struct kimage *image) if (!image->ima_buffer_addr) return; - ima_kexec_buffer = kimage_map_segment(image, - image->ima_buffer_addr, - image->ima_buffer_size); + ima_kexec_buffer = kimage_map_segment(image, image->ima_segment_index); if (!ima_kexec_buffer) { pr_err("Could not map measurements buffer.\n"); return; -- cgit v1.2.3 From e6dbcb7c0e7b508d443a9aa6f77f63a2f83b1ae4 Mon Sep 17 00:00:00 2001 From: Ankit Agrawal Date: Thu, 11 Dec 2025 07:06:01 +0000 Subject: mm: fixup pfnmap memory failure handling to use pgoff The memory failure handling implementation for the PFNMAP memory with no struct pages is faulty. The VA of the mapping is determined based on the PFN. It should instead be based on the file mapping offset. At the occurrence of poison, the memory_failure_pfn is triggered on the poisoned PFN. Introduce a callback function that allows mm to translate the PFN to the corresponding file page offset. The kernel module using the registration API must implement the callback function and provide the translation. The translated value is then used to determine the VA information and to send the SIGBUS to the usermode process mapped to the poisoned PFN. The callback is also useful for the driver to be notified of the poisoned PFN, which may then track it.
Link: https://lkml.kernel.org/r/20251211070603.338701-2-ankita@nvidia.com Fixes: 2ec41967189c ("mm: handle poisoning of pfn without struct pages") Signed-off-by: Ankit Agrawal Suggested-by: Jason Gunthorpe Cc: Kevin Tian Cc: Matthew R. Ochs Cc: Miaohe Lin Cc: Naoya Horiguchi Cc: Neo Jia Cc: Vikram Sethi Cc: Yishai Hadas Cc: Zhi Wang Signed-off-by: Andrew Morton --- include/linux/memory-failure.h | 2 ++ mm/memory-failure.c | 29 ++++++++++++++++++----------- 2 files changed, 20 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memory-failure.h b/include/linux/memory-failure.h index bc326503d2d2..7b5e11cf905f 100644 --- a/include/linux/memory-failure.h +++ b/include/linux/memory-failure.h @@ -9,6 +9,8 @@ struct pfn_address_space; struct pfn_address_space { struct interval_tree_node node; struct address_space *mapping; + int (*pfn_to_vma_pgoff)(struct vm_area_struct *vma, + unsigned long pfn, pgoff_t *pgoff); }; int register_pfn_address_space(struct pfn_address_space *pfn_space); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index fbc5a01260c8..c80c2907da33 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -2161,6 +2161,9 @@ int register_pfn_address_space(struct pfn_address_space *pfn_space) { guard(mutex)(&pfn_space_lock); + if (!pfn_space->pfn_to_vma_pgoff) + return -EINVAL; + if (interval_tree_iter_first(&pfn_space_itree, pfn_space->node.start, pfn_space->node.last)) @@ -2183,10 +2186,10 @@ void unregister_pfn_address_space(struct pfn_address_space *pfn_space) } EXPORT_SYMBOL_GPL(unregister_pfn_address_space); -static void add_to_kill_pfn(struct task_struct *tsk, - struct vm_area_struct *vma, - struct list_head *to_kill, - unsigned long pfn) +static void add_to_kill_pgoff(struct task_struct *tsk, + struct vm_area_struct *vma, + struct list_head *to_kill, + pgoff_t pgoff) { struct to_kill *tk; @@ -2197,12 +2200,12 @@ static void add_to_kill_pfn(struct task_struct *tsk, } /* Check for pgoff not backed by struct 
page */ - tk->addr = vma_address(vma, pfn, 1); + tk->addr = vma_address(vma, pgoff, 1); tk->size_shift = PAGE_SHIFT; if (tk->addr == -EFAULT) pr_info("Unable to find address %lx in %s\n", - pfn, tsk->comm); + pgoff, tsk->comm); get_task_struct(tsk); tk->tsk = tsk; @@ -2212,11 +2215,12 @@ static void add_to_kill_pfn(struct task_struct *tsk, /* * Collect processes when the error hit a PFN not backed by struct page. */ -static void collect_procs_pfn(struct address_space *mapping, +static void collect_procs_pfn(struct pfn_address_space *pfn_space, unsigned long pfn, struct list_head *to_kill) { struct vm_area_struct *vma; struct task_struct *tsk; + struct address_space *mapping = pfn_space->mapping; i_mmap_lock_read(mapping); rcu_read_lock(); @@ -2226,9 +2230,12 @@ static void collect_procs_pfn(struct address_space *mapping, t = task_early_kill(tsk, true); if (!t) continue; - vma_interval_tree_foreach(vma, &mapping->i_mmap, pfn, pfn) { - if (vma->vm_mm == t->mm) - add_to_kill_pfn(t, vma, to_kill, pfn); + vma_interval_tree_foreach(vma, &mapping->i_mmap, 0, ULONG_MAX) { + pgoff_t pgoff; + + if (vma->vm_mm == t->mm && + !pfn_space->pfn_to_vma_pgoff(vma, pfn, &pgoff)) + add_to_kill_pgoff(t, vma, to_kill, pgoff); } } rcu_read_unlock(); @@ -2264,7 +2271,7 @@ static int memory_failure_pfn(unsigned long pfn, int flags) struct pfn_address_space *pfn_space = container_of(node, struct pfn_address_space, node); - collect_procs_pfn(pfn_space->mapping, pfn, &tokill); + collect_procs_pfn(pfn_space, pfn, &tokill); mf_handled = true; } -- cgit v1.2.3 From f183663901f21fe0fba8bd31ae894bc529709ee0 Mon Sep 17 00:00:00 2001 From: Bijan Tabatabai Date: Tue, 16 Dec 2025 14:07:27 -0600 Subject: mm: consider non-anon swap cache folios in folio_expected_ref_count() Currently, folio_expected_ref_count() only adds references for the swap cache if the folio is anonymous. 
However, according to the comment above the definition of PG_swapcache in enum pageflags, shmem folios can also have PG_swapcache set. This patch makes sure references for the swap cache are added if folio_test_swapcache(folio) is true. This issue was found when trying to hot-unplug memory in a QEMU/KVM virtual machine. When initiating hot-unplug when most of the guest memory is allocated, hot-unplug hangs partway through removal due to migration failures. The following message would be printed several times, and would be printed again about every five seconds: [ 49.641309] migrating pfn b12f25 failed ret:7 [ 49.641310] page: refcount:2 mapcount:0 mapping:0000000033bd8fe2 index:0x7f404d925 pfn:0xb12f25 [ 49.641311] aops:swap_aops [ 49.641313] flags: 0x300000000030508(uptodate|active|owner_priv_1|reclaim|swapbacked|node=0|zone=3) [ 49.641314] raw: 0300000000030508 ffffed312c4bc908 ffffed312c4bc9c8 0000000000000000 [ 49.641315] raw: 00000007f404d925 00000000000c823b 00000002ffffffff 0000000000000000 [ 49.641315] page dumped because: migration failure When debugging this, I found that these migration failures were due to __migrate_folio() returning -EAGAIN for a small set of folios because the expected reference count it calculates via folio_expected_ref_count() is one less than the actual reference count of the folios. Furthermore, all of the affected folios were not anonymous, but had the PG_swapcache flag set, inspiring this patch. After applying this patch, the memory hot-unplug behaves as expected. I tested this on a machine running Ubuntu 24.04 with kernel version 6.8.0-90-generic and 64GB of memory. The guest VM is managed by libvirt and runs Ubuntu 24.04 with kernel version 6.18 (though the head of the mm-unstable branch as of Dec 16, 2025 was also tested and behaves the same) and 48GB of memory. The libvirt XML definition for the VM can be found at [1].
CONFIG_MHP_DEFAULT_ONLINE_TYPE_ONLINE_MOVABLE is set in the guest kernel so the hot-pluggable memory is automatically onlined. Below are the steps to reproduce this behavior: 1) Define and start a virtual machine host$ virsh -c qemu:///system define ./test_vm.xml # test_vm.xml from [1] host$ virsh -c qemu:///system start test_vm 2) Set up swap in the guest guest$ sudo fallocate -l 32G /swapfile guest$ sudo chmod 0600 /swapfile guest$ sudo mkswap /swapfile guest$ sudo swapon /swapfile 3) Use alloc_data [2] to allocate most of the remaining guest memory guest$ ./alloc_data 45 4) In a separate guest terminal, monitor the amount of used memory guest$ watch -n1 free -h 5) When alloc_data has finished allocating, initiate the memory hot-unplug using the provided xml file [3] host$ virsh -c qemu:///system detach-device test_vm ./remove.xml --live After initiating the memory hot-unplug, you should see the amount of available memory in the guest decrease, and the amount of used swap data increase. If everything works as expected, when all of the memory is unplugged, there should be around 8.5-9GB of data in swap. If the unplugging is unsuccessful, the amount of used swap data will settle below that. If that happens, you should be able to see log messages in dmesg similar to the one posted above.
Link: https://lkml.kernel.org/r/20251216200727.2360228-1-bijan311@gmail.com Link: https://github.com/BijanT/linux_patch_files/blob/main/test_vm.xml [1] Link: https://github.com/BijanT/linux_patch_files/blob/main/alloc_data.c [2] Link: https://github.com/BijanT/linux_patch_files/blob/main/remove.xml [3] Fixes: 86ebd50224c0 ("mm: add folio_expected_ref_count() for reference count calculation") Signed-off-by: Bijan Tabatabai Acked-by: David Hildenbrand (Red Hat) Acked-by: Zi Yan Reviewed-by: Baolin Wang Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Shivank Garg Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Kairui Song Cc: Signed-off-by: Andrew Morton --- include/linux/mm.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 15076261d0c2..6f959d8ca4b4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2459,10 +2459,10 @@ static inline int folio_expected_ref_count(const struct folio *folio) if (WARN_ON_ONCE(page_has_type(&folio->page) && !folio_test_hugetlb(folio))) return 0; - if (folio_test_anon(folio)) { - /* One reference per page from the swapcache. */ - ref_count += folio_test_swapcache(folio) << order; - } else { + /* One reference per page from the swapcache. */ + ref_count += folio_test_swapcache(folio) << order; + + if (!folio_test_anon(folio)) { /* One reference per page from the pagecache. */ ref_count += !!folio->mapping << order; /* One reference from PG_private. */ -- cgit v1.2.3 From dc85a46928c41423ad89869baf05a589e2975575 Mon Sep 17 00:00:00 2001 From: Kevin Tian Date: Thu, 18 Dec 2025 08:16:49 +0000 Subject: vfio/pci: Disable qword access to the PCI ROM bar Commit 2b938e3db335 ("vfio/pci: Enable iowrite64 and ioread64 for vfio pci") enables qword access to the PCI bar resources. However certain devices (e.g. Intel X710) are observed with problem upon qword accesses to the rom bar, e.g. triggering PCI aer errors. 
This is triggered by Qemu which caches the rom content by simply doing a pread() of the remaining size until it gets the full contents. The other bars would only perform operations at the same access width as their guest drivers. Instead of trying to identify all broken devices, universally disable qword access to the rom bar, i.e. going back to the old way which worked reliably for years. Reported-by: Farrah Chen Closes: https://bugzilla.kernel.org/show_bug.cgi?id=220740 Fixes: 2b938e3db335 ("vfio/pci: Enable iowrite64 and ioread64 for vfio pci") Cc: stable@vger.kernel.org Signed-off-by: Kevin Tian Tested-by: Farrah Chen Link: https://lore.kernel.org/r/20251218081650.555015-2-kevin.tian@intel.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/nvgrace-gpu/main.c | 4 ++-- drivers/vfio/pci/vfio_pci_rdwr.c | 25 ++++++++++++++++++------- include/linux/vfio_pci_core.h | 10 +++++++++- 3 files changed, 29 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c index 84d142a47ec6..b45a24d00387 100644 --- a/drivers/vfio/pci/nvgrace-gpu/main.c +++ b/drivers/vfio/pci/nvgrace-gpu/main.c @@ -561,7 +561,7 @@ nvgrace_gpu_map_and_read(struct nvgrace_gpu_pci_core_device *nvdev, ret = vfio_pci_core_do_io_rw(&nvdev->core_device, false, nvdev->resmem.ioaddr, buf, offset, mem_count, - 0, 0, false); + 0, 0, false, VFIO_PCI_IO_WIDTH_8); } return ret; @@ -693,7 +693,7 @@ nvgrace_gpu_map_and_write(struct nvgrace_gpu_pci_core_device *nvdev, ret = vfio_pci_core_do_io_rw(&nvdev->core_device, false, nvdev->resmem.ioaddr, (char __user *)buf, pos, mem_count, - 0, 0, true); + 0, 0, true, VFIO_PCI_IO_WIDTH_8); } return ret; diff --git a/drivers/vfio/pci/vfio_pci_rdwr.c b/drivers/vfio/pci/vfio_pci_rdwr.c index 6192788c8ba3..25380b7dfe18 100644 --- a/drivers/vfio/pci/vfio_pci_rdwr.c +++ b/drivers/vfio/pci/vfio_pci_rdwr.c @@ -135,7 +135,8 @@ VFIO_IORDWR(64) ssize_t vfio_pci_core_do_io_rw(struct
vfio_pci_core_device *vdev, bool test_mem, void __iomem *io, char __user *buf, loff_t off, size_t count, size_t x_start, - size_t x_end, bool iswrite) + size_t x_end, bool iswrite, + enum vfio_pci_io_width max_width) { ssize_t done = 0; int ret; @@ -150,20 +151,19 @@ ssize_t vfio_pci_core_do_io_rw(struct vfio_pci_core_device *vdev, bool test_mem, else fillable = 0; - if (fillable >= 8 && !(off % 8)) { + if (fillable >= 8 && !(off % 8) && max_width >= 8) { ret = vfio_pci_iordwr64(vdev, iswrite, test_mem, io, buf, off, &filled); if (ret) return ret; - } else - if (fillable >= 4 && !(off % 4)) { + } else if (fillable >= 4 && !(off % 4) && max_width >= 4) { ret = vfio_pci_iordwr32(vdev, iswrite, test_mem, io, buf, off, &filled); if (ret) return ret; - } else if (fillable >= 2 && !(off % 2)) { + } else if (fillable >= 2 && !(off % 2) && max_width >= 2) { ret = vfio_pci_iordwr16(vdev, iswrite, test_mem, io, buf, off, &filled); if (ret) @@ -234,6 +234,7 @@ ssize_t vfio_pci_bar_rw(struct vfio_pci_core_device *vdev, char __user *buf, void __iomem *io; struct resource *res = &vdev->pdev->resource[bar]; ssize_t done; + enum vfio_pci_io_width max_width = VFIO_PCI_IO_WIDTH_8; if (pci_resource_start(pdev, bar)) end = pci_resource_len(pdev, bar); @@ -262,6 +263,16 @@ ssize_t vfio_pci_bar_rw(struct vfio_pci_core_device *vdev, char __user *buf, if (!io) return -ENOMEM; x_end = end; + + /* + * Certain devices (e.g. Intel X710) don't support qword + * access to the ROM bar. Otherwise PCI AER errors might be + * triggered. + * + * Disable qword access to the ROM bar universally, which + * worked reliably for years before qword access is enabled. 
+ */ + max_width = VFIO_PCI_IO_WIDTH_4; } else { int ret = vfio_pci_core_setup_barmap(vdev, bar); if (ret) { @@ -278,7 +289,7 @@ ssize_t vfio_pci_bar_rw(struct vfio_pci_core_device *vdev, char __user *buf, } done = vfio_pci_core_do_io_rw(vdev, res->flags & IORESOURCE_MEM, io, buf, pos, - count, x_start, x_end, iswrite); + count, x_start, x_end, iswrite, max_width); if (done >= 0) *ppos += done; @@ -352,7 +363,7 @@ ssize_t vfio_pci_vga_rw(struct vfio_pci_core_device *vdev, char __user *buf, * to the memory enable bit in the command register. */ done = vfio_pci_core_do_io_rw(vdev, false, iomem, buf, off, count, - 0, 0, iswrite); + 0, 0, iswrite, VFIO_PCI_IO_WIDTH_8); vga_put(vdev->pdev, rsrc); diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index 706877f998ff..1ac86896875c 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -145,6 +145,13 @@ struct vfio_pci_core_device { struct list_head dmabufs; }; +enum vfio_pci_io_width { + VFIO_PCI_IO_WIDTH_1 = 1, + VFIO_PCI_IO_WIDTH_2 = 2, + VFIO_PCI_IO_WIDTH_4 = 4, + VFIO_PCI_IO_WIDTH_8 = 8, +}; + /* Will be exported for vfio pci drivers usage */ int vfio_pci_core_register_dev_region(struct vfio_pci_core_device *vdev, unsigned int type, unsigned int subtype, @@ -188,7 +195,8 @@ pci_ers_result_t vfio_pci_core_aer_err_detected(struct pci_dev *pdev, ssize_t vfio_pci_core_do_io_rw(struct vfio_pci_core_device *vdev, bool test_mem, void __iomem *io, char __user *buf, loff_t off, size_t count, size_t x_start, - size_t x_end, bool iswrite); + size_t x_end, bool iswrite, + enum vfio_pci_io_width max_width); bool __vfio_pci_memory_enabled(struct vfio_pci_core_device *vdev); bool vfio_pci_core_range_intersect_range(loff_t buf_start, size_t buf_cnt, loff_t reg_start, size_t reg_cnt, -- cgit v1.2.3 From f059588c552746e0fe299214f35c58effa715b74 Mon Sep 17 00:00:00 2001 From: "Michael S. 
Tsirkin" Date: Thu, 4 Dec 2025 13:31:52 -0500 Subject: virtio: make it self-contained virtio.h uses struct module, add a forward declaration to make the header self-contained. Message-ID: <9171b5cac60793eb59ab044c96ee038bf1363bee.1764873799.git.mst@redhat.com> Acked-by: Jason Wang Signed-off-by: Michael S. Tsirkin --- include/linux/virtio.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/virtio.h b/include/linux/virtio.h index 132a474e5914..3626eb694728 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -13,6 +13,8 @@ #include #include +struct module; + /** * struct virtqueue - a queue to register buffers for sending or receiving. * @list: the chain of virtqueues for this device -- cgit v1.2.3 From e88dfb93311c81359b00c12e0b396bd0ea13ad6c Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Thu, 4 Dec 2025 12:49:34 -0500 Subject: virtio_features: make it self-contained virtio_features.h uses WARN_ON_ONCE and memset so it must include linux/bug.h and linux/string.h Message-ID: <579986aa9b8d023844990d2a0e267382f8ad85d5.1764873799.git.mst@redhat.com> Acked-by: Jason Wang Signed-off-by: Michael S. 
Tsirkin --- include/linux/virtio_features.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/virtio_features.h b/include/linux/virtio_features.h index ea2ad8717882..ce59ea91f474 100644 --- a/include/linux/virtio_features.h +++ b/include/linux/virtio_features.h @@ -3,6 +3,8 @@ #define _LINUX_VIRTIO_FEATURES_H #include +#include +#include #define VIRTIO_FEATURES_U64S 2 #define VIRTIO_FEATURES_BITS (VIRTIO_FEATURES_U64S * 64) -- cgit v1.2.3 From 9910159f06590c17df4fbddedaabb4c0201cc4cb Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Mon, 15 Dec 2025 14:17:23 +0100 Subject: iio: core: add separate lockdep class for info_exist_lock When one iio device is a consumer of another, it is possible that the ->info_exist_lock of both ends up being taken when reading the value of the consumer device. Since they currently belong to the same lockdep class (being initialized in a single location with mutex_init()), that results in a lockdep warning CPU0 ---- lock(&iio_dev_opaque->info_exist_lock); lock(&iio_dev_opaque->info_exist_lock); *** DEADLOCK *** May be due to missing lock nesting notation 4 locks held by sensors/414: #0: c31fd6dc (&p->lock){+.+.}-{3:3}, at: seq_read_iter+0x44/0x4e4 #1: c4f5a1c4 (&of->mutex){+.+.}-{3:3}, at: kernfs_seq_start+0x1c/0xac #2: c2827548 (kn->active#34){.+.+}-{0:0}, at: kernfs_seq_start+0x30/0xac #3: c1dd2b68 (&iio_dev_opaque->info_exist_lock){+.+.}-{3:3}, at: iio_read_channel_processed_scale+0x24/0xd8 stack backtrace: CPU: 0 UID: 0 PID: 414 Comm: sensors Not tainted 6.17.11 #5 NONE Hardware name: Generic AM33XX (Flattened Device Tree) Call trace: unwind_backtrace from show_stack+0x10/0x14 show_stack from dump_stack_lvl+0x44/0x60 dump_stack_lvl from print_deadlock_bug+0x2b8/0x334 print_deadlock_bug from __lock_acquire+0x13a4/0x2ab0 __lock_acquire from lock_acquire+0xd0/0x2c0 lock_acquire from __mutex_lock+0xa0/0xe8c __mutex_lock from mutex_lock_nested+0x1c/0x24 mutex_lock_nested from 
iio_read_channel_raw+0x20/0x6c iio_read_channel_raw from rescale_read_raw+0x128/0x1c4 rescale_read_raw from iio_channel_read+0xe4/0xf4 iio_channel_read from iio_read_channel_processed_scale+0x6c/0xd8 iio_read_channel_processed_scale from iio_hwmon_read_val+0x68/0xbc iio_hwmon_read_val from dev_attr_show+0x18/0x48 dev_attr_show from sysfs_kf_seq_show+0x80/0x110 sysfs_kf_seq_show from seq_read_iter+0xdc/0x4e4 seq_read_iter from vfs_read+0x238/0x2e4 vfs_read from ksys_read+0x6c/0xec ksys_read from ret_fast_syscall+0x0/0x1c Just as the mlock_key already has its own lockdep class, add a lock_class_key for the info_exist mutex. Note that this has in theory been a problem since before IIO first left staging, but it only occurs when a chain of consumers is in use and that is not often done. Fixes: ac917a81117c ("staging:iio:core set the iio_dev.info pointer to null on unregister under lock.") Signed-off-by: Rasmus Villemoes Reviewed-by: Peter Rosin Cc: Signed-off-by: Jonathan Cameron --- drivers/iio/industrialio-core.c | 4 +++- include/linux/iio/iio-opaque.h | 2 ++ 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c index f69deefcfb6f..117ffad4f376 100644 --- a/drivers/iio/industrialio-core.c +++ b/drivers/iio/industrialio-core.c @@ -1657,6 +1657,7 @@ static void iio_dev_release(struct device *device) mutex_destroy(&iio_dev_opaque->info_exist_lock); mutex_destroy(&iio_dev_opaque->mlock); + lockdep_unregister_key(&iio_dev_opaque->info_exist_key); lockdep_unregister_key(&iio_dev_opaque->mlock_key); ida_free(&iio_ida, iio_dev_opaque->id); @@ -1717,9 +1718,10 @@ struct iio_dev *iio_device_alloc(struct device *parent, int sizeof_priv) INIT_LIST_HEAD(&iio_dev_opaque->ioctl_handlers); lockdep_register_key(&iio_dev_opaque->mlock_key); + lockdep_register_key(&iio_dev_opaque->info_exist_key); mutex_init_with_key(&iio_dev_opaque->mlock, &iio_dev_opaque->mlock_key); - 
mutex_init(&iio_dev_opaque->info_exist_lock); + mutex_init_with_key(&iio_dev_opaque->info_exist_lock, &iio_dev_opaque->info_exist_key); indio_dev->dev.parent = parent; indio_dev->dev.type = &iio_device_type; diff --git a/include/linux/iio/iio-opaque.h b/include/linux/iio/iio-opaque.h index 4247497f3f8b..b87841a355f8 100644 --- a/include/linux/iio/iio-opaque.h +++ b/include/linux/iio/iio-opaque.h @@ -14,6 +14,7 @@ * @mlock: lock used to prevent simultaneous device state changes * @mlock_key: lockdep class for iio_dev lock * @info_exist_lock: lock to prevent use during removal + * @info_exist_key: lockdep class for info_exist lock * @trig_readonly: mark the current trigger immutable * @event_interface: event chrdevs associated with interrupt lines * @attached_buffers: array of buffers statically attached by the driver @@ -47,6 +48,7 @@ struct iio_dev_opaque { struct mutex mlock; struct lock_class_key mlock_key; struct mutex info_exist_lock; + struct lock_class_key info_exist_key; bool trig_readonly; struct iio_event_interface *event_interface; struct iio_buffer **attached_buffers; -- cgit v1.2.3 From 5623eb1ed035f01dfa620366a82b667545b10c82 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 31 Dec 2025 08:12:46 -0700 Subject: io_uring/tctx: add separate lock for list of tctx's in ctx ctx->tctx_list holds the tasks using this ring, and it's currently protected by the normal ctx->uring_lock.
However, this can cause a circular locking issue, as reported by syzbot, where cancelations off exec end up needing to remove an entry from this list: ====================================================== WARNING: possible circular locking dependency detected syzkaller #0 Tainted: G L ------------------------------------------------------ syz.0.9999/12287 is trying to acquire lock: ffff88805851c0a8 (&ctx->uring_lock){+.+.}-{4:4}, at: io_uring_del_tctx_node+0xf0/0x2c0 io_uring/tctx.c:179 but task is already holding lock: ffff88802db5a2e0 (&sig->cred_guard_mutex){+.+.}-{4:4}, at: prepare_bprm_creds fs/exec.c:1360 [inline] ffff88802db5a2e0 (&sig->cred_guard_mutex){+.+.}-{4:4}, at: bprm_execve+0xb9/0x1400 fs/exec.c:1733 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #2 (&sig->cred_guard_mutex){+.+.}-{4:4}: __mutex_lock_common kernel/locking/mutex.c:614 [inline] __mutex_lock+0x187/0x1350 kernel/locking/mutex.c:776 proc_pid_attr_write+0x547/0x630 fs/proc/base.c:2837 vfs_write+0x27e/0xb30 fs/read_write.c:684 ksys_write+0x145/0x250 fs/read_write.c:738 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xec/0xf80 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f -> #1 (sb_writers#3){.+.+}-{0:0}: percpu_down_read_internal include/linux/percpu-rwsem.h:53 [inline] percpu_down_read_freezable include/linux/percpu-rwsem.h:83 [inline] __sb_start_write include/linux/fs/super.h:19 [inline] sb_start_write+0x4d/0x1c0 include/linux/fs/super.h:125 mnt_want_write+0x41/0x90 fs/namespace.c:499 open_last_lookups fs/namei.c:4529 [inline] path_openat+0xadd/0x3dd0 fs/namei.c:4784 do_filp_open+0x1fa/0x410 fs/namei.c:4814 io_openat2+0x3e0/0x5c0 io_uring/openclose.c:143 __io_issue_sqe+0x181/0x4b0 io_uring/io_uring.c:1792 io_issue_sqe+0x165/0x1060 io_uring/io_uring.c:1815 io_queue_sqe io_uring/io_uring.c:2042 [inline] io_submit_sqe io_uring/io_uring.c:2320 [inline] io_submit_sqes+0xbf4/0x2140 
io_uring/io_uring.c:2434 __do_sys_io_uring_enter io_uring/io_uring.c:3280 [inline] __se_sys_io_uring_enter+0x2e0/0x2b60 io_uring/io_uring.c:3219 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xec/0xf80 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f -> #0 (&ctx->uring_lock){+.+.}-{4:4}: check_prev_add kernel/locking/lockdep.c:3165 [inline] check_prevs_add kernel/locking/lockdep.c:3284 [inline] validate_chain kernel/locking/lockdep.c:3908 [inline] __lock_acquire+0x15a6/0x2cf0 kernel/locking/lockdep.c:5237 lock_acquire+0x107/0x340 kernel/locking/lockdep.c:5868 __mutex_lock_common kernel/locking/mutex.c:614 [inline] __mutex_lock+0x187/0x1350 kernel/locking/mutex.c:776 io_uring_del_tctx_node+0xf0/0x2c0 io_uring/tctx.c:179 io_uring_clean_tctx+0xd4/0x1a0 io_uring/tctx.c:195 io_uring_cancel_generic+0x6ca/0x7d0 io_uring/cancel.c:646 io_uring_task_cancel include/linux/io_uring.h:24 [inline] begin_new_exec+0x10ed/0x2440 fs/exec.c:1131 load_elf_binary+0x9f8/0x2d70 fs/binfmt_elf.c:1010 search_binary_handler fs/exec.c:1669 [inline] exec_binprm fs/exec.c:1701 [inline] bprm_execve+0x92e/0x1400 fs/exec.c:1753 do_execveat_common+0x510/0x6a0 fs/exec.c:1859 do_execve fs/exec.c:1933 [inline] __do_sys_execve fs/exec.c:2009 [inline] __se_sys_execve fs/exec.c:2004 [inline] __x64_sys_execve+0x94/0xb0 fs/exec.c:2004 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xec/0xf80 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f other info that might help us debug this: Chain exists of: &ctx->uring_lock --> sb_writers#3 --> &sig->cred_guard_mutex Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&sig->cred_guard_mutex); lock(sb_writers#3); lock(&sig->cred_guard_mutex); lock(&ctx->uring_lock); *** DEADLOCK *** 1 lock held by syz.0.9999/12287: #0: ffff88802db5a2e0 (&sig->cred_guard_mutex){+.+.}-{4:4}, at: prepare_bprm_creds fs/exec.c:1360 [inline] #0: ffff88802db5a2e0 
(&sig->cred_guard_mutex){+.+.}-{4:4}, at: bprm_execve+0xb9/0x1400 fs/exec.c:1733 stack backtrace: CPU: 0 UID: 0 PID: 12287 Comm: syz.0.9999 Tainted: G L syzkaller #0 PREEMPT(full) Tainted: [L]=SOFTLOCKUP Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 10/25/2025 Call Trace: dump_stack_lvl+0xe8/0x150 lib/dump_stack.c:120 print_circular_bug+0x2e2/0x300 kernel/locking/lockdep.c:2043 check_noncircular+0x12e/0x150 kernel/locking/lockdep.c:2175 check_prev_add kernel/locking/lockdep.c:3165 [inline] check_prevs_add kernel/locking/lockdep.c:3284 [inline] validate_chain kernel/locking/lockdep.c:3908 [inline] __lock_acquire+0x15a6/0x2cf0 kernel/locking/lockdep.c:5237 lock_acquire+0x107/0x340 kernel/locking/lockdep.c:5868 __mutex_lock_common kernel/locking/mutex.c:614 [inline] __mutex_lock+0x187/0x1350 kernel/locking/mutex.c:776 io_uring_del_tctx_node+0xf0/0x2c0 io_uring/tctx.c:179 io_uring_clean_tctx+0xd4/0x1a0 io_uring/tctx.c:195 io_uring_cancel_generic+0x6ca/0x7d0 io_uring/cancel.c:646 io_uring_task_cancel include/linux/io_uring.h:24 [inline] begin_new_exec+0x10ed/0x2440 fs/exec.c:1131 load_elf_binary+0x9f8/0x2d70 fs/binfmt_elf.c:1010 search_binary_handler fs/exec.c:1669 [inline] exec_binprm fs/exec.c:1701 [inline] bprm_execve+0x92e/0x1400 fs/exec.c:1753 do_execveat_common+0x510/0x6a0 fs/exec.c:1859 do_execve fs/exec.c:1933 [inline] __do_sys_execve fs/exec.c:2009 [inline] __se_sys_execve fs/exec.c:2004 [inline] __x64_sys_execve+0x94/0xb0 fs/exec.c:2004 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xec/0xf80 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7ff3a8b8f749 Code: ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 a8 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007ff3a9a97038 EFLAGS: 00000246 ORIG_RAX: 000000000000003b RAX: ffffffffffffffda RBX: 00007ff3a8de5fa0 RCX: 
00007ff3a8b8f749 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000200000000400 RBP: 00007ff3a8c13f91 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 R13: 00007ff3a8de6038 R14: 00007ff3a8de5fa0 R15: 00007ff3a8f0fa28 Add a separate lock just for the tctx_list, tctx_lock. This can nest under ->uring_lock, where necessary, and be used separately for list manipulation. For the cancelation off exec side, this removes the need to grab ->uring_lock, hence fixing the circular locking dependency. Reported-by: syzbot+b0e3b77ffaa8a4067ce5@syzkaller.appspotmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 8 +++++++- io_uring/cancel.c | 5 +++++ io_uring/io_uring.c | 5 +++++ io_uring/register.c | 2 ++ io_uring/tctx.c | 8 ++++---- 5 files changed, 23 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index e1adb0d20a0a..a3e8ddc9b380 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -424,11 +424,17 @@ struct io_ring_ctx { struct user_struct *user; struct mm_struct *mm_account; + /* + * List of tctx nodes for this ctx, protected by tctx_lock. For + * cancelation purposes, nests under uring_lock. + */ + struct list_head tctx_list; + struct mutex tctx_lock; + /* ctx exit and cancelation */ struct llist_head fallback_llist; struct delayed_work fallback_work; struct work_struct exit_work; - struct list_head tctx_list; struct completion ref_comp; /* io-wq management, e.g. 
thread count */ diff --git a/io_uring/cancel.c b/io_uring/cancel.c index ca12ac10c0ae..07b8d852218b 100644 --- a/io_uring/cancel.c +++ b/io_uring/cancel.c @@ -184,7 +184,9 @@ static int __io_async_cancel(struct io_cancel_data *cd, } while (1); /* slow path, try all io-wq's */ + __set_current_state(TASK_RUNNING); io_ring_submit_lock(ctx, issue_flags); + mutex_lock(&ctx->tctx_lock); ret = -ENOENT; list_for_each_entry(node, &ctx->tctx_list, ctx_node) { ret = io_async_cancel_one(node->task->io_uring, cd); @@ -194,6 +196,7 @@ static int __io_async_cancel(struct io_cancel_data *cd, nr++; } } + mutex_unlock(&ctx->tctx_lock); io_ring_submit_unlock(ctx, issue_flags); return all ? nr : ret; } @@ -484,6 +487,7 @@ static __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx) bool ret = false; mutex_lock(&ctx->uring_lock); + mutex_lock(&ctx->tctx_lock); list_for_each_entry(node, &ctx->tctx_list, ctx_node) { struct io_uring_task *tctx = node->task->io_uring; @@ -496,6 +500,7 @@ static __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx) cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_ctx_cb, ctx, true); ret |= (cret != IO_WQ_CANCEL_NOTFOUND); } + mutex_unlock(&ctx->tctx_lock); mutex_unlock(&ctx->uring_lock); return ret; diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 709943fedaf4..87a87396e940 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -340,6 +340,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_LIST_HEAD(&ctx->ltimeout_list); init_llist_head(&ctx->work_llist); INIT_LIST_HEAD(&ctx->tctx_list); + mutex_init(&ctx->tctx_lock); ctx->submit_state.free_list.next = NULL; INIT_HLIST_HEAD(&ctx->waitid_list); xa_init_flags(&ctx->zcrx_ctxs, XA_FLAGS_ALLOC); @@ -3045,6 +3046,7 @@ static __cold void io_ring_exit_work(struct work_struct *work) exit.ctx = ctx; mutex_lock(&ctx->uring_lock); + mutex_lock(&ctx->tctx_lock); while (!list_empty(&ctx->tctx_list)) { WARN_ON_ONCE(time_after(jiffies, timeout)); @@ 
-3056,6 +3058,7 @@ static __cold void io_ring_exit_work(struct work_struct *work) if (WARN_ON_ONCE(ret)) continue; + mutex_unlock(&ctx->tctx_lock); mutex_unlock(&ctx->uring_lock); /* * See comment above for @@ -3064,7 +3067,9 @@ static __cold void io_ring_exit_work(struct work_struct *work) */ wait_for_completion_interruptible(&exit.completion); mutex_lock(&ctx->uring_lock); + mutex_lock(&ctx->tctx_lock); } + mutex_unlock(&ctx->tctx_lock); mutex_unlock(&ctx->uring_lock); spin_lock(&ctx->completion_lock); spin_unlock(&ctx->completion_lock); diff --git a/io_uring/register.c b/io_uring/register.c index 62d39b3ff317..3d3822ff3fd9 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -320,6 +320,7 @@ static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx, return 0; /* now propagate the restriction to all registered users */ + mutex_lock(&ctx->tctx_lock); list_for_each_entry(node, &ctx->tctx_list, ctx_node) { tctx = node->task->io_uring; if (WARN_ON_ONCE(!tctx->io_wq)) @@ -330,6 +331,7 @@ static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx, /* ignore errors, it always returns zero anyway */ (void)io_wq_max_workers(tctx->io_wq, new_count); } + mutex_unlock(&ctx->tctx_lock); return 0; err: if (sqd) { diff --git a/io_uring/tctx.c b/io_uring/tctx.c index 5b66755579c0..6d6f44215ec8 100644 --- a/io_uring/tctx.c +++ b/io_uring/tctx.c @@ -136,9 +136,9 @@ int __io_uring_add_tctx_node(struct io_ring_ctx *ctx) return ret; } - mutex_lock(&ctx->uring_lock); + mutex_lock(&ctx->tctx_lock); list_add(&node->ctx_node, &ctx->tctx_list); - mutex_unlock(&ctx->uring_lock); + mutex_unlock(&ctx->tctx_lock); } return 0; } @@ -176,9 +176,9 @@ __cold void io_uring_del_tctx_node(unsigned long index) WARN_ON_ONCE(current != node->task); WARN_ON_ONCE(list_empty(&node->ctx_node)); - mutex_lock(&node->ctx->uring_lock); + mutex_lock(&node->ctx->tctx_lock); list_del(&node->ctx_node); - mutex_unlock(&node->ctx->uring_lock); + mutex_unlock(&node->ctx->tctx_lock); 
if (tctx->last == node->ctx) tctx->last = NULL; -- cgit v1.2.3 From 02d1e1a3f9239cdb3ecf2c6d365fb959d1bf39df Mon Sep 17 00:00:00 2001 From: Di Zhu Date: Wed, 24 Dec 2025 09:22:24 +0800 Subject: netdev: preserve NETIF_F_ALL_FOR_ALL across TSO updates Directly incrementing the TSO features incurs a side effect: it will also directly clear the flags in NETIF_F_ALL_FOR_ALL on the master device, which can cause issues such as the inability to enable the nocache copy feature on the bonding driver. The fix is to include NETIF_F_ALL_FOR_ALL in the update mask, thereby preventing it from being cleared. Fixes: b0ce3508b25e ("bonding: allow TSO being set on bonding master") Signed-off-by: Di Zhu Link: https://patch.msgid.link/20251224012224.56185-1-zhud@hygon.cn Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 5870a9e514a5..d99b0fbc1942 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -5323,7 +5323,8 @@ netdev_features_t netdev_increment_features(netdev_features_t all, static inline netdev_features_t netdev_add_tso_features(netdev_features_t features, netdev_features_t mask) { - return netdev_increment_features(features, NETIF_F_ALL_TSO, mask); + return netdev_increment_features(features, NETIF_F_ALL_TSO | + NETIF_F_ALL_FOR_ALL, mask); } int __netdev_update_features(struct net_device *dev); -- cgit v1.2.3 From cce0be6eb4971456b703aaeafd571650d314bcca Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 31 Dec 2025 11:42:31 -0500 Subject: NFS: Fix a deadlock involving nfs_release_folio() Wang Zhaolong reports a deadlock involving NFSv4.1 state recovery waiting on kthreadd, which is attempting to reclaim memory by calling nfs_release_folio(). The latter cannot make progress due to state recovery being needed.
It seems that the only safe thing to do here is to kick off a writeback of the folio, without waiting for completion, or else kicking off an asynchronous commit. Reported-by: Wang Zhaolong Fixes: 96780ca55e3c ("NFS: fix up nfs_release_folio() to try to release the page") Signed-off-by: Trond Myklebust --- fs/nfs/file.c | 3 ++- fs/nfs/nfstrace.h | 3 +++ fs/nfs/write.c | 33 +++++++++++++++++++++++++++++++++ include/linux/nfs_fs.h | 1 + 4 files changed, 39 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/fs/nfs/file.c b/fs/nfs/file.c index d020aab40c64..d1c138a416cf 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -511,7 +511,8 @@ static bool nfs_release_folio(struct folio *folio, gfp_t gfp) if ((current_gfp_context(gfp) & GFP_KERNEL) != GFP_KERNEL || current_is_kswapd() || current_is_kcompactd()) return false; - if (nfs_wb_folio(folio->mapping->host, folio) < 0) + if (nfs_wb_folio_reclaim(folio->mapping->host, folio) < 0 || + folio_test_private(folio)) return false; } return nfs_fscache_release_folio(folio, gfp); diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index 6ce55e8e6b67..9f9ce4a565ea 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -1062,6 +1062,9 @@ DECLARE_EVENT_CLASS(nfs_folio_event_done, DEFINE_NFS_FOLIO_EVENT(nfs_aop_readpage); DEFINE_NFS_FOLIO_EVENT_DONE(nfs_aop_readpage_done); +DEFINE_NFS_FOLIO_EVENT(nfs_writeback_folio_reclaim); +DEFINE_NFS_FOLIO_EVENT_DONE(nfs_writeback_folio_reclaim_done); + DEFINE_NFS_FOLIO_EVENT(nfs_writeback_folio); DEFINE_NFS_FOLIO_EVENT_DONE(nfs_writeback_folio_done); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 336c510f3750..bf412455e8ed 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -2024,6 +2024,39 @@ int nfs_wb_folio_cancel(struct inode *inode, struct folio *folio) return ret; } +/** + * nfs_wb_folio_reclaim - Write back all requests on one page + * @inode: pointer to page + * @folio: pointer to folio + * + * Assumes that the folio has been locked by the caller + */ +int 
nfs_wb_folio_reclaim(struct inode *inode, struct folio *folio) +{ + loff_t range_start = folio_pos(folio); + size_t len = folio_size(folio); + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = 0, + .range_start = range_start, + .range_end = range_start + len - 1, + .for_sync = 1, + }; + int ret; + + if (folio_test_writeback(folio)) + return -EBUSY; + if (folio_clear_dirty_for_io(folio)) { + trace_nfs_writeback_folio_reclaim(inode, range_start, len); + ret = nfs_writepage_locked(folio, &wbc); + trace_nfs_writeback_folio_reclaim_done(inode, range_start, len, + ret); + return ret; + } + nfs_commit_inode(inode, 0); + return 0; +} + /** * nfs_wb_folio - Write back all requests on one page * @inode: pointer to page diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index a6624edb7226..8dd79a3f3d66 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -637,6 +637,7 @@ extern int nfs_update_folio(struct file *file, struct folio *folio, extern int nfs_sync_inode(struct inode *inode); extern int nfs_wb_all(struct inode *inode); extern int nfs_wb_folio(struct inode *inode, struct folio *folio); +extern int nfs_wb_folio_reclaim(struct inode *inode, struct folio *folio); int nfs_wb_folio_cancel(struct inode *inode, struct folio *folio); extern int nfs_commit_inode(struct inode *, int); extern struct nfs_commit_data *nfs_commitdata_alloc(void); -- cgit v1.2.3 From a7fc8c641cab855824c45e5e8877e40fd528b5df Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Fri, 2 Jan 2026 12:29:38 +0100 Subject: net: airoha: Fix npu rx DMA definitions Fix typos in npu rx DMA descriptor definitions. 
Fixes: b3ef7bdec66fb ("net: airoha: Add airoha_offload.h header") Signed-off-by: Lorenzo Bianconi Link: https://patch.msgid.link/20260102-airoha-npu-dma-rx-def-fixes-v1-1-205fc6bf7d94@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/soc/airoha/airoha_offload.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/soc/airoha/airoha_offload.h b/include/linux/soc/airoha/airoha_offload.h index 4d23cbb7d407..ab64ecdf39a0 100644 --- a/include/linux/soc/airoha/airoha_offload.h +++ b/include/linux/soc/airoha/airoha_offload.h @@ -71,12 +71,12 @@ static inline void airoha_ppe_dev_check_skb(struct airoha_ppe_dev *dev, #define NPU_RX1_DESC_NUM 512 /* CTRL */ -#define NPU_RX_DMA_DESC_LAST_MASK BIT(29) -#define NPU_RX_DMA_DESC_LEN_MASK GENMASK(28, 15) -#define NPU_RX_DMA_DESC_CUR_LEN_MASK GENMASK(14, 1) +#define NPU_RX_DMA_DESC_LAST_MASK BIT(27) +#define NPU_RX_DMA_DESC_LEN_MASK GENMASK(26, 14) +#define NPU_RX_DMA_DESC_CUR_LEN_MASK GENMASK(13, 1) #define NPU_RX_DMA_DESC_DONE_MASK BIT(0) /* INFO */ -#define NPU_RX_DMA_PKT_COUNT_MASK GENMASK(31, 28) +#define NPU_RX_DMA_PKT_COUNT_MASK GENMASK(31, 29) #define NPU_RX_DMA_PKT_ID_MASK GENMASK(28, 26) #define NPU_RX_DMA_SRC_PORT_MASK GENMASK(25, 21) #define NPU_RX_DMA_CRSN_MASK GENMASK(20, 16) -- cgit v1.2.3 From 2740ac33c87b3d0dfa022efd6ba04c6261b1abbd Mon Sep 17 00:00:00 2001 From: Johannes Brüderl Date: Sun, 7 Dec 2025 10:02:20 +0100 Subject: usb: core: add USB_QUIRK_NO_BOS for devices that hang on BOS descriptor MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add USB_QUIRK_NO_BOS quirk flag to skip requesting the BOS descriptor for devices that cannot handle it. Add Elgato 4K X (0fd9:009b) to the quirk table. This device hangs when the BOS descriptor is requested at SuperSpeed Plus (10Gbps). 
Link: https://bugzilla.kernel.org/show_bug.cgi?id=220027 Cc: stable Signed-off-by: Johannes Brüderl Link: https://patch.msgid.link/20251207090220.14807-1-johannes.bruederl@gmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/config.c | 5 +++++ drivers/usb/core/quirks.c | 3 +++ include/linux/usb/quirks.h | 3 +++ 3 files changed, 11 insertions(+) (limited to 'include/linux') diff --git a/drivers/usb/core/config.c b/drivers/usb/core/config.c index baf5bc844b6f..2bb1ceb9d621 100644 --- a/drivers/usb/core/config.c +++ b/drivers/usb/core/config.c @@ -1040,6 +1040,11 @@ int usb_get_bos_descriptor(struct usb_device *dev) __u8 cap_type; int ret; + if (dev->quirks & USB_QUIRK_NO_BOS) { + dev_dbg(ddev, "skipping BOS descriptor\n"); + return -ENOMSG; + } + bos = kzalloc(sizeof(*bos), GFP_KERNEL); if (!bos) return -ENOMEM; diff --git a/drivers/usb/core/quirks.c b/drivers/usb/core/quirks.c index 47f589c4104a..c4d85089d19b 100644 --- a/drivers/usb/core/quirks.c +++ b/drivers/usb/core/quirks.c @@ -450,6 +450,9 @@ static const struct usb_device_id usb_quirk_list[] = { { USB_DEVICE(0x0c45, 0x7056), .driver_info = USB_QUIRK_IGNORE_REMOTE_WAKEUP }, + /* Elgato 4K X - BOS descriptor fetch hangs at SuperSpeed Plus */ + { USB_DEVICE(0x0fd9, 0x009b), .driver_info = USB_QUIRK_NO_BOS }, + /* Sony Xperia XZ1 Compact (lilac) smartphone in fastboot mode */ { USB_DEVICE(0x0fce, 0x0dde), .driver_info = USB_QUIRK_NO_LPM }, diff --git a/include/linux/usb/quirks.h b/include/linux/usb/quirks.h index 59409c1fc3de..2f7bd2fdc616 100644 --- a/include/linux/usb/quirks.h +++ b/include/linux/usb/quirks.h @@ -75,4 +75,7 @@ /* short SET_ADDRESS request timeout */ #define USB_QUIRK_SHORT_SET_ADDRESS_REQ_TIMEOUT BIT(16) +/* skip BOS descriptor request */ +#define USB_QUIRK_NO_BOS BIT(17) + #endif /* __LINUX_USB_QUIRKS_H */ -- cgit v1.2.3 From 5232196ff49be08350b27f1ba8e1fad87afc9cdf Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 2 Jan 2026 14:31:48 -0500 Subject: ftrace: Make 
ftrace_graph_ent depth field signed The code has integrity checks to make sure that depth never goes below zero. But the depth field has recently been converted to unsigned long from "int" (for alignment reasons). As unsigned long can never be less than zero, the integrity checks no longer work. Convert depth to long from unsigned long to allow the integrity checks to work again. Cc: stable@vger.kernel.org Cc: Mathieu Desnoyers Cc: pengdonglin Link: https://patch.msgid.link/20260102143148.251c2e16@gandalf.local.home Reported-by: Dan Carpenter Closes: https://lore.kernel.org/all/aS6kGi0maWBl-MjZ@stanley.mountain/ Fixes: f83ac7544fbf7 ("function_graph: Enable funcgraph-args and funcgraph-retaddr to work simultaneously") Signed-off-by: Steven Rostedt (Google) Acked-by: Masami Hiramatsu (Google) --- include/linux/ftrace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 770f0dc993cc..a3a8989e3268 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -1167,7 +1167,7 @@ static inline void ftrace_init(void) { } */ struct ftrace_graph_ent { unsigned long func; /* Current function */ - unsigned long depth; + long depth; /* signed to check for less than zero */ } __packed; /* -- cgit v1.2.3 From 5f1ef0dfcb5b7f4a91a9b0e0ba533efd9f7e2cdb Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 5 Jan 2026 20:31:41 -0500 Subject: tracing: Add recursion protection in kernel stack trace recording A bug was reported about an infinite recursion caused by tracing the rcu events with the kernel stack trace trigger enabled. The stack trace code called back into RCU which then called the stack trace again. Expand the ftrace recursion protection to add a set of bits to protect events from recursion. Each bit represents the context that the event is in (normal, softirq, interrupt and NMI). Have the stack trace code use the interrupt context to protect against recursion. 
Note, the bug showed an issue in both the RCU code as well as the tracing stacktrace code. This only handles the tracing stack trace side of the bug. The RCU fix will be handled separately. Link: https://lore.kernel.org/all/20260102122807.7025fc87@gandalf.local.home/ Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Joel Fernandes Cc: "Paul E. McKenney" Cc: Boqun Feng Link: https://patch.msgid.link/20260105203141.515cd49f@gandalf.local.home Reported-by: Yao Kai Tested-by: Yao Kai Fixes: 5f5fa7ea89dc ("rcu: Don't use negative nesting depth in __rcu_read_unlock()") Signed-off-by: Steven Rostedt (Google) --- include/linux/trace_recursion.h | 9 +++++++++ kernel/trace/trace.c | 6 ++++++ 2 files changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/trace_recursion.h b/include/linux/trace_recursion.h index ae04054a1be3..e6ca052b2a85 100644 --- a/include/linux/trace_recursion.h +++ b/include/linux/trace_recursion.h @@ -34,6 +34,13 @@ enum { TRACE_INTERNAL_SIRQ_BIT, TRACE_INTERNAL_TRANSITION_BIT, + /* Internal event use recursion bits */ + TRACE_INTERNAL_EVENT_BIT, + TRACE_INTERNAL_EVENT_NMI_BIT, + TRACE_INTERNAL_EVENT_IRQ_BIT, + TRACE_INTERNAL_EVENT_SIRQ_BIT, + TRACE_INTERNAL_EVENT_TRANSITION_BIT, + TRACE_BRANCH_BIT, /* * Abuse of the trace_recursion. 
@@ -58,6 +65,8 @@ enum { #define TRACE_LIST_START TRACE_INTERNAL_BIT +#define TRACE_EVENT_START TRACE_INTERNAL_EVENT_BIT + #define TRACE_CONTEXT_MASK ((1 << (TRACE_LIST_START + TRACE_CONTEXT_BITS)) - 1) /* diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 6f2148df14d9..aef9058537d5 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3012,6 +3012,11 @@ static void __ftrace_trace_stack(struct trace_array *tr, struct ftrace_stack *fstack; struct stack_entry *entry; int stackidx; + int bit; + + bit = trace_test_and_set_recursion(_THIS_IP_, _RET_IP_, TRACE_EVENT_START); + if (bit < 0) + return; /* * Add one, for this function and the call to save_stack_trace() @@ -3080,6 +3085,7 @@ static void __ftrace_trace_stack(struct trace_array *tr, /* Again, don't let gcc optimize things here */ barrier(); __this_cpu_dec(ftrace_stack_reserve); + trace_clear_recursion(bit); } static inline void ftrace_trace_stack(struct trace_array *tr, -- cgit v1.2.3 From ef56578274d2b98423c8ef82bb450223f5811b59 Mon Sep 17 00:00:00 2001 From: Michal Koutný Date: Wed, 7 Jan 2026 17:59:41 +0100 Subject: cgroup: Eliminate cgrp_ancestor_storage in cgroup_root MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The cgrp_ancestor_storage has two drawbacks: - it's not guaranteed that the member immediately follows struct cgrp in cgroup_root (root cgroup's ancestors[0] might thus point to a padding and not in cgrp_ancestor_storage proper), - this idiom raises warnings with -Wflex-array-member-not-at-end. Instead of relying on the auxiliary member in cgroup_root, define the 0-th level ancestor inside struct cgroup (needed for static allocation of cgrp_dfl_root), deeper cgroups would allocate flexible _low_ancestors[]. Unionized alias through ancestors[] will transparently join the two ranges. 
The above change would still leave the flexible array at the end of struct cgroup inside cgroup_root, so move cgrp also towards the end of cgroup_root to resolve the -Wflex-array-member-not-at-end. Link: https://lore.kernel.org/r/5fb74444-2fbb-476e-b1bf-3f3e279d0ced@embeddedor.com/ Reported-by: Gustavo A. R. Silva Closes: https://lore.kernel.org/r/b3eb050d-9451-4b60-b06c-ace7dab57497@embeddedor.com/ Cc: David Laight Acked-by: Gustavo A. R. Silva Signed-off-by: Michal Koutný Signed-off-by: Tejun Heo --- include/linux/cgroup-defs.h | 25 ++++++++++++++----------- kernel/cgroup/cgroup.c | 2 +- 2 files changed, 15 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index b760a3c470a5..f7cc60de0058 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -626,7 +626,13 @@ struct cgroup { #endif /* All ancestors including self */ - struct cgroup *ancestors[]; + union { + DECLARE_FLEX_ARRAY(struct cgroup *, ancestors); + struct { + struct cgroup *_root_ancestor; + DECLARE_FLEX_ARRAY(struct cgroup *, _low_ancestors); + }; + }; }; /* @@ -647,16 +653,6 @@ struct cgroup_root { struct list_head root_list; struct rcu_head rcu; /* Must be near the top */ - /* - * The root cgroup. The containing cgroup_root will be destroyed on its - * release. cgrp->ancestors[0] will be used overflowing into the - * following field. cgrp_ancestor_storage must immediately follow. - */ - struct cgroup cgrp; - - /* must follow cgrp for cgrp->ancestors[0], see above */ - struct cgroup *cgrp_ancestor_storage; - /* Number of cgroups in the hierarchy, used only for /proc/cgroups */ atomic_t nr_cgrps; @@ -668,6 +664,13 @@ struct cgroup_root { /* The name for this hierarchy - may be empty */ char name[MAX_CGROUP_ROOT_NAMELEN]; + + /* + * The root cgroup. The containing cgroup_root will be destroyed on its + * release. This must be embedded last due to flexible array at the end + * of struct cgroup. 
+ */ + struct cgroup cgrp; }; /* diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index e717208cfb18..554a02ee298b 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -5847,7 +5847,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, int ret; /* allocate the cgroup and its ID, 0 is reserved for the root */ - cgrp = kzalloc(struct_size(cgrp, ancestors, (level + 1)), GFP_KERNEL); + cgrp = kzalloc(struct_size(cgrp, _low_ancestors, level), GFP_KERNEL); if (!cgrp) return ERR_PTR(-ENOMEM); -- cgit v1.2.3 From 54b603f2db6b95495bc33a8f2bde80f044baff9a Mon Sep 17 00:00:00 2001 From: Yaxiong Tian Date: Tue, 30 Dec 2025 14:15:34 +0800 Subject: PM: EM: Fix incorrect description of the cost field in struct em_perf_state Due to commit 1b600da51073 ("PM: EM: Optimize em_cpu_energy() and remove division"), the logic for energy consumption calculation has been modified. The actual calculation of cost is 10 * power * max_frequency / frequency instead of power * max_frequency / frequency. Therefore, the comment for cost has been updated to reflect the correct content. Fixes: 1b600da51073 ("PM: EM: Optimize em_cpu_energy() and remove division") Signed-off-by: Yaxiong Tian Reviewed-by: Lukasz Luba [ rjw: Added Fixes: tag ] Link: https://patch.msgid.link/20251230061534.816894-1-tianyaxiong@kylinos.cn Signed-off-by: Rafael J. Wysocki --- include/linux/energy_model.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index 43aa6153dc57..e7497f804644 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -18,7 +18,7 @@ * @power: The power consumed at this level (by 1 CPU or by a registered * device). It can be a total power: static and dynamic. * @cost: The cost coefficient associated with this level, used during - * energy calculation. Equal to: power * max_frequency / frequency + * energy calculation. 
Equal to: 10 * power * max_frequency / frequency * @flags: see "em_perf_state flags" description below. */ struct em_perf_state { -- cgit v1.2.3 From 2e4b28c48f88ce9e263957b1d944cf5349952f88 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 11 Jan 2026 16:53:48 +0100 Subject: treewide: Update email address In a vain attempt to consolidate the email zoo switch everything to the kernel.org account. Signed-off-by: Thomas Gleixner Signed-off-by: Linus Torvalds --- .mailmap | 1 + CREDITS | 2 +- .../ABI/stable/sysfs-kernel-time-aux-clocks | 2 +- Documentation/arch/x86/topology.rst | 2 +- Documentation/core-api/cpu_hotplug.rst | 2 +- Documentation/core-api/genericirq.rst | 2 +- Documentation/core-api/librs.rst | 2 +- .../devicetree/bindings/timer/mrvl,mmp-timer.yaml | 2 +- Documentation/driver-api/mtdnand.rst | 4 +-- .../translations/zh_CN/core-api/cpu_hotplug.rst | 2 +- .../translations/zh_CN/core-api/genericirq.rst | 2 +- MAINTAINERS | 36 +++++++++++----------- arch/sh/kernel/perf_event.c | 2 +- arch/sparc/kernel/perf_event.c | 2 +- arch/x86/events/core.c | 2 +- arch/x86/events/perf_event.h | 2 +- arch/x86/kernel/x86_init.c | 2 +- arch/x86/mm/pti.c | 2 +- drivers/mtd/nand/ecc-sw-hamming.c | 2 +- drivers/mtd/nand/raw/diskonchip.c | 2 +- drivers/mtd/nand/raw/nand_base.c | 4 +-- drivers/mtd/nand/raw/nand_bbt.c | 2 +- drivers/mtd/nand/raw/nand_ids.c | 2 +- drivers/mtd/nand/raw/nand_jedec.c | 2 +- drivers/mtd/nand/raw/nand_legacy.c | 2 +- drivers/mtd/nand/raw/nand_onfi.c | 2 +- drivers/mtd/nand/raw/ndfc.c | 2 +- drivers/uio/uio.c | 2 +- fs/jffs2/wbuf.c | 4 +-- include/linux/hrtimer.h | 2 +- include/linux/ktime.h | 2 +- include/linux/mtd/jedec.h | 2 +- include/linux/mtd/nand-ecc-sw-hamming.h | 2 +- include/linux/mtd/ndfc.h | 2 +- include/linux/mtd/onfi.h | 2 +- include/linux/mtd/platnand.h | 2 +- include/linux/mtd/rawnand.h | 2 +- include/linux/perf_event.h | 2 +- include/linux/plist.h | 2 +- include/linux/rslib.h | 2 +- include/linux/uio_driver.h | 2 +- 
include/uapi/linux/perf_event.h | 2 +- kernel/events/callchain.c | 2 +- kernel/events/core.c | 2 +- kernel/events/ring_buffer.c | 2 +- kernel/irq/debugfs.c | 2 +- kernel/irq/matrix.c | 2 +- kernel/sched/fair.c | 2 +- kernel/sched/pelt.c | 2 +- kernel/time/clockevents.c | 2 +- kernel/time/hrtimer.c | 2 +- kernel/time/tick-broadcast.c | 2 +- kernel/time/tick-common.c | 2 +- kernel/time/tick-oneshot.c | 2 +- kernel/time/tick-sched.c | 2 +- lib/debugobjects.c | 2 +- lib/plist.c | 2 +- lib/reed_solomon/decode_rs.c | 2 +- lib/reed_solomon/encode_rs.c | 2 +- lib/reed_solomon/reed_solomon.c | 2 +- scripts/spdxcheck.py | 2 +- tools/include/uapi/linux/perf_event.h | 2 +- tools/perf/builtin-list.c | 2 +- 63 files changed, 83 insertions(+), 82 deletions(-) (limited to 'include/linux') diff --git a/.mailmap b/.mailmap index b23e0853d636..fa018b5bd533 100644 --- a/.mailmap +++ b/.mailmap @@ -801,6 +801,7 @@ Tanzir Hasan Tejun Heo Tomeu Vizoso Thomas Graf +Thomas Gleixner Thomas Körper Thomas Pedersen Thorsten Blum diff --git a/CREDITS b/CREDITS index ca75f110edb6..383809bc4b7a 100644 --- a/CREDITS +++ b/CREDITS @@ -1398,7 +1398,7 @@ D: SRM environment driver (for Alpha systems) P: 1024D/8399E1BB 250D 3BCF 7127 0D8C A444 A961 1DBD 5E75 8399 E1BB N: Thomas Gleixner -E: tglx@linutronix.de +E: tglx@kernel.org D: NAND flash hardware support, JFFS2 on NAND flash N: Jérôme Glisse diff --git a/Documentation/ABI/stable/sysfs-kernel-time-aux-clocks b/Documentation/ABI/stable/sysfs-kernel-time-aux-clocks index 825508f42af6..e1a894c8dd1b 100644 --- a/Documentation/ABI/stable/sysfs-kernel-time-aux-clocks +++ b/Documentation/ABI/stable/sysfs-kernel-time-aux-clocks @@ -1,5 +1,5 @@ What: /sys/kernel/time/aux_clocks//enable Date: May 2025 -Contact: Thomas Gleixner +Contact: Thomas Gleixner Description: Controls the enablement of auxiliary clock timekeepers. 
diff --git a/Documentation/arch/x86/topology.rst b/Documentation/arch/x86/topology.rst index 86bec8ac2c4d..f779a68875c5 100644 --- a/Documentation/arch/x86/topology.rst +++ b/Documentation/arch/x86/topology.rst @@ -17,7 +17,7 @@ with the generic one and look at this one in parallel for the x86 specifics. Needless to say, code should use the generic functions - this file is *only* here to *document* the inner workings of x86 topology. -Started by Thomas Gleixner and Borislav Petkov . +Started by Thomas Gleixner and Borislav Petkov . The main aim of the topology facilities is to present adequate interfaces to code which needs to know/query/use the structure of the running system wrt diff --git a/Documentation/core-api/cpu_hotplug.rst b/Documentation/core-api/cpu_hotplug.rst index e1b0eeabbb5e..9b4afca9fd09 100644 --- a/Documentation/core-api/cpu_hotplug.rst +++ b/Documentation/core-api/cpu_hotplug.rst @@ -8,7 +8,7 @@ CPU hotplug in the Kernel Srivatsa Vaddagiri , Ashok Raj , Joel Schopp , - Thomas Gleixner + Thomas Gleixner Introduction ============ diff --git a/Documentation/core-api/genericirq.rst b/Documentation/core-api/genericirq.rst index 582bde9bf5a9..b16d751d4b98 100644 --- a/Documentation/core-api/genericirq.rst +++ b/Documentation/core-api/genericirq.rst @@ -439,6 +439,6 @@ Credits The following people have contributed to this document: -1. Thomas Gleixner tglx@linutronix.de +1. Thomas Gleixner tglx@kernel.org 2. Ingo Molnar mingo@elte.hu diff --git a/Documentation/core-api/librs.rst b/Documentation/core-api/librs.rst index 6010f5bc5bf9..0d88893dbc03 100644 --- a/Documentation/core-api/librs.rst +++ b/Documentation/core-api/librs.rst @@ -209,4 +209,4 @@ testing. Thanks a lot. 
The following people have contributed to this document: -Thomas Gleixner\ tglx@linutronix.de +Thomas Gleixner\ tglx@kernel.org diff --git a/Documentation/devicetree/bindings/timer/mrvl,mmp-timer.yaml b/Documentation/devicetree/bindings/timer/mrvl,mmp-timer.yaml index fe6bc4173789..0643cfcc6bc7 100644 --- a/Documentation/devicetree/bindings/timer/mrvl,mmp-timer.yaml +++ b/Documentation/devicetree/bindings/timer/mrvl,mmp-timer.yaml @@ -8,7 +8,7 @@ title: Marvell MMP Timer maintainers: - Daniel Lezcano - - Thomas Gleixner + - Thomas Gleixner - Rob Herring properties: diff --git a/Documentation/driver-api/mtdnand.rst b/Documentation/driver-api/mtdnand.rst index ce77e024c4f1..adf03983f1ba 100644 --- a/Documentation/driver-api/mtdnand.rst +++ b/Documentation/driver-api/mtdnand.rst @@ -996,11 +996,11 @@ The following people have contributed to the NAND driver: 2. David Woodhouse\ dwmw2@infradead.org -3. Thomas Gleixner\ tglx@linutronix.de +3. Thomas Gleixner\ tglx@kernel.org A lot of users have provided bugfixes, improvements and helping hands for testing. Thanks a lot. The following people have contributed to this document: -1. Thomas Gleixner\ tglx@linutronix.de +1. Thomas Gleixner\ tglx@kernel.org diff --git a/Documentation/translations/zh_CN/core-api/cpu_hotplug.rst b/Documentation/translations/zh_CN/core-api/cpu_hotplug.rst index bc0d7ea6d834..3447fbf0e695 100644 --- a/Documentation/translations/zh_CN/core-api/cpu_hotplug.rst +++ b/Documentation/translations/zh_CN/core-api/cpu_hotplug.rst @@ -22,7 +22,7 @@ Srivatsa Vaddagiri , Ashok Raj , Joel Schopp , - Thomas Gleixner + Thomas Gleixner 简介 ==== diff --git a/Documentation/translations/zh_CN/core-api/genericirq.rst b/Documentation/translations/zh_CN/core-api/genericirq.rst index 05ccb954c18d..d2c1bd94bb97 100644 --- a/Documentation/translations/zh_CN/core-api/genericirq.rst +++ b/Documentation/translations/zh_CN/core-api/genericirq.rst @@ -404,6 +404,6 @@ kernel/irq/chip.c 感谢以下人士对本文档作出的贡献: -1. 
Thomas Gleixner tglx@linutronix.de +1. Thomas Gleixner tglx@kernel.org 2. Ingo Molnar mingo@elte.hu diff --git a/MAINTAINERS b/MAINTAINERS index 32b5e41d9849..ee036e0a3ef6 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6175,7 +6175,7 @@ F: include/linux/clk.h CLOCKSOURCE, CLOCKEVENT DRIVERS M: Daniel Lezcano -M: Thomas Gleixner +M: Thomas Gleixner L: linux-kernel@vger.kernel.org S: Supported T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git timers/core @@ -6541,7 +6541,7 @@ S: Maintained F: drivers/cpufreq/virtual-cpufreq.c CPU HOTPLUG -M: Thomas Gleixner +M: Thomas Gleixner M: Peter Zijlstra L: linux-kernel@vger.kernel.org S: Maintained @@ -6968,7 +6968,7 @@ F: Documentation/scsi/dc395x.rst F: drivers/scsi/dc395x.* DEBUGOBJECTS: -M: Thomas Gleixner +M: Thomas Gleixner L: linux-kernel@vger.kernel.org S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git core/debugobjects @@ -10371,7 +10371,7 @@ F: include/uapi/linux/fuse.h F: tools/testing/selftests/filesystems/fuse/ FUTEX SUBSYSTEM -M: Thomas Gleixner +M: Thomas Gleixner M: Ingo Molnar R: Peter Zijlstra R: Darren Hart @@ -10515,7 +10515,7 @@ F: drivers/base/arch_topology.c F: include/linux/arch_topology.h GENERIC ENTRY CODE -M: Thomas Gleixner +M: Thomas Gleixner M: Peter Zijlstra M: Andy Lutomirski L: linux-kernel@vger.kernel.org @@ -10628,7 +10628,7 @@ F: drivers/uio/uio_pci_generic.c GENERIC VDSO LIBRARY M: Andy Lutomirski -M: Thomas Gleixner +M: Thomas Gleixner M: Vincenzo Frascino L: linux-kernel@vger.kernel.org S: Maintained @@ -11241,7 +11241,7 @@ F: drivers/hid/hid-logitech-hidpp.c HIGH-RESOLUTION TIMERS, TIMER WHEEL, CLOCKEVENTS M: Anna-Maria Behnsen M: Frederic Weisbecker -M: Thomas Gleixner +M: Thomas Gleixner L: linux-kernel@vger.kernel.org S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git timers/core @@ -11264,7 +11264,7 @@ R: Boqun Feng R: FUJITA Tomonori R: Frederic Weisbecker R: Lyude Paul -R: Thomas Gleixner +R: Thomas Gleixner 
R: Anna-Maria Behnsen R: John Stultz R: Stephen Boyd @@ -13334,7 +13334,7 @@ F: Documentation/devicetree/bindings/sound/irondevice,* F: sound/soc/codecs/sma* IRQ DOMAINS (IRQ NUMBER MAPPING LIBRARY) -M: Thomas Gleixner +M: Thomas Gleixner S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git irq/core F: Documentation/core-api/irq/irq-domain.rst @@ -13344,7 +13344,7 @@ F: kernel/irq/irqdomain.c F: kernel/irq/msi.c IRQ SUBSYSTEM -M: Thomas Gleixner +M: Thomas Gleixner L: linux-kernel@vger.kernel.org S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git irq/core @@ -13357,7 +13357,7 @@ F: kernel/irq/ F: lib/group_cpus.c IRQCHIP DRIVERS -M: Thomas Gleixner +M: Thomas Gleixner L: linux-kernel@vger.kernel.org S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git irq/core @@ -14451,7 +14451,7 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-nonmm-unstab F: lib/* LICENSES and SPDX stuff -M: Thomas Gleixner +M: Thomas Gleixner M: Greg Kroah-Hartman L: linux-spdx@vger.kernel.org S: Maintained @@ -18576,7 +18576,7 @@ NOHZ, DYNTICKS SUPPORT M: Anna-Maria Behnsen M: Frederic Weisbecker M: Ingo Molnar -M: Thomas Gleixner +M: Thomas Gleixner L: linux-kernel@vger.kernel.org S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git timers/nohz @@ -20761,7 +20761,7 @@ F: drivers/platform/x86/portwell-ec.c POSIX CLOCKS and TIMERS M: Anna-Maria Behnsen M: Frederic Weisbecker -M: Thomas Gleixner +M: Thomas Gleixner L: linux-kernel@vger.kernel.org S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git timers/core @@ -26272,7 +26272,7 @@ F: drivers/net/wireless/ti/ TIMEKEEPING, CLOCKSOURCE CORE, NTP, ALARMTIMER M: John Stultz -M: Thomas Gleixner +M: Thomas Gleixner R: Stephen Boyd L: linux-kernel@vger.kernel.org S: Supported @@ -28203,7 +28203,7 @@ F: net/lapb/ F: net/x25/ X86 ARCHITECTURE (32-BIT AND 64-BIT) -M: Thomas Gleixner +M: Thomas Gleixner 
M: Ingo Molnar M: Borislav Petkov M: Dave Hansen @@ -28219,7 +28219,7 @@ F: tools/testing/selftests/x86 X86 CPUID DATABASE M: Borislav Petkov -M: Thomas Gleixner +M: Thomas Gleixner M: x86@kernel.org R: Ahmed S. Darwish L: x86-cpuid@lists.linux.dev @@ -28235,7 +28235,7 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git x86/asm F: arch/x86/entry/ X86 HARDWARE VULNERABILITIES -M: Thomas Gleixner +M: Thomas Gleixner M: Borislav Petkov M: Peter Zijlstra M: Josh Poimboeuf diff --git a/arch/sh/kernel/perf_event.c b/arch/sh/kernel/perf_event.c index 1d2507f22437..1fbb7d46e484 100644 --- a/arch/sh/kernel/perf_event.c +++ b/arch/sh/kernel/perf_event.c @@ -7,7 +7,7 @@ * Heavily based on the x86 and PowerPC implementations. * * x86: - * Copyright (C) 2008 Thomas Gleixner + * Copyright (C) 2008 Linutronix GmbH, Thomas Gleixner * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar * Copyright (C) 2009 Jaswinder Singh Rajput * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c index cae4d33002a5..0ce4ae343531 100644 --- a/arch/sparc/kernel/perf_event.c +++ b/arch/sparc/kernel/perf_event.c @@ -6,7 +6,7 @@ * This code is based almost entirely upon the x86 perf event * code, which is: * - * Copyright (C) 2008 Thomas Gleixner + * Copyright (C) 2008 Linutronix GmbH, Thomas Gleixner * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar * Copyright (C) 2009 Jaswinder Singh Rajput * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 0c38a31d5fc7..576baa9a52c5 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -1,7 +1,7 @@ /* * Performance events x86 architecture code * - * Copyright (C) 2008 Thomas Gleixner + * Copyright (C) 2008 Linutronix GmbH, Thomas Gleixner * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar * Copyright (C) 2009 Jaswinder Singh Rajput * Copyright (C) 2009 Advanced 
Micro Devices, Inc., Robert Richter diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 3161ec0a3416..62963022b517 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -1,7 +1,7 @@ /* * Performance events x86 architecture header * - * Copyright (C) 2008 Thomas Gleixner + * Copyright (C) 2008 Linutronix GmbH, Thomas Gleixner * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar * Copyright (C) 2009 Jaswinder Singh Rajput * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 0a2bbd674a6d..ebefb77c37bb 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2009 Thomas Gleixner + * Copyright (C) 2009 Linutronix GmbH, Thomas Gleixner * * For licencing details see kernel-base/COPYING */ diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c index b10d4d131dce..f7546e9e8e89 100644 --- a/arch/x86/mm/pti.c +++ b/arch/x86/mm/pti.c @@ -15,7 +15,7 @@ * Signed-off-by: Michael Schwarz * * Major changes to the original code by: Dave Hansen - * Mostly rewritten by Thomas Gleixner and + * Mostly rewritten by Thomas Gleixner and * Andy Lutomirsky */ #include diff --git a/drivers/mtd/nand/ecc-sw-hamming.c b/drivers/mtd/nand/ecc-sw-hamming.c index f2d0effad9d2..bc62a71f9fdd 100644 --- a/drivers/mtd/nand/ecc-sw-hamming.c +++ b/drivers/mtd/nand/ecc-sw-hamming.c @@ -8,7 +8,7 @@ * * Completely replaces the previous ECC implementation which was written by: * Steven J. 
Hill (sjhill@realitydiluted.com) - * Thomas Gleixner (tglx@linutronix.de) + * Thomas Gleixner (tglx@kernel.org) * * Information on how this algorithm works and how it was developed * can be found in Documentation/driver-api/mtd/nand_ecc.rst diff --git a/drivers/mtd/nand/raw/diskonchip.c b/drivers/mtd/nand/raw/diskonchip.c index 70d6c2250f32..540b6baf8bb1 100644 --- a/drivers/mtd/nand/raw/diskonchip.c +++ b/drivers/mtd/nand/raw/diskonchip.c @@ -11,7 +11,7 @@ * Error correction code lifted from the old docecc code * Author: Fabrice Bellard (fabrice.bellard@netgem.com) * Copyright (C) 2000 Netgem S.A. - * converted to the generic Reed-Solomon library by Thomas Gleixner + * converted to the generic Reed-Solomon library by Thomas Gleixner * * Interface to generic NAND code for M-Systems DiskOnChip devices */ diff --git a/drivers/mtd/nand/raw/nand_base.c b/drivers/mtd/nand/raw/nand_base.c index ad6d66309597..f2322de93ab4 100644 --- a/drivers/mtd/nand/raw/nand_base.c +++ b/drivers/mtd/nand/raw/nand_base.c @@ -8,7 +8,7 @@ * http://www.linux-mtd.infradead.org/doc/nand.html * * Copyright (C) 2000 Steven J. Hill (sjhill@realitydiluted.com) - * 2002-2006 Thomas Gleixner (tglx@linutronix.de) + * 2002-2006 Thomas Gleixner (tglx@kernel.org) * * Credits: * David Woodhouse for adding multichip support @@ -6594,5 +6594,5 @@ EXPORT_SYMBOL_GPL(nand_cleanup); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Steven J. 
Hill "); -MODULE_AUTHOR("Thomas Gleixner "); +MODULE_AUTHOR("Thomas Gleixner "); MODULE_DESCRIPTION("Generic NAND flash driver code"); diff --git a/drivers/mtd/nand/raw/nand_bbt.c b/drivers/mtd/nand/raw/nand_bbt.c index a8fba5f39f59..3050ab7e6eb6 100644 --- a/drivers/mtd/nand/raw/nand_bbt.c +++ b/drivers/mtd/nand/raw/nand_bbt.c @@ -3,7 +3,7 @@ * Overview: * Bad block table support for the NAND driver * - * Copyright © 2004 Thomas Gleixner (tglx@linutronix.de) + * Copyright © 2004 Thomas Gleixner (tglx@kernel.org) * * Description: * diff --git a/drivers/mtd/nand/raw/nand_ids.c b/drivers/mtd/nand/raw/nand_ids.c index 650351c62af6..62a8cf86d9e2 100644 --- a/drivers/mtd/nand/raw/nand_ids.c +++ b/drivers/mtd/nand/raw/nand_ids.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Copyright (C) 2002 Thomas Gleixner (tglx@linutronix.de) + * Copyright (C) 2002 Thomas Gleixner (tglx@kernel.org) */ #include diff --git a/drivers/mtd/nand/raw/nand_jedec.c b/drivers/mtd/nand/raw/nand_jedec.c index b3cc8f360529..89e6dd8ed1a8 100644 --- a/drivers/mtd/nand/raw/nand_jedec.c +++ b/drivers/mtd/nand/raw/nand_jedec.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2000 Steven J. Hill (sjhill@realitydiluted.com) - * 2002-2006 Thomas Gleixner (tglx@linutronix.de) + * 2002-2006 Thomas Gleixner (tglx@kernel.org) * * Credits: * David Woodhouse for adding multichip support diff --git a/drivers/mtd/nand/raw/nand_legacy.c b/drivers/mtd/nand/raw/nand_legacy.c index 743792edf98d..97700f80d5b8 100644 --- a/drivers/mtd/nand/raw/nand_legacy.c +++ b/drivers/mtd/nand/raw/nand_legacy.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2000 Steven J. 
Hill (sjhill@realitydiluted.com) - * 2002-2006 Thomas Gleixner (tglx@linutronix.de) + * 2002-2006 Thomas Gleixner (tglx@kernel.org) * * Credits: * David Woodhouse for adding multichip support diff --git a/drivers/mtd/nand/raw/nand_onfi.c b/drivers/mtd/nand/raw/nand_onfi.c index 861975e44b55..11954440e4de 100644 --- a/drivers/mtd/nand/raw/nand_onfi.c +++ b/drivers/mtd/nand/raw/nand_onfi.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2000 Steven J. Hill (sjhill@realitydiluted.com) - * 2002-2006 Thomas Gleixner (tglx@linutronix.de) + * 2002-2006 Thomas Gleixner (tglx@kernel.org) * * Credits: * David Woodhouse for adding multichip support diff --git a/drivers/mtd/nand/raw/ndfc.c b/drivers/mtd/nand/raw/ndfc.c index 13365128194d..7ad8bc04be1a 100644 --- a/drivers/mtd/nand/raw/ndfc.c +++ b/drivers/mtd/nand/raw/ndfc.c @@ -272,5 +272,5 @@ static struct platform_driver ndfc_driver = { module_platform_driver(ndfc_driver); MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Thomas Gleixner "); +MODULE_AUTHOR("Thomas Gleixner "); MODULE_DESCRIPTION("OF Platform driver for NDFC"); diff --git a/drivers/uio/uio.c b/drivers/uio/uio.c index d93ed4e86a17..fa0d4e6aee16 100644 --- a/drivers/uio/uio.c +++ b/drivers/uio/uio.c @@ -3,7 +3,7 @@ * drivers/uio/uio.c * * Copyright(C) 2005, Benedikt Spranger - * Copyright(C) 2005, Thomas Gleixner + * Copyright(C) 2005, Linutronix GmbH, Thomas Gleixner * Copyright(C) 2006, Hans J. Koch * Copyright(C) 2006, Greg Kroah-Hartman * diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c index bb815a002984..3ab3f0ff7ebb 100644 --- a/fs/jffs2/wbuf.c +++ b/fs/jffs2/wbuf.c @@ -2,10 +2,10 @@ * JFFS2 -- Journalling Flash File System, Version 2. * * Copyright © 2001-2007 Red Hat, Inc. 
- * Copyright © 2004 Thomas Gleixner + * Copyright © 2004 Thomas Gleixner * * Created by David Woodhouse - * Modified debugged and enhanced by Thomas Gleixner + * Modified debugged and enhanced by Thomas Gleixner * * For licensing information, see the file 'LICENCE' in this directory. * diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 2cf1bf65b225..0de12f14d6a4 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -2,7 +2,7 @@ /* * hrtimers - High-resolution kernel timers * - * Copyright(C) 2005, Thomas Gleixner + * Copyright(C) 2005, Linutronix GmbH, Thomas Gleixner * Copyright(C) 2005, Red Hat, Inc., Ingo Molnar * * data type definitions, declarations, prototypes diff --git a/include/linux/ktime.h b/include/linux/ktime.h index 383ed9985802..f247e564602f 100644 --- a/include/linux/ktime.h +++ b/include/linux/ktime.h @@ -3,7 +3,7 @@ * * ktime_t - nanosecond-resolution time format. * - * Copyright(C) 2005, Thomas Gleixner + * Copyright(C) 2005, Linutronix GmbH, Thomas Gleixner * Copyright(C) 2005, Red Hat, Inc., Ingo Molnar * * data type definitions, declarations, prototypes and macros. diff --git a/include/linux/mtd/jedec.h b/include/linux/mtd/jedec.h index 56047a4e54c9..255972f3d88d 100644 --- a/include/linux/mtd/jedec.h +++ b/include/linux/mtd/jedec.h @@ -2,7 +2,7 @@ /* * Copyright © 2000-2010 David Woodhouse * Steven J. Hill - * Thomas Gleixner + * Thomas Gleixner * * Contains all JEDEC related definitions */ diff --git a/include/linux/mtd/nand-ecc-sw-hamming.h b/include/linux/mtd/nand-ecc-sw-hamming.h index c6c71894c575..2aa2f8ef68d2 100644 --- a/include/linux/mtd/nand-ecc-sw-hamming.h +++ b/include/linux/mtd/nand-ecc-sw-hamming.h @@ -2,7 +2,7 @@ /* * Copyright (C) 2000-2010 Steven J. Hill * David Woodhouse - * Thomas Gleixner + * Thomas Gleixner * * This file is the header for the NAND Hamming ECC implementation. 
*/ diff --git a/include/linux/mtd/ndfc.h b/include/linux/mtd/ndfc.h index 98f075b86931..622891191e9c 100644 --- a/include/linux/mtd/ndfc.h +++ b/include/linux/mtd/ndfc.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* - * Copyright (c) 2006 Thomas Gleixner + * Copyright (c) 2006 Linutronix GmbH, Thomas Gleixner * * Info: * Contains defines, datastructures for ndfc nand controller diff --git a/include/linux/mtd/onfi.h b/include/linux/mtd/onfi.h index 55ab2e4d62f9..09a5cbd8f232 100644 --- a/include/linux/mtd/onfi.h +++ b/include/linux/mtd/onfi.h @@ -2,7 +2,7 @@ /* * Copyright © 2000-2010 David Woodhouse * Steven J. Hill - * Thomas Gleixner + * Thomas Gleixner * * Contains all ONFI related definitions */ diff --git a/include/linux/mtd/platnand.h b/include/linux/mtd/platnand.h index bc11eb6b593b..2df6fba699f2 100644 --- a/include/linux/mtd/platnand.h +++ b/include/linux/mtd/platnand.h @@ -2,7 +2,7 @@ /* * Copyright © 2000-2010 David Woodhouse * Steven J. Hill - * Thomas Gleixner + * Thomas Gleixner * * Contains all platform NAND related definitions. */ diff --git a/include/linux/mtd/rawnand.h b/include/linux/mtd/rawnand.h index d30bdc3fcfd7..5c70e7bd3ed5 100644 --- a/include/linux/mtd/rawnand.h +++ b/include/linux/mtd/rawnand.h @@ -2,7 +2,7 @@ /* * Copyright © 2000-2010 David Woodhouse * Steven J. 
Hill - * Thomas Gleixner + * Thomas Gleixner * * Info: * Contains standard defines and IDs for NAND flash devices diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 9870d768db4c..9ded2e582c60 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1,7 +1,7 @@ /* * Performance events: * - * Copyright (C) 2008-2009, Thomas Gleixner + * Copyright (C) 2008-2009, Linutronix GmbH, Thomas Gleixner * Copyright (C) 2008-2011, Red Hat, Inc., Ingo Molnar * Copyright (C) 2008-2011, Red Hat, Inc., Peter Zijlstra * diff --git a/include/linux/plist.h b/include/linux/plist.h index 8c1c8adf7fe9..16cf4355b5c1 100644 --- a/include/linux/plist.h +++ b/include/linux/plist.h @@ -8,7 +8,7 @@ * 2001-2005 (c) MontaVista Software, Inc. * Daniel Walker * - * (C) 2005 Thomas Gleixner + * (C) 2005 Linutronix GmbH, Thomas Gleixner * * Simplifications of the original code by * Oleg Nesterov diff --git a/include/linux/rslib.h b/include/linux/rslib.h index a04dacbdc8ae..a2848f6907e3 100644 --- a/include/linux/rslib.h +++ b/include/linux/rslib.h @@ -2,7 +2,7 @@ /* * Generic Reed Solomon encoder / decoder library * - * Copyright (C) 2004 Thomas Gleixner (tglx@linutronix.de) + * Copyright (C) 2004 Thomas Gleixner (tglx@kernel.org) * * RS code lifted from reed solomon library written by Phil Karn * Copyright 2002 Phil Karn, KA9Q diff --git a/include/linux/uio_driver.h b/include/linux/uio_driver.h index 18238dc8bfd3..334641e20fb1 100644 --- a/include/linux/uio_driver.h +++ b/include/linux/uio_driver.h @@ -3,7 +3,7 @@ * include/linux/uio_driver.h * * Copyright(C) 2005, Benedikt Spranger - * Copyright(C) 2005, Thomas Gleixner + * Copyright(C) 2005, Linutronix GmbH, Thomas Gleixner * Copyright(C) 2006, Hans J. 
Koch * Copyright(C) 2006, Greg Kroah-Hartman * diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index c44a8fb3e418..72f03153dd32 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -2,7 +2,7 @@ /* * Performance events: * - * Copyright (C) 2008-2009, Thomas Gleixner + * Copyright (C) 2008-2009, Linutronix GmbH, Thomas Gleixner * Copyright (C) 2008-2011, Red Hat, Inc., Ingo Molnar * Copyright (C) 2008-2011, Red Hat, Inc., Peter Zijlstra * diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index b9c7e00725d6..1f6589578703 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -2,7 +2,7 @@ /* * Performance events callchain code, extracted from core.c: * - * Copyright (C) 2008 Thomas Gleixner + * Copyright (C) 2008 Linutronix GmbH, Thomas Gleixner * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra * Copyright © 2009 Paul Mackerras, IBM Corp. diff --git a/kernel/events/core.c b/kernel/events/core.c index dad0d3d2e85f..f5e9d30e4fa9 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2,7 +2,7 @@ /* * Performance events core code: * - * Copyright (C) 2008 Thomas Gleixner + * Copyright (C) 2008 Linutronix GmbH, Thomas Gleixner * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra * Copyright © 2009 Paul Mackerras, IBM Corp. diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 20a905023736..3e7de2661417 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -2,7 +2,7 @@ /* * Performance events ring-buffer code: * - * Copyright (C) 2008 Thomas Gleixner + * Copyright (C) 2008 Linutronix GmbH, Thomas Gleixner * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra * Copyright © 2009 Paul Mackerras, IBM Corp. 
diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c index 3527defd2890..5c5ebaee35f2 100644 --- a/kernel/irq/debugfs.c +++ b/kernel/irq/debugfs.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -// Copyright 2017 Thomas Gleixner +// Copyright 2017 Linutronix GmbH, Thomas Gleixner #include #include diff --git a/kernel/irq/matrix.c b/kernel/irq/matrix.c index 8f222d1cccec..a50f2305a8dc 100644 --- a/kernel/irq/matrix.c +++ b/kernel/irq/matrix.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -// Copyright (C) 2017 Thomas Gleixner +// Copyright (C) 2017 Linutronix GmbH, Thomas Gleixner #include #include diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index da46c3164537..e71302282671 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -15,7 +15,7 @@ * Author: Srivatsa Vaddagiri * * Scaled math optimizations by Thomas Gleixner - * Copyright (C) 2007, Thomas Gleixner + * Copyright (C) 2007, Linutronix GmbH, Thomas Gleixner * * Adaptive scheduling granularity, math enhancements by Peter Zijlstra * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index fa83bbaf4f3e..897790889ba3 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -15,7 +15,7 @@ * Author: Srivatsa Vaddagiri * * Scaled math optimizations by Thomas Gleixner - * Copyright (C) 2007, Thomas Gleixner + * Copyright (C) 2007, Linutronix GmbH, Thomas Gleixner * * Adaptive scheduling granularity, math enhancements by Peter Zijlstra * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index a59bc75ab7c5..eaae1ce9f060 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -2,7 +2,7 @@ /* * This file contains functions which manage clock event devices. 
* - * Copyright(C) 2005-2006, Thomas Gleixner + * Copyright(C) 2005-2006, Linutronix GmbH, Thomas Gleixner * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner */ diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index f8ea8c8fc895..bdb30cc5e873 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Copyright(C) 2005-2006, Thomas Gleixner + * Copyright(C) 2005-2006, Linutronix GmbH, Thomas Gleixner * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar * Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner * diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 0207868c8b4d..f63c65881364 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -3,7 +3,7 @@ * This file contains functions which emulate a local clock-event * device via a broadcast event source. * - * Copyright(C) 2005-2006, Thomas Gleixner + * Copyright(C) 2005-2006, Linutronix GmbH, Thomas Gleixner * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner */ diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 7e33d3f2e889..d305d8521896 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -3,7 +3,7 @@ * This file contains the base functions to manage periodic tick * related events. * - * Copyright(C) 2005-2006, Thomas Gleixner + * Copyright(C) 2005-2006, Linutronix GmbH, Thomas Gleixner * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner */ diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index ffee943d796d..7472597f3225 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c @@ -3,7 +3,7 @@ * This file contains functions which manage high resolution tick * related events. 
* - * Copyright(C) 2005-2006, Thomas Gleixner + * Copyright(C) 2005-2006, Linutronix GmbH, Thomas Gleixner * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner */ diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 8ddf74e705d3..2f8a7923fa27 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Copyright(C) 2005-2006, Thomas Gleixner + * Copyright(C) 2005-2006, Linutronix GmbH, Thomas Gleixner * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar * Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner * diff --git a/lib/debugobjects.c b/lib/debugobjects.c index ecf8e7f978e3..89a1d6745dc2 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -2,7 +2,7 @@ /* * Generic infrastructure for lifetime debugging of objects. * - * Copyright (C) 2008, Thomas Gleixner + * Copyright (C) 2008, Linutronix GmbH, Thomas Gleixner */ #define pr_fmt(fmt) "ODEBUG: " fmt diff --git a/lib/plist.c b/lib/plist.c index ba677c31e8f3..a5bef38add43 100644 --- a/lib/plist.c +++ b/lib/plist.c @@ -10,7 +10,7 @@ * 2001-2005 (c) MontaVista Software, Inc. * Daniel Walker * - * (C) 2005 Thomas Gleixner + * (C) 2005 Linutronix GmbH, Thomas Gleixner * * Simplifications of the original code by * Oleg Nesterov diff --git a/lib/reed_solomon/decode_rs.c b/lib/reed_solomon/decode_rs.c index 805de84ae83d..ef86ee2aec58 100644 --- a/lib/reed_solomon/decode_rs.c +++ b/lib/reed_solomon/decode_rs.c @@ -5,7 +5,7 @@ * Copyright 2002, Phil Karn, KA9Q * May be used under the terms of the GNU General Public License (GPL) * - * Adaption to the kernel by Thomas Gleixner (tglx@linutronix.de) + * Adaption to the kernel by Thomas Gleixner (tglx@kernel.org) * * Generic data width independent code which is included by the wrappers. 
*/ diff --git a/lib/reed_solomon/encode_rs.c b/lib/reed_solomon/encode_rs.c index 9112d46e869e..1d9e51dcc83d 100644 --- a/lib/reed_solomon/encode_rs.c +++ b/lib/reed_solomon/encode_rs.c @@ -5,7 +5,7 @@ * Copyright 2002, Phil Karn, KA9Q * May be used under the terms of the GNU General Public License (GPL) * - * Adaption to the kernel by Thomas Gleixner (tglx@linutronix.de) + * Adaption to the kernel by Thomas Gleixner (tglx@kernel.org) * * Generic data width independent code which is included by the wrappers. */ diff --git a/lib/reed_solomon/reed_solomon.c b/lib/reed_solomon/reed_solomon.c index bbc01bad3053..a9e2dcb6f2a7 100644 --- a/lib/reed_solomon/reed_solomon.c +++ b/lib/reed_solomon/reed_solomon.c @@ -2,7 +2,7 @@ /* * Generic Reed Solomon encoder / decoder library * - * Copyright (C) 2004 Thomas Gleixner (tglx@linutronix.de) + * Copyright (C) 2004 Thomas Gleixner (tglx@kernel.org) * * Reed Solomon code lifted from reed solomon library written by Phil Karn * Copyright 2002 Phil Karn, KA9Q diff --git a/scripts/spdxcheck.py b/scripts/spdxcheck.py index 8d608f61bf37..908029e45ca2 100755 --- a/scripts/spdxcheck.py +++ b/scripts/spdxcheck.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # SPDX-License-Identifier: GPL-2.0 -# Copyright Thomas Gleixner +# Copyright Linutronix GmbH, Thomas Gleixner from argparse import ArgumentParser from ply import lex, yacc diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h index c44a8fb3e418..72f03153dd32 100644 --- a/tools/include/uapi/linux/perf_event.h +++ b/tools/include/uapi/linux/perf_event.h @@ -2,7 +2,7 @@ /* * Performance events: * - * Copyright (C) 2008-2009, Thomas Gleixner + * Copyright (C) 2008-2009, Linutronix GmbH, Thomas Gleixner * Copyright (C) 2008-2011, Red Hat, Inc., Ingo Molnar * Copyright (C) 2008-2011, Red Hat, Inc., Peter Zijlstra * diff --git a/tools/perf/builtin-list.c b/tools/perf/builtin-list.c index 5cbca0bacd35..87a5491048ac 100644 --- a/tools/perf/builtin-list.c +++ 
b/tools/perf/builtin-list.c @@ -4,7 +4,7 @@ * * Builtin list command: list all event types * - * Copyright (C) 2009, Thomas Gleixner + * Copyright (C) 2009, Linutronix GmbH, Thomas Gleixner * Copyright (C) 2008-2009, Red Hat Inc, Ingo Molnar * Copyright (C) 2011, Red Hat Inc, Arnaldo Carvalho de Melo */ -- cgit v1.2.3 From 05f66cf5e7a5fc7c7227541f8a4a476037999916 Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Fri, 26 Dec 2025 19:39:38 +0800 Subject: PCI: Provide pci_free_irq_vectors() stub 473b9f331718 ("rust: pci: fix build failure when CONFIG_PCI_MSI is disabled") fixed a build error by providing Rust helpers when CONFIG_PCI_MSI is not set. However the Rust helpers rely on pci_free_irq_vectors(), which is only available when CONFIG_PCI=y. When CONFIG_PCI is not set, there is already a stub for pci_alloc_irq_vectors(). Add a similar stub for pci_free_irq_vectors(). Fixes: 473b9f331718 ("rust: pci: fix build failure when CONFIG_PCI_MSI is disabled") Reported-by: FUJITA Tomonori Closes: https://lore.kernel.org/rust-for-linux/20251209014312.575940-1-fujita.tomonori@gmail.com/ Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202512220740.4Kexm4dW-lkp@intel.com/ Reported-by: Liang Jie Closes: https://lore.kernel.org/rust-for-linux/20251222034415.1384223-1-buaajxlj@163.com/ Signed-off-by: Boqun Feng Signed-off-by: Bjorn Helgaas Reviewed-by: Drew Fustini Reviewed-by: David Gow Reviewed-by: Joel Fernandes Reviewed-by: Danilo Krummrich Link: https://patch.msgid.link/20251226113938.52145-1-boqun.feng@gmail.com --- include/linux/pci.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 864775651c6f..b5cc0c2b9906 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -2210,6 +2210,10 @@ pci_alloc_irq_vectors(struct pci_dev *dev, unsigned int min_vecs, { return -ENOSPC; } + +static inline void pci_free_irq_vectors(struct pci_dev *dev) +{ +} #endif /* CONFIG_PCI */ 
/* Include architecture-dependent settings and functions */ -- cgit v1.2.3 From dfdf774656205515b2d6ad94fce63c7ccbe92d91 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Fri, 9 Jan 2026 10:29:06 +0100 Subject: net: airoha: Fix typo in airoha_ppe_setup_tc_block_cb definition Fix Typo in airoha_ppe_dev_setup_tc_block_cb routine definition when CONFIG_NET_AIROHA is not enabled. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202601090517.Fj6v501r-lkp@intel.com/ Fixes: f45fc18b6de04 ("net: airoha: Add airoha_ppe_dev struct definition") Signed-off-by: Lorenzo Bianconi Link: https://patch.msgid.link/20260109-airoha_ppe_dev_setup_tc_block_cb-typo-v1-1-282e8834a9f9@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/soc/airoha/airoha_offload.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/soc/airoha/airoha_offload.h b/include/linux/soc/airoha/airoha_offload.h index ab64ecdf39a0..d01ef4a6b3d7 100644 --- a/include/linux/soc/airoha/airoha_offload.h +++ b/include/linux/soc/airoha/airoha_offload.h @@ -52,8 +52,8 @@ static inline void airoha_ppe_put_dev(struct airoha_ppe_dev *dev) { } -static inline int airoha_ppe_setup_tc_block_cb(struct airoha_ppe_dev *dev, - void *type_data) +static inline int airoha_ppe_dev_setup_tc_block_cb(struct airoha_ppe_dev *dev, + void *type_data) { return -EOPNOTSUPP; } -- cgit v1.2.3 From 1e0a2ba7afb1b60f02599093d84b72ce62ad11c0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 13 Jan 2026 10:50:41 +0100 Subject: sched: Provide idle_rq() helper A fix for the dl_server 'requires' idle_cpu() usage, which made me note that it and available_idle_cpu() are extern function calls. And while idle_cpu() is used outside of kernel/sched/, available_idle_cpu() is not. This makes it hard to make idle_cpu() an inline helper, so provide idle_rq() and implement idle_cpu() and available_idle_cpu() using that. 
Signed-off-by: Peter Zijlstra (Intel) --- include/linux/sched.h | 1 - kernel/sched/sched.h | 22 ++++++++++++++++++++++ kernel/sched/syscalls.c | 30 +----------------------------- 3 files changed, 23 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index d395f2810fac..da0133524d08 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1874,7 +1874,6 @@ static inline int task_nice(const struct task_struct *p) extern int can_nice(const struct task_struct *p, const int nice); extern int task_curr(const struct task_struct *p); extern int idle_cpu(int cpu); -extern int available_idle_cpu(int cpu); extern int sched_setscheduler(struct task_struct *, int, const struct sched_param *); extern int sched_setscheduler_nocheck(struct task_struct *, int, const struct sched_param *); extern void sched_set_fifo(struct task_struct *p); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index d30cca6870f5..e885a935b716 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1364,6 +1364,28 @@ static inline u32 sched_rng(void) #define cpu_curr(cpu) (cpu_rq(cpu)->curr) #define raw_rq() raw_cpu_ptr(&runqueues) +static inline bool idle_rq(struct rq *rq) +{ + return rq->curr == rq->idle && !rq->nr_running && !rq->ttwu_pending; +} + +/** + * available_idle_cpu - is a given CPU idle for enqueuing work. + * @cpu: the CPU in question. + * + * Return: 1 if the CPU is currently idle. 0 otherwise. 
+ */ +static inline bool available_idle_cpu(int cpu) +{ + if (!idle_rq(cpu_rq(cpu))) + return 0; + + if (vcpu_is_preempted(cpu)) + return 0; + + return 1; +} + #ifdef CONFIG_SCHED_PROXY_EXEC static inline void rq_set_donor(struct rq *rq, struct task_struct *t) { diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c index 0496dc29ed0f..cb337de679b8 100644 --- a/kernel/sched/syscalls.c +++ b/kernel/sched/syscalls.c @@ -180,35 +180,7 @@ int task_prio(const struct task_struct *p) */ int idle_cpu(int cpu) { - struct rq *rq = cpu_rq(cpu); - - if (rq->curr != rq->idle) - return 0; - - if (rq->nr_running) - return 0; - - if (rq->ttwu_pending) - return 0; - - return 1; -} - -/** - * available_idle_cpu - is a given CPU idle for enqueuing work. - * @cpu: the CPU in question. - * - * Return: 1 if the CPU is currently idle. 0 otherwise. - */ -int available_idle_cpu(int cpu) -{ - if (!idle_cpu(cpu)) - return 0; - - if (vcpu_is_preempted(cpu)) - return 0; - - return 1; + return idle_rq(cpu_rq(cpu)); } /** -- cgit v1.2.3 From 6626734dd2b151753e134730e27d17e64784c345 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Mon, 12 Jan 2026 15:46:37 +0000 Subject: mm_zone: Generalise has_managed_dma() It would be useful to be able to check for potential DMA pages beyond just ZONE_DMA - generalise the existing has_managed_dma() function to allow checking other zones too. 
Signed-off-by: Robin Murphy Acked-by: David Hildenbrand (Red Hat) Acked-by: Mike Rapoport (Microsoft) Tested-by: Vladimir Kondratiev Reviewed-by: Baoquan He Signed-off-by: Marek Szyprowski Link: https://lore.kernel.org/r/bd002d2351074e57be1ca08f03f333debac658fb.1768230104.git.robin.murphy@arm.com --- include/linux/mmzone.h | 9 +++++---- mm/page_alloc.c | 8 ++------ 2 files changed, 7 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 75ef7c9f9307..fc5d6c88d2f0 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1648,14 +1648,15 @@ static inline int is_highmem(const struct zone *zone) return is_highmem_idx(zone_idx(zone)); } -#ifdef CONFIG_ZONE_DMA -bool has_managed_dma(void); -#else +bool has_managed_zone(enum zone_type zone); static inline bool has_managed_dma(void) { +#ifdef CONFIG_ZONE_DMA + return has_managed_zone(ZONE_DMA); +#else return false; -} #endif +} #ifndef CONFIG_NUMA diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 822e05f1a964..36ccc85c5073 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7418,20 +7418,16 @@ bool put_page_back_buddy(struct page *page) } #endif -#ifdef CONFIG_ZONE_DMA -bool has_managed_dma(void) +bool has_managed_zone(enum zone_type zone) { struct pglist_data *pgdat; for_each_online_pgdat(pgdat) { - struct zone *zone = &pgdat->node_zones[ZONE_DMA]; - - if (managed_zone(zone)) + if (managed_zone(&pgdat->node_zones[zone])) return true; } return false; } -#endif /* CONFIG_ZONE_DMA */ #ifdef CONFIG_UNACCEPTED_MEMORY -- cgit v1.2.3 From e2fb7836b01747815f8bb94981c35f2688afb120 Mon Sep 17 00:00:00 2001 From: Bagas Sanjaya Date: Fri, 19 Dec 2025 08:40:04 +0700 Subject: mm: describe @flags parameter in memalloc_flags_save() Patch series "mm kernel-doc fixes". Here are kernel-doc fixes for mm subsystem. I'm also including textsearch fix since there's currently no maintainer for include/linux/textsearch.h (get_maintainer.pl only shows LKML). 
This patch (of 4): Sphinx reports kernel-doc warning: WARNING: ./include/linux/sched/mm.h:332 function parameter 'flags' not described in 'memalloc_flags_save' Describe @flags to fix it. Link: https://lkml.kernel.org/r/20251219014006.16328-2-bagasdotme@gmail.com Link: https://lkml.kernel.org/r/20251219014006.16328-3-bagasdotme@gmail.com Signed-off-by: Bagas Sanjaya Fixes: 3f6d5e6a468d ("mm: introduce memalloc_flags_{save,restore}") Acked-by: David Hildenbrand (Red Hat) Acked-by: Harry Yoo Signed-off-by: Andrew Morton --- include/linux/sched/mm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 0e1d73955fa5..95d0040df584 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -325,6 +325,7 @@ static inline void might_alloc(gfp_t gfp_mask) /** * memalloc_flags_save - Add a PF_* flag to current->flags, save old value + * @flags: Flags to add. * * This allows PF_* flags to be conveniently added, irrespective of current * value, and then the old version restored with memalloc_flags_restore(). -- cgit v1.2.3 From f26528478bb102c28e7ac0cbfc8ec8185afdafc7 Mon Sep 17 00:00:00 2001 From: Bagas Sanjaya Date: Fri, 19 Dec 2025 08:40:05 +0700 Subject: textsearch: describe @list member in ts_ops search Sphinx reports kernel-doc warning: WARNING: ./include/linux/textsearch.h:49 struct member 'list' not described in 'ts_ops' Describe @list member to fix it. Link: https://lkml.kernel.org/r/20251219014006.16328-4-bagasdotme@gmail.com Fixes: 2de4ff7bd658 ("[LIB]: Textsearch infrastructure.") Signed-off-by: Bagas Sanjaya Cc: Thomas Graf Cc: "David S. 
Miller" Signed-off-by: Andrew Morton --- include/linux/textsearch.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/textsearch.h b/include/linux/textsearch.h index 6673e4d4ac2e..4933777404d6 100644 --- a/include/linux/textsearch.h +++ b/include/linux/textsearch.h @@ -35,6 +35,7 @@ struct ts_state * @get_pattern: return head of pattern * @get_pattern_len: return length of pattern * @owner: module reference to algorithm + * @list: list to search */ struct ts_ops { -- cgit v1.2.3 From 6cfab50e1440fde19af7c614aacd85e11aa4dcea Mon Sep 17 00:00:00 2001 From: Bagas Sanjaya Date: Fri, 19 Dec 2025 08:40:07 +0700 Subject: mm, kfence: describe @slab parameter in __kfence_obj_info() Sphinx reports kernel-doc warning: WARNING: ./include/linux/kfence.h:220 function parameter 'slab' not described in '__kfence_obj_info' Fix it by describing @slab parameter. Link: https://lkml.kernel.org/r/20251219014006.16328-6-bagasdotme@gmail.com Fixes: 2dfe63e61cc3 ("mm, kfence: support kmem_dump_obj() for KFENCE objects") Signed-off-by: Bagas Sanjaya Acked-by: Marco Elver Acked-by: David Hildenbrand (Red Hat) Acked-by: Harry Yoo Signed-off-by: Andrew Morton --- include/linux/kfence.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/kfence.h b/include/linux/kfence.h index 0ad1ddbb8b99..e5822f6e7f27 100644 --- a/include/linux/kfence.h +++ b/include/linux/kfence.h @@ -211,6 +211,7 @@ struct kmem_obj_info; * __kfence_obj_info() - fill kmem_obj_info struct * @kpp: kmem_obj_info to be filled * @object: the object + * @slab: the slab * * Return: * * false - not a KFENCE object -- cgit v1.2.3 From e561383a39ed6e5c85a0b2369720743b694327ae Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Wed, 31 Dec 2025 16:03:09 +0800 Subject: powerpc/watchdog: add support for hardlockup_sys_info sysctl Commit a9af76a78760 ("watchdog: add sys_info sysctls to dump sys info on system lockup") adds 'hardlockup_sys_info' sysctl knob for
general kernel watchdog to control what kinds of system debug info to be dumped on hardlockup. Add similar support in powerpc watchdog code to make the sysctl knob more general, which also fixes a compiling warning in general watchdog code reported by 0day bot. Link: https://lkml.kernel.org/r/20251231080309.39642-1-feng.tang@linux.alibaba.com Fixes: a9af76a78760 ("watchdog: add sys_info sysctls to dump sys info on system lockup") Signed-off-by: Feng Tang Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202512030920.NFKtekA7-lkp@intel.com/ Suggested-by: Petr Mladek Reviewed-by: Petr Mladek Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Nicholas Piggin Signed-off-by: Andrew Morton --- arch/powerpc/kernel/watchdog.c | 15 ++++++++++----- include/linux/nmi.h | 1 + kernel/watchdog.c | 2 +- 3 files changed, 12 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c index 2429cb1c7baa..764001deb060 100644 --- a/arch/powerpc/kernel/watchdog.c +++ b/arch/powerpc/kernel/watchdog.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include @@ -235,7 +236,11 @@ static void watchdog_smp_panic(int cpu) pr_emerg("CPU %d TB:%lld, last SMP heartbeat TB:%lld (%lldms ago)\n", cpu, tb, last_reset, tb_to_ns(tb - last_reset) / 1000000); - if (!sysctl_hardlockup_all_cpu_backtrace) { + if (sysctl_hardlockup_all_cpu_backtrace || + (hardlockup_si_mask & SYS_INFO_ALL_BT)) { + trigger_allbutcpu_cpu_backtrace(cpu); + cpumask_clear(&wd_smp_cpus_ipi); + } else { /* * Try to trigger the stuck CPUs, unless we are going to * get a backtrace on all of them anyway. 
@@ -244,11 +249,9 @@ static void watchdog_smp_panic(int cpu) smp_send_nmi_ipi(c, wd_lockup_ipi, 1000000); __cpumask_clear_cpu(c, &wd_smp_cpus_ipi); } - } else { - trigger_allbutcpu_cpu_backtrace(cpu); - cpumask_clear(&wd_smp_cpus_ipi); } + sys_info(hardlockup_si_mask & ~SYS_INFO_ALL_BT); if (hardlockup_panic) nmi_panic(NULL, "Hard LOCKUP"); @@ -415,9 +418,11 @@ DEFINE_INTERRUPT_HANDLER_NMI(soft_nmi_interrupt) xchg(&__wd_nmi_output, 1); // see wd_lockup_ipi - if (sysctl_hardlockup_all_cpu_backtrace) + if (sysctl_hardlockup_all_cpu_backtrace || + (hardlockup_si_mask & SYS_INFO_ALL_BT)) trigger_allbutcpu_cpu_backtrace(cpu); + sys_info(hardlockup_si_mask & ~SYS_INFO_ALL_BT); if (hardlockup_panic) nmi_panic(regs, "Hard LOCKUP"); diff --git a/include/linux/nmi.h b/include/linux/nmi.h index cf3c6ab408aa..207156f2143c 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -83,6 +83,7 @@ static inline void reset_hung_task_detector(void) { } #if defined(CONFIG_HARDLOCKUP_DETECTOR) extern void hardlockup_detector_disable(void); extern unsigned int hardlockup_panic; +extern unsigned long hardlockup_si_mask; #else static inline void hardlockup_detector_disable(void) {} #endif diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 0685e3a8aa0a..366122f4a0f8 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -71,7 +71,7 @@ unsigned int __read_mostly hardlockup_panic = * hard lockup is detected, it could be task, memory, lock etc. * Refer include/linux/sys_info.h for detailed bit definition. */ -static unsigned long hardlockup_si_mask; +unsigned long hardlockup_si_mask; #ifdef CONFIG_SYSFS -- cgit v1.2.3 From 4650ff58a1b9ee68b2d3a207047998dd42e939b2 Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Fri, 9 Jan 2026 15:41:33 +0100 Subject: Revert "can: raw: instantly reject unsupported CAN frames" This reverts commit 1a620a723853a0f49703c317d52dc6b9602cbaa8 and its follow-up fixes for the introduced dependency issues. 
commit 1a620a723853 ("can: raw: instantly reject unsupported CAN frames") commit cb2dc6d2869a ("can: Kconfig: select CAN driver infrastructure by default") commit 6abd4577bccc ("can: fix build dependency") commit 5a5aff6338c0 ("can: fix build dependency") The entire problem was caused by the requirement that a new network layer feature needed to know about the protocol capabilities of the CAN devices. Instead of accessing CAN device internal data structures which caused the dependency problems a better approach has been developed which makes use of CAN specific ml_priv data which is accessible from both sides. Cc: Marc Kleine-Budde Cc: Arnd Bergmann Cc: Vincent Mailhol Signed-off-by: Oliver Hartkopp Link: https://patch.msgid.link/20260109144135.8495-2-socketcan@hartkopp.net Signed-off-by: Marc Kleine-Budde --- drivers/net/can/Kconfig | 7 ++++-- drivers/net/can/Makefile | 2 +- drivers/net/can/dev/Makefile | 5 ++-- include/linux/can/dev.h | 7 ------ net/can/raw.c | 54 +++++++------------------------------------- 5 files changed, 17 insertions(+), 58 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/can/Kconfig b/drivers/net/can/Kconfig index cfaea6178a71..e15e320db476 100644 --- a/drivers/net/can/Kconfig +++ b/drivers/net/can/Kconfig @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only menuconfig CAN_DEV - bool "CAN Device Drivers" + tristate "CAN Device Drivers" default y depends on CAN help @@ -17,7 +17,10 @@ menuconfig CAN_DEV virtual ones. If you own such devices or plan to use the virtual CAN interfaces to develop applications, say Y here. -if CAN_DEV && CAN + To compile as a module, choose M here: the module will be called + can-dev. 
+ +if CAN_DEV config CAN_VCAN tristate "Virtual Local CAN Interface (vcan)" diff --git a/drivers/net/can/Makefile b/drivers/net/can/Makefile index 37e2f1a2faec..d7bc10a6b8ea 100644 --- a/drivers/net/can/Makefile +++ b/drivers/net/can/Makefile @@ -7,7 +7,7 @@ obj-$(CONFIG_CAN_VCAN) += vcan.o obj-$(CONFIG_CAN_VXCAN) += vxcan.o obj-$(CONFIG_CAN_SLCAN) += slcan/ -obj-$(CONFIG_CAN_DEV) += dev/ +obj-y += dev/ obj-y += esd/ obj-y += rcar/ obj-y += rockchip/ diff --git a/drivers/net/can/dev/Makefile b/drivers/net/can/dev/Makefile index 64226acf0f3d..633687d6b6c0 100644 --- a/drivers/net/can/dev/Makefile +++ b/drivers/net/can/dev/Makefile @@ -1,8 +1,9 @@ # SPDX-License-Identifier: GPL-2.0 -obj-$(CONFIG_CAN) += can-dev.o +obj-$(CONFIG_CAN_DEV) += can-dev.o + +can-dev-y += skb.o -can-dev-$(CONFIG_CAN_DEV) += skb.o can-dev-$(CONFIG_CAN_CALC_BITTIMING) += calc_bittiming.o can-dev-$(CONFIG_CAN_NETLINK) += bittiming.o can-dev-$(CONFIG_CAN_NETLINK) += dev.o diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h index f6416a56e95d..52c8be5c160e 100644 --- a/include/linux/can/dev.h +++ b/include/linux/can/dev.h @@ -111,14 +111,7 @@ struct net_device *alloc_candev_mqs(int sizeof_priv, unsigned int echo_skb_max, void free_candev(struct net_device *dev); /* a candev safe wrapper around netdev_priv */ -#if IS_ENABLED(CONFIG_CAN_NETLINK) struct can_priv *safe_candev_priv(struct net_device *dev); -#else -static inline struct can_priv *safe_candev_priv(struct net_device *dev) -{ - return NULL; -} -#endif int open_candev(struct net_device *dev); void close_candev(struct net_device *dev); diff --git a/net/can/raw.c b/net/can/raw.c index be1ef7cf4204..f36a83d3447c 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -892,58 +892,20 @@ static void raw_put_canxl_vcid(struct raw_sock *ro, struct sk_buff *skb) } } -static inline bool raw_dev_cc_enabled(struct net_device *dev, - struct can_priv *priv) +static unsigned int raw_check_txframe(struct raw_sock *ro, struct sk_buff *skb, int mtu) { 
- /* The CANXL-only mode disables error-signalling on the CAN bus - * which is needed to send CAN CC/FD frames - */ - if (priv) - return !can_dev_in_xl_only_mode(priv); - - /* virtual CAN interfaces always support CAN CC */ - return true; -} - -static inline bool raw_dev_fd_enabled(struct net_device *dev, - struct can_priv *priv) -{ - /* check FD ctrlmode on real CAN interfaces */ - if (priv) - return (priv->ctrlmode & CAN_CTRLMODE_FD); - - /* check MTU for virtual CAN FD interfaces */ - return (READ_ONCE(dev->mtu) >= CANFD_MTU); -} - -static inline bool raw_dev_xl_enabled(struct net_device *dev, - struct can_priv *priv) -{ - /* check XL ctrlmode on real CAN interfaces */ - if (priv) - return (priv->ctrlmode & CAN_CTRLMODE_XL); - - /* check MTU for virtual CAN XL interfaces */ - return can_is_canxl_dev_mtu(READ_ONCE(dev->mtu)); -} - -static unsigned int raw_check_txframe(struct raw_sock *ro, struct sk_buff *skb, - struct net_device *dev) -{ - struct can_priv *priv = safe_candev_priv(dev); - - /* Classical CAN */ - if (can_is_can_skb(skb) && raw_dev_cc_enabled(dev, priv)) + /* Classical CAN -> no checks for flags and device capabilities */ + if (can_is_can_skb(skb)) return CAN_MTU; - /* CAN FD */ + /* CAN FD -> needs to be enabled and a CAN FD or CAN XL device */ if (ro->fd_frames && can_is_canfd_skb(skb) && - raw_dev_fd_enabled(dev, priv)) + (mtu == CANFD_MTU || can_is_canxl_dev_mtu(mtu))) return CANFD_MTU; - /* CAN XL */ + /* CAN XL -> needs to be enabled and a CAN XL device */ if (ro->xl_frames && can_is_canxl_skb(skb) && - raw_dev_xl_enabled(dev, priv)) + can_is_canxl_dev_mtu(mtu)) return CANXL_MTU; return 0; @@ -999,7 +961,7 @@ static int raw_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) err = -EINVAL; /* check for valid CAN (CC/FD/XL) frame content */ - txmtu = raw_check_txframe(ro, skb, dev); + txmtu = raw_check_txframe(ro, skb, READ_ONCE(dev->mtu)); if (!txmtu) goto free_skb; -- cgit v1.2.3 From 166e87329ce6f1eaa3475ba2d14ed30e54727c0d Mon 
Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Fri, 9 Jan 2026 15:41:34 +0100 Subject: can: propagate CAN device capabilities via ml_priv Commit 1a620a723853 ("can: raw: instantly reject unsupported CAN frames") caused a sequence of dependency and linker fixes. Instead of accessing CAN device internal data structures which caused the dependency problems this patch introduces capability information into the CAN specific ml_priv data which is accessible from both sides. With this change the CAN network layer can check the required features and the decoupling of the driver layer and network layer is restored. Fixes: 1a620a723853 ("can: raw: instantly reject unsupported CAN frames") Cc: Marc Kleine-Budde Cc: Arnd Bergmann Cc: Vincent Mailhol Signed-off-by: Oliver Hartkopp Link: https://patch.msgid.link/20260109144135.8495-3-socketcan@hartkopp.net Signed-off-by: Marc Kleine-Budde --- drivers/net/can/dev/dev.c | 27 +++++++++++++++++++++++++++ drivers/net/can/dev/netlink.c | 1 + drivers/net/can/vcan.c | 15 +++++++++++++++ drivers/net/can/vxcan.c | 15 +++++++++++++++ include/linux/can/can-ml.h | 24 ++++++++++++++++++++++++ include/linux/can/dev.h | 1 + 6 files changed, 83 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/can/dev/dev.c b/drivers/net/can/dev/dev.c index 091f30e94c61..7ab9578f5b89 100644 --- a/drivers/net/can/dev/dev.c +++ b/drivers/net/can/dev/dev.c @@ -375,6 +375,32 @@ void can_set_default_mtu(struct net_device *dev) } } +void can_set_cap_info(struct net_device *dev) +{ + struct can_priv *priv = netdev_priv(dev); + u32 can_cap; + + if (can_dev_in_xl_only_mode(priv)) { + /* XL only mode => no CC/FD capability */ + can_cap = CAN_CAP_XL; + } else { + /* mixed mode => CC + FD/XL capability */ + can_cap = CAN_CAP_CC; + + if (priv->ctrlmode & CAN_CTRLMODE_FD) + can_cap |= CAN_CAP_FD; + + if (priv->ctrlmode & CAN_CTRLMODE_XL) + can_cap |= CAN_CAP_XL; + } + + if (priv->ctrlmode & (CAN_CTRLMODE_LISTENONLY | + CAN_CTRLMODE_RESTRICTED)) + can_cap 
|= CAN_CAP_RO; + + can_set_cap(dev, can_cap); +} + /* helper to define static CAN controller features at device creation time */ int can_set_static_ctrlmode(struct net_device *dev, u32 static_mode) { @@ -390,6 +416,7 @@ int can_set_static_ctrlmode(struct net_device *dev, u32 static_mode) /* override MTU which was set by default in can_setup()? */ can_set_default_mtu(dev); + can_set_cap_info(dev); return 0; } diff --git a/drivers/net/can/dev/netlink.c b/drivers/net/can/dev/netlink.c index d6b0e686fb11..0498198a4696 100644 --- a/drivers/net/can/dev/netlink.c +++ b/drivers/net/can/dev/netlink.c @@ -377,6 +377,7 @@ static int can_ctrlmode_changelink(struct net_device *dev, } can_set_default_mtu(dev); + can_set_cap_info(dev); return 0; } diff --git a/drivers/net/can/vcan.c b/drivers/net/can/vcan.c index fdc662aea279..76e6b7b5c6a1 100644 --- a/drivers/net/can/vcan.c +++ b/drivers/net/can/vcan.c @@ -130,6 +130,19 @@ static netdev_tx_t vcan_tx(struct sk_buff *skb, struct net_device *dev) return NETDEV_TX_OK; } +static void vcan_set_cap_info(struct net_device *dev) +{ + u32 can_cap = CAN_CAP_CC; + + if (dev->mtu > CAN_MTU) + can_cap |= CAN_CAP_FD; + + if (dev->mtu >= CANXL_MIN_MTU) + can_cap |= CAN_CAP_XL; + + can_set_cap(dev, can_cap); +} + static int vcan_change_mtu(struct net_device *dev, int new_mtu) { /* Do not allow changing the MTU while running */ @@ -141,6 +154,7 @@ static int vcan_change_mtu(struct net_device *dev, int new_mtu) return -EINVAL; WRITE_ONCE(dev->mtu, new_mtu); + vcan_set_cap_info(dev); return 0; } @@ -162,6 +176,7 @@ static void vcan_setup(struct net_device *dev) dev->tx_queue_len = 0; dev->flags = IFF_NOARP; can_set_ml_priv(dev, netdev_priv(dev)); + vcan_set_cap_info(dev); /* set flags according to driver capabilities */ if (echo) diff --git a/drivers/net/can/vxcan.c b/drivers/net/can/vxcan.c index b2c19f8c5f8e..f14c6f02b662 100644 --- a/drivers/net/can/vxcan.c +++ b/drivers/net/can/vxcan.c @@ -125,6 +125,19 @@ static int vxcan_get_iflink(const 
struct net_device *dev) return iflink; } +static void vxcan_set_cap_info(struct net_device *dev) +{ + u32 can_cap = CAN_CAP_CC; + + if (dev->mtu > CAN_MTU) + can_cap |= CAN_CAP_FD; + + if (dev->mtu >= CANXL_MIN_MTU) + can_cap |= CAN_CAP_XL; + + can_set_cap(dev, can_cap); +} + static int vxcan_change_mtu(struct net_device *dev, int new_mtu) { /* Do not allow changing the MTU while running */ @@ -136,6 +149,7 @@ static int vxcan_change_mtu(struct net_device *dev, int new_mtu) return -EINVAL; WRITE_ONCE(dev->mtu, new_mtu); + vxcan_set_cap_info(dev); return 0; } @@ -167,6 +181,7 @@ static void vxcan_setup(struct net_device *dev) can_ml = netdev_priv(dev) + ALIGN(sizeof(struct vxcan_priv), NETDEV_ALIGN); can_set_ml_priv(dev, can_ml); + vxcan_set_cap_info(dev); } /* forward declaration for rtnl_create_link() */ diff --git a/include/linux/can/can-ml.h b/include/linux/can/can-ml.h index 8afa92d15a66..1e99fda2b380 100644 --- a/include/linux/can/can-ml.h +++ b/include/linux/can/can-ml.h @@ -46,6 +46,12 @@ #include #include +/* exposed CAN device capabilities for network layer */ +#define CAN_CAP_CC BIT(0) /* CAN CC aka Classical CAN */ +#define CAN_CAP_FD BIT(1) /* CAN FD */ +#define CAN_CAP_XL BIT(2) /* CAN XL */ +#define CAN_CAP_RO BIT(3) /* read-only mode (LISTEN/RESTRICTED) */ + #define CAN_SFF_RCV_ARRAY_SZ (1 << CAN_SFF_ID_BITS) #define CAN_EFF_RCV_HASH_BITS 10 #define CAN_EFF_RCV_ARRAY_SZ (1 << CAN_EFF_RCV_HASH_BITS) @@ -64,6 +70,7 @@ struct can_ml_priv { #ifdef CAN_J1939 struct j1939_priv *j1939_priv; #endif + u32 can_cap; }; static inline struct can_ml_priv *can_get_ml_priv(struct net_device *dev) @@ -77,4 +84,21 @@ static inline void can_set_ml_priv(struct net_device *dev, netdev_set_ml_priv(dev, ml_priv, ML_PRIV_CAN); } +static inline bool can_cap_enabled(struct net_device *dev, u32 cap) +{ + struct can_ml_priv *can_ml = can_get_ml_priv(dev); + + if (!can_ml) + return false; + + return (can_ml->can_cap & cap); +} + +static inline void can_set_cap(struct net_device 
*dev, u32 cap) +{ + struct can_ml_priv *can_ml = can_get_ml_priv(dev); + + can_ml->can_cap = cap; +} + #endif /* CAN_ML_H */ diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h index 52c8be5c160e..6d0710d6f571 100644 --- a/include/linux/can/dev.h +++ b/include/linux/can/dev.h @@ -116,6 +116,7 @@ struct can_priv *safe_candev_priv(struct net_device *dev); int open_candev(struct net_device *dev); void close_candev(struct net_device *dev); void can_set_default_mtu(struct net_device *dev); +void can_set_cap_info(struct net_device *dev); int __must_check can_set_static_ctrlmode(struct net_device *dev, u32 static_mode); int can_hwtstamp_get(struct net_device *netdev, -- cgit v1.2.3 From a995fe1a3aa78b7d06cc1cc7b6b8436c5e93b07f Mon Sep 17 00:00:00 2001 From: Danilo Krummrich Date: Wed, 7 Jan 2026 11:35:05 +0100 Subject: rust: driver: drop device private data post unbind Currently, the driver's device private data is allocated and initialized from driver core code called from bus abstractions after the driver's probe() callback returned the corresponding initializer. Similarly, the driver's device private data is dropped within the remove() callback of bus abstractions after calling the remove() callback of the corresponding driver. However, commit 6f61a2637abe ("rust: device: introduce Device::drvdata()") introduced an accessor for the driver's device private data for a Device, i.e. a device that is currently bound to a driver. Obviously, this is in conflict with dropping the driver's device private data in remove(), since a device can not be considered to be fully unbound after remove() has finished: We also have to consider registrations guarded by devres - such as IRQ or class device registrations - which are torn down after remove() in devres_release_all(). 
Thus, it can happen that, for instance, a class device or IRQ callback still calls Device::drvdata(), which then runs concurrently to remove() (which sets dev->driver_data to NULL and drops the driver's device private data), before devres_release_all() started to tear down the corresponding registration. This is because devres guarded registrations can, as expected, access the corresponding Device that defines their scope. In C it simply is the driver's responsibility to ensure that its device private data is freed after e.g. an IRQ registration is unregistered. Typically, C drivers achieve this by allocating their device private data with e.g. devm_kzalloc() before doing anything else, i.e. before e.g. registering an IRQ with devm_request_threaded_irq(), relying on the reverse order cleanup of devres. Technically, we could do something similar in Rust. However, the resulting code would be pretty messy: In Rust we have to differentiate between allocated but uninitialized memory and initialized memory in the type system. Thus, we would need to somehow keep track of whether the driver's device private data object has been initialized (i.e. probe() was successful and returned a valid initializer for this memory) and conditionally call the destructor of the corresponding object when it is freed. This is because we'd need to allocate and register the memory of the driver's device private data *before* it is initialized by the initializer returned by the driver's probe() callback, because the driver could already register devres guarded registrations within probe() outside of the driver's device private data initializer. Luckily there is a much simpler solution: Instead of dropping the driver's device private data at the end of remove(), we just drop it after the device has been fully unbound, i.e. after all devres callbacks have been processed. For this, we introduce a new post_unbind() callback private to the driver-core, i.e. 
the callback is neither exposed to drivers, nor to bus abstractions. This way, the driver-core code can simply continue to conditionally allocate the memory for the driver's device private data when the driver's initializer is returned from probe() - no change needed - and drop it when the driver-core code receives the post_unbind() callback. Closes: https://lore.kernel.org/all/DEZMS6Y4A7XE.XE7EUBT5SJFJ@kernel.org/ Fixes: 6f61a2637abe ("rust: device: introduce Device::drvdata()") Acked-by: Alice Ryhl Acked-by: Greg Kroah-Hartman Acked-by: Igor Korotin Link: https://patch.msgid.link/20260107103511.570525-7-dakr@kernel.org [ Remove #ifdef CONFIG_RUST, rename post_unbind() to post_unbind_rust(). - Danilo] Signed-off-by: Danilo Krummrich --- drivers/base/dd.c | 2 ++ include/linux/device/driver.h | 9 +++++++++ rust/kernel/auxiliary.rs | 4 ++-- rust/kernel/device.rs | 20 +++++++++++--------- rust/kernel/driver.rs | 36 +++++++++++++++++++++++++++++++++++- rust/kernel/i2c.rs | 4 ++-- rust/kernel/pci.rs | 4 ++-- rust/kernel/platform.rs | 4 ++-- rust/kernel/usb.rs | 4 ++-- 9 files changed, 67 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/dd.c b/drivers/base/dd.c index 349f31bedfa1..bea8da5f8a3a 100644 --- a/drivers/base/dd.c +++ b/drivers/base/dd.c @@ -548,6 +548,8 @@ static DEVICE_ATTR_RW(state_synced); static void device_unbind_cleanup(struct device *dev) { devres_release_all(dev); + if (dev->driver->p_cb.post_unbind_rust) + dev->driver->p_cb.post_unbind_rust(dev); arch_teardown_dma_ops(dev); kfree(dev->dma_range_map); dev->dma_range_map = NULL; diff --git a/include/linux/device/driver.h b/include/linux/device/driver.h index cd8e0f0a634b..bbc67ec513ed 100644 --- a/include/linux/device/driver.h +++ b/include/linux/device/driver.h @@ -85,6 +85,8 @@ enum probe_type { * uevent. * @p: Driver core's private data, no one other than the driver * core can touch this. 
+ * @p_cb: Callbacks private to the driver core; no one other than the + * driver core is allowed to touch this. * * The device driver-model tracks all of the drivers known to the system. * The main reason for this tracking is to enable the driver core to match @@ -119,6 +121,13 @@ struct device_driver { void (*coredump) (struct device *dev); struct driver_private *p; + struct { + /* + * Called after remove() and after all devres entries have been + * processed. This is a Rust only callback. + */ + void (*post_unbind_rust)(struct device *dev); + } p_cb; }; diff --git a/rust/kernel/auxiliary.rs b/rust/kernel/auxiliary.rs index 17574aa5066f..be76f11aecb7 100644 --- a/rust/kernel/auxiliary.rs +++ b/rust/kernel/auxiliary.rs @@ -96,9 +96,9 @@ impl Adapter { // SAFETY: `remove_callback` is only ever called after a successful call to // `probe_callback`, hence it's guaranteed that `Device::set_drvdata()` has been called // and stored a `Pin>`. - let data = unsafe { adev.as_ref().drvdata_obtain::() }; + let data = unsafe { adev.as_ref().drvdata_borrow::() }; - T::unbind(adev, data.as_ref()); + T::unbind(adev, data); } } diff --git a/rust/kernel/device.rs b/rust/kernel/device.rs index 71b200df0f40..031720bf5d8c 100644 --- a/rust/kernel/device.rs +++ b/rust/kernel/device.rs @@ -232,30 +232,32 @@ impl Device { /// /// # Safety /// - /// - Must only be called once after a preceding call to [`Device::set_drvdata`]. /// - The type `T` must match the type of the `ForeignOwnable` previously stored by /// [`Device::set_drvdata`]. - pub unsafe fn drvdata_obtain(&self) -> Pin> { + pub(crate) unsafe fn drvdata_obtain(&self) -> Option>> { // SAFETY: By the type invariants, `self.as_raw()` is a valid pointer to a `struct device`. let ptr = unsafe { bindings::dev_get_drvdata(self.as_raw()) }; // SAFETY: By the type invariants, `self.as_raw()` is a valid pointer to a `struct device`. 
unsafe { bindings::dev_set_drvdata(self.as_raw(), core::ptr::null_mut()) }; + if ptr.is_null() { + return None; + } + // SAFETY: - // - By the safety requirements of this function, `ptr` comes from a previous call to - // `into_foreign()`. + // - If `ptr` is not NULL, it comes from a previous call to `into_foreign()`. // - `dev_get_drvdata()` guarantees to return the same pointer given to `dev_set_drvdata()` // in `into_foreign()`. - unsafe { Pin::>::from_foreign(ptr.cast()) } + Some(unsafe { Pin::>::from_foreign(ptr.cast()) }) } /// Borrow the driver's private data bound to this [`Device`]. /// /// # Safety /// - /// - Must only be called after a preceding call to [`Device::set_drvdata`] and before - /// [`Device::drvdata_obtain`]. + /// - Must only be called after a preceding call to [`Device::set_drvdata`] and before the + /// device is fully unbound. /// - The type `T` must match the type of the `ForeignOwnable` previously stored by /// [`Device::set_drvdata`]. pub unsafe fn drvdata_borrow(&self) -> Pin<&T> { @@ -271,7 +273,7 @@ impl Device { /// # Safety /// /// - Must only be called after a preceding call to [`Device::set_drvdata`] and before - /// [`Device::drvdata_obtain`]. + /// the device is fully unbound. /// - The type `T` must match the type of the `ForeignOwnable` previously stored by /// [`Device::set_drvdata`]. unsafe fn drvdata_unchecked(&self) -> Pin<&T> { @@ -320,7 +322,7 @@ impl Device { // SAFETY: // - The above check of `dev_get_drvdata()` guarantees that we are called after - // `set_drvdata()` and before `drvdata_obtain()`. + // `set_drvdata()`. // - We've just checked that the type of the driver's private data is in fact `T`. Ok(unsafe { self.drvdata_unchecked() }) } diff --git a/rust/kernel/driver.rs b/rust/kernel/driver.rs index ba1ca1f7a7e2..bee3ae21a27b 100644 --- a/rust/kernel/driver.rs +++ b/rust/kernel/driver.rs @@ -177,7 +177,39 @@ unsafe impl Sync for Registration {} // any thread, so `Registration` is `Send`. 
unsafe impl Send for Registration {} -impl Registration { +impl Registration { + extern "C" fn post_unbind_callback(dev: *mut bindings::device) { + // SAFETY: The driver core only ever calls the post unbind callback with a valid pointer to + // a `struct device`. + // + // INVARIANT: `dev` is valid for the duration of the `post_unbind_callback()`. + let dev = unsafe { &*dev.cast::>() }; + + // `remove()` and all devres callbacks have been completed at this point, hence drop the + // driver's device private data. + // + // SAFETY: By the safety requirements of the `Driver` trait, `T::DriverData` is the + // driver's device private data type. + drop(unsafe { dev.drvdata_obtain::() }); + } + + /// Attach generic `struct device_driver` callbacks. + fn callbacks_attach(drv: &Opaque) { + let ptr = drv.get().cast::(); + + // SAFETY: + // - `drv.get()` yields a valid pointer to `Self::DriverType`. + // - Adding `DEVICE_DRIVER_OFFSET` yields the address of the embedded `struct device_driver` + // as guaranteed by the safety requirements of the `Driver` trait. + let base = unsafe { ptr.add(T::DEVICE_DRIVER_OFFSET) }; + + // CAST: `base` points to the offset of the embedded `struct device_driver`. + let base = base.cast::(); + + // SAFETY: It is safe to set the fields of `struct device_driver` on initialization. + unsafe { (*base).p_cb.post_unbind_rust = Some(Self::post_unbind_callback) }; + } + /// Creates a new instance of the registration object. pub fn new(name: &'static CStr, module: &'static ThisModule) -> impl PinInit { try_pin_init!(Self { @@ -189,6 +221,8 @@ impl Registration { // just been initialised above, so it's also valid for read. let drv = unsafe { &*(ptr as *const Opaque) }; + Self::callbacks_attach(drv); + // SAFETY: `drv` is guaranteed to be pinned until `T::unregister`. 
unsafe { T::register(drv, name, module) } }), diff --git a/rust/kernel/i2c.rs b/rust/kernel/i2c.rs index e86242227081..39b0a9a207fd 100644 --- a/rust/kernel/i2c.rs +++ b/rust/kernel/i2c.rs @@ -178,9 +178,9 @@ impl Adapter { // SAFETY: `remove_callback` is only ever called after a successful call to // `probe_callback`, hence it's guaranteed that `I2cClient::set_drvdata()` has been called // and stored a `Pin>`. - let data = unsafe { idev.as_ref().drvdata_obtain::() }; + let data = unsafe { idev.as_ref().drvdata_borrow::() }; - T::unbind(idev, data.as_ref()); + T::unbind(idev, data); } extern "C" fn shutdown_callback(idev: *mut bindings::i2c_client) { diff --git a/rust/kernel/pci.rs b/rust/kernel/pci.rs index 590723dcb5ae..bea76ca9c3da 100644 --- a/rust/kernel/pci.rs +++ b/rust/kernel/pci.rs @@ -123,9 +123,9 @@ impl Adapter { // SAFETY: `remove_callback` is only ever called after a successful call to // `probe_callback`, hence it's guaranteed that `Device::set_drvdata()` has been called // and stored a `Pin>`. - let data = unsafe { pdev.as_ref().drvdata_obtain::() }; + let data = unsafe { pdev.as_ref().drvdata_borrow::() }; - T::unbind(pdev, data.as_ref()); + T::unbind(pdev, data); } } diff --git a/rust/kernel/platform.rs b/rust/kernel/platform.rs index b8a681df9ddc..35a5813ffb33 100644 --- a/rust/kernel/platform.rs +++ b/rust/kernel/platform.rs @@ -101,9 +101,9 @@ impl Adapter { // SAFETY: `remove_callback` is only ever called after a successful call to // `probe_callback`, hence it's guaranteed that `Device::set_drvdata()` has been called // and stored a `Pin>`. 
- let data = unsafe { pdev.as_ref().drvdata_obtain::() }; + let data = unsafe { pdev.as_ref().drvdata_borrow::() }; - T::unbind(pdev, data.as_ref()); + T::unbind(pdev, data); } } diff --git a/rust/kernel/usb.rs b/rust/kernel/usb.rs index 4cf4bb1705b5..67ce5c85c619 100644 --- a/rust/kernel/usb.rs +++ b/rust/kernel/usb.rs @@ -103,9 +103,9 @@ impl Adapter { // SAFETY: `disconnect_callback` is only ever called after a successful call to // `probe_callback`, hence it's guaranteed that `Device::set_drvdata()` has been called // and stored a `Pin>`. - let data = unsafe { dev.drvdata_obtain::() }; + let data = unsafe { dev.drvdata_borrow::() }; - T::disconnect(intf, data.as_ref()); + T::disconnect(intf, data); } } -- cgit v1.2.3 From 6ac433f8b2590b09ca00863d218665729ac985f7 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Wed, 24 Dec 2025 12:33:57 -0500 Subject: mm: rename cpu_bitmap field to flexible_array MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The cpu_bitmap flexible array now contains more than just the cpu_bitmap. In preparation for changing the static mm_struct definitions to cover for the additional space required, change the cpu_bitmap type from "unsigned long" to "char", require an unsigned long alignment of the flexible array, and rename the field from "cpu_bitmap" to "flexible_array". Introduce the MM_STRUCT_FLEXIBLE_ARRAY_INIT macro to statically initialize the flexible array. This covers the init_mm and efi_mm static definitions. This is a preparation step for fixing the missing mm_cid size for static mm_struct definitions. 
Link: https://lkml.kernel.org/r/20251224173358.647691-3-mathieu.desnoyers@efficios.com Fixes: af7f588d8f73 ("sched: Introduce per-memory-map concurrency ID") Signed-off-by: Mathieu Desnoyers Reviewed-by: Thomas Gleixner Cc: Mark Brown Cc: Aboorva Devarajan Cc: Al Viro Cc: Baolin Wang Cc: Christan König Cc: Christian Brauner Cc: Christoph Lameter Cc: David Hildenbrand Cc: David Rientjes Cc: Dennis Zhou Cc: Johannes Weiner Cc: "Liam R . Howlett" Cc: Lorenzo Stoakes Cc: Martin Liu Cc: Masami Hiramatsu Cc: Mateusz Guzik Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Michal Hocko Cc: Mike Rapoport Cc: "Paul E. McKenney" Cc: Roman Gushchin Cc: SeongJae Park Cc: Shakeel Butt Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Sweet Tea Dorminy Cc: Tejun Heo Cc: Vlastimil Babka Cc: Wei Yang Cc: Yu Zhao Cc: Peter Zijlstra (Intel) Cc: Signed-off-by: Andrew Morton --- drivers/firmware/efi/efi.c | 2 +- include/linux/mm_types.h | 13 +++++++++---- mm/init-mm.c | 2 +- 3 files changed, 11 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index f5ff6e84a9b7..17b5f3415465 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -74,10 +74,10 @@ struct mm_struct efi_mm = { .page_table_lock = __SPIN_LOCK_UNLOCKED(efi_mm.page_table_lock), .mmlist = LIST_HEAD_INIT(efi_mm.mmlist), .user_ns = &init_user_ns, - .cpu_bitmap = { [BITS_TO_LONGS(NR_CPUS)] = 0}, #ifdef CONFIG_SCHED_MM_CID .mm_cid.lock = __RAW_SPIN_LOCK_UNLOCKED(efi_mm.mm_cid.lock), #endif + .flexible_array = MM_STRUCT_FLEXIBLE_ARRAY_INIT, }; struct workqueue_struct *efi_rts_wq; diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 42af2292951d..110b319a2ffb 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1329,7 +1329,7 @@ struct mm_struct { * The mm_cpumask needs to be at the end of mm_struct, because it * is dynamically sized based on nr_cpu_ids. 
*/ - unsigned long cpu_bitmap[]; + char flexible_array[] __aligned(__alignof__(unsigned long)); }; /* Copy value to the first system word of mm flags, non-atomically. */ @@ -1366,19 +1366,24 @@ static inline void __mm_flags_set_mask_bits_word(struct mm_struct *mm, MT_FLAGS_USE_RCU) extern struct mm_struct init_mm; +#define MM_STRUCT_FLEXIBLE_ARRAY_INIT \ +{ \ + [0 ... sizeof(cpumask_t)-1] = 0 \ +} + /* Pointer magic because the dynamic array size confuses some compilers. */ static inline void mm_init_cpumask(struct mm_struct *mm) { unsigned long cpu_bitmap = (unsigned long)mm; - cpu_bitmap += offsetof(struct mm_struct, cpu_bitmap); + cpu_bitmap += offsetof(struct mm_struct, flexible_array); cpumask_clear((struct cpumask *)cpu_bitmap); } /* Future-safe accessor for struct mm_struct's cpu_vm_mask. */ static inline cpumask_t *mm_cpumask(struct mm_struct *mm) { - return (struct cpumask *)&mm->cpu_bitmap; + return (struct cpumask *)&mm->flexible_array; } #ifdef CONFIG_LRU_GEN @@ -1469,7 +1474,7 @@ static inline cpumask_t *mm_cpus_allowed(struct mm_struct *mm) { unsigned long bitmap = (unsigned long)mm; - bitmap += offsetof(struct mm_struct, cpu_bitmap); + bitmap += offsetof(struct mm_struct, flexible_array); /* Skip cpu_bitmap */ bitmap += cpumask_size(); return (struct cpumask *)bitmap; diff --git a/mm/init-mm.c b/mm/init-mm.c index a514f8ce47e3..c5556bb9d5f0 100644 --- a/mm/init-mm.c +++ b/mm/init-mm.c @@ -47,7 +47,7 @@ struct mm_struct init_mm = { #ifdef CONFIG_SCHED_MM_CID .mm_cid.lock = __RAW_SPIN_LOCK_UNLOCKED(init_mm.mm_cid.lock), #endif - .cpu_bitmap = CPU_BITS_NONE, + .flexible_array = MM_STRUCT_FLEXIBLE_ARRAY_INIT, INIT_MM_CONTEXT(init_mm) }; -- cgit v1.2.3 From be31340a4cc259340044b7fc4f7e97f58c74ee8e Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Wed, 24 Dec 2025 12:33:58 -0500 Subject: mm: take into account mm_cid size for mm_struct static definitions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Both 
init_mm and efi_mm static definitions need to make room for the 2 mm_cid cpumasks. This fixes possible out-of-bounds accesses to init_mm and efi_mm. Add a space between # and define for the mm_alloc_cid() definition to make it consistent with the coding style used in the rest of this header file. Link: https://lkml.kernel.org/r/20251224173358.647691-4-mathieu.desnoyers@efficios.com Fixes: af7f588d8f73 ("sched: Introduce per-memory-map concurrency ID") Signed-off-by: Mathieu Desnoyers Reviewed-by: Thomas Gleixner Cc: Mark Brown Cc: Aboorva Devarajan Cc: Al Viro Cc: Baolin Wang Cc: Christan König Cc: Christian Brauner Cc: Christoph Lameter Cc: David Hildenbrand Cc: David Rientjes Cc: Dennis Zhou Cc: Johannes Weiner Cc: "Liam R . Howlett" Cc: Lorenzo Stoakes Cc: Martin Liu Cc: Masami Hiramatsu Cc: Mateusz Guzik Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Michal Hocko Cc: Mike Rapoport Cc: "Paul E. McKenney" Cc: Roman Gushchin Cc: SeongJae Park Cc: Shakeel Butt Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Sweet Tea Dorminy Cc: Tejun Heo Cc: Vlastimil Babka Cc: Wei Yang Cc: Yu Zhao Cc: Peter Zijlstra (Intel) Cc: Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 110b319a2ffb..aa4639888f89 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1368,7 +1368,7 @@ extern struct mm_struct init_mm; #define MM_STRUCT_FLEXIBLE_ARRAY_INIT \ { \ - [0 ... sizeof(cpumask_t)-1] = 0 \ + [0 ... sizeof(cpumask_t) + MM_CID_STATIC_SIZE - 1] = 0 \ } /* Pointer magic because the dynamic array size confuses some compilers. */ @@ -1500,7 +1500,7 @@ static inline int mm_alloc_cid_noprof(struct mm_struct *mm, struct task_struct * mm_init_cid(mm, p); return 0; } -#define mm_alloc_cid(...) alloc_hooks(mm_alloc_cid_noprof(__VA_ARGS__)) +# define mm_alloc_cid(...) 
alloc_hooks(mm_alloc_cid_noprof(__VA_ARGS__)) static inline void mm_destroy_cid(struct mm_struct *mm) { @@ -1514,6 +1514,8 @@ static inline unsigned int mm_cid_size(void) return cpumask_size() + bitmap_size(num_possible_cpus()); } +/* Use 2 * NR_CPUS as worse case for static allocation. */ +# define MM_CID_STATIC_SIZE (2 * sizeof(cpumask_t)) #else /* CONFIG_SCHED_MM_CID */ static inline void mm_init_cid(struct mm_struct *mm, struct task_struct *p) { } static inline int mm_alloc_cid(struct mm_struct *mm, struct task_struct *p) { return 0; } @@ -1522,6 +1524,7 @@ static inline unsigned int mm_cid_size(void) { return 0; } +# define MM_CID_STATIC_SIZE 0 #endif /* CONFIG_SCHED_MM_CID */ struct mmu_gather; -- cgit v1.2.3 From f9a49aa302a05e91ca01f69031cb79a0ea33031f Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Mon, 5 Jan 2026 13:17:27 -0800 Subject: fs/writeback: skip AS_NO_DATA_INTEGRITY mappings in wait_sb_inodes() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Above the while() loop in wait_sb_inodes(), we document that we must wait for all pages under writeback for data integrity. Consequently, if a mapping, like fuse, traditionally does not have data integrity semantics, there is no need to wait at all; we can simply skip these inodes. This restores fuse back to prior behavior where syncs are no-ops. This fixes a user regression where if a system is running a faulty fuse server that does not reply to issued write requests, this causes wait_sb_inodes() to wait forever. Link: https://lkml.kernel.org/r/20260105211737.4105620-2-joannelkoong@gmail.com Fixes: 0c58a97f919c ("fuse: remove tmp folio for writebacks and internal rb tree") Signed-off-by: Joanne Koong Reported-by: Athul Krishna Reported-by: J. Neuschäfer Reviewed-by: Bernd Schubert Tested-by: J. Neuschäfer Cc: Alexander Viro Cc: Bernd Schubert Cc: Bonaccorso Salvatore Cc: Christian Brauner Cc: David Hildenbrand Cc: Jan Kara Cc: "Liam R. 
Howlett" Cc: Lorenzo Stoakes Cc: "Matthew Wilcox (Oracle)" Cc: Michal Hocko Cc: Mike Rapoport Cc: Miklos Szeredi Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton --- fs/fs-writeback.c | 7 ++++++- fs/fuse/file.c | 4 +++- include/linux/pagemap.h | 11 +++++++++++ 3 files changed, 20 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 6800886c4d10..baa2f2141146 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -2750,8 +2750,13 @@ static void wait_sb_inodes(struct super_block *sb) * The mapping can appear untagged while still on-list since we * do not have the mapping lock. Skip it here, wb completion * will remove it. + * + * If the mapping does not have data integrity semantics, + * there's no need to wait for the writeout to complete, as the + * mapping cannot guarantee that data is persistently stored. */ - if (!mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) + if (!mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK) || + mapping_no_data_integrity(mapping)) continue; spin_unlock_irq(&sb->s_inode_wblist_lock); diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 01bc894e9c2b..3b2a171e652f 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -3200,8 +3200,10 @@ void fuse_init_file_inode(struct inode *inode, unsigned int flags) inode->i_fop = &fuse_file_operations; inode->i_data.a_ops = &fuse_file_aops; - if (fc->writeback_cache) + if (fc->writeback_cache) { mapping_set_writeback_may_deadlock_on_reclaim(&inode->i_data); + mapping_set_no_data_integrity(&inode->i_data); + } INIT_LIST_HEAD(&fi->write_files); INIT_LIST_HEAD(&fi->queued_writes); diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 31a848485ad9..ec442af3f886 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -210,6 +210,7 @@ enum mapping_flags { AS_WRITEBACK_MAY_DEADLOCK_ON_RECLAIM = 9, AS_KERNEL_FILE = 10, /* mapping for a fake kernel file that shouldn't account usage to user 
cgroups */ + AS_NO_DATA_INTEGRITY = 11, /* no data integrity guarantees */ /* Bits 16-25 are used for FOLIO_ORDER */ AS_FOLIO_ORDER_BITS = 5, AS_FOLIO_ORDER_MIN = 16, @@ -345,6 +346,16 @@ static inline bool mapping_writeback_may_deadlock_on_reclaim(const struct addres return test_bit(AS_WRITEBACK_MAY_DEADLOCK_ON_RECLAIM, &mapping->flags); } +static inline void mapping_set_no_data_integrity(struct address_space *mapping) +{ + set_bit(AS_NO_DATA_INTEGRITY, &mapping->flags); +} + +static inline bool mapping_no_data_integrity(const struct address_space *mapping) +{ + return test_bit(AS_NO_DATA_INTEGRITY, &mapping->flags); +} + static inline gfp_t mapping_gfp_mask(const struct address_space *mapping) { return mapping->gfp_mask; -- cgit v1.2.3 From ca1a47cd3f5f4c46ca188b1c9a27af87d1ab2216 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Red Hat)" Date: Tue, 23 Dec 2025 22:40:34 +0100 Subject: mm/hugetlb: fix hugetlb_pmd_shared() Patch series "mm/hugetlb: fixes for PMD table sharing (incl. using mmu_gather)", v3. One functional fix, one performance regression fix, and two related comment fixes. I cleaned up my prototype I recently shared [1] for the performance fix, deferring most of the cleanups I had in the prototype to a later point. While doing that I identified the other things. The goal of this patch set is to be backported to stable trees "fairly" easily. At least patch #1 and #4. Patch #1 fixes hugetlb_pmd_shared() not detecting any sharing Patch #2 + #3 are simple comment fixes that patch #4 interacts with. Patch #4 is a fix for the reported performance regression due to excessive IPI broadcasts during fork()+exit(). The last patch is all about TLB flushes, IPIs and mmu_gather. Read: complicated There are plenty of cleanups in the future to be had + one reasonable optimization on x86. But that's all out of scope for this series. Runtime tested, with a focus on fixing the performance regression using the original reproducer [2] on x86. 
This patch (of 4): We switched from (wrongly) using the page count to an independent shared count. Now, shared page tables have a refcount of 1 (excluding speculative references) and instead use ptdesc->pt_share_count to identify sharing. We didn't convert hugetlb_pmd_shared(), so right now, we would never detect a shared PMD table as such, because sharing/unsharing no longer touches the refcount of a PMD table. Page migration, like mbind() or migrate_pages() would allow for migrating folios mapped into such shared PMD tables, even though the folios are not exclusive. In smaps we would account them as "private" although they are "shared", and we would be wrongly setting the PM_MMAP_EXCLUSIVE in the pagemap interface. Fix it by properly using ptdesc_pmd_is_shared() in hugetlb_pmd_shared(). Link: https://lkml.kernel.org/r/20251223214037.580860-1-david@kernel.org Link: https://lkml.kernel.org/r/20251223214037.580860-2-david@kernel.org Link: https://lore.kernel.org/all/8cab934d-4a56-44aa-b641-bfd7e23bd673@kernel.org/ [1] Link: https://lore.kernel.org/all/8cab934d-4a56-44aa-b641-bfd7e23bd673@kernel.org/ [2] Fixes: 59d9094df3d7 ("mm: hugetlb: independent PMD page table shared count") Signed-off-by: David Hildenbrand (Red Hat) Reviewed-by: Rik van Riel Reviewed-by: Lance Yang Tested-by: Lance Yang Reviewed-by: Harry Yoo Tested-by: Laurence Oberman Reviewed-by: Lorenzo Stoakes Acked-by: Oscar Salvador Cc: Liu Shixin Cc: Uschakow, Stanislav" Cc: Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 019a1c5281e4..03c8725efa28 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -1326,7 +1326,7 @@ static inline __init void hugetlb_cma_reserve(int order) #ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING static inline bool hugetlb_pmd_shared(pte_t *pte) { - return page_count(virt_to_page(pte)) > 1; + return 
ptdesc_pmd_is_shared(virt_to_ptdesc(pte)); } #else static inline bool hugetlb_pmd_shared(pte_t *pte) -- cgit v1.2.3 From 8ce720d5bd91e9dc16db3604aa4b1bf76770a9a1 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Red Hat)" Date: Tue, 23 Dec 2025 22:40:37 +0100 Subject: mm/hugetlb: fix excessive IPI broadcasts when unsharing PMD tables using mmu_gather As reported, ever since commit 1013af4f585f ("mm/hugetlb: fix huge_pmd_unshare() vs GUP-fast race") we can end up in some situations where we perform so many IPI broadcasts when unsharing hugetlb PMD page tables that it severely regresses some workloads. In particular, when we fork()+exit(), or when we munmap() a large area backed by many shared PMD tables, we perform one IPI broadcast per unshared PMD table. There are two optimizations to be had: (1) When we process (unshare) multiple such PMD tables, such as during exit(), it is sufficient to send a single IPI broadcast (as long as we respect locking rules) instead of one per PMD table. Locking prevents that any of these PMD tables could get reused before we drop the lock. (2) When we are not the last sharer (> 2 users including us), there is no need to send the IPI broadcast. The shared PMD tables cannot become exclusive (fully unshared) before an IPI will be broadcasted by the last sharer. Concurrent GUP-fast could walk into a PMD table just before we unshared it. It could then succeed in grabbing a page from the shared page table even after munmap() etc succeeded (and suppressed an IPI). But there is no difference compared to GUP-fast just sleeping for a while after grabbing the page and re-enabling IRQs. Most importantly, GUP-fast will never walk into page tables that are no longer shared, because the last sharer will issue an IPI broadcast.
(if ever required, checking whether the PUD changed in GUP-fast after grabbing the page like we do in the PTE case could handle this) So let's rework PMD sharing TLB flushing + IPI sync to use the mmu_gather infrastructure so we can implement these optimizations and demystify the code at least a bit. Extend the mmu_gather infrastructure to be able to deal with our special hugetlb PMD table sharing implementation. To make initialization of the mmu_gather easier when working on a single VMA (in particular, when dealing with hugetlb), provide tlb_gather_mmu_vma(). We'll consolidate the handling for (full) unsharing of PMD tables in tlb_unshare_pmd_ptdesc() and tlb_flush_unshared_tables(), and track in "struct mmu_gather" whether we had (full) unsharing of PMD tables. Because locking is very special (concurrent unsharing+reuse must be prevented), we disallow deferring flushing to tlb_finish_mmu() and instead require an explicit earlier call to tlb_flush_unshared_tables(). From hugetlb code, we call huge_pmd_unshare_flush() where we make sure that the expected lock protecting us from concurrent unsharing+reuse is still held. Check with a VM_WARN_ON_ONCE() in tlb_finish_mmu() that tlb_flush_unshared_tables() was properly called earlier. Document it all properly. Notes about tlb_remove_table_sync_one() interaction with unsharing: There are two fairly tricky things: (1) tlb_remove_table_sync_one() is a NOP on architectures without CONFIG_MMU_GATHER_RCU_TABLE_FREE. Here, the assumption is that the previous TLB flush would send an IPI to all relevant CPUs. Careful: some architectures like x86 only send IPIs to all relevant CPUs when tlb->freed_tables is set. The relevant architectures should be selecting MMU_GATHER_RCU_TABLE_FREE, but x86 might not do that in stable kernels and it might have been problematic before this patch. Also, the arch flushing behavior (independent of IPIs) is different when tlb->freed_tables is set. 
Do we have to enlighten them to also take care of tlb->unshared_tables? So far we didn't care, so hopefully we are fine. Of course, we could be setting tlb->freed_tables as well, but that might then unnecessarily flush too much, because the semantics of tlb->freed_tables are a bit fuzzy. This patch changes nothing in this regard. (2) tlb_remove_table_sync_one() is not a NOP on architectures with CONFIG_MMU_GATHER_RCU_TABLE_FREE that actually don't need a sync. Take x86 as an example: in the common case (!pv, !X86_FEATURE_INVLPGB) we still issue IPIs during TLB flushes and don't actually need the second tlb_remove_table_sync_one(). This optimization can be implemented on top of this, by checking e.g., in tlb_remove_table_sync_one() whether we really need IPIs. But as described in (1), it really must honor tlb->freed_tables then to send IPIs to all relevant CPUs. Notes on TLB flushing changes: (1) Flushing for non-shared PMD tables We're converting from flush_hugetlb_tlb_range() to tlb_remove_huge_tlb_entry(). Given that we properly initialize the MMU gather in tlb_gather_mmu_vma() to be hugetlb aware, similar to __unmap_hugepage_range(), that should be fine. (2) Flushing for shared PMD tables We're converting from various things (flush_hugetlb_tlb_range(), tlb_flush_pmd_range(), flush_tlb_range()) to tlb_flush_pmd_range(). tlb_flush_pmd_range() achieves the same that tlb_remove_huge_tlb_entry() would achieve in these scenarios. Note that tlb_remove_huge_tlb_entry() also calls __tlb_remove_tlb_entry(), however that is only implemented on powerpc, which does not support PMD table sharing. Similar to (1), tlb_gather_mmu_vma() should make sure that TLB flushing keeps on working as expected. Further, note that the ptdesc_pmd_pts_dec() in huge_pmd_share() is not a concern, as we are holding the i_mmap_lock the whole time, preventing concurrent unsharing. That ptdesc_pmd_pts_dec() usage will be removed separately as a cleanup later.
There are plenty more cleanups to be had, but they have to wait until this is fixed. [david@kernel.org: fix kerneldoc] Link: https://lkml.kernel.org/r/f223dd74-331c-412d-93fc-69e360a5006c@kernel.org Link: https://lkml.kernel.org/r/20251223214037.580860-5-david@kernel.org Fixes: 1013af4f585f ("mm/hugetlb: fix huge_pmd_unshare() vs GUP-fast race") Signed-off-by: David Hildenbrand (Red Hat) Reported-by: Uschakow, Stanislav" Closes: https://lore.kernel.org/all/4d3878531c76479d9f8ca9789dc6485d@amazon.de/ Tested-by: Laurence Oberman Acked-by: Harry Yoo Reviewed-by: Lorenzo Stoakes Cc: Lance Yang Cc: Liu Shixin Cc: Oscar Salvador Cc: Rik van Riel Cc: Signed-off-by: Andrew Morton --- include/asm-generic/tlb.h | 77 ++++++++++++++++++++++++++++- include/linux/hugetlb.h | 15 ++++-- include/linux/mm_types.h | 1 + mm/hugetlb.c | 123 +++++++++++++++++++++++++++------------------- mm/mmu_gather.c | 33 +++++++++++++ mm/rmap.c | 25 +++++++--- 6 files changed, 208 insertions(+), 66 deletions(-) (limited to 'include/linux') diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index 1fff717cae51..4d679d2a206b 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -46,7 +46,8 @@ * * The mmu_gather API consists of: * - * - tlb_gather_mmu() / tlb_gather_mmu_fullmm() / tlb_finish_mmu() + * - tlb_gather_mmu() / tlb_gather_mmu_fullmm() / tlb_gather_mmu_vma() / + * tlb_finish_mmu() * * start and finish a mmu_gather * @@ -364,6 +365,20 @@ struct mmu_gather { unsigned int vma_huge : 1; unsigned int vma_pfn : 1; + /* + * Did we unshare (unmap) any shared page tables? For now only + * used for hugetlb PMD table sharing. + */ + unsigned int unshared_tables : 1; + + /* + * Did we unshare any page tables such that they are now exclusive + * and could get reused+modified by the new owner? When setting this + * flag, "unshared_tables" will be set as well. For now only used + * for hugetlb PMD table sharing. 
+ */ + unsigned int fully_unshared_tables : 1; + unsigned int batch_count; #ifndef CONFIG_MMU_GATHER_NO_GATHER @@ -400,6 +415,7 @@ static inline void __tlb_reset_range(struct mmu_gather *tlb) tlb->cleared_pmds = 0; tlb->cleared_puds = 0; tlb->cleared_p4ds = 0; + tlb->unshared_tables = 0; /* * Do not reset mmu_gather::vma_* fields here, we do not * call into tlb_start_vma() again to set them if there is an @@ -484,7 +500,7 @@ static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb) * these bits. */ if (!(tlb->freed_tables || tlb->cleared_ptes || tlb->cleared_pmds || - tlb->cleared_puds || tlb->cleared_p4ds)) + tlb->cleared_puds || tlb->cleared_p4ds || tlb->unshared_tables)) return; tlb_flush(tlb); @@ -773,6 +789,63 @@ static inline bool huge_pmd_needs_flush(pmd_t oldpmd, pmd_t newpmd) } #endif +#ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING +static inline void tlb_unshare_pmd_ptdesc(struct mmu_gather *tlb, struct ptdesc *pt, + unsigned long addr) +{ + /* + * The caller must make sure that concurrent unsharing + exclusive + * reuse is impossible until tlb_flush_unshared_tables() was called. + */ + VM_WARN_ON_ONCE(!ptdesc_pmd_is_shared(pt)); + ptdesc_pmd_pts_dec(pt); + + /* Clearing a PUD pointing at a PMD table with PMD leaves. */ + tlb_flush_pmd_range(tlb, addr & PUD_MASK, PUD_SIZE); + + /* + * If the page table is now exclusively owned, we fully unshared + * a page table. + */ + if (!ptdesc_pmd_is_shared(pt)) + tlb->fully_unshared_tables = true; + tlb->unshared_tables = true; +} + +static inline void tlb_flush_unshared_tables(struct mmu_gather *tlb) +{ + /* + * As soon as the caller drops locks to allow for reuse of + * previously-shared tables, these tables could get modified and + * even reused outside of hugetlb context, so we have to make sure that + * any page table walkers (incl. TLB, GUP-fast) are aware of that + * change. + * + * Even if we are not fully unsharing a PMD table, we must + * flush the TLB for the unsharer now. 
+ */ + if (tlb->unshared_tables) + tlb_flush_mmu_tlbonly(tlb); + + /* + * Similarly, we must make sure that concurrent GUP-fast will not + * walk previously-shared page tables that are getting modified+reused + * elsewhere. So broadcast an IPI to wait for any concurrent GUP-fast. + * + * We only perform this when we are the last sharer of a page table, + * as the IPI will reach all CPUs: any GUP-fast. + * + * Note that on configs where tlb_remove_table_sync_one() is a NOP, + * the expectation is that the tlb_flush_mmu_tlbonly() would have issued + * required IPIs already for us. + */ + if (tlb->fully_unshared_tables) { + tlb_remove_table_sync_one(); + tlb->fully_unshared_tables = false; + } +} +#endif /* CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING */ + #endif /* CONFIG_MMU */ #endif /* _ASM_GENERIC__TLB_H */ diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 03c8725efa28..e51b8ef0cebd 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -240,8 +240,9 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz); unsigned long hugetlb_mask_last_page(struct hstate *h); -int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep); +int huge_pmd_unshare(struct mmu_gather *tlb, struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep); +void huge_pmd_unshare_flush(struct mmu_gather *tlb, struct vm_area_struct *vma); void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, unsigned long *start, unsigned long *end); @@ -300,13 +301,17 @@ static inline struct address_space *hugetlb_folio_mapping_lock_write( return NULL; } -static inline int huge_pmd_unshare(struct mm_struct *mm, - struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep) +static inline int huge_pmd_unshare(struct mmu_gather *tlb, + struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { return 0; } +static 
inline void huge_pmd_unshare_flush(struct mmu_gather *tlb, + struct vm_area_struct *vma) +{ +} + static inline void adjust_range_if_pmd_sharing_possible( struct vm_area_struct *vma, unsigned long *start, unsigned long *end) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index aa4639888f89..78950eb8926d 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1530,6 +1530,7 @@ static inline unsigned int mm_cid_size(void) struct mmu_gather; extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm); extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm); +void tlb_gather_mmu_vma(struct mmu_gather *tlb, struct vm_area_struct *vma); extern void tlb_finish_mmu(struct mmu_gather *tlb); struct vm_fault; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 67131aa24d77..a1832da0f623 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5112,7 +5112,7 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma, unsigned long last_addr_mask; pte_t *src_pte, *dst_pte; struct mmu_notifier_range range; - bool shared_pmd = false; + struct mmu_gather tlb; mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, old_addr, old_end); @@ -5122,6 +5122,7 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma, * range. 
*/ flush_cache_range(vma, range.start, range.end); + tlb_gather_mmu_vma(&tlb, vma); mmu_notifier_invalidate_range_start(&range); last_addr_mask = hugetlb_mask_last_page(h); @@ -5138,8 +5139,7 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma, if (huge_pte_none(huge_ptep_get(mm, old_addr, src_pte))) continue; - if (huge_pmd_unshare(mm, vma, old_addr, src_pte)) { - shared_pmd = true; + if (huge_pmd_unshare(&tlb, vma, old_addr, src_pte)) { old_addr |= last_addr_mask; new_addr |= last_addr_mask; continue; @@ -5150,15 +5150,16 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma, break; move_huge_pte(vma, old_addr, new_addr, src_pte, dst_pte, sz); + tlb_remove_huge_tlb_entry(h, &tlb, src_pte, old_addr); } - if (shared_pmd) - flush_hugetlb_tlb_range(vma, range.start, range.end); - else - flush_hugetlb_tlb_range(vma, old_end - len, old_end); + tlb_flush_mmu_tlbonly(&tlb); + huge_pmd_unshare_flush(&tlb, vma); + mmu_notifier_invalidate_range_end(&range); i_mmap_unlock_write(mapping); hugetlb_vma_unlock_write(vma); + tlb_finish_mmu(&tlb); return len + old_addr - old_end; } @@ -5177,7 +5178,6 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long sz = huge_page_size(h); bool adjust_reservation; unsigned long last_addr_mask; - bool force_flush = false; WARN_ON(!is_vm_hugetlb_page(vma)); BUG_ON(start & ~huge_page_mask(h)); @@ -5200,10 +5200,8 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, } ptl = huge_pte_lock(h, mm, ptep); - if (huge_pmd_unshare(mm, vma, address, ptep)) { + if (huge_pmd_unshare(tlb, vma, address, ptep)) { spin_unlock(ptl); - tlb_flush_pmd_range(tlb, address & PUD_MASK, PUD_SIZE); - force_flush = true; address |= last_addr_mask; continue; } @@ -5319,14 +5317,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, } tlb_end_vma(tlb, vma); - /* - * There is nothing protecting a previously-shared page table that we - * unshared through 
huge_pmd_unshare() from getting freed after we - * release i_mmap_rwsem, so flush the TLB now. If huge_pmd_unshare() - * succeeded, flush the range corresponding to the pud. - */ - if (force_flush) - tlb_flush_mmu_tlbonly(tlb); + huge_pmd_unshare_flush(tlb, vma); } void __hugetlb_zap_begin(struct vm_area_struct *vma, @@ -6425,11 +6416,11 @@ long hugetlb_change_protection(struct vm_area_struct *vma, pte_t pte; struct hstate *h = hstate_vma(vma); long pages = 0, psize = huge_page_size(h); - bool shared_pmd = false; struct mmu_notifier_range range; unsigned long last_addr_mask; bool uffd_wp = cp_flags & MM_CP_UFFD_WP; bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE; + struct mmu_gather tlb; /* * In the case of shared PMDs, the area to flush could be beyond @@ -6442,6 +6433,7 @@ long hugetlb_change_protection(struct vm_area_struct *vma, BUG_ON(address >= end); flush_cache_range(vma, range.start, range.end); + tlb_gather_mmu_vma(&tlb, vma); mmu_notifier_invalidate_range_start(&range); hugetlb_vma_lock_write(vma); @@ -6468,7 +6460,7 @@ long hugetlb_change_protection(struct vm_area_struct *vma, } } ptl = huge_pte_lock(h, mm, ptep); - if (huge_pmd_unshare(mm, vma, address, ptep)) { + if (huge_pmd_unshare(&tlb, vma, address, ptep)) { /* * When uffd-wp is enabled on the vma, unshare * shouldn't happen at all. 
Warn about it if it @@ -6477,7 +6469,6 @@ long hugetlb_change_protection(struct vm_area_struct *vma, WARN_ON_ONCE(uffd_wp || uffd_wp_resolve); pages++; spin_unlock(ptl); - shared_pmd = true; address |= last_addr_mask; continue; } @@ -6538,22 +6529,16 @@ long hugetlb_change_protection(struct vm_area_struct *vma, pte = huge_pte_clear_uffd_wp(pte); huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte); pages++; + tlb_remove_huge_tlb_entry(h, &tlb, ptep, address); } next: spin_unlock(ptl); cond_resched(); } - /* - * There is nothing protecting a previously-shared page table that we - * unshared through huge_pmd_unshare() from getting freed after we - * release i_mmap_rwsem, so flush the TLB now. If huge_pmd_unshare() - * succeeded, flush the range corresponding to the pud. - */ - if (shared_pmd) - flush_hugetlb_tlb_range(vma, range.start, range.end); - else - flush_hugetlb_tlb_range(vma, start, end); + + tlb_flush_mmu_tlbonly(&tlb); + huge_pmd_unshare_flush(&tlb, vma); /* * No need to call mmu_notifier_arch_invalidate_secondary_tlbs() we are * downgrading page table protection not changing it to point to a new @@ -6564,6 +6549,7 @@ next: i_mmap_unlock_write(vma->vm_file->f_mapping); hugetlb_vma_unlock_write(vma); mmu_notifier_invalidate_range_end(&range); + tlb_finish_mmu(&tlb); return pages > 0 ? (pages << h->order) : pages; } @@ -6920,18 +6906,27 @@ out: return pte; } -/* - * unmap huge page backed by shared pte. +/** + * huge_pmd_unshare - Unmap a pmd table if it is shared by multiple users + * @tlb: the current mmu_gather. + * @vma: the vma covering the pmd table. + * @addr: the address we are trying to unshare. + * @ptep: pointer into the (pmd) page table. + * + * Called with the page table lock held, the i_mmap_rwsem held in write mode + * and the hugetlb vma lock held in write mode. * - * Called with page table lock held. + * Note: The caller must call huge_pmd_unshare_flush() before dropping the + * i_mmap_rwsem. 
* - * returns: 1 successfully unmapped a shared pte page - * 0 the underlying pte page is not shared, or it is the last user + * Returns: 1 if it was a shared PMD table and it got unmapped, or 0 if it + * was not a shared PMD table. */ -int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep) +int huge_pmd_unshare(struct mmu_gather *tlb, struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) { unsigned long sz = huge_page_size(hstate_vma(vma)); + struct mm_struct *mm = vma->vm_mm; pgd_t *pgd = pgd_offset(mm, addr); p4d_t *p4d = p4d_offset(pgd, addr); pud_t *pud = pud_offset(p4d, addr); @@ -6943,18 +6938,36 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, i_mmap_assert_write_locked(vma->vm_file->f_mapping); hugetlb_vma_assert_locked(vma); pud_clear(pud); - /* - * Once our caller drops the rmap lock, some other process might be - * using this page table as a normal, non-hugetlb page table. - * Wait for pending gup_fast() in other threads to finish before letting - * that happen. - */ - tlb_remove_table_sync_one(); - ptdesc_pmd_pts_dec(virt_to_ptdesc(ptep)); + + tlb_unshare_pmd_ptdesc(tlb, virt_to_ptdesc(ptep), addr); + mm_dec_nr_pmds(mm); return 1; } +/* + * huge_pmd_unshare_flush - Complete a sequence of huge_pmd_unshare() calls + * @tlb: the current mmu_gather. + * @vma: the vma covering the pmd table. + * + * Perform necessary TLB flushes or IPI broadcasts to synchronize PMD table + * unsharing with concurrent page table walkers. + * + * This function must be called after a sequence of huge_pmd_unshare() + * calls while still holding the i_mmap_rwsem. + */ +void huge_pmd_unshare_flush(struct mmu_gather *tlb, struct vm_area_struct *vma) +{ + /* + * We must synchronize page table unsharing such that nobody will + * try reusing a previously-shared page table while it might still + * be in use by previous sharers (TLB, GUP_fast). 
+ */ + i_mmap_assert_write_locked(vma->vm_file->f_mapping); + + tlb_flush_unshared_tables(tlb); +} + #else /* !CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING */ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, @@ -6963,12 +6976,16 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, return NULL; } -int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep) +int huge_pmd_unshare(struct mmu_gather *tlb, struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) { return 0; } +void huge_pmd_unshare_flush(struct mmu_gather *tlb, struct vm_area_struct *vma) +{ +} + void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, unsigned long *start, unsigned long *end) { @@ -7235,6 +7252,7 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma, unsigned long sz = huge_page_size(h); struct mm_struct *mm = vma->vm_mm; struct mmu_notifier_range range; + struct mmu_gather tlb; unsigned long address; spinlock_t *ptl; pte_t *ptep; @@ -7246,6 +7264,8 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma, return; flush_cache_range(vma, start, end); + tlb_gather_mmu_vma(&tlb, vma); + /* * No need to call adjust_range_if_pmd_sharing_possible(), because * we have already done the PUD_SIZE alignment. @@ -7264,10 +7284,10 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma, if (!ptep) continue; ptl = huge_pte_lock(h, mm, ptep); - huge_pmd_unshare(mm, vma, address, ptep); + huge_pmd_unshare(&tlb, vma, address, ptep); spin_unlock(ptl); } - flush_hugetlb_tlb_range(vma, start, end); + huge_pmd_unshare_flush(&tlb, vma); if (take_locks) { i_mmap_unlock_write(vma->vm_file->f_mapping); hugetlb_vma_unlock_write(vma); @@ -7277,6 +7297,7 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma, * Documentation/mm/mmu_notifier.rst. 
*/ mmu_notifier_invalidate_range_end(&range); + tlb_finish_mmu(&tlb); } /* diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c index 247e3f9db6c7..7468ec388455 100644 --- a/mm/mmu_gather.c +++ b/mm/mmu_gather.c @@ -10,6 +10,7 @@ #include #include #include +#include #include @@ -426,6 +427,7 @@ static void __tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, #endif tlb->vma_pfn = 0; + tlb->fully_unshared_tables = 0; __tlb_reset_range(tlb); inc_tlb_flush_pending(tlb->mm); } @@ -459,6 +461,31 @@ void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm) __tlb_gather_mmu(tlb, mm, true); } +/** + * tlb_gather_mmu_vma - initialize an mmu_gather structure for operating on a + * single VMA + * @tlb: the mmu_gather structure to initialize + * @vma: the vm_area_struct + * + * Called to initialize an (on-stack) mmu_gather structure for operating on + * a single VMA. In contrast to tlb_gather_mmu(), calling this function will + * not require another call to tlb_start_vma(). In contrast to tlb_start_vma(), + * this function will *not* call flush_cache_range(). + * + * For hugetlb VMAs, this function will also initialize the mmu_gather + * page_size accordingly, not requiring a separate call to + * tlb_change_page_size(). + * + */ +void tlb_gather_mmu_vma(struct mmu_gather *tlb, struct vm_area_struct *vma) +{ + tlb_gather_mmu(tlb, vma->vm_mm); + tlb_update_vma_flags(tlb, vma); + if (is_vm_hugetlb_page(vma)) + /* All entries have the same size. */ + tlb_change_page_size(tlb, huge_page_size(hstate_vma(vma))); +} + /** * tlb_finish_mmu - finish an mmu_gather structure * @tlb: the mmu_gather structure to finish @@ -468,6 +495,12 @@ void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm) */ void tlb_finish_mmu(struct mmu_gather *tlb) { + /* + * We expect an earlier huge_pmd_unshare_flush() call to sort this out, + * due to complicated locking requirements with page table unsharing. 
+ */ + VM_WARN_ON_ONCE(tlb->fully_unshared_tables); + /* * If there are parallel threads are doing PTE changes on same range * under non-exclusive lock (e.g., mmap_lock read-side) but defer TLB diff --git a/mm/rmap.c b/mm/rmap.c index 748f48727a16..7b9879ef442d 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -76,7 +76,7 @@ #include #include -#include +#include #define CREATE_TRACE_POINTS #include @@ -2008,13 +2008,17 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, * if unsuccessful. */ if (!anon) { + struct mmu_gather tlb; + VM_BUG_ON(!(flags & TTU_RMAP_LOCKED)); if (!hugetlb_vma_trylock_write(vma)) goto walk_abort; - if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) { + + tlb_gather_mmu_vma(&tlb, vma); + if (huge_pmd_unshare(&tlb, vma, address, pvmw.pte)) { hugetlb_vma_unlock_write(vma); - flush_tlb_range(vma, - range.start, range.end); + huge_pmd_unshare_flush(&tlb, vma); + tlb_finish_mmu(&tlb); /* * The PMD table was unmapped, * consequently unmapping the folio. @@ -2022,6 +2026,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, goto walk_done; } hugetlb_vma_unlock_write(vma); + tlb_finish_mmu(&tlb); } pteval = huge_ptep_clear_flush(vma, address, pvmw.pte); if (pte_dirty(pteval)) @@ -2398,17 +2403,20 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, * fail if unsuccessful. */ if (!anon) { + struct mmu_gather tlb; + VM_BUG_ON(!(flags & TTU_RMAP_LOCKED)); if (!hugetlb_vma_trylock_write(vma)) { page_vma_mapped_walk_done(&pvmw); ret = false; break; } - if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) { - hugetlb_vma_unlock_write(vma); - flush_tlb_range(vma, - range.start, range.end); + tlb_gather_mmu_vma(&tlb, vma); + if (huge_pmd_unshare(&tlb, vma, address, pvmw.pte)) { + hugetlb_vma_unlock_write(vma); + huge_pmd_unshare_flush(&tlb, vma); + tlb_finish_mmu(&tlb); /* * The PMD table was unmapped, * consequently unmapping the folio. 
@@ -2417,6 +2425,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, break; } hugetlb_vma_unlock_write(vma); + tlb_finish_mmu(&tlb); } /* Nuke the hugetlb page table entry */ pteval = huge_ptep_clear_flush(vma, address, pvmw.pte); -- cgit v1.2.3 From 35e247032606f06c2f19d90a6562bc315206b7a7 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Wed, 14 Jan 2026 11:00:06 +0000 Subject: mm: do not copy page tables unnecessarily for VM_UFFD_WP Commit ab04b530e7e8 ("mm: introduce copy-on-fork VMAs and make VM_MAYBE_GUARD one") aggregates flags checks in vma_needs_copy(), including VM_UFFD_WP. However in doing so, it incorrectly performed this check against src_vma. This check was done on the assumption that all relevant flags are copied upon fork. However the userfaultfd logic is very innovative in that it implements custom logic on fork in dup_userfaultfd(), including a rather well hidden case where lacking UFFD_FEATURE_EVENT_FORK causes VM_UFFD_WP to not be propagated to the destination VMA. And indeed, vma_needs_copy(), prior to this patch, did check this property on dst_vma, not src_vma. Since all the other relevant flags are copied on fork, we can simply fix this by checking against dst_vma. While we're here, we fix a comment against VM_COPY_ON_FORK (noting that it did indeed already reference dst_vma) to make it abundantly clear that we must check against the destination VMA. 
Link: https://lkml.kernel.org/r/20260114110006.1047071-1-lorenzo.stoakes@oracle.com Fixes: ab04b530e7e8 ("mm: introduce copy-on-fork VMAs and make VM_MAYBE_GUARD one") Signed-off-by: Lorenzo Stoakes Reported-by: Chris Mason Closes: https://lore.kernel.org/all/20260113231257.3002271-1-clm@meta.com/ Acked-by: David Hildenbrand (Red Hat) Acked-by: Pedro Falcato Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/mm.h | 6 +++++- mm/memory.c | 6 +++++- 2 files changed, 10 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 6f959d8ca4b4..f0d5be9dc736 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -608,7 +608,11 @@ enum { /* * Flags which should result in page tables being copied on fork. These are * flags which indicate that the VMA maps page tables which cannot be - * reconsistuted upon page fault, so necessitate page table copying upon + * reconsistuted upon page fault, so necessitate page table copying upon fork. + * + * Note that these flags should be compared with the DESTINATION VMA not the + * source, as VM_UFFD_WP may not be propagated to destination, while all other + * flags will be. * * VM_PFNMAP / VM_MIXEDMAP - These contain kernel-mapped data which cannot be * reasonably reconstructed on page fault. diff --git a/mm/memory.c b/mm/memory.c index a0822b564cc0..da360a6eb8a4 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1465,7 +1465,11 @@ copy_p4d_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, static bool vma_needs_copy(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) { - if (src_vma->vm_flags & VM_COPY_ON_FORK) + /* + * We check against dst_vma as while sane VMA flags will have been + * copied, VM_UFFD_WP may be set only on dst_vma. 
+ */ + if (dst_vma->vm_flags & VM_COPY_ON_FORK) return true; /* * The presence of an anon_vma indicates an anonymous VMA has page -- cgit v1.2.3