From 4ced4cf5c9d172d91f181df3accdf949d3761aab Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 17 Feb 2026 18:01:05 +0000 Subject: binfmt_elf_fdpic: fix AUXV size calculation for ELF_HWCAP3 and ELF_HWCAP4 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 4e6e8c2b757f ("binfmt_elf: Wire up AT_HWCAP3 at AT_HWCAP4") added support for AT_HWCAP3 and AT_HWCAP4, but it missed updating the AUX vector size calculation in create_elf_fdpic_tables() and AT_VECTOR_SIZE_BASE in include/linux/auxvec.h. Similar to the fix for AT_HWCAP2 in commit c6a09e342f8e ("binfmt_elf_fdpic: fix AUXV size calculation when ELF_HWCAP2 is defined"), this omission leads to a mismatch between the reserved space and the actual number of AUX entries, eventually triggering a kernel BUG_ON(csp != sp). Fix this by incrementing nitems when ELF_HWCAP3 or ELF_HWCAP4 are defined and updating AT_VECTOR_SIZE_BASE. Cc: Mark Brown Cc: Max Filippov Reviewed-by: Michal Koutný Reviewed-by: Mark Brown Reviewed-by: Cyrill Gorcunov Reviewed-by: Alexander Mikhalitsyn Fixes: 4e6e8c2b757f ("binfmt_elf: Wire up AT_HWCAP3 at AT_HWCAP4") Signed-off-by: Andrei Vagin Link: https://patch.msgid.link/20260217180108.1420024-2-avagin@google.com Signed-off-by: Kees Cook --- include/linux/auxvec.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/auxvec.h b/include/linux/auxvec.h index 407f7005e6d6..8bcb9b726262 100644 --- a/include/linux/auxvec.h +++ b/include/linux/auxvec.h @@ -4,6 +4,6 @@ #include -#define AT_VECTOR_SIZE_BASE 22 /* NEW_AUX_ENT entries in auxiliary table */ +#define AT_VECTOR_SIZE_BASE 24 /* NEW_AUX_ENT entries in auxiliary table */ /* number of "#define AT_.*" above, minus {AT_NULL, AT_IGNORE, AT_NOTELF} */ #endif /* _LINUX_AUXVEC_H */ -- cgit v1.2.3 From 14de1552a4e3fece78bb20314887e70888c9d448 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 11 Mar 2026 16:14:55 -0700 Subject: include/linux/local_lock_internal.h: Make this header file again compatible with sparse There are two versions of the __this_cpu_local_lock() definitions in include/linux/local_lock_internal.h: one version that relies on the Clang overloading functionality and another version that does not. Select the latter version when using sparse. This patch fixes the following errors reported by sparse: include/linux/local_lock_internal.h:331:40: sparse: sparse: multiple definitions for function '__this_cpu_local_lock' include/linux/local_lock_internal.h:325:37: sparse: the previous one is here Closes: https://lore.kernel.org/oe-kbuild-all/202603062334.wgI5htP0-lkp@intel.com/ Fixes: d3febf16dee2 ("locking/local_lock: Support Clang's context analysis") Reported-by: kernel test robot Signed-off-by: Bart Van Assche Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Marco Elver Link: https://patch.msgid.link/20260311231455.1961413-1-bvanassche@acm.org --- include/linux/local_lock_internal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/local_lock_internal.h b/include/linux/local_lock_internal.h index eff711bf973f..234be7f12c15 100644 --- a/include/linux/local_lock_internal.h +++ b/include/linux/local_lock_internal.h @@ -315,7 +315,7 @@ do { \ #endif /* CONFIG_PREEMPT_RT */ -#if defined(WARN_CONTEXT_ANALYSIS) +#if defined(WARN_CONTEXT_ANALYSIS) && !defined(__CHECKER__) /* * Because the compiler only knows about the base per-CPU variable, use this * helper function to make the compiler think we lock/unlock the @base variable, -- cgit v1.2.3 From 8324a54f604da18f21070702a8ad82ab2062787b Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Tue, 3 Feb 2026 19:10:45 +0200 Subject: serial: 8250: Add serial8250_handle_irq_locked() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 8250_port exports serial8250_handle_irq() to HW specific 8250 drivers. It takes port's lock within but a HW specific 8250 driver may want to take port's lock itself, do something, and then call the generic handler in 8250_port but to do that, the caller has to release port's lock for no good reason. Introduce serial8250_handle_irq_locked() which a HW specific driver can call while already holding port's lock. As this is new export, put it straight into a namespace (where all 8250 exports should eventually be moved). Tested-by: Bandal, Shankar Tested-by: Murthy, Shanth Cc: stable Reviewed-by: Andy Shevchenko Signed-off-by: Ilpo Järvinen Link: https://patch.msgid.link/20260203171049.4353-4-ilpo.jarvinen@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/8250/8250_port.c | 24 ++++++++++++++++-------- include/linux/serial_8250.h | 1 + 2 files changed, 17 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c index 20cf123a0540..14d6aca44551 100644 --- a/drivers/tty/serial/8250/8250_port.c +++ b/drivers/tty/serial/8250/8250_port.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -1782,20 +1783,16 @@ static bool handle_rx_dma(struct uart_8250_port *up, unsigned int iir) } /* - * This handles the interrupt from one port. + * Context: port's lock must be held by the caller. */ -int serial8250_handle_irq(struct uart_port *port, unsigned int iir) +void serial8250_handle_irq_locked(struct uart_port *port, unsigned int iir) { struct uart_8250_port *up = up_to_u8250p(port); struct tty_port *tport = &port->state->port; bool skip_rx = false; - unsigned long flags; u16 status; - if (iir & UART_IIR_NO_INT) - return 0; - - uart_port_lock_irqsave(port, &flags); + lockdep_assert_held_once(&port->lock); status = serial_lsr_in(up); @@ -1828,8 +1825,19 @@ int serial8250_handle_irq(struct uart_port *port, unsigned int iir) else if (!up->dma->tx_running) __stop_tx(up); } +} +EXPORT_SYMBOL_NS_GPL(serial8250_handle_irq_locked, "SERIAL_8250"); - uart_unlock_and_check_sysrq_irqrestore(port, flags); +/* + * This handles the interrupt from one port. + */ +int serial8250_handle_irq(struct uart_port *port, unsigned int iir) +{ + if (iir & UART_IIR_NO_INT) + return 0; + + guard(uart_port_lock_irqsave)(port); + serial8250_handle_irq_locked(port, iir); return 1; } diff --git a/include/linux/serial_8250.h b/include/linux/serial_8250.h index 01efdce0fda0..a95b2d143d24 100644 --- a/include/linux/serial_8250.h +++ b/include/linux/serial_8250.h @@ -195,6 +195,7 @@ void serial8250_do_set_mctrl(struct uart_port *port, unsigned int mctrl); void serial8250_do_set_divisor(struct uart_port *port, unsigned int baud, unsigned int quot); int fsl8250_handle_irq(struct uart_port *port); +void serial8250_handle_irq_locked(struct uart_port *port, unsigned int iir); int serial8250_handle_irq(struct uart_port *port, unsigned int iir); u16 serial8250_rx_chars(struct uart_8250_port *up, u16 lsr); void serial8250_read_char(struct uart_8250_port *up, u16 lsr); -- cgit v1.2.3 From 5eb608319bb56464674a71b4a66ea65c6c435d64 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Tue, 27 Jan 2026 17:56:01 -0500 Subject: vt: save/restore unicode screen buffer for alternate screen The alternate screen support added by commit 23743ba64709 ("vt: add support for smput/rmput escape codes") only saves and restores the regular screen buffer (vc_origin), but completely ignores the corresponding unicode screen buffer (vc_uni_lines) creating a messed-up display. Add vc_saved_uni_lines to save the unicode screen buffer when entering the alternate screen, and restore it when leaving. Also ensure proper cleanup in reset_terminal() and vc_deallocate(). Fixes: 23743ba64709 ("vt: add support for smput/rmput escape codes") Cc: stable Signed-off-by: Nicolas Pitre Link: https://patch.msgid.link/5o2p6qp3-91pq-0p17-or02-1oors4417ns7@onlyvoer.pbz Signed-off-by: Greg Kroah-Hartman --- drivers/tty/vt/vt.c | 8 ++++++++ include/linux/console_struct.h | 1 + 2 files changed, 9 insertions(+) (limited to 'include/linux') diff --git a/drivers/tty/vt/vt.c b/drivers/tty/vt/vt.c index c1f152d8b03b..e2df99e3d458 100644 --- a/drivers/tty/vt/vt.c +++ b/drivers/tty/vt/vt.c @@ -1339,6 +1339,8 @@ struct vc_data *vc_deallocate(unsigned int currcons) kfree(vc->vc_saved_screen); vc->vc_saved_screen = NULL; } + vc_uniscr_free(vc->vc_saved_uni_lines); + vc->vc_saved_uni_lines = NULL; } return vc; } @@ -1884,6 +1886,8 @@ static void enter_alt_screen(struct vc_data *vc) vc->vc_saved_screen = kmemdup((u16 *)vc->vc_origin, size, GFP_KERNEL); if (vc->vc_saved_screen == NULL) return; + vc->vc_saved_uni_lines = vc->vc_uni_lines; + vc->vc_uni_lines = NULL; vc->vc_saved_rows = vc->vc_rows; vc->vc_saved_cols = vc->vc_cols; save_cur(vc); @@ -1905,6 +1909,8 @@ static void leave_alt_screen(struct vc_data *vc) dest = ((u16 *)vc->vc_origin) + r * vc->vc_cols; memcpy(dest, src, 2 * cols); } + vc_uniscr_set(vc, vc->vc_saved_uni_lines); + vc->vc_saved_uni_lines = NULL; restore_cur(vc); /* Update the entire screen */ if (con_should_update(vc)) @@ -2227,6 +2233,8 @@ static void reset_terminal(struct vc_data *vc, int do_clear) if (vc->vc_saved_screen != NULL) { kfree(vc->vc_saved_screen); vc->vc_saved_screen = NULL; + vc_uniscr_free(vc->vc_saved_uni_lines); + vc->vc_saved_uni_lines = NULL; vc->vc_saved_rows = 0; vc->vc_saved_cols = 0; } diff --git a/include/linux/console_struct.h b/include/linux/console_struct.h index 13b35637bd5a..d5ca855116df 100644 --- a/include/linux/console_struct.h +++ b/include/linux/console_struct.h @@ -160,6 +160,7 @@ struct vc_data { struct uni_pagedict **uni_pagedict_loc; /* [!] Location of uni_pagedict variable for this console */ u32 **vc_uni_lines; /* unicode screen content */ u16 *vc_saved_screen; + u32 **vc_saved_uni_lines; unsigned int vc_saved_cols; unsigned int vc_saved_rows; /* additional information is in vt_kern.h */ -- cgit v1.2.3 From 45c6a2dc7ec8339052666b06065c521a10cc29bb Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 1 Mar 2026 16:52:14 -0800 Subject: iommu/io-pgtable: fix all kernel-doc warnings in io-pgtable.h Avoid kernel-doc warnings in io-pgtable.h: - use the correct struct member names or kernel-doc format - add a missing struct member description - add a missing function return comment section Warning: include/linux/io-pgtable.h:187 struct member 'coherent_walk' not described in 'io_pgtable_cfg' Warning: include/linux/io-pgtable.h:187 struct member 'arm_lpae_s1_cfg' not described in 'io_pgtable_cfg' Warning: include/linux/io-pgtable.h:187 struct member 'arm_lpae_s2_cfg' not described in 'io_pgtable_cfg' Warning: include/linux/io-pgtable.h:187 struct member 'arm_v7s_cfg' not described in 'io_pgtable_cfg' Warning: include/linux/io-pgtable.h:187 struct member 'arm_mali_lpae_cfg' not described in 'io_pgtable_cfg' Warning: include/linux/io-pgtable.h:187 struct member 'apple_dart_cfg' not described in 'io_pgtable_cfg' Warning: include/linux/io-pgtable.h:187 struct member 'amd' not described in 'io_pgtable_cfg' Warning: include/linux/io-pgtable.h:223 struct member 'read_and_clear_dirty' not described in 'io_pgtable_ops' Warning: include/linux/io-pgtable.h:237 No description found for return value of 'alloc_io_pgtable_ops' Signed-off-by: Randy Dunlap Reviewed-by: Jason Gunthorpe Signed-off-by: Joerg Roedel --- include/linux/io-pgtable.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h index 7a1516011ccf..e19872e37e06 100644 --- a/include/linux/io-pgtable.h +++ b/include/linux/io-pgtable.h @@ -53,7 +53,7 @@ struct iommu_flush_ops { * tables. * @ias: Input address (iova) size, in bits. * @oas: Output address (paddr) size, in bits. - * @coherent_walk A flag to indicate whether or not page table walks made + * @coherent_walk: A flag to indicate whether or not page table walks made * by the IOMMU are coherent with the CPU caches. * @tlb: TLB management callbacks for this set of tables. * @iommu_dev: The device representing the DMA configuration for the @@ -136,6 +136,7 @@ struct io_pgtable_cfg { void (*free)(void *cookie, void *pages, size_t size); /* Low-level data specific to the table format */ + /* private: */ union { struct { u64 ttbr; @@ -203,6 +204,9 @@ struct arm_lpae_io_pgtable_walk_data { * @unmap_pages: Unmap a range of virtually contiguous pages of the same size. * @iova_to_phys: Translate iova to physical address. * @pgtable_walk: (optional) Perform a page table walk for a given iova. + * @read_and_clear_dirty: Record dirty info per IOVA. If an IOVA is dirty, + * clear its dirty state from the PTE unless the + * IOMMU_DIRTY_NO_CLEAR flag is passed in. * * These functions map directly onto the iommu_ops member functions with * the same names. @@ -231,7 +235,9 @@ struct io_pgtable_ops { * the configuration actually provided by the allocator (e.g. the * pgsize_bitmap may be restricted). * @cookie: An opaque token provided by the IOMMU driver and passed back to - * the callback routines in cfg->tlb. + * the callback routines. + * + * Returns: Pointer to the &struct io_pgtable_ops for this set of page tables. */ struct io_pgtable_ops *alloc_io_pgtable_ops(enum io_pgtable_fmt fmt, struct io_pgtable_cfg *cfg, -- cgit v1.2.3 From cb3d1049f4ea77d5ad93f17d8ac1f2ed4da70501 Mon Sep 17 00:00:00 2001 From: Danilo Krummrich Date: Tue, 3 Mar 2026 12:53:18 +0100 Subject: driver core: generalize driver_override in struct device Currently, there are 12 busses (including platform and PCI) that duplicate the driver_override logic for their individual devices. All of them seem to be prone to the bug described in [1]. While this could be solved for every bus individually using a separate lock, solving this in the driver-core generically results in less (and cleaner) changes overall. Thus, move driver_override to struct device, provide corresponding accessors for busses and handle locking with a separate lock internally. In particular, add device_set_driver_override(), device_has_driver_override(), device_match_driver_override() and generalize the sysfs store() and show() callbacks via a driver_override feature flag in struct bus_type. Until all busses have migrated, keep driver_set_override() in place. Note that we can't use the device lock for the reasons described in [2]. Link: https://bugzilla.kernel.org/show_bug.cgi?id=220789 [1] Link: https://lore.kernel.org/driver-core/DGRGTIRHA62X.3RY09D9SOK77P@kernel.org/ [2] Tested-by: Gui-Dong Han Co-developed-by: Gui-Dong Han Signed-off-by: Gui-Dong Han Reviewed-by: Greg Kroah-Hartman Link: https://patch.msgid.link/20260303115720.48783-2-dakr@kernel.org [ Use dev->bus instead of sp->bus for consistency; fix commit message to refer to the struct bus_type's driver_override feature flag. - Danilo ] Signed-off-by: Danilo Krummrich --- drivers/base/bus.c | 43 ++++++++++++++++++++++++++++++++- drivers/base/core.c | 2 ++ drivers/base/dd.c | 60 ++++++++++++++++++++++++++++++++++++++++++++++ include/linux/device.h | 54 +++++++++++++++++++++++++++++++++++++++++ include/linux/device/bus.h | 4 ++++ 5 files changed, 162 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/base/bus.c b/drivers/base/bus.c index bb61d8adbab1..8b6722ff8590 100644 --- a/drivers/base/bus.c +++ b/drivers/base/bus.c @@ -504,6 +504,36 @@ int bus_for_each_drv(const struct bus_type *bus, struct device_driver *start, } EXPORT_SYMBOL_GPL(bus_for_each_drv); +static ssize_t driver_override_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + int ret; + + ret = __device_set_driver_override(dev, buf, count); + if (ret) + return ret; + + return count; +} + +static ssize_t driver_override_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + guard(spinlock)(&dev->driver_override.lock); + return sysfs_emit(buf, "%s\n", dev->driver_override.name); +} +static DEVICE_ATTR_RW(driver_override); + +static struct attribute *driver_override_dev_attrs[] = { + &dev_attr_driver_override.attr, + NULL, +}; + +static const struct attribute_group driver_override_dev_group = { + .attrs = driver_override_dev_attrs, +}; + /** * bus_add_device - add device to bus * @dev: device being added @@ -537,9 +567,15 @@ int bus_add_device(struct device *dev) if (error) goto out_put; + if (dev->bus->driver_override) { + error = device_add_group(dev, &driver_override_dev_group); + if (error) + goto out_groups; + } + error = sysfs_create_link(&sp->devices_kset->kobj, &dev->kobj, dev_name(dev)); if (error) - goto out_groups; + goto out_override; error = sysfs_create_link(&dev->kobj, &sp->subsys.kobj, "subsystem"); if (error) @@ -550,6 +586,9 @@ int bus_add_device(struct device *dev) out_subsys: sysfs_remove_link(&sp->devices_kset->kobj, dev_name(dev)); +out_override: + if (dev->bus->driver_override) + device_remove_group(dev, &driver_override_dev_group); out_groups: device_remove_groups(dev, sp->bus->dev_groups); out_put: @@ -607,6 +646,8 @@ void bus_remove_device(struct device *dev) sysfs_remove_link(&dev->kobj, "subsystem"); sysfs_remove_link(&sp->devices_kset->kobj, dev_name(dev)); + if (dev->bus->driver_override) + device_remove_group(dev, &driver_override_dev_group); device_remove_groups(dev, dev->bus->dev_groups); if (klist_node_attached(&dev->p->knode_bus)) klist_del(&dev->p->knode_bus); diff --git a/drivers/base/core.c b/drivers/base/core.c index 791f9e444df8..09b98f02f559 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -2556,6 +2556,7 @@ static void device_release(struct kobject *kobj) devres_release_all(dev); kfree(dev->dma_range_map); + kfree(dev->driver_override.name); if (dev->release) dev->release(dev); @@ -3159,6 +3160,7 @@ void device_initialize(struct device *dev) kobject_init(&dev->kobj, &device_ktype); INIT_LIST_HEAD(&dev->dma_pools); mutex_init(&dev->mutex); + spin_lock_init(&dev->driver_override.lock); lockdep_set_novalidate_class(&dev->mutex); spin_lock_init(&dev->devres_lock); INIT_LIST_HEAD(&dev->devres_head); diff --git a/drivers/base/dd.c b/drivers/base/dd.c index bea8da5f8a3a..37c7e54e0e4c 100644 --- a/drivers/base/dd.c +++ b/drivers/base/dd.c @@ -381,6 +381,66 @@ static void __exit deferred_probe_exit(void) } __exitcall(deferred_probe_exit); +int __device_set_driver_override(struct device *dev, const char *s, size_t len) +{ + const char *new, *old; + char *cp; + + if (!s) + return -EINVAL; + + /* + * The stored value will be used in sysfs show callback (sysfs_emit()), + * which has a length limit of PAGE_SIZE and adds a trailing newline. + * Thus we can store one character less to avoid truncation during sysfs + * show. + */ + if (len >= (PAGE_SIZE - 1)) + return -EINVAL; + + /* + * Compute the real length of the string in case userspace sends us a + * bunch of \0 characters like python likes to do. + */ + len = strlen(s); + + if (!len) { + /* Empty string passed - clear override */ + spin_lock(&dev->driver_override.lock); + old = dev->driver_override.name; + dev->driver_override.name = NULL; + spin_unlock(&dev->driver_override.lock); + kfree(old); + + return 0; + } + + cp = strnchr(s, len, '\n'); + if (cp) + len = cp - s; + + new = kstrndup(s, len, GFP_KERNEL); + if (!new) + return -ENOMEM; + + spin_lock(&dev->driver_override.lock); + old = dev->driver_override.name; + if (cp != s) { + dev->driver_override.name = new; + spin_unlock(&dev->driver_override.lock); + } else { + /* "\n" passed - clear override */ + dev->driver_override.name = NULL; + spin_unlock(&dev->driver_override.lock); + + kfree(new); + } + kfree(old); + + return 0; +} +EXPORT_SYMBOL_GPL(__device_set_driver_override); + /** * device_is_bound() - Check if device is bound to a driver * @dev: device to check diff --git a/include/linux/device.h b/include/linux/device.h index 0be95294b6e6..e65d564f01cd 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -483,6 +483,8 @@ struct device_physical_location { * on. This shrinks the "Board Support Packages" (BSPs) and * minimizes board-specific #ifdefs in drivers. * @driver_data: Private pointer for driver specific info. + * @driver_override: Driver name to force a match. Do not touch directly; use + * device_set_driver_override() instead. * @links: Links to suppliers and consumers of this device. * @power: For device power management. * See Documentation/driver-api/pm/devices.rst for details. @@ -576,6 +578,10 @@ struct device { core doesn't touch it */ void *driver_data; /* Driver data, set and get with dev_set_drvdata/dev_get_drvdata */ + struct { + const char *name; + spinlock_t lock; + } driver_override; struct mutex mutex; /* mutex to synchronize calls to * its driver. */ @@ -701,6 +707,54 @@ struct device_link { #define kobj_to_dev(__kobj) container_of_const(__kobj, struct device, kobj) +int __device_set_driver_override(struct device *dev, const char *s, size_t len); + +/** + * device_set_driver_override() - Helper to set or clear driver override. + * @dev: Device to change + * @s: NUL-terminated string, new driver name to force a match, pass empty + * string to clear it ("" or "\n", where the latter is only for sysfs + * interface). + * + * Helper to set or clear driver override of a device. + * + * Returns: 0 on success or a negative error code on failure. + */ +static inline int device_set_driver_override(struct device *dev, const char *s) +{ + return __device_set_driver_override(dev, s, s ? strlen(s) : 0); +} + +/** + * device_has_driver_override() - Check if a driver override has been set. + * @dev: device to check + * + * Returns true if a driver override has been set for this device. + */ +static inline bool device_has_driver_override(struct device *dev) +{ + guard(spinlock)(&dev->driver_override.lock); + return !!dev->driver_override.name; +} + +/** + * device_match_driver_override() - Match a driver against the device's driver_override. + * @dev: device to check + * @drv: driver to match against + * + * Returns > 0 if a driver override is set and matches the given driver, 0 if a + * driver override is set but does not match, or < 0 if a driver override is not + * set at all. + */ +static inline int device_match_driver_override(struct device *dev, + const struct device_driver *drv) +{ + guard(spinlock)(&dev->driver_override.lock); + if (dev->driver_override.name) + return !strcmp(dev->driver_override.name, drv->name); + return -1; +} + /** * device_iommu_mapped - Returns true when the device DMA is translated * by an IOMMU diff --git a/include/linux/device/bus.h b/include/linux/device/bus.h index 63de5f053c33..c1b463cd6464 100644 --- a/include/linux/device/bus.h +++ b/include/linux/device/bus.h @@ -65,6 +65,9 @@ struct fwnode_handle; * this bus. * @pm: Power management operations of this bus, callback the specific * device driver's pm-ops. + * @driver_override: Set to true if this bus supports the driver_override + * mechanism, which allows userspace to force a specific + * driver to bind to a device via a sysfs attribute. * @need_parent_lock: When probing or removing a device on this bus, the * device core should lock the device's parent. * @@ -106,6 +109,7 @@ struct bus_type { const struct dev_pm_ops *pm; + bool driver_override; bool need_parent_lock; }; -- cgit v1.2.3 From 2b38efc05bf7a8568ec74bfffea0f5cfa62bc01d Mon Sep 17 00:00:00 2001 From: Danilo Krummrich Date: Tue, 3 Mar 2026 12:53:21 +0100 Subject: driver core: platform: use generic driver_override infrastructure When a driver is probed through __driver_attach(), the bus' match() callback is called without the device lock held, thus accessing the driver_override field without a lock, which can cause a UAF. Fix this by using the driver-core driver_override infrastructure taking care of proper locking internally. Note that calling match() from __driver_attach() without the device lock held is intentional. [1] Link: https://lore.kernel.org/driver-core/DGRGTIRHA62X.3RY09D9SOK77P@kernel.org/ [1] Reported-by: Gui-Dong Han Closes: https://bugzilla.kernel.org/show_bug.cgi?id=220789 Fixes: 3d713e0e382e ("driver core: platform: add device binding path 'driver_override'") Reviewed-by: Greg Kroah-Hartman Link: https://patch.msgid.link/20260303115720.48783-5-dakr@kernel.org Signed-off-by: Danilo Krummrich --- drivers/base/platform.c | 37 +++++-------------------------------- drivers/bus/simple-pm-bus.c | 4 ++-- drivers/clk/imx/clk-scu.c | 3 +-- drivers/slimbus/qcom-ngd-ctrl.c | 6 ++---- include/linux/platform_device.h | 5 ----- sound/soc/samsung/i2s.c | 6 +++--- 6 files changed, 13 insertions(+), 48 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/platform.c b/drivers/base/platform.c index b45d41b018ca..d44591d52e36 100644 --- a/drivers/base/platform.c +++ b/drivers/base/platform.c @@ -603,7 +603,6 @@ static void platform_device_release(struct device *dev) kfree(pa->pdev.dev.platform_data); kfree(pa->pdev.mfd_cell); kfree(pa->pdev.resource); - kfree(pa->pdev.driver_override); kfree(pa); } @@ -1306,38 +1305,9 @@ static ssize_t numa_node_show(struct device *dev, } static DEVICE_ATTR_RO(numa_node); -static ssize_t driver_override_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct platform_device *pdev = to_platform_device(dev); - ssize_t len; - - device_lock(dev); - len = sysfs_emit(buf, "%s\n", pdev->driver_override); - device_unlock(dev); - - return len; -} - -static ssize_t driver_override_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) -{ - struct platform_device *pdev = to_platform_device(dev); - int ret; - - ret = driver_set_override(dev, &pdev->driver_override, buf, count); - if (ret) - return ret; - - return count; -} -static DEVICE_ATTR_RW(driver_override); - static struct attribute *platform_dev_attrs[] = { &dev_attr_modalias.attr, &dev_attr_numa_node.attr, - &dev_attr_driver_override.attr, NULL, }; @@ -1377,10 +1347,12 @@ static int platform_match(struct device *dev, const struct device_driver *drv) { struct platform_device *pdev = to_platform_device(dev); struct platform_driver *pdrv = to_platform_driver(drv); + int ret; /* When driver_override is set, only bind to the matching driver */ - if (pdev->driver_override) - return !strcmp(pdev->driver_override, drv->name); + ret = device_match_driver_override(dev, drv); + if (ret >= 0) + return ret; /* Attempt an OF style match first */ if (of_driver_match_device(dev, drv)) @@ -1516,6 +1488,7 @@ static const struct dev_pm_ops platform_dev_pm_ops = { const struct bus_type platform_bus_type = { .name = "platform", .dev_groups = platform_dev_groups, + .driver_override = true, .match = platform_match, .uevent = platform_uevent, .probe = platform_probe, diff --git a/drivers/bus/simple-pm-bus.c b/drivers/bus/simple-pm-bus.c index 3f00d953fb9a..c920bd6fbaaf 100644 --- a/drivers/bus/simple-pm-bus.c +++ b/drivers/bus/simple-pm-bus.c @@ -36,7 +36,7 @@ static int simple_pm_bus_probe(struct platform_device *pdev) * that's not listed in simple_pm_bus_of_match. We don't want to do any * of the simple-pm-bus tasks for these devices, so return early. */ - if (pdev->driver_override) + if (device_has_driver_override(&pdev->dev)) return 0; match = of_match_device(dev->driver->of_match_table, dev); @@ -78,7 +78,7 @@ static void simple_pm_bus_remove(struct platform_device *pdev) { const void *data = of_device_get_match_data(&pdev->dev); - if (pdev->driver_override || data) + if (device_has_driver_override(&pdev->dev) || data) return; dev_dbg(&pdev->dev, "%s\n", __func__); diff --git a/drivers/clk/imx/clk-scu.c b/drivers/clk/imx/clk-scu.c index a85ec48a798b..9b33df9967ec 100644 --- a/drivers/clk/imx/clk-scu.c +++ b/drivers/clk/imx/clk-scu.c @@ -706,8 +706,7 @@ struct clk_hw *imx_clk_scu_alloc_dev(const char *name, if (ret) goto put_device; - ret = driver_set_override(&pdev->dev, &pdev->driver_override, - "imx-scu-clk", strlen("imx-scu-clk")); + ret = device_set_driver_override(&pdev->dev, "imx-scu-clk"); if (ret) goto put_device; diff --git a/drivers/slimbus/qcom-ngd-ctrl.c b/drivers/slimbus/qcom-ngd-ctrl.c index 9aa7218b4e8d..1ed6be6e85d2 100644 --- a/drivers/slimbus/qcom-ngd-ctrl.c +++ b/drivers/slimbus/qcom-ngd-ctrl.c @@ -1535,10 +1535,8 @@ static int of_qcom_slim_ngd_register(struct device *parent, ngd->id = id; ngd->pdev->dev.parent = parent; - ret = driver_set_override(&ngd->pdev->dev, - &ngd->pdev->driver_override, - QCOM_SLIM_NGD_DRV_NAME, - strlen(QCOM_SLIM_NGD_DRV_NAME)); + ret = device_set_driver_override(&ngd->pdev->dev, + QCOM_SLIM_NGD_DRV_NAME); if (ret) { platform_device_put(ngd->pdev); kfree(ngd); diff --git a/include/linux/platform_device.h b/include/linux/platform_device.h index 813da101b5bf..ed1d50d1c3c1 100644 --- a/include/linux/platform_device.h +++ b/include/linux/platform_device.h @@ -31,11 +31,6 @@ struct platform_device { struct resource *resource; const struct platform_device_id *id_entry; - /* - * Driver name to force a match. Do not set directly, because core - * frees it. Use driver_set_override() to set or clear it. - */ - const char *driver_override; /* MFD cell pointer */ struct mfd_cell *mfd_cell; diff --git a/sound/soc/samsung/i2s.c b/sound/soc/samsung/i2s.c index e9964f0e010a..140907a41a70 100644 --- a/sound/soc/samsung/i2s.c +++ b/sound/soc/samsung/i2s.c @@ -1360,10 +1360,10 @@ static int i2s_create_secondary_device(struct samsung_i2s_priv *priv) if (!pdev_sec) return -ENOMEM; - pdev_sec->driver_override = kstrdup("samsung-i2s", GFP_KERNEL); - if (!pdev_sec->driver_override) { + ret = device_set_driver_override(&pdev_sec->dev, "samsung-i2s"); + if (ret) { platform_device_put(pdev_sec); - return -ENOMEM; + return ret; } ret = platform_device_add(pdev_sec); -- cgit v1.2.3 From 418eab7a6f3c002d8e64d6e95ec27118017019af Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 19 Mar 2026 14:29:20 -0600 Subject: io_uring/kbuf: propagate BUF_MORE through early buffer commit path When io_should_commit() returns true (eg for non-pollable files), buffer commit happens at buffer selection time and sel->buf_list is set to NULL. When __io_put_kbufs() generates CQE flags at completion time, it calls __io_put_kbuf_ring() which finds a NULL buffer_list and hence cannot determine whether the buffer was consumed or not. This means that IORING_CQE_F_BUF_MORE is never set for non-pollable input with incrementally consumed buffers. Likewise for io_buffers_select(), which always commits upfront and discards the return value of io_kbuf_commit(). Add REQ_F_BUF_MORE to store the result of io_kbuf_commit() during early commit. Then __io_put_kbuf_ring() can check this flag and set IORING_F_BUF_MORE accordingy. Reported-by: Martin Michaelis Cc: stable@vger.kernel.org Fixes: ae98dbf43d75 ("io_uring/kbuf: add support for incremental buffer consumption") Link: https://github.com/axboe/liburing/issues/1553 Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 3 +++ io_uring/kbuf.c | 10 +++++++--- 2 files changed, 10 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index dd1420bfcb73..214fdbd49052 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -541,6 +541,7 @@ enum { REQ_F_BL_NO_RECYCLE_BIT, REQ_F_BUFFERS_COMMIT_BIT, REQ_F_BUF_NODE_BIT, + REQ_F_BUF_MORE_BIT, REQ_F_HAS_METADATA_BIT, REQ_F_IMPORT_BUFFER_BIT, REQ_F_SQE_COPIED_BIT, @@ -626,6 +627,8 @@ enum { REQ_F_BUFFERS_COMMIT = IO_REQ_FLAG(REQ_F_BUFFERS_COMMIT_BIT), /* buf node is valid */ REQ_F_BUF_NODE = IO_REQ_FLAG(REQ_F_BUF_NODE_BIT), + /* incremental buffer consumption, more space available */ + REQ_F_BUF_MORE = IO_REQ_FLAG(REQ_F_BUF_MORE_BIT), /* request has read/write metadata assigned */ REQ_F_HAS_METADATA = IO_REQ_FLAG(REQ_F_HAS_METADATA_BIT), /* diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index a4cb6752b7aa..f72f38d22d2b 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -216,7 +216,8 @@ static struct io_br_sel io_ring_buffer_select(struct io_kiocb *req, size_t *len, sel.addr = u64_to_user_ptr(READ_ONCE(buf->addr)); if (io_should_commit(req, issue_flags)) { - io_kbuf_commit(req, sel.buf_list, *len, 1); + if (!io_kbuf_commit(req, sel.buf_list, *len, 1)) + req->flags |= REQ_F_BUF_MORE; sel.buf_list = NULL; } return sel; @@ -349,7 +350,8 @@ int io_buffers_select(struct io_kiocb *req, struct buf_sel_arg *arg, */ if (ret > 0) { req->flags |= REQ_F_BUFFERS_COMMIT | REQ_F_BL_NO_RECYCLE; - io_kbuf_commit(req, sel->buf_list, arg->out_len, ret); + if (!io_kbuf_commit(req, sel->buf_list, arg->out_len, ret)) + req->flags |= REQ_F_BUF_MORE; } } else { ret = io_provided_buffers_select(req, &arg->out_len, sel->buf_list, arg->iovs); @@ -395,8 +397,10 @@ static inline bool __io_put_kbuf_ring(struct io_kiocb *req, if (bl) ret = io_kbuf_commit(req, bl, len, nr); + if (ret && (req->flags & REQ_F_BUF_MORE)) + ret = false; - req->flags &= ~REQ_F_BUFFER_RING; + req->flags &= ~(REQ_F_BUFFER_RING | REQ_F_BUF_MORE); return ret; } -- cgit v1.2.3 From 9bb0a4d6a4433b75274204b083dac8e515d2007d Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 16 Mar 2026 21:06:47 +0200 Subject: dma-mapping: Clarify valid conditions for CPU cache line overlap Rename the DMA_ATTR_CPU_CACHE_CLEAN attribute to better reflect that it is debugging aid to inform DMA core code that CPU cache line overlaps are allowed, and refine the documentation describing its use. Signed-off-by: Leon Romanovsky Signed-off-by: Marek Szyprowski Link: https://lore.kernel.org/r/20260316-dma-debug-overlap-v3-3-1dde90a7f08b@nvidia.com --- Documentation/core-api/dma-attributes.rst | 22 ++++++++++++++-------- drivers/virtio/virtio_ring.c | 10 +++++----- include/linux/dma-mapping.h | 8 ++++---- include/trace/events/dma.h | 2 +- kernel/dma/debug.c | 2 +- 5 files changed, 25 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/Documentation/core-api/dma-attributes.rst b/Documentation/core-api/dma-attributes.rst index 1d7bfad73b1c..48cfe86cc06d 100644 --- a/Documentation/core-api/dma-attributes.rst +++ b/Documentation/core-api/dma-attributes.rst @@ -149,11 +149,17 @@ For architectures that require cache flushing for DMA coherence DMA_ATTR_MMIO will not perform any cache flushing. The address provided must never be mapped cacheable into the CPU. -DMA_ATTR_CPU_CACHE_CLEAN ------------------------- - -This attribute indicates the CPU will not dirty any cacheline overlapping this -DMA_FROM_DEVICE/DMA_BIDIRECTIONAL buffer while it is mapped. This allows -multiple small buffers to safely share a cacheline without risk of data -corruption, suppressing DMA debug warnings about overlapping mappings. -All mappings sharing a cacheline should have this attribute. +DMA_ATTR_DEBUGGING_IGNORE_CACHELINES +------------------------------------ + +This attribute indicates that CPU cache lines may overlap for buffers mapped +with DMA_FROM_DEVICE or DMA_BIDIRECTIONAL. + +Such overlap may occur when callers map multiple small buffers that reside +within the same cache line. In this case, callers must guarantee that the CPU +will not dirty these cache lines after the mappings are established. When this +condition is met, multiple buffers can safely share a cache line without risking +data corruption. + +All mappings that share a cache line must set this attribute to suppress DMA +debug warnings about overlapping mappings. diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 335692d41617..fbca7ce1c6bf 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -2912,10 +2912,10 @@ EXPORT_SYMBOL_GPL(virtqueue_add_inbuf); * @data: the token identifying the buffer. * @gfp: how to do memory allocations (if necessary). * - * Same as virtqueue_add_inbuf but passes DMA_ATTR_CPU_CACHE_CLEAN to indicate - * that the CPU will not dirty any cacheline overlapping this buffer while it - * is available, and to suppress overlapping cacheline warnings in DMA debug - * builds. + * Same as virtqueue_add_inbuf but passes DMA_ATTR_DEBUGGING_IGNORE_CACHELINES + * to indicate that the CPU will not dirty any cacheline overlapping this buffer + * while it is available, and to suppress overlapping cacheline warnings in DMA + * debug builds. * * Caller must ensure we don't call this with other virtqueue operations * at the same time (except where noted). @@ -2928,7 +2928,7 @@ int virtqueue_add_inbuf_cache_clean(struct virtqueue *vq, gfp_t gfp) { return virtqueue_add(vq, &sg, num, 0, 1, data, NULL, false, gfp, - DMA_ATTR_CPU_CACHE_CLEAN); + DMA_ATTR_DEBUGGING_IGNORE_CACHELINES); } EXPORT_SYMBOL_GPL(virtqueue_add_inbuf_cache_clean); diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 29973baa0581..da44394b3a1a 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -80,11 +80,11 @@ #define DMA_ATTR_MMIO (1UL << 10) /* - * DMA_ATTR_CPU_CACHE_CLEAN: Indicates the CPU will not dirty any cacheline - * overlapping this buffer while it is mapped for DMA. All mappings sharing - * a cacheline must have this attribute for this to be considered safe. + * DMA_ATTR_DEBUGGING_IGNORE_CACHELINES: Indicates the CPU cache line can be + * overlapped. All mappings sharing a cacheline must have this attribute for + * this to be considered safe. */ -#define DMA_ATTR_CPU_CACHE_CLEAN (1UL << 11) +#define DMA_ATTR_DEBUGGING_IGNORE_CACHELINES (1UL << 11) /* * A dma_addr_t can hold any valid DMA or bus address for the platform. It can diff --git a/include/trace/events/dma.h b/include/trace/events/dma.h index 69cb3805ee81..8c64bc0721fe 100644 --- a/include/trace/events/dma.h +++ b/include/trace/events/dma.h @@ -33,7 +33,7 @@ TRACE_DEFINE_ENUM(DMA_NONE); { DMA_ATTR_NO_WARN, "NO_WARN" }, \ { DMA_ATTR_PRIVILEGED, "PRIVILEGED" }, \ { DMA_ATTR_MMIO, "MMIO" }, \ - { DMA_ATTR_CPU_CACHE_CLEAN, "CACHE_CLEAN" }) + { DMA_ATTR_DEBUGGING_IGNORE_CACHELINES, "CACHELINES_OVERLAP" }) DECLARE_EVENT_CLASS(dma_map, TP_PROTO(struct device *dev, phys_addr_t phys_addr, dma_addr_t dma_addr, diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index be207be74996..83e1cfe05f08 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -601,7 +601,7 @@ static void add_dma_entry(struct dma_debug_entry *entry, unsigned long attrs) unsigned long flags; int rc; - entry->is_cache_clean = !!(attrs & DMA_ATTR_CPU_CACHE_CLEAN); + entry->is_cache_clean = attrs & DMA_ATTR_DEBUGGING_IGNORE_CACHELINES; bucket = get_hash_bucket(entry, &flags); hash_bucket_add(bucket, entry); -- cgit v1.2.3 From e6a58fa2556203a7f6731b4071705dc81cca5ca5 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 16 Mar 2026 21:06:48 +0200 Subject: dma-mapping: Introduce DMA require coherency attribute The mapping buffers which carry this attribute require DMA coherent system. This means that they can't take SWIOTLB path, can perform CPU cache overlap and doesn't perform cache flushing. Signed-off-by: Leon Romanovsky Signed-off-by: Marek Szyprowski Link: https://lore.kernel.org/r/20260316-dma-debug-overlap-v3-4-1dde90a7f08b@nvidia.com --- Documentation/core-api/dma-attributes.rst | 16 ++++++++++++++++ include/linux/dma-mapping.h | 7 +++++++ include/trace/events/dma.h | 3 ++- kernel/dma/debug.c | 3 ++- kernel/dma/mapping.c | 6 ++++++ 5 files changed, 33 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/Documentation/core-api/dma-attributes.rst b/Documentation/core-api/dma-attributes.rst index 48cfe86cc06d..123c8468d58f 100644 --- a/Documentation/core-api/dma-attributes.rst +++ b/Documentation/core-api/dma-attributes.rst @@ -163,3 +163,19 @@ data corruption. All mappings that share a cache line must set this attribute to suppress DMA debug warnings about overlapping mappings. + +DMA_ATTR_REQUIRE_COHERENT +------------------------- + +DMA mapping requests with the DMA_ATTR_REQUIRE_COHERENT fail on any +system where SWIOTLB or cache management is required. This should only +be used to support uAPI designs that require continuous HW DMA +coherence with userspace processes, for example RDMA and DRM. At a +minimum the memory being mapped must be userspace memory from +pin_user_pages() or similar. + +Drivers should consider using dma_mmap_pages() instead of this +interface when building their uAPIs, when possible. + +It must never be used in an in-kernel driver that only works with +kernel memory. diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index da44394b3a1a..482b919f040f 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -86,6 +86,13 @@ */ #define DMA_ATTR_DEBUGGING_IGNORE_CACHELINES (1UL << 11) +/* + * DMA_ATTR_REQUIRE_COHERENT: Indicates that DMA coherency is required. + * All mappings that carry this attribute can't work with SWIOTLB and cache + * flushing. + */ +#define DMA_ATTR_REQUIRE_COHERENT (1UL << 12) + /* * A dma_addr_t can hold any valid DMA or bus address for the platform. It can * be given to a device to use as a DMA source or target. It is specific to a diff --git a/include/trace/events/dma.h b/include/trace/events/dma.h index 8c64bc0721fe..63597b004424 100644 --- a/include/trace/events/dma.h +++ b/include/trace/events/dma.h @@ -33,7 +33,8 @@ TRACE_DEFINE_ENUM(DMA_NONE); { DMA_ATTR_NO_WARN, "NO_WARN" }, \ { DMA_ATTR_PRIVILEGED, "PRIVILEGED" }, \ { DMA_ATTR_MMIO, "MMIO" }, \ - { DMA_ATTR_DEBUGGING_IGNORE_CACHELINES, "CACHELINES_OVERLAP" }) + { DMA_ATTR_DEBUGGING_IGNORE_CACHELINES, "CACHELINES_OVERLAP" }, \ + { DMA_ATTR_REQUIRE_COHERENT, "REQUIRE_COHERENT" }) DECLARE_EVENT_CLASS(dma_map, TP_PROTO(struct device *dev, phys_addr_t phys_addr, dma_addr_t dma_addr, diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index 83e1cfe05f08..0677918f06a8 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -601,7 +601,8 @@ static void add_dma_entry(struct dma_debug_entry *entry, unsigned long attrs) unsigned long flags; int rc; - entry->is_cache_clean = attrs & DMA_ATTR_DEBUGGING_IGNORE_CACHELINES; + entry->is_cache_clean = attrs & (DMA_ATTR_DEBUGGING_IGNORE_CACHELINES | + DMA_ATTR_REQUIRE_COHERENT); bucket = get_hash_bucket(entry, &flags); hash_bucket_add(bucket, entry); diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 3928a509c44c..6d3dd0bd3a88 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -164,6 +164,9 @@ dma_addr_t dma_map_phys(struct device *dev, phys_addr_t phys, size_t size, if (WARN_ON_ONCE(!dev->dma_mask)) return DMA_MAPPING_ERROR; + if (!dev_is_dma_coherent(dev) && (attrs & DMA_ATTR_REQUIRE_COHERENT)) + return DMA_MAPPING_ERROR; + if (dma_map_direct(dev, ops) || (!is_mmio && arch_dma_map_phys_direct(dev, phys + size))) addr = dma_direct_map_phys(dev, phys, size, dir, attrs); @@ -235,6 +238,9 @@ static int __dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, BUG_ON(!valid_dma_direction(dir)); + if (!dev_is_dma_coherent(dev) && (attrs & DMA_ATTR_REQUIRE_COHERENT)) + return -EOPNOTSUPP; + if (WARN_ON_ONCE(!dev->dma_mask)) return 0; -- cgit v1.2.3 From 1613462be621ad5103ec338a7b0ca0746ec4e5f1 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Tue, 14 Oct 2025 13:28:15 +0200 Subject: xen/privcmd: add boot control for restricted usage in domU When running in an unprivileged domU under Xen, the privcmd driver is restricted to allow only hypercalls against a target domain, for which the current domU is acting as a device model. Add a boot parameter "unrestricted" to allow all hypercalls (the hypervisor will still refuse destructive hypercalls affecting other guests). Make this new parameter effective only in case the domU wasn't started using secure boot, as otherwise hypercalls targeting the domU itself might result in violating the secure boot functionality. This is achieved by adding another lockdown reason, which can be tested to not being set when applying the "unrestricted" option. This is part of XSA-482 Signed-off-by: Juergen Gross --- V2: - new patch --- drivers/xen/privcmd.c | 13 +++++++++++++ include/linux/security.h | 1 + security/security.c | 1 + 3 files changed, 15 insertions(+) (limited to 'include/linux') diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c index a83bad69f4f2..bbf9ee21306c 100644 --- a/drivers/xen/privcmd.c +++ b/drivers/xen/privcmd.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include @@ -72,6 +73,11 @@ module_param_named(dm_op_buf_max_size, privcmd_dm_op_buf_max_size, uint, MODULE_PARM_DESC(dm_op_buf_max_size, "Maximum size of a dm_op hypercall buffer"); +static bool unrestricted; +module_param(unrestricted, bool, 0); +MODULE_PARM_DESC(unrestricted, + "Don't restrict hypercalls to target domain if running in a domU"); + struct privcmd_data { domid_t domid; }; @@ -1708,6 +1714,13 @@ static struct notifier_block xenstore_notifier = { static void __init restrict_driver(void) { + if (unrestricted) { + if (security_locked_down(LOCKDOWN_XEN_USER_ACTIONS)) + pr_warn("Kernel is locked down, parameter \"unrestricted\" ignored\n"); + else + return; + } + restrict_wait = true; register_xenstore_notifier(&xenstore_notifier); diff --git a/include/linux/security.h b/include/linux/security.h index 83a646d72f6f..ee88dd2d2d1f 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -145,6 +145,7 @@ enum lockdown_reason { LOCKDOWN_BPF_WRITE_USER, LOCKDOWN_DBG_WRITE_KERNEL, LOCKDOWN_RTAS_ERROR_INJECTION, + LOCKDOWN_XEN_USER_ACTIONS, LOCKDOWN_INTEGRITY_MAX, LOCKDOWN_KCORE, LOCKDOWN_KPROBES, diff --git a/security/security.c b/security/security.c index 67af9228c4e9..a26c1474e2e4 100644 --- a/security/security.c +++ b/security/security.c @@ -61,6 +61,7 @@ const char *const lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX + 1] = { [LOCKDOWN_BPF_WRITE_USER] = "use of bpf to write user RAM", [LOCKDOWN_DBG_WRITE_KERNEL] = "use of kgdb/kdb to write kernel RAM", [LOCKDOWN_RTAS_ERROR_INJECTION] = "RTAS error injection", + [LOCKDOWN_XEN_USER_ACTIONS] = "Xen guest user action", [LOCKDOWN_INTEGRITY_MAX] = "integrity", [LOCKDOWN_KCORE] = "/proc/kcore access", [LOCKDOWN_KPROBES] = "use of kprobes", -- cgit v1.2.3 From 26f775a054c3cda86ad465a64141894a90a9e145 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 19 Mar 2026 07:52:17 -0700 Subject: mm/damon/core: avoid use of half-online-committed context One major usage of damon_call() is online DAMON parameters update. It is done by calling damon_commit_ctx() inside the damon_call() callback function. damon_commit_ctx() can fail for two reasons: 1) invalid parameters and 2) internal memory allocation failures. In case of failures, the damon_ctx that attempted to be updated (commit destination) can be partially updated (or, corrupted from a perspective), and therefore shouldn't be used anymore. The function only ensures the damon_ctx object can safely deallocated using damon_destroy_ctx(). The API callers are, however, calling damon_commit_ctx() only after asserting the parameters are valid, to avoid damon_commit_ctx() fails due to invalid input parameters. But it can still theoretically fail if the internal memory allocation fails. In the case, DAMON may run with the partially updated damon_ctx. This can result in unexpected behaviors including even NULL pointer dereference in case of damos_commit_dests() failure [1]. Such allocation failure is arguably too small to fail, so the real world impact would be rare. But, given the bad consequence, this needs to be fixed. Avoid such partially-committed (maybe-corrupted) damon_ctx use by saving the damon_commit_ctx() failure on the damon_ctx object. For this, introduce damon_ctx->maybe_corrupted field. damon_commit_ctx() sets it when it is failed. kdamond_call() checks if the field is set after each damon_call_control->fn() is executed. If it is set, ignore remaining callback requests and return. All kdamond_call() callers including kdamond_fn() also check the maybe_corrupted field right after kdamond_call() invocations. If the field is set, break the kdamond_fn() main loop so that DAMON sill doesn't use the context that might be corrupted. [sj@kernel.org: let kdamond_call() with cancel regardless of maybe_corrupted] Link: https://lkml.kernel.org/r/20260320031553.2479-1-sj@kernel.org Link: https://sashiko.dev/#/patchset/20260319145218.86197-1-sj%40kernel.org Link: https://lkml.kernel.org/r/20260319145218.86197-1-sj@kernel.org Link: https://lore.kernel.org/20260319043309.97966-1-sj@kernel.org [1] Fixes: 3301f1861d34 ("mm/damon/sysfs: handle commit command using damon_call()") Signed-off-by: SeongJae Park Cc: [6.15+] Signed-off-by: Andrew Morton --- include/linux/damon.h | 6 ++++++ mm/damon/core.c | 8 ++++++++ 2 files changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index a4fea23da857..be3d198043ff 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -810,6 +810,12 @@ struct damon_ctx { struct damos_walk_control *walk_control; struct mutex walk_control_lock; + /* + * indicate if this may be corrupted. Currentonly this is set only for + * damon_commit_ctx() failure. + */ + bool maybe_corrupted; + /* Working thread of the given DAMON context */ struct task_struct *kdamond; /* Protects @kdamond field access */ diff --git a/mm/damon/core.c b/mm/damon/core.c index c1d1091d307e..3e1890d64d06 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -1252,6 +1252,7 @@ int damon_commit_ctx(struct damon_ctx *dst, struct damon_ctx *src) { int err; + dst->maybe_corrupted = true; if (!is_power_of_2(src->min_region_sz)) return -EINVAL; @@ -1277,6 +1278,7 @@ int damon_commit_ctx(struct damon_ctx *dst, struct damon_ctx *src) dst->addr_unit = src->addr_unit; dst->min_region_sz = src->min_region_sz; + dst->maybe_corrupted = false; return 0; } @@ -2678,6 +2680,8 @@ static void kdamond_call(struct damon_ctx *ctx, bool cancel) complete(&control->completion); else if (control->canceled && control->dealloc_on_cancel) kfree(control); + if (!cancel && ctx->maybe_corrupted) + break; } mutex_lock(&ctx->call_controls_lock); @@ -2707,6 +2711,8 @@ static int kdamond_wait_activation(struct damon_ctx *ctx) kdamond_usleep(min_wait_time); kdamond_call(ctx, false); + if (ctx->maybe_corrupted) + return -EINVAL; damos_walk_cancel(ctx); } return -EBUSY; @@ -2790,6 +2796,8 @@ static int kdamond_fn(void *data) * kdamond_merge_regions() if possible, to reduce overhead */ kdamond_call(ctx, false); + if (ctx->maybe_corrupted) + break; if (!list_empty(&ctx->schemes)) kdamond_apply_schemes(ctx); else -- cgit v1.2.3 From 38ec410b99a5ee6566f75650ce3d4fd632940fd0 Mon Sep 17 00:00:00 2001 From: Xuan Zhuo Date: Fri, 20 Mar 2026 10:18:17 +0800 Subject: virtio-net: correct hdr_len handling for VIRTIO_NET_F_GUEST_HDRLEN The commit be50da3e9d4a ("net: virtio_net: implement exact header length guest feature") introduces support for the VIRTIO_NET_F_GUEST_HDRLEN feature in virtio-net. This feature requires virtio-net to set hdr_len to the actual header length of the packet when transmitting, the number of bytes from the start of the packet to the beginning of the transport-layer payload. However, in practice, hdr_len was being set using skb_headlen(skb), which is clearly incorrect. This commit fixes that issue. Fixes: be50da3e9d4a ("net: virtio_net: implement exact header length guest feature") Signed-off-by: Xuan Zhuo Link: https://patch.msgid.link/20260320021818.111741-2-xuanzhuo@linux.alibaba.com Acked-by: Michael S. Tsirkin Signed-off-by: Paolo Abeni --- drivers/net/tun_vnet.h | 2 +- drivers/net/virtio_net.c | 6 +++++- include/linux/virtio_net.h | 34 ++++++++++++++++++++++++++++++---- 3 files changed, 36 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/tun_vnet.h b/drivers/net/tun_vnet.h index a5f93b6c4482..fa5cab9d3e55 100644 --- a/drivers/net/tun_vnet.h +++ b/drivers/net/tun_vnet.h @@ -244,7 +244,7 @@ tun_vnet_hdr_tnl_from_skb(unsigned int flags, if (virtio_net_hdr_tnl_from_skb(skb, tnl_hdr, has_tnl_offload, tun_vnet_is_little_endian(flags), - vlan_hlen, true)) { + vlan_hlen, true, false)) { struct virtio_net_hdr_v1 *hdr = &tnl_hdr->hash_hdr.hdr; struct skb_shared_info *sinfo = skb_shinfo(skb); diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 72d6a9c6a5a2..7106333ef904 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -3267,8 +3267,12 @@ static int xmit_skb(struct send_queue *sq, struct sk_buff *skb, bool orphan) struct virtio_net_hdr_v1_hash_tunnel *hdr; int num_sg; unsigned hdr_len = vi->hdr_len; + bool feature_hdrlen; bool can_push; + feature_hdrlen = virtio_has_feature(vi->vdev, + VIRTIO_NET_F_GUEST_HDRLEN); + pr_debug("%s: xmit %p %pM\n", vi->dev->name, skb, dest); /* Make sure it's safe to cast between formats */ @@ -3288,7 +3292,7 @@ static int xmit_skb(struct send_queue *sq, struct sk_buff *skb, bool orphan) if (virtio_net_hdr_tnl_from_skb(skb, hdr, vi->tx_tnl, virtio_is_little_endian(vi->vdev), 0, - false)) + false, feature_hdrlen)) return -EPROTO; if (vi->mergeable_rx_bufs) diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h index 75dabb763c65..361b60c8be68 100644 --- a/include/linux/virtio_net.h +++ b/include/linux/virtio_net.h @@ -207,6 +207,23 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb, return __virtio_net_hdr_to_skb(skb, hdr, little_endian, hdr->gso_type); } +/* This function must be called after virtio_net_hdr_from_skb(). */ +static inline void __virtio_net_set_hdrlen(const struct sk_buff *skb, + struct virtio_net_hdr *hdr, + bool little_endian) +{ + u16 hdr_len; + + hdr_len = skb_transport_offset(skb); + + if (hdr->gso_type == VIRTIO_NET_HDR_GSO_UDP_L4) + hdr_len += sizeof(struct udphdr); + else + hdr_len += tcp_hdrlen(skb); + + hdr->hdr_len = __cpu_to_virtio16(little_endian, hdr_len); +} + static inline int virtio_net_hdr_from_skb(const struct sk_buff *skb, struct virtio_net_hdr *hdr, bool little_endian, @@ -385,7 +402,8 @@ virtio_net_hdr_tnl_from_skb(const struct sk_buff *skb, bool tnl_hdr_negotiated, bool little_endian, int vlan_hlen, - bool has_data_valid) + bool has_data_valid, + bool feature_hdrlen) { struct virtio_net_hdr *hdr = (struct virtio_net_hdr *)vhdr; unsigned int inner_nh, outer_th; @@ -394,9 +412,17 @@ virtio_net_hdr_tnl_from_skb(const struct sk_buff *skb, tnl_gso_type = skb_shinfo(skb)->gso_type & (SKB_GSO_UDP_TUNNEL | SKB_GSO_UDP_TUNNEL_CSUM); - if (!tnl_gso_type) - return virtio_net_hdr_from_skb(skb, hdr, little_endian, - has_data_valid, vlan_hlen); + if (!tnl_gso_type) { + ret = virtio_net_hdr_from_skb(skb, hdr, little_endian, + has_data_valid, vlan_hlen); + if (ret) + return ret; + + if (feature_hdrlen && hdr->hdr_len) + __virtio_net_set_hdrlen(skb, hdr, little_endian); + + return ret; + } /* Tunnel support not negotiated but skb ask for it. */ if (!tnl_hdr_negotiated) -- cgit v1.2.3 From 6c860dc02a8e60b438e26940227dfa641fcdb66a Mon Sep 17 00:00:00 2001 From: Xuan Zhuo Date: Fri, 20 Mar 2026 10:18:18 +0800 Subject: virtio-net: correct hdr_len handling for tunnel gso The commit a2fb4bc4e2a6a03 ("net: implement virtio helpers to handle UDP GSO tunneling.") introduces support for the UDP GSO tunnel feature in virtio-net. The virtio spec says: If the \field{gso_type} has the VIRTIO_NET_HDR_GSO_UDP_TUNNEL_IPV4 bit or VIRTIO_NET_HDR_GSO_UDP_TUNNEL_IPV6 bit set, \field{hdr_len} accounts for all the headers up to and including the inner transport. The commit did not update the hdr_len to include the inner transport. I observed that the "hdr_len" is 116 for this packet: 17:36:18.241105 52:55:00:d1:27:0a > 2e:2c:df:46:a9:e1, ethertype IPv4 (0x0800), length 2912: (tos 0x0, ttl 64, id 45197, offset 0, flags [none], proto UDP (17), length 2898) 192.168.122.100.50613 > 192.168.122.1.4789: [bad udp cksum 0x8106 -> 0x26a0!] VXLAN, flags [I] (0x08), vni 1 fa:c3:ba:82:05:ee > ce:85:0c:31:77:e5, ethertype IPv4 (0x0800), length 2862: (tos 0x0, ttl 64, id 14678, offset 0, flags [DF], proto TCP (6), length 2848) 192.168.3.1.49880 > 192.168.3.2.9898: Flags [P.], cksum 0x9266 (incorrect -> 0xaa20), seq 515667:518463, ack 1, win 64, options [nop,nop,TS val 2990048824 ecr 2798801412], length 2796 116 = 14(mac) + 20(ip) + 8(udp) + 8(vxlan) + 14(inner mac) + 20(inner ip) + 32(innner tcp) Fixes: a2fb4bc4e2a6a03 ("net: implement virtio helpers to handle UDP GSO tunneling.") Signed-off-by: Xuan Zhuo Link: https://patch.msgid.link/20260320021818.111741-3-xuanzhuo@linux.alibaba.com Acked-by: Michael S. Tsirkin Signed-off-by: Paolo Abeni --- include/linux/virtio_net.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'include/linux') diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h index 361b60c8be68..f36d21b5bc19 100644 --- a/include/linux/virtio_net.h +++ b/include/linux/virtio_net.h @@ -224,6 +224,22 @@ static inline void __virtio_net_set_hdrlen(const struct sk_buff *skb, hdr->hdr_len = __cpu_to_virtio16(little_endian, hdr_len); } +/* This function must be called after virtio_net_hdr_from_skb(). */ +static inline void __virtio_net_set_tnl_hdrlen(const struct sk_buff *skb, + struct virtio_net_hdr *hdr) +{ + u16 hdr_len; + + hdr_len = skb_inner_transport_offset(skb); + + if (hdr->gso_type == VIRTIO_NET_HDR_GSO_UDP_L4) + hdr_len += sizeof(struct udphdr); + else + hdr_len += inner_tcp_hdrlen(skb); + + hdr->hdr_len = __cpu_to_virtio16(true, hdr_len); +} + static inline int virtio_net_hdr_from_skb(const struct sk_buff *skb, struct virtio_net_hdr *hdr, bool little_endian, @@ -440,6 +456,9 @@ virtio_net_hdr_tnl_from_skb(const struct sk_buff *skb, if (ret) return ret; + if (feature_hdrlen && hdr->hdr_len) + __virtio_net_set_tnl_hdrlen(skb, hdr); + if (skb->protocol == htons(ETH_P_IPV6)) hdr->gso_type |= VIRTIO_NET_HDR_GSO_UDP_TUNNEL_IPV6; else -- cgit v1.2.3 From 2cdaff22ed26f1e619aa2b43f27bb84f2c6ef8f8 Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Wed, 25 Mar 2026 02:55:48 +0100 Subject: dma-mapping: add missing `inline` for `dma_free_attrs` Under an UML build for an upcoming series [1], I got `-Wstatic-in-inline` for `dma_free_attrs`: BINDGEN rust/bindings/bindings_generated.rs - due to target missing In file included from rust/helpers/helpers.c:59: rust/helpers/dma.c:17:2: warning: static function 'dma_free_attrs' is used in an inline function with external linkage [-Wstatic-in-inline] 17 | dma_free_attrs(dev, size, cpu_addr, dma_handle, attrs); | ^ rust/helpers/dma.c:12:1: note: use 'static' to give inline function 'rust_helper_dma_free_attrs' internal linkage 12 | __rust_helper void rust_helper_dma_free_attrs(struct device *dev, size_t size, | ^ | static The issue is that `dma_free_attrs` was not marked `inline` when it was introduced alongside the rest of the stubs. Thus mark it. Fixes: ed6ccf10f24b ("dma-mapping: properly stub out the DMA API for !CONFIG_HAS_DMA") Closes: https://lore.kernel.org/rust-for-linux/20260322194616.89847-1-ojeda@kernel.org/ [1] Signed-off-by: Miguel Ojeda Signed-off-by: Marek Szyprowski Link: https://lore.kernel.org/r/20260325015548.70912-1-ojeda@kernel.org --- include/linux/dma-mapping.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 482b919f040f..99ef042ecdb4 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -255,8 +255,8 @@ static inline void *dma_alloc_attrs(struct device *dev, size_t size, { return NULL; } -static void dma_free_attrs(struct device *dev, size_t size, void *cpu_addr, - dma_addr_t dma_handle, unsigned long attrs) +static inline void dma_free_attrs(struct device *dev, size_t size, + void *cpu_addr, dma_addr_t dma_handle, unsigned long attrs) { } static inline void *dmam_alloc_attrs(struct device *dev, size_t size, -- cgit v1.2.3 From 175b45ed343a9c547b5f45293d3ea08d38a7b6f4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 14 Mar 2026 04:12:58 -0700 Subject: srcu: Use raw spinlocks so call_srcu() can be used under preempt_disable() Tree SRCU has used non-raw spinlocks for many years, motivated by a desire to avoid unnecessary real-time latency and the absence of any reason to use raw spinlocks. However, the recent use of SRCU in tracing as the underlying implementation of RCU Tasks Trace means that call_srcu() is invoked from preemption-disabled regions of code, which in turn requires that any locks acquired by call_srcu() or its callees must be raw spinlocks. This commit therefore converts SRCU's spinlocks to raw spinlocks. [boqun: Add Fixes tag] Reported-by: Kumar Kartikeya Dwivedi Fixes: c27cea4416a3 ("rcu: Re-implement RCU Tasks Trace in terms of SRCU-fast") Signed-off-by: Paul E. McKenney Signed-off-by: Boqun Feng Cc: Sebastian Andrzej Siewior --- include/linux/srcutree.h | 8 +-- kernel/rcu/rcu.h | 9 +++ kernel/rcu/srcutree.c | 174 +++++++++++++++++++---------------------------- 3 files changed, 82 insertions(+), 109 deletions(-) (limited to 'include/linux') diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index 958cb7ef41cb..dfb31d11ff05 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -34,7 +34,7 @@ struct srcu_data { /* Values: SRCU_READ_FLAVOR_.* */ /* Update-side state. */ - spinlock_t __private lock ____cacheline_internodealigned_in_smp; + raw_spinlock_t __private lock ____cacheline_internodealigned_in_smp; struct rcu_segcblist srcu_cblist; /* List of callbacks.*/ unsigned long srcu_gp_seq_needed; /* Furthest future GP needed. */ unsigned long srcu_gp_seq_needed_exp; /* Furthest future exp GP. */ @@ -55,7 +55,7 @@ struct srcu_data { * Node in SRCU combining tree, similar in function to rcu_data. */ struct srcu_node { - spinlock_t __private lock; + raw_spinlock_t __private lock; unsigned long srcu_have_cbs[4]; /* GP seq for children having CBs, but only */ /* if greater than ->srcu_gp_seq. */ unsigned long srcu_data_have_cbs[4]; /* Which srcu_data structs have CBs for given GP? */ @@ -74,7 +74,7 @@ struct srcu_usage { /* First node at each level. */ int srcu_size_state; /* Small-to-big transition state. */ struct mutex srcu_cb_mutex; /* Serialize CB preparation. */ - spinlock_t __private lock; /* Protect counters and size state. */ + raw_spinlock_t __private lock; /* Protect counters and size state. */ struct mutex srcu_gp_mutex; /* Serialize GP work. */ unsigned long srcu_gp_seq; /* Grace-period seq #. */ unsigned long srcu_gp_seq_needed; /* Latest gp_seq needed. */ @@ -156,7 +156,7 @@ struct srcu_struct { #define __SRCU_USAGE_INIT(name) \ { \ - .lock = __SPIN_LOCK_UNLOCKED(name.lock), \ + .lock = __RAW_SPIN_LOCK_UNLOCKED(name.lock), \ .srcu_gp_seq = SRCU_GP_SEQ_INITIAL_VAL, \ .srcu_gp_seq_needed = SRCU_GP_SEQ_INITIAL_VAL_WITH_STATE, \ .srcu_gp_seq_needed_exp = SRCU_GP_SEQ_INITIAL_VAL, \ diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index dc5d614b372c..9b10b57b79ad 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -502,6 +502,15 @@ do { \ ___locked; \ }) +#define raw_spin_trylock_irqsave_rcu_node(p, flags) \ +({ \ + bool ___locked = raw_spin_trylock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \ + \ + if (___locked) \ + smp_mb__after_unlock_lock(); \ + ___locked; \ +}) + #define raw_lockdep_assert_held_rcu_node(p) \ lockdep_assert_held(&ACCESS_PRIVATE(p, lock)) diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index aef8e91ad33e..2328827f8775 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -77,42 +77,6 @@ static void srcu_reschedule(struct srcu_struct *ssp, unsigned long delay); static void process_srcu(struct work_struct *work); static void srcu_delay_timer(struct timer_list *t); -/* Wrappers for lock acquisition and release, see raw_spin_lock_rcu_node(). */ -#define spin_lock_rcu_node(p) \ -do { \ - spin_lock(&ACCESS_PRIVATE(p, lock)); \ - smp_mb__after_unlock_lock(); \ -} while (0) - -#define spin_unlock_rcu_node(p) spin_unlock(&ACCESS_PRIVATE(p, lock)) - -#define spin_lock_irq_rcu_node(p) \ -do { \ - spin_lock_irq(&ACCESS_PRIVATE(p, lock)); \ - smp_mb__after_unlock_lock(); \ -} while (0) - -#define spin_unlock_irq_rcu_node(p) \ - spin_unlock_irq(&ACCESS_PRIVATE(p, lock)) - -#define spin_lock_irqsave_rcu_node(p, flags) \ -do { \ - spin_lock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \ - smp_mb__after_unlock_lock(); \ -} while (0) - -#define spin_trylock_irqsave_rcu_node(p, flags) \ -({ \ - bool ___locked = spin_trylock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \ - \ - if (___locked) \ - smp_mb__after_unlock_lock(); \ - ___locked; \ -}) - -#define spin_unlock_irqrestore_rcu_node(p, flags) \ - spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags) \ - /* * Initialize SRCU per-CPU data. Note that statically allocated * srcu_struct structures might already have srcu_read_lock() and @@ -131,7 +95,7 @@ static void init_srcu_struct_data(struct srcu_struct *ssp) */ for_each_possible_cpu(cpu) { sdp = per_cpu_ptr(ssp->sda, cpu); - spin_lock_init(&ACCESS_PRIVATE(sdp, lock)); + raw_spin_lock_init(&ACCESS_PRIVATE(sdp, lock)); rcu_segcblist_init(&sdp->srcu_cblist); sdp->srcu_cblist_invoking = false; sdp->srcu_gp_seq_needed = ssp->srcu_sup->srcu_gp_seq; @@ -186,7 +150,7 @@ static bool init_srcu_struct_nodes(struct srcu_struct *ssp, gfp_t gfp_flags) /* Each pass through this loop initializes one srcu_node structure. */ srcu_for_each_node_breadth_first(ssp, snp) { - spin_lock_init(&ACCESS_PRIVATE(snp, lock)); + raw_spin_lock_init(&ACCESS_PRIVATE(snp, lock)); BUILD_BUG_ON(ARRAY_SIZE(snp->srcu_have_cbs) != ARRAY_SIZE(snp->srcu_data_have_cbs)); for (i = 0; i < ARRAY_SIZE(snp->srcu_have_cbs); i++) { @@ -242,7 +206,7 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static) if (!ssp->srcu_sup) return -ENOMEM; if (!is_static) - spin_lock_init(&ACCESS_PRIVATE(ssp->srcu_sup, lock)); + raw_spin_lock_init(&ACCESS_PRIVATE(ssp->srcu_sup, lock)); ssp->srcu_sup->srcu_size_state = SRCU_SIZE_SMALL; ssp->srcu_sup->node = NULL; mutex_init(&ssp->srcu_sup->srcu_cb_mutex); @@ -394,20 +358,20 @@ static void srcu_transition_to_big(struct srcu_struct *ssp) /* Double-checked locking on ->srcu_size-state. */ if (smp_load_acquire(&ssp->srcu_sup->srcu_size_state) != SRCU_SIZE_SMALL) return; - spin_lock_irqsave_rcu_node(ssp->srcu_sup, flags); + raw_spin_lock_irqsave_rcu_node(ssp->srcu_sup, flags); if (smp_load_acquire(&ssp->srcu_sup->srcu_size_state) != SRCU_SIZE_SMALL) { - spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags); + raw_spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags); return; } __srcu_transition_to_big(ssp); - spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags); + raw_spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags); } /* * Check to see if the just-encountered contention event justifies * a transition to SRCU_SIZE_BIG. */ -static void spin_lock_irqsave_check_contention(struct srcu_struct *ssp) +static void raw_spin_lock_irqsave_check_contention(struct srcu_struct *ssp) { unsigned long j; @@ -429,16 +393,16 @@ static void spin_lock_irqsave_check_contention(struct srcu_struct *ssp) * to SRCU_SIZE_BIG. But only if the srcutree.convert_to_big module * parameter permits this. */ -static void spin_lock_irqsave_sdp_contention(struct srcu_data *sdp, unsigned long *flags) +static void raw_spin_lock_irqsave_sdp_contention(struct srcu_data *sdp, unsigned long *flags) { struct srcu_struct *ssp = sdp->ssp; - if (spin_trylock_irqsave_rcu_node(sdp, *flags)) + if (raw_spin_trylock_irqsave_rcu_node(sdp, *flags)) return; - spin_lock_irqsave_rcu_node(ssp->srcu_sup, *flags); - spin_lock_irqsave_check_contention(ssp); - spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, *flags); - spin_lock_irqsave_rcu_node(sdp, *flags); + raw_spin_lock_irqsave_rcu_node(ssp->srcu_sup, *flags); + raw_spin_lock_irqsave_check_contention(ssp); + raw_spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, *flags); + raw_spin_lock_irqsave_rcu_node(sdp, *flags); } /* @@ -447,12 +411,12 @@ static void spin_lock_irqsave_sdp_contention(struct srcu_data *sdp, unsigned lon * to SRCU_SIZE_BIG. But only if the srcutree.convert_to_big module * parameter permits this. */ -static void spin_lock_irqsave_ssp_contention(struct srcu_struct *ssp, unsigned long *flags) +static void raw_spin_lock_irqsave_ssp_contention(struct srcu_struct *ssp, unsigned long *flags) { - if (spin_trylock_irqsave_rcu_node(ssp->srcu_sup, *flags)) + if (raw_spin_trylock_irqsave_rcu_node(ssp->srcu_sup, *flags)) return; - spin_lock_irqsave_rcu_node(ssp->srcu_sup, *flags); - spin_lock_irqsave_check_contention(ssp); + raw_spin_lock_irqsave_rcu_node(ssp->srcu_sup, *flags); + raw_spin_lock_irqsave_check_contention(ssp); } /* @@ -470,13 +434,13 @@ static void check_init_srcu_struct(struct srcu_struct *ssp) /* The smp_load_acquire() pairs with the smp_store_release(). */ if (!rcu_seq_state(smp_load_acquire(&ssp->srcu_sup->srcu_gp_seq_needed))) /*^^^*/ return; /* Already initialized. */ - spin_lock_irqsave_rcu_node(ssp->srcu_sup, flags); + raw_spin_lock_irqsave_rcu_node(ssp->srcu_sup, flags); if (!rcu_seq_state(ssp->srcu_sup->srcu_gp_seq_needed)) { - spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags); + raw_spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags); return; } init_srcu_struct_fields(ssp, true); - spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags); + raw_spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags); } /* @@ -742,9 +706,9 @@ void cleanup_srcu_struct(struct srcu_struct *ssp) unsigned long delay; struct srcu_usage *sup = ssp->srcu_sup; - spin_lock_irq_rcu_node(ssp->srcu_sup); + raw_spin_lock_irq_rcu_node(ssp->srcu_sup); delay = srcu_get_delay(ssp); - spin_unlock_irq_rcu_node(ssp->srcu_sup); + raw_spin_unlock_irq_rcu_node(ssp->srcu_sup); if (WARN_ON(!delay)) return; /* Just leak it! */ if (WARN_ON(srcu_readers_active(ssp))) @@ -960,7 +924,7 @@ static void srcu_gp_end(struct srcu_struct *ssp) mutex_lock(&sup->srcu_cb_mutex); /* End the current grace period. */ - spin_lock_irq_rcu_node(sup); + raw_spin_lock_irq_rcu_node(sup); idx = rcu_seq_state(sup->srcu_gp_seq); WARN_ON_ONCE(idx != SRCU_STATE_SCAN2); if (srcu_gp_is_expedited(ssp)) @@ -971,7 +935,7 @@ static void srcu_gp_end(struct srcu_struct *ssp) gpseq = rcu_seq_current(&sup->srcu_gp_seq); if (ULONG_CMP_LT(sup->srcu_gp_seq_needed_exp, gpseq)) WRITE_ONCE(sup->srcu_gp_seq_needed_exp, gpseq); - spin_unlock_irq_rcu_node(sup); + raw_spin_unlock_irq_rcu_node(sup); mutex_unlock(&sup->srcu_gp_mutex); /* A new grace period can start at this point. But only one. */ @@ -983,7 +947,7 @@ static void srcu_gp_end(struct srcu_struct *ssp) } else { idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs); srcu_for_each_node_breadth_first(ssp, snp) { - spin_lock_irq_rcu_node(snp); + raw_spin_lock_irq_rcu_node(snp); cbs = false; last_lvl = snp >= sup->level[rcu_num_lvls - 1]; if (last_lvl) @@ -998,7 +962,7 @@ static void srcu_gp_end(struct srcu_struct *ssp) else mask = snp->srcu_data_have_cbs[idx]; snp->srcu_data_have_cbs[idx] = 0; - spin_unlock_irq_rcu_node(snp); + raw_spin_unlock_irq_rcu_node(snp); if (cbs) srcu_schedule_cbs_snp(ssp, snp, mask, cbdelay); } @@ -1008,27 +972,27 @@ static void srcu_gp_end(struct srcu_struct *ssp) if (!(gpseq & counter_wrap_check)) for_each_possible_cpu(cpu) { sdp = per_cpu_ptr(ssp->sda, cpu); - spin_lock_irq_rcu_node(sdp); + raw_spin_lock_irq_rcu_node(sdp); if (ULONG_CMP_GE(gpseq, sdp->srcu_gp_seq_needed + 100)) sdp->srcu_gp_seq_needed = gpseq; if (ULONG_CMP_GE(gpseq, sdp->srcu_gp_seq_needed_exp + 100)) sdp->srcu_gp_seq_needed_exp = gpseq; - spin_unlock_irq_rcu_node(sdp); + raw_spin_unlock_irq_rcu_node(sdp); } /* Callback initiation done, allow grace periods after next. */ mutex_unlock(&sup->srcu_cb_mutex); /* Start a new grace period if needed. */ - spin_lock_irq_rcu_node(sup); + raw_spin_lock_irq_rcu_node(sup); gpseq = rcu_seq_current(&sup->srcu_gp_seq); if (!rcu_seq_state(gpseq) && ULONG_CMP_LT(gpseq, sup->srcu_gp_seq_needed)) { srcu_gp_start(ssp); - spin_unlock_irq_rcu_node(sup); + raw_spin_unlock_irq_rcu_node(sup); srcu_reschedule(ssp, 0); } else { - spin_unlock_irq_rcu_node(sup); + raw_spin_unlock_irq_rcu_node(sup); } /* Transition to big if needed. */ @@ -1059,19 +1023,19 @@ static void srcu_funnel_exp_start(struct srcu_struct *ssp, struct srcu_node *snp if (WARN_ON_ONCE(rcu_seq_done(&ssp->srcu_sup->srcu_gp_seq, s)) || (!srcu_invl_snp_seq(sgsne) && ULONG_CMP_GE(sgsne, s))) return; - spin_lock_irqsave_rcu_node(snp, flags); + raw_spin_lock_irqsave_rcu_node(snp, flags); sgsne = snp->srcu_gp_seq_needed_exp; if (!srcu_invl_snp_seq(sgsne) && ULONG_CMP_GE(sgsne, s)) { - spin_unlock_irqrestore_rcu_node(snp, flags); + raw_spin_unlock_irqrestore_rcu_node(snp, flags); return; } WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s); - spin_unlock_irqrestore_rcu_node(snp, flags); + raw_spin_unlock_irqrestore_rcu_node(snp, flags); } - spin_lock_irqsave_ssp_contention(ssp, &flags); + raw_spin_lock_irqsave_ssp_contention(ssp, &flags); if (ULONG_CMP_LT(ssp->srcu_sup->srcu_gp_seq_needed_exp, s)) WRITE_ONCE(ssp->srcu_sup->srcu_gp_seq_needed_exp, s); - spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags); + raw_spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags); } /* @@ -1109,12 +1073,12 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, for (snp = snp_leaf; snp != NULL; snp = snp->srcu_parent) { if (WARN_ON_ONCE(rcu_seq_done(&sup->srcu_gp_seq, s)) && snp != snp_leaf) return; /* GP already done and CBs recorded. */ - spin_lock_irqsave_rcu_node(snp, flags); + raw_spin_lock_irqsave_rcu_node(snp, flags); snp_seq = snp->srcu_have_cbs[idx]; if (!srcu_invl_snp_seq(snp_seq) && ULONG_CMP_GE(snp_seq, s)) { if (snp == snp_leaf && snp_seq == s) snp->srcu_data_have_cbs[idx] |= sdp->grpmask; - spin_unlock_irqrestore_rcu_node(snp, flags); + raw_spin_unlock_irqrestore_rcu_node(snp, flags); if (snp == snp_leaf && snp_seq != s) { srcu_schedule_cbs_sdp(sdp, do_norm ? SRCU_INTERVAL : 0); return; @@ -1129,11 +1093,11 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, sgsne = snp->srcu_gp_seq_needed_exp; if (!do_norm && (srcu_invl_snp_seq(sgsne) || ULONG_CMP_LT(sgsne, s))) WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s); - spin_unlock_irqrestore_rcu_node(snp, flags); + raw_spin_unlock_irqrestore_rcu_node(snp, flags); } /* Top of tree, must ensure the grace period will be started. */ - spin_lock_irqsave_ssp_contention(ssp, &flags); + raw_spin_lock_irqsave_ssp_contention(ssp, &flags); if (ULONG_CMP_LT(sup->srcu_gp_seq_needed, s)) { /* * Record need for grace period s. Pair with load @@ -1160,7 +1124,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, else if (list_empty(&sup->work.work.entry)) list_add(&sup->work.work.entry, &srcu_boot_list); } - spin_unlock_irqrestore_rcu_node(sup, flags); + raw_spin_unlock_irqrestore_rcu_node(sup, flags); } /* @@ -1172,9 +1136,9 @@ static bool try_check_zero(struct srcu_struct *ssp, int idx, int trycount) { unsigned long curdelay; - spin_lock_irq_rcu_node(ssp->srcu_sup); + raw_spin_lock_irq_rcu_node(ssp->srcu_sup); curdelay = !srcu_get_delay(ssp); - spin_unlock_irq_rcu_node(ssp->srcu_sup); + raw_spin_unlock_irq_rcu_node(ssp->srcu_sup); for (;;) { if (srcu_readers_active_idx_check(ssp, idx)) @@ -1285,12 +1249,12 @@ static bool srcu_should_expedite(struct srcu_struct *ssp) return false; /* If the local srcu_data structure has callbacks, not idle. */ sdp = raw_cpu_ptr(ssp->sda); - spin_lock_irqsave_rcu_node(sdp, flags); + raw_spin_lock_irqsave_rcu_node(sdp, flags); if (rcu_segcblist_pend_cbs(&sdp->srcu_cblist)) { - spin_unlock_irqrestore_rcu_node(sdp, flags); + raw_spin_unlock_irqrestore_rcu_node(sdp, flags); return false; /* Callbacks already present, so not idle. */ } - spin_unlock_irqrestore_rcu_node(sdp, flags); + raw_spin_unlock_irqrestore_rcu_node(sdp, flags); /* * No local callbacks, so probabilistically probe global state. @@ -1350,7 +1314,7 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp, sdp = per_cpu_ptr(ssp->sda, get_boot_cpu_id()); else sdp = raw_cpu_ptr(ssp->sda); - spin_lock_irqsave_sdp_contention(sdp, &flags); + raw_spin_lock_irqsave_sdp_contention(sdp, &flags); if (rhp) rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp); /* @@ -1410,7 +1374,7 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp, sdp->srcu_gp_seq_needed_exp = s; needexp = true; } - spin_unlock_irqrestore_rcu_node(sdp, flags); + raw_spin_unlock_irqrestore_rcu_node(sdp, flags); /* Ensure that snp node tree is fully initialized before traversing it */ if (ss_state < SRCU_SIZE_WAIT_BARRIER) @@ -1522,7 +1486,7 @@ static void __synchronize_srcu(struct srcu_struct *ssp, bool do_norm) /* * Make sure that later code is ordered after the SRCU grace - * period. This pairs with the spin_lock_irq_rcu_node() + * period. This pairs with the raw_spin_lock_irq_rcu_node() * in srcu_invoke_callbacks(). Unlike Tree RCU, this is needed * because the current CPU might have been totally uninvolved with * (and thus unordered against) that grace period. @@ -1701,7 +1665,7 @@ static void srcu_barrier_cb(struct rcu_head *rhp) */ static void srcu_barrier_one_cpu(struct srcu_struct *ssp, struct srcu_data *sdp) { - spin_lock_irq_rcu_node(sdp); + raw_spin_lock_irq_rcu_node(sdp); atomic_inc(&ssp->srcu_sup->srcu_barrier_cpu_cnt); sdp->srcu_barrier_head.func = srcu_barrier_cb; debug_rcu_head_queue(&sdp->srcu_barrier_head); @@ -1710,7 +1674,7 @@ static void srcu_barrier_one_cpu(struct srcu_struct *ssp, struct srcu_data *sdp) debug_rcu_head_unqueue(&sdp->srcu_barrier_head); atomic_dec(&ssp->srcu_sup->srcu_barrier_cpu_cnt); } - spin_unlock_irq_rcu_node(sdp); + raw_spin_unlock_irq_rcu_node(sdp); } /** @@ -1761,7 +1725,7 @@ static void srcu_expedite_current_cb(struct rcu_head *rhp) bool needcb = false; struct srcu_data *sdp = container_of(rhp, struct srcu_data, srcu_ec_head); - spin_lock_irqsave_sdp_contention(sdp, &flags); + raw_spin_lock_irqsave_sdp_contention(sdp, &flags); if (sdp->srcu_ec_state == SRCU_EC_IDLE) { WARN_ON_ONCE(1); } else if (sdp->srcu_ec_state == SRCU_EC_PENDING) { @@ -1771,7 +1735,7 @@ static void srcu_expedite_current_cb(struct rcu_head *rhp) sdp->srcu_ec_state = SRCU_EC_PENDING; needcb = true; } - spin_unlock_irqrestore_rcu_node(sdp, flags); + raw_spin_unlock_irqrestore_rcu_node(sdp, flags); // If needed, requeue ourselves as an expedited SRCU callback. if (needcb) __call_srcu(sdp->ssp, &sdp->srcu_ec_head, srcu_expedite_current_cb, false); @@ -1795,7 +1759,7 @@ void srcu_expedite_current(struct srcu_struct *ssp) migrate_disable(); sdp = this_cpu_ptr(ssp->sda); - spin_lock_irqsave_sdp_contention(sdp, &flags); + raw_spin_lock_irqsave_sdp_contention(sdp, &flags); if (sdp->srcu_ec_state == SRCU_EC_IDLE) { sdp->srcu_ec_state = SRCU_EC_PENDING; needcb = true; @@ -1804,7 +1768,7 @@ void srcu_expedite_current(struct srcu_struct *ssp) } else { WARN_ON_ONCE(sdp->srcu_ec_state != SRCU_EC_REPOST); } - spin_unlock_irqrestore_rcu_node(sdp, flags); + raw_spin_unlock_irqrestore_rcu_node(sdp, flags); // If needed, queue an expedited SRCU callback. if (needcb) __call_srcu(ssp, &sdp->srcu_ec_head, srcu_expedite_current_cb, false); @@ -1848,17 +1812,17 @@ static void srcu_advance_state(struct srcu_struct *ssp) */ idx = rcu_seq_state(smp_load_acquire(&ssp->srcu_sup->srcu_gp_seq)); /* ^^^ */ if (idx == SRCU_STATE_IDLE) { - spin_lock_irq_rcu_node(ssp->srcu_sup); + raw_spin_lock_irq_rcu_node(ssp->srcu_sup); if (ULONG_CMP_GE(ssp->srcu_sup->srcu_gp_seq, ssp->srcu_sup->srcu_gp_seq_needed)) { WARN_ON_ONCE(rcu_seq_state(ssp->srcu_sup->srcu_gp_seq)); - spin_unlock_irq_rcu_node(ssp->srcu_sup); + raw_spin_unlock_irq_rcu_node(ssp->srcu_sup); mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex); return; } idx = rcu_seq_state(READ_ONCE(ssp->srcu_sup->srcu_gp_seq)); if (idx == SRCU_STATE_IDLE) srcu_gp_start(ssp); - spin_unlock_irq_rcu_node(ssp->srcu_sup); + raw_spin_unlock_irq_rcu_node(ssp->srcu_sup); if (idx != SRCU_STATE_IDLE) { mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex); return; /* Someone else started the grace period. */ @@ -1872,10 +1836,10 @@ static void srcu_advance_state(struct srcu_struct *ssp) return; /* readers present, retry later. */ } srcu_flip(ssp); - spin_lock_irq_rcu_node(ssp->srcu_sup); + raw_spin_lock_irq_rcu_node(ssp->srcu_sup); rcu_seq_set_state(&ssp->srcu_sup->srcu_gp_seq, SRCU_STATE_SCAN2); ssp->srcu_sup->srcu_n_exp_nodelay = 0; - spin_unlock_irq_rcu_node(ssp->srcu_sup); + raw_spin_unlock_irq_rcu_node(ssp->srcu_sup); } if (rcu_seq_state(READ_ONCE(ssp->srcu_sup->srcu_gp_seq)) == SRCU_STATE_SCAN2) { @@ -1913,7 +1877,7 @@ static void srcu_invoke_callbacks(struct work_struct *work) ssp = sdp->ssp; rcu_cblist_init(&ready_cbs); - spin_lock_irq_rcu_node(sdp); + raw_spin_lock_irq_rcu_node(sdp); WARN_ON_ONCE(!rcu_segcblist_segempty(&sdp->srcu_cblist, RCU_NEXT_TAIL)); rcu_segcblist_advance(&sdp->srcu_cblist, rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq)); @@ -1924,7 +1888,7 @@ static void srcu_invoke_callbacks(struct work_struct *work) */ if (sdp->srcu_cblist_invoking || !rcu_segcblist_ready_cbs(&sdp->srcu_cblist)) { - spin_unlock_irq_rcu_node(sdp); + raw_spin_unlock_irq_rcu_node(sdp); return; /* Someone else on the job or nothing to do. */ } @@ -1932,7 +1896,7 @@ static void srcu_invoke_callbacks(struct work_struct *work) sdp->srcu_cblist_invoking = true; rcu_segcblist_extract_done_cbs(&sdp->srcu_cblist, &ready_cbs); len = ready_cbs.len; - spin_unlock_irq_rcu_node(sdp); + raw_spin_unlock_irq_rcu_node(sdp); rhp = rcu_cblist_dequeue(&ready_cbs); for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) { debug_rcu_head_unqueue(rhp); @@ -1947,11 +1911,11 @@ static void srcu_invoke_callbacks(struct work_struct *work) * Update counts, accelerate new callbacks, and if needed, * schedule another round of callback invocation. */ - spin_lock_irq_rcu_node(sdp); + raw_spin_lock_irq_rcu_node(sdp); rcu_segcblist_add_len(&sdp->srcu_cblist, -len); sdp->srcu_cblist_invoking = false; more = rcu_segcblist_ready_cbs(&sdp->srcu_cblist); - spin_unlock_irq_rcu_node(sdp); + raw_spin_unlock_irq_rcu_node(sdp); /* An SRCU barrier or callbacks from previous nesting work pending */ if (more) srcu_schedule_cbs_sdp(sdp, 0); @@ -1965,7 +1929,7 @@ static void srcu_reschedule(struct srcu_struct *ssp, unsigned long delay) { bool pushgp = true; - spin_lock_irq_rcu_node(ssp->srcu_sup); + raw_spin_lock_irq_rcu_node(ssp->srcu_sup); if (ULONG_CMP_GE(ssp->srcu_sup->srcu_gp_seq, ssp->srcu_sup->srcu_gp_seq_needed)) { if (!WARN_ON_ONCE(rcu_seq_state(ssp->srcu_sup->srcu_gp_seq))) { /* All requests fulfilled, time to go idle. */ @@ -1975,7 +1939,7 @@ static void srcu_reschedule(struct srcu_struct *ssp, unsigned long delay) /* Outstanding request and no GP. Start one. */ srcu_gp_start(ssp); } - spin_unlock_irq_rcu_node(ssp->srcu_sup); + raw_spin_unlock_irq_rcu_node(ssp->srcu_sup); if (pushgp) queue_delayed_work(rcu_gp_wq, &ssp->srcu_sup->work, delay); @@ -1995,9 +1959,9 @@ static void process_srcu(struct work_struct *work) ssp = sup->srcu_ssp; srcu_advance_state(ssp); - spin_lock_irq_rcu_node(ssp->srcu_sup); + raw_spin_lock_irq_rcu_node(ssp->srcu_sup); curdelay = srcu_get_delay(ssp); - spin_unlock_irq_rcu_node(ssp->srcu_sup); + raw_spin_unlock_irq_rcu_node(ssp->srcu_sup); if (curdelay) { WRITE_ONCE(sup->reschedule_count, 0); } else { -- cgit v1.2.3 From 7c405fb3279b39244b260b54f1bd6488689ae235 Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Wed, 18 Mar 2026 17:56:21 -0700 Subject: rcu: Use an intermediate irq_work to start process_srcu() Since commit c27cea4416a3 ("rcu: Re-implement RCU Tasks Trace in terms of SRCU-fast") we switched to SRCU in BPF. However as BPF instrument can happen basically everywhere (including where a scheduler lock is held), call_srcu() now needs to avoid acquiring scheduler lock because otherwise it could cause deadlock [1]. Fix this by following what the previous RCU Tasks Trace did: using an irq_work to delay the queuing of the work to start process_srcu(). [boqun: Apply Joel's feedback] [boqun: Apply Andrea's test feedback] Reported-by: Andrea Righi Closes: https://lore.kernel.org/all/abjzvz_tL_siV17s@gpd4/ Fixes: commit c27cea4416a3 ("rcu: Re-implement RCU Tasks Trace in terms of SRCU-fast") Link: https://lore.kernel.org/rcu/3c4c5a29-24ea-492d-aeee-e0d9605b4183@nvidia.com/ [1] Suggested-by: Zqiang Tested-by: Andrea Righi Tested-by: Paul E. McKenney Tested-by: Joel Fernandes Signed-off-by: Boqun Feng --- include/linux/srcutree.h | 1 + kernel/rcu/srcutree.c | 30 ++++++++++++++++++++++++++++-- 2 files changed, 29 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index dfb31d11ff05..be76fa4fc170 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -95,6 +95,7 @@ struct srcu_usage { unsigned long reschedule_jiffies; unsigned long reschedule_count; struct delayed_work work; + struct irq_work irq_work; struct srcu_struct *srcu_ssp; }; diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 678bd9a73875..0d01cd8c4b4a 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -75,6 +76,7 @@ static bool __read_mostly srcu_init_done; static void srcu_invoke_callbacks(struct work_struct *work); static void srcu_reschedule(struct srcu_struct *ssp, unsigned long delay); static void process_srcu(struct work_struct *work); +static void srcu_irq_work(struct irq_work *work); static void srcu_delay_timer(struct timer_list *t); /* @@ -216,6 +218,7 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static) mutex_init(&ssp->srcu_sup->srcu_barrier_mutex); atomic_set(&ssp->srcu_sup->srcu_barrier_cpu_cnt, 0); INIT_DELAYED_WORK(&ssp->srcu_sup->work, process_srcu); + init_irq_work(&ssp->srcu_sup->irq_work, srcu_irq_work); ssp->srcu_sup->sda_is_static = is_static; if (!is_static) { ssp->sda = alloc_percpu(struct srcu_data); @@ -716,6 +719,8 @@ void cleanup_srcu_struct(struct srcu_struct *ssp) return; /* Just leak it! */ if (WARN_ON(srcu_readers_active(ssp))) return; /* Just leak it! */ + /* Wait for irq_work to finish first as it may queue a new work. */ + irq_work_sync(&sup->irq_work); flush_delayed_work(&sup->work); for_each_possible_cpu(cpu) { struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu); @@ -1121,9 +1126,13 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, // it isn't. And it does not have to be. After all, it // can only be executed during early boot when there is only // the one boot CPU running with interrupts still disabled. + // + // Use an irq_work here to avoid acquiring runqueue lock with + // srcu rcu_node::lock held. BPF instrument could introduce the + // opposite dependency, hence we need to break the possible + // locking dependency here. if (likely(srcu_init_done)) - queue_delayed_work(rcu_gp_wq, &sup->work, - !!srcu_get_delay(ssp)); + irq_work_queue(&sup->irq_work); else if (list_empty(&sup->work.work.entry)) list_add(&sup->work.work.entry, &srcu_boot_list); } @@ -1982,6 +1991,23 @@ static void process_srcu(struct work_struct *work) srcu_reschedule(ssp, curdelay); } +static void srcu_irq_work(struct irq_work *work) +{ + struct srcu_struct *ssp; + struct srcu_usage *sup; + unsigned long delay; + unsigned long flags; + + sup = container_of(work, struct srcu_usage, irq_work); + ssp = sup->srcu_ssp; + + raw_spin_lock_irqsave_rcu_node(ssp->srcu_sup, flags); + delay = srcu_get_delay(ssp); + raw_spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags); + + queue_delayed_work(rcu_gp_wq, &sup->work, !!delay); +} + void srcutorture_get_gp_data(struct srcu_struct *ssp, int *flags, unsigned long *gp_seq) { -- cgit v1.2.3 From a6fc88b22bc8d12ad52e8412c667ec0f5bf055af Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Mon, 23 Mar 2026 20:14:18 -0400 Subject: srcu: Use irq_work to start GP in tiny SRCU Tiny SRCU's srcu_gp_start_if_needed() directly calls schedule_work(), which acquires the workqueue pool->lock. This causes a lockdep splat when call_srcu() is called with a scheduler lock held, due to: call_srcu() [holding pi_lock] srcu_gp_start_if_needed() schedule_work() -> pool->lock workqueue_init() / create_worker() [holding pool->lock] wake_up_process() -> try_to_wake_up() -> pi_lock Also add irq_work_sync() to cleanup_srcu_struct() to prevent a use-after-free if a queued irq_work fires after cleanup begins. Tested with rcutorture SRCU-T and no lockdep warnings. [ Thanks to Boqun for similar fix in patch "rcu: Use an intermediate irq_work to start process_srcu()" ] Signed-off-by: Joel Fernandes Reviewed-by: Paul E. McKenney Signed-off-by: Boqun Feng --- include/linux/srcutiny.h | 4 ++++ kernel/rcu/srcutiny.c | 19 ++++++++++++++++++- 2 files changed, 22 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h index dec7cbe015aa..905b629e8fa3 100644 --- a/include/linux/srcutiny.h +++ b/include/linux/srcutiny.h @@ -11,6 +11,7 @@ #ifndef _LINUX_SRCU_TINY_H #define _LINUX_SRCU_TINY_H +#include #include struct srcu_struct { @@ -24,18 +25,21 @@ struct srcu_struct { struct rcu_head *srcu_cb_head; /* Pending callbacks: Head. */ struct rcu_head **srcu_cb_tail; /* Pending callbacks: Tail. */ struct work_struct srcu_work; /* For driving grace periods. */ + struct irq_work srcu_irq_work; /* Defer schedule_work() to irq work. */ #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ }; void srcu_drive_gp(struct work_struct *wp); +void srcu_tiny_irq_work(struct irq_work *irq_work); #define __SRCU_STRUCT_INIT(name, __ignored, ___ignored, ____ignored) \ { \ .srcu_wq = __SWAIT_QUEUE_HEAD_INITIALIZER(name.srcu_wq), \ .srcu_cb_tail = &name.srcu_cb_head, \ .srcu_work = __WORK_INITIALIZER(name.srcu_work, srcu_drive_gp), \ + .srcu_irq_work = { .func = srcu_tiny_irq_work }, \ __SRCU_DEP_MAP_INIT(name) \ } diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 3450c3751ef7..a2e2d516e51b 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -9,6 +9,7 @@ */ #include +#include #include #include #include @@ -41,6 +42,7 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp) ssp->srcu_idx_max = 0; INIT_WORK(&ssp->srcu_work, srcu_drive_gp); INIT_LIST_HEAD(&ssp->srcu_work.entry); + init_irq_work(&ssp->srcu_irq_work, srcu_tiny_irq_work); return 0; } @@ -84,6 +86,7 @@ EXPORT_SYMBOL_GPL(init_srcu_struct); void cleanup_srcu_struct(struct srcu_struct *ssp) { WARN_ON(ssp->srcu_lock_nesting[0] || ssp->srcu_lock_nesting[1]); + irq_work_sync(&ssp->srcu_irq_work); flush_work(&ssp->srcu_work); WARN_ON(ssp->srcu_gp_running); WARN_ON(ssp->srcu_gp_waiting); @@ -177,6 +180,20 @@ void srcu_drive_gp(struct work_struct *wp) } EXPORT_SYMBOL_GPL(srcu_drive_gp); +/* + * Use an irq_work to defer schedule_work() to avoid acquiring the workqueue + * pool->lock while the caller might hold scheduler locks, causing lockdep + * splats due to workqueue_init() doing a wakeup. + */ +void srcu_tiny_irq_work(struct irq_work *irq_work) +{ + struct srcu_struct *ssp; + + ssp = container_of(irq_work, struct srcu_struct, srcu_irq_work); + schedule_work(&ssp->srcu_work); +} +EXPORT_SYMBOL_GPL(srcu_tiny_irq_work); + static void srcu_gp_start_if_needed(struct srcu_struct *ssp) { unsigned long cookie; @@ -189,7 +206,7 @@ static void srcu_gp_start_if_needed(struct srcu_struct *ssp) WRITE_ONCE(ssp->srcu_idx_max, cookie); if (!READ_ONCE(ssp->srcu_gp_running)) { if (likely(srcu_init_done)) - schedule_work(&ssp->srcu_work); + irq_work_queue(&ssp->srcu_irq_work); else if (list_empty(&ssp->srcu_work.entry)) list_add(&ssp->srcu_work.entry, &srcu_boot_list); } -- cgit v1.2.3